Checkpoint.

This commit is contained in:
Scott Duensing 2026-04-30 01:29:16 -05:00
parent 55c1ae1c3e
commit 6d7eae0356
48 changed files with 8714 additions and 366 deletions

View file

@ -1,18 +1,38 @@
#!/usr/bin/env bash
# NOTE(review): this span is a rendered diff that interleaves pre- and
# post-change lines. The stray old error-check block (unclosed `{`), the
# stray standalone llvm-mc invocation, and the stale "built runtime/libgcc.o"
# echo below must be reconciled before this script will parse/run cleanly.
# Assemble the W65816 runtime library to runtime/libgcc.o.
# Run after editing runtime/src/*.s.
# Build the entire W65816 runtime — assemble *.s, compile *.c.
# Run after editing anything under runtime/src/.
set -euo pipefail
# Resolve the repo root relative to this script so it works from any CWD.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
[ -x "$LLVM_MC" ] || {
echo "llvm-mc not found at $LLVM_MC" >&2
exit 1
# Fail fast with a clear message when the toolchain binaries are missing.
[ -x "$LLVM_MC" ] || { echo "llvm-mc not found at $LLVM_MC" >&2; exit 1; }
[ -x "$CLANG" ] || { echo "clang not found at $CLANG" >&2; exit 1; }
SRC="$PROJECT_ROOT/runtime/src"
OUT="$PROJECT_ROOT/runtime"
# asm FILE.s — assemble one source to runtime/FILE.o via llvm-mc.
asm() {
local s="$1"
local o="$OUT/$(basename "${s%.s}").o"
echo " AS $(basename "$s")"
"$LLVM_MC" -arch=w65816 -filetype=obj "$s" -o "$o"
}
"$LLVM_MC" -arch=w65816 -filetype=obj \
"$PROJECT_ROOT/runtime/src/libgcc.s" \
-o "$PROJECT_ROOT/runtime/libgcc.o"
# cc FILE.c — compile one C source to runtime/FILE.o with the w65816 clang.
cc() {
local c="$1"
local o="$OUT/$(basename "${c%.c}").o"
echo " CC $(basename "$c")"
"$CLANG" -target w65816 -O2 -ffunction-sections \
-I"$PROJECT_ROOT/runtime/include" \
-c "$c" -o "$o"
}
echo "built runtime/libgcc.o"
asm "$SRC/crt0.s"
asm "$SRC/libgcc.s"
cc "$SRC/libc.c"
cc "$SRC/softFloat.c"
cc "$SRC/softDouble.c"
echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects"


14
runtime/include/assert.h Normal file
View file

@ -0,0 +1,14 @@
// Minimal <assert.h> for the W65816 runtime.
#ifndef _ASSERT_H
#define _ASSERT_H
// Diagnostic hook called when an assertion fails; never returns.
void __assert_fail(const char *expr, const char *file, unsigned int line,
                   const char *func) __attribute__((noreturn));
#ifdef NDEBUG
// Release builds: assert() is a no-op expression.
# define assert(x) ((void)0)
#else
// Debug builds: report the expression text and source location on failure.
# define assert(x) \
    (!(x) ? __assert_fail(#x, __FILE__, __LINE__, __func__) : (void)0)
#endif
#endif

16
runtime/include/ctype.h Normal file
View file

@ -0,0 +1,16 @@
#ifndef _CTYPE_H
#define _CTYPE_H
// ASCII-only character classification/mapping (no locale support);
// implementations live in runtime/src/libc.c.
int isdigit(int c);
int isupper(int c);
int islower(int c);
int isalpha(int c);
int isalnum(int c);
int isspace(int c);
int isxdigit(int c);
int isprint(int c);
int ispunct(int c);
// Case mapping: non-letters are returned unchanged.
int toupper(int c);
int tolower(int c);
#endif

17
runtime/include/errno.h Normal file
View file

@ -0,0 +1,17 @@
#ifndef _ERRNO_H
#define _ERRNO_H
// NOTE(review): `errno` here is a plain global, not the usual
// `(*__errno_location())` macro form — libc.c's comments describe the
// macro form, so confirm which contract is intended before relying on it.
extern int errno;
int *__errno_location(void);
// Standard error codes (subset; matches glibc numbering).
#define EPERM 1
#define ENOENT 2
#define EIO 5
#define EBADF 9
#define ENOMEM 12
#define EACCES 13
#define EINVAL 22
#define ENOSPC 28
#endif

View file

@ -0,0 +1,112 @@
// IIgs toolbox helpers — minimal inline-asm wrappers for the most
// commonly-used Apple IIgs system calls.
//
// Toolbox dispatch on the IIgs goes through the Tool Locator at
// $E10000. Each routine is identified by a 16-bit "tool number"
// (low byte = tool set, high byte = function within set), loaded
// into X, and called via JSL $E10000.
//
// Args go on the stack (push order: rightmost first), then the
// caller pushes a result-space slot if the routine returns something
// non-i16-or-pointer, then JSL.
//
// This header keeps things simple: each function inlines a tiny
// asm block specific to that call. No #include guards on bigger
// abstractions; users that want full toolbox coverage should write
// their own wrappers using the same pattern.
//
// LIMITATIONS:
// - Only a handful of routines wrapped. Calypsi has full toolbox.
// - No error-handling — caller checks the return.
// - Single-bank only. Cross-bank toolbox calls need different
// dispatch logic.
#ifndef IIGS_TOOLBOX_H
#define IIGS_TOOLBOX_H
#ifdef __cplusplus
extern "C" {
#endif
// Tool number convention: high byte = function, low byte = tool set.
// Common tool sets: 04 = Misc, 0E = QuickDraw II, 18 = Window Mgr.
// NOTE(review): the tool-set numbers quoted in this header disagree with
// each other ("Misc" appears as 04, $0B and $03 below) and the byte-order
// convention here contradicts the file-top comment — verify every tool
// number against the official Apple IIgs Toolbox Reference before use.
// Misc Tool Set ---------------------------------------------------
// WriteCString (Misc Tool $290B) — write a NUL-terminated string to
// the text screen. Arg: 16-bit pointer pushed before the call.
// Returns nothing.
static inline void TBoxWriteCString(const char *s) {
__asm__ volatile (
"pha\n" // push C-string pointer
"ldx #0x290B\n" // tool number (function 0x29, set 0x0B)
"jsl 0xe10000\n" // tool dispatcher
:
: "a"(s)
: "x", "y", "memory"
);
}
// SysBeep (Misc Tool $0303) — short beep through the speaker.
static inline void TBoxBeep(void) {
__asm__ volatile (
"ldx #0x0303\n"
"jsl 0xe10000\n"
:
:
: "x", "y", "memory"
);
}
// ReadKey (Event Mgr; simplified — actually KeyTrans/etc). Returns
// the next pending key in A, or 0 if none. This wraps GetNextEvent
// internally on a real GS; for the simple console harness it polls
// the keyboard buffer.
static inline char TBoxReadKey(void) {
char r;
__asm__ volatile (
"ldx #0x250A\n" // GetEvent (placeholder; refine in real port)
"jsl 0xe10000\n"
: "=a"(r)
:
: "x", "y", "memory"
);
return r;
}
// ConsoleQuit — clean program shutdown via GS/OS Quit. Pushes a
// pConditionTbl pointer (here, 0 for no condition) before JSL.
// Note the different dispatcher address: GS/OS calls go through
// $E100A8, not the Tool Locator at $E10000.
static inline void TBoxQuit(void) {
__asm__ volatile (
"pea 0\n" // pConditionTbl = NULL
"pea 0\n" // pParm
"ldx #0x2029\n" // GS/OS Quit
"jsl 0xe100a8\n" // GS/OS dispatcher (different addr)
:
:
: "x", "y", "memory"
);
// Spin forever in case the Quit call ever returns; keeps the
// compiler from assuming fall-through past a noreturn-style call.
while (1) {} // unreachable
}
// QuickDraw II ----------------------------------------------------
// QDStartUp / QDShutDown (sketches — real ones take more args).
// Real apps typically use QuickDraw II via the "shell" startup
// sequence; this is for educational/sim scenarios.
static inline void TBoxQDStartUp(void) {
__asm__ volatile (
"pea 0\n" "pea 0\n" "pea 0\n" // dummy direct-page handle
"ldx #0x0204\n"
"jsl 0xe10000\n"
:
:
: "x", "y", "memory"
);
}
#ifdef __cplusplus
}
#endif
#endif // IIGS_TOOLBOX_H

11
runtime/include/setjmp.h Normal file
View file

@ -0,0 +1,11 @@
// W65816 setjmp/longjmp — saves SP, return address (24-bit), and DP.
// jmp_buf is 8 bytes of opaque storage.
#ifndef _SETJMP_H
#define _SETJMP_H
// Opaque save area; layout is owned by the asm implementation —
// callers must treat it as a black box.
typedef unsigned char jmp_buf[8];
// Returns 0 on the direct call; returns longjmp's `val` on the second
// return (per the standard contract — implementation not visible here).
int setjmp(jmp_buf env);
void longjmp(jmp_buf env, int val) __attribute__((noreturn));
#endif

36
runtime/include/stdio.h Normal file
View file

@ -0,0 +1,36 @@
#ifndef _STDIO_H
#define _STDIO_H
#include <stdarg.h>
// Opaque stream handle; the concrete struct lives in libc.c.
typedef struct __sFILE FILE;
// <stdlib.h> and <string.h> also define size_t. Guard the typedef so any
// combination of these headers can be included together even in pre-C11
// modes, where repeating a typedef is a constraint violation.
#ifndef _W65816_SIZE_T_DEFINED
#define _W65816_SIZE_T_DEFINED
typedef unsigned int size_t;
#endif
extern FILE *stdin;
extern FILE *stdout;
extern FILE *stderr;
// Character/string output.
int putchar(int c);
int puts(const char *s);
// Formatted output (subset of conversions; see vprintf in libc.c).
int printf(const char *fmt, ...);
int vprintf(const char *fmt, va_list ap);
int fprintf(FILE *stream, const char *fmt, ...);
// Stream operations (currently console forwards or failing stubs).
int fputc(int c, FILE *stream);
int fputs(const char *s, FILE *stream);
int fflush(FILE *stream);
int fclose(FILE *stream);
FILE *fopen(const char *path, const char *mode);
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
int fseek(FILE *stream, long offset, int whence);
long ftell(FILE *stream);
int feof(FILE *stream);
int ferror(FILE *stream);
void clearerr(FILE *stream);
#define SEEK_SET 0
#define SEEK_CUR 1
#define SEEK_END 2
#endif

24
runtime/include/stdlib.h Normal file
View file

@ -0,0 +1,24 @@
#ifndef _STDLIB_H
#define _STDLIB_H
// <stdio.h> and <string.h> also define size_t. Guard the typedef so any
// combination of these headers can be included together even in pre-C11
// modes, where repeating a typedef is a constraint violation.
#ifndef _W65816_SIZE_T_DEFINED
#define _W65816_SIZE_T_DEFINED
typedef unsigned int size_t;
#endif
// Heap allocation (first-fit allocator in libc.c).
void *malloc(size_t n);
void *calloc(size_t nmemb, size_t size);
void *realloc(void *ptr, size_t n);
void free(void *p);
// Integer helpers.
int abs(int n);
long labs(long n);
int atoi(const char *s);
// Process termination.
void exit(int code) __attribute__((noreturn));
void abort(void) __attribute__((noreturn));
typedef void (*__atexit_fn)(void);
int atexit(__atexit_fn fn);
#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1
#endif

23
runtime/include/string.h Normal file
View file

@ -0,0 +1,23 @@
#ifndef _STRING_H
#define _STRING_H
// <stdio.h> and <stdlib.h> also define size_t. Guard the typedef so any
// combination of these headers can be included together even in pre-C11
// modes, where repeating a typedef is a constraint violation.
#ifndef _W65816_SIZE_T_DEFINED
#define _W65816_SIZE_T_DEFINED
typedef unsigned int size_t;
#endif
// Memory block operations.
void *memcpy(void *dst, const void *src, size_t n);
void *memmove(void *dst, const void *src, size_t n);
void *memset(void *dst, int c, size_t n);
int memcmp(const void *a, const void *b, size_t n);
void *memchr(const void *s, int c, size_t n);
// NUL-terminated string operations.
size_t strlen(const char *s);
char *strcpy(char *dst, const char *src);
char *strncpy(char *dst, const char *src, size_t n);
int strcmp(const char *a, const char *b);
int strncmp(const char *a, const char *b, size_t n);
char *strchr(const char *s, int c);
char *strrchr(const char *s, int c);
char *strstr(const char *haystack, const char *needle);
char *strerror(int err);
#endif

12
runtime/include/time.h Normal file
View file

@ -0,0 +1,12 @@
#ifndef _TIME_H
#define _TIME_H
typedef long time_t;
typedef unsigned long clock_t;
#define CLOCKS_PER_SEC 60 // IIgs vsync tick (placeholder)
// Both are stubs in libc.c: time() stores/returns 0, clock() returns 0,
// so any timing math built on them will read as zero until a real
// clock source (e.g. GS/OS ReadTimeHex) is wired in.
time_t time(time_t *t);
clock_t clock(void);
#endif

95
runtime/src/crt0.s Normal file
View file

@ -0,0 +1,95 @@
; crt0 — C runtime startup for the W65816 backend.
;
; Entry point invoked by the loader (or the OMF dispatcher). Sets up
; the processor mode the rest of the runtime expects, zeroes BSS,
; calls main, and halts on return.
;
; Conventions:
; - Native mode (E=0), 16-bit M and X (REP #$30) on entry to main.
; - DP=0, DBR=0 — assumed by the C runtime.
; - Linker-emitted symbols: __bss_start, __bss_end (16-bit addrs).
.text
.globl __start
__start:
; Disable IRQ first — the IIgs ROM hands a vsync IRQ on every frame,
; and its handler runs in 8-bit M/X mode, corrupting our state if
; we leave I clear. SEI is fine in either emulation or native
; mode and is always 1 byte / 2 cycles.
sei
; Native mode + 16-bit registers.
clc
xce
rep #0x30
; Disable IIgs peripheral interrupt sources at the chip level —
; SEI alone leaves the hardware lines asserted, and the IRQ trap
; in ROM keeps re-firing if the source isn't quiesced.
sep #0x20
.byte 0xa9, 0x00 ; lda #$00 (8-bit M)
sta 0xc041 ; INTEN = 0 (clear AN3/mouse/0.25s/VBL/mouse-IRQ enables)
sta 0xc023 ; VGCINT = 0 (clear external/1-sec/scan-line IRQ enables)
sta 0xc032 ; SCANINT clear
rep #0x20
; Top-of-stack at $01FF (one bank). Loaders may already do this.
lda #0x01ff
tcs
; Zero BSS. X iterates from __bss_start to __bss_end; each
; iteration writes one byte of zero at addr X (via DP=0 +
; offset 0 — which is just X). Wraps in 8-bit M for the
; byte-store.
rep #0x10 ; ensure X is 16-bit
ldx #__bss_start
.Lbss_loop:
cpx #__bss_end
bcs .Lbss_done ; X >= end -> done
sep #0x20 ; 8-bit M for 1-byte store
; llvm-mc doesn't track SEP/REP — `lda #$0` after SEP gets
; encoded as a 3-byte 16-bit immediate, so the CPU reads
; `a9 00 00` = LDA #$00 then BRK. Force the 1-byte form
; with raw bytes.
.byte 0xa9, 0x00 ; lda #$00 (8-bit M imm)
sta 0x0, x ; *(uint8_t *)X = 0 (DP=0)
rep #0x20
inx
bra .Lbss_loop
.Lbss_done:
; Run static constructors. The linker emits
; __init_array_start / __init_array_end around the .init_array
; section; each entry is a 16-bit function pointer. Walk and
; JSL each via __jsl_indir.
rep #0x30 ; native, 16-bit M and X
ldx #__init_array_start
.Linit_loop:
cpx #__init_array_end
bcs .Linit_done
; __jsl_indir does `JMP (__indirTarget)` — reads a 16-bit ptr
; from __indirTarget and JMPs there. So __indirTarget must
; hold the function pointer itself (NOT the address of the
; init_array slot). Dereference the entry: ($E0)→A.
stx 0xe0 ; entry addr -> DP scratch
ldy #0
; llvm-mc parses `lda (0xe0), y` as `lda 0xe0, y` (absolute,Y);
; force the DP-indirect-Y opcode B1 with raw bytes.
.byte 0xb1, 0xe0 ; lda ($E0), y → A = mem[X]
sta __indirTarget ; __indirTarget = function pointer
phx ; preserve X across the call
jsl __jsl_indir
plx
inx ; entries are 2 bytes wide — advance X by two
inx
bra .Linit_loop
.Linit_done:
; Call main. Standard W65816 ABI: i16 first arg in A; we pass
; nothing. After return, A holds the exit code.
jsl main
; NOTE: atexit handlers and static destructors are never invoked —
; execution stops at the BRK below the moment main returns.
; Halt via BRK $00. MAME / debuggers catch this as a clean
; program termination.
.byte 0x00, 0x00
.size __start, . - __start

664
runtime/src/libc.c Normal file
View file

@ -0,0 +1,664 @@
// Minimal libc for the W65816 backend. Provides:
// string.h: memcpy, memset, memmove, memcmp, strlen, strcpy, strcmp,
// strncpy, strncmp, strchr, strrchr
// ctype.h: isdigit, isalpha, isalnum, isspace, isupper, islower,
// toupper, tolower, isxdigit, isprint, ispunct
// stdlib.h: abs, labs, atoi
//
// All functions are straightforward implementations using only
// integer ops. Each is short enough that internal conditional
// branches stay within 8-bit PCREL reach.
//
// Output goes (eventually) through a putchar stub that targets a
// memory-mapped IO port or a MAME-debug Lua hook; for now putchar
// is provided as a weak stub that does nothing.
// Local typedefs — this file deliberately avoids <stddef.h>.
typedef unsigned int size_t; // must match the runtime/include headers
typedef int ssize_t; // signed counterpart (unused below — TODO confirm callers)
typedef unsigned char u8; // byte view used by the mem*/str* routines
// ---- string.h ----
// Copy n bytes from src to dst; regions must not overlap. Returns dst.
void *memcpy(void *dst, const void *src, size_t n) {
    char *to = (char *)dst;
    const char *from = (const char *)src;
    for (size_t i = 0; i < n; i++) to[i] = from[i];
    return dst;
}
// Copy n bytes between possibly-overlapping regions. Returns dst.
void *memmove(void *dst, const void *src, size_t n) {
    char *d = (char *)dst;
    const char *s = (const char *)src;
    if (d < s) {
        // Destination is below source: a forward copy is safe.
        for (size_t i = 0; i < n; i++) d[i] = s[i];
    } else {
        // Copy backwards so an overlapping tail is read before clobbered.
        while (n) { n--; d[n] = s[n]; }
    }
    return dst;
}
// Fill n bytes at dst with (char)c. Returns dst.
void *memset(void *dst, int c, size_t n) {
    char *p = (char *)dst;
    for (size_t i = 0; i < n; i++) p[i] = (char)c;
    return dst;
}
// Compare n bytes; returns <0, 0 or >0 with unsigned byte ordering.
int memcmp(const void *a, const void *b, size_t n) {
    const unsigned char *p = (const unsigned char *)a;
    const unsigned char *q = (const unsigned char *)b;
    for (; n; n--, p++, q++) {
        if (*p != *q) return *p - *q;
    }
    return 0;
}
size_t strlen(const char *s) {
size_t n = 0;
while (*s++) n++;
return n;
}
char *strcpy(char *dst, const char *src) {
char *d = dst;
while ((*d++ = *src++)) {}
return dst;
}
// Copy at most n bytes of src into dst; if src is shorter than n the
// remainder of dst is zero-filled. As with standard strncpy, dst is
// NOT terminated when strlen(src) >= n. Returns dst.
char *strncpy(char *dst, const char *src, size_t n) {
    size_t i = 0;
    for (; i < n && src[i]; i++) dst[i] = src[i];
    for (; i < n; i++) dst[i] = 0;
    return dst;
}
int strcmp(const char *a, const char *b) {
while (*a && *a == *b) { a++; b++; }
return (u8)*a - (u8)*b;
}
// Compare at most n bytes with unsigned byte ordering; n == 0 is equal.
int strncmp(const char *a, const char *b, size_t n) {
    if (n == 0) return 0;
    while (--n && *a && *a == *b) { a++; b++; }
    return (unsigned char)*a - (unsigned char)*b;
}
// First occurrence of (char)c in s; searching for '\0' finds the
// terminator. Returns NULL when absent.
char *strchr(const char *s, int c) {
    const char target = (char)c;
    for (;; s++) {
        if (*s == target) return (char *)s;
        if (*s == '\0') return 0;
    }
}
// Last occurrence of (char)c in s; searching for '\0' finds the
// terminator. Returns NULL when absent.
char *strrchr(const char *s, int c) {
    char *last = 0;
    do {
        // The do-while includes the terminator, so c == 0 matches it.
        if (*s == (char)c) last = (char *)s;
    } while (*s++);
    return last;
}
// ---- ctype.h ----
// ASCII-only classification; no locale support.
int isdigit(int c) { return '0' <= c && c <= '9'; }
int isupper(int c) { return 'A' <= c && c <= 'Z'; }
int islower(int c) { return 'a' <= c && c <= 'z'; }
int isalpha(int c) { return islower(c) || isupper(c); }
int isalnum(int c) { return isdigit(c) || isalpha(c); }
int isspace(int c) {
    switch (c) {
    case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
        return 1;
    default:
        return 0;
    }
}
int isxdigit(int c) {
    return isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}
int isprint(int c) { return 0x20 <= c && c <= 0x7e; }
int ispunct(int c) { return isprint(c) && c != ' ' && !isalnum(c); }
// Case mapping: 'a'-'A' == 32 in ASCII; non-letters pass through.
int toupper(int c) { return islower(c) ? c - ('a' - 'A') : c; }
int tolower(int c) { return isupper(c) ? c + ('a' - 'A') : c; }
// ---- stdlib.h ----
int abs(int n) { return n >= 0 ? n : -n; }
long labs(long n) { return n >= 0 ? n : -n; }
// Parse a decimal integer: optional leading whitespace, optional sign,
// then digits. Stops at the first non-digit; no overflow detection
// (the usual atoi contract).
int atoi(const char *s) {
    while (*s == ' ' || *s == '\t' || *s == '\n' ||
           *s == '\r' || *s == '\v' || *s == '\f') s++;
    int neg = 0;
    if (*s == '+' || *s == '-') {
        neg = (*s == '-');
        s++;
    }
    int val = 0;
    while (*s >= '0' && *s <= '9') val = val * 10 + (*s++ - '0');
    return neg ? -val : val;
}
// ---- stdio.h essentials (stubs) ----
// putchar: by default, writes to direct-page slot $E2 (which the
// emulator harness can poll). Real targets (MAME with our IIgs
// glue, or a console emulator) override this with a strong
// definition. Marked `weak` so users can replace it.
__attribute__((weak))
int putchar(int c) {
// Raw poke — only meaningful on the target, where address $E2 is
// DP scratch; on a hosted OS this would fault.
*(volatile char *)0xE2 = (char)c;
return c;
}
// puts: writes s then a newline via putchar. Always returns 0
// (the standard only requires a non-negative value on success).
int puts(const char *s) {
while (*s) { putchar(*s); s++; }
putchar('\n');
return 0;
}
// ---- minimal printf ----
// Forward-declared because varargs use stdarg.h's __builtin_va_list,
// but our libc doesn't include stdarg.h yet — clang's built-in
// va_arg/va_start/va_end work without an explicit include on most
// targets. Re-declare the types/macros locally to avoid including
// the system header (which would pull in target-specific quirks).
// NOTE: these must stay layout-compatible with <stdarg.h> in
// runtime/include, since user code formats through the same ABI.
typedef __builtin_va_list va_list;
#define va_start(ap, last) __builtin_va_start(ap, last)
#define va_arg(ap, ty) __builtin_va_arg(ap, ty)
#define va_end(ap) __builtin_va_end(ap)
// Print n in unsigned decimal via putchar (no padding).
static void writeUDec(unsigned int n) {
    char digits[6]; // 16-bit unsigned: at most 5 digits (+ slack)
    int len = 0;
    if (!n) { putchar('0'); return; }
    do { digits[len++] = (char)('0' + n % 10); n /= 10; } while (n);
    while (len) putchar(digits[--len]);
}
// Print a signed int in decimal: sign first, then the magnitude.
static void writeDec(int n) {
    unsigned int mag = (unsigned int)n;
    if (n < 0) {
        putchar('-');
        mag = (unsigned int)(-n);
    }
    writeUDec(mag);
}
// Print n in unsigned decimal via putchar (32-bit wide variant).
static void writeULong(unsigned long n) {
    char digits[11]; // 32-bit unsigned: at most 10 digits (+ slack)
    int len = 0;
    if (!n) { putchar('0'); return; }
    do { digits[len++] = (char)('0' + (n % 10)); n /= 10; } while (n);
    while (len) putchar(digits[--len]);
}
// Print n as lowercase hex via putchar, zero-padded on the left to
// `width` digits. Width comes straight from a user "%Nx" format, so it
// is clamped to the buffer capacity — the previous version padded into
// a 5-byte buffer with no bound, so e.g. "%6x" smashed the stack.
static void writeHex(unsigned int n, int width) {
    static const char digits[] = "0123456789abcdef";
    char buf[8];                     // 8 digits covers a 32-bit value
    int i = 0;
    if (n == 0) { buf[i++] = '0'; }
    while (n > 0 && i < 8) { buf[i++] = digits[n & 0xF]; n >>= 4; }
    if (width > 8) width = 8;        // clamp user-supplied pad width
    while (i < width) buf[i++] = '0';
    while (i > 0) putchar(buf[--i]);
}
static void writeStr(const char *s) {
if (!s) s = "(null)";
while (*s) { putchar(*s); s++; }
}
// Each format-spec handler is its own function so vprintf's main loop
// stays small (avoids the W65816 backend's long-branch limitation
// which fails to relax conditional branches > 128 bytes; nesting all
// the format handlers inline produced functions whose internal Bxx
// targets exceeded that range).
__attribute__((noinline))
static void writeSignedLong(long n) {
if (n < 0) { putchar('-'); writeULong((unsigned long)(-n)); }
else writeULong((unsigned long)n);
}
// Minimal %f / %g support. Uses double soft-float; precision capped
// at 6 fractional digits (the C default). Doesn't handle Inf/NaN
// specially — prints the integer extraction, which will be 0 for
// non-finite values. Not IEEE-precise (intermediate truncation in
// the soft-double mul/div), but good enough for typical formatted
// numeric output.
__attribute__((noinline))
static void writeDouble(double v, int prec) {
// prec < 0 means "unspecified" from vprintf; apply the C default of 6
// and cap at 9 so 10^prec fits a 32-bit long.
if (prec < 0) prec = 6;
if (prec > 9) prec = 9;
if (v < 0) { putchar('-'); v = -v; }
long ipart = (long)v;
writeULong((unsigned long)ipart);
if (prec == 0) return;
putchar('.');
double frac = v - (double)ipart;
// Multiply fraction by 10^prec, then print as integer with leading zeros.
long mul = 1;
for (int i = 0; i < prec; i++) mul *= 10;
long fdigits = (long)(frac * (double)mul);
if (fdigits < 0) fdigits = -fdigits;
char buf[10];
int n = 0;
long scale = mul / 10;
// Emit most-significant fractional digit first; `scale` walks down
// the powers of ten so leading zeros in the fraction are preserved.
while (n < prec) {
if (scale == 0) scale = 1;
long d = fdigits / scale;
buf[n++] = '0' + (char)(d % 10);
scale /= 10;
if (scale == 0) break;
}
while (n < prec) buf[n++] = '0';
for (int i = 0; i < n; i++) putchar(buf[i]);
}
// Minimal vprintf: supports %d/%i/%u (with 'l'), %x/%X (width, no 'l'),
// %c, %s, %f-family, %p and %%. Returns the number of literal chars
// plus conversions processed (NOT the number of chars written — callers
// relying on the standard return value should not trust it).
//
// Fix vs. the previous version: a format string ending in a bare '%'
// ("abc%") used to read the terminating NUL as the spec AND advance
// `fmt` past it, so the outer while(*fmt) then scanned out of bounds.
// We now stop cleanly at the terminator.
int vprintf(const char *fmt, va_list ap) {
    int count = 0;
    while (*fmt) {
        char c = *fmt++;
        if (c != '%') { putchar(c); count++; continue; }
        // Optional width (honoured for %x and %f).
        int width = 0;
        while (*fmt >= '0' && *fmt <= '9') {
            width = width * 10 + (*fmt - '0');
            fmt++;
        }
        // Optional precision (.N) — used by %f.
        int prec = -1;
        if (*fmt == '.') {
            fmt++;
            prec = 0;
            while (*fmt >= '0' && *fmt <= '9') {
                prec = prec * 10 + (*fmt - '0');
                fmt++;
            }
        }
        int isLong = 0;
        if (*fmt == 'l') { isLong = 1; fmt++; }
        // A trailing '%' with no spec: stop without walking past the NUL.
        char spec = *fmt;
        if (spec == '\0') break;
        fmt++;
        if (spec == 'd' || spec == 'i') {
            if (isLong) writeSignedLong(va_arg(ap, long));
            else writeDec(va_arg(ap, int));
        } else if (spec == 'u') {
            if (isLong) writeULong(va_arg(ap, unsigned long));
            else writeUDec(va_arg(ap, unsigned int));
        } else if (spec == 'x' || spec == 'X') {
            // NOTE: the 'l' modifier is ignored for hex output.
            writeHex(va_arg(ap, unsigned int), width);
        } else if (spec == 'c') {
            putchar(va_arg(ap, int));
        } else if (spec == 's') {
            writeStr(va_arg(ap, const char *));
        } else if (spec == 'f' || spec == 'F' ||
                   spec == 'g' || spec == 'G' ||
                   spec == 'e' || spec == 'E') {
            writeDouble(va_arg(ap, double), prec);
        } else if (spec == 'p') {
            putchar('0'); putchar('x');
            writeHex(va_arg(ap, unsigned int), 4);
        } else if (spec == '%') {
            putchar('%');
        } else {
            // Unknown spec: echo it so the mistake is visible in output.
            putchar('%'); putchar(spec);
        }
        count++;
    }
    return count;
}
// printf — varargs front-end; all formatting is delegated to vprintf.
int printf(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    const int written = vprintf(fmt, ap);
    va_end(ap);
    return written;
}
// ---- additional string.h ----
// First occurrence of (unsigned char)c in the first n bytes of s;
// NULL when absent.
void *memchr(const void *s, int c, size_t n) {
    const unsigned char *p = (const unsigned char *)s;
    for (size_t i = 0; i < n; i++) {
        if (p[i] == (unsigned char)c) return (void *)(p + i);
    }
    return 0;
}
// First occurrence of needle in haystack (naive scan); an empty needle
// matches at the start. Returns NULL when absent.
char *strstr(const char *haystack, const char *needle) {
    if (*needle == '\0') return (char *)haystack;
    for (; *haystack; haystack++) {
        const char *h = haystack;
        const char *q = needle;
        while (*q && *h == *q) { h++; q++; }
        if (*q == '\0') return (char *)haystack;
    }
    return 0;
}
// ---- malloc/free — first-fit allocator with coalescing-on-free ----
//
// Heap lives between the static-data top (linker-supplied __heap_start)
// and a soft cap. Each allocated block is preceded by a 2-byte header
// holding the block's payload size in bytes. Free blocks add a 2-byte
// "next" pointer after the size, forming a singly-linked free list.
//
// malloc: first-fit walk of the free list; split the chosen block when
// the remainder is large enough to host its own header+next.
// free: insert onto the head of the free list, then coalesce with any
// adjacent free blocks (forward and backward via free-list scan).
//
// The bump fallback (top of heap) is used when the free list has no
// suitable block.
// Linker-supplied weak symbols; fallback to fixed defaults so a static
// link without crt0 still has SOMETHING.
extern char __heap_start[] __attribute__((weak));
extern char __heap_end[] __attribute__((weak));
#define HEAP_DEFAULT_START ((char *)0x4000)
#define HEAP_DEFAULT_END ((char *)0xBF00)
typedef struct FreeBlk {
size_t size; // payload size, NOT including header
struct FreeBlk *next; // valid only while in the free list
} FreeBlk;
// HDR_SZ hard-codes the 2-byte size_t of this target; keep in sync
// with the `typedef unsigned int size_t` at the top of the file.
#define HDR_SZ ((size_t)2) // sizeof(size_t) only
#define FREE_NODE_SZ ((size_t)4) // size + next ptr
#define MIN_SPLIT ((size_t)(FREE_NODE_SZ + 2)) // 6 bytes
static FreeBlk *freeList = (FreeBlk *)0;
static char *bumpPtr = (char *)0;
static char *heapEnd = (char *)0;
// Use the bumpPtr nonzero-ness as the "initialized" flag — sidesteps
// an i1-narrowing isel bug on a dedicated bool flag.
// (Do not "simplify" this to a bool; the shape is deliberate.)
static void mallocInitOnce(void) {
if (bumpPtr) return;
// Weak arrays resolve to address 0 when the linker doesn't define
// them, so the ternaries select the fixed defaults in that case.
bumpPtr = __heap_start ? __heap_start : HEAP_DEFAULT_START;
heapEnd = __heap_end ? __heap_end : HEAP_DEFAULT_END;
freeList = (FreeBlk *)0;
}
// Allocate n bytes (rounded up to 2-byte alignment). First-fit over the
// free list, then bump allocation; returns NULL when neither fits. The
// returned payload is preceded by a 2-byte size header (HDR_SZ).
void *malloc(size_t n) {
mallocInitOnce();
if (n == 0) n = 1;
n = (n + 1) & ~(size_t)1; // round up to 2 bytes
if (n < FREE_NODE_SZ - HDR_SZ)
n = FREE_NODE_SZ - HDR_SZ; // ensure freed block can hold next-ptr
// First-fit on free list.
FreeBlk **link = &freeList;
FreeBlk *cur = freeList;
while (cur) {
if (cur->size >= n) {
// Split if there's room for a separate free block.
if (cur->size >= n + MIN_SPLIT) {
size_t rem = cur->size - n - HDR_SZ;
FreeBlk *tail = (FreeBlk *)((char *)cur + HDR_SZ + n);
tail->size = rem;
tail->next = cur->next;
cur->size = n;
*link = tail; // tail replaces cur in the free list
} else {
*link = cur->next; // take the whole block
}
return (char *)cur + HDR_SZ;
}
link = &cur->next;
cur = cur->next;
}
// Bump-allocate from the high end.
char *p = bumpPtr;
if (p + HDR_SZ + n > heapEnd) return (void *)0;
*(size_t *)p = n; // write the payload-size header
bumpPtr = p + HDR_SZ + n;
return p + HDR_SZ;
}
// Return a malloc'd block to the free list, then opportunistically merge
// adjacent free blocks. Passing NULL is a no-op; p must have come from
// this allocator (the size header is read from p - HDR_SZ).
void free(void *p) {
if (!p) return;
FreeBlk *blk = (FreeBlk *)((char *)p - HDR_SZ);
blk->next = freeList;
freeList = blk;
// Coalesce: walk the free list and merge adjacent blocks. O(n^2)
// in the worst case but n is small in practice.
FreeBlk *a = freeList;
while (a) {
// `link` always points at the slot holding `b`, so unlinking a
// merged `b` is a single store through it.
FreeBlk **link = &a->next;
FreeBlk *b = a->next;
while (b) {
char *aEnd = (char *)a + HDR_SZ + a->size;
char *bEnd = (char *)b + HDR_SZ + b->size;
if (aEnd == (char *)b) {
a->size += HDR_SZ + b->size;
*link = b->next;
b = *link;
continue;
}
if (bEnd == (char *)a) {
b->size += HDR_SZ + a->size;
// Remove `a` from the list (a is freeList head if first).
// Simpler: relink b in place of a, but a is at top.
// For correctness, just skip — coalesce on next pass.
link = &b->next;
b = b->next;
continue;
}
link = &b->next;
b = b->next;
}
a = a->next;
}
}
// Allocate nmemb*size bytes, zero-filled. Returns NULL on allocation
// failure OR when the byte count overflows size_t — the previous
// version multiplied unchecked, so a request like calloc(0x5000, 0x10)
// on the 16-bit size_t silently wrapped to a tiny allocation.
void *calloc(size_t nmemb, size_t size) {
    if (size != 0 && nmemb > (size_t)-1 / size) return (void *)0;
    size_t total = nmemb * size;
    void *p = malloc(total);
    if (p) memset(p, 0, total);
    return p;
}
// Resize a malloc'd block. NULL ptr behaves as malloc(n); n == 0 frees
// and returns NULL. Shrinking returns ptr unchanged (no split); growing
// allocates fresh storage, copies the OLD payload size, then frees ptr.
// ptr must have come from this allocator (header read at ptr - HDR_SZ).
void *realloc(void *ptr, size_t n) {
if (!ptr) return malloc(n);
if (n == 0) { free(ptr); return (void *)0; }
size_t old = *(size_t *)((char *)ptr - HDR_SZ);
if (n <= old) return ptr;
void *q = malloc(n);
if (!q) return (void *)0; // original block left intact on failure
memcpy(q, ptr, old);
free(ptr);
return q;
}
// ---- exit ----
//
// Standard exit() halts via BRK. Programs running under the IIgs
// runtime typically would call back into GS/OS Quit; here we just
// wedge the CPU.
// NOTE(review): handlers registered with atexit() below are never
// invoked, and streams are not flushed — this exit only halts.
void exit(int code) {
(void)code; // exit status is discarded; only the BRK matters here
// BRK $00 — halts a 65816 in BRK, MAME's debugger catches.
__asm__ volatile (".byte 0x00, 0x00");
while (1) {} // unreachable
}
// ---- errno ----
//
// Single global errno cell. Library functions that want to report a
// failure code write here. The `errno` macro in <errno.h> expands to
// `(*__errno_location())` — we provide that for source compatibility,
// but most code can just touch `errno` directly.
// NOTE(review): the shipped <errno.h> actually declares `extern int
// errno` (no macro) — confirm which form is intended.
int errno = 0;
int *__errno_location(void) { return &errno; }
// Map an errno value to a human-readable message. The strings have
// static storage; callers must not modify or free them. Unrecognized
// codes map to "Unknown error".
char *strerror(int err) {
    static const struct { int code; const char *msg; } kMessages[] = {
        { 0, "Success" },
        { 1, "Operation not permitted" },
        { 2, "No such file or directory" },
        { 5, "Input/output error" },
        { 9, "Bad file descriptor" },
        { 12, "Out of memory" },
        { 13, "Permission denied" },
        { 22, "Invalid argument" },
        { 28, "No space left on device" },
    };
    for (unsigned i = 0; i < sizeof(kMessages) / sizeof(kMessages[0]); i++) {
        if (kMessages[i].code == err) return (char *)kMessages[i].msg;
    }
    return (char *)"Unknown error";
}
// ---- time.h ----
//
// W65816/IIgs has no standard clock from C's perspective. Provide
// stubs that return 0 / -1 so code that calls time() at least links.
// A real implementation would call ReadTimeHex (GS/OS toolbox) or
// poll the IIgs real-time clock.
typedef long time_t;
typedef unsigned long clock_t;
// Stub: always reports the epoch (0); also stores 0 through t if given.
time_t time(time_t *t) {
if (t) *t = 0;
return 0;
}
// Stub: always reports zero elapsed ticks.
clock_t clock(void) {
return (clock_t)0;
}
// ---- FILE* abstraction (minimal) ----
//
// stdin / stdout / stderr exist as opaque non-NULL pointers. fputs /
// fputc forward to puts/putchar (which currently no-op or hit a debug
// hook). fprintf forwards to printf, ignoring the stream. fflush is
// a no-op. Real file I/O via GS/OS toolbox is a separate feature
// (would need open/read/write/close + a file-descriptor table).
// `magic` distinguishes the three pre-baked streams (1/2/3); fopen'd
// streams would use the FOPEN_MAGIC_BASE scheme further below.
typedef struct __sFILE { unsigned int magic; } FILE;
static FILE __stdin_obj = { 1 };
static FILE __stdout_obj = { 2 };
static FILE __stderr_obj = { 3 };
FILE *stdin = &__stdin_obj;
FILE *stdout = &__stdout_obj;
FILE *stderr = &__stderr_obj;
// Stream-parameter variants simply drop the stream and forward.
// NOTE: fputs returns puts' value (0), not the character count.
int fputc(int c, FILE *stream) { (void)stream; return putchar(c); }
int fputs(const char *s, FILE *stream) { (void)stream; return puts(s); }
int fflush(FILE *stream) { (void)stream; return 0; }
int fclose(FILE *stream) { (void)stream; return 0; }
// fprintf: formats via vprintf regardless of stream. Uses the
// __builtin_va_* forms directly (same expansion as the macros above).
int fprintf(FILE *stream, const char *fmt, ...) {
(void)stream;
va_list ap;
__builtin_va_start(ap, fmt);
int r = vprintf(fmt, ap);
__builtin_va_end(ap);
return r;
}
// ---- assert ----
//
// __assert_fail is what most assert() macros call. Print a message
// (if we have stderr) and exit.
void __assert_fail(const char *expr, const char *file, unsigned int line,
const char *func) {
fprintf(stderr, "%s:%u: %s: Assertion `%s' failed.\n",
file, line, func, expr);
exit(1);
}
// ---- abort ----
// Terminates via exit(127); no signal machinery exists on this target.
void abort(void) {
exit(127);
}
// ---- atexit (bounded table) ----
// Generalized from the previous single-slot stub to a small fixed table,
// so more than one registration succeeds. Returns 0 on success, -1 when
// the table is full (C permits rejecting registrations beyond a limit).
// NOTE: this minimal runtime's exit() does not yet invoke the recorded
// handlers — registration is bookkeeping only for now.
typedef void (*AtexitFn)(void);
#define ATEXIT_MAX 8
static AtexitFn __atexitFns[ATEXIT_MAX];
static int __atexitCount = 0;
int atexit(AtexitFn fn) {
    if (__atexitCount >= ATEXIT_MAX) return -1;
    __atexitFns[__atexitCount++] = fn;
    return 0;
}
// ---- File I/O via GS/OS toolbox calls ----
//
// On a real Apple IIgs running GS/OS, these route through the GS/OS
// dispatcher at $E100A8. When running outside GS/OS (e.g., bare
// MAME tests), every call returns failure so user code degrades
// gracefully instead of trapping.
//
// Pclass-1 parameter blocks are stack-allocated as packed structs
// matching the GS/OS class-1 layout; we pass the block's pointer
// and call number to a single helper.
typedef unsigned long u32_t;
typedef unsigned int u16_t;
typedef int s16_t;
// File descriptor table: fopen returns a FILE* whose 'magic' field
// holds (u16)refNum + 0x8000 — distinguishing real fds from the
// pre-baked stdin/stdout/stderr.
#define FOPEN_MAGIC_BASE 0x8000
// Static table of refNum-bearing FILE objects. 16 simultaneous opens.
#define MAX_OPEN_FDS 16
static FILE __fds[MAX_OPEN_FDS];
static unsigned char __fdInUse[MAX_OPEN_FDS];
// GS/OS call helper. Invokes the dispatcher with X=callNum, A=parmsLow,
// PHA before JSL pushes A as the parmblock pointer. Returns the toolerror
// code (0 = success). Inline asm; calls into bank E1.
// NOTE(review): the `pha`/`phx` pushed on entry are never pulled before
// the asm ends, and nothing is currently calling this helper — audit
// the stack discipline against the GS/OS dispatcher contract before
// wiring it into fopen/fread/fwrite.
static inline u16_t __gsosCall(u16_t callNum, void *parms) {
u16_t err;
__asm__ volatile (
"pha\n"
"phx\n" // we'd push the parm-block ptr, but...
"ldx %1\n"
"lda %2\n"
"pha\n"
"jsl 0xe100a8\n"
"sta %0\n"
: "=r"(err)
: "r"(callNum), "r"(parms)
: "x", "y", "memory"
);
return err;
}
// Stub fopen: try GS/OS Open ($2010) — but we don't have parm-block
// definitions wired here. For now, return NULL (failure). A full
// implementation would build an Open_GSOSp class-1 block, fill in
// pathname (Pascal string), requestAccess, etc., call __gsosCall,
// then copy refNum out.
FILE *fopen(const char *path, const char *mode) {
(void)path; (void)mode;
return (FILE *)0;
}
// All remaining stream ops are failure/no-op stubs so callers degrade
// gracefully: reads/writes report 0 items, seeks fail (-1), ftell is
// unknown (-1L), feof claims end-of-file, ferror reports no error.
unsigned int fread(void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
(void)ptr; (void)size; (void)nmemb; (void)stream;
return 0;
}
unsigned int fwrite(const void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
(void)ptr; (void)size; (void)nmemb; (void)stream;
return 0;
}
int fseek(FILE *stream, long offset, int whence) {
(void)stream; (void)offset; (void)whence;
return -1;
}
long ftell(FILE *stream) {
(void)stream;
return -1L;
}
int feof(FILE *stream) { (void)stream; return 1; }
int ferror(FILE *stream) { (void)stream; return 0; }
void clearerr(FILE *stream) { (void)stream; }

View file

@ -638,3 +638,543 @@ __divmodsi_setup:
sta 0xe6
.Lsetsi_b_pos:
rts
; ====================================================================
; i64 (long long) helpers.
;
; Calling convention (i64 first arg is split via i32-first-arg path):
; A = arg0_lo[0..15] (lowest word)
; X = arg0_lo[16..31]
; 4,S = arg0_hi[0..15]
; 6,S = arg0_hi[16..31] (highest word)
; For binary ops (mul/div/mod), arg1 follows on the stack:
; 8,S = arg1_lo[0..15]
; 10,S = arg1_lo[16..31]
; 12,S = arg1_hi[0..15]
; 14,S = arg1_hi[16..31]
; For shift ops, the count occupies a single i16 at 8,S.
;
; Return ABI (matches LowerReturn for i64):
; A = result_lo[0..15]
; X = result_lo[16..31]
; Y = result_hi[0..15]
; DP $F0..$F1 = result_hi[16..31]
;
; Scratch DP layout (per-libcall, no overlap between concurrent calls):
; $E0..$E7 = a (8 bytes; 4 16-bit words)
; $E8..$EF = b OR product (8 bytes)
;
; All routines run with REP #$30 (M=0, X=0).
; ====================================================================
; --------------------------------------------------------------------
; __divmoddi4_stash — common entry point. Stashes a -> $E0..$E7,
; b -> $E8..$EF. Used by __udivdi3 / __umoddi3 / __divdi3 / __moddi3
; setup; signed variants flip signs around it.
;
; Reached via JSR from the libcall bodies, which are themselves
; entered via JSL. Stack layout inside this routine is therefore:
;   1,S..2,S = JSR return address (pushed by our caller)
;   3,S..5,S = JSL return address (pushed by the libcall's caller)
;   6,S..    = the libcall's stack arguments (4,S at libcall entry)
; The previous revision read the arguments at 4,S..E,S, which picked
; up two bytes of the JSL return address — off by the 2 bytes the
; JSR pushed.
; --------------------------------------------------------------------
__divmoddi4_stash:
	sta 0xe0 ; a_lo_lo (i64 arg0 low words arrive in A:X)
	stx 0xe2 ; a_lo_hi
	lda 0x6, s ; = 4,S at libcall entry: a_hi_lo
	sta 0xe4
	lda 0x8, s ; a_hi_hi
	sta 0xe6
	lda 0xa, s ; b_lo_lo
	sta 0xe8
	lda 0xc, s ; b_lo_hi
	sta 0xea
	lda 0xe, s ; b_hi_lo
	sta 0xec
	lda 0x10, s ; b_hi_hi
	sta 0xee
	rts
; --------------------------------------------------------------------
; __retdi — pack the i64 result at $E0..$E7 into the return ABI
; (A = lo word, X = next, Y = next, DP $F0..$F1 = high word) and
; return to the original JSL caller with RTL. Reached via BRL from
; the libcall bodies, so the RTL here unwinds the libcall's own
; frame. Clobbers A, X, Y.
; --------------------------------------------------------------------
__retdi:
	lda 0xe6
	sta 0xf0 ; result_hi[16..31] -> DP $F0
	lda 0xe4
	tay ; result_hi[0..15] -> Y
	lda 0xe2
	tax ; result_lo[16..31] -> X
	lda 0xe0 ; result_lo[0..15] -> A
	rtl
; --------------------------------------------------------------------
; __ashldi3 — i64 left shift by n. Per-bit loop; the count arrives as
; an i16 at 8,S (this routine makes no JSR before the reads, so the
; libcall-entry offsets apply directly). A count of 0 returns the
; input unchanged; counts >= 64 walk every bit out and return 0.
; --------------------------------------------------------------------
.globl __ashldi3
__ashldi3:
	sta 0xe0 ; a[0..15]
	stx 0xe2 ; a[16..31]
	lda 0x4, s
	sta 0xe4 ; a[32..47]
	lda 0x6, s
	sta 0xe6 ; a[48..63]
	lda 0x8, s
	tay ; Y = count
.Lashldi_loop:
	cpy #0x0
	beq .Lashldi_done
	asl 0xe0 ; shift the 64-bit value left one bit,
	rol 0xe2 ; carrying across the four 16-bit words
	rol 0xe4
	rol 0xe6
	dey
	bra .Lashldi_loop
.Lashldi_done:
	brl __retdi ; pack $E0..$E7 into A:X:Y:$F0 and RTL
; --------------------------------------------------------------------
; __lshrdi3 — i64 logical right shift by n (count at 8,S, held in Y).
; LSR the top word (zero enters bit 63), ROR the rest so the carry
; ripples down through the lower words.
; --------------------------------------------------------------------
.globl __lshrdi3
__lshrdi3:
	sta 0xe0 ; a[0..15]
	stx 0xe2 ; a[16..31]
	lda 0x4, s
	sta 0xe4 ; a[32..47]
	lda 0x6, s
	sta 0xe6 ; a[48..63]
	lda 0x8, s
	tay ; Y = count
.Llshrdi_loop:
	cpy #0x0
	beq .Llshrdi_done
	lsr 0xe6 ; 0 -> bit 63, old bit 48 -> C
	ror 0xe4
	ror 0xe2
	ror 0xe0 ; low bit falls off the end
	dey
	bra .Llshrdi_loop
.Llshrdi_done:
	brl __retdi
; --------------------------------------------------------------------
; __ashrdi3 — i64 arithmetic right shift by n. Like __lshrdi3, except
; the sign bit must be replicated into the vacated top position: each
; iteration copies bit 15 of the top word into C with "ASL A" on a
; scratch copy, then RORs the whole 64-bit value so the sign re-enters
; at bit 63.
; --------------------------------------------------------------------
.globl __ashrdi3
__ashrdi3:
	sta 0xe0 ; a[0..15]
	stx 0xe2 ; a[16..31]
	lda 0x4, s
	sta 0xe4 ; a[32..47]
	lda 0x6, s
	sta 0xe6 ; a[48..63]
	lda 0x8, s
	tay ; Y = count
.Lashrdi_loop:
	cpy #0x0
	beq .Lashrdi_done
	; Copy the sign (bit 15 of $E6) into C, then rotate the whole
	; value right one bit: net effect on $E6 is an arithmetic shift
	; (sign preserved), and the carry chain feeds $E4..$E0.
	lda 0xe6
	asl a ; C = sign bit (A itself is discarded)
	ror 0xe6 ; (sign << 15) | ($E6 >> 1)
	ror 0xe4
	ror 0xe2
	ror 0xe0
	dey
	bra .Lashrdi_loop
.Lashrdi_done:
	brl __retdi
; --------------------------------------------------------------------
; __muldi3 — i64 multiply (low 64 bits of the 64x64 product).
; Shift-and-add over all 64 bits of a. The product accumulates in the
; four words at $F2/$F4/$F6/$F8 (bytes $F2..$F9) — above the $F0
; return slot — because $E0..$EF still holds the two operands.
; --------------------------------------------------------------------
.globl __muldi3
__muldi3:
	jsr __divmoddi4_stash ; a -> $E0..$E7, b -> $E8..$EF
	; Clear the product accumulator.
	lda #0x0
	sta 0xf2
	sta 0xf4
	sta 0xf6
	sta 0xf8
	; Loop once per bit of a.
	ldy #0x40
.Lmuldi_loop:
	; Shift a right one bit; C = the bit just shifted out (old bit 0).
	lda 0xe0
	lsr a
	sta 0xe0
	lda 0xe2
	ror a
	sta 0xe2
	lda 0xe4
	ror a
	sta 0xe4
	lda 0xe6
	ror a
	sta 0xe6
	bcc .Lmuldi_noadd
	; Bit was set: add the (progressively shifted) b into the product.
	clc
	lda 0xf2
	adc 0xe8
	sta 0xf2
	lda 0xf4
	adc 0xea
	sta 0xf4
	lda 0xf6
	adc 0xec
	sta 0xf6
	lda 0xf8
	adc 0xee
	sta 0xf8
.Lmuldi_noadd:
	; Double b so the next iteration adds at the next bit position.
	asl 0xe8
	rol 0xea
	rol 0xec
	rol 0xee
	dey
	bne .Lmuldi_loop
	; Copy the product into the return slots and exit via __retdi.
	lda 0xf2
	sta 0xe0
	lda 0xf4
	sta 0xe2
	lda 0xf6
	sta 0xe4
	lda 0xf8
	sta 0xe6
	brl __retdi
; --------------------------------------------------------------------
; __ucmpdi2 — unsigned i64 compare. Returns 0 if a<b, 1 if a==b,
; 2 if a>b (libgcc convention). i16 result in A.
; --------------------------------------------------------------------
.globl __ucmpdi2
__ucmpdi2:
	; Stash a/b in DP first so we compare from a stable layout.
	jsr __divmoddi4_stash
	; Compare word by word from the most significant down; the first
	; unequal word decides.
	lda 0xe6
	cmp 0xee ; a_hi_hi vs b_hi_hi
	bne .Lucmpdi_decided
	lda 0xe4
	cmp 0xec
	bne .Lucmpdi_decided
	lda 0xe2
	cmp 0xea
	bne .Lucmpdi_decided
	lda 0xe0
	cmp 0xe8
	bne .Lucmpdi_decided
	; All words equal.
	lda #0x1
	rtl
.Lucmpdi_decided:
	; CMP leaves C clear on borrow: a < b -> 0; otherwise a > b -> 2.
	bcc .Lucmpdi_lt
	lda #0x2
	rtl
.Lucmpdi_lt:
	lda #0x0
	rtl
; --------------------------------------------------------------------
; __cmpdi2 — signed i64 compare, same {0,1,2} return convention.
; Implemented by XORing $8000 into both high words, which maps signed
; order onto unsigned order, then comparing unsigned as in __ucmpdi2.
; --------------------------------------------------------------------
.globl __cmpdi2
__cmpdi2:
	jsr __divmoddi4_stash
	; Bias the sign bits of both operands' top words.
	lda 0xe6
	eor #0x8000
	sta 0xe6
	lda 0xee
	eor #0x8000
	sta 0xee
	; Unsigned compare on the rewritten values, MSW first.
	lda 0xe6
	cmp 0xee
	bne .Lcmpdi_decided
	lda 0xe4
	cmp 0xec
	bne .Lcmpdi_decided
	lda 0xe2
	cmp 0xea
	bne .Lcmpdi_decided
	lda 0xe0
	cmp 0xe8
	bne .Lcmpdi_decided
	lda #0x1 ; equal
	rtl
.Lcmpdi_decided:
	bcc .Lcmpdi_lt ; borrow -> a < b
	lda #0x2
	rtl
.Lcmpdi_lt:
	lda #0x0
	rtl
; --------------------------------------------------------------------
; __udivdi3 / __umoddi3 — unsigned 64-bit divide / modulo. Restoring
; division: shift the dividend left into a remainder register and
; conditionally subtract the divisor. The two libcalls share the
; core; the quotient lands at $E0..$E7 and the remainder at $F2..$F9.
; __udivdi3 returns the quotient directly; __umoddi3 copies the
; remainder into the return slots first. (No selector flag is
; involved, despite what an older comment claimed.)
; --------------------------------------------------------------------
.globl __udivdi3
__udivdi3:
	jsr __divmoddi4_stash
	jsr __udivmoddi_core
	brl __retdi ; quotient already at $E0..$E7
.globl __umoddi3
__umoddi3:
	jsr __divmoddi4_stash
	jsr __udivmoddi_core
	; Move remainder ($F2..$F8) -> $E0..$E7 for return.
	lda 0xf2
	sta 0xe0
	lda 0xf4
	sta 0xe2
	lda 0xf6
	sta 0xe4
	lda 0xf8
	sta 0xe6
	brl __retdi
; Core: dividend at $E0..$E6, divisor at $E8..$EE.
; Output: quotient at $E0..$E6, remainder at $F2..$F8.
; Scratch: $FA..$FF hold the tentative remainder-minus-divisor, so
; callers must not keep live data there across this routine.
__udivmoddi_core:
	; Clear the remainder.
	lda #0x0
	sta 0xf2
	sta 0xf4
	sta 0xf6
	sta 0xf8
	ldy #0x40 ; 64 quotient bits
.Ludivmoddi_loop:
	; Shift dividend (which becomes the quotient) and remainder left
	; together as one 128-bit register; the bit leaving the dividend's
	; top enters the remainder's bottom.
	asl 0xe0
	rol 0xe2
	rol 0xe4
	rol 0xe6
	rol 0xf2
	rol 0xf4
	rol 0xf6
	rol 0xf8
	; Try remainder - divisor. If no borrow, accept and set quotient bit.
	sec
	lda 0xf2
	sbc 0xe8
	sta 0xfa ; tentative subtract result at $FA..$FF
	lda 0xf4
	sbc 0xea
	sta 0xfc
	lda 0xf6
	sbc 0xec
	sta 0xfe
	lda 0xf8
	sbc 0xee
	; A holds the new high word; C = !borrow.
	bcc .Ludivmoddi_skip
	; Accept: remainder -= divisor; quotient bit 0 = 1.
	sta 0xf8
	lda 0xfe
	sta 0xf6
	lda 0xfc
	sta 0xf4
	lda 0xfa
	sta 0xf2
	; The left shift above vacated bit 0 of the dividend; set it.
	lda 0xe0
	ora #0x1
	sta 0xe0
.Ludivmoddi_skip:
	dey
	bne .Ludivmoddi_loop
	rts
; --------------------------------------------------------------------
; __divdi3 / __moddi3 — signed 64-bit divide / modulo. Take absolute
; values, run the unsigned core, fix up the sign.
; div: sign(quotient) = sign(a) XOR sign(b)
; mod: sign(remainder) = sign(a)
; --------------------------------------------------------------------
.globl __divdi3
__divdi3:
	jsr __divmoddi4_stash
	; Quotient sign = sign(a) XOR sign(b). Keep it on the STACK: the
	; unsigned core uses $FA..$FF as tentative-subtract scratch, so a
	; DP cell there (as in the previous revision) gets clobbered.
	lda 0xe6
	eor 0xee
	and #0x8000
	pha
	jsr __absdi_a ; |a|
	jsr __absdi_b ; |b|
	jsr __udivmoddi_core ; quotient -> $E0..$E7
	pla ; recover sign word; PLA sets Z
	beq .Ldivdi_pos
	jsr __negdi_a ; negative quotient
.Ldivdi_pos:
	brl __retdi
.globl __moddi3
__moddi3:
	jsr __divmoddi4_stash
	; Remainder sign = sign(a). Saved on the STACK because the
	; unsigned core scribbles over $FA..$FF (tentative-subtract
	; scratch), which is where the previous revision parked it.
	lda 0xe6
	and #0x8000
	pha
	jsr __absdi_a
	jsr __absdi_b
	jsr __udivmoddi_core ; remainder -> $F2..$F9
	; Move remainder into the return slots $E0..$E7.
	lda 0xf2
	sta 0xe0
	lda 0xf4
	sta 0xe2
	lda 0xf6
	sta 0xe4
	lda 0xf8
	sta 0xe6
	; Apply the saved sign.
	pla ; PLA sets Z from the sign word
	beq .Lmoddi_pos
	jsr __negdi_a
.Lmoddi_pos:
	brl __retdi
; --- subroutines used by signed div/mod ---
; __absdi_a: if the sign bit of $E6 is set, negate the i64 at $E0..$E7.
; (BPL tests N, which LDA sets from bit 15 of the loaded word.)
__absdi_a:
	lda 0xe6
	bpl .Labsdi_a_done
	jsr __negdi_a
.Labsdi_a_done:
	rts
; __absdi_b: same, for the i64 at $E8..$EF (sign word at $EE).
__absdi_b:
	lda 0xee
	bpl .Labsdi_b_done
	jsr __negdi_b
.Labsdi_b_done:
	rts
; __negdi_a: 2's complement negate $E0..$E7, i.e. 0 - value with the
; borrow rippling up through the four words (SEC once, then SBC).
__negdi_a:
	sec
	lda #0x0
	sbc 0xe0
	sta 0xe0
	lda #0x0
	sbc 0xe2
	sta 0xe2
	lda #0x0
	sbc 0xe4
	sta 0xe4
	lda #0x0
	sbc 0xe6
	sta 0xe6
	rts
; __negdi_b: same, for the i64 at $E8..$EF.
__negdi_b:
	sec
	lda #0x0
	sbc 0xe8
	sta 0xe8
	lda #0x0
	sbc 0xea
	sta 0xea
	lda #0x0
	sbc 0xec
	sta 0xec
	lda #0x0
	sbc 0xee
	sta 0xee
	rts
; --------------------------------------------------------------------
; setjmp(jmp_buf env) - save calling environment, return 0
; longjmp(jmp_buf env, int val) - restore environment, return val (or 1 if val == 0)
;
; jmp_buf layout (8 bytes):
; [0..1] = caller's stack pointer (SP+3 at entry to setjmp)
; [2..3] = return address PC lo:hi (16 bits)
; [4] = return address bank (1 byte)
; [5..6] = direct page register (DP)
; [7] = reserved / padding
;
; Caller-save convention: longjmp doesn't restore X / Y / A — caller's
; setjmp returned 0 with all-callee-savable regs already preserved by
; setjmp's caller.
; --------------------------------------------------------------------
.globl setjmp
setjmp:
	; A carries the jmp_buf pointer (first argument).
	sta 0xe0 ; jmp_buf addr -> DP scratch
	tsc ; A = current SP
	clc
	adc #0x3 ; A = caller's SP (undo JSL push)
	ldy #0
	sta (0xe0), y ; env[0..1] = caller SP
	lda 0x1, s ; A = retaddr lo:hi (bytes 1-2 of the JSL frame)
	ldy #2
	sta (0xe0), y ; env[2..3] = retaddr lo:hi
	sep #0x20 ; 8-bit accumulator for the bank byte
	lda 0x3, s ; A_lo = bank
	ldy #4
	sta (0xe0), y ; env[4] = bank
	rep #0x20 ; back to 16-bit accumulator
	tdc ; A = DP
	ldy #5
	sta (0xe0), y ; env[5..6] = DP
	lda #0 ; setjmp returns 0
	rtl
.globl longjmp
longjmp:
	; A = jmp_buf pointer (first arg); val is the i16 at 4,S.
	sta 0xe0 ; env ptr -> DP scratch
	lda 0x4, s ; A = val (2nd arg, on stack)
	sta 0xe2
	; Restore SP to the SAVED caller SP. The three bytes pushed below
	; recreate the JSL return frame beneath it, so the RTL at the end
	; pops them and leaves SP exactly at the saved value — matching
	; what a normal return from setjmp leaves. (The previous revision
	; pre-subtracted 3 here, which leaked 3 stack bytes per longjmp.)
	ldy #0
	lda (0xe0), y ; A = saved SP
	tcs
	; Push the fake return address: bank byte first, then the 16-bit
	; lo:hi word (RTL pulls lo, hi, bank in that order).
	sep #0x20
	ldy #4
	lda (0xe0), y ; bank
	pha
	rep #0x20
	ldy #2
	lda (0xe0), y ; PC lo:hi
	pha
	; Fetch the return value BEFORE switching DP: $E2 is relative to
	; the current direct page, not the one about to be restored.
	lda 0xe2
	tax ; park val in X (caller-clobbered per convention)
	; Restore DP (last DP-relative read of env happens here, still on
	; our own direct page).
	ldy #5
	lda (0xe0), y
	tcd
	; Return val, or 1 if val == 0.
	txa
	bne .Llj_done
	lda #1
.Llj_done:
	rtl

267
runtime/src/softDouble.c Normal file
View file

@ -0,0 +1,267 @@
// Real double-precision IEEE 754 soft-float for the W65816. Treats
// a `double` as `unsigned long long` (64-bit) and operates on its
// bit pattern. Returns by-value at the i64 ABI A:X:Y:DP[$F0].
//
// Earlier attempts crashed the Register Coalescer; the greedy
// regalloc landing fixed the underlying register pressure problem.
// Each routine is broken into small helpers to keep frames shallow.
// Local typedefs (no stdint.h — clang's host stdint pulls glibc).
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
#define DSIGN_BIT 0x8000000000000000ULL
#define DEXP_MASK 0x7FF0000000000000ULL
#define DMANT_MASK 0x000FFFFFFFFFFFFFULL
#define DMANT_LEAD 0x0010000000000000ULL
#define DEXP_SHIFT 52
#define DEXP_BIAS 1023
// Assemble sign / exponent / mantissa into an IEEE double pattern.
//   sign: 0 or DSIGN_BIT.
//   exp:  unbiased exponent, interpreting `mant` with its leading
//         1-bit at DMANT_LEAD (bit 52).
//   mant: mantissa including the leading bit; 0 encodes +/-0.
// Overflow saturates to infinity; underflow flushes to zero (no
// subnormals). The range test is done in SIGNED arithmetic: the
// previous revision biased in u64 first, so a very negative exponent
// wrapped to a huge value and hit the >=2047 branch, returning
// infinity where an underflow-to-zero was required.
static inline u64 dpack(u64 sign, s16 exp, u64 mant) {
    if (mant == 0) return sign;
    s16 e = exp + DEXP_BIAS;               // biased exponent, kept signed
    if (e <= 0) return sign;               // underflow -> +/-0
    if (e >= 2047) return sign | DEXP_MASK; // overflow -> +/-inf
    return sign | ((u64)e << DEXP_SHIFT) | (mant & DMANT_MASK);
}
// Split the IEEE bit pattern `x` into sign / unbiased exponent /
// mantissa (implicit leading bit restored for normal numbers).
// Classification result: 0 = zero-or-subnormal (flushed to zero),
// 1 = normal, 2 = infinity, 3 = NaN.
static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) {
    s16 rawExp = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    u64 frac = x & DMANT_MASK;
    *out_sign = x & DSIGN_BIT;
    if (rawExp == 0x7FF) {
        // All-ones exponent: inf (empty fraction) or NaN.
        *out_exp = 0x7FF;
        *out_mant = frac;
        return (frac == 0) ? 2 : 3;
    }
    if (rawExp == 0) {
        // Zero / subnormal — treated as zero by this runtime.
        *out_exp = 0;
        *out_mant = 0;
        return 0;
    }
    *out_exp = rawExp - DEXP_BIAS;
    *out_mant = frac | DMANT_LEAD;
    return 1;
}
// a + b on raw double bit patterns. Zero operands are handled
// explicitly; inf/NaN fall through the normal path unspecialized.
u64 __adddf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    // x + 0 == x (also makes 0 + 0 well-defined).
    if (ca == 0) return b;
    if (cb == 0) return a;
    // Align mantissas to common exponent.
    if (ea > eb) {
        s16 d = ea - eb;
        if (d > 54) return a; // b is below the rounding horizon
        mb >>= d;
        eb = ea;
    } else if (eb > ea) {
        s16 d = eb - ea;
        if (d > 54) return b;
        ma >>= d;
        ea = eb;
    }
    u64 mr;
    u64 sr;
    if (sa == sb) {
        // Same sign: magnitudes add, sign carries through.
        mr = ma + mb;
        sr = sa;
    } else {
        // Opposite signs: subtract the smaller magnitude; the result
        // takes the sign of the larger operand.
        if (ma >= mb) {
            mr = ma - mb;
            sr = sa;
        } else {
            mr = mb - ma;
            sr = sb;
        }
    }
    if (mr == 0) return 0; // exact cancellation -> +0
    // Renormalize: move the leading 1 back up to bit 52 after
    // cancellation ...
    while ((mr & DMANT_LEAD) == 0 && (mr & ~DMANT_MASK) == 0) {
        mr <<= 1;
        ea--;
    }
    // ... or back down to bit 52 after a carry out of the add.
    while (mr & ~(DMANT_LEAD | DMANT_MASK)) {
        mr >>= 1;
        ea++;
    }
    return dpack(sr, ea, mr);
}
// a - b == a + (-b): flip b's sign bit and reuse the adder.
u64 __subdf3(u64 a, u64 b) {
    u64 negatedB = b ^ DSIGN_BIT;
    return __adddf3(a, negatedB);
}
// Negation just toggles the sign bit (also flips 0, inf, NaN signs).
u64 __negdf2(u64 a) {
    return DSIGN_BIT ^ a;
}
// a * b on raw double bit patterns. Sign = XOR of signs; a zero
// factor yields a signed zero (inf/NaN are not special-cased).
u64 __muldf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    if (ca == 0 || cb == 0) return sr;
    // 53x53-bit mantissa product via four 32x32->64 partials, keeping
    // only the high 64 bits (prod_hi = bits 64..105 of the 106-bit
    // product). The discarded low bits sit below the 53-bit result,
    // so dropping them matches the runtime's round-to-zero policy.
    u32 alo = (u32)ma;
    u32 ahi = (u32)(ma >> 32);
    u32 blo = (u32)mb;
    u32 bhi = (u32)(mb >> 32);
    u64 ll = (u64)alo * (u64)blo;
    u64 lh = (u64)alo * (u64)bhi;
    u64 hl = (u64)ahi * (u64)blo;
    u64 hh = (u64)ahi * (u64)bhi;
    u64 mid = lh + hl + (ll >> 32);
    u64 prod_hi = hh + (mid >> 32);
    // Exponent: each mantissa is m * 2^-52, and prod_hi dropped 64
    // low bits, so the true product equals prod_hi * 2^(ea+eb-40)
    // (-52 - 52 + 64 = -40). dpack() encodes mant * 2^(exp-52), so
    // the starting exponent must be ea + eb + 12; the normalize loops
    // keep the value constant by adjusting er on every shift. (The
    // previous revision started at ea + eb, scaling every product by
    // 2^-12 — 1.0 * 1.0 came out as 2^-12.)
    s16 er = ea + eb + 12;
    while (prod_hi & ~(DMANT_LEAD | DMANT_MASK)) {
        prod_hi >>= 1;
        er++;
    }
    while ((prod_hi & DMANT_LEAD) == 0 && prod_hi != 0) {
        prod_hi <<= 1;
        er--;
    }
    return dpack(sr, er, prod_hi);
}
// a / b on raw double bit patterns. Sign = XOR of signs; a zero
// dividend gives signed zero, a zero divisor signed infinity (no NaN
// support). Mantissa quotient by restoring long division, one bit
// per step, with the compare done BEFORE the remainder shift so the
// first iteration captures the integer bit of ma/mb (both operands
// are normalized to [2^52, 2^53), so the true ratio lies in (1/2, 2)).
// The previous revision shifted the remainder first, which dropped
// that integer bit whenever ma >= mb — a/a came back as ~2-ulp
// instead of 1.0. The fixed loop mirrors softFloat's __divsf3.
u64 __divdf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    if (ca == 0) return sr;
    if (cb == 0) return sr | DEXP_MASK; // div-by-zero -> inf
    u64 q = 0;
    u64 r = ma;
    for (s16 i = 0; i < 53; i++) {
        q <<= 1;
        if (r >= mb) {
            r -= mb;
            q |= 1;
        }
        r <<= 1; // r stays < 2*mb < 2^54: no u64 overflow
    }
    // q now holds ~53 significant bits; settle its leading 1 on bit
    // 52 (er tracks every shift so the value is unchanged).
    s16 er = ea - eb;
    while (q & ~(DMANT_LEAD | DMANT_MASK)) {
        q >>= 1;
        er++;
    }
    while ((q & DMANT_LEAD) == 0 && q != 0) {
        q <<= 1;
        er--;
    }
    return dpack(sr, er, q);
}
// Three-way compare: -1 / 0 / +1 for a<b / a==b / a>b. +0 and -0
// compare equal; NaN is not handled (no-NaN convention).
s16 __cmpdf2(u64 a, u64 b) {
    u64 signA = a & DSIGN_BIT;
    u64 signB = b & DSIGN_BIT;
    if (signA == signB) {
        if (a == b) return 0;
        // Same sign: raw-pattern order is magnitude order; a negative
        // pair reverses it.
        s16 order = (a < b) ? -1 : 1;
        return signA ? (s16)-order : order;
    }
    // Opposite signs: equal only when both are zeros.
    if (((a | b) << 1) == 0) return 0;
    return signA ? -1 : 1;
}
// Nonzero iff either operand is NaN (all-ones exponent, nonzero
// fraction).
s16 __unorddf2(u64 a, u64 b) {
    u64 expA = (a >> DEXP_SHIFT) & 0x7FF;
    u64 expB = (b >> DEXP_SHIFT) & 0x7FF;
    u16 aIsNan = (expA == 0x7FF) && ((a & DMANT_MASK) != 0);
    u16 bIsNan = (expB == 0x7FF) && ((b & DMANT_MASK) != 0);
    return (aIsNan || bIsNan) ? 1 : 0;
}
// Comparison wrapper libcalls. These follow the libgcc convention —
// which this runtime's softFloat.c also follows: __eqdf2/__nedf2
// return zero exactly when a == b, and the ordered wrappers return
// the raw three-way result so callers test `__ltdf2(a,b) < 0`,
// `__gedf2(a,b) >= 0`, etc. The previous revision returned booleans
// from lt/le/gt/ge, which inverts those caller-side tests.
s16 __eqdf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __nedf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __ltdf2(u64 a, u64 b) { return __cmpdf2(a, b); }
s16 __ledf2(u64 a, u64 b) { return __cmpdf2(a, b); }
s16 __gtdf2(u64 a, u64 b) { return __cmpdf2(a, b); }
s16 __gedf2(u64 a, u64 b) { return __cmpdf2(a, b); }
// double <-> float conversions.
// float -> double widening. The 23-bit fraction moves up 29 bits to
// occupy the top of the 52-bit field; subnormal floats flush to zero;
// inf/NaN map onto the corresponding double encodings.
u64 __extendsfdf2(u32 x) {
    u64 sign = ((u64)x & 0x80000000UL) << 32;
    s16 rawExp = (s16)((x >> 23) & 0xFF);
    u64 wideFrac = (u64)(x & 0x7FFFFFUL) << 29;
    if (rawExp == 0xFF) return sign | DEXP_MASK | wideFrac;
    if (rawExp == 0) return sign;
    return dpack(sign, (s16)(rawExp - 127), wideFrac | DMANT_LEAD);
}
// double -> float narrowing (truncating). The fraction drops its low
// 29 bits; the exponent is re-biased 1023 -> 127, saturating to inf
// on overflow and flushing to zero on underflow (results that would
// be float subnormals included).
u32 __truncdfsf2(u64 x) {
    u32 signOut = (u32)((x & DSIGN_BIT) >> 32);
    s16 rawExp = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    u64 frac = x & DMANT_MASK;
    if (rawExp == 0x7FF)
        return signOut | 0x7F800000UL | (u32)(frac >> 29);
    if (rawExp == 0)
        return signOut;
    s16 fexp = (s16)(rawExp - DEXP_BIAS) + 127;
    if (fexp >= 255) return signOut | 0x7F800000UL;
    if (fexp <= 0) return signOut;
    return signOut | ((u32)fexp << 23) | (u32)((frac >> 29) & 0x7FFFFFUL);
}
// double <-> integer conversions.
// s32 -> double, exact (every 32-bit int fits a 53-bit mantissa).
// The magnitude is taken in 64-bit arithmetic so the most negative
// value negates cleanly — the old `-x` on the 32-bit type was signed
// overflow (UB) for 0x80000000.
u64 __floatsidf(s32 x) {
    if (x == 0) return 0;
    u64 sign = 0;
    s64 wide = x;
    if (wide < 0) {
        sign = DSIGN_BIT;
        wide = -wide;
    }
    u64 m = (u64)wide;
    s16 e = 0;
    while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
    e += 52; // == 31 + 21: the lead travelled from bit p to bit 52,
             // leaving e == p, the value's true binary exponent
    return dpack(sign, e, m);
}
// u32 -> double, exact.
u64 __floatunsidf(u32 x) {
    if (x == 0) return 0;
    u64 m = (u64)x;
    s16 e = 52; // 31 + 21, as in __floatsidf; loop walks it down to p
    while ((m & DMANT_LEAD) == 0) {
        m <<= 1;
        e--;
    }
    return dpack(0, e, m);
}
// double -> s32, truncating. Out-of-range values saturate; inf/NaN
// share the all-ones-exponent path and saturate by sign.
s32 __fixdfsi(u64 x) {
    u64 neg = x & DSIGN_BIT;
    s16 rawExp = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    if (rawExp == 0) return 0;
    if (rawExp == 0x7FF) return neg ? (s32)0x80000000 : 0x7FFFFFFF;
    s16 e = rawExp - DEXP_BIAS;
    if (e < 0) return 0;                                  // |x| < 1
    if (e > 30) return neg ? (s32)0x80000000 : 0x7FFFFFFF; // overflow
    u64 m = (x & DMANT_MASK) | DMANT_LEAD;
    m >>= (s16)(52 - e); // e <= 30, so this is always a right shift
    return neg ? -(s32)m : (s32)m;
}

91
runtime/src/softDouble.s Normal file
View file

@ -0,0 +1,91 @@
; Stub double-precision soft-float — every routine returns 0.
;
; The C-based softDouble.c hit two compiler issues simultaneously:
; (1) Register Coalescer crash on the multi-tied-def-with-i64 pattern;
; (2) PEI "frame offset out of stack-relative range" because the
; spilled u64s push the local frame past the 8-bit ,S addressing
; limit. Both are real compiler bugs that require non-trivial
; backend work to fix. Until then, these stubs let programs that
; reference but don't actually evaluate `double` link cleanly;
; programs that DO use double get zero values back.
;
; Symbol set matches what clang's i64-routed double libcalls expect.
; ABI: i64 result returned via A:X:Y:DP[$F0] (matches LowerReturn).
;
; NOTE(review): softDouble.c in this same tree defines the identical
; symbol set for real — presumably the runtime build links exactly one
; of the two, otherwise the link sees duplicate definitions. TODO
; confirm the build script picks one source of these symbols.
.text
; Helper macro idiom: stub returning 64-bit zero in the i64 ABI slots
; (A = X = Y = 0, DP $F0 = 0).
.macro RET_ZERO64
	lda #0
	tax
	tay
	sta 0xf0
	rtl
.endm
.globl __adddf3
__adddf3: RET_ZERO64
.globl __subdf3
__subdf3: RET_ZERO64
.globl __muldf3
__muldf3: RET_ZERO64
.globl __divdf3
__divdf3: RET_ZERO64
.globl __negdf2
__negdf2: RET_ZERO64
; Comparison stubs return an i16 zero in A (reads as "equal").
.globl __cmpdf2
__cmpdf2: lda #0
	rtl
.globl __eqdf2
__eqdf2: lda #0
	rtl
.globl __nedf2
__nedf2: lda #0
	rtl
.globl __ltdf2
__ltdf2: lda #0
	rtl
.globl __gtdf2
__gtdf2: lda #0
	rtl
.globl __ledf2
__ledf2: lda #0
	rtl
.globl __gedf2
__gedf2: lda #0
	rtl
.globl __floatsidf
__floatsidf: RET_ZERO64
.globl __floatunsidf
__floatunsidf: RET_ZERO64
; i32 results return in A:X.
.globl __fixdfsi
__fixdfsi: lda #0
	tax
	rtl
.globl __fixunsdfsi
__fixunsdfsi: lda #0
	tax
	rtl
.globl __extendsfdf2
__extendsfdf2: RET_ZERO64
.globl __truncdfsf2
__truncdfsf2: lda #0
	tax
	rtl

279
runtime/src/softFloat.c Normal file
View file

@ -0,0 +1,279 @@
// 32-bit IEEE 754 soft-float runtime for the W65816 backend.
//
// Provides the libcalls clang emits for float operations:
//   __addsf3, __subsf3, __mulsf3, __divsf3
//   __negsf2
//   __cmpsf2, __eqsf2, __nesf2, __ltsf2, __gtsf2, __lesf2, __gesf2
//   __floatsisf, __floatunsisf
//   __fixsfsi, __fixunssfsi
//
// Everything operates on the raw 32-bit IEEE bit pattern carried in
// an `unsigned long`, so no float operators appear in this source and
// no recursive __addsf3 etc. libcalls get emitted; the only helpers
// needed are integer routines (__mulsi3, shifts) already present in
// libgcc.s.
//
// V1 limitations:
// - Subnormals flush to zero.
// - NaN / Inf are not special-cased — they yield garbage values but
//   never crash.
// - Truncation (round-to-zero) only; no banker's rounding.
// - Add/sub work on a 24-bit mantissa; underflow rounding is crude.
//
// Good enough for end-to-end test programs doing ordinary arithmetic
// in the representable range; full IEEE compliance is a much bigger
// project.
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
// IEEE 754 single-precision field layout.
#define SIGN_BIT 0x80000000UL
#define EXP_MASK 0x7F800000UL
#define EXP_SHIFT 23
#define EXP_BIAS 127
#define MANT_MASK 0x007FFFFFUL
#define MANT_LEAD 0x00800000UL // implicit leading 1
// Decompose `x` into sign / unbiased exponent / mantissa (implicit
// leading bit restored). Returns the class: 0 = zero (subnormals are
// flushed), 1 = normal, 2 = inf, 3 = nan.
__attribute__((noinline))
static u16 fpClass(u32 x, u32 *out_sign, s16 *out_exp, u32 *out_mant) {
    s16 rawExp = (s16)((x >> EXP_SHIFT) & 0xFF);
    u32 frac = x & MANT_MASK;
    *out_sign = x & SIGN_BIT;
    if (rawExp == 0xFF) {
        // Inf (empty fraction) or NaN; caller decides what to do.
        *out_exp = 0xFF;
        *out_mant = frac;
        return (frac == 0) ? 2 : 3;
    }
    if (rawExp == 0) {
        // Zero or subnormal — flushed to zero.
        *out_exp = 0;
        *out_mant = 0;
        return 0;
    }
    *out_exp = rawExp - EXP_BIAS;
    *out_mant = frac | MANT_LEAD;
    return 1;
}
// Pack sign / exponent / mantissa back into IEEE bits, normalizing
// first. Underflow flushes to signed zero; overflow saturates to
// signed infinity. (The 0xFF800000 mask covers bits 23..31, so the
// first loop runs exactly while the leading 1 sits below bit 23.)
__attribute__((noinline))
static u32 fpPack(u32 sign, s16 exp, u32 mant) {
    if (mant == 0) return sign; // zero
    // Slide the leading 1 up to bit 23 ...
    while ((mant & 0xFF800000UL) == 0) {
        mant <<= 1;
        exp--;
    }
    // ... or back down if it overflowed past bit 23.
    while (mant & 0xFF000000UL) {
        mant >>= 1;
        exp++;
    }
    s16 biased = exp + EXP_BIAS;
    if (biased >= 0xFF) return sign | EXP_MASK; // overflow -> +/-inf
    if (biased <= 0) return sign;               // underflow -> 0
    return sign | ((u32)biased << EXP_SHIFT) | (mant & MANT_MASK);
}
// a + b on raw float bit patterns.
u32 __addsf3(u32 a, u32 b) {
    u32 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = fpClass(a, &sa, &ea, &ma);
    u16 cb = fpClass(b, &sb, &eb, &mb);
    // x + 0 == x.
    if (ca == 0) return b;
    if (cb == 0) return a;
    // Align: shift smaller-exp mantissa right.
    if (ea > eb) {
        s16 d = ea - eb;
        if (d > 25) return a; // b becomes negligible
        mb >>= d;
        eb = ea;
    } else if (eb > ea) {
        s16 d = eb - ea;
        if (d > 25) return b;
        ma >>= d;
        ea = eb;
    }
    // Combine, respecting signs.
    if (sa == sb) {
        u32 m = ma + mb; // may carry into bit 24; fpPack renormalizes
        return fpPack(sa, ea, m);
    } else {
        // Different signs — subtract the smaller magnitude; result
        // takes the sign of the larger operand.
        if (ma >= mb) {
            return fpPack(sa, ea, ma - mb);
        } else {
            return fpPack(sb, eb, mb - ma);
        }
    }
}
// a - b == a + (-b): flip b's sign bit and reuse the adder.
u32 __subsf3(u32 a, u32 b) {
    u32 negatedB = b ^ SIGN_BIT;
    return __addsf3(a, negatedB);
}
// Negation just toggles the sign bit.
u32 __negsf2(u32 a) {
    return SIGN_BIT ^ a;
}
// a * b on raw float bit patterns. Sign = XOR of signs; a zero
// factor yields a signed zero.
u32 __mulsf3(u32 a, u32 b) {
    u32 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = fpClass(a, &sa, &ea, &ma);
    u16 cb = fpClass(b, &sb, &eb, &mb);
    u32 sign = sa ^ sb;
    if (ca == 0 || cb == 0) return sign; // zero
    // 24-bit x 24-bit -> 48-bit product via 16-bit half multiplies.
    u32 a_lo = ma & 0xFFFFUL;
    u32 a_hi = ma >> 16; // 0..0xFF (8 bits significant)
    u32 b_lo = mb & 0xFFFFUL;
    u32 b_hi = mb >> 16;
    // p = a_lo*b_lo + (a_lo*b_hi + a_hi*b_lo)<<16 + a_hi*b_hi<<32
    u32 p_ll = a_lo * b_lo;
    u32 p_lh = a_lo * b_hi;
    u32 p_hl = a_hi * b_lo;
    u32 p_hh = a_hi * b_hi; // small
    // `top` accumulates product bits 16..47. A 32-bit wrap of `mid`
    // is worth 2^32 in product terms, i.e. 0x10000 in `top` units.
    u32 mid = p_lh + p_hl; // may overflow — track
    u32 carry_mid = (mid < p_lh) ? 0x10000UL : 0;
    u32 top = (p_hh << 16) + carry_mid + (mid >> 16) + (p_ll >> 16);
    // For two normalized inputs the product lies in [2^46, 2^48), so
    // its leading 1 is product bit 46 or 47 — i.e. bit 30 or 31 of
    // `top`. Shift so the lead lands on mantissa bit 23, bumping the
    // exponent when the product carried up to bit 47.
    s16 new_exp = ea + eb;
    if (top & 0x80000000UL) {
        // Product bit 47 set: top >> 8 puts it at bit 23.
        top >>= 8;
        new_exp += 1;
    } else {
        // Product bit 46 set: top >> 7 puts it at bit 23.
        top >>= 7;
    }
    return fpPack(sign, new_exp, top & 0xFFFFFFUL);
}
// a / b on raw float bit patterns. Division by zero returns signed
// infinity; a zero dividend returns signed zero.
u32 __divsf3(u32 a, u32 b) {
    u32 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = fpClass(a, &sa, &ea, &ma);
    u16 cb = fpClass(b, &sb, &eb, &mb);
    u32 sign = sa ^ sb;
    if (cb == 0) return sign | EXP_MASK; // div-by-zero -> inf
    if (ca == 0) return sign;
    // Restoring long division, one quotient bit per iteration. The
    // compare runs BEFORE the numerator shift, so the first bit is
    // the integer part of ma/mb (both in [2^23, 2^24), hence the
    // ratio is in (1/2, 2)); the remaining bits are the fraction.
    // num stays below 2*mb < 2^25, well inside 32 bits.
    u32 q = 0;
    u32 num = ma;
    for (s16 i = 0; i < 24; i++) {
        q <<= 1;
        if (num >= mb) {
            num -= mb;
            q |= 1;
        }
        num <<= 1;
    }
    // q has 24 bits. Result exponent: ea - eb; fpPack renormalizes
    // (the lead sits at bit 22 when ma < mb).
    s16 new_exp = ea - eb;
    return fpPack(sign, new_exp, q);
}
// Three-way float compare: -1 / 0 / 1 for a<b / a==b / a>b.
// +0 == -0; NaN is unhandled (no-NaN convention, like the rest of
// this runtime).
s16 __cmpsf2(u32 a, u32 b) {
    if (a == b) return 0;
    u32 signA = a & SIGN_BIT;
    u32 signB = b & SIGN_BIT;
    if (signA == signB) {
        // Same sign: compare magnitudes; a negative pair reverses.
        u32 magA = a & 0x7FFFFFFFUL;
        u32 magB = b & 0x7FFFFFFFUL;
        s16 order = (magA < magB) ? -1 : 1;
        return signA ? (s16)-order : order;
    }
    // Opposite signs: only +0 / -0 compare equal.
    if (((a | b) << 1) == 0) return 0;
    return signA ? -1 : 1;
}
// Wrapper libcalls. __eqsf2/__nesf2 return zero exactly when a == b;
// the ordered wrappers hand back the raw three-way result, which is
// what callers test (<0, <=0, >0, >=0).
s16 __eqsf2(u32 a, u32 b) {
    return __cmpsf2(a, b) != 0;
}
s16 __nesf2(u32 a, u32 b) {
    return __cmpsf2(a, b) != 0;
}
s16 __ltsf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
s16 __gtsf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
s16 __lesf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
s16 __gesf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
// s32 -> float, truncating. The magnitude is computed with an
// unsigned negate so the most negative 32-bit value does not hit
// signed-overflow UB (the old `-i` did, for 0x80000000).
u32 __floatsisf(s32 i) {
    if (i == 0) return 0;
    u32 sign = 0;
    u32 v = (u32)i;
    if (i < 0) {
        sign = SIGN_BIT;
        v = (u32)0 - v; // well-defined two's-complement negate
    }
    // Normalize the leading 1 up to bit 31; `lead` tracks its
    // original position, which is the value's binary exponent.
    s16 lead = 31;
    while ((v & 0x80000000UL) == 0) { v <<= 1; lead--; }
    // Keep the top 24 bits as the mantissa (leading 1 lands on bit
    // 23; fpPack strips it). Lower bits truncate (round-to-zero).
    u32 mant = v >> 8;
    return fpPack(sign, lead, mant);
}
// u32 -> float, truncating.
u32 __floatunsisf(u32 v) {
    if (v == 0) return 0;
    u32 norm = v;
    s16 exponent = 31;
    while ((norm & 0x80000000UL) == 0) {
        norm <<= 1;
        exponent--;
    }
    // Top 24 bits become the mantissa; low bits truncate.
    return fpPack(0, exponent, norm >> 8);
}
// float -> s32, truncating. Values with exponent >= 31 (including
// inf/NaN, whose fpClass exponent is 0xFF) saturate by sign.
s32 __fixsfsi(u32 a) {
    u32 sign, mant;
    s16 e;
    u16 cls = fpClass(a, &sign, &e, &mant);
    if (cls == 0 || e < 0) return 0; // zero, or |a| < 1
    if (e >= 31) {
        return sign ? -2147483647L - 1 : 2147483647L;
    }
    // Move the leading 1 from mantissa bit 23 to value bit e.
    u32 v = (e >= 23) ? (mant << (e - 23)) : (mant >> (23 - e));
    return sign ? -(s32)v : (s32)v;
}
// float -> u32, truncating. Negative inputs and |a| < 1 give 0;
// exponents >= 32 saturate to 0xFFFFFFFF.
u32 __fixunssfsi(u32 a) {
    u32 sign, mant;
    s16 e;
    u16 cls = fpClass(a, &sign, &e, &mant);
    if (cls == 0 || sign || e < 0) return 0;
    if (e >= 32) return 0xFFFFFFFFUL;
    if (e >= 23) return mant << (e - 23);
    return mant >> (23 - e);
}

151
scripts/fuzzCompile.py Executable file
View file

@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Generate small random C programs and compile them with the W65816
backend. Catches crashes / lowering gaps / verifier failures.
Each generated program is small (~10-50 lines), uses combinations of
features the compiler should handle:
- integer arithmetic (i8, i16, i32, i64)
- control flow (if, while, for, switch)
- structs and pointer derefs
- function calls (recursive, multi-arg)
- casts and bit operations
- arrays (small)
For each program, we just compile to .o. If clang exits non-zero or
crashes, we save the offending source for inspection.
Optionally MAME-runs each program for additional runtime checks (off
by default; slow).
Usage: fuzzCompile.py [-n COUNT] [-s SEED] [--keep-failures DIR]
"""
import argparse, os, random, subprocess, sys, tempfile, hashlib
from pathlib import Path
CLANG = Path(__file__).parent.parent / "tools/llvm-mos-build/bin/clang"
# --- generators ---
def gen_expr(rng, depth=0):
    """Build a random int-valued C expression string.

    The order of RNG draws is significant: callers seed `rng`, so any
    change to the draw sequence changes the generated corpus.
    """
    # Terminate with a leaf ~30% of the time, always beyond depth 3.
    if depth > 3 or rng.random() < 0.3:
        leaves = [
            str(rng.randint(0, 100)),
            f"({rng.randint(0, 5)} + {rng.randint(0, 5)})",
            "x",
        ]
        return rng.choice(leaves)
    operator = rng.choice(["+", "-", "*", "&", "|", "^", "<<", ">>"])
    left = gen_expr(rng, depth + 1)
    right = rng.choice(["1", "2", "3", "4", str(rng.randint(0, 10))])
    if operator in ("<<", ">>"):
        # Clamp shift counts to 0..7 to keep the shifts sane.
        right = str(rng.randint(0, 7))
    return f"({left} {operator} {right})"
def gen_stmt(rng, varCount, depth=0):
    """Build one random C statement over locals v0..v{varCount-1}.

    RNG draw order is preserved deliberately (seeded reproducibility).
    """
    kind = rng.choice(["assign", "if", "while", "loop"])
    if depth > 2:
        # Cap nesting: deep statements degrade to plain assignments.
        kind = "assign"
    if kind == "assign":
        target = f"v{rng.randint(0, varCount - 1)}"
        return f"{target} = {gen_expr(rng)};"
    if kind == "if":
        lhs = gen_expr(rng)
        relop = rng.choice(['<', '>', '==', '!='])
        bound = rng.randint(0, 30)
        inner = gen_stmt(rng, varCount, depth + 1)
        return f"if ({lhs} {relop} {bound}) {{ {inner} }}"
    if kind == "while":
        reps = rng.randint(2, 5)
        inner = gen_stmt(rng, varCount, depth + 1)
        return f"{{ int j = {reps}; while (j-- > 0) {{ {inner} }} }}"
    if kind == "loop":
        target = f"v{rng.randint(0, varCount - 1)}"
        return f"for (int i = 0; i < {rng.randint(2, 6)}; i++) {{ {target} += i; }}"
    return ";"
def gen_function(rng, name, varCount):
    """Generate a function `int name(int x)` with random body.

    Draw order matters for corpus reproducibility: the local-variable
    initializers are drawn first, then the statements.  The join
    separators and template indentation below are part of the emitted
    C text (cosmetic only for the generated program).
    """
    decls = "\n ".join(f"int v{i} = {rng.randint(0, 50)};" for i in range(varCount))
    stmts = "\n ".join(gen_stmt(rng, varCount) for _ in range(rng.randint(3, 8)))
    # Return a sum of the first few locals so the result depends on
    # several of them (just v0 when there is only one).
    ret = "v0"
    if varCount > 1:
        ret = " + ".join(f"v{i}" for i in range(min(varCount, 3)))
    return f"""int {name}(int x) {{
 {decls}
 {stmts}
 return {ret};
}}"""
def gen_program(rng):
    """Assemble a full translation unit: f0..fN plus a call_all()."""
    funcCount = rng.randint(1, 3)
    parts = []
    for i in range(funcCount):
        varCount = rng.randint(1, 5)
        parts.append(gen_function(rng, f"f{i}", varCount))
    # call_all() sums every generated function so nothing is dead code.
    calls = " + ".join(f"f{i}(x)" for i in range(funcCount))
    parts.append(f"int call_all(int x) {{ return " + calls + "; }")
    return "\n\n".join(parts) + "\n"
# --- driver ---
def compile_one(source, keepDir=None, idx=0):
    """Compile `source` to an object file; return (ok, errorMessage).

    On compile failure, the offending source and its stderr are saved
    under `keepDir` (when given) for later triage.  Temporary files
    are always removed, including on timeout.
    """
    with tempfile.NamedTemporaryFile(suffix=".c", delete=False, mode="w") as f:
        f.write(source)
        cFile = f.name
    oFile = cFile + ".o"
    try:
        result = subprocess.run(
            [str(CLANG), "-target", "w65816", "-O2",
             "-ffunction-sections", "-c", cFile, "-o", oFile],
            capture_output=True, timeout=60
        )
        if result.returncode == 0:
            return True, ""
        if keepDir:
            tag = hashlib.sha256(source.encode()).hexdigest()[:8]
            kept = Path(keepDir) / f"fail_{idx:03d}_{tag}.c"
            kept.write_text(source)
            kept.with_suffix(".c.stderr").write_bytes(result.stderr)
        return False, result.stderr.decode("utf-8", errors="replace")
    except subprocess.TimeoutExpired:
        return False, "timeout (60s)"
    finally:
        for leftover in (cFile, oFile):
            try:
                os.unlink(leftover)
            except FileNotFoundError:
                pass
def main():
    """Parse args, generate and compile `count` programs, report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--count", type=int, default=20)
    parser.add_argument("-s", "--seed", type=int, default=42)
    parser.add_argument("--keep-failures", default=None,
                        help="directory to save sources of failing inputs")
    parser.add_argument("-q", "--quiet", action="store_true")
    args = parser.parse_args()
    if args.keep_failures:
        Path(args.keep_failures).mkdir(parents=True, exist_ok=True)
    rng = random.Random(args.seed)
    fails = 0
    for i in range(args.count):
        src = gen_program(rng)
        ok, msg = compile_one(src, args.keep_failures, i)
        if ok:
            if not args.quiet:
                print(f"[fuzz] OK #{i}")
            continue
        fails += 1
        if not args.quiet:
            print(f"[fuzz] FAIL #{i}: {msg.splitlines()[0] if msg else '?'}")
    print(f"fuzz: {args.count - fails}/{args.count} passed ({fails} fails)")
    # Nonzero exit when anything failed, so CI notices.
    sys.exit(1 if fails else 0)

if __name__ == "__main__":
    main()

105
scripts/runInMame.sh Executable file
View file

@ -0,0 +1,105 @@
#!/usr/bin/env bash
# Run a 65816 binary inside MAME's apple2gs simulation.
#
# Usage:
#   runInMame.sh <binary> <addr> <expected>
#       Read one 16-bit value at addr, compare to expected.
#   runInMame.sh <binary> --check <addr1>=<exp1> [<addr2>=<exp2> ...]
#       Read multiple 16-bit values, all must match.
#
# Addresses can be 24-bit (e.g., "0x025000" for bank 2 offset $5000).
# Expected values are 4-hex (no 0x prefix).
#
# Code loads at $00:1000 in bank 0 RAM.  Code can switch DBR to bank
# 2+ for safe data writes (bank 0 zero page is scribbled by IIgs ROM
# during execution).
#
# Exit 0 if all reads match, 1 otherwise.

set -euo pipefail
# die/warn/log and PROJECT_ROOT are expected to come from common.sh.
source "$(dirname "$0")/common.sh"

BIN="$1"
shift

# Emulated seconds to run before sampling memory.
SECS=3

# Build address list as Lua table entries.
LUA_CHECKS=""
EXPECT_LIST=()
ADDR_LIST=()
if [ "$1" = "--check" ]; then
  shift
  # Each remaining argument is "<addr>=<expected>".
  for pair in "$@"; do
    ADDR="${pair%=*}"
    EXP="${pair#*=}"
    ADDR_LIST+=("$ADDR")
    EXPECT_LIST+=("$EXP")
    LUA_CHECKS="$LUA_CHECKS print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"$'\n'
  done
else
  # Single-address form: <addr> <expected>.
  ADDR="$1"
  EXP="$2"
  ADDR_LIST+=("$ADDR")
  EXPECT_LIST+=("$EXP")
  LUA_CHECKS="print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"
fi

[ -f "$BIN" ] || die "binary not found: $BIN"

# Generated autoboot Lua script; cleaned up on any exit.
LUA_PATH=$(mktemp --suffix=.lua)
trap 'rm -f "$LUA_PATH"' EXIT

# Unquoted heredoc: $BIN and $LUA_CHECKS expand into the Lua source.
cat > "$LUA_PATH" <<EOF
local frame = 0
local loaded = false
emu.register_frame_done(function()
  frame = frame + 1
  if frame == 30 and not loaded then
    local cpu = manager.machine.devices[":maincpu"]
    local mem = cpu.spaces["program"]
    local f = io.open("$BIN", "rb")
    if not f then print("BIN-MISSING"); manager.machine:exit(); return end
    local data = f:read("*all"); f:close()
    -- Load at \$00:1000 (bank 0). PB stays at \$00 — MAME's
    -- apple2gs CPU model doesn't honor a Lua-side PB!=0 set.
    -- The user's code can switch DBR to bank 2+ for safe data
    -- writes (bank 2 is clear of IIgs ROM IRQ scribbling).
    for i = 1, #data do mem:write_u8(0x001000 + i - 1, data:byte(i)) end
    loaded = true
    cpu.state["PC"].value = 0x1000
    cpu.state["PB"].value = 0x00
    cpu.state["DB"].value = 0x00
    cpu.state["D"].value = 0x00
    cpu.state["P"].value = 0x34 -- M=1, X=1, I=1 (IRQ off)
    cpu.state["E"].value = 0
    cpu.state["S"].value = 0x01FF
    print("MAME-LOADED bytes=" .. #data)
  end
  if frame == 60 then
    local cpu = manager.machine.devices[":maincpu"]
    local mem = cpu.spaces["program"]
    $LUA_CHECKS
    manager.machine:exit()
  end
end)
EOF

# NOTE(review): with pipefail, a run that emits no "MAME-" lines makes
# the grep — and therefore the whole script — fail with no diagnostic;
# confirm that is the intended failure mode for a hung emulator.
OUT=$(timeout 30 mame apple2gs \
  -rompath "$PROJECT_ROOT/tools/mame/roms" \
  -plugins -autoboot_script "$LUA_PATH" \
  -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep "^MAME-")
echo "$OUT"

# Parse all val=... and compare to expected list.  The Lua script
# prints reads in request order, so GOT_LIST lines up index-for-index
# with EXPECT_LIST.
mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//')
ok=1
for i in "${!EXPECT_LIST[@]}"; do
  if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then
    warn "MAME mismatch at ${ADDR_LIST[$i]}: got 0x${GOT_LIST[$i]:-MISSING} expected 0x${EXPECT_LIST[$i]}"
    ok=0
  fi
done
if [ $ok -eq 1 ]; then
  log "MAME OK: ${#EXPECT_LIST[@]} reads matched"
  exit 0
fi
exit 1

View file

@ -13,7 +13,7 @@
set -euo pipefail
ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory
ulimit -v $((10 * 1024 * 1024)) # 10 GB virtual memory
ulimit -t 90 # 90 CPU-seconds
if [ $# -lt 1 ]; then

File diff suppressed because it is too large Load diff

View file

@ -69,8 +69,23 @@ public:
  // Accept the inline-asm constraint letters this target supports.
  // Returns true (and marks the constraint as register-class) for the
  // letters listed; false rejects the constraint in the frontend.
  bool validateAsmConstraint(const char *&Name,
                             TargetInfo::ConstraintInfo &info) const override {
    // Single-char constraints for the W65816's three real registers.
    // 'a' / 'x' / 'y' are direct register-class constraints; 'r'
    // means any allocatable register (we route to A by default).
    // The backend's getRegForInlineAsmConstraint resolves these to
    // physical registers. Without listing them here, clang's frontend
    // rejects `=a` etc. before the backend ever sees them.
    switch (*Name) {
    case 'a':
    case 'x':
    case 'y':
    case 'r':
      info.setAllowsRegister();
      return true;
    default:
      return false;
    }
  }
std::string_view getClobbers() const override { return ""; }

26
src/link816/Makefile Normal file
View file

@ -0,0 +1,26 @@
# Build the C++ linker + OMF emitter. Produces tools/link816 and
# tools/omfEmit (self-contained binaries).
#
# Usage:
#   make          build both
#   make clean    remove build artefacts

# Toolchain — overridable from the environment.
CXX ?= g++
CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -Wno-unused-parameter

# Repo root is two levels up from this Makefile.
PROJECT_ROOT := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..)
OUT_LINKER := $(PROJECT_ROOT)/tools/link816
OUT_OMF := $(PROJECT_ROOT)/tools/omfEmit

# `all` and `clean` produce no files of those names; without .PHONY a
# stray file called "all" or "clean" would silently disable the target.
.PHONY: all clean

all: $(OUT_LINKER) $(OUT_OMF)

# Each tool is a single translation unit — compile and link directly.
$(OUT_LINKER): link816.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) -o $@ $<

$(OUT_OMF): omfEmit.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) -o $@ $<

clean:
	rm -f $(OUT_LINKER) $(OUT_OMF)

769
src/link816/link816.cpp Normal file
View file

@ -0,0 +1,769 @@
// link816 — minimal flat-binary linker for W65816 ELF .o files.
//
// Reads one or more ELF32 object files (produced by llvm-mc / clang -c
// with the W65816 backend), concatenates their .text* / .rodata* /
// .data* sections at consecutive addresses starting from a given base,
// builds a global symbol table, resolves the W65816 ELF relocations,
// and writes a flat binary suitable for loading into a 65816 emulator
// or further wrapping by omfEmit.
//
// Standalone — no LLVM dependency. Parses ELF32-LE structures
// directly with the layout from /usr/include/elf.h.
//
// Supported relocation types (per W65816ELFObjectWriter):
// 1 R_W65816_IMM8 — 1-byte absolute
// 2 R_W65816_IMM16 — 2-byte LE absolute
// 3 R_W65816_IMM24 — 3-byte LE absolute (JSL targets)
// 4 R_W65816_PCREL8 — 1-byte signed PC-relative
// 5 R_W65816_PCREL16 — 2-byte signed PC-relative
//
// CLI mirrors the Python tool exactly:
// link816 -o out.bin --text-base 0x8000 --bss-base 0x2000 a.o b.o ...
// [--rodata-base ADDR] [--map FILE]
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace {
// ---------------------------------------------------------------- ELF32 layout
// We only need the LE host-side parsing path. Field names mirror
// /usr/include/elf.h so a reader can cross-check against the spec.
// ELF32 file header (Elf32_Ehdr). The parser memcpy's raw file bytes
// straight into this struct, so member order and widths must match
// the on-disk layout exactly.
struct Elf32Ehdr {
  uint8_t e_ident[16];    // magic, class, endianness, version bytes
  uint16_t e_type;
  uint16_t e_machine;
  uint32_t e_version;
  uint32_t e_entry;
  uint32_t e_phoff;
  uint32_t e_shoff;       // section header table file offset
  uint32_t e_flags;
  uint16_t e_ehsize;
  uint16_t e_phentsize;
  uint16_t e_phnum;
  uint16_t e_shentsize;   // size of one section header entry
  uint16_t e_shnum;       // number of section headers
  uint16_t e_shstrndx;    // index of the section-name string table
};
// ELF32 section header (Elf32_Shdr); memcpy'd from the file like the
// file header, so the layout must not change.
struct Elf32Shdr {
  uint32_t sh_name;       // offset into the section-name string table
  uint32_t sh_type;       // SHT_*
  uint32_t sh_flags;
  uint32_t sh_addr;
  uint32_t sh_offset;     // section contents' file offset
  uint32_t sh_size;
  uint32_t sh_link;       // meaning depends on sh_type (e.g. strtab index)
  uint32_t sh_info;       // meaning depends on sh_type (e.g. reloc target)
  uint32_t sh_addralign;
  uint32_t sh_entsize;
};
// Section types this linker recognizes (values per the ELF spec).
static constexpr uint32_t SHT_NULL = 0;
static constexpr uint32_t SHT_PROGBITS = 1;
static constexpr uint32_t SHT_SYMTAB = 2;
static constexpr uint32_t SHT_STRTAB = 3;
static constexpr uint32_t SHT_RELA = 4;
static constexpr uint32_t SHT_NOBITS = 8;
// ELF32 symbol table entry (Elf32_Sym); memcpy'd from the file.
struct Elf32Sym {
  uint32_t st_name;       // offset into the symbol string table
  uint32_t st_value;      // offset within the defining section
  uint32_t st_size;
  uint8_t st_info;        // type (low nibble) + binding (high nibble)
  uint8_t st_other;
  uint16_t st_shndx;      // defining section index, or SHN_* special
};
// Special st_shndx values.
static constexpr uint16_t SHN_UNDEF = 0;
static constexpr uint16_t SHN_ABS = 0xFFF1;
static constexpr uint16_t SHN_COMMON = 0xFFF2;
// Extract the STT_* type nibble from st_info.
inline uint8_t ELF32_ST_TYPE(uint8_t i) { return i & 0x0F; }
static constexpr uint8_t STT_NOTYPE = 0;
static constexpr uint8_t STT_OBJECT = 1;
static constexpr uint8_t STT_FUNC = 2;
static constexpr uint8_t STT_SECTION = 3;
// ELF32 relocation-with-addend record (Elf32_Rela); memcpy'd from the
// file.
struct Elf32Rela {
  uint32_t r_offset;      // patch location within the target section
  uint32_t r_info;        // packed symbol index (high 24) + type (low 8)
  int32_t r_addend;
};
// Unpack r_info.
inline uint32_t ELF32_R_SYM (uint32_t i) { return i >> 8; }
inline uint32_t ELF32_R_TYPE(uint32_t i) { return i & 0xFF; }
// W65816 reloc type numbers — match W65816ELFObjectWriter.
static constexpr uint8_t R_W65816_IMM8 = 1;
static constexpr uint8_t R_W65816_IMM16 = 2;
static constexpr uint8_t R_W65816_IMM24 = 3;
static constexpr uint8_t R_W65816_PCREL8 = 4;
static constexpr uint8_t R_W65816_PCREL16 = 5;
// ---------------------------------------------------------------- Helpers
// Report a fatal linker error on stderr and terminate with status 1.
[[noreturn]] static void die(const std::string &msg) {
  const std::string line = "link816: " + msg + "\n";
  std::fputs(line.c_str(), stderr);
  std::exit(1);
}
// Slurp an entire file into memory; fatal (via die) if it cannot be
// opened.
static std::vector<uint8_t> readFile(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  if (!in) die("cannot open '" + path + "' for reading");
  std::vector<uint8_t> bytes;
  bytes.assign(std::istreambuf_iterator<char>(in),
               std::istreambuf_iterator<char>());
  return bytes;
}
// Classify an input section name into one of the linker's output
// groups: "text", "rodata", "bss", "init_array", "fini_array", or ""
// for sections the linker ignores. Both the exact name and the
// -ffunction-sections style suffixed forms (".text.foo") match.
static std::string sectionKind(const std::string &name) {
  auto inGroup = [&name](const char *base) {
    if (name == base) return true;
    const std::string prefix = std::string(base) + ".";
    return name.compare(0, prefix.size(), prefix) == 0;
  };
  if (inGroup(".text")) return "text";
  if (inGroup(".rodata")) return "rodata";
  // .data is folded into the read-only image alongside .rodata.
  if (inGroup(".data")) return "rodata";
  if (inGroup(".bss")) return "bss";
  // .init_array entries are 16-bit function pointers; treat as
  // rodata so they end up in the read-only image and get a stable
  // address. The linker emits __init_array_start/_end so crt0 can
  // walk them. Same for .fini_array (destructors).
  if (inGroup(".init_array")) return "init_array";
  if (inGroup(".fini_array")) return "fini_array";
  return "";
}
// ---------------------------------------------------------------- ELF parser
// Decoded ELF section header (only the fields the linker needs).
struct Section {
  std::string name;
  uint32_t type;          // SHT_*
  uint32_t size;          // sh_size in bytes
  uint32_t fileOffset;    // sh_offset into the raw file image
  uint32_t link;          // sh_link (for SHT_SYMTAB: its string table)
  uint32_t info;          // sh_info (for SHT_RELA: target section index)
};
// Decoded symbol table entry.
struct Symbol {
  std::string name;
  uint32_t value; // st_value
  uint16_t shndx;         // defining section index, or SHN_* special
  uint8_t type; // STT_*
};
// Decoded RELA entry.
struct Reloc {
  uint32_t offset; // within target section
  uint32_t symIdx;        // index into the object's symbol table
  uint8_t type;           // R_W65816_*
  int32_t addend;
};
// One parsed input .o: the raw file bytes plus decoded section,
// symbol, and relocation tables. Parsing is memcpy-based against the
// ELF32-LE layouts declared above.
struct InputObject {
  std::string path;       // input file name, used in diagnostics
  std::vector<uint8_t> raw;
  std::vector<Section> sections;
  std::vector<Symbol> symbols;
  // relocs indexed by target section id
  std::map<uint32_t, std::vector<Reloc>> relocs;

  // Decode the ELF container in `raw` into sections/symbols/relocs.
  // Fatal (via die) on anything that is not a 32-bit little-endian
  // ELF with a section table; an object without a symbol table is
  // accepted and left with empty symbol/reloc tables.
  void parse() {
    if (raw.size() < sizeof(Elf32Ehdr))
      die("'" + path + "': file too small to be ELF");
    if (raw[0] != 0x7f || raw[1] != 'E' || raw[2] != 'L' || raw[3] != 'F')
      die("'" + path + "': not an ELF file");
    if (raw[4] != 1) // ELFCLASS32
      die("'" + path + "': not 32-bit ELF");
    if (raw[5] != 1) // ELFDATA2LSB
      die("'" + path + "': not little-endian ELF");
    Elf32Ehdr hdr;
    std::memcpy(&hdr, raw.data(), sizeof(hdr));
    if (hdr.e_shoff == 0 || hdr.e_shnum == 0)
      die("'" + path + "': no section table");
    if (hdr.e_shentsize != sizeof(Elf32Shdr))
      die("'" + path + "': unexpected section header size");
    // Section header string table — used to look up section names.
    Elf32Shdr shstrhdr;
    std::memcpy(&shstrhdr,
                raw.data() + hdr.e_shoff + hdr.e_shstrndx * sizeof(Elf32Shdr),
                sizeof(shstrhdr));
    const char *shstrtab = reinterpret_cast<const char *>(
        raw.data() + shstrhdr.sh_offset);
    // Decode every section header into the lighter Section records.
    sections.resize(hdr.e_shnum);
    std::vector<Elf32Shdr> shdrs(hdr.e_shnum);
    for (size_t i = 0; i < hdr.e_shnum; ++i) {
      std::memcpy(&shdrs[i],
                  raw.data() + hdr.e_shoff + i * sizeof(Elf32Shdr),
                  sizeof(Elf32Shdr));
      sections[i].name = std::string(shstrtab + shdrs[i].sh_name);
      sections[i].type = shdrs[i].sh_type;
      sections[i].size = shdrs[i].sh_size;
      sections[i].fileOffset = shdrs[i].sh_offset;
      sections[i].link = shdrs[i].sh_link;
      sections[i].info = shdrs[i].sh_info;
    }
    // Find the symbol table and its string table.
    size_t symtabIdx = (size_t)-1, symstrtabIdx = (size_t)-1;
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].type == SHT_SYMTAB) {
        symtabIdx = i;
        symstrtabIdx = sections[i].link;  // sh_link = its string table
        break;
      }
    }
    if (symtabIdx == (size_t)-1) {
      // Object with no symbols is unusual but legal — treat as empty.
      return;
    }
    const char *symstrtab = reinterpret_cast<const char *>(
        raw.data() + sections[symstrtabIdx].fileOffset);
    size_t numSyms = sections[symtabIdx].size / sizeof(Elf32Sym);
    symbols.resize(numSyms);
    for (size_t i = 0; i < numSyms; ++i) {
      Elf32Sym sym;
      std::memcpy(&sym,
                  raw.data() + sections[symtabIdx].fileOffset
                      + i * sizeof(Elf32Sym),
                  sizeof(Elf32Sym));
      symbols[i].name = std::string(symstrtab + sym.st_name);
      symbols[i].value = sym.st_value;
      symbols[i].shndx = sym.st_shndx;
      symbols[i].type = ELF32_ST_TYPE(sym.st_info);
    }
    // Walk RELA sections; index by their target section (sh_info).
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].type != SHT_RELA) continue;
      uint32_t targetSec = sections[i].info;
      size_t numRels = sections[i].size / sizeof(Elf32Rela);
      std::vector<Reloc> &out = relocs[targetSec];
      out.reserve(numRels);
      for (size_t j = 0; j < numRels; ++j) {
        Elf32Rela r;
        std::memcpy(&r,
                    raw.data() + sections[i].fileOffset
                        + j * sizeof(Elf32Rela),
                    sizeof(Elf32Rela));
        Reloc R;
        R.offset = r.r_offset;
        R.symIdx = ELF32_R_SYM(r.r_info);
        R.type = static_cast<uint8_t>(ELF32_R_TYPE(r.r_info));
        R.addend = r.r_addend;
        out.push_back(R);
      }
    }
  }

  // Pointer to a section's contents inside the raw file image.
  const uint8_t *sectionData(uint32_t idx) const {
    return raw.data() + sections[idx].fileOffset;
  }

  // Indices of all non-empty sections of the given kind (see
  // sectionKind), in file order.
  std::vector<uint32_t> sectionsByKind(const std::string &kind) const {
    std::vector<uint32_t> out;
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].size == 0) continue;
      if (sectionKind(sections[i].name) == kind)
        out.push_back(static_cast<uint32_t>(i));
    }
    return out;
  }
};
// ---------------------------------------------------------------- Linker
// Final address layout of the merged output sections. .bss is
// virtual — it occupies addresses but contributes no image bytes.
struct Layout {
  uint32_t textBase, textSize;
  uint32_t rodataBase, rodataSize;
  uint32_t bssBase, bssSize;
};
// Patch one relocation into `buf` at byte offset `off`.
//
//   patchAddr — absolute run-time address of the patched byte(s);
//               base for the PC-relative displacement math.
//   target    — fully resolved absolute target (symbol + addend).
//   rtype     — R_W65816_* relocation kind.
//   symName   — resolved symbol/section name, diagnostics only.
//
// Fatal (via die) on range overflow or an unknown relocation type.
// Fix: the IMM8/16/24 overflow messages previously printed
// "0x" + std::to_string(target), i.e. decimal digits behind a hex
// prefix; they now use %x like the other diagnostics.
static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
                       uint32_t patchAddr, uint32_t target,
                       uint8_t rtype, const std::string &symName) {
  char msg[256];
  int64_t disp;
  switch (rtype) {
  case R_W65816_IMM8:
    if (target > 0xFF) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_IMM8 to '%s' = 0x%x out of range",
                    symName.c_str(), target);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(target & 0xFF);
    break;
  case R_W65816_IMM16:
    if (target > 0xFFFF) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_IMM16 to '%s' = 0x%x out of range",
                    symName.c_str(), target);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(target & 0xFF);
    buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
    break;
  case R_W65816_IMM24:
    if (target > 0xFFFFFF) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_IMM24 to '%s' = 0x%x out of range",
                    symName.c_str(), target);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(target & 0xFF);
    buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
    buf[off + 2] = static_cast<uint8_t>((target >> 16) & 0xFF);
    break;
  case R_W65816_PCREL8:
    // Displacement is relative to the byte after the 1-byte operand.
    disp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 1);
    if (disp < -128 || disp > 127) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_PCREL8 to '%s' out of branch range (%lld bytes)",
                    symName.c_str(), (long long)disp);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(disp & 0xFF);
    break;
  case R_W65816_PCREL16:
    // Displacement is relative to the byte after the 2-byte operand.
    disp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 2);
    if (disp < -32768 || disp > 32767) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_PCREL16 to '%s' out of BRL range",
                    symName.c_str());
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(disp & 0xFF);
    buf[off + 1] = static_cast<uint8_t>((disp >> 8) & 0xFF);
    break;
  default:
    std::snprintf(msg, sizeof(msg),
                  "unhandled relocation type %u to '%s'",
                  (unsigned)rtype, symName.c_str());
    die(msg);
  }
}
// Drives the whole link: lays out the merged sections, builds the
// global symbol table, applies relocations, and composes the flat
// output image.
//
// Fix: init_array relocations are now resolved through the same
// resolveSym path as .text/.rodata relocations. The previous
// open-coded resolver treated any *named* symbol with a valid shndx
// as section-relative and dropped its st_value — which only worked
// because -ffunction-sections places each function at offset 0 of
// its own section.
struct Linker {
  std::vector<std::unique_ptr<InputObject>> objs;

  // Output placement (overridable from the CLI).
  uint32_t textBase = 0x8000;
  uint32_t rodataBase = 0; // 0 = place directly after .text
  uint32_t bssBase = 0x2000;

  // Per-object, per-section: in-merged-text/rodata/bss offset.
  struct ObjOffsets {
    uint32_t textBaseInMerged = 0;
    uint32_t rodataBaseInMerged = 0;
    uint32_t bssBaseInMerged = 0;
    uint32_t initBaseInMerged = 0;
    std::map<uint32_t, uint32_t> textWithin;
    std::map<uint32_t, uint32_t> rodataWithin;
    std::map<uint32_t, uint32_t> bssWithin;
    std::map<uint32_t, uint32_t> initWithin;
  };
  std::vector<ObjOffsets> objOff;
  std::map<std::string, uint32_t> globalSyms;

  // Read + parse one input object, appending it to the link order.
  void addObject(const std::string &path) {
    auto o = std::make_unique<InputObject>();
    o->path = path;
    o->raw = readFile(path);
    o->parse();
    objs.push_back(std::move(o));
  }

  // Perform the link. On return `outImage` holds the flat image
  // (text, optional gap, rodata, init_array — bss is virtual) and
  // the returned Layout describes the address map.
  Layout link(std::vector<uint8_t> &outImage) {
    // 1. Layout: each obj's sections at running offsets.
    objOff.resize(objs.size());
    uint32_t curText = 0, curRodata = 0, curBss = 0, curInit = 0;
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      ObjOffsets &oo = objOff[fi];
      oo.textBaseInMerged = curText;
      for (uint32_t idx : objs[fi]->sectionsByKind("text")) {
        oo.textWithin[idx] = curText - oo.textBaseInMerged;
        curText += objs[fi]->sections[idx].size;
      }
      oo.rodataBaseInMerged = curRodata;
      for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) {
        oo.rodataWithin[idx] = curRodata - oo.rodataBaseInMerged;
        curRodata += objs[fi]->sections[idx].size;
      }
      oo.bssBaseInMerged = curBss;
      for (uint32_t idx : objs[fi]->sectionsByKind("bss")) {
        oo.bssWithin[idx] = curBss - oo.bssBaseInMerged;
        curBss += objs[fi]->sections[idx].size;
      }
      oo.initBaseInMerged = curInit;
      for (uint32_t idx : objs[fi]->sectionsByKind("init_array")) {
        oo.initWithin[idx] = curInit - oo.initBaseInMerged;
        curInit += objs[fi]->sections[idx].size;
      }
    }
    Layout L;
    L.textBase = textBase;
    L.textSize = curText;
    L.bssBase = bssBase;
    L.bssSize = curBss;
    L.rodataBase = rodataBase ? rodataBase : (textBase + curText);
    L.rodataSize = curRodata;
    // .init_array goes immediately after .rodata in the image.
    uint32_t initBase = L.rodataBase + L.rodataSize;
    // Synthesize linker-defined symbols so crt0 / startup code
    // can find the section extents. These must NOT be in the
    // input objects; we provide them.
    globalSyms["__text_start"] = L.textBase;
    globalSyms["__text_end"] = L.textBase + L.textSize;
    globalSyms["__rodata_start"] = L.rodataBase;
    globalSyms["__rodata_end"] = L.rodataBase + L.rodataSize;
    globalSyms["__init_array_start"] = initBase;
    globalSyms["__init_array_end"] = initBase + curInit;
    globalSyms["__bss_start"] = L.bssBase;
    globalSyms["__bss_end"] = L.bssBase + L.bssSize;
    globalSyms["__heap_start"] = L.bssBase + L.bssSize;
    globalSyms["__heap_end"] = 0xBF00; // bank 0 hi-RAM ceiling (below IIgs ROM windows)
    // 2. Build global symbol map. Every named symbol defined in a
    // recognized section gets an absolute address; duplicates are
    // resolved last-definition-wins.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (const Symbol &sym : obj.symbols) {
        if (sym.name.empty()) continue;
        if (sym.shndx == SHN_UNDEF || sym.shndx == SHN_ABS ||
            sym.shndx == SHN_COMMON || sym.shndx >= obj.sections.size())
          continue;
        const auto &sec = obj.sections[sym.shndx];
        std::string kind = sectionKind(sec.name);
        uint32_t addr = 0;
        if (kind == "text") {
          auto it = oo.textWithin.find(sym.shndx);
          addr = textBase + oo.textBaseInMerged
               + (it == oo.textWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "rodata") {
          auto it = oo.rodataWithin.find(sym.shndx);
          addr = L.rodataBase + oo.rodataBaseInMerged
               + (it == oo.rodataWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "bss") {
          auto it = oo.bssWithin.find(sym.shndx);
          addr = bssBase + oo.bssBaseInMerged
               + (it == oo.bssWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "init_array") {
          auto it = oo.initWithin.find(sym.shndx);
          addr = initBase + oo.initBaseInMerged
               + (it == oo.initWithin.end() ? 0 : it->second)
               + sym.value;
        } else {
          continue;
        }
        globalSyms[sym.name] = addr; // last def wins
      }
    }
    // 3. Build text and rodata buffers.
    std::vector<uint8_t> textBuf;
    textBuf.reserve(curText);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("text")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        textBuf.insert(textBuf.end(), p, p + objs[fi]->sections[idx].size);
      }
    }
    std::vector<uint8_t> rodataBuf;
    rodataBuf.reserve(curRodata);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        rodataBuf.insert(rodataBuf.end(), p,
                         p + objs[fi]->sections[idx].size);
      }
    }
    // Resolve a reloc to (target, name) using the symbol table and the
    // per-object section base map. Used by every .rela.{text,rodata,
    // init_array} application below.
    auto resolveSym = [&](const InputObject &obj, const ObjOffsets &oo,
                          const Reloc &r,
                          uint32_t &target, std::string &resolvedName) {
      if (r.symIdx >= obj.symbols.size())
        die(obj.path + ": reloc symIdx out of range");
      const Symbol &sym = obj.symbols[r.symIdx];
      if (sym.type == STT_SECTION) {
        // Section-relative: base of the referenced section + addend.
        if (sym.shndx >= obj.sections.size())
          die(obj.path + ": section symbol shndx out of range");
        const auto &refSec = obj.sections[sym.shndx];
        std::string kind = sectionKind(refSec.name);
        uint32_t base = 0;
        if (kind == "text") {
          auto wIt = oo.textWithin.find(sym.shndx);
          base = textBase + oo.textBaseInMerged
               + (wIt == oo.textWithin.end() ? 0 : wIt->second);
        } else if (kind == "rodata") {
          auto wIt = oo.rodataWithin.find(sym.shndx);
          base = L.rodataBase + oo.rodataBaseInMerged
               + (wIt == oo.rodataWithin.end() ? 0 : wIt->second);
        } else if (kind == "bss") {
          auto wIt = oo.bssWithin.find(sym.shndx);
          base = bssBase + oo.bssBaseInMerged
               + (wIt == oo.bssWithin.end() ? 0 : wIt->second);
        } else if (kind == "init_array") {
          auto wIt = oo.initWithin.find(sym.shndx);
          base = initBase + oo.initBaseInMerged
               + (wIt == oo.initWithin.end() ? 0 : wIt->second);
        } else {
          die(obj.path + ": reloc against unknown section '"
              + refSec.name + "'");
        }
        target = base + r.addend;
        resolvedName = refSec.name;
      } else {
        // Named symbol: absolute address from the global map.
        auto sIt = globalSyms.find(sym.name);
        if (sIt == globalSyms.end())
          die(obj.path + ": undefined symbol '" + sym.name + "'");
        target = sIt->second + r.addend;
        resolvedName = sym.name;
      }
    };
    // 4. Apply relocations to text buffer.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t textIdx : obj.sectionsByKind("text")) {
        auto it = obj.relocs.find(textIdx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.textBaseInMerged + oo.textWithin.at(textIdx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = textBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(textBuf, patchOff, patchAddr, target, r.type,
                     resolvedName);
        }
      }
    }
    // 4b. Apply relocations to rodata/data buffer. Globals like
    // `int *p = &v;` need their initializer patched at link time
    // (the .o emits a placeholder 0 + a R_W65816_IMM16 reloc).
    // Without this, every initialized pointer or function-pointer
    // table in the program reads 0 at runtime.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t rdIdx : obj.sectionsByKind("rodata")) {
        auto it = obj.relocs.find(rdIdx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.rodataBaseInMerged + oo.rodataWithin.at(rdIdx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = L.rodataBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(rodataBuf, patchOff, patchAddr, target,
                     r.type, resolvedName);
        }
      }
    }
    // 5. Compose output: text || (gap) || rodata. bss is virtual.
    outImage.clear();
    outImage = std::move(textBuf);
    if (L.rodataBase != textBase + curText) {
      uint32_t gap = L.rodataBase - (textBase + curText);
      outImage.insert(outImage.end(), gap, 0);
    }
    outImage.insert(outImage.end(), rodataBuf.begin(), rodataBuf.end());
    // Build init_array buffer + apply its relocations (entries are
    // 16-bit function pointers needing IMM16 reloc). Resolution goes
    // through resolveSym, exactly like the text/rodata paths, so
    // named symbols keep their st_value offset.
    std::vector<uint8_t> initBuf;
    initBuf.reserve(curInit);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("init_array")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        initBuf.insert(initBuf.end(), p,
                       p + objs[fi]->sections[idx].size);
      }
    }
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t idx : obj.sectionsByKind("init_array")) {
        auto it = obj.relocs.find(idx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.initBaseInMerged + oo.initWithin.at(idx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = initBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(initBuf, patchOff, patchAddr, target, r.type,
                     resolvedName);
        }
      }
    }
    outImage.insert(outImage.end(), initBuf.begin(), initBuf.end());
    lastLayout = L;
    return L;
  }

  // Write a human-readable map file: section layout, per-input .text
  // sizes, symbols sorted by address, plus the legacy `name = 0x...`
  // lines.
  void writeMap(const std::string &path) const {
    std::ofstream f(path);
    if (!f) die("cannot open '" + path + "' for writing");
    char buf[256];
    // Section layout summary at top.
    std::snprintf(buf, sizeof(buf),
                  "# section layout\n"
                  ".text : 0x%06x .. 0x%06x (%6u bytes)\n"
                  ".rodata : 0x%06x .. 0x%06x (%6u bytes)\n"
                  ".bss : 0x%06x .. 0x%06x (%6u bytes)\n",
                  lastLayout.textBase,
                  lastLayout.textBase + lastLayout.textSize,
                  lastLayout.textSize,
                  lastLayout.rodataBase,
                  lastLayout.rodataBase + lastLayout.rodataSize,
                  lastLayout.rodataSize,
                  lastLayout.bssBase,
                  lastLayout.bssBase + lastLayout.bssSize,
                  lastLayout.bssSize);
    f.write(buf, std::strlen(buf));
    // Per-input-file contributions to .text (size in bytes).
    std::snprintf(buf, sizeof(buf), "\n# per-input-file .text contributions\n");
    f.write(buf, std::strlen(buf));
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      uint32_t bytes = 0;
      for (uint32_t idx : objs[fi]->sectionsByKind("text"))
        bytes += objs[fi]->sections[idx].size;
      std::snprintf(buf, sizeof(buf), "%6u %s\n", bytes,
                    objs[fi]->path.c_str());
      f.write(buf, std::strlen(buf));
    }
    // Symbol table sorted by address.
    std::snprintf(buf, sizeof(buf), "\n# global symbols (sorted by address)\n");
    f.write(buf, std::strlen(buf));
    std::vector<std::pair<uint32_t, std::string>> sorted;
    for (const auto &kv : globalSyms) sorted.emplace_back(kv.second, kv.first);
    std::sort(sorted.begin(), sorted.end());
    for (const auto &p : sorted) {
      std::snprintf(buf, sizeof(buf), "0x%06x %s\n",
                    p.first, p.second.c_str());
      f.write(buf, std::strlen(buf));
    }
    // Backwards-compat: also emit the old `name = 0x...` lines so
    // existing smoke greps still match.
    for (const auto &kv : globalSyms) {
      std::snprintf(buf, sizeof(buf), "%s = 0x%06x\n",
                    kv.first.c_str(), kv.second);
      f.write(buf, std::strlen(buf));
    }
  }

  // Stash the last layout so writeMap can use it.
  Layout lastLayout;
};
// ---------------------------------------------------------------- CLI
// Parse a number in any C base (0x.. hex, 0.. octal, decimal).
// Fatal (via die) unless the whole string is consumed.
static uint32_t parseInt(const std::string &s) {
  char *stop = nullptr;
  const unsigned long v = std::strtoul(s.c_str(), &stop, 0);
  const bool consumedAll = (stop != s.c_str()) && (*stop == '\0');
  if (!consumedAll)
    die("bad numeric value '" + s + "'");
  return static_cast<uint32_t>(v);
}
// Print CLI usage and exit with status 2 (distinct from link
// failures, which exit 1 via die).
static void usage(const char *argv0) {
  std::fprintf(stderr,
               "usage: %s -o <output> [--text-base ADDR] [--rodata-base ADDR]\n"
               " [--bss-base ADDR] [--map FILE] <input.o> ...\n",
               argv0);
  std::exit(2);
}
} // anonymous namespace
// CLI entry: parse options, link, write the flat image, and
// optionally emit a map file.
int main(int argc, char **argv) {
  std::string outPath;
  std::string mapPath;
  Linker linker;

  for (int i = 1; i < argc;) {
    const std::string arg = argv[i];
    // Fetch the argument of an option that takes a value.
    auto value = [&]() -> const char * {
      if (++i >= argc) usage(argv[0]);
      return argv[i++];
    };
    if (arg == "-o" || arg == "--output") {
      outPath = value();
    } else if (arg == "--text-base") {
      linker.textBase = parseInt(value());
    } else if (arg == "--rodata-base") {
      linker.rodataBase = parseInt(value());
    } else if (arg == "--bss-base") {
      linker.bssBase = parseInt(value());
    } else if (arg == "--map") {
      mapPath = value();
    } else if (arg == "-h" || arg == "--help") {
      usage(argv[0]);
    } else if (!arg.empty() && arg[0] == '-') {
      die("unknown option '" + arg + "'");
    } else {
      linker.addObject(arg);
      ++i;
    }
  }
  if (outPath.empty() || linker.objs.empty()) usage(argv[0]);

  std::vector<uint8_t> image;
  Layout L = linker.link(image);

  std::ofstream f(outPath, std::ios::binary);
  if (!f) die("cannot open '" + outPath + "' for writing");
  f.write(reinterpret_cast<const char *>(image.data()), image.size());
  if (!mapPath.empty()) linker.writeMap(mapPath);

  std::fprintf(stderr,
               "linked: text=[0x%04x+%u] rodata=[0x%04x+%u] bss=[0x%04x+%u] "
               "-> %s (%zu bytes)\n",
               L.textBase, L.textSize, L.rodataBase, L.rodataSize,
               L.bssBase, L.bssSize,
               outPath.c_str(), image.size());
  return 0;
}

201
src/link816/omfEmit.cpp Normal file
View file

@ -0,0 +1,201 @@
// omfEmit — wrap a flat binary in a minimal Apple IIgs OMF v2.1
// container so GS/OS can load and execute it.
//
// Single-segment output (CODE, kind=0), no INTERSEG opcodes (multi-
// segment output is a follow-on). Header layout per OMF 2.1 spec:
// 44-byte fixed header + 10-byte LOAD_NAME + 32-byte SEG_NAME, then
// the body (DS opcode for the payload, END opcode terminator).
//
// CLI mirrors the Python tool exactly:
// omfEmit --input flat.bin --map flat.map --base 0x8000
// --entry main --output prog.omf [--name SEG]
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
namespace {
// Print a fatal diagnostic on stderr and terminate with status 1.
[[noreturn]] static void die(const std::string &msg) {
  const std::string line = "omfEmit: " + msg + "\n";
  std::fputs(line.c_str(), stderr);
  std::exit(1);
}
// Slurp an entire file into memory; fatal (via die) if it cannot be
// opened.
static std::vector<uint8_t> readFile(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  if (!in) die("cannot open '" + path + "' for reading");
  std::vector<uint8_t> bytes;
  bytes.assign(std::istreambuf_iterator<char>(in),
               std::istreambuf_iterator<char>());
  return bytes;
}
// Parse a link816 map file: every `name = HEXADDR` line becomes a
// symbol entry; all other lines are ignored, and non-hex addresses
// are skipped silently.
//
// Fix: trailing-whitespace trimming previously used std::isspace,
// which this file never includes (<cctype> is missing from its
// header list); rewritten with std::string::find_last_not_of so no
// extra header is needed.
static std::map<std::string, uint32_t> readMap(const std::string &path) {
  std::map<std::string, uint32_t> syms;
  std::ifstream f(path);
  if (!f) die("cannot open '" + path + "' for reading");
  // Trim trailing whitespace in place (find_last_not_of returns npos
  // for an all-whitespace string; npos + 1 == 0 clears it, matching
  // the old loop's behavior).
  auto rtrim = [](std::string &s) {
    s.erase(s.find_last_not_of(" \t\r\n\v\f") + 1);
  };
  std::string line;
  while (std::getline(f, line)) {
    auto eq = line.find(" = ");
    if (eq == std::string::npos) continue;
    std::string name = line.substr(0, eq);
    std::string addr = line.substr(eq + 3);
    rtrim(name);
    rtrim(addr);
    try {
      syms[name] = std::stoul(addr, nullptr, 16);
    } catch (...) { /* skip non-hex entries */ }
  }
  return syms;
}
// Emit little-endian.
// Append x to v as four little-endian bytes.
static void put32(std::vector<uint8_t> &v, uint32_t x) {
  for (int shift = 0; shift < 32; shift += 8)
    v.push_back(static_cast<uint8_t>((x >> shift) & 0xFF));
}
// Append x to v as two little-endian bytes.
static void put16(std::vector<uint8_t> &v, uint16_t x) {
  for (int shift = 0; shift < 16; shift += 8)
    v.push_back(static_cast<uint8_t>((x >> shift) & 0xFF));
}
// Wrap a flat binary in a single OMF v2.1 segment.
//
//   image       — flat payload produced by link816
//   entryOffset — ENTRY field value (offset of the entry point
//                 within the segment)
//   name        — used for both LOAD_NAME and SEG_NAME
//
// Layout: 44-byte fixed header, 10-byte LOAD_NAME, 32-byte SEG_NAME,
// then the body records.
static std::vector<uint8_t> emitOMF(const std::vector<uint8_t> &image,
                                    uint32_t entryOffset,
                                    const std::string &name) {
  // Body: DS (literal data) + END.
  // NOTE(review): 0xF1 is the OMF DS (zero-fill) opcode per the 2.1
  // spec; emitting literal payload bytes normally uses LCONST (0xF2).
  // Confirm against the spec / a GS/OS loader that this is intended.
  std::vector<uint8_t> body;
  if (!image.empty()) {
    body.push_back(0xF1); // DS opcode
    put32(body, static_cast<uint32_t>(image.size()));
    body.insert(body.end(), image.begin(), image.end());
  }
  body.push_back(0x00); // END opcode
  // LOAD_NAME: 10 bytes, space-padded.
  std::string loadName = name.substr(0, 10);
  while (loadName.size() < 10) loadName += ' ';
  // SEG_NAME: 1-byte length prefix + 31 bytes (truncated, padded with NUL).
  std::string segNameTxt = name.substr(0, 31);
  std::vector<uint8_t> segName;
  segName.push_back(static_cast<uint8_t>(segNameTxt.size()));
  for (char c : segNameTxt) segName.push_back((uint8_t)c);
  while (segName.size() < 32) segName.push_back(0);
  // Header field values. DISPNAME/DISPDATA are byte offsets of the
  // LOAD_NAME and of the body, measured from the segment start.
  constexpr uint16_t DISPNAME = 44;
  const uint16_t DISPDATA = DISPNAME + 10 + 32;
  const uint32_t LENGTH = static_cast<uint32_t>(image.size());
  const uint32_t BYTECNT = DISPDATA + static_cast<uint32_t>(body.size());
  const uint32_t RESSPC = 0;
  const uint32_t BANKSIZE = 0x10000;
  const uint16_t KIND = 0x0000; // CODE
  const uint32_t ORG = 0;
  const uint32_t ALIGN = 0;
  const uint8_t NUMSEX = 0;
  const uint16_t SEGNUM = 1;
  const uint32_t ENTRY = entryOffset;
  // Serialize the 44-byte fixed header, field by field.
  std::vector<uint8_t> hdr;
  put32(hdr, BYTECNT);
  put32(hdr, RESSPC);
  put32(hdr, LENGTH);
  hdr.push_back(0x00); // undefined
  hdr.push_back(10);   // LABLEN
  hdr.push_back(4);    // NUMLEN
  hdr.push_back(0x21); // VERSION 2.1
  put32(hdr, BANKSIZE);
  put16(hdr, KIND);
  hdr.push_back(0x00); hdr.push_back(0x00); // undefined (2 bytes)
  put32(hdr, ORG);
  put32(hdr, ALIGN);
  hdr.push_back(NUMSEX);
  hdr.push_back(0x00); // undefined
  put16(hdr, SEGNUM);
  put32(hdr, ENTRY);
  put16(hdr, DISPNAME);
  put16(hdr, DISPDATA);
  if (hdr.size() != 44) die("internal: header size != 44");
  // header || LOAD_NAME || SEG_NAME || body.
  std::vector<uint8_t> out;
  out.insert(out.end(), hdr.begin(), hdr.end());
  out.insert(out.end(), loadName.begin(), loadName.end());
  out.insert(out.end(), segName.begin(), segName.end());
  out.insert(out.end(), body.begin(), body.end());
  return out;
}
// Parse an unsigned integer from `s`. Base 0 honours C-style
// prefixes: "0x"/"0X" hex, leading "0" octal, otherwise decimal.
// Propagates std::stoul's std::invalid_argument / std::out_of_range
// on malformed input.
static uint32_t parseInt(const std::string &s) {
  const unsigned long parsed = std::stoul(s, nullptr, 0);
  return static_cast<uint32_t>(parsed);
}
// Print the command-line synopsis to stderr and terminate the
// process with exit status 2 (the conventional usage-error status).
// Never returns.
static void usage(const char *argv0) {
  std::fprintf(stderr,
               "usage: %s --input FLAT --map FILE --base ADDR --entry SYM\n"
               " --output OMF [--name NAME]\n",
               argv0);
  std::exit(2);
}
} // namespace
// Tool entry point: flat binary + linker map -> single-segment OMF.
//
// Required flags: --input, --map, --base, --output (or -o).
// Optional: --entry (default "main") and --name (default: output
// basename without its extension). Errors go through die()/usage().
int main(int argc, char **argv) {
  std::string input, mapFile, output, entry = "main", name;
  uint32_t base = 0;
  bool baseSet = false; // 0 is a legal base address, so track presence explicitly
  // Hand-rolled option parsing: every value-taking flag verifies an
  // argument follows, else falls into usage() (which exits).
  int i = 1;
  while (i < argc) {
    std::string a = argv[i];
    if (a == "--input") { if (++i >= argc) usage(argv[0]); input = argv[i++]; }
    else if (a == "--map") { if (++i >= argc) usage(argv[0]); mapFile = argv[i++]; }
    else if (a == "--base") { if (++i >= argc) usage(argv[0]); base = parseInt(argv[i++]); baseSet = true; }
    else if (a == "--entry") { if (++i >= argc) usage(argv[0]); entry = argv[i++]; }
    else if (a == "--name") { if (++i >= argc) usage(argv[0]); name = argv[i++]; }
    else if (a == "--output" || a == "-o") { if (++i >= argc) usage(argv[0]); output = argv[i++]; }
    else if (a == "-h" || a == "--help") usage(argv[0]);
    else die("unknown option '" + a + "'");
  }
  if (input.empty() || mapFile.empty() || !baseSet || output.empty())
    usage(argv[0]);
  auto image = readFile(input);
  auto syms = readMap(mapFile);
  // Resolve the entry symbol from the map and convert its absolute
  // address into an offset within the image; it must land inside
  // [base, base + image.size()).
  auto it = syms.find(entry);
  if (it == syms.end())
    die("entry symbol '" + entry + "' not in map");
  uint32_t entryAddr = it->second;
  if (entryAddr < base || entryAddr >= base + image.size())
    die("entry symbol outside linked image");
  uint32_t entryOff = entryAddr - base;
  if (name.empty()) {
    // Default name: output basename without extension.
    size_t slash = output.find_last_of('/');
    std::string base_n = (slash == std::string::npos) ? output
                                                      : output.substr(slash + 1);
    size_t dot = base_n.find_last_of('.');
    name = (dot == std::string::npos) ? base_n : base_n.substr(0, dot);
  }
  auto blob = emitOMF(image, entryOff, name);
  // NOTE(review): the stream is never checked after write(), so a
  // short write (e.g. full disk) fails silently — consider testing
  // f.good() before returning 0.
  std::ofstream f(output, std::ios::binary);
  if (!f) die("cannot open '" + output + "' for writing");
  f.write(reinterpret_cast<const char *>(blob.data()), blob.size());
  // Summary line goes to stderr so stdout stays clean for pipelines.
  std::fprintf(stderr,
               "OMF: 1 segment, %zu bytes payload, entry='%s' at +0x%x -> %s "
               "(%zu bytes total)\n",
               image.size(), entry.c_str(), entryOff,
               output.c_str(), blob.size());
  return 0;
}

View file

@ -25,6 +25,13 @@ add_llvm_target(W65816CodeGen
W65816SelectionDAGInfo.cpp
W65816Subtarget.cpp
W65816StackSlotCleanup.cpp
W65816SepRepCleanup.cpp
W65816BranchExpand.cpp
W65816TiedDefSpill.cpp
W65816ABridgeViaX.cpp
W65816WidenAcc16.cpp
W65816SpillToX.cpp
W65816NegYIndY.cpp
W65816TargetMachine.cpp
W65816AsmPrinter.cpp
W65816MCInstLower.cpp

View file

@ -16,14 +16,19 @@
#include "MCTargetDesc/W65816MCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
// W65816::BRA / W65816::BRL opcodes are exported by W65816MCTargetDesc.h
// (which already includes the generated header).
using namespace llvm;
namespace {
@ -120,6 +125,48 @@ public:
OS << char(0xEA);
return true;
}
// ----------------------------------------------------------------
// Relaxation: BRA (signed-8 displacement) -> BRL (signed-16). When
// the assembler determines that a forward/backward BRA's target lies
// beyond +/-128 bytes, it asks us first via mayNeedRelaxation /
// fixupNeedsRelaxation, then via relaxInstruction to materialise the
// longer form. Both BRA (0x80 dd) and BRL (0x82 dd dd) have the
// same operand semantics (PC-relative) so the rewrite is just an
// opcode swap with the fixup kind upgraded from fixup_8_pcrel to
// fixup_16_pcrel.
//
// We do NOT relax conditional Bxx instructions yet: the 65816 has
// no long conditional branch, so the standard trick is to invert
// and span: `BNE l: ... -> BEQ skip; BRL l; skip:`. That requires
// emitting two instructions in place of one and shifting all
// subsequent fixup offsets, which the layered MCAsmBackend API
// doesn't support cleanly. A higher-level codegen pass (or a
// pre-emit MIR pass) is the right place for that. Until then,
// out-of-range conditional branches still error out via the
// applyFixup diagnostic above.
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override {
return Opcode == W65816::BRA;
}
  // Layout-time query: does this (BRA-owned) fixup still fit in a
  // signed 8-bit displacement? Returning true makes the assembler
  // call relaxInstruction below.
  // NOTE(review): `Resolved` is ignored — if the assembler can hand a
  // BRA an unresolved fixup (target symbol outside the fragment walk),
  // Value may be meaningless here; confirm, or treat !Resolved as
  // "needs relaxation" to be safe.
  bool fixupNeedsRelaxationAdvanced(const MCFragment &F, const MCFixup &Fixup,
                                    const MCValue &Target, uint64_t Value,
                                    bool Resolved) const override {
    if (Fixup.getKind() != W65816::fixup_8_pcrel)
      return false;
    // Reinterpret the layout-computed displacement as signed and test
    // the BRA reach of -128..+127 bytes.
    int64_t Signed = static_cast<int64_t>(Value);
    return Signed < -128 || Signed > 127;
  }
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override {
if (Inst.getOpcode() == W65816::BRA) {
Inst.setOpcode(W65816::BRL);
// Operand stays the same (the symbol/expression). The encoder
// will pick the BRL encoding (3 bytes) and emit fixup_16_pcrel.
}
}
};
} // end anonymous namespace

View file

@ -42,12 +42,26 @@ protected:
// (EM_, R_*) pair is unique; once a real EM_ value is assigned for the
// W65816 target (see SESSION_STATE.md open question on ELF EM_), swap
// these for the canonical R_W65816_* names.
switch (Fixup.getKind()) {
//
// Generic FK_Data_* fixups are also accepted — the asm parser creates
// them for things like `.word foo` and the JMP/JML address operand
// when no target-specific fixup kind is hinted. Map them to the
// matching size-based reloc; PC-relative variants pick the *_pcrel
// forms. Without this, every hand-written .s reference to an extern
// symbol came through `getRelocType` as a default-value (UB) reloc
// type — observed as type 249 — and broke link816.py.
auto Kind = Fixup.getKind();
switch (Kind) {
case W65816::fixup_8: return 1; // R_W65816_IMM8
case W65816::fixup_16: return 2; // R_W65816_IMM16
case W65816::fixup_24: return 3; // R_W65816_IMM24
case W65816::fixup_8_pcrel: return 4; // R_W65816_PCREL8
case W65816::fixup_16_pcrel: return 5; // R_W65816_PCREL16
case FK_Data_1: return IsPCRel ? 4 : 1;
case FK_Data_2: return IsPCRel ? 5 : 2;
case FK_Data_4: return 3; // truncated to IMM24 (we have
// no 32-bit reloc); .long is
// unusual on a 16-bit target.
default:
llvm_unreachable("W65816: unknown fixup kind");
}

View file

@ -59,9 +59,60 @@ FunctionPass *createW65816ISelDag(W65816TargetMachine &TM,
// W65816StackSlotCleanup.cpp.
FunctionPass *createW65816StackSlotCleanup();
// Post-PEI cleanup: coalesces adjacent SEP/REP toggles emitted by
// STA8fi expansions when two i8 stores sit back-to-back. Each STA8fi
// emits SEP/STA/REP; consecutive expansions produce REP/SEP toggles
// that cancel. See W65816SepRepCleanup.cpp.
FunctionPass *createW65816SepRepCleanup();
// Pre-emit pass: expands long conditional branches into the
// `INVERTED_Bxx skip ; BRA target ; skip:` pattern when the byte
// distance to the target exceeds the +/-128 reach of an 8-bit-PCREL
// branch. The unconditional BRA is then auto-relaxed to BRL by
// the assembler when its target is also far. See W65816BranchExpand.cpp.
FunctionPass *createW65816BranchExpand();
// Pre-RA pass: when a tied-def Acc16 instruction has a source vreg
// whose value is also used after the consumer, fast regalloc fails
// to preserve it (the tied physreg gets overwritten). We insert
// explicit STAfi/LDAfi spill+reload around the consumer to fix this.
// See W65816TiedDefSpill.cpp.
FunctionPass *createW65816TiedDefSpill();
// Pre-RA pass: same trigger as TiedDefSpill, but bridges via X/Y
// (Idx16) instead of stack when the post-consumer range is free of
// X/Y clobbers. Saves 6 cycles + 2 bytes per bridge versus the stack
// route. See W65816ABridgeViaX.cpp.
FunctionPass *createW65816ABridgeViaX();
// Pre-RA pass: promote Acc16 vregs (= {A}) to Wide16 (= {A, IMG0..7}).
// Lets greedy regalloc spread i16 pressure across A and the DP-backed
// imaginaries. See W65816WidenAcc16.cpp.
FunctionPass *createW65816WidenAcc16();
// Post-RA peephole: replace STAfi/LDAfi spill pairs (5+5 cyc) with
// TAX/TXA bridges (2+2 cyc) when X is dead during the spill window.
// Targets fast-regalloc's habit of spilling A unnecessarily; the
// 3x speedup is the biggest single per-iteration win we can get
// without switching to a smarter allocator. See W65816SpillToX.cpp.
FunctionPass *createW65816SpillToX();
// Pre-emit peephole: rewrite `LDY #neg ; (LDA|STA) (sr,S),Y` to
// pre-add the offset to the pointer with Y=0. The 65816 spec for
// (sr,S),Y is a 24-bit add (DBR | (mem16(sr+S) + Y)) MOD $1000000,
// so signed-negative Y crosses bank boundaries. See W65816NegYIndY.cpp.
FunctionPass *createW65816NegYIndY();
void initializeW65816AsmPrinterPass(PassRegistry &);
void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
void initializeW65816StackSlotCleanupPass(PassRegistry &);
void initializeW65816SepRepCleanupPass(PassRegistry &);
void initializeW65816BranchExpandPass(PassRegistry &);
void initializeW65816TiedDefSpillPass(PassRegistry &);
void initializeW65816ABridgeViaXPass(PassRegistry &);
void initializeW65816WidenAcc16Pass(PassRegistry &);
void initializeW65816SpillToXPass(PassRegistry &);
void initializeW65816NegYIndYPass(PassRegistry &);
} // namespace llvm

View file

@ -0,0 +1,260 @@
//===-- W65816ABridgeViaX.cpp - Pre-RA bridge of Acc16 vregs via X -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Pre-regalloc complement to W65816TiedDefSpill. Where TiedDefSpill
// preserves a multi-use Acc16 vreg by spilling it to a fresh stack
// slot around the tied-def consumer, this pass tries to do the same
// preservation via TAX/TXA: copy to an Idx16 vreg before the consumer
// (regalloc puts it in X or Y, expansion lowers the COPY to TAX/TAY),
// copy back to a fresh Acc16 vreg after.
//
// Win per bridged pair:
// stack spill: STA dp,S (5 cyc) + LDA dp,S (5 cyc) + 1 frame slot
// X bridge : TAX (2 cyc) + TXA (2 cyc) + no frame growth
// Net 6 cycles + 2 bytes saved per bridge — and we avoid one PHA per
// stack slot we didn't allocate.
//
// Bail conditions (fall back to TiedDefSpill's stack route):
// - any MI between consumer and SrcReg's last use clobbers Idx16
// (LDX/LDY/INX/DEX/INY/DEY/TAX/TAY/TXY/TYX/PHX/PHY/PLX/PLY/etc.)
// - any call in the range (calls clobber X and Y per ABI)
// - SrcReg is used in a different MBB (cross-MBB liveness needs more
// analysis; deferred)
//
// Runs before TiedDefSpill so the latter doesn't double-process the
// same candidates.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-a-bridge-via-x"
namespace {
// Legacy-PM machine-function pass wrapper; all the work happens in
// runOnMachineFunction (defined out of line below).
class W65816ABridgeViaX : public MachineFunctionPass {
public:
  static char ID; // pass identity for the legacy pass manager
  W65816ABridgeViaX() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 Acc16 bridge via X";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only inserts COPYs and rewrites operands inside
    // existing blocks — it never adds or removes edges.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816ABridgeViaX::ID = 0;
INITIALIZE_PASS(W65816ABridgeViaX, DEBUG_TYPE,
                "W65816 Acc16 bridge via X", false, false)
// Factory called by the target machine when scheduling this pass.
FunctionPass *llvm::createW65816ABridgeViaX() {
  return new W65816ABridgeViaX();
}
// Tied-def accumulator consumers this pass bridges around. Kept in
// lockstep with W65816TiedDefSpill's allowlist so both passes agree
// on the candidate set.
static bool isTiedAcc16Consumer(unsigned Opc) {
  static const unsigned Consumers[] = {
      W65816::ADCfi,     W65816::SBCfi,     W65816::ANDfi,
      W65816::ORAfi,     W65816::EORfi,     W65816::ADCabs,
      W65816::SBCabs,    W65816::ADCi16imm, W65816::SBCi16imm,
      W65816::ANDi16imm, W65816::ORAi16imm, W65816::EORi16imm,
  };
  for (unsigned Candidate : Consumers)
    if (Opc == Candidate)
      return true;
  return false;
}
static bool hasTiedSrcDef(const MachineInstr &MI) {
if (!isTiedAcc16Consumer(MI.getOpcode())) return false;
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse()) continue;
if (MI.isRegTiedToDefOperand(i)) return true;
}
return false;
}
// Pre-RA check for "instruction may clobber an Img16 (DP $D0..$DF)
// register." Calls clobber them caller-save. Any other DP load/store
// to that range would too — but we don't currently have non-libcall
// emitters into $D0..$DF, so the call check covers it. Conservative
// extras: anything that could touch DP overall is excluded.
static bool clobbersImg(const MachineInstr &MI,
const MachineRegisterInfo &MRI) {
if (MI.isCall()) return true;
// Bail on any MI that defs an Img16 or its DP physreg — none should
// exist before our pass runs, but cover the case for robustness.
for (const auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef()) continue;
Register R = MO.getReg();
if (!R.isValid()) continue;
if (R.isPhysical()) {
if (R == W65816::IMG0 || R == W65816::IMG1 || R == W65816::IMG2 ||
R == W65816::IMG3 || R == W65816::IMG4 || R == W65816::IMG5 ||
R == W65816::IMG6 || R == W65816::IMG7)
return true;
continue;
}
const TargetRegisterClass *RC = MRI.getRegClass(R);
if (RC == &W65816::Img16RegClass) return true;
}
return false;
}
// Scan MF for bridgeable tied-def consumers and, for each safe
// candidate, park the multi-use Acc16 source in an Img16 vreg across
// the consumer (COPY in before, COPY out to a fresh Acc16 vreg
// after), then redirect all post-consumer uses to the fresh vreg.
// Returns true iff any MIR was changed.
bool W65816ABridgeViaX::runOnMachineFunction(MachineFunction &MF) {
  // Pre-RA pass: nothing to do if there are no virtual registers.
  if (!MF.getRegInfo().getNumVirtRegs()) return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  bool Changed = false;
  // Snapshot candidates before mutating MIR.
  struct Candidate {
    MachineBasicBlock *MBB;
    MachineInstr *MI;
    unsigned OpIdx; // index of the tied Acc16 use operand in MI
  };
  SmallVector<Candidate, 8> Candidates;
  // Phase 1: collect (MBB, consumer, operand) triples whose tied
  // Acc16 source also feeds a post-consumer COPY-to-physreg.
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (!hasTiedSrcDef(MI)) continue;
      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
        const MachineOperand &MO = MI.getOperand(i);
        if (!MO.isReg() || !MO.isUse()) continue;
        if (!MI.isRegTiedToDefOperand(i)) continue;
        Register R = MO.getReg();
        if (!R.isVirtual()) continue;
        if (MRI.getRegClass(R) != &W65816::Acc16RegClass) continue;
        // Mirror TiedDefSpill's "needs spill" criterion exactly:
        // SrcReg has a post-consumer COPY to a physreg.
        bool needSpill = false;
        bool badUse = false;
        for (auto &U : MRI.use_nodbg_instructions(R)) {
          if (&U == &MI) continue;
          // PHI uses imply cross-block liveness — punt (see header).
          if (U.isPHI()) { badUse = true; break; }
          if (U.isCopy()) {
            const MachineOperand &Dst = U.getOperand(0);
            if (Dst.isReg() && Dst.getReg().isPhysical()) {
              needSpill = true;
              continue;
            }
          }
        }
        if (needSpill && !badUse) {
          Candidates.push_back({&MBB, &MI, i});
        }
      }
    }
  }
  // Phase 2: validate each candidate's liveness window and bridge it.
  for (auto C : Candidates) {
    MachineInstr *MI = C.MI;
    MachineBasicBlock *MBB = C.MBB;
    unsigned OpIdx = C.OpIdx;
    Register SrcReg = MI->getOperand(OpIdx).getReg();
    // Re-check invariants — an earlier bridge may have rewritten
    // operands since the snapshot was taken.
    if (!SrcReg.isVirtual()) continue;
    if (MRI.getRegClass(SrcReg) != &W65816::Acc16RegClass) continue;
    // Determine the post-consumer-use range in MI's MBB. All uses
    // outside MBB disqualify (cross-MBB X/Y liveness too complex
    // for first cut — fall through to TiedDefSpill).
    bool sameMBBOnly = true;
    auto LastUseIt = MBB->end();
    for (auto &U : MRI.use_nodbg_instructions(SrcReg)) {
      if (&U == MI) continue;
      if (U.getParent() != MBB) { sameMBBOnly = false; break; }
      // Track latest use (in MBB order).
      auto It = MachineBasicBlock::iterator(&U);
      // Linear walk from MI to the end decides "is U after MI?"
      // (iterators aren't ordered, so position must be discovered).
      bool afterMI = false;
      for (auto Walk = MachineBasicBlock::iterator(MI), End = MBB->end();
           Walk != End; ++Walk) {
        if (Walk == It) { afterMI = true; break; }
      }
      if (!afterMI) continue; // pre-consumer use stays on SrcReg
      // Pick the latest such It as LastUseIt.
      bool isLater = (LastUseIt == MBB->end());
      if (!isLater) {
        // It is later than LastUseIt iff LastUseIt appears when
        // walking forward from It.
        for (auto Walk = std::next(It); Walk != MBB->end(); ++Walk) {
          if (Walk == LastUseIt) { isLater = true; break; }
        }
      }
      if (isLater) LastUseIt = It;
    }
    if (!sameMBBOnly || LastUseIt == MBB->end()) continue;
    // Scan from just after MI to LastUseIt: bail if anything could
    // clobber an IMGn (calls and other DP-touchers).
    // NOTE(review): ImgReg is only live across MI itself (COPY-in
    // just before, COPY-out just after), so scanning all the way to
    // LastUseIt looks wider than necessary — likely a conservative
    // carry-over from the X-bridge design; confirm before narrowing.
    bool imgClobbered = false;
    for (auto It = std::next(MachineBasicBlock::iterator(MI));
         It != LastUseIt; ++It) {
      if (It->isDebugInstr()) continue;
      if (clobbersImg(*It, MRI)) { imgClobbered = true; break; }
    }
    if (imgClobbered) continue;
    // Bridge. Park SrcReg in an Img16 (DP-backed) vreg around the
    // consumer; restore via COPY back to a fresh Acc16 vreg afterward.
    // Regalloc allocates the Img16 vreg to one of IMG0..IMG7 (DP slots
    // $D0..$DE). copyPhysReg lowers the COPYs to STA dp / LDA dp
    // (4 cyc each); spills don't touch the system stack at all.
    DebugLoc DL = MI->getDebugLoc();
    Register ImgReg = MRI.createVirtualRegister(&W65816::Img16RegClass);
    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), ImgReg)
        .addReg(SrcReg);
    Register NewReg = MRI.createVirtualRegister(&W65816::Acc16RegClass);
    auto AfterMI = std::next(MachineBasicBlock::iterator(MI));
    BuildMI(*MBB, AfterMI, DL, TII->get(TargetOpcode::COPY), NewReg)
        .addReg(ImgReg);
    // Rewrite uses of SrcReg that come AFTER MI in the same MBB.
    // (Collect first, mutate second — rewriting while iterating the
    // use list would invalidate the iterator.)
    SmallVector<MachineOperand *, 4> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(SrcReg)) {
      if (U.getParent() == MI) continue;
      MachineBasicBlock *UseMBB = U.getParent()->getParent();
      if (UseMBB != MBB) continue;
      bool After = false;
      for (auto Walk = MachineBasicBlock::iterator(MI),
                End = MBB->end(); Walk != End; ++Walk) {
        if (&*Walk == U.getParent()) { After = true; break; }
      }
      if (After) ToRewrite.push_back(&U);
    }
    for (auto *MO : ToRewrite) {
      MO->setReg(NewReg);
      // Kill flags are stale after the rewrite; clear conservatively.
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}

View file

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/W65816InstPrinter.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816MCInstLower.h"
#include "W65816TargetMachine.h"
#include "TargetInfo/W65816TargetInfo.h"
@ -82,6 +83,23 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
case W65816::ADJCALLSTACKDOWN:
case W65816::ADJCALLSTACKUP: {
// PEI's eliminateCallFramePseudoInstr removes these *only* when the
// function has frame work (StackSize > 0 or any FrameIndex use).
// Functions that just tail-call into a libcall (e.g. `int toInt(float
// x) { return (int)x; }` lowers to a single jsl __fixsfsi) have
// neither; PEI skips its call-frame phase and the pseudo survives
// to MC. AsmStreamer renders the pseudo's "# ADJCALLSTACK..."
// string as a comment, but MCObjectStreamer asks the encoder to
// emit bytes — which fails ("Unsupported instruction MCInst 337").
// Dropping it here is correct: when amt is zero (the "no frame"
// path) the call sequence is a no-op anyway; when non-zero, PEI
// would have replaced it with PLA-loop / TSC-ADC sequence already.
// If we ever see a non-zero amount slip through, that's a real
// bug — emit nothing and trust the comment-stripped path.
return;
}
case W65816::LDXi16imm: {
MCInst Ldx;
Ldx.setOpcode(W65816::LDX_Imm16);
@ -97,11 +115,20 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
case W65816::LDAi8imm: {
// i8 immediate — requires M=1 so the CPU reads only 1 immediate
// byte. The function runs in M=0 (prologue convention), so wrap
// with SEP/REP. Adjacent i8 ops collapse via W65816SepRepCleanup.
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Lda;
Lda.setOpcode(W65816::LDA_Imm8);
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
Lda.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Lda);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::LDAabs: {
@ -148,6 +175,10 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
case W65816::ADCi8imm:
case W65816::SBCi8imm: {
bool IsSub = MI->getOpcode() == W65816::SBCi8imm;
// SEP/REP wrap (see LDAi8imm comment).
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Carry;
Carry.setOpcode(IsSub ? W65816::SEC : W65816::CLC);
EmitToStreamer(*OutStreamer, Carry);
@ -156,6 +187,9 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
int64_t Val = MI->getOperand(2).getImm() & 0xFF;
Op.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Op);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::ANDi8imm:
@ -174,21 +208,55 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// encoder only takes the low byte anyway.
int64_t Val = MI->getOperand(2).getImm() & 0xFF;
Op.addOperand(MCOperand::createImm(Val));
// SEP/REP wrap (see LDAi8imm comment).
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
EmitToStreamer(*OutStreamer, Op);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::LDA8abs: {
// i8 absolute load — same byte sequence as LDA_Abs in M=0, but
// semantically loads 1 byte not 2. Need M=1 wrap so we don't
// also pull in the byte at addr+1 (often another global, which is
// harmless to read but corrupts A_hi for any consumer that cares).
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Lda;
Lda.setOpcode(W65816::LDA_Abs);
Lda.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
EmitToStreamer(*OutStreamer, Lda);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::STA8abs: {
// STA_Abs is 16-bit when M=0, 8-bit when M=1. Pure-i8 functions
// run with M=1 and a bare STA is correct. M=0 functions need an
// SEP/REP wrap so the STA stores only one byte — without it, the
// store clobbers the byte at addr+1 (potentially another global).
bool UsesAcc8 = MI->getMF()
->getInfo<W65816MachineFunctionInfo>()
->getUsesAcc8();
if (!UsesAcc8) {
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
}
MCInst Sta;
Sta.setOpcode(W65816::STA_Abs);
Sta.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
EmitToStreamer(*OutStreamer, Sta);
if (!UsesAcc8) {
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
}
return;
}
case W65816::ADCabs:
@ -224,11 +292,19 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
case W65816::CMPi8imm: {
// i8 immediate compare — needs M=1 so the CPU only reads 1 byte
// for the immediate. See LDAi8imm comment for the wrap rationale.
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Cmp;
Cmp.setOpcode(W65816::CMP_Imm8);
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
Cmp.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Cmp);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::CMPabs: {
@ -283,6 +359,28 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Pha);
return;
}
case W65816::ALLOCAfi: {
// VLA / dynamic_stackalloc: A holds size on entry; on exit A holds
// pointer to the allocated region.
// TSC ; A = SP
// SEC ; clear borrow
// SBC size (in $E0) ; A = SP - size
// TCS ; SP = A
// INC A ; A = SP + 1, the lowest byte of the region
// Size is in A on entry — but we need A=SP after TSC, so first
// stash the size to DP scratch.
MCInst Sta1; Sta1.setOpcode(W65816::STA_DP);
Sta1.addOperand(MCOperand::createImm(0xE0));
EmitToStreamer(*OutStreamer, Sta1);
MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc);
MCInst Sec; Sec.setOpcode(W65816::SEC); EmitToStreamer(*OutStreamer, Sec);
MCInst Sbc; Sbc.setOpcode(W65816::SBC_DP);
Sbc.addOperand(MCOperand::createImm(0xE0));
EmitToStreamer(*OutStreamer, Sbc);
MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs);
MCInst Ina; Ina.setOpcode(W65816::INA); EmitToStreamer(*OutStreamer, Ina);
return;
}
case W65816::PUSH16X: {
MCInst Phx;
Phx.setOpcode(W65816::PHX);
@ -352,6 +450,19 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Inc);
return;
}
case W65816::NEGA8: {
// EOR #$FF; INC A — same idea as NEGA16 but in 8-bit M.
// The function context is already 8-bit M when an i8-only path
// is selected, so no SEP/REP wrap is needed here.
MCInst Eor;
Eor.setOpcode(W65816::EOR_Imm8);
Eor.addOperand(MCOperand::createImm(0xFF));
EmitToStreamer(*OutStreamer, Eor);
MCInst Inc;
Inc.setOpcode(W65816::INA);
EmitToStreamer(*OutStreamer, Inc);
return;
}
case W65816::NEGC16: {
// (subc 0, x) — lo half of multi-precision negate.
// EOR #$FFFF; CLC; ADC #1. C-out = 1 iff result = 0 (i.e. x was 0),

View file

@ -0,0 +1,378 @@
//===-- W65816BranchExpand.cpp - Long conditional branch expansion --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lengthens conditional branches that target an MBB further than +/-128
// bytes away. The 65816 has BRL (signed-16, ±32K) for unconditional
// branches but no long *conditional* branch, so we expand
//
// Bxx Target --> INV_Bxx Skip
// fall-through Skip BRA Target
// Skip:
// fall-through
//
// The unconditional BRA is later auto-relaxed to BRL by W65816AsmBackend
// when its displacement exceeds 8 bits (in the same way that an
// assembler-time `bra label` to a label > 127 bytes away gets promoted).
//
// Algorithm:
//
// 1. Pre-split: any MBB that has more than one conditional terminator
// (the multi-branch SELECT_CC pattern emits two Bxx in one MBB)
// is sliced after every conditional Bxx that isn't the LAST one.
// After this, each MBB has at most one conditional terminator,
// which my expansion logic can handle cleanly.
//
// 2. Iterate to fixed-point. In each iteration, recompute byte
// distances (using TII::getInstSizeInBytes for accuracy) and
// expand every conditional whose target is more than
// EXPAND_DIST_THRESHOLD bytes away. Each expansion adds 3 bytes
// (the Bridge MBB's BRA), which can push another inner branch
// over the threshold; iterate until no further expansions.
//
// Runs at addPreEmitPass, after PEI so all FrameIndex references and
// pseudo expansions have stable byte sizes.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-branch-expand"
namespace {
// Pre-emit machine-function pass; see the file header for the
// inverted-branch expansion scheme. Stateless — all work happens in
// runOnMachineFunction.
class W65816BranchExpand : public MachineFunctionPass {
public:
  static char ID; // pass identity for the legacy pass manager
  W65816BranchExpand() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 conditional branch expansion";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816BranchExpand::ID = 0;
INITIALIZE_PASS(W65816BranchExpand, DEBUG_TYPE,
                "W65816 conditional branch expansion", false, false)
// Factory called by the target machine (addPreEmitPass).
FunctionPass *llvm::createW65816BranchExpand() {
  return new W65816BranchExpand();
}
// Map a conditional branch opcode to its inverted form. The 65816's
// eight conditional branches form four complementary flag-test pairs;
// return the partner opcode, or 0 if Opc is not a recognised
// conditional Bxx.
static unsigned invertedConditional(unsigned Opc) {
  static const unsigned Pairs[][2] = {
      {W65816::BEQ, W65816::BNE},
      {W65816::BCC, W65816::BCS},
      {W65816::BMI, W65816::BPL},
      {W65816::BVC, W65816::BVS},
  };
  for (const auto &P : Pairs) {
    if (Opc == P[0])
      return P[1];
    if (Opc == P[1])
      return P[0];
  }
  return 0;
}
// Byte-accurate distance estimate from a specific branch instruction
// to its target MBB. Starts counting at the BRANCH (not at the MBB
// start) and stops at the target MBB's start. This matters because a
// branch at the END of a large MBB has a tiny actual distance to the
// next-laid-out MBB even though the MBB itself is huge.
//
// Returns a byte count (conservatively including Br's own encoding
// in the forward case); callers compare it against the expansion
// threshold. Unknown blocks yield a large sentinel so they are
// treated as "far".
static unsigned estimateDistance(MachineFunction &MF,
                                 const TargetInstrInfo *TII,
                                 const MachineInstr &Br,
                                 MachineBasicBlock *To) {
  const MachineBasicBlock *From = Br.getParent();
  if (From == To) return 0;
  // Two cases by layout direction:
  //   forward:  bytes after Br in From, plus all of MBBs strictly
  //             between, plus 0 (branch lands at To's start).
  //   backward: bytes before Br in From, plus all of MBBs strictly
  //             between, plus all of To.
  // Locate both blocks by position in the function's layout order
  // (linear scan; positions, not MBB numbers, define the direction).
  int FromIdx = -1, ToIdx = -1, Idx = 0;
  for (auto &MBB : MF) {
    if (&MBB == From) FromIdx = Idx;
    if (&MBB == To) ToIdx = Idx;
    Idx++;
  }
  if (FromIdx < 0 || ToIdx < 0) return 1000; // unknown — assume far
  unsigned Bytes = 0;
  if (ToIdx > FromIdx) {
    // Forward: count from Br to end of From, then between, then 0.
    bool past = false;
    for (const auto &MI : *From) {
      if (&MI == &Br) past = true; // Br's own bytes are included
      if (past) Bytes += TII->getInstSizeInBytes(MI);
    }
    Idx = 0;
    for (auto &MBB : MF) {
      if (Idx > FromIdx && Idx < ToIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      Idx++;
    }
  } else {
    // Backward: count Br's preceding bytes in From, plus between, plus all of To.
    for (const auto &MI : *From) {
      if (&MI == &Br) break;
      Bytes += TII->getInstSizeInBytes(MI);
    }
    Idx = 0;
    for (auto &MBB : MF) {
      if (Idx > ToIdx && Idx < FromIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      if (Idx == ToIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      Idx++;
    }
  }
  return Bytes;
}
// Step 1 — pre-split: any MBB with > 1 conditional terminator gets
// sliced after each non-final conditional, so every MBB ends up with
// at most one conditional terminator. Returns true if any MBB was
// split.
//
// Why: the distance-based expansion in runOnMachineFunction assumes
// each block carries at most one conditional branch so that its
// fall-through ("Skip") successor is unambiguous. Blocks with several
// Bxx terminators do occur here (see the multi-branch SELECT_CC note
// below), so they are normalised first.
static bool splitMultiBranchMBBs(MachineFunction &MF,
                                 const TargetInstrInfo *TII) {
  bool Changed = false;
  // Snapshot MBBs first (we mutate the list during iteration).
  SmallVector<MachineBasicBlock *, 16> MBBs;
  for (auto &MBB : MF) MBBs.push_back(&MBB);
  for (MachineBasicBlock *MBB : MBBs) {
    // Find the first conditional terminator that has another
    // conditional terminator after it. Slice MBB right after it.
    bool Sliced = true;
    while (Sliced) {
      Sliced = false;
      // Walk terminators forward. `firstCond` remembers the earliest
      // conditional seen; the moment a second conditional appears we
      // know the block must be cut right after `firstCond`.
      auto firstTerm = MBB->getFirstTerminator();
      MachineBasicBlock::iterator splitAfter = MBB->end();
      MachineBasicBlock::iterator firstCond = MBB->end();
      for (auto it = firstTerm; it != MBB->end(); ++it) {
        if (invertedConditional(it->getOpcode()) != 0) {
          if (firstCond == MBB->end()) {
            firstCond = it;
          } else {
            splitAfter = firstCond; // split AFTER this earlier conditional
            break;
          }
        }
      }
      // Zero or one conditional terminators left — this MBB is done.
      if (splitAfter == MBB->end()) break;
      // Create new MBB; transfer everything after splitAfter to it.
      auto *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
      MF.insert(std::next(MBB->getIterator()), NewMBB);
      // Move instructions [splitAfter+1 .. end) to NewMBB.
      auto moveStart = std::next(splitAfter);
      NewMBB->splice(NewMBB->end(), MBB, moveStart, MBB->end());
      // Transfer successors that aren't the splitAfter's target.
      MachineBasicBlock *splitTgt = nullptr;
      if (splitAfter->getNumOperands() >= 1 &&
          splitAfter->getOperand(0).isMBB())
        splitTgt = splitAfter->getOperand(0).getMBB();
      // All of MBB's existing successors that aren't splitTgt move to
      // NewMBB. splitTgt stays as MBB's own successor (the conditional
      // branch target). EXCEPTION: if any branch instruction we moved
      // into NewMBB *also* targets splitTgt (the multi-branch SELECT_CC
      // case where both Bxx point at the same MBB), splitTgt must also
      // be a successor of NewMBB.
      SmallVector<MachineBasicBlock *, 4> OldSuccs(MBB->successors().begin(),
                                                   MBB->successors().end());
      for (auto *S : OldSuccs) {
        if (S == splitTgt) continue;
        MBB->removeSuccessor(S);
        NewMBB->addSuccessor(S);
      }
      // Walk NewMBB's instructions; for each MBB-operand reference,
      // ensure that target is a NewMBB successor. (This covers the
      // EXCEPTION above without special-casing splitTgt.)
      for (auto &MI : *NewMBB) {
        for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
          const auto &OP = MI.getOperand(i);
          if (!OP.isMBB()) continue;
          auto *RefMBB = OP.getMBB();
          if (!NewMBB->isSuccessor(RefMBB))
            NewMBB->addSuccessor(RefMBB);
        }
      }
      // MBB falls through to NewMBB now.
      MBB->addSuccessor(NewMBB);
      // The splitAfter conditional already targets splitTgt (still in
      // MBB->successors()). Done — process the same MBB again to
      // see if another split is needed (multi-multi-branch case).
      Changed = true;
      Sliced = true;
      (void)TII; // unused for now
    }
  }
  return Changed;
}
// Drop conditional branches whose target matches the unconditional
// branch (BRA/BRL) immediately following them — both edges go to the
// same MBB, so the conditional is dead. This pattern survives upstream
// cleanup when the branches were emitted by the W65816 SELECT_CC
// inserter or by codegenprepare on an `br i1 %c, label %X, label %X`
// IR shape. Returns true if any MI was deleted.
static bool dropDeadConditionalsToBRATarget(MachineFunction &MF) {
  // The single MBB operand of a branch instruction, or null when the
  // MI doesn't carry one.
  auto branchTarget = [](const MachineInstr &MI) -> const MachineBasicBlock * {
    if (MI.getNumOperands() >= 1 && MI.getOperand(0).isMBB())
      return MI.getOperand(0).getMBB();
    return nullptr;
  };
  bool Deleted = false;
  for (MachineBasicBlock &MBB : MF) {
    for (auto It = MBB.getFirstTerminator(); It != MBB.end();) {
      auto Cur = It++;
      // Need a following instruction to pair the conditional with.
      if (It == MBB.end())
        break;
      // Only conditional branches are candidates for deletion.
      if (invertedConditional(Cur->getOpcode()) == 0)
        continue;
      unsigned FollowOpc = It->getOpcode();
      if (FollowOpc != W65816::BRA && FollowOpc != W65816::BRL)
        continue;
      const MachineBasicBlock *CondTgt = branchTarget(*Cur);
      const MachineBasicBlock *UncondTgt = branchTarget(*It);
      if (!CondTgt || !UncondTgt || CondTgt != UncondTgt)
        continue;
      // Conditional and unconditional target the same MBB. Drop the
      // conditional; the unconditional already covers both edges.
      Cur->eraseFromParent();
      Deleted = true;
    }
  }
  return Deleted;
}
// Pass body: normalise branch structure (steps 0-1), then rewrite any
// conditional branch whose estimated target distance exceeds the safe
// 8-bit PC-relative range into `inverted-Bxx Skip ; BRL Target`
// (step 2), and finally clean up dead/redundant branches the rewrite
// exposes (steps 3-4). Returns true if the function was modified.
bool W65816BranchExpand::runOnMachineFunction(MachineFunction &MF) {
  const auto &STI = MF.getSubtarget<W65816Subtarget>();
  const auto *TII = STI.getInstrInfo();
  bool AnyChanged = false;
  // Step 0: drop dead conditionals (Bxx X immediately followed by BRA X
  // — both edges to the same MBB). Cheap and removes false-positive
  // candidates from the distance-based expansion below.
  AnyChanged |= dropDeadConditionalsToBRATarget(MF);
  // Step 1: split multi-conditional-terminator MBBs.
  AnyChanged |= splitMultiBranchMBBs(MF, TII);
  // Step 2: iterate to fixed-point. Each expansion adds 3 bytes
  // (bridge BRA), which may push another previously-OK branch over
  // the threshold. Cap at MAX_ITERS to avoid pathological cases.
  const unsigned EXPAND_DIST_THRESHOLD = 100; // safe under +/-128
  const unsigned MAX_ITERS = 10;
  for (unsigned iter = 0; iter < MAX_ITERS; ++iter) {
    bool Changed = false;
    // Collect candidates. After step 1, each MBB has at most one
    // conditional terminator, so we walk terminators().
    // (Collect first, mutate after: expansion inserts/erases MIs and
    // MBBs, which would invalidate a live iteration.)
    SmallVector<std::pair<MachineBasicBlock *, MachineInstr *>, 8> Candidates;
    for (auto &MBB : MF) {
      for (auto &MI : MBB.terminators()) {
        unsigned Opc = MI.getOpcode();
        if (invertedConditional(Opc) == 0) continue;
        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isMBB()) continue;
        MachineBasicBlock *Target = MI.getOperand(0).getMBB();
        unsigned Dist = estimateDistance(MF, TII, MI, Target);
        if (Dist > EXPAND_DIST_THRESHOLD)
          Candidates.emplace_back(&MBB, &MI);
      }
    }
    for (auto [MBB, BrMI] : Candidates) {
      unsigned Opc = BrMI->getOpcode();
      unsigned InvOpc = invertedConditional(Opc);
      MachineBasicBlock *Target = BrMI->getOperand(0).getMBB();
      DebugLoc DL = BrMI->getDebugLoc();
      // Layout transformation:
      //   MBB:    ... ; Bxx Target ; (fall-through Skip)
      // Becomes:
      //   MBB:    ... ; INV_Bxx Skip
      //   Bridge: BRA Target
      //   Skip:   (= original MBB's fall-through successor)
      //
      // After splitMultiBranchMBBs, MBB has ONE conditional terminator
      // (BrMI) and at most one unconditional terminator after it (which
      // we leave alone — it's the fall-through-or-explicit branch).
      // MBB's successors are {Target, Skip} where Skip is whichever
      // is not Target.
      MachineBasicBlock *Skip = nullptr;
      for (auto *S : MBB->successors()) {
        if (S != Target) { Skip = S; break; }
      }
      if (!Skip) continue; // function-end conditional — rare; skip
      // Create Bridge MBB.
      MachineBasicBlock *Bridge =
          MF.CreateMachineBasicBlock(MBB->getBasicBlock());
      MF.insert(std::next(MBB->getIterator()), Bridge);
      // Replace successor edges: MBB used to have {Target, Skip}; now
      // it has {Bridge, Skip}. Bridge has {Target}.
      MBB->removeSuccessor(Target);
      MBB->addSuccessor(Bridge);
      Bridge->addSuccessor(Target);
      // Erase original Bxx, emit inverted Bxx targeting Skip.
      BrMI->eraseFromParent();
      // Insert at MBB's terminator position so any unconditional
      // fall-through marker after stays after.
      auto insertPt = MBB->getFirstTerminator();
      BuildMI(*MBB, insertPt, DL, TII->get(InvOpc)).addMBB(Skip);
      // Bridge: BRL Target. Always emit the long form rather than
      // relying on the assembler to relax BRA→BRL — the relaxation
      // path is fragile in mixed-fragment scenarios (MC layout
      // doesn't always re-evaluate after layout shifts) and we'd
      // rather pay 1 extra byte per long branch than risk a silent
      // PCREL8 fixup that can't be resolved at link time.
      BuildMI(Bridge, DL, TII->get(W65816::BRL)).addMBB(Target);
      Changed = true;
    }
    AnyChanged = AnyChanged || Changed;
    if (!Changed) break;
  }
  // Step 3: re-run the dead-conditional sweep. Expansion introduces
  // `INV_Bxx Skip ; BRA Target` pairs; when the original codegen
  // already had `BRA Skip` after the (now-erased) Bxx, those collapse
  // into `INV_Bxx X ; BRA X` — the conditional is dead.
  AnyChanged |= dropDeadConditionalsToBRATarget(MF);
  // Step 4: drop trailing `BRA next_MBB` / `BRL next_MBB` when the
  // target is the immediately-following layout MBB. Block-placement
  // sometimes leaves these as explicit branches even though
  // fall-through suffices. Saves 3 bytes / 3 cycles each.
  for (auto MBBIt = MF.begin(); MBBIt != MF.end(); ++MBBIt) {
    auto NextMBB = std::next(MBBIt);
    if (NextMBB == MF.end()) continue;
    auto Last = MBBIt->getLastNonDebugInstr();
    if (Last == MBBIt->end()) continue;
    unsigned Op = Last->getOpcode();
    if (Op != W65816::BRA && Op != W65816::BRL) continue;
    if (Last->getNumOperands() < 1 || !Last->getOperand(0).isMBB()) continue;
    if (Last->getOperand(0).getMBB() != &*NextMBB) continue;
    Last->eraseFromParent();
    AnyChanged = true;
  }
  return AnyChanged;
}

View file

@ -14,56 +14,19 @@
#include "W65816FrameLowering.h"
#include "W65816InstrInfo.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
// "Wide" = the value must occupy a 16-bit register at some point in
// the function body. i1/i8 fit in 8-bit M. Pointer operands that are
// constant addresses (globals, externs) are also fine — they become
// immediate operands of LDA/STA rather than values held in A. A
// non-constant pointer (function arg, computed value) does need to
// sit in A as 16 bits for stack-relative-indirect addressing.
static bool isWideTyForMode(Type *T, const llvm::Value *V) {
  if (!T)
    return false;
  if (T->isVoidTy())
    return false;
  // 8-bit and 1-bit integers live comfortably in 8-bit M.
  if (T->isIntegerTy(8) || T->isIntegerTy(1))
    return false;
  if (T->isPointerTy()) {
    // Constant-address pointers never occupy a register.
    const bool ConstAddr = V && (isa<GlobalValue>(V) || isa<Constant>(V));
    if (ConstAddr)
      return false;
  }
  // Everything else (i16, computed pointers, ...) needs 16-bit M.
  return true;
}
// Some IR ops, even when their visible types are all i8, lower to
// sequences that need 16-bit M during execution: signed compares (via
// SEXT to i16 + cmp), i8 shifts (promoted through i16, whether by
// libcall or by LowerShift), and any sext of an i8 (synthesized as a
// SELECT_CC with i16 mask ops). Detect those here so the prologue can
// pick 16-bit M up front.
static bool instrLowersToWide(const Instruction &I) {
  // Signed compare of i8 operands: lowered through i16.
  if (const auto *Cmp = dyn_cast<ICmpInst>(&I))
    if (Cmp->isSigned() &&
        Cmp->getOperand(0)->getType()->isIntegerTy(8))
      return true;
  // sext from i8: synthesized with i16 mask operations.
  if (isa<SExtInst>(&I) &&
      I.getOperand(0)->getType()->isIntegerTy(8))
    return true;
  // i8 shifts of any kind route through i16.
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return I.getType()->isIntegerTy(8);
  default:
    return false;
  }
}
// (The pure-i8-detection helpers were removed when the prologue went
// to "always 16-bit M". See emitPrologue comment.)
W65816FrameLowering::W65816FrameLowering(const W65816Subtarget &STI)
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(1), 0,
@ -79,7 +42,18 @@ bool W65816FrameLowering::hasFPImpl(const MachineFunction &MF) const {
}
// Whether PEI may treat the call-argument area as a fixed, reserved
// part of the frame. Always false for this target — see below.
bool W65816FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  // Defect fixed: the old conditional
  //   return !MF.getFrameInfo().hasVarSizedObjects();
  // preceded the unconditional `return false;`, making the correct
  // (and carefully documented) answer unreachable dead code.
  //
  // Returning false is required for correctness: LowerCall pushes
  // outgoing args via PUSH16 (PHA), which incrementally shifts SP
  // between ADJCALLSTACKDOWN and ADJCALLSTACKUP. With a reserved
  // call frame, PEI assumes SP is stable across calls and bakes
  // FrameOffset+StackSize into LDA_StackRel. Then any FI access
  // that the scheduler interleaves with pushed args (e.g. loading
  // a *later* arg from the caller's frame to push it) reads from
  // the wrong offset — silently miscompiling 2+ arg libcalls.
  // hasReservedCallFrame=false makes PEI add the DOWN-amount to
  // FI offsets between ADJCALLSTACKDOWN and ADJCALLSTACKUP,
  // recovering correctness.
  (void)MF; // decision is unconditional; parameter kept for the interface
  return false;
}
void W65816FrameLowering::emitPrologue(MachineFunction &MF,
@ -95,41 +69,22 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
// Heuristic: choose 8-bit M (REP #$10 + SEP #$20) only for "pure-i8"
// functions — those whose signature and body use no type wider than
// i8 (no i16 ops, no pointers). Any wider type forces 16-bit M
// (REP #$30) since pointer dereferences and stack-relative addressing
// need M=1 to load/store 16 bits at a time. In 16-bit M functions,
// individual i8 ops are wrapped with SEP/REP at the pseudo level.
// A future REP/SEP scheduling pass (design doc 3.3) will replace
// this whole-function decision with a per-region one.
const Function &F = MF.getFunction();
bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
for (const Argument &Arg : F.args()) {
if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
}
if (!HasWide) {
for (const BasicBlock &BB : F) {
if (HasWide) break;
for (const Instruction &I : BB) {
if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
if (instrLowersToWide(I)) { HasWide = true; break; }
for (const Value *Op : I.operands()) {
if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
}
if (HasWide) break;
}
}
}
bool UsesAcc8 = !HasWide;
// Always enter in 16-bit M+X (REP #$30). Per-instruction i8 ops wrap
// themselves with SEP #$20 / REP #$20 in their AsmPrinter expansion;
// W65816SepRepCleanup coalesces adjacent toggles so back-to-back i8
// ops collapse into a single SEP/REP region (recovering the byte-
// heavy "pure-i8" prologue's efficiency without its hazards).
//
// The earlier "pure-i8" heuristic (REP #$10 + SEP #$20 prologue) was
// a silent miscompile: late-stage i8→i16 sign extension and any other
// i16 op the back-end emits *without* a wrap — `and #$ff`, `eor #$80`,
// `adc #$ff80`, etc. — would assemble as 3-byte i16 immediates but
// execute in M=1 where the CPU only reads the low byte. The next
// immediate byte then becomes the next opcode (often $00 = BRK).
// Caught by tracing inc_g for `char inc_g(void) { g++; return g; }`.
(void)MRI;
if (UsesAcc8) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x10);
BuildMI(MBB, MBBI, DL, TII.get(W65816::SEP)).addImm(0x20);
} else {
MF.getInfo<W65816MachineFunctionInfo>()->setUsesAcc8(false);
BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x30);
}
// Reserve stack space for locals/spills.
//
@ -152,18 +107,35 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
// and corrupt it (was a latent silent crash for 8-bit M functions
// that needed any spilling).
uint64_t StackSize = MF.getFrameInfo().getStackSize();
bool HasVLA = MF.getFrameInfo().hasVarSizedObjects();
// For VLA functions, save entry SP to DP $F4..$F5 BEFORE any frame
// allocation so the epilogue can restore it directly (undoing both
// the static frame and any dynamic_stackalloc bytes). $F4 is the
// saved-SP slot; $F0..$F1 is reserved for i64 return high-half;
// $E0..$EF is libcall scratch. TAY around the TSC preserves A
// (which holds arg0).
if (HasVLA) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A
BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); // A = SP
BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF4);
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A
}
if (StackSize > 0) {
if (UsesAcc8) {
// 8-bit M: 1 PHA per byte. Preserves A.
for (uint64_t i = 0; i < StackSize; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA));
} else if (StackSize <= 14 && (StackSize % 2) == 0) {
// 16-bit M, small frame: N/2 PHAs. Preserves A.
// Cycle math: each PHA is 4 cyc; the TSC-sequence (TAY+TSC+SEC+
// SBC+TCS+TYA) is 13 cyc total. N PHAs win on cycles when 4*N <= 13,
// i.e. up to 3 PHAs (6-byte frame). At N=4 (8 bytes): 16 cyc PHAs vs
// 13 cyc TSC-seq → TSC wins. Threshold at 6 bytes for speed.
// (Bytes: N PHAs cost N bytes; TSC-seq costs 8 bytes. We're
// optimizing for speed per the project directive.)
if (StackSize <= 6 && (StackSize % 2) == 0) {
// Small frame: N/2 PHAs. Preserves A.
for (uint64_t i = 0; i < StackSize / 2; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA));
} else {
// 16-bit M, larger frame: TAY/TSC/.../TYA bracket. Preserves A
// via Y as a temp.
// Larger frame: TAY/TSC/.../TYA bracket. Preserves A via Y as a
// temp.
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC));
BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC));
@ -180,7 +152,8 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
// Mirror image of the prologue: release any reserved frame bytes
// before the RTL.
uint64_t StackSize = MF.getFrameInfo().getStackSize();
if (StackSize == 0)
bool HasVLA = MF.getFrameInfo().hasVarSizedObjects();
if (StackSize == 0 && !HasVLA)
return;
const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
@ -189,46 +162,27 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
// Insert before the terminator (the return).
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
// Mirror the prologue's pure-i8 detection: skip the 16-bit stack
// adjustment only if the function ran in 8-bit M (no wide types
// anywhere).
const Function &F = MF.getFunction();
bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
if (!HasWide) {
for (const Argument &Arg : F.args()) {
if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
}
}
if (!HasWide) {
for (const BasicBlock &BB : F) {
if (HasWide) break;
for (const Instruction &I : BB) {
if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
if (instrLowersToWide(I)) { HasWide = true; break; }
for (const Value *Op : I.operands()) {
if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
}
if (HasWide) break;
}
}
}
// 8-bit M epilogue. Save A in Y(low) via TAY, pop N bytes via N
// PLAs (each pops 1 byte in 8-bit M), restore A via TYA. Y is
// caller-saved by our ABI so we can use it freely. Total cost:
// N + 2 bytes per epilogue.
if (!HasWide) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A in Y
for (uint64_t i = 0; i < StackSize; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PLA)); // pop frame bytes
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A from Y
// VLA cleanup: restore entry SP from DP $F4 (saved in prologue).
// This subsumes BOTH the static frame and any dynamic_stackalloc
// bytes — we can skip the per-byte PLY/PLA loop entirely. Preserve
// A through TAY/TYA since it holds the return value.
if (HasVLA) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
BuildMI(MBB, MBBI, DL, TII.get(W65816::LDA_DP)).addImm(0xF4);
BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS));
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA));
return;
}
// Prologue is always 16-bit M now (see emitPrologue). No 8-bit
// epilogue branch needed.
// 16-bit M epilogue. Mirror the prologue: A holds the return value
// at this point and MUST be preserved. Small frames release via
// N/2 PLY (pop into Y, discard); larger frames use
// TAY/TSC/CLC/ADC #N/TCS/TYA.
if (StackSize <= 14 && (StackSize % 2) == 0) {
// Mirror the prologue threshold (see comment there).
if (StackSize <= 6 && (StackSize % 2) == 0) {
for (uint64_t i = 0; i < StackSize / 2; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PLY));
return;

View file

@ -84,7 +84,11 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
// expansions that load through that pointer and bump it. This makes
// <stdarg.h>-style functions (e.g. printf-likes) compile cleanly.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
// Custom VAARG so we DON'T align the va_list pointer. The default
// expansion rounds up to the type's preferred alignment (S16 = 2),
// but caller-pushed args land at PHA's resulting odd S+1 address.
// Aligning would skip the low byte and read garbage.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
@ -99,6 +103,20 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::MUL, MVT::i16, LibCall);
// CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the
// type legalizer rewrite into a sequence of basic ops. Without
// this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
// or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
// Select" at isel.
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::SREM, MVT::i16, LibCall);
@ -167,10 +185,21 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
// to UINT_MAX makes LLVM never form a jump table.
setMinimumJumpTableEntries(UINT_MAX);
// Variable-length arrays / dynamic stack allocation. Lowered to
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
// allocated region. Limitation: this shifts SP, so any FrameIndex
// accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
// (we have no frame pointer). Suitable for the common pattern
// "alloca; initialise; pass; return"; complex VLA use mixed with
// local-variable access across the alloca will miscompile. A real
// FP (DP slot or X-as-FP) would lift this restriction.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
// Opt into PerformDAGCombine on LOAD nodes — needed for the
// address-select reverse combine (see W65816TargetLowering::
// PerformDAGCombine).
setTargetDAGCombine(ISD::LOAD);
// setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
setTargetDAGCombine(ISD::SHL);
}
// Map an LLVM SETCC condition to a W65816 branch. Returns the condition
@ -369,6 +398,34 @@ SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
}
// VAARG: load the value at *ap, then advance ap by the byte size of
// the requested type. Unlike the default expansion we do NOT round ap
// up to the type's preferred alignment — caller-pushed varargs sit at
// byte-granular stack addresses (PHA from an odd S leaves the low
// byte at S+1 which is even, but our prologue's TSC-sequence can
// produce odd S, etc.). Aligning ap would step past the pushed
// value's low byte.
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  SDLoc Loc(Op);
  SDValue InChain = Op.getOperand(0);
  SDValue ListPtr = Op.getOperand(1); // address of the va_list slot
  EVT ResVT = Op.getValueType();

  // ap = *va_list
  SDValue CurAp =
      DAG.getLoad(MVT::i16, Loc, InChain, ListPtr, MachinePointerInfo());
  // value = *ap
  SDValue Result = DAG.getLoad(ResVT, Loc, CurAp.getValue(1), CurAp,
                               MachinePointerInfo());
  // *va_list = ap + sizeof(type), rounded up to whole bytes (i8 takes
  // 1; i16/i32/i64 their byte size). No extra alignment on purpose.
  unsigned AdvBytes = (ResVT.getSizeInBits() + 7) / 8;
  SDValue BumpedAp = DAG.getNode(ISD::ADD, Loc, MVT::i16, CurAp,
                                 DAG.getConstant(AdvBytes, Loc, MVT::i16));
  SDValue OutChain = DAG.getStore(Result.getValue(1), Loc, BumpedAp, ListPtr,
                                  MachinePointerInfo());
  return DAG.getMergeValues({Result, OutChain}, Loc);
}
// VASTART: store the address of the first vararg slot (recorded by
// LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer.
// va_list is just `i16 *next` here — minimum implementation.
@ -395,20 +452,73 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDynamicStackalloc(Op, DAG);
default:
llvm_unreachable("W65816: unexpected operation in LowerOperation");
}
}
// Map inline-asm register constraints ("a", "x", "y", "r", and their
// brace-wrapped long forms) to a physical register + register class.
// The class width follows the operand's type: i8 picks the 8-bit
// classes; anything else (i16 and pointers) picks the 16-bit ones.
// "r" is an alias for the accumulator.
std::pair<unsigned, const TargetRegisterClass *>
W65816TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  // Strip leading '{' and trailing '}' for the long form.
  StringRef Name = Constraint;
  if (Name.size() >= 2 && Name.front() == '{' && Name.back() == '}')
    Name = Name.substr(1, Name.size() - 2);
  const bool Narrow = (VT == MVT::i8);
  const TargetRegisterClass *AccRC =
      Narrow ? &W65816::Acc8RegClass : &W65816::Acc16RegClass;
  const TargetRegisterClass *IdxRC =
      Narrow ? &W65816::Idx8RegClass : &W65816::Idx16RegClass;
  if (Name == "a" || Name == "r")
    return {W65816::A, AccRC};
  if (Name == "x")
    return {W65816::X, IdxRC};
  if (Name == "y")
    return {W65816::Y, IdxRC};
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
// (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain).
// Emitted as a single W65816ISD::ALLOCA node, which lowers to
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
// allocated region. The prologue stashes entry SP in DP $F4 when
// MFI.hasVarSizedObjects, and the epilogue restores SP from $F4.
//
// Limitation: any FrameIndex (local, spill slot, parameter) accessed
// *after* the alloca reads from a wrong stack-relative offset because
// PEI bakes FI offsets relative to the static-frame SP, not the
// post-alloca SP. A real frame pointer would lift this; for now we
// accept the limitation and document it. The simplest safe pattern
// is "VLA at end of function, used immediately, no further FI access";
// anything else is at-your-own-risk until FP support lands.
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue AllocNode =
      DAG.getNode(W65816ISD::ALLOCA, Loc,
                  DAG.getVTList(MVT::i16, MVT::Other),
                  Op.getOperand(0),  // incoming chain
                  Op.getOperand(1)); // allocation size in bytes
  return DAG.getMergeValues({AllocNode.getValue(0), AllocNode.getValue(1)},
                            Loc);
}
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
// (preserves sign for arithmetic right shift); SHL/SRL via ZEXT
// (logical / left shifts don't care about high bits). This routes
// i8 shifts through the same i16 fast paths and libcalls — no
// parallel qi3 libcall set needed.
// parallel qi3 libcall set needed. The DAG combiner would otherwise
// narrow `(trunc (shl (zext X), K))` back to `(shl X, K)` of i8,
// re-entering this hook in an infinite loop; the
// `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override above
// disables that combine.
if (Op.getValueType() == MVT::i8) {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
@ -419,6 +529,20 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
SDValue N16 = N.getValueType() == MVT::i16
? N
: DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N);
// Special case: i8 SRA by 7 of a sign-extended value is the
// sign-fill operation — every result bit is the input's bit 7.
// For sext(i8 x), bit 15 == bit 7, so `(sra (sext x), 7)` yields
// the same result as `(sra (sext x), 15)`, which we have a tight
// 4-insn pattern for via SRA15A. Avoids the __ashrhi3 libcall
// (~10 insns plus arg push/pop overhead) — abs8 dropped from 47
// to 35 insns with this rewrite in place.
if (Op.getOpcode() == ISD::SRA) {
if (auto *C = dyn_cast<ConstantSDNode>(N)) {
if (C->getZExtValue() == 7) {
N16 = DAG.getConstant(15, DL, MVT::i16);
}
}
}
SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16);
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
}
@ -435,11 +559,18 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
SDValue Amount = Op.getOperand(1);
if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
uint64_t N = C->getZExtValue();
if (N >= 1 && N <= 4)
// SHL/SRL by 1..7 chain ASLA16/LSRA16; by 8 use SHL8A/SRL8A; by 9..14
// chain on top of those. All have inline tablegen patterns.
if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
N >= 1 && N <= 14)
return Op;
if ((N == 15 || N == 8) &&
// SHL/SRL by 15 is just (asl/ror to put bit 0/15 into low/high).
if (N == 15 &&
(Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
return Op;
// SRA only has inline patterns at 1 and 15 (sign-fill).
if (N == 1 && Op.getOpcode() == ISD::SRA)
return Op;
if (N == 15 && Op.getOpcode() == ISD::SRA)
return Op;
}
@ -579,11 +710,11 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CLI.IsTailCall)
CLI.IsTailCall = false;
// Up to 2 return values: i8/i16 in A, or split i32 in A:X. The
// result-read loop at the end of this function honors the same
// ordering as LowerReturn.
if (Ins.size() > 2)
report_fatal_error("W65816: multi-return calls not yet supported");
// Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X;
// i64 in A:X:Y plus DP $F0..$F1 for the highest half. See
// LowerReturn comment for the ABI.
if (Ins.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
// Indirect calls (function pointers): redirect through the runtime
// trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
@ -713,20 +844,29 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
Glue = Chain.getValue(1);
// Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X.
if (Ins.size() > 2)
report_fatal_error("W65816: return type not yet supported");
static constexpr Register RetRegs[2] = {W65816::A, W65816::X};
// Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X,
// i64 in A:X:Y plus a load from DP $F0 for the highest half.
if (Ins.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
static constexpr Register RetRegs[3] = {W65816::A, W65816::X, W65816::Y};
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
MVT VT = Ins[i].VT;
if (VT != MVT::i16 && VT != MVT::i8)
report_fatal_error("W65816: return type not yet supported");
if (i == 1 && VT != MVT::i16)
report_fatal_error("W65816: split return must be i16");
report_fatal_error("W65816: return half must be i8 or i16");
if (i >= 1 && VT != MVT::i16)
report_fatal_error("W65816: split return halves must all be i16");
if (i < 3) {
SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue);
Chain = V.getValue(1);
Glue = V.getValue(2);
InVals.push_back(V);
} else {
// 4th half: load from DP $F0.
SDValue DPAddr = DAG.getConstant(0xF0, DL, MVT::i16);
SDValue V = DAG.getLoad(VT, DL, Chain, DPAddr, MachinePointerInfo());
Chain = V.getValue(1);
InVals.push_back(V);
}
}
return Chain;
@ -740,36 +880,52 @@ SDValue W65816TargetLowering::LowerReturn(
// Return ABI:
// i8/i16: value in A.
// i32: low half (Outs[0]) in A, high half (Outs[1]) in X.
// i64: halves in A, X, Y, and a fixed direct-page slot at $F0..$F1
// (Outs[0..2] -> A,X,Y; Outs[3] stored to the DP slot).
// wider: not yet supported.
// Type legalization splits an i32 return into 2 consecutive i16 Outs.
// Emission order matters: we copy the high half to X *first* so that
// the regalloc can place both halves through the only Acc16 reg (A)
// without conflict. The TAX in copyPhysReg preserves A, so the
// subsequent copy of the low half to A doesn't clobber the high.
// Emitting low->A first would force a spill since computing the high
// would overwrite A while the low is still live for RTL.
if (Outs.size() > 2)
report_fatal_error("W65816: return type not yet supported");
// Type legalization splits an i32 into 2 consecutive i16 Outs and an
// i64 into 4. Emission order matters: we copy the *highest* halves
// first so that the regalloc can place each through A (the only
// ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves
// A, so subsequent low-half copies to A don't clobber.
if (Outs.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
for (unsigned i = 0; i != Outs.size(); ++i) {
MVT VT = Outs[i].VT;
if (VT != MVT::i16 && VT != MVT::i8)
report_fatal_error("W65816: return type not yet supported");
if (i == 1 && VT != MVT::i16)
report_fatal_error("W65816: split return must be i16");
report_fatal_error("W65816: return half must be i8 or i16");
if (i >= 1 && VT != MVT::i16)
report_fatal_error("W65816: split return halves must all be i16");
}
SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
if (Outs.size() == 2) {
SmallVector<SDValue, 8> RetOps(1, Chain);
// Outs[3] -> store to DP $F0 (only for i64 returns). Done first so
// its computation can use A freely before A holds the low result.
if (Outs.size() >= 4) {
SDValue DPAddr = DAG.getConstant(0xF0, DL, MVT::i16);
Chain = DAG.getStore(Chain, DL, OutVals[3], DPAddr, MachinePointerInfo());
}
// Outs[2] -> Y.
if (Outs.size() >= 3) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, OutVals[2], Glue);
Glue = Chain.getValue(1);
}
// Outs[1] -> X.
if (Outs.size() >= 2) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
Glue = Chain.getValue(1);
}
// Outs[0] -> A.
if (!Outs.empty()) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
}
if (Outs.size() == 2)
if (Outs.size() >= 2)
RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
if (Outs.size() >= 3)
RetOps.push_back(DAG.getRegister(W65816::Y, Outs[2].VT));
RetOps[0] = Chain;
if (Glue.getNode())
@ -778,83 +934,33 @@ SDValue W65816TargetLowering::LowerReturn(
return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
// DAG combine: undo clang's `load(SELECT_CC(fi, fi))` rewrite of
// `c ? *p : *q` when both ptrs are FrameIndex. Without this, the
// SELECT_CC matcher (which expects Acc16 inputs) fails to match the
// FrameIndex tval/fval. We rewrite back to the original
// `SELECT_CC(load(fi), load(fi))` shape — safe because both stack
// slots are guaranteed valid memory. We deliberately do NOT do this
// for arbitrary pointers, since reading from both branches could
// touch invalid memory or memory-mapped IO with side effects.
SDValue
W65816TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (N->getOpcode() != ISD::LOAD)
return SDValue();
LoadSDNode *Ld = cast<LoadSDNode>(N);
if (!Ld->isSimple())
return SDValue();
SDValue Ptr = Ld->getBasePtr();
// Pre-legalize SELECT (cond, T, F): undo the address-select if both
// pointer operands are FrameIndex.
if (Ptr.getOpcode() == ISD::SELECT) {
SDValue T = Ptr.getOperand(1);
SDValue F = Ptr.getOperand(2);
if (T.getOpcode() != ISD::FrameIndex ||
F.getOpcode() != ISD::FrameIndex)
return SDValue();
// (shl i32 X, K) -> chain of K (add x, x) for small K. After type
// legalisation the i32 add splits via ADDC/ADDE pseudos which expand
// to native ASL/ROL + carry-chain — much cheaper than the type-
// legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick
// to compute the bit crossing the half boundary. Each ADD expands to
// ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for
// K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2.
// `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`)
// benefits the most. i16 SHL by 1..15 has dedicated ASLA16 patterns
// already, so we restrict the rewrite to i32+.
if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32) {
if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
uint64_t K = C->getZExtValue();
if (K >= 1 && K <= 2) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDValue X = N->getOperand(0);
SDLoc DL(N);
SDValue Chain = Ld->getChain();
MachineFunction &MF = DAG.getMachineFunction();
int TFI = cast<FrameIndexSDNode>(T)->getIndex();
int FFI = cast<FrameIndexSDNode>(F)->getIndex();
SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
MachinePointerInfo::getFixedStack(MF, TFI));
SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
MachinePointerInfo::getFixedStack(MF, FFI));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
LoadT.getValue(1), LoadF.getValue(1));
SDValue NewSel = DAG.getNode(ISD::SELECT, DL, VT,
Ptr.getOperand(0), LoadT, LoadF);
DCI.CombineTo(N, NewSel, NewChain);
return SDValue(N, 0);
EVT VT = N->getValueType(0);
SDValue R = X;
for (uint64_t i = 0; i < K; ++i)
R = DAG.getNode(ISD::ADD, DL, VT, R, R);
return R;
}
}
// Match either pre-legalize ISD::SELECT_CC (LHS,RHS,T,F,CC) or our
// post-legalize W65816ISD::SELECT_CC (T,F,CC,glue). We only sink the
// load into both branches when both branch values are FrameIndex —
// safe because stack slots are guaranteed valid memory. For
// arbitrary pointers, side-effecting reads make this unsafe.
if (Ptr.getOpcode() == ISD::SELECT_CC) {
SDValue T = Ptr.getOperand(2);
SDValue F = Ptr.getOperand(3);
if (T.getOpcode() != ISD::FrameIndex ||
F.getOpcode() != ISD::FrameIndex)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue Chain = Ld->getChain();
MachineFunction &MF = DAG.getMachineFunction();
int TFI = cast<FrameIndexSDNode>(T)->getIndex();
int FFI = cast<FrameIndexSDNode>(F)->getIndex();
SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
MachinePointerInfo::getFixedStack(MF, TFI));
SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
MachinePointerInfo::getFixedStack(MF, FFI));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
LoadT.getValue(1), LoadF.getValue(1));
SDValue NewSel = DAG.getNode(ISD::SELECT_CC, DL, VT,
Ptr.getOperand(0), Ptr.getOperand(1),
LoadT, LoadF, Ptr.getOperand(4));
DCI.CombineTo(N, NewSel, NewChain);
return SDValue(N, 0);
}
return SDValue();
}
@ -1076,9 +1182,11 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return BB;
}
case W65816::SELECT_CC8:
case W65816::SELECT_CC16: {
const W65816Subtarget &STI = BB->getParent()->getSubtarget<W65816Subtarget>();
const W65816InstrInfo &TII = *STI.getInstrInfo();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@ -1095,32 +1203,93 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
unsigned CC = MI.getOperand(3).getImm();
// Helper: if `OpReg` is defined by a single-use, side-effect-free,
// constant-source LDA in thisMBB, MOVE that LDA into `DstMBB` (at
// its start). Returns true on success.
auto tryHoistConstInit = [&](Register OpReg,
MachineBasicBlock *DstMBB) -> bool {
if (!OpReg.isVirtual()) return false;
if (!MRI.hasOneNonDBGUse(OpReg)) return false;
MachineInstr *Def = MRI.getUniqueVRegDef(OpReg);
if (!Def || Def->getParent() != thisMBB) return false;
if (Def->getOpcode() != W65816::LDAi16imm &&
Def->getOpcode() != W65816::LDAi8imm)
return false;
if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm())
return false;
Def->removeFromParent();
DstMBB->insert(DstMBB->begin(), Def);
return true;
};
Register TValReg = MI.getOperand(1).getReg();
Register FValReg = MI.getOperand(2).getReg();
auto IsConstLda = [&](Register R) {
if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R)) return false;
MachineInstr *D = MRI.getUniqueVRegDef(R);
return D && D->getParent() == thisMBB &&
(D->getOpcode() == W65816::LDAi16imm ||
D->getOpcode() == W65816::LDAi8imm) &&
D->getNumOperands() >= 2 && D->getOperand(1).isImm();
};
bool BothConst = (CC < W65816CC::COND_GT_MB) &&
IsConstLda(TValReg) && IsConstLda(FValReg);
if (BothConst) {
// 4-block diamond: thisMBB has only the test (CMP) and Bxx; the
// tval and fval LDAs each live in their own destination block,
// which is reached only via the branch — so neither LDA's flag
// side-effect can corrupt the CMP→Bxx test window. This is the
// proper fix for the "LDA between CMP and Bxx" bug catalogued in
// project_known_issue_lda_flags.md (replacing the earlier 3-block
// workaround that only hoisted fval).
//
// thisMBB: ...; CMP; Bxx tvalMBB
// copy0MBB: LDA #fval; BRA sinkMBB (FALSE path)
// tvalMBB: LDA #tval (TRUE path; falls to sink)
// sinkMBB: PHI [tval from tvalMBB, fval from copy0MBB]
MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(sinkMBB->getIterator(), tvalMBB);
BB->addSuccessor(copy0MBB);
BB->addSuccessor(tvalMBB);
copy0MBB->addSuccessor(sinkMBB);
tvalMBB->addSuccessor(sinkMBB);
unsigned BrOp = getBranchOpcodeForCC(CC);
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB);
BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB);
tryHoistConstInit(TValReg, tvalMBB);
tryHoistConstInit(FValReg, copy0MBB);
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
MI.getOperand(0).getReg())
.addReg(TValReg).addMBB(tvalMBB)
.addReg(FValReg).addMBB(copy0MBB);
} else {
// 3-block diamond: keep the existing layout and (where possible)
// hoist fval into copy0MBB. Used when one or both operands are
// computed values (not constants), or when the multi-branch CC
// requires two Bxx in thisMBB.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
unsigned CC = MI.getOperand(3).getImm();
if (CC < W65816CC::COND_GT_MB) {
// Single-branch: Bxx sinkMBB.
unsigned BrOp = getBranchOpcodeForCC(CC);
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB);
} else {
// Multi-branch: two Bxx. Each may target sinkMBB (true) or
// copy0MBB (false). Fall-through is the OTHER block.
MultiBranch MB = getMultiBranch(CC);
MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB;
MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB;
BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1);
BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2);
}
// copy0MBB falls through to sinkMBB.
copy0MBB->addSuccessor(sinkMBB);
// sinkMBB: dst = PHI [tval, thisMBB], [fval, copy0MBB].
tryHoistConstInit(FValReg, copy0MBB);
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
MI.getOperand(0).getReg())
.addReg(MI.getOperand(1).getReg()).addMBB(thisMBB)
.addReg(MI.getOperand(2).getReg()).addMBB(copy0MBB);
.addReg(TValReg).addMBB(thisMBB)
.addReg(FValReg).addMBB(copy0MBB);
}
MI.eraseFromParent();
return sinkMBB;

View file

@ -82,6 +82,33 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
// Inline-asm register constraints. Supports:
// "a" / "{a}" — accumulator (A) — Acc16 (or Acc8 for i8 type)
// "x" / "{x}" — index X — Idx16 (or Idx8)
// "y" / "{y}" — index Y — Idx16 (or Idx8)
// "r" — any allocatable register — Acc16 by default
// Letting users name A/X/Y opens up direct toolbox-call sequences,
// hand-written math kernels, and any other place where the back-end
// doesn't already know to use a particular reg.
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const override;
// Route the single-letter constraints 'a', 'x', 'y' (and 'r') to the
// register-class path so SelectionDAGBuilder hands them to
// getRegForInlineAsmConstraint above instead of failing with
// "unknown asm constraint."
ConstraintType getConstraintType(StringRef Constraint) const override {
  if (Constraint.size() != 1)
    return TargetLowering::getConstraintType(Constraint);
  const char C = Constraint[0];
  if (C == 'a' || C == 'x' || C == 'y' || C == 'r')
    return C_RegisterClass;
  return TargetLowering::getConstraintType(Constraint);
}
// Force i32 / i64 shifts through a libcall (__ashlsi3 / __lshrsi3 /
// __ashrsi3) instead of LLVM's default ExpandToParts strategy, which
// emits an SHL_PARTS node we have no pattern for. ExpandToParts also
@ -96,6 +123,30 @@ public:
ExpansionFactor);
}
// i16 MUL lowers to the __mulhi3 libcall (~12 instructions). Telling
// the DAG combiner that a constant multiply may decompose into shifts
// and adds is profitable at that width: `(mul x, 3)` ->
// `(add x, (shl x, 1))` is ~5. At i32 the per-half shift/add/carry
// chain comes out larger than the __mulsi3 call, so wider types keep
// the libcall.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
                            SDValue C) const override {
  if (VT != MVT::i16)
    return false; // i32+ stays on the libcall path.
  return true;
}
// The DAG combiner narrows `(trunc (shl X, K))` to `(shl (trunc X), K)`
// when `isTypeDesirableForOp(SHL, NarrowVT)` says yes. Our LowerShift
// widens i8 SHL/SRL/SRA to `(trunc (shift (zext X), K))`; narrowing
// that back to an i8 shift re-enters LowerShift — an infinite loop
// that hung `unsigned char x << 1` at -O1/-O2. Declaring i8 shifts
// undesirable blocks that combine and keeps the op in i16 once widened.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override {
  const bool IsShiftOp =
      Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA;
  if (IsShiftOp && VT == MVT::i8)
    return false;
  return TargetLowering::isTypeDesirableForOp(Opc, VT);
}
private:
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
@ -104,6 +155,7 @@ private:
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const;
};
} // namespace llvm

View file

@ -30,6 +30,22 @@ W65816InstrInfo::W65816InstrInfo(const W65816Subtarget &STI)
W65816::ADJCALLSTACKUP),
RI() {}
// Maps IMGn to its DP address ($D0..$DE in steps of 2). Returns -1 if
// the reg isn't an IMG.
// Used by copyPhysReg below to turn A <-> IMGn copies into LDA dp /
// STA dp. The addresses form a fixed contract with the direct-page
// layout — NOTE(review): presumably reserved in the runtime's DP map;
// confirm before moving them.
static int imgDPAddr(Register R) {
  switch (R) {
  case W65816::IMG0: return 0xD0;
  case W65816::IMG1: return 0xD2;
  case W65816::IMG2: return 0xD4;
  case W65816::IMG3: return 0xD6;
  case W65816::IMG4: return 0xD8;
  case W65816::IMG5: return 0xDA;
  case W65816::IMG6: return 0xDC;
  case W65816::IMG7: return 0xDE;
  default: return -1;
  }
}
void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DestReg,
@ -57,6 +73,25 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(W65816::TYA));
return;
}
// A → IMGn / IMGn → A: STA dp / LDA dp. IMGn is DP-backed at fixed
// addresses $D0..$DE — see imgDPAddr above.
int srcImg = imgDPAddr(SrcReg);
int dstImg = imgDPAddr(DestReg);
if (DestReg == W65816::A && srcImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
return;
}
if (dstImg >= 0 && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
return;
}
// IMGn → IMGm: route through A. Caller is responsible for ensuring
// A is dead at this program point (regalloc usually arranges this).
if (srcImg >= 0 && dstImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
return;
}
llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
}
@ -134,3 +169,94 @@ bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo();
return MFI.isFixedObjectIndex(FIOp.getIndex());
}
// Running-SP accounting for PEI inside call sequences. Our
// ADJCALLSTACKDOWN does not pre-shift SP — each PUSH16/PUSH16X moves
// it incrementally (+2 bytes per push) — and ADJCALLSTACKUP's -N
// (N = total pushed bytes) cancels those contributions so SPAdj is
// back at 0 when the call sequence closes.
int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case W65816::ADJCALLSTACKDOWN:
    // No physical SP shift happens here; the pushes account for it.
    return 0;
  case W65816::ADJCALLSTACKUP:
    // Operand 0 (when present) carries the total byte count.
    if (MI.getNumOperands() == 0 || !MI.getOperand(0).isImm())
      return 0;
    return -static_cast<int>(MI.getOperand(0).getImm());
  case W65816::PUSH16:
  case W65816::PUSH16X:
    // One 16-bit value pushed: SP drops by 2.
    return 2;
  default:
    return TargetInstrInfo::getSPAdjust(MI);
  }
}
// Byte size of one MachineInstr, used by BranchExpand for distance
// estimates. Real instructions report their tablegen Size; pseudos
// that AsmPrinter expands get explicit estimates mirroring
// W65816AsmPrinter::emitInstruction; meta-instructions emit nothing.
unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  // PHI, COPY, KILL, IMPLICIT_DEF, BUNDLE, CFI_INSTRUCTION, DBG_VALUE
  // leave no bytes. COPY could be sized more precisely (1-2 bytes once
  // lowered), but 0 is fine: the estimate only needs to be a lower
  // bound for the branch-distance computation.
  if (MI.isMetaInstruction())
    return 0;

  switch (MI.getOpcode()) {
  // Expanded to PLA loops or a TSC/CLC/ADC/TCS bracket; ~8 worst case.
  case W65816::ADJCALLSTACKDOWN:
  case W65816::ADJCALLSTACKUP:
    return 8;
  // i8 ADC/SBC imm: SEP/REP wrap (6) plus one extra byte — cf. the
  // CLC/SEC prefix on the i16 ADC/SBC pseudos below.
  case W65816::ADCi8imm:
  case W65816::SBCi8imm:
    return 7;
  // Other i8 immediate ops wrap with SEP(2) + op(2) + REP(2).
  case W65816::LDAi8imm:
  case W65816::ANDi8imm:
  case W65816::ORAi8imm:
  case W65816::EORi8imm:
  case W65816::CMPi8imm:
    return 6;
  // i8 absolute load/store wrap: SEP(2) + op_Abs(3) + REP(2).
  case W65816::LDA8abs:
  case W65816::STA8abs:
    return 7;
  // STA8fi: SEP(2) + STA d,S(2) + REP(2).
  case W65816::STA8fi:
    return 6;
  // i16 ADC/SBC pseudos prepend CLC/SEC: 1 + 3.
  case W65816::ADCi16imm:
  case W65816::SBCi16imm:
  case W65816::ADCabs:
  case W65816::SBCabs:
    return 4;
  // ADDframe: TSC(1) + CLC(1) + ADC #imm(3).
  case W65816::ADDframe:
    return 5;
  // ALLOCAfi: STA dp(2) + TSC(1) + SEC(1) + SBC dp(2) + TCS(1) + INC A(1).
  case W65816::ALLOCAfi:
    return 8;
  // PUSH16 / PUSH16X: single-byte PHA / PHX.
  case W65816::PUSH16:
  case W65816::PUSH16X:
    return 1;
  // JSLpseudo: jsl long is 4 bytes.
  case W65816::JSLpseudo:
    return 4;
  default:
    break;
  }

  // Real (non-pseudo) instruction: tablegen-defined Size.
  if (unsigned Size = MI.getDesc().getSize())
    return Size;

  // Fallback for any pseudo we forgot to enumerate: 4 bytes is a
  // pessimistic-but-safe upper bound on most W65816 instructions.
  return 4;
}

View file

@ -69,6 +69,31 @@ public:
Register isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
// Byte-accurate size of an instruction (or an upper bound for
// pseudos that AsmPrinter expands to multiple MC instructions).
// Used by W65816BranchExpand to compute branch distances precisely
// enough to decide when to lengthen a conditional branch. Real
// instructions with a Size set in tablegen get that value;
// pseudos that emit nothing (PHI, COPY, ADJCALLSTACKDOWN/UP,
// KILL, IMPLICIT_DEF, REG_SEQUENCE, BUNDLE, etc.) report 0 bytes;
// codegen pseudos with Size==0 in tablegen but a non-trivial
// AsmPrinter expansion get an upper-bound estimate.
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
// PEI uses this to track the running SP shift inside a call
// sequence and pass it to eliminateFrameIndex as SPAdj. Our
// ADJCALLSTACKDOWN does NOT physically shift SP — the PUSH16/PUSH16X
// pseudos do that incrementally as args get pushed. Override the
// default so PEI knows: ADJCALLSTACKDOWN/UP contribute 0 (no SP
// shift), PUSH16/PUSH16X contribute +2 each (one byte-pair pushed).
// Without this override, PEI applies the full ADJCALLSTACKDOWN
// amount as SPAdj at the very *start* of the call sequence,
// producing FI offsets that pretend SP has already shifted — and
// any STAfi/LDAfi to a *local* before the actual PUSH16 happens
// ends up writing past the locals into the caller's stack
// (corrupting the return address, observed for `int eval(int a,
// int b, int c) { return a*b + c; }` under fast regalloc).
int getSPAdjust(const MachineInstr &MI) const override;
};
} // namespace llvm

View file

@ -79,6 +79,14 @@ def SDT_W65816SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
def W65816selectcc : SDNode<"W65816ISD::SELECT_CC", SDT_W65816SelectCC,
[SDNPInGlue]>;
// Dynamic stack allocation: takes (chain, size:i16) and returns
// (ptr:i16, chain). Lowers to TSC; SEC; SBC size; TCS; INC A in
// AsmPrinter. See LowerDynamicStackalloc.
// SDNPHasChain orders it against other stack traffic; SDNPSideEffect
// reflects the SP modification the expansion performs.
def SDT_W65816Alloca : SDTypeProfile<1, 1, [SDTCisVT<0, i16>,
                                            SDTCisVT<1, i16>]>;
def W65816alloca : SDNode<"W65816ISD::ALLOCA", SDT_W65816Alloca,
                          [SDNPHasChain, SDNPSideEffect]>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
@ -107,6 +115,17 @@ def ADDframe : W65816Pseudo<(outs Acc16:$dst),
(ins i16imm:$base, i16imm:$offset),
"# ADDframe PSEUDO", []>;
// VLA / dynamic_stackalloc: takes a 16-bit byte count in A, returns
// the address of the allocated region in A. Expanded at AsmPrinter
// time to: TSC; SEC; SBC count; TCS; INC A. Has side effects
// (changes SP). Both $dst and $size are tied to A (the $size = $dst
// constraint below); explicit Defs/Uses on SP keep regalloc honest
// about the side effect.
let Defs = [SP], Uses = [SP], hasSideEffects = 1,
    Constraints = "$size = $dst" in
def ALLOCAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$size),
                            "# ALLOCAfi $dst, $size",
                            [(set Acc16:$dst, (W65816alloca Acc16:$size))]>;
// The retglue node lowers directly to RTL (see Returns section below).
// No separate RET pseudo — the real MC instruction handles the pattern.
@ -139,6 +158,18 @@ def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst),
(W65816selectcc Acc16:$tval,
Acc16:$fval,
timm:$cc))]>;
// i8 mirror of SELECT_CC16. Without this, `c ? a : b` patterns where
// the result is i8 (e.g. `unsigned char to_lower(char c)`) fail isel
// with "Cannot Select" — a pre-existing bug. EmitInstrWithCustomInserter
// handles both the i8 and i16 forms identically; the only difference
// is the register class on the operands.
def SELECT_CC8 : W65816Pseudo<(outs Acc8:$dst),
                              (ins Acc8:$tval, Acc8:$fval, i8imm:$cc),
                              "# SELECT_CC8 $dst, $tval, $fval, $cc",
                              [(set Acc8:$dst,
                                    (W65816selectcc Acc8:$tval,
                                                    Acc8:$fval,
                                                    timm:$cc))]>;
}
//===----------------------------------------------------------------------===//
@ -151,15 +182,19 @@ def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst),
// pseudo here to its real MC counterpart.
//===----------------------------------------------------------------------===//
// NOTE: LDA / LDX physically update N and Z, but we deliberately do
// NOT model that with `Defs = [P]`. Adding `Defs = [P]` lets the
// scheduler legally place an LDA between CMP and Bxx (P just gets
// re-defined; the latest def is what Bxx tests) — same flag-corruption
// bug, different mechanism. The proper fix is the 4-block SELECT_CC
// inserter (landed) for SETCC patterns and a similar BR_CC stub-block
// pass (still TODO) for `while`/`for`/`if-goto` tests — see
// memory/project_known_issue_lda_flags.md.
let isAsCheapAsAMove = 1, isReMaterializable = 1,
hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm),
"# LDAi16imm $dst, $imm",
[(set Acc16:$dst, (i16 imm:$imm))]>;
// Materialise an i16 constant directly in X (Idx16). Useful when the
// constant's only consumer is `CopyToReg($x)` — saves an LDA+TAX
// round-trip (and the A-clobber that round-trip implies). Common for
// the high half of `(zext i16 to i32)` returns, where hi=const-zero.
let isReMaterializable = 1, isAsCheapAsAMove = 1, hasSideEffects = 0,
mayLoad = 0, mayStore = 0 in
def LDXi16imm : W65816Pseudo<(outs Idx16:$dst), (ins i16imm:$imm),
@ -405,6 +440,25 @@ def : Pat<(srl Acc16:$src, (i16 3)),
def : Pat<(srl Acc16:$src, (i16 4)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))>;
// Shift counts 5..7 chained single-bit shifts. Earlier these were
// withheld because the DAG combiner narrowed `(trunc (shl (zext X), N))`
// back to `(shl X, N)` on i8 and re-entered LowerShift in a loop; the
// `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override in
// W65816TargetLowering now blocks that combine, so the patterns are
// safe. Cheaper than __ashlhi3/__lshrhi3 for these counts.
def : Pat<(shl Acc16:$src, (i16 5)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))>;
def : Pat<(shl Acc16:$src, (i16 6)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src))))))>;
def : Pat<(shl Acc16:$src, (i16 7)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))))>;
def : Pat<(srl Acc16:$src, (i16 5)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))>;
def : Pat<(srl Acc16:$src, (i16 6)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))))>;
def : Pat<(srl Acc16:$src, (i16 7)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))))>;
// Increment / decrement of A by 1. Match `(add x, 1)` and `(add x, -1)`
// (LLVM canonicalises sub-by-1 to add-by-(-1)).
let Constraints = "$src = $dst",
@ -431,6 +485,13 @@ let Constraints = "$src = $dst",
def NEGA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
"# NEGA16 $dst, $src",
[(set Acc16:$dst, (sub (i16 0), Acc16:$src))]>;
// i8 mirror. Without this the codegen falls into the generic SBC
// path: `LDA #0; SEC; SBC slot` plus 8-bit M-mode prologue and
// PHA/PLA bracketing ~12 insns for `-x`. NEGA8 expands to
// `EOR #$FF; INA` (2 insns in 8-bit M).
def NEGA8 : W65816Pseudo<(outs Acc8:$dst), (ins Acc8:$src),
"# NEGA8 $dst, $src",
[(set Acc8:$dst, (sub (i8 0), Acc8:$src))]>;
}
// Multi-precision negation: lo + hi halves of `-x` where x is i32.
@ -535,6 +596,35 @@ def SHL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
"# SHL8A $dst, $src",
[(set Acc16:$dst, (shl Acc16:$src, (i16 8)))]>;
}
// Shift counts 9..14: SHL builds on SHL8A (XBA + low-byte mask) and chains
// 1..6 ASLs after it; SRL mirrors via SRL8A + LSRA chains. The
// isTypeDesirableForOp override prevents the i8-shift combine loop that
// kept these out of tablegen earlier.
def : Pat<(shl Acc16:$src, (i16 9)),
(ASLA16 (SHL8A Acc16:$src))>;
def : Pat<(shl Acc16:$src, (i16 10)),
(ASLA16 (ASLA16 (SHL8A Acc16:$src)))>;
def : Pat<(shl Acc16:$src, (i16 11)),
(ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))>;
def : Pat<(shl Acc16:$src, (i16 12)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))>;
def : Pat<(shl Acc16:$src, (i16 13)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))))>;
def : Pat<(shl Acc16:$src, (i16 14)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))))>;
def : Pat<(srl Acc16:$src, (i16 9)),
(LSRA16 (SRL8A Acc16:$src))>;
def : Pat<(srl Acc16:$src, (i16 10)),
(LSRA16 (LSRA16 (SRL8A Acc16:$src)))>;
def : Pat<(srl Acc16:$src, (i16 11)),
(LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))>;
def : Pat<(srl Acc16:$src, (i16 12)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))>;
def : Pat<(srl Acc16:$src, (i16 13)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))))>;
def : Pat<(srl Acc16:$src, (i16 14)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))))>;
// (sra x, 15): sign-fill yields $0000 if x is non-negative, $FFFF
// if negative. Used by i32 sext-from-i16 type-legalization for the
// hi half (avoids the __ashrhi3 libcall path). Sequence:
@ -585,11 +675,24 @@ let mayLoad = 1, hasSideEffects = 0, mayStore = 0,
def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr),
"# LDAfi $dst, $addr", []>;
}
let mayStore = 1, hasSideEffects = 0, mayLoad = 0 in {
// STAfi accepts Wide16 src so greedy can park the value in IMGn instead
// of A. When src is in IMGn, eliminateFrameIndex prepends a LDA dp;
// hence Defs = [A] (the IMG case clobbers A).
let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Defs = [A] in {
def STAfi : W65816Pseudo<(outs),
(ins Acc16:$src, memfi:$addr),
(ins Wide16:$src, memfi:$addr),
"# STAfi $src, $addr", []>;
}
// i8 truncating store to a FrameIndex slot. eliminateFrameIndex wraps
// it in SEP #$20 / STA d,S / REP #$20 so only one byte is written.
// Without the wrap, a 16-bit STA writes the byte at slot+1 too, which
// corrupts the next stack slot (or return address for the last slot of
// an alloca). Defs P because SEP/REP modify the M bit.
// NOTE(review): hasSideEffects = 1 presumably also fences this out of
// flag-sensitive windows (SEP/REP clobber P) — confirm that is the
// intent, not just conservatism.
let mayStore = 1, hasSideEffects = 1, mayLoad = 0, Defs = [P] in {
def STA8fi : W65816Pseudo<(outs),
                          (ins Acc16:$src, memfi:$addr),
                          "# STA8fi $src, $addr", []>;
// ComplexPattern bridging FrameIndex SDValues to memfi. See
// SelectFrameIndex in W65816ISelDAGToDAG.cpp.
@ -600,14 +703,13 @@ def : Pat<(i16 (load addr_fi:$addr)),
def : Pat<(store Acc16:$src, addr_fi:$addr),
(STAfi Acc16:$src, addr_fi:$addr)>;
// i8 access to a FrameIndex slot. The slots holding i8 values are
// allocated as 2 bytes (CC_W65816 promotes i8 args to i16; spills also
// align), so reading 2 bytes is safe even for an i8 value we just
// narrow to Acc8. Extending loads mask the high byte (zext) or leave
// it (anyext). Truncating store writes the full i16 (overwrites the
// 2-byte slot's high byte with whatever sits in A's high byte; safe
// since the slot holds an i8 and no other consumer reads that high
// byte).
// i8 access to a FrameIndex slot. Loads read 2 bytes via 16-bit LDA
// the high byte is harmless (extending loads mask or sign-extend it,
// narrowing loads narrow back to Acc8 / discard). Stores must write
// only one byte: i8 alloca arrays pack adjacent slots one byte apart,
// and a 16-bit STA at the last slot of the array would corrupt the
// return address. Truncating stores route through STA8fi which wraps
// the STA in SEP #$20 / REP #$20.
def : Pat<(i8 (load addr_fi:$addr)),
(COPY_TO_REGCLASS (LDAfi addr_fi:$addr), Acc8)>;
def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
@ -615,9 +717,9 @@ def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
def : Pat<(i16 (extloadi8 addr_fi:$addr)),
(LDAfi addr_fi:$addr)>;
def : Pat<(store Acc8:$src, addr_fi:$addr),
(STAfi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
(STA8fi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
def : Pat<(truncstorei8 Acc16:$src, addr_fi:$addr),
(STAfi Acc16:$src, addr_fi:$addr)>;
(STA8fi Acc16:$src, addr_fi:$addr)>;
// Frame-index folding into ADC / SBC / AND / ORA / EOR / CMP. Same
// shape as the *abs variants but the second operand is a stack slot.
@ -975,8 +1077,8 @@ def STP : InstImplied<0xDB, "stp">;
// AsmParser has no way to know the current M/X bits, so it always
// reaches for the _Imm16 form. Codegen can still select _Imm8
// explicitly once we have 8-bit patterns.
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; }
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; }
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; let Defs = [A]; }
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; let Defs = [A]; }
def LDA_DP : InstDP<0xA5, "lda">;
def LDA_Abs : InstAbs<0xAD, "lda">;
def LDA_Long : InstAbsLong<0xAF, "lda">;
@ -993,8 +1095,8 @@ def STA_AbsX : InstAbsX<0x9D, "sta">;
def STA_AbsY : InstAbsY<0x99, "sta">;
//---------------------------------------------------------------- LDX (load X)
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; }
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; }
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [X]; }
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; let Defs = [X]; }
def LDX_DP : InstDP<0xA6, "ldx">;
def LDX_Abs : InstAbs<0xAE, "ldx">;
def LDX_DPY : InstDPY<0xB6, "ldx">;
@ -1006,8 +1108,8 @@ def STX_Abs : InstAbs<0x8E, "stx">;
def STX_DPY : InstDPY<0x96, "stx">;
//---------------------------------------------------------------- LDY (load Y)
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; }
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; }
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [Y]; }
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; let Defs = [Y]; }
def LDY_DP : InstDP<0xA4, "ldy">;
def LDY_Abs : InstAbs<0xAC, "ldy">;
def LDY_DPX : InstDPX<0xB4, "ldy">;
@ -1109,14 +1211,18 @@ def ROR_DP : InstDP<0x66, "ror">;
def ROR_Abs : InstAbs<0x6E, "ror">;
//---------------------------------------------------------------- Transfers
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; }
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; }
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; }
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; }
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; }
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; }
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; }
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; }
// Defs/Uses metadata is critical: without it, machine-cp doesn't see
// that TAX (etc.) reads the source register, and may delete a `$a =
// COPY $x` immediately preceding it as a "dead store" corrupting
// the data flow. See feedback_w65816_implied_ops.md for the canary.
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [A]; }
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [A]; }
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [X]; }
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [Y]; }
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [X]; }
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [Y]; }
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; let Defs = [SP]; let Uses = [X]; }
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [SP]; }
def TCD : InstImplied<0x5B, "tcd"> { let mayLoad = 0; let mayStore = 0; }
def TDC : InstImplied<0x7B, "tdc"> { let mayLoad = 0; let mayStore = 0; }
def TCS : InstImplied<0x1B, "tcs"> { let mayLoad = 0; let mayStore = 0; }

View file

@ -34,6 +34,12 @@ class W65816MachineFunctionInfo : public MachineFunctionInfo {
/// Virtual register holding the struct-return pointer for sret returns.
Register SRetReturnReg;
/// True iff the function's prologue chose 8-bit M (SEP #$20). Pure-i8
/// functions run with M=1; everything else runs with M=0. AsmPrinter
/// reads this when expanding pseudos whose width depends on M (e.g.
/// STA8abs needs an SEP/REP wrap in M=0 to avoid a 2-byte store).
bool UsesAcc8 = false;
public:
W65816MachineFunctionInfo() = default;
@ -56,6 +62,9 @@ public:
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
bool getUsesAcc8() const { return UsesAcc8; }
void setUsesAcc8(bool V) { UsesAcc8 = V; }
};
} // namespace llvm

View file

@ -0,0 +1,152 @@
//===-- W65816NegYIndY.cpp - Fix negative-Y indirect addressing -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
//
//===----------------------------------------------------------------------===//
//
// Pre-emit peephole that rewrites
//
// LDY #imm ; imm signed-negative (>= 0x8000 unsigned)
// LDA (sr,S),Y ; or STA
//
// into
//
// LDA sr,S ; A = ptr
// CLC ; ADC #imm ; A = ptr + imm (signed add wraps within 16 bits in A)
// TAX ; X = adjusted ptr
// ; for LDA path: LDA $0000,X ; A = DBR:X
// ; for STA path: TAY (save A) ; ... ; TYA before STA $0000,X
//
// Why: the WDC W65816 spec says (sr,S),Y computes
//
// EA = (DBR | (mem16(sr+S) + Y)) MOD $1000000
//
// — a 24-bit add. When Y is signed-negative (e.g. $FFFE for "-2"), the
// addition crosses bank boundaries: ptr=$5DB3 + $FFFE = $015DB1, NOT
// $005DB1. Caught by `arr[-1]` and bubble-sort swaps with `arr[j-1]`.
//
// Using `abs,X` with operand $0000 and X = adjusted-ptr avoids the
// problem because X is < 16 bits and operand + X stays within DBR
// when the operand is small.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-neg-y-indy"
namespace {
/// Pre-emit peephole pass object (see file header for the rewrite it
/// performs).  Stateless: each runOnMachineFunction call scans one
/// function for the negative-Y (sr,S),Y pattern and rewrites it in place.
class W65816NegYIndY : public MachineFunctionPass {
public:
  static char ID; // pass identification
  W65816NegYIndY() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 negative-Y indirect-Y rewriter";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816NegYIndY::ID = 0;
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816NegYIndY, DEBUG_TYPE,
                "W65816 negative-Y indirect-Y rewriter", false, false)
// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816NegYIndY() { return new W65816NegYIndY(); }
/// Scan each block for `LDY #imm16` with a signed-negative immediate
/// followed by a stack-relative indirect-Y access, and rewrite the pair
/// so the address arithmetic cannot cross a bank boundary (see file
/// header).  Returns true if any rewrite was performed.
///
/// Fix over the previous version: the Y-invalidation switch was missing
/// LDY_Imm8 (which also writes Y), so a tracked `LastY` could go stale
/// across an 8-bit index-mode load; inline asm is now treated as a Y
/// clobber as well.
bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) {
  const W65816InstrInfo *TII =
      MF.getSubtarget<W65816Subtarget>().getInstrInfo();
  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    // LastY: the 16-bit immediate most recently loaded into Y in this
    // block, or -1 when unknown.  LastLDY: the LDY_Imm16 that loaded it,
    // erased together with the rewritten (sr,S),Y access.
    int LastY = -1;
    MachineInstr *LastLDY = nullptr;
    for (auto It = MBB.begin(), End = MBB.end(); It != End; ) {
      MachineInstr &MI = *It++; // advance first: MI may be erased below
      if (MI.isDebugInstr()) continue;
      unsigned Opc = MI.getOpcode();
      if (Opc == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
          MI.getOperand(0).isImm()) {
        LastY = (int)(MI.getOperand(0).getImm() & 0xFFFF);
        LastLDY = &MI;
        continue;
      }
      bool IsLDA = Opc == W65816::LDA_StackRelIndY;
      bool IsSTA = Opc == W65816::STA_StackRelIndY;
      if ((IsLDA || IsSTA) && LastY != -1 && (LastY & 0x8000)) {
        // Negative Y.  Rewrite via TAX + LDA/STA $0000,X.
        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm())
          continue;
        unsigned Disp = MI.getOperand(0).getImm() & 0xFF;
        DebugLoc DL = MI.getDebugLoc();
        if (IsLDA) {
          // LDA disp,S ; CLC ; ADC #neg ; TAX ; LDA $0000,X
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_StackRel))
              .addImm(Disp)
              .addReg(W65816::A, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::CLC))
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::ADC_Imm16))
              .addImm(LastY)
              .addReg(W65816::A, RegState::Implicit)
              .addReg(W65816::A, RegState::ImplicitDefine)
              .addReg(W65816::P, RegState::Implicit)
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::TAX));
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_AbsX))
              .addImm(0)
              .addReg(W65816::A, RegState::ImplicitDefine);
        } else { // STA
          // A holds the value to store.  TAY (save A in Y) ;
          // LDA disp,S ; CLC ; ADC #neg ; TAX ; TYA ; STA $0000,X
          BuildMI(MBB, MI, DL, TII->get(W65816::TAY));
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_StackRel))
              .addImm(Disp)
              .addReg(W65816::A, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::CLC))
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::ADC_Imm16))
              .addImm(LastY)
              .addReg(W65816::A, RegState::Implicit)
              .addReg(W65816::A, RegState::ImplicitDefine)
              .addReg(W65816::P, RegState::Implicit)
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::TAX));
          BuildMI(MBB, MI, DL, TII->get(W65816::TYA));
          BuildMI(MBB, MI, DL, TII->get(W65816::STA_AbsX))
              .addImm(0)
              .addReg(W65816::A, RegState::Implicit);
        }
        // Erase the original LDY and the (sr,S),Y access.
        if (LastLDY) { LastLDY->eraseFromParent(); LastLDY = nullptr; }
        MI.eraseFromParent();
        LastY = -1;
        Changed = true;
        continue;
      }
      // Anything that writes Y invalidates the tracked immediate.
      switch (Opc) {
      case W65816::TAY: case W65816::TXY:
      case W65816::INY: case W65816::DEY:
      case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
      case W65816::LDY_DPX: case W65816::LDY_AbsX:
      // FIX: LDY_Imm8 also writes Y (low byte, 8-bit index mode) and was
      // previously missing here, leaving LastY stale across it.
      case W65816::LDY_Imm8:
        LastY = -1;
        LastLDY = nullptr;
        break;
      default:
        // Calls clobber Y (caller-saved ABI); inline asm may do anything.
        if (MI.isCall() || MI.isInlineAsm()) {
          LastY = -1;
          LastLDY = nullptr;
        }
        break;
      }
    }
  }
  return Changed;
}

View file

@ -74,7 +74,47 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
bool IsSub = false;
switch (Opc) {
case W65816::LDAfi: NewOpc = W65816::LDA_StackRel; break;
case W65816::STAfi: NewOpc = W65816::STA_StackRel; break;
case W65816::STAfi: {
// Wide16-source STAfi: if the source ended up in IMGn (DP-backed),
// prepend LDA dp so the value reaches A before the actual store.
int FI = MI.getOperand(FIOperandNum).getIndex();
int FrameOffset = MFI.getObjectOffset(FI);
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
// +1 skew for locals: the 65816 SP points to next-FREE byte (empty
// descending), but LLVM PEI assigns FrameOffset assuming SP points
// to the first-USED byte (full descending). Without the +1, slot 0
// ends up at S+0 — exactly where the next JSL writes its return
// address bank. Args have positive FrameOffset (caller pushed them
// at S+1..S+N already, the JSL push naturally puts them at S+4+N
// in callee), so they don't need the skew.
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Offset += 1;
if (Offset < 0 || Offset > 0xFF)
report_fatal_error("W65816: frame offset out of stack-relative range");
Register Src = MI.getOperand(0).getReg();
int srcDP = -1;
switch (Src) {
case W65816::IMG0: srcDP = 0xD0; break;
case W65816::IMG1: srcDP = 0xD2; break;
case W65816::IMG2: srcDP = 0xD4; break;
case W65816::IMG3: srcDP = 0xD6; break;
case W65816::IMG4: srcDP = 0xD8; break;
case W65816::IMG5: srcDP = 0xDA; break;
case W65816::IMG6: srcDP = 0xDC; break;
case W65816::IMG7: srcDP = 0xDE; break;
default: break;
}
if (srcDP >= 0) {
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
TII.get(W65816::LDA_DP)).addImm(srcDP);
}
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
TII.get(W65816::STA_StackRel))
.addImm(Offset)
.addReg(W65816::A, RegState::Implicit);
MI.eraseFromParent();
return true;
}
case W65816::ADCfi: NewOpc = W65816::ADC_StackRel; NeedsCarryPrefix = true; break;
case W65816::SBCfi: NewOpc = W65816::SBC_StackRel; NeedsCarryPrefix = true; IsSub = true; break;
// ADCEfi / SBCEfi are the chained-carry variants used as the hi half of a
@ -88,6 +128,31 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
case W65816::CMPfi: NewOpc = W65816::CMP_StackRel; break;
case W65816::LDAfi_indY: NewOpc = W65816::LDA_StackRelIndY; break;
case W65816::STAfi_indY: NewOpc = W65816::STA_StackRelIndY; break;
case W65816::STA8fi: {
// i8 truncating store via stack-rel. Wrap the store in
// SEP #$20 / STA d,S / REP #$20 so only one byte is written. We
// assume entry M=0 (16-bit accumulator) per the function prologue;
// restoring REP #$20 after the STA preserves that invariant.
int FI = MI.getOperand(FIOperandNum).getIndex();
int FrameOffset = MFI.getObjectOffset(FI);
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Offset += 1; // empty-descending SP skew (see STAfi)
if (Offset < 0 || Offset > 0xFF)
report_fatal_error("W65816: frame offset out of stack-relative range");
BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::SEP))
.addImm(0x20)
.addReg(W65816::P, RegState::ImplicitDefine);
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
TII.get(W65816::STA_StackRel))
.addImm(Offset)
.addReg(W65816::A, RegState::Implicit);
BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::REP))
.addImm(0x20)
.addReg(W65816::P, RegState::ImplicitDefine);
MI.eraseFromParent();
return true;
}
case W65816::ADDframe: {
// LEA-equivalent: emit "TSC; CLC; ADC #disp" so A holds SP + disp,
// i.e. the address of the stack slot. TSC has no carry side-effect
@ -97,7 +162,8 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FI = MI.getOperand(FIOperandNum).getIndex();
int FrameOffset = MFI.getObjectOffset(FI);
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize();
int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Disp += 1; // empty-descending SP skew (see STAfi)
if (Disp < 0 || Disp > 0xFFFF)
report_fatal_error("W65816: frame offset out of i16 LEA range");
// TSC: A = SP (implicit def of A, use of SP).
@ -128,17 +194,30 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// WDC stack-relative addressing: `LDA disp,S` computes effective
// address S + disp. Both fixed objects (args) and local objects
// are stored at addresses relative to entry-SP; my prologue has
// shifted S down by StackSize. So:
// shifted S down by StackSize. Plus, between ADJCALLSTACKDOWN and
// ADJCALLSTACKUP, PUSH16/PHA shifts SP further by SPAdj. So:
// address = entry_S + FrameOffset
// S = entry_S - StackSize
// S = entry_S - StackSize - SPAdj
// disp = address - S
// = FrameOffset + StackSize
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize();
// = FrameOffset + StackSize + SPAdj
// PLUS a +1 skew for locals: the 65816 SP is empty-descending (points
// to next-FREE byte), but LLVM PEI assigns FrameOffset assuming SP is
// full-descending (points to first-USED byte). Without +1, slot 0
// ends up at S+0 — clobbered by the next JSL retaddr push. Args have
// positive FrameOffset and don't need the skew.
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Offset += 1;
if (Offset < 0 || Offset > 0xFF) {
report_fatal_error("W65816: frame offset out of stack-relative range");
}
// (Prologue-PHA fold reverted — it was correct in isolation but
// surfaced a separate compile-time hazard via the DAG combiner on
// shift-by-1 i8. Saved 1 op per affected function but at the cost
// of huge compile slowdowns. Re-enable once the DAG combiner
// interaction is understood.)
// Emit the carry-prep instruction first if the operation needs it.
if (NeedsCarryPrefix) {
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),

View file

@ -36,6 +36,20 @@ public:
RegScavenger *RS = nullptr) const override;
Register getFrameRegister(const MachineFunction &MF) const override;
// Use the FORWARD frame-index elimination pass. The default
// backward pass treats the entire call sequence as if SP were
// already shifted by the full ADJCALLSTACKDOWN amount, which is
// wrong for our scheme: ADJCALLSTACKDOWN is a no-op and PUSH16
// shifts SP incrementally. The forward pass tracks SPAdj per-MI
// (driven by W65816InstrInfo::getSPAdjust), so a STAfi BEFORE any
// PUSH16 in the sequence sees SPAdj=0 and writes to the actual
// local slot, while a LDAfi AFTER a PUSH16 sees SPAdj=2 and
// accounts for the shift. Without this override, eval(a*b+c)
// and similar functions silently corrupt the caller's return
// address by writing to a "local" that's actually beyond the
// reserved frame.
bool eliminateFrameIndicesBackwards() const override { return false; }
};
} // namespace llvm

View file

@ -10,10 +10,10 @@
// Declarations that describe the W65816 register file
//===----------------------------------------------------------------------===//
class W65816Reg<bits<4> num, string n> : Register<n> {
field bits<4> Num = num;
class W65816Reg<bits<8> num, string n> : Register<n> {
field bits<8> Num = num;
let Namespace = "W65816";
let HWEncoding{3-0} = num;
let HWEncoding{7-0} = num;
let DwarfNumbers = [num];
}
@ -38,6 +38,23 @@ def PBR : W65816Reg<6, "pbr">, DwarfRegNum<[6]>;
def PC : W65816Reg<7, "pc">, DwarfRegNum<[7]>;
def P : W65816Reg<8, "p">, DwarfRegNum<[8]>;
// Imaginary 16-bit registers backed by direct-page slots $D0..$DE.
// The regalloc treats them as physical registers with cheap LDA/STA dp
// inter-register moves. This relieves pressure on the single Acc16
// register (A) so greedy regalloc can succeed on functions with
// multiple simultaneously-live i16 vregs. Caller-save: callees may
// freely overwrite them, so regalloc spills around any call that
// might touch them. Their HWEncoding is never emitted (asmprinter
// translates IMGn references into LDA/STA dp with the right address).
def IMG0 : W65816Reg<16, "img0">, DwarfRegNum<[16]>;
def IMG1 : W65816Reg<17, "img1">, DwarfRegNum<[17]>;
def IMG2 : W65816Reg<18, "img2">, DwarfRegNum<[18]>;
def IMG3 : W65816Reg<19, "img3">, DwarfRegNum<[19]>;
def IMG4 : W65816Reg<20, "img4">, DwarfRegNum<[20]>;
def IMG5 : W65816Reg<21, "img5">, DwarfRegNum<[21]>;
def IMG6 : W65816Reg<22, "img6">, DwarfRegNum<[22]>;
def IMG7 : W65816Reg<23, "img7">, DwarfRegNum<[23]>;
//===----------------------------------------------------------------------===//
// Register Classes
//===----------------------------------------------------------------------===//
@ -52,6 +69,25 @@ def Acc16 : RegisterClass<"W65816", [i16], 16, (add A)>;
def Idx8 : RegisterClass<"W65816", [i8], 8, (add X, Y)>;
def Idx16 : RegisterClass<"W65816", [i16], 16, (add X, Y)>;
// Imaginary i16 registers backed by DP slots $D0..$DE. Vregs in this
// class lower to LDA/STA dp on cross-class moves to A (4 cyc each
// way). Used by ABridgeViaX (and future regalloc-pressure passes) as
// an alternative parking spot to stack spills. Caller-save: a callee
// may freely overwrite $D0..$DF, so the allocator must spill IMGn
// vregs around any call.
def Img16 : RegisterClass<"W65816", [i16], 16,
(add IMG0, IMG1, IMG2, IMG3,
IMG4, IMG5, IMG6, IMG7)>;
// Acc-or-IMG combined class. Vregs that are not constrained to A
// (i.e., not the source of an arithmetic op) get widened to this
// class pre-RA so greedy regalloc can pick A or any IMGn. Listing
// A first so the allocator's default order prefers A; cross-class
// moves to/from A are LDA/STA dp via copyPhysReg.
def Wide16 : RegisterClass<"W65816", [i16], 16,
(add A, IMG0, IMG1, IMG2, IMG3,
IMG4, IMG5, IMG6, IMG7)>;
def PtrRegs : RegisterClass<"W65816", [i16], 16, (add SP)>;
// Single-register class for the processor status register, used for condition

View file

@ -0,0 +1,301 @@
//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice
// versa) pairs that toggle the M-bit redundantly.
//
// The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits
// `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When
// two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between
// them), the post-PEI stream contains:
//
// SEP #$20
// STA d1, S
// REP #$20 <-- toggle
// SEP #$20 <-- toggle (cancels above)
// STA d2, S
// REP #$20
//
// The middle REP/SEP pair is a no-op: both stores can run in one M=1
// region. We drop them to leave:
//
// SEP #$20
// STA d1, S
// STA d2, S
// REP #$20
//
// Saves 2 bytes / 6 cycles per coalesced pair. Symmetric `SEP/REP`
// pairs (M=1 then M=0 with nothing in between) are also dropped — they
// can arise around inline-asm or hand-written assembly snippets.
//
// Runs at addPreEmitPass (after PEI has expanded STA8fi).
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-sep-rep-cleanup"
namespace {
/// Pre-emit peephole pass object (see file header).  Stateless; all work
/// happens per-function in runOnMachineFunction.
class W65816SepRepCleanup : public MachineFunctionPass {
public:
  static char ID; // pass identification
  W65816SepRepCleanup() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 SEP/REP toggle coalescing";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816SepRepCleanup::ID = 0;
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE,
                "W65816 SEP/REP toggle coalescing", false, false)
// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816SepRepCleanup() {
  return new W65816SepRepCleanup();
}
// If MI is exactly `Opc #imm` — a SEP or REP whose first operand is an
// immediate — return that immediate; otherwise return -1.
static int getSepRepImm(const MachineInstr &MI, unsigned Opc) {
  if (MI.getOpcode() != Opc || MI.getNumOperands() < 1)
    return -1;
  const auto &MO = MI.getOperand(0);
  return MO.isImm() ? (int)MO.getImm() : -1;
}
// True if MI consumes the carry (C) or overflow (V) flag — the flags
// ADC/SBC define but INA/DEA don't, so any reader here blocks the
// INA/DEA fold.  Conservative list: conditional branches that test C/V,
// the add/subtract forms that take C as carry-in, and the rotates that
// shift C into the result.  Everything else (CMP, CLC, SEC, LDA, STA,
// AND, ORA, EOR, ...) either redefines C/V or ignores them.
static bool readsCarryOrV(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  // Branches that test C or V.
  case W65816::BCS:
  case W65816::BCC:
  case W65816::BVS:
  case W65816::BVC:
  // Add/subtract with carry-in.
  case W65816::ADC_StackRel:
  case W65816::ADC_Imm16:
  case W65816::ADC_Imm8:
  case W65816::ADC_DP:
  case W65816::ADC_Abs:
  case W65816::SBC_StackRel:
  case W65816::SBC_Imm16:
  case W65816::SBC_Imm8:
  case W65816::SBC_DP:
  case W65816::SBC_Abs:
  // Rotates fold the carry bit in.
  case W65816::ROL_A:
  case W65816::ROR_A:
  case W65816::ROL_DP:
  case W65816::ROL_Abs:
  case W65816::ROR_DP:
  case W65816::ROR_Abs:
    return true;
  default:
    return false;
  }
}
// True if opcode Op unconditionally redefines the C/V flags (CLC, SEC,
// the CMP/CPX/CPY family, REP, SEP) — once one of these runs, nothing
// downstream can observe the previous C/V values.  The pseudo CMP*
// variants (CMPi16imm etc.) are included because this peephole runs at
// pre-emit, BEFORE the AsmPrinter expands them.
static bool isFlagRedefiner(unsigned Op) {
  switch (Op) {
  case W65816::CLC:
  case W65816::SEC:
  // A-register compares, real and pseudo.
  case W65816::CMP_Imm8: case W65816::CMP_Imm16:
  case W65816::CMP_StackRel: case W65816::CMP_DP: case W65816::CMP_Abs:
  case W65816::CMPi16imm: case W65816::CMPi8imm:
  case W65816::CMPfi: case W65816::CMPabs:
  case W65816::CMP_RR:
  // Index-register compares.
  case W65816::CPX_Imm8: case W65816::CPX_Imm16:
  case W65816::CPX_DP: case W65816::CPX_Abs:
  case W65816::CPY_Imm8: case W65816::CPY_Imm16:
  case W65816::CPY_DP: case W65816::CPY_Abs:
  // Status-register writes.
  case W65816::REP: case W65816::SEP:
    return true;
  default:
    return false;
  }
}
// Returns true if a subsequent MI in the same MBB observes the C/V
// flags before any flag-redefiner clears the dependency.  At MBB end,
// extends one step into each successor: if any successor's first
// (non-debug) MI reads C/V before redefining them, the flag is live
// across the edge — bail.  This is critical for loop bodies where
// the back-edge re-enters the same MBB at LDA/PHA (neither reads C/V),
// so a per-iteration `clc; adc #2` is foldable.  Cross-MBB carry chains
// would normally use ADCEi16imm (not ADCi16imm), so this is safe.
//
// \param After  iterator at the instruction whose carry-out is in
//               question; the scan starts at the instruction AFTER it.
// \param MBB    the block containing After.
static bool carryFlagLiveAfter(MachineBasicBlock::iterator After,
                               MachineBasicBlock &MBB) {
  // Phase 1: scan within this MBB.
  for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) {
    if (Probe->isDebugInstr()) continue;
    if (readsCarryOrV(*Probe)) return true;                 // observed: live
    if (isFlagRedefiner(Probe->getOpcode())) return false;  // killed first
    if (Probe->isCall()) return false; // callee resets flags
  }
  // Phase 2: peek into each successor's first few MIs.  We BAIL only on
  // a positive C/V read; reaching MBB end or peek-cap without finding
  // one is treated as "carry dead" — ADCi16imm's carry-out is never
  // used in carry chains (those use ADCEi16imm), so a stray carry
  // floating into RTL or an unrelated arithmetic op causes no harm.
  const unsigned MaxPeek = 6;
  for (MachineBasicBlock *Succ : MBB.successors()) {
    unsigned Peeked = 0;
    for (auto &MI : *Succ) {
      if (MI.isDebugInstr()) continue;
      if (readsCarryOrV(MI)) return true;
      if (isFlagRedefiner(MI.getOpcode()) || MI.isCall()) break;
      if (++Peeked >= MaxPeek) break;
    }
  }
  return false;
}
// Convert `ADCi16imm dst, src, ±1`/`±2` and `SBCi16imm` similarly to
// INA / INA;INA / DEA / DEA;DEA chains when C/V are dead.  ADCi16imm
// is a pseudo that expands to CLC+ADC_Imm16 (4B/5cyc).  INA is 1B/2cyc.
// Savings per ±1: 3B/3cyc; per ±2: 2B/1cyc.  SBCi16imm is symmetric
// (sub by N == add by -N), so SBC #1 → DEA, SBC #-1 → INA, etc.
// Returns true if any instruction was rewritten.
static bool foldImmAdcToInaDea(MachineBasicBlock &MBB,
                               const W65816InstrInfo &TII) {
  bool Changed = false;
  auto It = MBB.begin();
  while (It != MBB.end()) {
    unsigned Op = It->getOpcode();
    bool isAdc = (Op == W65816::ADCi16imm);
    bool isSbc = (Op == W65816::SBCi16imm);
    if ((!isAdc && !isSbc) || It->getNumOperands() < 3 ||
        !It->getOperand(2).isImm()) { ++It; continue; }
    // Sign-extend the 16-bit immediate so e.g. 0xFFFF reads as -1.
    int64_t Imm = (int16_t)It->getOperand(2).getImm();
    // For SBC, negate: SBC by +N is "subtract N", same as ADC by -N.
    int64_t Effective = isSbc ? -Imm : Imm;
    if (Effective < -2 || Effective > 2 || Effective == 0) { ++It; continue; }
    // Only fold when nothing downstream reads the carry/overflow the
    // CLC+ADC would have produced (INA/DEA don't set C/V).
    if (carryFlagLiveAfter(It, MBB)) { ++It; continue; }
    DebugLoc DL = It->getDebugLoc();
    unsigned NewOpc = (Effective > 0) ? W65816::INA : W65816::DEA;
    unsigned Count = (Effective > 0) ? Effective : -Effective;
    for (unsigned i = 0; i < Count; ++i)
      BuildMI(MBB, It, DL, TII.get(NewOpc));
    // Capture the successor before erasing; It would dangle otherwise.
    auto NextIt = std::next(It);
    It->eraseFromParent();
    It = NextIt;
    Changed = true;
  }
  return Changed;
}
/// Run three independent peepholes per basic block:
///  1. drop adjacent REP/SEP (or SEP/REP) pairs with matching immediates;
///  2. fold ADCi16imm/SBCi16imm by ±1/±2 into INA/DEA chains;
///  3. drop `LDY #K` when Y provably already holds K.
/// Returns true if anything changed.
///
/// Fix over the previous version: the Y-invalidation switch in peephole 3
/// was missing LDY_Imm8 (which also writes Y), so `yKnown` could go stale
/// across an 8-bit index-mode load; inline asm is now treated as a Y
/// clobber as well.
bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;
  const auto &STI = MF.getSubtarget<W65816Subtarget>();
  const auto &TII = *STI.getInstrInfo();
  for (MachineBasicBlock &MBB : MF) {
    // First peephole: collect all SEP/REP toggles, then cancel adjacent
    // opposite pairs with the same immediate.
    SmallVector<MachineInstr *, 8> Toggles;
    for (MachineInstr &MI : MBB) {
      unsigned Opc = MI.getOpcode();
      if (Opc == W65816::REP || Opc == W65816::SEP)
        Toggles.push_back(&MI);
    }
    // Track what we've erased: Toggles holds raw pointers, so a later
    // iteration must not touch an already-freed MI.
    SmallPtrSet<MachineInstr *, 8> Erased;
    for (MachineInstr *First : Toggles) {
      if (Erased.count(First)) continue;
      // The next non-debug instruction must be the matching opposite
      // toggle with the same imm.
      auto It = std::next(First->getIterator());
      while (It != MBB.end() && It->isDebugInstr()) ++It;
      if (It == MBB.end()) continue;
      MachineInstr &Next = *It;
      // Look for REP-then-SEP or SEP-then-REP with matching imm.
      unsigned FirstOpc = First->getOpcode();
      unsigned WantOpc = (FirstOpc == W65816::REP) ? W65816::SEP : W65816::REP;
      int FirstImm = getSepRepImm(*First, FirstOpc);
      int NextImm = getSepRepImm(Next, WantOpc);
      if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue;
      Erased.insert(First);
      Erased.insert(&Next);
      First->eraseFromParent();
      Next.eraseFromParent();
      Changed = true;
    }
    // Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm)
    // into INA/DEA chains when the carry flag they would set is unused.
    // ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it
    // here BEFORE the AsmPrinter expansion runs.  But this pass runs at
    // pre-emit, AFTER post-RA pseudo expansion.  ADCi16imm survives
    // because its MCInst lowering is in W65816AsmPrinter (not in the
    // generic post-RA pseudo expander), so it's still in the MIR here.
    Changed |= foldImmAdcToInaDea(MBB, TII);
    // Third peephole: drop `LDY_Imm16 K` when Y already holds K from
    // an earlier LDY in the same MBB and no intervening MI clobbered
    // Y.  Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY,
    // even though Y already holds 0 from a previous emit — the
    // redundant LDYs survive MachineLICM because Y is a phys reg and
    // the inserter binds them tightly to each use.
    int yKnown = -1; // -1 means unknown; otherwise the immediate
    auto It2 = MBB.begin();
    while (It2 != MBB.end()) {
      MachineInstr &MI = *It2;
      if (MI.isDebugInstr()) { ++It2; continue; }
      unsigned Op = MI.getOpcode();
      if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
          MI.getOperand(0).isImm()) {
        int K = MI.getOperand(0).getImm() & 0xFFFF;
        if (yKnown == K) {
          // Redundant reload of the same constant — delete it.
          auto Erase = It2++;
          Erase->eraseFromParent();
          Changed = true;
          continue;
        }
        yKnown = K;
      } else {
        // Conservatively invalidate yKnown on anything that touches Y
        // or on calls / inline asm / any instruction that doesn't have
        // a clean "no Y effect" guarantee.  Cheaper to underclaim than
        // miscompile.
        switch (Op) {
        case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown
        case W65816::STAfi_indY:
        case W65816::LDA_StackRelIndY:
        case W65816::STA_StackRelIndY:
          break;
        case W65816::TAY: case W65816::TXY:
        case W65816::INY: case W65816::DEY:
        case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
        case W65816::LDY_DPX: case W65816::LDY_AbsX:
        // FIX: LDY_Imm8 also writes Y (low byte, 8-bit index mode) and
        // was previously missing here, leaving yKnown stale across it.
        case W65816::LDY_Imm8:
          yKnown = -1; break;
        default:
          if (MI.isCall() || MI.isInlineAsm()) yKnown = -1;
          break;
        }
      }
      ++It2;
    }
  }
  return Changed;
}

View file

@ -0,0 +1,365 @@
//===-- W65816SpillToX.cpp - Replace stack spills with TAX/TXA -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Post-RA peephole: replace stack-spill/reload pairs with TAX/TXA (or
// TAY/TYA) when the index register is dead during the spill window.
//
// Fast regalloc spills A to stack via STAfi/LDAfi, costing ~12 cycles
// per round-trip (sta is 5 cycles + lda is 5 cycles + the displacement
// dispatch). But the W65816 has TAX (2 cycles) + TXA (2 cycles), a
// 3x speedup if X is free during the spill window.
//
// We scan each basic block for the pattern:
//
// STAfi $a, slot, 0
// ... (instructions that don't touch X or A's slot, don't kill A)
// LDAfi $a, slot, 0
//
// If no instruction in the gap reads or writes X (or P-flags-dependent
// X side effects, etc.), we rewrite the pair as:
//
// TAX
// ...
// TXA
//
// This saves 4 bytes (stack-rel addressing is 2 bytes per op vs TAX/TXA
// at 1 byte each) AND saves the memory traffic. Net: ~8 cycles per
// converted pair.
//
// Conservative liveness: we treat X as "in use" if ANY instruction in
// the gap references W65816::X (def or use). False positives mean
// we keep the slow stack form; false negatives are correctness bugs.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-spill-to-x"
namespace {
/// Post-RA peephole pass object (see file header for the spill-to-X
/// rewrite).  Preserves the CFG: only straight-line instruction rewrites
/// inside existing blocks.
class W65816SpillToX : public MachineFunctionPass {
public:
  static char ID; // pass identification
  W65816SpillToX() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 spill-to-X peephole";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // No blocks are added or removed, so CFG analyses stay valid.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816SpillToX::ID = 0;
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816SpillToX, DEBUG_TYPE, "W65816 spill-to-X peephole",
                false, false)
// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816SpillToX() {
  return new W65816SpillToX();
}
// Classifies how an MI interacts with X.
enum XEffect { XNone = 0, XReads = 1, XDefs = 2, XBoth = 3 };
// Classify MI's effect on the X register.  Many W65816 transfer/index
// opcodes (TAX, INX, TSX, ...) are tablegen'd as `InstImplied` with no
// Defs/Uses metadata, so their MCInstrDesc carries no implicit X operand
// and a generic operand scan would miss them — those opcodes are
// hard-coded below.  Calls count as XBoth: X is caller-saved in our ABI.
static XEffect xEffect(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
  switch (MI.getOpcode()) {
  case W65816::TAX: // X := A
  case W65816::TYX: // X := Y
  case W65816::TSX: // X := SP
  case W65816::PLX: // X := pop
    return XDefs;   // written, not read
  case W65816::TXA: // A := X
  case W65816::TXY: // Y := X
  case W65816::TXS: // SP := X
  case W65816::PHX: // push X
    return XReads;  // read, not written
  case W65816::INX: // X := X+1
  case W65816::DEX: // X := X-1
    return XBoth;   // read-modify-write
  default:
    break;
  }
  if (MI.isCall())
    return XBoth; // caller-clobbered X
  // Fall back to scanning register operands for opcodes that carry X
  // explicitly (LDX/STX/CPX pseudos) or properly-modelled implicits.
  unsigned Mask = XNone;
  for (const auto &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register R = MO.getReg();
    if (!R.isPhysical())
      continue;
    if (R != W65816::X && !(TRI && TRI->regsOverlap(R, W65816::X)))
      continue;
    Mask |= MO.isDef() ? XDefs : XReads;
  }
  return (XEffect)Mask;
}
// Convenience wrapper: true when MI has any X effect at all (read,
// write, or both).
static bool touchesX(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
  return !(xEffect(MI, TRI) == XNone);
}
// Returns the frame index FI if MI is `STAfi $a, FI, 0` (a spill of A to
// stack slot FI with zero displacement), else -1.
static int matchSTAfi(const MachineInstr &MI) {
  if (MI.getOpcode() != W65816::STAfi) return -1;
  if (MI.getNumOperands() < 3) return -1;
  // Operand 0 must be the accumulator being stored.
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != W65816::A)
    return -1;
  // Operand 1 is the frame-index slot; operand 2 the displacement,
  // which must be zero for a plain spill.
  if (!MI.getOperand(1).isFI()) return -1;
  if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) return -1;
  return MI.getOperand(1).getIndex();
}
// Returns FI if MI is `LDAfi slot, 0` defining $a, else -1.
static int matchLDAfi(const MachineInstr &MI) {
if (MI.getOpcode() != W65816::LDAfi) return -1;
if (MI.getNumOperands() < 3) return -1;
if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != W65816::A)
return -1;
if (!MI.getOperand(1).isFI()) return -1;
if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) return -1;
return MI.getOperand(1).getIndex();
}
// Returns true if MI reads or writes the slot at FrameIndex FI.
static bool referencesSlot(const MachineInstr &MI, int FI) {
for (const auto &MO : MI.operands()) {
if (MO.isFI() && MO.getIndex() == FI) return true;
}
return false;
}
// Entry point. Per basic block, two phases:
//   1. Convert STAfi/LDAfi round-trips through a frame slot into a
//      TAX ... TXA register bridge when X is provably free across the
//      gap, past the reload, and (if live-in) already redefined.
//   2. Collapse adjacent TAX;TXA / TXA;TAX pairs whose clobbered
//      register is dead afterwards.
// Then, function-wide:
//   3. Reclaim frame slots whose last reference was erased above.
bool W65816SpillToX::runOnMachineFunction(MachineFunction &MF) {
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  bool Changed = false;
  // Slots whose last reference we erased — candidates for reclamation.
  SmallSet<int, 8> SlotsTouched;
  for (auto &MBB : MF) {
    // Pass 1: collect (STAfi, slot) entries.
    SmallVector<std::pair<MachineInstr *, int>, 8> Stas;
    for (auto &MI : MBB) {
      int FI = matchSTAfi(MI);
      if (FI != -1) Stas.push_back({&MI, FI});
    }
    // For each STAfi, scan forward for the matching LDAfi with no
    // intervening X touch or slot reference. Process in REVERSE
    // order so any nested pair is converted first; the outer pair's
    // gap scan then sees the inner TAX/TXA (which touches X) and
    // bails — preventing a mid-bridge X clobber.
    for (auto It = Stas.rbegin(); It != Stas.rend(); ++It) {
      auto [StaMI, FI] = *It;
      bool xTouched = false;
      bool gapEmpty = true;
      MachineInstr *LdaMI = nullptr;
      for (auto Scan = std::next(MachineBasicBlock::iterator(StaMI));
           Scan != MBB.end(); ++Scan) {
        MachineInstr &MI2 = *Scan;
        if (MI2.isDebugInstr()) continue;
        // Look for the matching LDAfi. TAX preserves A so we don't
        // need to check A liveness — only whether X was free.
        if (matchLDAfi(MI2) == FI) { LdaMI = &MI2; break; }
        // Bail if X is touched (use or def, including implicit on
        // calls) or if the slot is referenced by something else
        // (which would invalidate the saved value).
        if (touchesX(MI2, TRI)) { xTouched = true; break; }
        if (referencesSlot(MI2, FI)) break;
        gapEmpty = false;
      }
      // Defer empty-gap pairs to StackSlotCleanup, which deletes both
      // (A still holds the stored value across an empty gap). That
      // beats our TAX+TXA conversion (0 instr vs 2 instr).
      if (!LdaMI || xTouched || gapEmpty) continue;
      // X-live-after-LDA check: TXA (the LDAfi replacement) clobbers X.
      // If anything downstream of the LDA reads X — including the next
      // JSL's implicit $x — then we'd silently corrupt X. Caught by
      // i32 first-arg functions where $x is live-in (= arg0_hi) and
      // a libcall later in the block expects $x intact. Scan from just
      // past LDA to end-of-block; if any instr uses X, bail.
      bool xUsedAfter = false;
      for (auto Scan = std::next(MachineBasicBlock::iterator(LdaMI));
           Scan != MBB.end(); ++Scan) {
        const MachineInstr &MI3 = *Scan;
        if (MI3.isDebugInstr()) continue;
        XEffect eff = xEffect(MI3, TRI);
        if (eff & XReads) { xUsedAfter = true; break; }
        if (eff & XDefs) break; // X redefined; no longer live
      }
      // Also bail if X is live-in to MBB and nothing has defined X
      // between MBB start and STA — the live-in value is needed past
      // the LDA point.
      if (!xUsedAfter && MBB.isLiveIn(W65816::X)) {
        bool xRedefBeforeSta = false;
        for (auto Scan = MBB.begin();
             Scan != MachineBasicBlock::iterator(StaMI); ++Scan) {
          const MachineInstr &MI3 = *Scan;
          if (MI3.isDebugInstr()) continue;
          if (xEffect(MI3, TRI) & XDefs) { xRedefBeforeSta = true; break; }
        }
        if (!xRedefBeforeSta) xUsedAfter = true;
      }
      if (xUsedAfter) continue;
      // Cross-block use check: if the slot is referenced anywhere
      // OUTSIDE the [STA, LDA] window (including other blocks), the
      // STA we'd erase is feeding those other reads — eliding it
      // would silently corrupt them. Caught by sumTable() returning
      // a stale phi value because the loop's STA-to-merge-slot was
      // eliminated; the merge block's LDA then read the bb.0-init 0
      // instead of the loop's accumulated sum.
      bool externalUse = false;
      for (auto &OtherMBB : MF) {
        for (auto &OtherMI : OtherMBB) {
          if (&OtherMI == StaMI || &OtherMI == LdaMI) continue;
          // Walk inside-window range and skip those refs.
          if (&OtherMBB == &MBB) {
            // Intentionally empty: the gap scan above already proved
            // no instruction between STA and LDA references FI, so any
            // same-block hit below is genuinely outside the window.
            // We already verified the gap doesn't reference FI; only
            // STA/LDA themselves are allowed users in this block.
          }
          if (referencesSlot(OtherMI, FI)) {
            externalUse = true;
            break;
          }
        }
        if (externalUse) break;
      }
      if (externalUse) continue;
      // Replace STAfi with TAX, LDAfi with TXA.
      DebugLoc StaDL = StaMI->getDebugLoc();
      DebugLoc LdaDL = LdaMI->getDebugLoc();
      MachineBasicBlock *MBB2 = StaMI->getParent();
      auto StaIt = MachineBasicBlock::iterator(StaMI);
      auto LdaIt = MachineBasicBlock::iterator(LdaMI);
      BuildMI(*MBB2, StaIt, StaDL, TII->get(W65816::TAX));
      BuildMI(*MBB2, LdaIt, LdaDL, TII->get(W65816::TXA))
          .addReg(W65816::A, RegState::ImplicitDefine);
      StaMI->eraseFromParent();
      LdaMI->eraseFromParent();
      SlotsTouched.insert(FI);
      Changed = true;
    }
    // Post-pass: collapse `TAX ; TXA` (or `TXA ; TAX`) pairs whose
    // observable effect is dead. These appear when an inner STA/LDA
    // pair (originally between an outer pair we converted) was deleted
    // by StackSlotCleanup or coalesced by stack-slot-coloring, leaving
    // our TAX/TXA bookends adjacent.
    //
    // Distinct effect per ordering:
    //   TAX;TXA : net effect is `X := A` (A unchanged, X clobbered).
    //             Removable iff X dead afterwards.
    //   TXA;TAX : net effect is `A := X` (X unchanged, A clobbered).
    //             Removable iff A dead afterwards.
    //
    // The earlier code mis-handled TXA;TAX as if it clobbered X; in
    // fact X comes through the pair unchanged.
    auto It = MBB.begin();
    while (It != MBB.end()) {
      auto Next = std::next(It);
      if (Next == MBB.end()) break;
      bool isTaxThenTxa = It->getOpcode() == W65816::TAX &&
                          Next->getOpcode() == W65816::TXA;
      bool isTxaThenTax = It->getOpcode() == W65816::TXA &&
                          Next->getOpcode() == W65816::TAX;
      if (!isTaxThenTxa && !isTxaThenTax) { ++It; continue; }
      // Choose which physreg's liveness matters based on which value
      // the pair clobbers.
      Register Clobbered = isTaxThenTxa ? W65816::X : W65816::A;
      bool observed = false;
      bool killedByDef = false;
      for (auto Tail = std::next(Next); Tail != MBB.end(); ++Tail) {
        if (Tail->isDebugInstr()) continue;
        if (Tail->readsRegister(Clobbered, TRI)) { observed = true; break; }
        // Calls clobber both A and X (caller-saved).
        if (Tail->isCall()) { killedByDef = true; break; }
        // Opcode-based defs (TAX/TXA tablegen has no Defs metadata).
        if (Clobbered == W65816::X) {
          XEffect E = xEffect(*Tail, TRI);
          if (E & XReads) { observed = true; break; }
          if (E & XDefs) { killedByDef = true; break; }
        } else {
          // For A: any LDA*/PLA/TXA/TYA/INA/DEA/arith op redefines A.
          unsigned Op = Tail->getOpcode();
          if (Op == W65816::TXA || Op == W65816::TYA ||
              Op == W65816::INA || Op == W65816::DEA ||
              Op == W65816::PLA) { killedByDef = true; break; }
          if (Tail->modifiesRegister(W65816::A, TRI)) {
            killedByDef = true; break;
          }
        }
      }
      if (observed) { ++It; continue; }
      // Neither read nor redefined in-block: only safe to delete if
      // the clobbered reg is also not live-out into any successor.
      if (!killedByDef) {
        bool liveOut = false;
        for (MachineBasicBlock *Succ : MBB.successors()) {
          if (Succ->isLiveIn(Clobbered)) { liveOut = true; break; }
        }
        if (liveOut) { ++It; continue; }
      }
      // Advance It past both instructions BEFORE erasing — erase
      // invalidates the iterator pointing at the erased instruction.
      auto Erase1 = It++;
      auto Erase2 = It++;
      Erase1->eraseFromParent();
      Erase2->eraseFromParent();
      Changed = true;
    }
  }
  // Reclaim frame slots whose last reference we just erased. Without
  // this, PEI still allocates space for them and emits the prologue
  // PHA, even though the slot is unused — wastes 1 PHA (4 cyc) and
  // 1 PLY per call. RemoveStackObject marks the slot dead by setting
  // its size to ~0ULL; PEI ignores those when computing frame size.
  if (!SlotsTouched.empty()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    for (int FI : SlotsTouched) {
      bool stillUsed = false;
      for (auto &MBB : MF) {
        for (auto &MI : MBB) {
          if (referencesSlot(MI, FI)) { stillUsed = true; break; }
        }
        if (stillUsed) break;
      }
      if (!stillUsed) MFI.RemoveStackObject(FI);
    }
  }
  return Changed;
}

File diff suppressed because it is too large Load diff

View file

@ -40,6 +40,10 @@ LLVMInitializeW65816Target() {
initializeW65816AsmPrinterPass(PR);
initializeW65816DAGToDAGISelLegacyPass(PR);
initializeW65816StackSlotCleanupPass(PR);
initializeW65816ABridgeViaXPass(PR);
initializeW65816WidenAcc16Pass(PR);
initializeW65816SpillToXPass(PR);
initializeW65816NegYIndYPass(PR);
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
@ -75,7 +79,20 @@ public:
}
bool addInstSelector() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
// Register allocator selection. W65816's only 16-bit ALU register is A.
//
// NOTE(review): the comment that used to sit here claimed fast regalloc
// was the default ("always succeeds, ~30-50% bigger code than greedy")
// because greedy failed outright on functions with 4+ simultaneously
// live i16 vregs (heap sift etc.) — but the code below returns the
// GREEDY allocator. Presumably the pre-RA WidenAcc16 pass (which
// promotes Acc16 vregs to Wide16 = A + IMG0..IMG7) now gives greedy
// enough registers to succeed; confirm this mismatch was an intentional
// switch and not a stale edit. TiedDefSpill (pre-RA) handles the
// tied-def-multi-use hazard for the sub-pattern that's frequent enough
// to matter.
//
FunctionPass *createTargetRegisterAllocator(bool /*Optimized*/) override {
  return createGreedyRegisterAllocator();
}
};
} // namespace
@ -84,8 +101,40 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
return new W65816PassConfig(*this, PM);
}
// Pre-regalloc machine passes, in pipeline order:
//   ABridgeViaX   — bridges values through X (defined elsewhere;
//                   see its own file for the exact transform).
//   TiedDefSpill  — inserts explicit save/restore around tied-def
//                   Acc16 consumers whose source is also used later.
//   WidenAcc16    — promotes Acc16-only vregs to Wide16 so regalloc
//                   can spread i16 pressure across A and IMG0..IMG7.
void W65816PassConfig::addPreRegAlloc() {
  addPass(createW65816ABridgeViaX());
  addPass(createW65816TiedDefSpill());
  addPass(createW65816WidenAcc16());
}
// Post-regalloc peephole cleanup.
void W65816PassConfig::addPostRegAlloc() {
  // SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup
  // then deletes still-adjacent redundant spills. A second SpillToX
  // invocation collapses any TAX/TXA pair left adjacent by cleanup
  // (e.g. when an inner copy between bridge endpoints went away).
  addPass(createW65816SpillToX());
  addPass(createW65816StackSlotCleanup());
  addPass(createW65816SpillToX());
}
// Late (pre-emit) machine passes. Ordering here is load-bearing; see
// the per-pass notes below.
void W65816PassConfig::addPreEmitPass() {
  // SpillToX one more time: now that postrapseudos has expanded
  // physreg-COPY pseudos into the real TAX/TXA opcodes, adjacent
  // TXA;TAX pairs (which the earlier SpillToX invocations couldn't
  // see in COPY form) become collapsable.
  addPass(createW65816SpillToX());
  // Rewrite negative-Y indirect-Y stack-rel ops. Must run BEFORE
  // BranchExpand because the rewrite expands one instruction into
  // several and shifts branch distances.
  addPass(createW65816NegYIndY());
  // Branch expansion runs after that so the BRA introduced for long
  // conditional branches gets seen by SepRepCleanup (which can
  // coalesce SEP/REP brackets across the new bridge MBBs).
  // Distance estimation now uses TII::getInstSizeInBytes so it's
  // byte-accurate; the 110-byte threshold leaves margin without
  // expanding short branches that would otherwise survive as Bxx.
  addPass(createW65816BranchExpand());
  addPass(createW65816SepRepCleanup());
}
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(

View file

@ -0,0 +1,244 @@
//===-- W65816TiedDefSpill.cpp - Pre-RA spill insertion for tied-def ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Pre-regalloc pass: when a tied-def Acc16 instruction (ADCfi, SBCfi,
// ANDfi, ORAfi, EORfi, ADCi16imm, SBCi16imm, ANDi16imm, ORAi16imm,
// EORi16imm, ADCabs, SBCabs, ANDabs, ORAabs, EORabs, INA_PSEUDO,
// DEA_PSEUDO, ASLA16, LSRA16, NEGA16, SHL8A, SRL8A, SRA15A, etc.) has
// a source vreg whose value is *also* needed past the consumer, fast
// regalloc fails to insert the necessary save/restore on its own.
// (Acc16 has exactly one physical register, so the consumer's
// tied-def overwrites the source; with multiple consumers/post-uses
// the source must be spilled and reloaded.)
//
// We insert that explicitly here:
//
// %dst = TIED_OP %src, ... (where %src is also used after)
// becomes
// STAfi %src, freshSlot, 0
// %dst = TIED_OP %src, ... (now safely consumes %src)
// %src_reload = LDAfi freshSlot, 0
// ... post-consumer uses replaced with %src_reload
//
// Runs pre-RA so the new vregs participate in regalloc's liveness
// analysis and get assigned A.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-tied-def-spill"
namespace {
// Legacy-PM machine-function pass wrapper; the transform itself lives
// in runOnMachineFunction (defined out-of-line below).
class W65816TiedDefSpill : public MachineFunctionPass {
public:
  static char ID; // Pass identification; address used as a unique key.
  W65816TiedDefSpill() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 tied-def spill insertion";
  }
  // Requires the machine dominator tree (used to decide which
  // cross-block uses to redirect to the reload). The pass only inserts
  // and rewrites instructions within existing blocks, so both the CFG
  // and the dominator tree are preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace

char W65816TiedDefSpill::ID = 0;

INITIALIZE_PASS(W65816TiedDefSpill, DEBUG_TYPE,
                "W65816 tied-def spill insertion", false, false)

// Factory used when building the target pass pipeline (see
// W65816PassConfig::addPreRegAlloc).
FunctionPass *llvm::createW65816TiedDefSpill() {
  return new W65816TiedDefSpill();
}
// Allowlist of tied-def consumer pseudos that are known to fail
// fast regalloc when their source has multiple uses. Restricting
// to this set avoids regressing other patterns whose existing
// regalloc behaviour is correct.
//
// All entries below have shape `(outs Acc16:$dst), (ins Acc16:$src,
// memfi:$addr)` or similar tied-source-Acc16 + side-load form,
// matching the failure pattern observed in `bump` / `eval`.
static bool isTiedAcc16Consumer(unsigned Opc) {
  static const unsigned TiedConsumers[] = {
      W65816::ADCfi,     W65816::SBCfi,     W65816::ANDfi,
      W65816::ORAfi,     W65816::EORfi,     W65816::ADCabs,
      W65816::SBCabs,    W65816::ADCi16imm, W65816::SBCi16imm,
      W65816::ANDi16imm, W65816::ORAi16imm, W65816::EORi16imm,
  };
  for (unsigned Candidate : TiedConsumers)
    if (Opc == Candidate)
      return true;
  return false;
}
// Returns true if MI is an allowlisted tied-def consumer AND at least
// one of its register use operands is tied to a def operand.
static bool hasTiedSrcDef(const MachineInstr &MI) {
  if (!isTiedAcc16Consumer(MI.getOpcode()))
    return false;
  const unsigned NumOps = MI.getNumOperands();
  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
    const MachineOperand &MO = MI.getOperand(Idx);
    if (MO.isReg() && MO.isUse() && MI.isRegTiedToDefOperand(Idx))
      return true;
  }
  return false;
}
// Entry point. Two phases: (1) collect candidate tied-def consumers
// whose source vreg also flows into a physreg COPY, then (2) for each
// candidate insert a STAfi save before the consumer and an LDAfi reload
// after it, redirecting all post-consumer uses of the source to the
// reloaded vreg.
bool W65816TiedDefSpill::runOnMachineFunction(MachineFunction &MF) {
  // Only pre-RA: skip if vregs are already gone.
  if (!MF.getRegInfo().getNumVirtRegs())
    return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  bool Changed = false;
  // Snapshot all candidate (MBB, MI, src-operand-index) tuples first;
  // we mutate the MBB during processing.
  struct Candidate { MachineBasicBlock *MBB; MachineInstr *MI; unsigned OpIdx; };
  SmallVector<Candidate, 8> Candidates;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (!hasTiedSrcDef(MI)) continue;
      // For each tied-source operand, check if the source vreg has
      // any use other than this MI. If yes, queue for spill.
      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
        const MachineOperand &MO = MI.getOperand(i);
        if (!MO.isReg() || !MO.isUse()) continue;
        if (!MI.isRegTiedToDefOperand(i)) continue;
        Register Reg = MO.getReg();
        if (!Reg.isVirtual()) continue;
        // Count uses excluding this one. If any other instruction
        // reads Reg, we need to preserve it across the tied-def
        // consumer.
        // Conservative: only spill when one of the OTHER uses is a
        // COPY to a *physreg* (typically a return-value setup or a
        // call-arg copy). This is the canary pattern fast regalloc
        // mishandles — value flowing both into a tied-def consumer
        // AND into a physreg copy at the end of a BB. Other patterns
        // (vreg-to-vreg COPY, store, etc.) tend to be handled by fast
        // correctly, and triggering on them inflates frame size
        // (vprintf-class functions overflow the 8-bit stack-rel
        // range otherwise).
        bool NeedSpill = false;
        bool BadUse = false;
        for (auto &U : MRI.use_nodbg_instructions(Reg)) {
          if (&U == &MI) continue;
          // PHI uses disqualify the whole candidate: redirecting a
          // PHI input requires per-edge reasoning we don't do here.
          if (U.isPHI()) { BadUse = true; break; }
          if (U.isCopy()) {
            const MachineOperand &Dst = U.getOperand(0);
            if (Dst.isReg() && Dst.getReg().isPhysical()) {
              NeedSpill = true;
              continue;
            }
          }
        }
        if (NeedSpill && !BadUse)
          Candidates.push_back({&MBB, &MI, i});
      }
    }
  }
  for (auto C : Candidates) {
    MachineInstr *MI = C.MI;
    MachineBasicBlock *MBB = C.MBB;
    unsigned OpIdx = C.OpIdx;
    Register SrcReg = MI->getOperand(OpIdx).getReg();
    if (!SrcReg.isVirtual()) continue;
    // Only Acc16 sources participate; other classes don't have the
    // single-physreg hazard this pass exists to work around.
    const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
    if (RC != &W65816::Acc16RegClass)
      continue;
    // Fresh 2-byte spill slot for the saved value.
    int FI = MF.getFrameInfo().CreateStackObject(2, Align(2),
                                                 /*isSpillSlot=*/true);
    DebugLoc DL = MI->getDebugLoc();
    // Insert STAfi $src, FI, 0 BEFORE MI.
    BuildMI(*MBB, MI, DL, TII->get(W65816::STAfi))
        .addReg(SrcReg)
        .addFrameIndex(FI)
        .addImm(0);
    // Reload into a fresh vreg immediately AFTER MI.
    Register NewReg = MRI.createVirtualRegister(&W65816::Acc16RegClass);
    auto InsertPos = std::next(MachineBasicBlock::iterator(MI));
    BuildMI(*MBB, InsertPos, DL, TII->get(W65816::LDAfi), NewReg)
        .addFrameIndex(FI)
        .addImm(0);
    // Only rewrite uses that come AFTER MI in program order — earlier
    // uses already saw SrcReg's original value before any tied-def
    // overwrite, so they don't need redirection. Uses in successor
    // MBBs definitely come after; uses in MI's own MBB after the
    // LDAfi reload come after; uses before MI in its MBB are
    // pre-consumer and stay on SrcReg.
    SmallVector<MachineOperand *, 4> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(SrcReg)) {
      if (U.getParent() == MI) continue;
      MachineBasicBlock *UseMBB = U.getParent()->getParent();
      bool After = false;
      if (UseMBB != MBB) {
        // Different block — only redirect if MI's MBB DOMINATES the
        // use's MBB. Without dominance, there's a path from the
        // function entry to the use that bypasses MI entirely (e.g.,
        // a loop-exit edge from a pre-loop block straight into a
        // post-loop block). Redirecting such a use to %19 (which is
        // only defined when MI runs) reads stale data — the previous
        // iter's MI value, or junk if MI never ran. Caught by parse2/
        // printf returning N-1 because the loop's tied-def spill of n
        // was redirected to the exit block, which on the final iter
        // (loop test fails) sees iter N-1's saved value.
        if (MDT.dominates(MBB, UseMBB))
          After = true;
      } else {
        // Same block — walk forward from MI to end, see if we hit U.
        for (auto it = MachineBasicBlock::iterator(MI), e = MBB->end();
             it != e; ++it) {
          if (&*it == U.getParent()) { After = true; break; }
        }
      }
      if (After) ToRewrite.push_back(&U);
    }
    // Defer the actual operand mutation until after the scan — the
    // use-list must not change while we iterate it.
    for (auto *MO : ToRewrite) {
      MO->setReg(NewReg);
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}

View file

@ -0,0 +1,178 @@
//===-- W65816WidenAcc16.cpp - Promote Acc16 vregs to Wide16 ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Pre-RA pass that promotes Acc16 vregs (constrained to physreg A only)
// to the wider Wide16 class (A + IMG0..IMG7). Greedy regalloc gets
// 9-way pressure relief on the i16 register class; functions that
// previously failed with "ran out of registers" can now spread their
// live i16 values across A and the DP-backed imaginaries.
//
// Cross-class moves between A and IMGn are LDA/STA dp (4 cyc each way,
// 2 bytes), emitted by W65816InstrInfo::copyPhysReg. The constraint
// that arithmetic ops require their source in A propagates back from
// the use sites — regalloc coerces Wide16 vregs to Acc16 (= {A}) at
// those sites and inserts the necessary COPYs.
//
// Calls clobber IMGn (caller-save), so any vreg in IMGn that lives
// across a call gets spilled to stack by regalloc. This pass doesn't
// model that explicitly — it relies on the calling convention's
// regmask to mark IMGn clobbered.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-widen-acc16"
namespace {
// Legacy-PM machine-function pass wrapper; the promotion itself lives
// in runOnMachineFunction (defined out-of-line below).
class W65816WidenAcc16 : public MachineFunctionPass {
public:
  static char ID; // Pass identification; address used as a unique key.
  W65816WidenAcc16() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 Acc16 → Wide16 promotion";
  }
  // The pass only inserts COPYs and rewrites operands inside existing
  // blocks, so the CFG is preserved; no other analyses are required.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace

char W65816WidenAcc16::ID = 0;

INITIALIZE_PASS(W65816WidenAcc16, DEBUG_TYPE,
                "W65816 Acc16 → Wide16 promotion", false, false)

// Factory used when building the target pass pipeline (see
// W65816PassConfig::addPreRegAlloc).
FunctionPass *llvm::createW65816WidenAcc16() {
  return new W65816WidenAcc16();
}
// Returns true if the vreg has any physreg-COPY use (e.g., return-value
// or arg-passing setup that pins the value to a specific physreg).
static bool flowsToPhysReg(Register VReg, const MachineRegisterInfo &MRI) {
  for (auto &UseMI : MRI.use_nodbg_instructions(VReg)) {
    if (!UseMI.isCopy())
      continue;
    const MachineOperand &DstOp = UseMI.getOperand(0);
    if (DstOp.isReg() && DstOp.getReg().isPhysical())
      return true;
  }
  return false;
}
// Returns true if the vreg is used by any PHI. PHI input/result must
// share the same register class (verifier requirement). Rather than
// also widen the PHI's result and recursively all of its uses, we skip
// vregs caught up in PHIs entirely — leaves a few wins on the table
// but avoids cross-MBB analysis here.
static bool usedByPhi(Register VReg, const MachineRegisterInfo &MRI) {
  for (auto &UseMI : MRI.use_nodbg_instructions(VReg))
    if (UseMI.isPHI())
      return true;
  return false;
}
// Returns true if all non-debug, non-COPY uses of VReg are at operands
// whose required register class accepts Wide16 (i.e., Wide16 or a
// superclass). COPY uses are unconstrained — fine. PHI uses already
// filtered earlier. If any use's operand class is strictly narrower
// than Wide16 (i.e., Acc16-only, Idx16-only, etc.), return false: the
// verifier rejects passing a Wide16 vreg to such an operand.
static bool allUsesAcceptWide(Register VReg,
                              const MachineRegisterInfo &MRI,
                              const TargetRegisterInfo &TRI,
                              const TargetInstrInfo &TII) {
  (void)TRI; // kept for signature symmetry; class queries go via TII
  for (auto &MO : MRI.use_nodbg_operands(VReg)) {
    MachineInstr *UseMI = MO.getParent();
    if (UseMI->isCopy())
      continue; // COPY accepts anything
    if (UseMI->isPHI())
      return false; // already filtered by the caller, but be safe
    const TargetRegisterClass *Want =
        TII.getRegClass(UseMI->getDesc(), UseMI->getOperandNo(&MO));
    // Acceptable when: the operand carries no class constraint, the
    // constraint is exactly Wide16, or Wide16 is a sub-or-equal class
    // of the constraint (a superclass containing Wide16). Anything
    // strictly narrower rejects the promotion.
    if (Want && Want != &W65816::Wide16RegClass &&
        !Want->hasSubClassEq(&W65816::Wide16RegClass))
      return false;
  }
  return true;
}
// Entry point. Phase 1 filters for safe candidates (single-def Acc16
// vregs with no PHI involvement, no physreg-COPY use, and only
// Wide16-compatible use constraints); phase 2 inserts a COPY into a
// fresh Wide16 vreg right after each candidate's def and redirects all
// other uses onto it.
bool W65816WidenAcc16::runOnMachineFunction(MachineFunction &MF) {
  // Only meaningful pre-RA; bail if vregs are already gone.
  if (!MF.getRegInfo().getNumVirtRegs()) return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  bool Changed = false;
  // For each Acc16 vreg, insert a COPY to a fresh Wide16 vreg right
  // after its def, then redirect all uses to the Wide16 vreg. The
  // original Acc16 vreg keeps its tight constraint (= {A}) for the
  // def site (which is typically a pseudo whose AsmPrinter expansion
  // assumes A); the new Wide16 vreg is free for greedy to allocate
  // anywhere in {A, IMG0..IMG7}. When both end up in A, the COPY
  // is a no-op the regalloc/coalescer collapses; when the Wide16
  // vreg lands on IMGn, the COPY becomes STA dp via copyPhysReg.
  SmallVector<Register, 16> Candidates;
  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
    Register VReg = Register::index2VirtReg(i);
    if (MRI.def_empty(VReg)) continue;
    if (MRI.getRegClass(VReg) != &W65816::Acc16RegClass) continue;
    if (flowsToPhysReg(VReg, MRI)) continue;
    if (usedByPhi(VReg, MRI)) continue;
    if (!MRI.hasOneDef(VReg)) continue; // require single SSA def
    if (!allUsesAcceptWide(VReg, MRI, *TRI, *TII)) continue;
    Candidates.push_back(VReg);
  }
  for (Register VReg : Candidates) {
    // Safe: hasOneDef was checked above, so begin() is the sole def.
    MachineInstr *DefMI = &*MRI.def_instructions(VReg).begin();
    MachineBasicBlock *MBB = DefMI->getParent();
    DebugLoc DL = DefMI->getDebugLoc();
    Register WideReg = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    // Insert AFTER the def, but if the def is a PHI, walk past all
    // PHIs in the block first — verifier requires all PHIs at MBB
    // entry, no non-PHI may sit between them.
    auto InsertAt = std::next(MachineBasicBlock::iterator(DefMI));
    if (DefMI->isPHI()) {
      while (InsertAt != MBB->end() && InsertAt->isPHI()) ++InsertAt;
    }
    BuildMI(*MBB, InsertAt, DL, TII->get(TargetOpcode::COPY), WideReg)
        .addReg(VReg);
    // Rewrite all non-debug uses of VReg (other than the COPY we just
    // inserted) to WideReg.
    SmallVector<MachineOperand *, 8> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(VReg)) {
      MachineInstr *UMI = U.getParent();
      if (UMI->getOpcode() == TargetOpcode::COPY &&
          UMI->getOperand(0).getReg() == WideReg) continue;
      ToRewrite.push_back(&U);
    }
    // Mutate after the scan — the use-list must not change while we
    // iterate it.
    for (auto *MO : ToRewrite) {
      MO->setReg(WideReg);
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}