From e65fedc8e17f00d4fbbb778fbfa7ed85d252c2fb Mon Sep 17 00:00:00 2001
From: Scott Duensing <scott.duensing@gmail.com>
Date: Wed, 13 May 2026 15:48:34 -0500
Subject: [PATCH] Checkpoint

---
 STATUS.md                                     | 171 +++-
 compare/README.md                             |  44 +
 compare/evalAt.c                              |  21 +
 compare/evalAt.calypsi.lst                    | 318 +++++++
 compare/evalAt.ours.s                         | 593 +++++++++++++
 compare/mul16to32.c                           |   4 +
 compare/mul16to32.calypsi.lst                 |  37 +
 compare/mul16to32.ours.s                      |  23 +
 compare/regen.sh                              |  44 +
 compare/sumSquares.c                          |   8 +
 compare/sumSquares.calypsi.lst                |  68 ++
 compare/sumSquares.ours.s                     |  93 +++
 mame.ini                                      | 416 ++++++++++
 patches/0005-target-data-layout-w65816.patch  |   2 +-
 plugin.ini                                    |  17 +
 runtime/build.sh                              |  13 +-
 runtime/include/assert.h                      |   6 +
 runtime/include/complex.h                     | 100 +++
 runtime/include/errno.h                       |  50 +-
 runtime/include/fenv.h                        |  51 ++
 runtime/include/inttypes.h                    |  23 +-
 runtime/include/iso646.h                      |  20 +
 runtime/include/locale.h                      |   4 +
 runtime/include/math.h                        |  44 +
 runtime/include/stdalign.h                    |  13 +
 runtime/include/stdatomic.h                   | 138 ++++
 runtime/include/stddef.h                      |   5 +-
 runtime/include/stdint.h                      |   9 +-
 runtime/include/stdio.h                       |  47 +-
 runtime/include/stdlib.h                      |  22 +
 runtime/include/stdnoreturn.h                 |   9 +
 runtime/include/string.h                      |   4 +
 runtime/include/tgmath.h                      |  97 +++
 runtime/include/threads.h                     |  91 ++
 runtime/include/time.h                        |  12 +
 runtime/include/uchar.h                       |  53 ++
 runtime/include/wchar.h                       |  40 +-
 runtime/include/wctype.h                      |  84 ++
 runtime/src/crt0.s                            |  96 ++-
 runtime/src/extras.c                          | 309 ++++++-
 runtime/src/libc.c                            | 116 +++
 runtime/src/libgcc.s                          |  85 +-
 runtime/src/math.c                            | 109 +++
 runtime/src/snprintf.c                        |  69 +-
 runtime/src/softDouble.c                      |  48 +-
 runtime/src/sscanf.c                          | 110 ++-
 runtime/src/timeExt.c                         |  54 +-
 scripts/runMultiSeg.sh                        |   4 +-
 scripts/smokeTest.sh                          | 781 +++++++++++++++++-
 src/clang/lib/Basic/Targets/W65816.h          |   2 +-
 src/link816/link816.cpp                       | 117 ++-
 src/llvm/lib/Target/W65816/CMakeLists.txt     |   3 +
 src/llvm/lib/Target/W65816/W65816.h           |  25 +
 .../lib/Target/W65816/W65816AsmPrinter.cpp    |  45 +
 .../lib/Target/W65816/W65816FrameLowering.cpp |  26 +-
 .../lib/Target/W65816/W65816I32IncFold.cpp    | 225 +++++
 .../lib/Target/W65816/W65816ISelLowering.cpp  | 132 ++-
 .../lib/Target/W65816/W65816ISelLowering.h    |   5 +
 .../lib/Target/W65816/W65816ImgCalleeSave.cpp | 278 +++++++
 .../lib/Target/W65816/W65816InstrInfo.cpp     | 159 +++-
 src/llvm/lib/Target/W65816/W65816InstrInfo.h  |  18 +-
 src/llvm/lib/Target/W65816/W65816InstrInfo.td |  25 +-
 .../lib/Target/W65816/W65816NarrowI32Mul.cpp  | 150 ++++
 .../lib/Target/W65816/W65816RegisterInfo.cpp  |  53 +-
 .../lib/Target/W65816/W65816SepRepCleanup.cpp | 392 +++++++++
 .../Target/W65816/W65816StackSlotCleanup.cpp  |  79 +-
 .../lib/Target/W65816/W65816TargetMachine.cpp |  81 +-
 ui.ini                                        |  71 ++
 68 files changed, 6153 insertions(+), 308 deletions(-)
 create mode 100644 compare/README.md
 create mode 100644 compare/evalAt.c
 create mode 100644 compare/evalAt.calypsi.lst
 create mode 100644 compare/evalAt.ours.s
 create mode 100644 compare/mul16to32.c
 create mode 100644 compare/mul16to32.calypsi.lst
 create mode 100644 compare/mul16to32.ours.s
 create mode 100755 compare/regen.sh
 create mode 100644 compare/sumSquares.c
 create mode 100644 compare/sumSquares.calypsi.lst
 create mode 100644 compare/sumSquares.ours.s
 create mode 100644 mame.ini
 create mode 100644 plugin.ini
 create mode 100644 runtime/include/complex.h
 create mode 100644 runtime/include/fenv.h
 create mode 100644 runtime/include/iso646.h
 create mode 100644 runtime/include/stdalign.h
 create mode 100644 runtime/include/stdatomic.h
 create mode 100644 runtime/include/stdnoreturn.h
 create mode 100644 runtime/include/tgmath.h
 create mode 100644 runtime/include/threads.h
 create mode 100644 runtime/include/uchar.h
 create mode 100644 runtime/include/wctype.h
 create mode 100644 src/llvm/lib/Target/W65816/W65816I32IncFold.cpp
 create mode 100644 src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp
 create mode 100644 src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp
 create mode 100644 ui.ini

diff --git a/STATUS.md b/STATUS.md
index 70060b7..a285834 100644
--- a/STATUS.md
+++ b/STATUS.md
@@ -50,6 +50,12 @@ which runs correctly under MAME (apple2gs).
   coverage as printf (`%d %u %x %ld %lu %s %c %f %p %%` + width).
   C99 truncation semantics for snprintf.  `%.Nf` produces the
   correct fractional digits with round-half-up.
+- scanf family: `sscanf` / `vsscanf` parse a C string; `fscanf` /
+  `vfscanf` bridge to vsscanf via a per-call line buffer (caps at
+  255 bytes / line; a longer line silently truncates).  `scanf`
+  reads from stdin which always returns EOF on this target — the
+  surface compiles but isn't useful without a stdin source.
+  Format directives: `%d %i %u %x %X %o %s %c %ld %lu %lx %li %lo %%`.
 - qsort + bsearch over arbitrary element size with a user `cmp`
   callback.
 - Standard string/stdlib glue: strcat, strncat, strpbrk, strspn,
@@ -91,12 +97,61 @@ which runs correctly under MAME (apple2gs).
 - `<wchar.h>`: wcslen / wcscmp / wcsncmp / wcscpy / wcsncpy /
   wcscat / wcschr / wcsrchr; mbtowc / wctomb / mbstowcs /
   wcstombs / mblen with the trivial 1:1 byte<->wide mapping
-  (Latin-1).  wchar_t is 16-bit on this target.
+  (Latin-1).  wchar_t is 16-bit on this target.  Extended set:
+  wmemcpy / wmemmove / wmemset / wmemcmp / wmemchr;
+  wcstol / wcstoul / wcstoll / wcstoull / wcstod / wcstof;
+  swprintf / vswprintf; wcsftime.  All delegate to the byte
+  equivalents under the Latin-1 model.
 - `<signal.h>`: in-process signal table.  signal() registers a
   handler; raise() invokes it.  Default actions: SIGABRT calls
   abort(), SIGINT/SIGTERM call exit(128+sig), others ignored.
 - `<locale.h>`: setlocale always returns "C"; localeconv returns
   a fixed C-locale lconv struct.
+- `<fenv.h>`: rounding mode + exception flag word tracked but
+  no-op (softFloat / softDouble are fixed RNE; exceptions never
+  raised).  Surface compiles cleanly for portable code.
+- `<tgmath.h>`: C11 type-generic math via `_Generic`; selects
+  `sqrtf` vs `sqrt` etc. based on argument type.
+- `<stdatomic.h>`: C11 atomic surface, all ops lower to plain
+  ops (single-core uniprocessor — no real synchronization
+  needed).  `_Atomic T` is treated as plain `T`.
+- `<threads.h>`: stubs.  `thrd_create` returns `thrd_error`;
+  mutex/cond ops are no-ops; `call_once` and `tss_*` work since
+  they're degenerate on a single-core target.
+- `aligned_alloc` / `posix_memalign` / `aligned_free`: wrap
+  malloc with an over-allocation + pointer-stash trick.  Match
+  C11 contract — `aligned_alloc(N, M)` returns N-aligned, free
+  with `aligned_free`.
+- `<iso646.h>`: alternative operator spellings (`and`, `or`,
+  `not`, etc.) — C95 compat header.
+- `<stdalign.h>`: aliases `_Alignas` / `_Alignof` to `alignas` /
+  `alignof`.
+- `<stdnoreturn.h>`: aliases `_Noreturn` to `noreturn`.
+- `<uchar.h>`: `char16_t` / `char32_t` typedefs + `mbrtoc16` /
+  `c16rtomb` / `mbrtoc32` / `c32rtomb` conversion helpers.  In
+  our Latin-1 model these are 1:1 byte copies (no UTF-8 decode).
+- `<wctype.h>`: wide-char classification + case folding.
+  Delegates to `<ctype.h>` for code-points 0..255; anything
+  outside Latin-1 returns false / unchanged.
+- `<complex.h>`: C99 complex-number surface — clang built-in
+  `_Complex` lowers to soft-double under the hood.  Macros
+  `complex` / `_Complex_I` / `I` / `CMPLX` / `CMPLXF` / `CMPLXL`
+  plus inline `creal` / `cimag` / `conj` / `cproj` / `cabs` /
+  `carg` and their `f` / `l` variants.  Transcendental complex
+  routines (csin/ccos/cexp/etc.) intentionally not provided —
+  they would each need a polynomial-expansion implementation
+  with limited IIgs value.
+- `<assert.h>`: adds C11 `static_assert` as a macro alias for
+  the `_Static_assert` keyword.
+- `<errno.h>`: full C standard error codes (EDOM, ERANGE,
+  EILSEQ) plus common POSIX codes (EPERM..EPIPE, ENAMETOOLONG,
+  ENOSYS, ENOTEMPTY, ELOOP).  `strerror` maps every defined
+  code to a human-readable string.
+- `<stdio.h>`: adds C standard buffer-control surface
+  (`setvbuf` / `setbuf` as no-ops, `_IOFBF` / `_IOLBF` / `_IONBF`
+  / `BUFSIZ`); `fgetpos` / `fsetpos` wrap `ftell` / `fseek`;
+  `remove` routes through `mfsUnregister`; `rename` / `tmpfile`
+  / `tmpnam` are stubs.
 - C++ subset: classes, single inheritance, multiple inheritance
   (Drawable+Movable through one Sprite), virtual base diamond
   (A and B virtually derive Base; Diamond inherits from both
@@ -162,7 +217,7 @@ which runs correctly under MAME (apple2gs).
   image addresses.
 - `runtime/build.sh` builds crt0, libc, soft-float, soft-double,
   libgcc into linkable objects.
-- `scripts/smokeTest.sh` runs 132 end-to-end checks at -O2:
+- `scripts/smokeTest.sh` runs 145 end-to-end checks at -O2:
   scalar ops, control flow, calling conventions, MAME execution
   regressions, link816 bss-base safety + weak-symbol resolution +
   heap_end-vs-heap_start sanity, iigs/toolbox.h compile + link,
@@ -191,20 +246,23 @@ which runs correctly under MAME (apple2gs).
 
 - `scripts/benchCyclesPrecise.sh` measures per-call cycle counts
   via MAME's emulated time counter.  Eight benchmarks under
-  `benchmarks/`.  Current numbers: popcount 4876 cyc, bsearch
-  938, memcmp 1330, strcpy 3325, dotProduct 4007, fib(10) 12958,
-  sumOfSquares 40920.  Speed is the optimization priority, not
+  `benchmarks/`.  Current numbers: popcount 3683 cyc, bsearch
+  852, memcmp 1091, strcpy 2558, dotProduct 2387, fib(10) 12617,
+  sumOfSquares 23529.  Speed is the optimization priority, not
   size.
 
 **Backend register allocation:**
 
-- Basic regalloc as default at -O1+; fast at -O0/optnone.  We use
-  basic instead of greedy because greedy fails ("ran out of
-  registers during register allocation") on functions with many
-  cross-call Acc16 vregs (the `ok |= bit; helper(); ok |= bit;`
-  pattern across many if-blocks).  Basic handles those cleanly
-  with negligible code-size overhead vs greedy on the bench
-  suite (~0.6%).
+- Greedy regalloc as default at -O1+; fast at -O0/optnone.  Greedy
+  was previously blocked by an upstream LLVM `LiveRangeEdit::elimina-
+  teDeadDef` assertion firing on KILL pseudos with non-dead implicit-
+  def $a.  Fix landed in `tools/llvm-mos/llvm/lib/CodeGen/InlineSpil-
+  ler.cpp`: when InlineSpiller converts a redundant STAfi to a KILL
+  pseudo, mark BOTH explicit and implicit defs dead (the original loop
+  only iterated `MI.defs()` = explicit-only, leaving the inherited
+  implicit-def $a live).  Bench impact: popcount −19.4%, strcpy
+  −18.9%, memcmp −8.6%, bsearch −9.2%.
+
 - Pre-RA passes: `WidenAcc16` (Acc16→Wide16 promotion, lets
   greedy spread i16 pressure across A and 16 IMG slots);
   `TiedDefSpill` (handles tied-def-multi-use hazard);
@@ -259,29 +317,39 @@ for the common-case C / minimal-C++ workload.  Priority is speed
 
 **Speed wins queued, ranked by expected impact:**
 
-- **ptr32 pointer-increment overhead.**  `*p++` under ptr32 emits
-  a full 32-bit `ADC` chain even when the high half is provably
-  unchanged, and LSR rewrites `*p++` into base+offset (worse on
-  W65816).  strcpy/memcmp pay 30+ cycles per byte for what should
-  be 15-20.  Tried `-disable-lsr` (strcpy −10%, dotProduct +10%)
-  and TTI `isLSRCostLess` override (memcmp +22% — worse); both
-  too risky without per-loop heuristics.  Needs either a peephole
-  for `i32 + 1` with provably-no-carry-into-hi or per-loop LSR
-  override based on pointer-vs-array access pattern.
+- **ptr32 pointer-increment overhead** (partially addressed).  The
+  `i32 += 1` post-PEI peephole (`W65816I32IncFold`) detects the
+  6-instruction LDA/ADCi16imm 1/STA/LDA/ADCEi16imm 0/STA pattern and
+  rewrites to LDA/INA/STA/INC_HI_IF_CARRY (with private-label BNE
+  expansion in AsmPrinter).  Saves ~13 cyc per increment on the
+  no-carry common path.  memcmp 1330 → 1194 (−10.2%), strcpy 3325 →
+  3154 (−5.1%).  LSR's `*p++ → base+offset` rewrite remains
+  unaddressed; tried `-disable-lsr` and `isLSRCostLess` override,
+  both regressed dotProduct.
 
-- **Greedy regalloc retry.**  Currently blocked on an upstream
-  LLVM `LiveRangeEdit::eliminateDeadDef` assertion when our
-  sub-register pair partial-defs reach it.  Basic regalloc works
-  but leaves measurable cycle waste in load/store shuffles.
+- **More peephole / libcall opportunities.**  __mulsi3 just gained
+  early-exit when the multiplier shifts to 0; dotProduct dropped
+  4007→2472 (−38.3%), sumOfSquares 40920→23870 (−41.6%).  Next
+  candidates: a true 16×16→32 multiply libcall (for `(u32)i*i`
+  patterns) and shift-by-N inlining for shifts 5+ that currently
+  go through __ashlsi3.
 
 **Open limitations:**
 
-- **Multi-bank BSS / init_array.**  Multi-segment mode splits
-  `.text` across banks but BSS + init_array still live in
-  segment 1's bank (bank 0).  Programs with zero-init data
-  exceeding the ~60KB bank-0 budget need crt0 to walk a
-  per-segment `(start, end)` table.  Not a blocker for >64KB
-  *code* programs.
+- **Multi-bank BSS** — full support up to 4 banks (256KB).  link816
+  splits BSS into up to 4 contiguous segments at link time; each
+  segment fits within a single bank.  Linker emits
+  `__bss_seg{0..3}_lo16 / _bank / _size` symbols.  crt0 walks the
+  table, setting DBR per segment.  Per-segment size capped at
+  0xFF00 so the 16-bit `cpx #__bss_segN_size` loop comparison
+  doesn't wrap to 0 on a full-bank segment (a single full bank is
+  split into a 0xFF00-byte primary + 0x100-byte tail in the same
+  bank).  Smoke 137/137 validates BSS spanning bank 3 + bank 4
+  (100KB) is zeroed end-to-end.  Note: program access to non-DBR
+  bank globals still requires DBR management — the compiler emits
+  DBR-relative absolute for global accesses, so accessing BSS in
+  bank N needs the program to set DBR=N or use `sta long` via
+  inline asm.
 
 - **C++ exceptions absent from CI smoke.**  The SJLJ runtime
   round-trip is in smoke; the full clang++ → backend → MAME
@@ -295,13 +363,36 @@ for the common-case C / minimal-C++ workload.  Priority is speed
   real bootable GS/OS volume is left out of CI as it needs a
   smartport hard-disk image and live Tool Locator init.
 
-- **gmtime_r requires `optnone`.**  IR-level optimizer issue:
-  loop rotation + IndVar simplify mis-evaluate `days >= 365L +
-  (__isLeap(...) ? 1 : 0)`, folding the comparison to
-  compile-time-false.  Not a backend bug; needs IR-pass-level
-  diagnosis.
+- **VLAs work end-to-end** (2026-05-09).  Backend Custom-lowers
+  `ISD::DYNAMIC_STACKALLOC` for both i16 and i32 result types.
+  Loop patterns now produce correct results: `sum_n(3)→6`
+  verified in MAME smoke.  Fix: in VLA functions PEI expands
+  STAfi/STA8fi/STAfi_indY to a 4-MC sequence ending in `LDY $F8`
+  which clobbers N/Z; the StackSlotCleanup PHP/PLP wrap pass
+  treats those pseudos as flag-corrupting so PLP wraps the entire
+  expansion.  `expandFarFI` uses `STY $F8`/`LDY $F8` to a DP
+  scratch slot rather than PHY/PLY (PHY/PLY between PHP/PLP would
+  pollute the saved P).
 
-- **softDouble `dpack` / `dclass` require `noinline`.**
-  Inlining triggers register pressure that overflows basic
-  regalloc in `__adddf3`/`__muldf3`/`__divdf3`.  Architectural
-  for the same reason as qsort's earlier split.
+- **dpack and dclass now both inline** (2026-05-10).  dpack uses
+  a volatile-output array rewrite to defeat the backend stack-slot
+  coalesce bug that previously caused dadd(1.5, 2.5) →
+  0x4010_4010_0000_0000.  dclass's pointer-arg stores lower to
+  STBptr/STAptr (indirect-long, DBR-independent) and inline
+  cleanly.  All softDouble routines compile at -O2.
+
+- **IMG8..IMG15 callee-save via W65816ImgCalleeSave** (2026-05-13).
+  New post-RA, pre-PEI pass detects use of IMG8..IMG15 ($C0..$CE)
+  in a function and emits prologue save + epilogue restore so those
+  slots behave as callee-saved AT THE ASM LEVEL — without going
+  through LLVM's CSR mechanism (which would shift regalloc decisions
+  and break unrelated tests).  Save shape per used slot: `PHA; LDA
+  $C?; STAfi A,slot,2; PLA`; restore mirrors it.  The `+2` ImmOffset
+  compensates for PHA's SP shift so the lowered `sta d,s` lands on
+  the same byte that subsequent normal-SP reads see.  Cost: ~16
+  cycles + 6 bytes per used slot, applied only to functions that
+  actually use those slots (most don't).  Fixed picol `expr 1+2 == 4`
+  (now `3`) and a class of recursive double-fn miscompiles with
+  compound `||` conditions — see `feedback_picol_expr_compound_or.md`.
+  Smoke 149/149 green including a new orBug regression test guarding
+  the fix.
diff --git a/compare/README.md b/compare/README.md
new file mode 100644
index 0000000..05ee9d4
--- /dev/null
+++ b/compare/README.md
@@ -0,0 +1,44 @@
+# compare/ — backend output side-by-side with Calypsi 5.16
+
+Each test case lives as three files:
+
+- `<name>.c`           — the C source.
+- `<name>.ours.s`      — our backend's assembly (`clang --target=w65816 -O2 -S`).
+- `<name>.calypsi.lst` — Calypsi's listing file with source, hex bytes, and asm
+                         in one document (`cc65816 --speed -O 2 --64bit-doubles`).
+
+Calypsi's `--output` flag emits an ELF object, not text — its `--list-file` is the
+human-readable artifact. (32-bit-doubles is Calypsi's default; we pass
+`--64bit-doubles` so FP-heavy tests compare apples to apples against our IEEE-754
+`double` ABI.)
+
+## Regenerating
+
+```
+bash compare/regen.sh
+```
+
+Recompiles every `*.c` in this directory under both compilers and prints an
+instruction-count summary:
+
+```
+test                        ours  calypsi    ratio
+----                        ----  -------    -----
+evalAt                       419      268    1.56x
+mul16to32                     12       11    1.09x
+sumSquares                    72       31    2.32x
+```
+
+(Numbers above are illustrative — re-run to see current state.)
+
+## Adding a new comparison
+
+Drop a `<name>.c` in this directory and run `regen.sh`. No other wiring needed.
+
+## Counting methodology
+
+The summary counts asm-line opcodes (lda/sta/jsl/...) on our side and listing
+lines that begin with a hex byte (Calypsi's emit-byte column) on theirs.
+Both metrics are static instruction counts, NOT bytes. They underestimate
+calls-to-runtime (each libcall counts as one `jsl`, not the body it expands to).
+For cycle counts, use `scripts/benchCyclesPrecise.sh`.
diff --git a/compare/evalAt.c b/compare/evalAt.c
new file mode 100644
index 0000000..e739f2e
--- /dev/null
+++ b/compare/evalAt.c
@@ -0,0 +1,21 @@
+// Benchmark function — orBug-style recursive double expression eval.
+// Used to compare W65816 backend (with W65816ImgCalleeSave pass) vs Calypsi.
+double evalAt(char **p, int prec) {
+    double a = 0.0;
+    while (**p >= '0' && **p <= '9') {
+        a = a * 10.0 + (double)(**p - '0');
+        (*p)++;
+    }
+    while (1) {
+        int op = **p;
+        int oprec;
+        if (op == '*' || op == '/') oprec = 4;
+        else if (op == '+' || op == '-') oprec = 3;
+        else return a;
+        if (oprec <= prec) return a;
+        (*p)++;
+        double b = evalAt(p, oprec);
+        if (op == '+') a = a + b;
+        else if (op == '*') a = a * b;
+    }
+}
diff --git a/compare/evalAt.calypsi.lst b/compare/evalAt.calypsi.lst
new file mode 100644
index 0000000..e79fa69
--- /dev/null
+++ b/compare/evalAt.calypsi.lst
@@ -0,0 +1,318 @@
+###############################################################################
+#                                                                             #
+# Calypsi ISO C compiler for 65816                               version 5.16 #
+#                                                       13/May/2026  15:46:15 #
+# Command line: --speed -O 2 --64bit-doubles evalAt.c -o                      #
+#               /tmp/evalAt.calypsi.elf --list-file evalAt.calypsi.lst        #
+#                                                                             #
+###############################################################################
+
+    \ 000000                      .rtmodel version,"1"
+    \ 000000                      .rtmodel codeModel,"large"
+    \ 000000                      .rtmodel dataModel,"small"
+    \ 000000                      .rtmodel core,"65816"
+    \ 000000                      .rtmodel huge,"0"
+    \ 000000                      .rtmodel doubleSize,"64"
+    \ 000000                      .rtmodel target,"none-specified"
+    \ 000000                      .extern _Dp
+    \ 000000                      .extern _Vfp
+    \ 000000                      .extern __f64_add
+    \ 000000                      .extern __f64_mul
+    \ 000000                      .extern __i32_to_f64
+0001                  // Benchmark function — orBug-style recursive double expression eval.
+0002                  // Used to compare W65816 backend (with W65816ImgCalleeSave pass) vs Calypsi.
+0003                  double evalAt(char **p, int prec) {
+    \ 000000                      .section farcode,text
+    \ 000000                      .public evalAt
+    \ 000000          evalAt:
+    \ 000000 d4..                 pei     dp:.tiny (_Dp+8)
+    \ 000002 a8                   tay
+    \ 000003 3b                   tsc
+    \ 000004 38                   sec
+    \ 000005 e92600               sbc     ##38
+    \ 000008 1b                   tcs
+    \ 000009 98                   tya
+    \ 00000a 831d                 sta     29,s
+    \ 00000c a5..                 lda     dp:.tiny _Dp
+    \ 00000e 831b                 sta     27,s
+    \ 000010 a5..                 lda     dp:.tiny (_Dp+2)
+    \ 000012 85..                 sta     dp:.tiny (_Dp+8)
+0004                      double a = 0.0;
+    \ 000014 ad....               lda     _Const_0000000000000000+6
+    \ 000017 8309                 sta     9,s
+    \ 000019 ad....               lda     _Const_0000000000000000+4
+    \ 00001c 8307                 sta     7,s
+    \ 00001e ad....               lda     _Const_0000000000000000+2
+    \ 000021 8305                 sta     5,s
+    \ 000023 ad....               lda     _Const_0000000000000000
+    \ 000026 8303                 sta     3,s
+0005                      while (**p >= '0' && **p <= '9') {
+    \ 000028 a309                 lda     9,s
+    \ 00002a 8319                 sta     25,s
+    \ 00002c a307                 lda     7,s
+    \ 00002e 8317                 sta     23,s
+    \ 000030 a305                 lda     5,s
+    \ 000032 8315                 sta     21,s
+    \ 000034 a303                 lda     3,s
+    \ 000036 8313     `?L41`:     sta     19,s
+    \ 000038 22......             jsl     long:`?L44`
+    \ 00003c e220                 sep     #32
+    \ 00003e c930                 cmp     #48
+    \ 000040 c220                 rep     #32
+    \ 000042 b003                 bcs     `?L48`
+    \ 000044 4c....               jmp     .kbank `?L5`
+    \ 000047 a31b     `?L48`:     lda     27,s
+    \ 000049 a8                   tay
+    \ 00004a be0000               ldx     0,y
+    \ 00004d a93900               lda     ##57
+    \ 000050 e220                 sep     #32
+    \ 000052 dd0000               cmp     0,x
+    \ 000055 c220                 rep     #32
+    \ 000057 9072                 bcc     `?L5`
+0006                          a = a * 10.0 + (double)(**p - '0');
+    \ 000059 a2....               ldx     ##_Const_4024000000000000
+    \ 00005c 86..                 stx     dp:.tiny (_Dp+2)
+    \ 00005e 3b                   tsc
+    \ 00005f 18                   clc
+    \ 000060 691300               adc     ##19
+    \ 000063 85..                 sta     dp:.tiny _Dp
+    \ 000065 3b                   tsc
+    \ 000066 18                   clc
+    \ 000067 690300               adc     ##3
+    \ 00006a 22......             jsl     long:__f64_mul
+    \ 00006e 22......             jsl     long:`?L44`
+    \ 000072 29ff00               and     ##255
+    \ 000075 38                   sec
+    \ 000076 e93000               sbc     ##48
+    \ 000079 a20000               ldx     ##0
+    \ 00007c a8                   tay
+    \ 00007d 1001                 bpl     `?L31`
+    \ 00007f ca                   dex
+    \ 000080          `?L31`:
+    \ 000080 86..                 stx     dp:.tiny (_Dp+2)
+    \ 000082 85..                 sta     dp:.tiny _Dp
+    \ 000084 3b                   tsc
+    \ 000085 18                   clc
+    \ 000086 690b00               adc     ##11
+    \ 000089 22......             jsl     long:__i32_to_f64
+    \ 00008d 3b                   tsc
+    \ 00008e 18                   clc
+    \ 00008f 690b00               adc     ##11
+    \ 000092 85..                 sta     dp:.tiny (_Dp+2)
+    \ 000094 3b                   tsc
+    \ 000095 18                   clc
+    \ 000096 690300               adc     ##3
+    \ 000099 85..                 sta     dp:.tiny _Dp
+    \ 00009b 3b                   tsc
+    \ 00009c 18                   clc
+    \ 00009d 690300               adc     ##3
+    \ 0000a0 22......             jsl     long:__f64_add
+    \ 0000a4 a309                 lda     9,s
+    \ 0000a6 8311                 sta     17,s
+    \ 0000a8 a307                 lda     7,s
+    \ 0000aa 830f                 sta     15,s
+    \ 0000ac a305                 lda     5,s
+    \ 0000ae 830d                 sta     13,s
+    \ 0000b0 a303                 lda     3,s
+    \ 0000b2 830b                 sta     11,s
+0007                          (*p)++;
+    \ 0000b4 a31b                 lda     27,s
+    \ 0000b6 aa                   tax
+    \ 0000b7 fe0000               inc     0,x
+    \ 0000ba a311                 lda     17,s
+    \ 0000bc 8319                 sta     25,s
+    \ 0000be a30f                 lda     15,s
+    \ 0000c0 8317                 sta     23,s
+    \ 0000c2 a30d                 lda     13,s
+    \ 0000c4 8315                 sta     21,s
+    \ 0000c6 a30b                 lda     11,s
+    \ 0000c8 4c....               jmp     .kbank `?L41`
+    \ 0000cb          `?L5`:
+0008                      }
+0009                      while (1) {
+    \ 0000cb a319                 lda     25,s
+    \ 0000cd 8325                 sta     37,s
+    \ 0000cf a317                 lda     23,s
+    \ 0000d1 8323                 sta     35,s
+    \ 0000d3 a315                 lda     21,s
+    \ 0000d5 8321                 sta     33,s
+    \ 0000d7 a313                 lda     19,s
+    \ 0000d9 831f     `?L40`:     sta     31,s
+0010                          int op = **p;
+0011                          int oprec;
+0012                          if (op == '*' || op == '/') oprec = 4;
+    \ 0000db 22......             jsl     long:`?L44`
+    \ 0000df 29ff00               and     ##255
+    \ 0000e2 830b                 sta     11,s
+    \ 0000e4 c92a00               cmp     ##42
+    \ 0000e7 f016                 beq     `?L12`
+    \ 0000e9 c92f00               cmp     ##47
+    \ 0000ec f011                 beq     `?L12`
+0013                          else if (op == '+' || op == '-') oprec = 3;
+    \ 0000ee c92b00               cmp     ##43
+    \ 0000f1 f005                 beq     `?L15`
+    \ 0000f3 c92d00               cmp     ##45
+    \ 0000f6 d018                 bne     `?L19`
+    \ 0000f8 a90300   `?L15`:     lda     ##3
+    \ 0000fb 8301                 sta     1,s
+    \ 0000fd 8005                 bra     `?L11`
+    \ 0000ff a90400   `?L12`:     lda     ##4
+    \ 000102 8301                 sta     1,s
+    \ 000104          `?L11`:
+0014                          else return a;
+0015                          if (oprec <= prec) return a;
+    \ 000104 a5..                 lda     dp:.tiny (_Dp+8)
+    \ 000106 38                   sec
+    \ 000107 e301                 sbc     1,s
+    \ 000109 5003                 bvc     `?L35`
+    \ 00010b 490080               eor     ##-32768
+    \ 00010e 302a     `?L35`:     bmi     `?L18`
+    \ 000110 a325     `?L19`:     lda     37,s
+    \ 000112 a00600               ldy     ##6
+    \ 000115 931d                 sta     (29,s),y
+    \ 000117 a323                 lda     35,s
+    \ 000119 a00400               ldy     ##4
+    \ 00011c 931d                 sta     (29,s),y
+    \ 00011e a321                 lda     33,s
+    \ 000120 a00200               ldy     ##2
+    \ 000123 931d                 sta     (29,s),y
+    \ 000125 a31f                 lda     31,s
+    \ 000127 a00000               ldy     ##0
+    \ 00012a 931d                 sta     (29,s),y
+    \ 00012c a31d                 lda     29,s
+0016                          (*p)++;
+0017                          double b = evalAt(p, oprec);
+0018                          if (op == '+') a = a + b;
+0019                          else if (op == '*') a = a * b;
+0020                      }
+0021                  }
+    \ 00012e a8                   tay
+    \ 00012f 3b                   tsc
+    \ 000130 18                   clc
+    \ 000131 692600               adc     ##38
+    \ 000134 1b                   tcs
+    \ 000135 98                   tya
+    \ 000136 7a                   ply
+    \ 000137 84..                 sty     dp:.tiny (_Dp+8)
+    \ 000139 6b                   rtl
+    \ 00013a a31b     `?L18`:     lda     27,s
+    \ 00013c aa                   tax
+    \ 00013d fe0000               inc     0,x
+    \ 000140 a301                 lda     1,s
+    \ 000142 85..                 sta     dp:.tiny (_Dp+2)
+    \ 000144 a31b                 lda     27,s
+    \ 000146 85..                 sta     dp:.tiny _Dp
+    \ 000148 3b                   tsc
+    \ 000149 18                   clc
+    \ 00014a 690300               adc     ##3
+    \ 00014d 22......             jsl     long:evalAt
+    \ 000151 a30b                 lda     11,s
+    \ 000153 c92b00               cmp     ##43
+    \ 000156 d037                 bne     `?L21`
+    \ 000158 3b                   tsc
+    \ 000159 18                   clc
+    \ 00015a 690300               adc     ##3
+    \ 00015d 85..                 sta     dp:.tiny (_Dp+2)
+    \ 00015f 3b                   tsc
+    \ 000160 18                   clc
+    \ 000161 691f00               adc     ##31
+    \ 000164 85..                 sta     dp:.tiny _Dp
+    \ 000166 3b                   tsc
+    \ 000167 18                   clc
+    \ 000168 690300               adc     ##3
+    \ 00016b 22......             jsl     long:__f64_add
+    \ 00016f a309                 lda     9,s
+    \ 000171 8319                 sta     25,s
+    \ 000173 a307                 lda     7,s
+    \ 000175 8317                 sta     23,s
+    \ 000177 a305                 lda     5,s
+    \ 000179 8315                 sta     21,s
+    \ 00017b a303                 lda     3,s
+    \ 00017d 8313                 sta     19,s
+    \ 00017f a319                 lda     25,s
+    \ 000181 8311                 sta     17,s
+    \ 000183 a317                 lda     23,s
+    \ 000185 830f                 sta     15,s
+    \ 000187 a315                 lda     21,s
+    \ 000189 830d                 sta     13,s
+    \ 00018b a313                 lda     19,s
+    \ 00018d 805a                 bra     `?L43`
+    \ 00018f c92a00   `?L21`:     cmp     ##42
+    \ 000192 d037                 bne     `?L24`
+    \ 000194 3b                   tsc
+    \ 000195 18                   clc
+    \ 000196 690300               adc     ##3
+    \ 000199 85..                 sta     dp:.tiny (_Dp+2)
+    \ 00019b 3b                   tsc
+    \ 00019c 18                   clc
+    \ 00019d 691f00               adc     ##31
+    \ 0001a0 85..                 sta     dp:.tiny _Dp
+    \ 0001a2 3b                   tsc
+    \ 0001a3 18                   clc
+    \ 0001a4 690300               adc     ##3
+    \ 0001a7 22......             jsl     long:__f64_mul
+    \ 0001ab a309                 lda     9,s
+    \ 0001ad 8311                 sta     17,s
+    \ 0001af a307                 lda     7,s
+    \ 0001b1 830f                 sta     15,s
+    \ 0001b3 a305                 lda     5,s
+    \ 0001b5 830d                 sta     13,s
+    \ 0001b7 a303                 lda     3,s
+    \ 0001b9 830b                 sta     11,s
+    \ 0001bb a311                 lda     17,s
+    \ 0001bd 8309                 sta     9,s
+    \ 0001bf a30f                 lda     15,s
+    \ 0001c1 8307                 sta     7,s
+    \ 0001c3 a30d                 lda     13,s
+    \ 0001c5 8305                 sta     5,s
+    \ 0001c7 a30b                 lda     11,s
+    \ 0001c9 800e                 bra     `?L42`
+    \ 0001cb a325     `?L24`:     lda     37,s
+    \ 0001cd 8309                 sta     9,s
+    \ 0001cf a323                 lda     35,s
+    \ 0001d1 8307                 sta     7,s
+    \ 0001d3 a321                 lda     33,s
+    \ 0001d5 8305                 sta     5,s
+    \ 0001d7 a31f                 lda     31,s
+    \ 0001d9 8303     `?L42`:     sta     3,s
+    \ 0001db a309                 lda     9,s
+    \ 0001dd 8311                 sta     17,s
+    \ 0001df a307                 lda     7,s
+    \ 0001e1 830f                 sta     15,s
+    \ 0001e3 a305                 lda     5,s
+    \ 0001e5 830d                 sta     13,s
+    \ 0001e7 a303                 lda     3,s
+    \ 0001e9 830b     `?L43`:     sta     11,s
+    \ 0001eb a311                 lda     17,s
+    \ 0001ed 8325                 sta     37,s
+    \ 0001ef a30f                 lda     15,s
+    \ 0001f1 8323                 sta     35,s
+    \ 0001f3 a30d                 lda     13,s
+    \ 0001f5 8321                 sta     33,s
+    \ 0001f7 a30b                 lda     11,s
+    \ 0001f9 4c....               jmp     .kbank `?L40`
+    \ 000000                      .section farcode,text
+    \ 000000 a31e     `?L44`:     lda     30,s
+    \ 000002 a8                   tay
+    \ 000003 be0000               ldx     0,y
+    \ 000006 bd0000               lda     0,x
+    \ 000009 6b                   rtl
+    \ 000000                      .section cdata,rodata
+    \ 000000                      .pubweak _Const_0000000000000000
+    \ 000000          _Const_0000000000000000:
+    \ 000000 00000000             .quad   0
+    \ 000004 00000000
+    \ 000000                      .section cdata,rodata
+    \ 000000                      .pubweak _Const_4024000000000000
+    \ 000000          _Const_4024000000000000:
+    \ 000000 00000000             .quad   0x4024000000000000
+    \ 000004 00002440
+
+##########################
+#                        #
+# Memory sizes (decimal) #
+#                        #
+##########################
+
+Executable  (Text): 518 bytes
+Constant          :  16 bytes
diff --git a/compare/evalAt.ours.s b/compare/evalAt.ours.s
new file mode 100644
index 0000000..cd8be47
--- /dev/null
+++ b/compare/evalAt.ours.s
@@ -0,0 +1,593 @@
+	.file	"evalAt.c"
+	.text
+	.globl	evalAt                          ; -- Begin function evalAt
+	.type	evalAt,@function
+evalAt:                                 ; @evalAt
+; %bb.0:                                ; %entry
+	rep	#0x30
+	tay
+	tsc
+	sec
+	sbc	#0x46
+	tcs
+	tya
+	pha
+	lda	0xc0
+	sta	0xb, s
+	lda	0xc4
+	sta	0x9, s
+	lda	0xc6
+	sta	0x7, s
+	lda	0xc8
+	sta	0x5, s
+	lda	0xca
+	sta	0x3, s
+	pla
+	stx	0xc0
+	sta	0x19, s
+	clc
+	adc	#0x2
+	sta	0x1f, s
+	lda	0xc0
+	sta	0x21, s
+	adc	#0x0
+	sta	0x21, s
+	lda	0x1f, s
+	sta	0x45, s
+	lda	0x21, s
+	sta	0x43, s
+	lda	0x45, s
+	sta	0xe0
+	lda	0x43, s
+	sta	0xe2
+	ldy	#0x0
+	lda	[0xe0 ], y
+	sta	0x1d, s
+	lda	0x19, s
+	sta	0x41, s
+	pha
+	lda	0xc0
+	sta	0x41, s
+	pla
+	lda	0x41, s
+	sta	0xe0
+	lda	0x3f, s
+	sta	0xe2
+	lda	[0xe0 ], y
+	sta	0x21, s
+	lda	0x4a, s
+	sta	0xb, s
+	lda	#0x0
+	sta	0xc4
+	sta	0xc6
+	lda	0x21, s
+	sta	0x3d, s
+	lda	0x1d, s
+	sta	0x3b, s
+	lda	0x3d, s
+	sta	0xe0
+	lda	0x3b, s
+	sta	0xe2
+	lda	[0xe0 ], y
+	and	#0xff
+	sta	0x1b, s
+	sep	#0x20
+	clc
+	adc	#0xd0
+	rep	#0x20
+	and	#0xff
+	cmp	#0xa
+	pha
+	lda	0xc4
+	sta	0xc8
+	pla
+	pha
+	lda	0xc6
+	sta	0xca
+	pla
+	bcc	.LBB0_1
+; %bb.15:                               ; %entry
+	brl	.LBB0_4
+.LBB0_1:                                ; %while.body.preheader
+	lda	0x21, s
+	inc a
+	sta	0x21, s
+	bne	.Ltmp0
+	lda	0x1d, s
+	inc a
+	sta	0x1d, s
+.Ltmp0:
+	lda	#0x0
+	sta	0x15, s
+	sta	0x13, s
+	sta	0x11, s
+	sta	0xf, s
+	lda	0x1d, s
+	sta	0x17, s
+.LBB0_2:                                ; %while.body
+                                        ; =>This Inner Loop Header: Depth=1
+	sta	0x1d, s
+	lda	0x19, s
+	tax
+	pha
+	lda	0xc0
+	sta	0x3b, s
+	pla
+	txa
+	sta	0xe0
+	lda	0x39, s
+	sta	0xe2
+	lda	0x21, s
+	ldy	#0x0
+	sta	[0xe0 ], y
+	lda	0x19, s
+	clc
+	adc	#0x2
+	sta	0xd, s
+	lda	0xc0
+	sta	0x1f, s
+	adc	#0x0
+	sta	0x1f, s
+	lda	0xd, s
+	sta	0x37, s
+	lda	0x1f, s
+	tax
+	lda	0x37, s
+	sta	0xe0
+	txa
+	sta	0xe2
+	lda	0x1d, s
+	sta	[0xe0 ], y
+	pea	0x4024
+	pea	0x0
+	pea	0x0
+	pea	0x0
+	lda	0x17, s
+	pha
+	lda	0x1b, s
+	pha
+	lda	0x1f, s
+	tax
+	lda	0x21, s
+	jsl	__muldf3
+	sta	0xe0
+	tsc
+	clc
+	adc	#0xc
+	tcs
+	lda	0xe0
+	sta	0x1f, s
+	txa
+	sta	0x15, s
+	tya
+	sta	0x13, s
+	lda	0xf0
+	sta	0x11, s
+	lda	0x1b, s
+	sep	#0x20
+	clc
+	adc	#0xd0
+	rep	#0x20
+	and	#0xff
+	sta	0x1b, s
+	ldx	#0x0
+	lda	0x1b, s
+	jsl	__floatunsidf
+	sta	0x1b, s
+	txa
+	sta	0xf, s
+	tya
+	sta	0xd, s
+	pei	0xf0
+	lda	0xf, s
+	pha
+	lda	0x13, s
+	tax
+	phx
+	lda	0x21, s
+	pha
+	lda	0x19, s
+	pha
+	lda	0x1d, s
+	pha
+	lda	0x21, s
+	tax
+	lda	0x2b, s
+	jsl	__adddf3
+	sta	0xe0
+	tsc
+	clc
+	adc	#0xc
+	tcs
+	lda	0xe0
+	sta	0x15, s
+	txa
+	sta	0x13, s
+	tya
+	sta	0x11, s
+	lda	0xf0
+	sta	0xf, s
+	lda	0x21, s
+	sta	0xd0
+	tax
+	lda	0x21, s
+	clc
+	adc	#0x1
+	sta	0x21, s
+	txa
+	lda	0xd0
+	sta	0x1f, s
+	lda	0x17, s
+	adc	#0x0
+	sta	0x17, s
+	lda	0x11, s
+	sta	0xc8
+	lda	0xf, s
+	sta	0xca
+	lda	0x15, s
+	sta	0xc4
+	lda	0x13, s
+	sta	0xc6
+	lda	0x1f, s
+	sta	0x35, s
+	lda	0x1d, s
+	tax
+	lda	0x35, s
+	sta	0xe0
+	txa
+	sta	0xe2
+	ldy	#0x0
+	lda	[0xe0 ], y
+	and	#0xff
+	sta	0x1b, s
+	sep	#0x20
+	clc
+	adc	#0xd0
+	rep	#0x20
+	and	#0xff
+	cmp	#0xa
+	lda	0x17, s
+	bcs	.LBB0_3
+; %bb.16:                               ; %while.body
+                                        ;   in Loop: Header=BB0_2 Depth=1
+	brl	.LBB0_2
+.LBB0_3:                                ; %while.cond7.preheader.loopexit
+	lda	0x21, s
+	clc
+	adc	#0xffff
+	sta	0x21, s
+	lda	0x17, s
+	adc	#0xffff
+	sta	0x1d, s
+.LBB0_4:                                ; %while.cond7.preheader
+	lda	0xb, s
+	eor	#0x8000
+	sta	0xb, s
+	lda	0x1b, s
+	brl	.LBB0_5
+.LBB0_11:                               ; %if.then33
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	lda	0xc6
+	sta	0x1b, s
+	lda	0xc4
+	sta	0x15, s
+	lda	0xca
+	sta	0x11, s
+	lda	0xc8
+	sta	0x13, s
+	lda	0x17, s
+	pha
+	lda	0x1f, s
+	pha
+	lda	0x23, s
+	pha
+	lda	0x27, s
+	pha
+	lda	0x19, s
+	pha
+	lda	0x1d, s
+	pha
+	lda	0x27, s
+	tax
+	lda	0x21, s
+	jsl	__muldf3
+.LBB0_12:                               ; %cleanup
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	sta	0xe0
+	tsc
+	clc
+	adc	#0xc
+	tcs
+	lda	0xe0
+	sta	0x21, s
+	txa
+	sta	0x1f, s
+	tya
+	sta	0x1d, s
+	lda	0xf0
+	sta	0x1b, s
+	lda	0x1d, s
+	sta	0xc8
+	lda	0x1b, s
+	sta	0xca
+	lda	0x21, s
+	sta	0xc4
+	lda	0x1f, s
+	sta	0xc6
+.LBB0_13:                               ; %cleanup
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	lda	0x19, s
+	clc
+	adc	#0x2
+	sta	0x1f, s
+	lda	0xc0
+	sta	0x21, s
+	adc	#0x0
+	sta	0x21, s
+	lda	0x1f, s
+	sta	0x25, s
+	lda	0x21, s
+	tax
+	lda	0x25, s
+	sta	0xe0
+	txa
+	sta	0xe2
+	ldy	#0x0
+	lda	[0xe0 ], y
+	sta	0x1d, s
+	lda	0x19, s
+	tax
+	pha
+	lda	0xc0
+	sta	0x25, s
+	pla
+	txa
+	sta	0xe0
+	lda	0x23, s
+	sta	0xe2
+	lda	[0xe0 ], y
+	sta	0x21, s
+	lda	0x1d, s
+	tax
+	lda	0x21, s
+	sta	0xe0
+	txa
+	sta	0xe2
+	lda	[0xe0 ], y
+	and	#0xff
+.LBB0_5:                                ; %while.cond7
+                                        ; =>This Inner Loop Header: Depth=1
+	sta	0x1b, s
+	sep	#0x20
+	clc
+	adc	#0xd6
+	rep	#0x20
+	and	#0xff
+	sta	0x1f, s
+	lda	0x1f, s
+	pha
+	lda	#0x2b
+	jsl	__lshrhi3
+	ply
+	sta	0x17, s
+	lda	0x1f, s
+	cmp	#0x6
+	bcc	.LBB0_6
+; %bb.17:                               ; %while.cond7
+	brl	.LBB0_14
+.LBB0_6:                                ; %while.cond7
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	lda	0x17, s
+	and	#0x1
+	sta	0x17, s
+	lda	#0x0
+	sta	0x33, s
+	lda	0x17, s
+	ora	0x33, s
+	bne	.LBB0_7
+; %bb.18:                               ; %while.cond7
+	brl	.LBB0_14
+.LBB0_7:                                ; %switch.lookup
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	lda	#0x0
+	asl a
+	sta	0x17, s
+	lda	0x1f, s
+	asl a
+	lda	#0x0
+	rol a
+	sta	0x31, s
+	lda	0x17, s
+	ora	0x31, s
+	sta	0x17, s
+	lda	0x1f, s
+	asl a
+	sta	0x1f, s
+	lda	#.Lswitch.table.evalAt
+	sta	0x2f, s
+	lda	0x1f, s
+	clc
+	adc	0x2f, s
+	sta	0x1f, s
+	lda	#0x0
+	sta	0x2d, s
+	lda	0x17, s
+	adc	0x2d, s
+	sta	0x17, s
+	lda	0x1f, s
+	sta	0x2b, s
+	lda	0x17, s
+	tax
+	lda	0x2b, s
+	sta	0xe0
+	txa
+	sta	0xe2
+	ldy	#0x0
+	lda	[0xe0 ], y
+	sta	0x1f, s
+	lda	0x1f, s
+	tax
+	eor	#0x8000
+	sta	0x1f, s
+	txa
+	sta	0x17, s
+	lda	0xb, s
+	cmp	0x1f, s
+	bcc	.LBB0_8
+; %bb.19:                               ; %switch.lookup
+	brl	.LBB0_14
+.LBB0_8:                                ; %if.end25
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	lda	0x21, s
+	inc a
+	sta	0x21, s
+	bne	.Ltmp1
+	lda	0x1d, s
+	inc a
+	sta	0x1d, s
+.Ltmp1:
+	lda	0x19, s
+	tax
+	pha
+	lda	0xc0
+	sta	0x2b, s
+	pla
+	txa
+	sta	0xe0
+	lda	0x29, s
+	sta	0xe2
+	lda	0x21, s
+	ldy	#0x0
+	sta	[0xe0 ], y
+	lda	0x19, s
+	sta	0xd0
+	clc
+	adc	#0x2
+	sta	0x1f, s
+	lda	0xd0
+	sta	0x21, s
+	lda	0xc0
+	adc	#0x0
+	sta	0x15, s
+	lda	0x1f, s
+	sta	0x27, s
+	lda	0x15, s
+	tax
+	lda	0x27, s
+	sta	0xe0
+	txa
+	sta	0xe2
+	lda	0x1d, s
+	sta	[0xe0 ], y
+	lda	0x17, s
+	pha
+	ldx	0xc0
+	lda	0x23, s
+	jsl	evalAt
+	sta	0xe0
+	tsc
+	clc
+	adc	#0x2
+	tcs
+	lda	0xe0
+	sta	0x21, s
+	txa
+	sta	0x1f, s
+	tya
+	sta	0x1d, s
+	lda	0xf0
+	sta	0x17, s
+	lda	0x1b, s
+	and	#0xff
+	cmp	#0x2a
+	bne	.LBB0_9
+; %bb.20:                               ; %if.end25
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	brl	.LBB0_11
+.LBB0_9:                                ; %if.end25
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	cmp	#0x2b
+	beq	.LBB0_10
+; %bb.21:                               ; %if.end25
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	brl	.LBB0_13
+.LBB0_10:                               ; %if.then29
+                                        ;   in Loop: Header=BB0_5 Depth=1
+	lda	0xc6
+	sta	0x1b, s
+	lda	0xc4
+	sta	0x15, s
+	lda	0xca
+	sta	0x11, s
+	lda	0xc8
+	sta	0x13, s
+	lda	0x17, s
+	pha
+	lda	0x1f, s
+	pha
+	lda	0x23, s
+	pha
+	lda	0x27, s
+	pha
+	lda	0x19, s
+	pha
+	lda	0x1d, s
+	pha
+	lda	0x27, s
+	tax
+	lda	0x21, s
+	jsl	__adddf3
+	brl	.LBB0_12
+.LBB0_14:                               ; %cleanup37
+	lda	0xc6
+	sta	0x21, s
+	lda	0xc4
+	sta	0x1f, s
+	lda	0xca
+	sta	0x1b, s
+	lda	0xc8
+	sta	0x1d, s
+	lda	0x1b, s
+	sta	0xf0
+	lda	0x1d, s
+	tay
+	lda	0x21, s
+	tax
+	lda	0x1f, s
+	pha
+	lda	0x3, s
+	sta	0xca
+	lda	0x5, s
+	sta	0xc8
+	lda	0x7, s
+	sta	0xc6
+	lda	0x9, s
+	sta	0xc4
+	lda	0xb, s
+	sta	0xc0
+	pla
+	sta	0xe0
+	tsc
+	clc
+	adc	#0x46
+	tcs
+	lda	0xe0
+	rtl
+.Lfunc_end0:
+	.size	evalAt, .Lfunc_end0-evalAt
+                                        ; -- End function
+	.type	.Lswitch.table.evalAt,@object   ; @switch.table.evalAt
+	.section	.rodata,"a",@progbits
+	.p2align	1, 0x0
+.Lswitch.table.evalAt:
+	.short	4                               ; 0x4
+	.short	3                               ; 0x3
+	.zero	2
+	.short	3                               ; 0x3
+	.zero	2
+	.short	4                               ; 0x4
+	.size	.Lswitch.table.evalAt, 12
+
+	.ident	"clang version 23.0.0git (https://github.com/llvm-mos/llvm-mos.git c798c31416f72b395c658b5502d281a162387ab1)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
diff --git a/compare/mul16to32.c b/compare/mul16to32.c
new file mode 100644
index 0000000..7988ac6
--- /dev/null
+++ b/compare/mul16to32.c
@@ -0,0 +1,4 @@
+// Explicit zext pattern - should trigger the combine.
+unsigned long mul16to32(unsigned short a, unsigned short b) {
+    return (unsigned long)a * (unsigned long)b;
+}
diff --git a/compare/mul16to32.calypsi.lst b/compare/mul16to32.calypsi.lst
new file mode 100644
index 0000000..8df288a
--- /dev/null
+++ b/compare/mul16to32.calypsi.lst
@@ -0,0 +1,37 @@
+###############################################################################
+#                                                                             #
+# Calypsi ISO C compiler for 65816                               version 5.16 #
+#                                                       13/May/2026  15:46:15 #
+# Command line: --speed -O 2 --64bit-doubles mul16to32.c -o                   #
+#               /tmp/mul16to32.calypsi.elf --list-file                        #
+#               mul16to32.calypsi.lst                                         #
+#                                                                             #
+###############################################################################
+
+    \ 000000                      .rtmodel version,"1"
+    \ 000000                      .rtmodel codeModel,"large"
+    \ 000000                      .rtmodel dataModel,"small"
+    \ 000000                      .rtmodel core,"65816"
+    \ 000000                      .rtmodel huge,"0"
+    \ 000000                      .rtmodel target,"none-specified"
+    \ 000000                      .extern _Dp
+    \ 000000                      .extern _Mul16
+    \ 000000                      .extern _Vfp
+0001                  // Explicit zext pattern - should trigger the combine.
+0002                  unsigned long mul16to32(unsigned short a, unsigned short b) {
+    \ 000000                      .section farcode,text
+    \ 000000                      .public mul16to32
+    \ 000000 aa       mul16to32:  tax
+0003                      return (unsigned long)a * (unsigned long)b;
+    \ 000001 a5..                 lda     dp:.tiny _Dp
+    \ 000003 22......             jsl     long:_Mul16
+0004                  }
+    \ 000007 6b                   rtl
+
+##########################
+#                        #
+# Memory sizes (decimal) #
+#                        #
+##########################
+
+Executable  (Text): 8 bytes
diff --git a/compare/mul16to32.ours.s b/compare/mul16to32.ours.s
new file mode 100644
index 0000000..0e39aa6
--- /dev/null
+++ b/compare/mul16to32.ours.s
@@ -0,0 +1,23 @@
+	.file	"mul16to32.c"
+	.text
+	.globl	mul16to32                       ; -- Begin function mul16to32
+	.type	mul16to32,@function
+mul16to32:                              ; @mul16to32
+; %bb.0:                                ; %entry
+	rep	#0x30
+	pha
+	pha
+	lda	0x8, s
+	jsl	__umulhisi3
+	ply
+	sta	0x1, s
+	lda	0x1, s
+	ply
+	rtl
+.Lfunc_end0:
+	.size	mul16to32, .Lfunc_end0-mul16to32
+                                        ; -- End function
+	.ident	"clang version 23.0.0git (https://github.com/llvm-mos/llvm-mos.git c798c31416f72b395c658b5502d281a162387ab1)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.addrsig_sym __umulhisi3
diff --git a/compare/regen.sh b/compare/regen.sh
new file mode 100755
index 0000000..0b70159
--- /dev/null
+++ b/compare/regen.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Regenerate compare/ artifacts: for each *.c, produce both
+# <name>.ours.s (our backend) and <name>.calypsi.lst (Calypsi listing).
+# Run from the project root or anywhere; uses absolute paths.
+
+set -eu
+
+PROJECT_ROOT="/home/scott/claude/llvm816"
+COMPARE_DIR="$PROJECT_ROOT/compare"
+OUR_CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
+OUR_SYSROOT="$PROJECT_ROOT/runtime"
+CALYPSI_CC="$PROJECT_ROOT/tools/calypsi/usr/local/lib/calypsi-65816-5.16/bin/cc65816"
+
+OURS_FLAGS=(--target=w65816 --sysroot="$OUR_SYSROOT" -O2 -S)
+# --64bit-doubles for fair FP comparison (Calypsi default is 32-bit doubles).
+CALYPSI_FLAGS=(--speed -O 2 --64bit-doubles)
+
+cd "$COMPARE_DIR"
+
+for c in *.c; do
+    base="${c%.c}"
+    echo "build: $base"
+    "$OUR_CLANG"   "${OURS_FLAGS[@]}"   "$c" -o "$base.ours.s"
+    "$CALYPSI_CC" "${CALYPSI_FLAGS[@]}" "$c" -o "/tmp/$base.calypsi.elf" \
+        --list-file "$base.calypsi.lst"
+    rm -f "/tmp/$base.calypsi.elf"
+done
+
+# Per-file instruction-count summary.
+printf '\n%-25s %8s %8s %8s\n' "test" "ours" "calypsi" "ratio"
+printf '%-25s %8s %8s %8s\n' "----" "----" "-------" "-----"
+for c in *.c; do
+    base="${c%.c}"
+    ours_n=$(grep -cE \
+        '^\s+(lda|sta|jsl|jsr|adc|sbc|cmp|sec|clc|sep|rep|inc|dec|bra|brl|bcs|bcc|beq|bne|bmi|bpl|asl|lsr|rol|ror|stz|stx|sty|ldx|ldy|tax|txa|tay|tya|tsc|tcs|tdc|tcd|pha|pla|phx|plx|phy|ply|php|plp|pea|pei|rtl|rts|xba|xce|tsb|trb|bit|and|ora|eor|cop|brk|wai|stp|nop)\b' \
+        "$base.ours.s" || true)
+    cal_n=$(grep -cE '^\s+\\ [0-9a-f]+ [0-9a-f][0-9a-f]' "$base.calypsi.lst" || true)
+    if [ "$cal_n" -gt 0 ]; then
+        ratio=$(awk -v a="$ours_n" -v b="$cal_n" 'BEGIN{printf "%.2fx", a/b}')
+    else
+        ratio="n/a"
+    fi
+    printf '%-25s %8s %8s %8s\n' "$base" "$ours_n" "$cal_n" "$ratio"
+done
diff --git a/compare/sumSquares.c b/compare/sumSquares.c
new file mode 100644
index 0000000..df68fe5
--- /dev/null
+++ b/compare/sumSquares.c
@@ -0,0 +1,8 @@
+// Simple function for compiler-quality comparison.
+unsigned long sumSquares(unsigned short n) {
+    unsigned long total = 0;
+    for (unsigned short i = 1; i <= n; i++) {
+        total += (unsigned long)i * i;
+    }
+    return total;
+}
diff --git a/compare/sumSquares.calypsi.lst b/compare/sumSquares.calypsi.lst
new file mode 100644
index 0000000..09e4d2b
--- /dev/null
+++ b/compare/sumSquares.calypsi.lst
@@ -0,0 +1,68 @@
+###############################################################################
+#                                                                             #
+# Calypsi ISO C compiler for 65816                               version 5.16 #
+#                                                       13/May/2026  15:46:15 #
+# Command line: --speed -O 2 --64bit-doubles sumSquares.c -o                  #
+#               /tmp/sumSquares.calypsi.elf --list-file                       #
+#               sumSquares.calypsi.lst                                        #
+#                                                                             #
+###############################################################################
+
+    \ 000000                      .rtmodel version,"1"
+    \ 000000                      .rtmodel codeModel,"large"
+    \ 000000                      .rtmodel dataModel,"small"
+    \ 000000                      .rtmodel core,"65816"
+    \ 000000                      .rtmodel huge,"0"
+    \ 000000                      .rtmodel target,"none-specified"
+    \ 000000                      .extern _Dp
+    \ 000000                      .extern _Mul16
+    \ 000000                      .extern _Vfp
+0001                  // Simple function for compiler-quality comparison.
+0002                  unsigned long sumSquares(unsigned short n) {
+    \ 000000                      .section farcode,text
+    \ 000000                      .public sumSquares
+    \ 000000 5a       sumSquares: phy
+    \ 000001 5a                   phy
+    \ 000002 8301                 sta     1,s
+0003                      unsigned long total = 0;
+    \ 000004 64..                 stz     dp:.tiny _Dp
+    \ 000006 64..                 stz     dp:.tiny (_Dp+2)
+0004                      for (unsigned short i = 1; i <= n; i++) {
+    \ 000008 a90100               lda     ##1
+    \ 00000b 8303                 sta     3,s
+    \ 00000d a301     `?L5`:      lda     1,s
+    \ 00000f c303                 cmp     3,s
+    \ 000011 b007                 bcs     `?L4`
+0005                          total += (unsigned long)i * i;
+0006                      }
+0007                      return total;
+    \ 000013 a6..                 ldx     dp:.tiny (_Dp+2)
+    \ 000015 a5..                 lda     dp:.tiny _Dp
+0008                  }
+    \ 000017 7a                   ply
+    \ 000018 7a                   ply
+    \ 000019 6b                   rtl
+    \ 00001a a303     `?L4`:      lda     3,s
+    \ 00001c aa                   tax
+    \ 00001d 22......             jsl     long:_Mul16
+    \ 000021 18                   clc
+    \ 000022 65..                 adc     dp:.tiny _Dp
+    \ 000024 48                   pha
+    \ 000025 8a                   txa
+    \ 000026 65..                 adc     dp:.tiny (_Dp+2)
+    \ 000028 aa                   tax
+    \ 000029 68                   pla
+    \ 00002a 86..                 stx     dp:.tiny (_Dp+2)
+    \ 00002c 85..                 sta     dp:.tiny _Dp
+    \ 00002e a303                 lda     3,s
+    \ 000030 1a                   inc     a
+    \ 000031 8303                 sta     3,s
+    \ 000033 80d8                 bra     `?L5`
+
+##########################
+#                        #
+# Memory sizes (decimal) #
+#                        #
+##########################
+
+Executable  (Text): 53 bytes
diff --git a/compare/sumSquares.ours.s b/compare/sumSquares.ours.s
new file mode 100644
index 0000000..bb9efad
--- /dev/null
+++ b/compare/sumSquares.ours.s
@@ -0,0 +1,93 @@
+	.file	"sumSquares.c"
+	.text
+	.globl	sumSquares                      ; -- Begin function sumSquares
+	.type	sumSquares,@function
+sumSquares:                             ; @sumSquares
+; %bb.0:                                ; %entry
+	rep	#0x30
+	tay
+	tsc
+	sec
+	sbc	#0xe
+	tcs
+	tya
+	sta	0x7, s
+	lda	#0x0
+	sta	0xb, s
+	lda	0x7, s
+	cmp	#0x0
+	php
+	lda	#0x0
+	plp
+	sta	0x9, s
+	bne	.LBB0_1
+; %bb.6:                                ; %entry
+	brl	.LBB0_5
+.LBB0_1:                                ; %for.body.preheader
+	lda	0x7, s
+	inc a
+	sta	0x7, s
+	cmp	#0x3
+	bcs	.LBB0_3
+; %bb.2:                                ; %for.body.preheader
+	lda	#0x2
+	sta	0x7, s
+.LBB0_3:                                ; %for.body.preheader
+	lda	#0x0
+	sta	0x3, s
+	lda	#0x1
+	sta	0xd, s
+	lda	0x7, s
+	dec a
+	sta	0x7, s
+	lda	#0x0
+	sta	0x5, s
+	sta	0x1, s
+.LBB0_4:                                ; %for.body
+                                        ; =>This Inner Loop Header: Depth=1
+	lda	0xd, s
+	pha
+	jsl	__umulhisi3
+	ply
+	clc
+	adc	0x3, s
+	sta	0xb, s
+	txa
+	adc	0x1, s
+	sta	0x9, s
+	lda	0xd, s
+	inc a
+	sta	0xd, s
+	bne	.Ltmp0
+	lda	0x5, s
+	inc a
+	sta	0x5, s
+.Ltmp0:
+	lda	0xb, s
+	sta	0x3, s
+	lda	0x9, s
+	sta	0x1, s
+	lda	0x7, s
+	dec a
+	sta	0x7, s
+	cmp	#0x0
+	beq	.LBB0_5
+	bra	.LBB0_4
+.LBB0_5:                                ; %for.cond.cleanup
+	lda	0x9, s
+	tax
+	lda	0xb, s
+	tay
+	tsc
+	clc
+	adc	#0xe
+	tcs
+	tya
+	rtl
+.Lfunc_end0:
+	.size	sumSquares, .Lfunc_end0-sumSquares
+                                        ; -- End function
+	.ident	"clang version 23.0.0git (https://github.com/llvm-mos/llvm-mos.git c798c31416f72b395c658b5502d281a162387ab1)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.addrsig_sym __umulhisi3
diff --git a/mame.ini b/mame.ini
new file mode 100644
index 0000000..da78978
--- /dev/null
+++ b/mame.ini
@@ -0,0 +1,416 @@
+﻿#
+# CORE CONFIGURATION OPTIONS
+#
+readconfig                1
+writeconfig               0
+
+#
+# CORE SEARCH PATH OPTIONS
+#
+homepath                  .
+rompath                   $HOME/mame/roms;/usr/local/share/games/mame/roms;/usr/share/games/mame/roms
+hashpath                  /usr/share/games/mame/hash
+samplepath                $HOME/mame/samples;/usr/local/share/games/mame/samples;/usr/share/games/mame/samples
+artpath                   $HOME/mame/artwork;/usr/local/share/games/mame/artwork;/usr/share/games/mame/artwork
+ctrlrpath                 /usr/share/games/mame/ctrlr
+inipath                   $HOME/.mame;/etc/mame
+fontpath                  /usr/share/games/mame/fonts
+cheatpath                 $HOME/mame/cheat;/usr/local/share/games/mame/cheat;/usr/share/games/mame/cheat
+crosshairpath             $HOME/mame/crosshair;/usr/local/share/games/mame/crosshair;/usr/share/games/mame/crosshair
+pluginspath               /usr/share/games/mame/plugins
+languagepath              /usr/share/games/mame/language
+swpath                    software
+
+#
+# CORE OUTPUT DIRECTORY OPTIONS
+#
+cfg_directory             $HOME/.mame/cfg
+nvram_directory           $HOME/.mame/nvram
+input_directory           $HOME/.mame/inp
+state_directory           $HOME/.mame/sta
+snapshot_directory        $HOME/.mame/snap
+diff_directory            $HOME/.mame/diff
+comment_directory         $HOME/.mame/comments
+share_directory           share
+
+#
+# CORE STATE/PLAYBACK OPTIONS
+#
+state                     
+autosave                  0
+rewind                    0
+rewind_capacity           100
+playback                  
+record                    
+exit_after_playback       0
+mngwrite                  
+aviwrite                  
+wavwrite                  
+snapname                  %g/%i
+snapsize                  auto
+snapview                  auto
+snapbilinear              1
+statename                 %g
+burnin                    0
+
+#
+# CORE PERFORMANCE OPTIONS
+#
+autoframeskip             0
+frameskip                 0
+seconds_to_run            0
+throttle                  1
+sleep                     1
+speed                     1.0
+refreshspeed              0
+lowlatency                0
+
+#
+# CORE RENDER OPTIONS
+#
+keepaspect                1
+unevenstretch             1
+unevenstretchx            0
+unevenstretchy            0
+autostretchxy             0
+intoverscan               0
+intscalex                 0
+intscaley                 0
+
+#
+# CORE ROTATION OPTIONS
+#
+rotate                    1
+ror                       0
+rol                       0
+autoror                   0
+autorol                   0
+flipx                     0
+flipy                     0
+
+#
+# CORE ARTWORK OPTIONS
+#
+artwork_crop              0
+fallback_artwork          
+override_artwork          
+
+#
+# CORE SCREEN OPTIONS
+#
+brightness                1.0
+contrast                  1.0
+gamma                     1.0
+pause_brightness          0.65
+effect                    none
+
+#
+# CORE VECTOR OPTIONS
+#
+beam_width_min            1.0
+beam_width_max            1.0
+beam_dot_size             1.0
+beam_intensity_weight     0
+flicker                   0
+
+#
+# CORE SOUND OPTIONS
+#
+samplerate                48000
+samples                   1
+volume                    0
+compressor                1
+speaker_report            0
+
+#
+# CORE INPUT OPTIONS
+#
+coin_lockout              1
+ctrlr                     
+mouse                     1
+joystick                  1
+lightgun                  0
+multikeyboard             0
+multimouse                0
+steadykey                 0
+ui_active                 0
+offscreen_reload          0
+joystick_map              auto
+joystick_deadzone         0.15
+joystick_saturation       0.85
+joystick_threshold        0.3
+natural                   0
+joystick_contradictory    0
+coin_impulse              0
+
+#
+# CORE INPUT AUTOMATIC ENABLE OPTIONS
+#
+paddle_device             keyboard
+adstick_device            keyboard
+pedal_device              keyboard
+dial_device               keyboard
+trackball_device          keyboard
+lightgun_device           keyboard
+positional_device         keyboard
+mouse_device              mouse
+
+#
+# CORE DEBUGGING OPTIONS
+#
+verbose                   0
+log                       0
+oslog                     0
+debug                     0
+update_in_pause           0
+debugscript               
+debuglog                  0
+
+#
+# CORE COMM OPTIONS
+#
+comm_localhost            0.0.0.0
+comm_localport            15112
+comm_remotehost           127.0.0.1
+comm_remoteport           15112
+comm_framesync            0
+
+#
+# CORE MISC OPTIONS
+#
+drc                       1
+drc_use_c                 0
+drc_log_uml               0
+drc_log_native            0
+bios                      
+cheat                     0
+skip_gameinfo             0
+uifont                    default
+ui                        cabinet
+ramsize                   
+confirm_quit              0
+ui_mouse                  1
+language                  
+nvram_save                1
+
+#
+# SCRIPTING OPTIONS
+#
+autoboot_command          
+autoboot_delay            0
+autoboot_script           
+console                   0
+plugins                   1
+plugin                    
+noplugin                  
+
+#
+# HTTP SERVER OPTIONS
+#
+http                      0
+http_port                 8080
+http_root                 web
+
+#
+# OSD INPUT MAPPING OPTIONS
+#
+uimodekey                 INSERT
+controller_map            none
+background_input          0
+
+#
+# OSD FONT OPTIONS
+#
+uifontprovider            auto
+
+#
+# OSD OUTPUT OPTIONS
+#
+output                    auto
+
+#
+# OSD INPUT OPTIONS
+#
+keyboardprovider          auto
+mouseprovider             auto
+lightgunprovider          auto
+joystickprovider          auto
+
+#
+# OSD DEBUGGING OPTIONS
+#
+debugger                  auto
+debugger_port             23946
+debugger_font             auto
+debugger_font_size        0
+watchdog                  0
+
+#
+# OSD PERFORMANCE OPTIONS
+#
+numprocessors             auto
+bench                     0
+
+#
+# OSD VIDEO OPTIONS
+#
+video                     opengl
+numscreens                1
+window                    0
+maximize                  1
+waitvsync                 0
+syncrefresh               0
+monitorprovider           auto
+
+#
+# OSD PER-WINDOW VIDEO OPTIONS
+#
+screen                    auto
+aspect                    auto
+resolution                auto
+view                      auto
+screen0                   auto
+aspect0                   auto
+resolution0               auto
+view0                     auto
+screen1                   auto
+aspect1                   auto
+resolution1               auto
+view1                     auto
+screen2                   auto
+aspect2                   auto
+resolution2               auto
+view2                     auto
+screen3                   auto
+aspect3                   auto
+resolution3               auto
+view3                     auto
+
+#
+# OSD FULL SCREEN OPTIONS
+#
+switchres                 0
+
+#
+# OSD ACCELERATED VIDEO OPTIONS
+#
+filter                    1
+prescale                  1
+
+#
+# OpenGL-SPECIFIC OPTIONS
+#
+gl_forcepow2texture       0
+gl_notexturerect          0
+gl_vbo                    1
+gl_pbo                    1
+gl_glsl                   0
+gl_glsl_filter            1
+glsl_shader_mame0         none
+glsl_shader_mame1         none
+glsl_shader_mame2         none
+glsl_shader_mame3         none
+glsl_shader_mame4         none
+glsl_shader_mame5         none
+glsl_shader_mame6         none
+glsl_shader_mame7         none
+glsl_shader_mame8         none
+glsl_shader_mame9         none
+glsl_shader_screen0       none
+glsl_shader_screen1       none
+glsl_shader_screen2       none
+glsl_shader_screen3       none
+glsl_shader_screen4       none
+glsl_shader_screen5       none
+glsl_shader_screen6       none
+glsl_shader_screen7       none
+glsl_shader_screen8       none
+glsl_shader_screen9       none
+
+#
+# OSD SOUND OPTIONS
+#
+sound                     auto
+audio_latency             2
+
+#
+# PORTAUDIO OPTIONS
+#
+pa_api                    none
+pa_device                 none
+pa_latency                0
+
+#
+# OSD MIDI OPTIONS
+#
+midiprovider              auto
+
+#
+# OSD EMULATED NETWORKING OPTIONS
+#
+networkprovider           auto
+
+#
+# BGFX POST-PROCESSING OPTIONS
+#
+bgfx_path                 /usr/share/games/mame/bgfx
+bgfx_backend              auto
+bgfx_debug                0
+bgfx_screen_chains        
+bgfx_shadow_mask          slot-mask.png
+bgfx_lut                  lut-default.png
+bgfx_avi_name             auto
+
+#
+# SDL PERFORMANCE OPTIONS
+#
+sdlvideofps               0
+
+#
+# SDL VIDEO OPTIONS
+#
+centerh                   1
+centerv                   1
+scalemode                 none
+
+#
+# SDL FULL SCREEN OPTIONS
+#
+useallheads               0
+attach_window             
+
+#
+# SDL KEYBOARD MAPPING
+#
+keymap                    0
+keymap_file               keymap.dat
+
+#
+# SDL JOYSTICK MAPPING
+#
+sixaxis                   0
+
+#
+# SDL LIGHTGUN MAPPING
+#
+lightgun_index1           auto
+lightgun_index2           auto
+lightgun_index3           auto
+lightgun_index4           auto
+lightgun_index5           auto
+lightgun_index6           auto
+lightgun_index7           auto
+lightgun_index8           auto
+
+#
+# SDL LOW-LEVEL DRIVER OPTIONS
+#
+videodriver               auto
+renderdriver              auto
+audiodriver               auto
+gl_lib                    auto
+
+#
+# FRONTEND COMMAND OPTIONS
+#
+dtd                       1
diff --git a/patches/0005-target-data-layout-w65816.patch b/patches/0005-target-data-layout-w65816.patch
index 99a070b..d6f76cb 100644
--- a/patches/0005-target-data-layout-w65816.patch
+++ b/patches/0005-target-data-layout-w65816.patch
@@ -7,7 +7,7 @@ index 8837d2f91..920b8ac8e 100644
    case Triple::msp430:
      return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16";
 +  case Triple::w65816:
-+    return "e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16";
++    return "e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S8";
    case Triple::ppc:
    case Triple::ppcle:
    case Triple::ppc64:
diff --git a/plugin.ini b/plugin.ini
new file mode 100644
index 0000000..1d16567
--- /dev/null
+++ b/plugin.ini
@@ -0,0 +1,17 @@
+﻿#
+# PLUGINS OPTIONS
+#
+gdbstub                   0
+cheatfind                 0
+discord                   0
+autofire                  0
+hiscore                   0
+dummy                     0
+timer                     0
+layout                    1
+timecode                  0
+portname                  0
+console                   0
+inputmacro                0
+cheat                     0
+data                      1
diff --git a/runtime/build.sh b/runtime/build.sh
index 6a5aa8c..6b8c870 100755
--- a/runtime/build.sh
+++ b/runtime/build.sh
@@ -53,12 +53,11 @@ cc  "$SRC/softFloat.c"
 cc  "$SRC/libcxxabi.c"
 cc  "$SRC/libcxxabiSjlj.c"
 asm "$SRC/iigsGsos.s"
-# softDouble.c builds at -O1: __muldf3's u64 live-range pressure
-# overflows the greedy allocator at -O2.  dpack is already noinline
-# to reduce pressure, but dclass MUST stay inline (its pointer-arg
-# writes from a noinline boundary would lower to `sta (d,s),y` which
-# uses DBR for the bank — silently corrupted under DBR != 0, caught
-# by the dmul-after-bank-switch test).  -O1 sidesteps this.
-cc  "$SRC/softDouble.c" -O1
+# softDouble.c builds at -O2.  dpack stays noinline (basic regalloc
+# overflows when dpack inlines into __adddf3/__muldf3).  dclass MUST
+# stay inline (its pointer-arg writes from a noinline boundary would
+# lower to `sta (d,s),y` which uses DBR — silently corrupted under
+# DBR != 0, caught by the dmul-after-bank-switch test).
+cc  "$SRC/softDouble.c"
 
 echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects"
diff --git a/runtime/include/assert.h b/runtime/include/assert.h
index c3f2223..64c1264 100644
--- a/runtime/include/assert.h
+++ b/runtime/include/assert.h
@@ -11,4 +11,10 @@ void __assert_fail(const char *expr, const char *file, unsigned int line,
     __assert_fail(#x, __FILE__, __LINE__, __func__))
 #endif
 
+// C11 static_assert — clang implements `_Static_assert` as a keyword.
+// The macro spelling allows portable code that uses `static_assert(...)`.
+#ifndef __cplusplus
+# define static_assert _Static_assert
+#endif
+
 #endif
diff --git a/runtime/include/complex.h b/runtime/include/complex.h
new file mode 100644
index 0000000..47f59dd
--- /dev/null
+++ b/runtime/include/complex.h
@@ -0,0 +1,100 @@
+// C99 / C11 complex.h — complex-number types and core helpers.
+//
+// clang implements `_Complex` as a built-in type that lowers to a
+// struct-of-two-reals on the W65816 (`_Complex double` = 16 bytes,
+// `_Complex float` = 8 bytes).  Plain arithmetic (`a + b`, `a * b`,
+// etc.) is handled by the compiler via softFloat / softDouble.
+//
+// **Supported surface:** the core component / conjugate / magnitude /
+// argument helpers — `creal`, `cimag`, `conj`, `cabs`, `carg`,
+// `cproj` — plus the `CMPLX` constructor macros.
+//
+// **NOT supported:** the transcendental complex routines (`csin`,
+// `ccos`, `cexp`, `clog`, `cpow`, `csqrt`, etc.) — they would each
+// require a real polynomial-expansion implementation; not worth the
+// runtime cost for our IIgs target.  Code that references those
+// symbols will link-fail; if you need them, implement them in your
+// project and link them in.
+
+#ifndef _COMPLEX_H
+#define _COMPLEX_H
+
+#include <math.h>
+
+// Per C11: `complex` and `_Complex_I` are macros provided by
+// <complex.h>.  Real-world code mostly uses `complex` rather than
+// the underscore form.
+#define complex          _Complex
+#define _Complex_I       ((float _Complex){0.0f, 1.0f})
+#define I                _Complex_I
+
+// CMPLX(real, imag) — C11 constructor.  Avoid the type-pun trick;
+// clang implements this as a compound literal.
+#define CMPLX(r, i)      ((double _Complex){ (r), (i) })
+#define CMPLXF(r, i)     ((float  _Complex){ (r), (i) })
+#define CMPLXL(r, i)     ((double _Complex){ (r), (i) })   // long double = double here
+
+// ---- Component access -----------------------------------------------
+// clang provides `__real__` and `__imag__` lvalue extensions that map
+// directly to the underlying real / imag slot of the complex struct.
+// Wrapping them as inline functions avoids leaking the gcc-extension
+// keyword into user code.
+
+static inline double creal (double  _Complex z) { return __real__ z; }
+static inline double cimag (double  _Complex z) { return __imag__ z; }
+static inline float  crealf(float   _Complex z) { return __real__ z; }
+static inline float  cimagf(float   _Complex z) { return __imag__ z; }
+static inline double creall(double  _Complex z) { return __real__ z; }
+static inline double cimagl(double  _Complex z) { return __imag__ z; }
+
+// ---- Conjugate -------------------------------------------------------
+// conj(a + b*I) = a - b*I.  Implemented via CMPLX so the compiler can
+// optimise away the temporary.
+
+static inline double _Complex conj (double _Complex z) {
+    return CMPLX(__real__ z, -__imag__ z);
+}
+static inline float  _Complex conjf(float  _Complex z) {
+    return CMPLXF(__real__ z, -__imag__ z);
+}
+static inline double _Complex conjl(double _Complex z) {
+    return CMPLX(__real__ z, -__imag__ z);
+}
+
+// ---- Magnitude / argument / projection ------------------------------
+// cabs uses hypot to avoid intermediate over/underflow.  carg uses
+// atan2.  cproj returns z unchanged unless either part is infinite,
+// in which case it returns (INFINITY, +-0).
+
+static inline double cabs (double _Complex z) {
+    return hypot(__real__ z, __imag__ z);
+}
+static inline float  cabsf(float  _Complex z) {
+    return hypotf(__real__ z, __imag__ z);
+}
+static inline double cabsl(double _Complex z) {
+    return hypot(__real__ z, __imag__ z);
+}
+
+static inline double carg (double _Complex z) {
+    return atan2(__imag__ z, __real__ z);
+}
+static inline float  cargf(float  _Complex z) {
+    return atan2f(__imag__ z, __real__ z);
+}
+static inline double cargl(double _Complex z) {
+    return atan2(__imag__ z, __real__ z);
+}
+
+static inline double _Complex cproj(double _Complex z) {
+    if (__isinf_d(__real__ z) || __isinf_d(__imag__ z)) {
+        return CMPLX(HUGE_VAL, __imag__ z < 0.0 ? -0.0 : 0.0);
+    }
+    return z;
+}
+static inline float _Complex cprojf(float _Complex z) {
+    return (float _Complex)cproj((double _Complex)z);
+}
+static inline double _Complex cprojl(double _Complex z) { return cproj(z); }
+
+#endif
diff --git a/runtime/include/errno.h b/runtime/include/errno.h
index 141a048..025880d 100644
--- a/runtime/include/errno.h
+++ b/runtime/include/errno.h
@@ -4,14 +4,46 @@
 extern int errno;
 int *__errno_location(void);
 
-// Standard error codes (subset; matches glibc numbering).
-#define EPERM   1
-#define ENOENT  2
-#define EIO     5
-#define EBADF   9
-#define ENOMEM  12
-#define EACCES  13
-#define EINVAL  22
-#define ENOSPC  28
+// Error codes (glibc numbering for portability).  C standard requires
+// EDOM, ERANGE, EILSEQ; the rest are common POSIX-style codes that
+// real-world code expects to find even on a minimal runtime.
+#define EPERM   1     // Operation not permitted
+#define ENOENT  2     // No such file or directory
+#define ESRCH   3     // No such process
+#define EINTR   4     // Interrupted system call
+#define EIO     5     // I/O error
+#define ENXIO   6     // No such device or address
+#define E2BIG   7     // Argument list too long
+#define ENOEXEC 8     // Exec format error
+#define EBADF   9     // Bad file descriptor
+#define ECHILD  10    // No child processes
+#define EAGAIN  11    // Resource temporarily unavailable
+#define ENOMEM  12    // Out of memory
+#define EACCES  13    // Permission denied
+#define EFAULT  14    // Bad address
+#define EBUSY   16    // Device or resource busy
+#define EEXIST  17    // File exists
+#define EXDEV   18    // Cross-device link
+#define ENODEV  19    // No such device
+#define ENOTDIR 20    // Not a directory
+#define EISDIR  21    // Is a directory
+#define EINVAL  22    // Invalid argument
+#define ENFILE  23    // Too many open files in system
+#define EMFILE  24    // Too many open files
+#define ENOTTY  25    // Not a typewriter
+#define ETXTBSY 26    // Text file busy
+#define EFBIG   27    // File too large
+#define ENOSPC  28    // No space left on device
+#define ESPIPE  29    // Illegal seek
+#define EROFS   30    // Read-only file system
+#define EMLINK  31    // Too many links
+#define EPIPE   32    // Broken pipe
+#define EDOM    33    // Math argument out of domain (C standard)
+#define ERANGE  34    // Math result out of range (C standard)
+#define ENAMETOOLONG 36   // Filename too long
+#define ENOSYS  38    // Function not implemented
+#define ENOTEMPTY 39  // Directory not empty
+#define ELOOP   40    // Too many symbolic links
+#define EILSEQ  84    // Illegal byte sequence (C standard)
 
 #endif
diff --git a/runtime/include/fenv.h b/runtime/include/fenv.h
new file mode 100644
index 0000000..1fe1a7e
--- /dev/null
+++ b/runtime/include/fenv.h
@@ -0,0 +1,51 @@
+// fenv.h — floating-point environment.
+//
+// The W65816 softFloat / softDouble runtime is fixed at round-to-
+// nearest-even (FE_TONEAREST).  Other rounding modes can be set/queried
+// but they have no effect on softFloat output — softDouble always uses
+// RNE.  Exception flags are tracked as a static word but never raised
+// by the soft-float libraries (they don't model overflow/underflow/
+// inexact at the IEEE level; overflow → infinity, underflow → zero,
+// inexact silently rounded).
+//
+// All functions return 0 on success (per C99 7.6.3.1).
+//
+// This header exists so portable code that includes <fenv.h> and calls
+// fegetround() / fesetround() compiles cleanly — it just won't observe
+// non-default rounding.
+
+#ifndef _FENV_H
+#define _FENV_H
+
+typedef unsigned short fenv_t;
+typedef unsigned short fexcept_t;
+
+// Rounding modes.  Only FE_TONEAREST has effect on this target.
+#define FE_TONEAREST    0
+#define FE_DOWNWARD     1
+#define FE_UPWARD       2
+#define FE_TOWARDZERO   3
+
+// Exception flags.  Never raised by softFloat/softDouble.
+#define FE_INVALID      0x01
+#define FE_DIVBYZERO    0x02
+#define FE_OVERFLOW     0x04
+#define FE_UNDERFLOW    0x08
+#define FE_INEXACT      0x10
+#define FE_ALL_EXCEPT   (FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT)
+
+#define FE_DFL_ENV ((const fenv_t *)0)
+
+int feclearexcept(int excepts);
+int fegetexceptflag(fexcept_t *flagp, int excepts);
+int feraiseexcept(int excepts);
+int fesetexceptflag(const fexcept_t *flagp, int excepts);
+int fetestexcept(int excepts);
+int fegetround(void);
+int fesetround(int round);
+int fegetenv(fenv_t *envp);
+int feholdexcept(fenv_t *envp);
+int fesetenv(const fenv_t *envp);
+int feupdateenv(const fenv_t *envp);
+
+#endif
diff --git a/runtime/include/inttypes.h b/runtime/include/inttypes.h
index d47f348..c84f296 100644
--- a/runtime/include/inttypes.h
+++ b/runtime/include/inttypes.h
@@ -8,9 +8,26 @@
 
 #include <stdint.h>
 
-// (strtoimax / strtoumax not implemented — runtime has strtol /
-// strtoul for the 32-bit forms which cover the common needs.)
-//
+// strtoimax / strtoumax — `intmax_t` is 64-bit on this target.  The
+// runtime's strtoll / strtoull cover the 64-bit forms; these wrappers
+// just route through.  imaxabs / imaxdiv handle |x| and quot+rem for
+// the same width.
+extern long long          strtoll (const char *nptr, char **endptr, int base);
+extern unsigned long long strtoull(const char *nptr, char **endptr, int base);
+static inline intmax_t  strtoimax(const char *n, char **e, int b) { return strtoll (n, e, b); }
+static inline uintmax_t strtoumax(const char *n, char **e, int b) { return strtoull(n, e, b); }
+
+extern long long llabs(long long n);
+static inline intmax_t imaxabs(intmax_t n) { return llabs(n); }
+
+typedef struct { intmax_t quot; intmax_t rem; } imaxdiv_t;
+static inline imaxdiv_t imaxdiv(intmax_t n, intmax_t d) {
+    imaxdiv_t r;
+    r.quot = n / d;
+    r.rem  = n - r.quot * d;
+    return r;
+}
+
 // **WARNING — limited printf support.**  The runtime's printf /
 // snprintf understand the `l` length modifier (long, 32-bit) but
 // NOT `ll` (long long, 64-bit).  Using PRId64 / PRIu64 / PRIx64
diff --git a/runtime/include/iso646.h b/runtime/include/iso646.h
new file mode 100644
index 0000000..b86c40d
--- /dev/null
+++ b/runtime/include/iso646.h
@@ -0,0 +1,20 @@
+// C95 iso646.h — alternative spellings of the C operators.  Mandated
+// by C11 for portability with sources written under older standards
+// or in code-pages without the punctuation symbols.
+
+#ifndef _ISO646_H
+#define _ISO646_H
+
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+
+#endif
diff --git a/runtime/include/locale.h b/runtime/include/locale.h
index 14c1904..dab6ef3 100644
--- a/runtime/include/locale.h
+++ b/runtime/include/locale.h
@@ -6,6 +6,10 @@
 #ifndef _LOCALE_H
 #define _LOCALE_H
 
+#ifndef NULL
+# define NULL ((void *)0)
+#endif
+
 struct lconv {
     char *decimal_point;
     char *thousands_sep;
diff --git a/runtime/include/math.h b/runtime/include/math.h
index 5642062..56fb2c2 100644
--- a/runtime/include/math.h
+++ b/runtime/include/math.h
@@ -104,6 +104,50 @@ double cosh    (double x);
 float  coshf   (float  x);
 double tanh    (double x);
 float  tanhf   (float  x);
+double asinh   (double x);
+float  asinhf  (float  x);
+double acosh   (double x);
+float  acoshf  (float  x);
+double atanh   (double x);
+float  atanhf  (float  x);
+
+// ---- Fused multiply-add (not actually fused — rounds at each step) -
+double fma  (double x, double y, double z);
+float  fmaf (float  x, float  y, float  z);
+
+// ---- NaN payload helpers (tagp ignored — returns canonical NaN) ----
+double nan (const char *tagp);
+float  nanf(const char *tagp);
+
+// ---- IEEE 754 remainder -------------------------------------------
+double remainder  (double x, double y);
+float  remainderf (float  x, float  y);
+
+// ---- Round to floating-point integer ------------------------------
+double rint        (double x);
+float  rintf       (float  x);
+double nearbyint   (double x);
+float  nearbyintf  (float  x);
+
+// ---- Round to integer ----------------------------------------------
+long lround  (double x);
+long lroundf (float  x);
+long lrint   (double x);
+long lrintf  (float  x);
+
+// ---- Scaling -------------------------------------------------------
+double scalbn  (double x, int  n);
+float  scalbnf (float  x, int  n);
+double scalbln (double x, long n);
+float  scalblnf(float  x, long n);
+
+// ---- Classification ------------------------------------------------
+#define FP_NAN       0
+#define FP_INFINITE  1
+#define FP_NORMAL    2
+#define FP_SUBNORMAL 3
+#define FP_ZERO      4
+int fpclassify(double x);
 
 // ---- Common constants -----------------------------------------------
 // (Not in C99 strict, but defined by glibc/BSD math.h and widely used.)
diff --git a/runtime/include/stdalign.h b/runtime/include/stdalign.h
new file mode 100644
index 0000000..32fdc3d
--- /dev/null
+++ b/runtime/include/stdalign.h
@@ -0,0 +1,13 @@
+// C11 stdalign.h — alias the keyword forms `_Alignas` / `_Alignof` to
+// the more readable lowercase names.
+
+#ifndef _STDALIGN_H
+#define _STDALIGN_H
+
+#define alignas _Alignas
+#define alignof _Alignof
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif
diff --git a/runtime/include/stdatomic.h b/runtime/include/stdatomic.h
new file mode 100644
index 0000000..d15fb01
--- /dev/null
+++ b/runtime/include/stdatomic.h
@@ -0,0 +1,138 @@
+// stdatomic.h — C11 atomic operations, single-core stubs.
+//
+// The W65816 is a uniprocessor with no preemption from a kernel scheduler
+// (we run bare on the IIgs, optionally under GS/OS which doesn't yield
+// inside a process).  All `atomic_*` operations lower to plain ops; the
+// `memory_order_*` constants are accepted and ignored.
+//
+// This header provides the C11 API surface so portable code that uses
+// `_Atomic int` / `atomic_fetch_add` / etc. compiles cleanly.  Real
+// multi-core atomicity is not modeled.
+
+#ifndef _STDATOMIC_H
+#define _STDATOMIC_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
+
+#define ATOMIC_BOOL_LOCK_FREE     1
+#define ATOMIC_CHAR_LOCK_FREE     1
+#define ATOMIC_CHAR16_T_LOCK_FREE 1
+#define ATOMIC_CHAR32_T_LOCK_FREE 1
+#define ATOMIC_WCHAR_T_LOCK_FREE  1
+#define ATOMIC_SHORT_LOCK_FREE    1
+#define ATOMIC_INT_LOCK_FREE      1
+#define ATOMIC_LONG_LOCK_FREE     1
+#define ATOMIC_LLONG_LOCK_FREE    1
+#define ATOMIC_POINTER_LOCK_FREE  1
+
+#define ATOMIC_VAR_INIT(v) (v)
+#define ATOMIC_FLAG_INIT { 0 }
+
+// Atomic flag — a boolean-valued atomic flag.
+typedef struct { volatile unsigned char _v; } atomic_flag;
+
+static inline int atomic_flag_test_and_set_explicit(volatile atomic_flag *o,
+                                                    memory_order m) {
+    (void)m;
+    int r = o->_v;
+    o->_v = 1;
+    return r;
+}
+static inline int atomic_flag_test_and_set(volatile atomic_flag *o) {
+    return atomic_flag_test_and_set_explicit(o, memory_order_seq_cst);
+}
+static inline void atomic_flag_clear_explicit(volatile atomic_flag *o,
+                                              memory_order m) {
+    (void)m;
+    o->_v = 0;
+}
+static inline void atomic_flag_clear(volatile atomic_flag *o) {
+    atomic_flag_clear_explicit(o, memory_order_seq_cst);
+}
+
+// Thread-fence — no-op on a uniprocessor with no kernel preemption.
+static inline void atomic_thread_fence(memory_order m)  { (void)m; }
+static inline void atomic_signal_fence(memory_order m)  { (void)m; }
+
+// _Atomic(T) is just T on this target.  Generic load/store/RMW macros
+// delegate to plain ops.  Uses __typeof__ to preserve type info.
+#define atomic_init(obj, val)        (*(obj) = (val))
+#define atomic_is_lock_free(obj)     ((void)(obj), 1)
+#define atomic_store(obj, val)       (*(obj) = (val))
+#define atomic_store_explicit(obj, val, m) ((void)(m), *(obj) = (val))
+#define atomic_load(obj)             (*(obj))
+#define atomic_load_explicit(obj, m) ((void)(m), *(obj))
+#define atomic_exchange(obj, val) ({ \
+    __typeof__(*(obj)) _old = *(obj); \
+    *(obj) = (val); \
+    _old; })
+#define atomic_exchange_explicit(obj, val, m) \
+    ((void)(m), atomic_exchange(obj, val))
+#define atomic_compare_exchange_strong(obj, expected, desired) ({ \
+    int _ok = (*(obj) == *(expected)); \
+    if (_ok) *(obj) = (desired); else *(expected) = *(obj); \
+    _ok; })
+#define atomic_compare_exchange_weak(obj, expected, desired) \
+    atomic_compare_exchange_strong(obj, expected, desired)
+#define atomic_compare_exchange_strong_explicit(obj, expected, desired, ms, mf) \
+    ((void)(ms), (void)(mf), \
+     atomic_compare_exchange_strong(obj, expected, desired))
+#define atomic_compare_exchange_weak_explicit(obj, expected, desired, ms, mf) \
+    ((void)(ms), (void)(mf), \
+     atomic_compare_exchange_weak(obj, expected, desired))
+#define atomic_fetch_add(obj, val) ({ \
+    __typeof__(*(obj)) _old = *(obj); \
+    *(obj) += (val); \
+    _old; })
+#define atomic_fetch_add_explicit(obj, val, m) \
+    ((void)(m), atomic_fetch_add(obj, val))
+#define atomic_fetch_sub(obj, val) ({ \
+    __typeof__(*(obj)) _old = *(obj); \
+    *(obj) -= (val); \
+    _old; })
+#define atomic_fetch_sub_explicit(obj, val, m) \
+    ((void)(m), atomic_fetch_sub(obj, val))
+#define atomic_fetch_or(obj, val) ({ \
+    __typeof__(*(obj)) _old = *(obj); \
+    *(obj) |= (val); \
+    _old; })
+#define atomic_fetch_or_explicit(obj, val, m) \
+    ((void)(m), atomic_fetch_or(obj, val))
+#define atomic_fetch_and(obj, val) ({ \
+    __typeof__(*(obj)) _old = *(obj); \
+    *(obj) &= (val); \
+    _old; })
+#define atomic_fetch_and_explicit(obj, val, m) \
+    ((void)(m), atomic_fetch_and(obj, val))
+#define atomic_fetch_xor(obj, val) ({ \
+    __typeof__(*(obj)) _old = *(obj); \
+    *(obj) ^= (val); \
+    _old; })
+#define atomic_fetch_xor_explicit(obj, val, m) \
+    ((void)(m), atomic_fetch_xor(obj, val))
+
+// _Atomic-qualified typedefs that portable C11 code expects.
+typedef _Bool              atomic_bool;
+typedef char               atomic_char;
+typedef signed char        atomic_schar;
+typedef unsigned char      atomic_uchar;
+typedef short              atomic_short;
+typedef unsigned short     atomic_ushort;
+typedef int                atomic_int;
+typedef unsigned int       atomic_uint;
+typedef long               atomic_long;
+typedef unsigned long      atomic_ulong;
+typedef long long          atomic_llong;
+typedef unsigned long long atomic_ullong;
+
+#endif
diff --git a/runtime/include/stddef.h b/runtime/include/stddef.h
index 579341b..b0493e0 100644
--- a/runtime/include/stddef.h
+++ b/runtime/include/stddef.h
@@ -6,7 +6,10 @@
 
 typedef unsigned long size_t;
 typedef int          ptrdiff_t;
-typedef int          wchar_t;          // not really wide-char-supported
+#ifndef _WCHAR_T_DEFINED
+# define _WCHAR_T_DEFINED
+typedef int          wchar_t;          // matches clang builtin signature
+#endif
 
 #ifndef NULL
 # define NULL ((void *)0)
diff --git a/runtime/include/stdint.h b/runtime/include/stdint.h
index 738ba70..75a44ce 100644
--- a/runtime/include/stdint.h
+++ b/runtime/include/stdint.h
@@ -37,8 +37,13 @@ typedef uint32_t uint_fast32_t;
 typedef int64_t  int_fast64_t;
 typedef uint64_t uint_fast64_t;
 
-typedef int16_t  intptr_t;        // pointers are 16-bit on W65816
-typedef uint16_t uintptr_t;
+// Under ptr32 (data layout `p:32:16`), pointers are 32 bits even though
+// the IIgs's physical address bus is 24-bit; the high byte of the bank
+// word is reserved.  `uintptr_t` is uint32_t so casts pointer↔integer
+// round-trip without truncating the bank byte (libcxxabiSjlj's exception
+// buffer pointers exercised this — uint16_t lost the bank).
+typedef int32_t  intptr_t;
+typedef uint32_t uintptr_t;
 
 typedef int64_t  intmax_t;
 typedef uint64_t uintmax_t;
diff --git a/runtime/include/stdio.h b/runtime/include/stdio.h
index 924afa6..ec84977 100644
--- a/runtime/include/stdio.h
+++ b/runtime/include/stdio.h
@@ -6,6 +6,10 @@
 typedef struct __sFILE FILE;
 typedef unsigned long size_t;
 
+#ifndef NULL
+# define NULL ((void *)0)
+#endif
+
 extern FILE *stdin;
 extern FILE *stdout;
 extern FILE *stderr;
@@ -35,6 +39,36 @@ int    feof(FILE *stream);
 int    ferror(FILE *stream);
 void   clearerr(FILE *stream);
 
+// fgetpos / fsetpos — alternative seek API.  fpos_t holds the same
+// information as ftell's long return, so the implementation is a thin
+// wrapper.  Provided for source-compat with portable code.
+typedef long fpos_t;
+int    fgetpos(FILE *stream, fpos_t *pos);
+int    fsetpos(FILE *stream, const fpos_t *pos);
+
+// Buffer-control surface — no-ops in our buffer-less I/O model (mfs
+// is direct memory, stdout flushes per putchar).  The functions exist
+// so portable code compiles.
+#define _IOFBF 0
+#define _IOLBF 1
+#define _IONBF 2
+#define BUFSIZ 256
+int    setvbuf(FILE *stream, char *buf, int mode, size_t size);
+void   setbuf (FILE *stream, char *buf);
+
+// File-system operations — stubs that route to mfsUnregister and
+// hand-rolled rename.  Return 0 on success, -1 on failure.
+int    remove(const char *path);
+int    rename(const char *old, const char *neu);
+
+// Temporary-file helpers — stubs returning NULL / (char *)0.  Real
+// temp-file support requires writable storage on disk which the IIgs
+// runtime doesn't provide by default.
+FILE  *tmpfile(void);
+char  *tmpnam(char *s);
+#define L_tmpnam 16
+#define TMP_MAX  1   // we can only produce 1 unique name (always fail)
+
 #define SEEK_SET 0
 #define SEEK_CUR 1
 #define SEEK_END 2
@@ -47,12 +81,17 @@ char *fgets(char *buf, int n, FILE *stream);
 int   ungetc(int c, FILE *stream);
 #define getc(s) fgetc(s)
 
-// scanf family — only sscanf and vsscanf are implemented (parsing
-// from a string buffer). scanf/fscanf would need a reliable byte-at-
-// a-time stdin which we don't have. Supports %d %i %u %x %X %o %s
-// %c %% with optional `l` long modifier.
+// scanf family — sscanf/vsscanf parse a string; fscanf/vfscanf parse
+// from a FILE* via fgetc/ungetc.  scanf/vscanf read from stdin (which
+// returns EOF on the IIgs because there is no integrated keyboard
+// stdin) so they're rarely useful but the surface compiles.  Supports
+// %d %i %u %x %X %o %s %c %ld %lu %lx %li %lo %% with optional `l`.
 int sscanf (const char *str, const char *fmt, ...);
 int vsscanf(const char *str, const char *fmt, va_list ap);
+int fscanf (FILE *stream, const char *fmt, ...);
+int vfscanf(FILE *stream, const char *fmt, va_list ap);
+int scanf  (const char *fmt, ...);
+int vscanf (const char *fmt, va_list ap);
 void  rewind(FILE *stream);  // = fseek(s, 0, SEEK_SET) + clearerr
 
 // Memory-backed FS: register a memory region as a named file so
diff --git a/runtime/include/stdlib.h b/runtime/include/stdlib.h
index 505be1c..ec86fc8 100644
--- a/runtime/include/stdlib.h
+++ b/runtime/include/stdlib.h
@@ -3,10 +3,21 @@
 
 typedef unsigned long size_t;
 
+#ifndef NULL
+# define NULL ((void *)0)
+#endif
+
 void  *malloc(size_t n);
 void  *calloc(size_t nmemb, size_t size);
 void  *realloc(void *ptr, size_t n);
 void   free(void *p);
+// C11 aligned allocation.  `alignment` must be a power of two; `size`
+// must be a multiple of `alignment`.  Free with `aligned_free` (not
+// plain `free`) — the returned pointer is offset from the malloc-block
+// base by an alignment-pad and the original base is stashed just below.
+void  *aligned_alloc(size_t alignment, size_t size);
+void   aligned_free(void *p);
+int    posix_memalign(void **memptr, size_t alignment, size_t size);
 
 int    abs(int n);
 long   labs(long n);
@@ -36,10 +47,21 @@ void  *bsearch(const void *key, const void *base, size_t nmemb,
                size_t size, __cmp_fn cmp);
 
 void   exit(int code) __attribute__((noreturn));
+void   _Exit(int code) __attribute__((noreturn));
 void   abort(void)    __attribute__((noreturn));
+// C11 quick_exit / at_quick_exit — like exit/atexit but invoke a
+// separate handler chain.  No file flushing, no atexit handlers.
+void   quick_exit(int code) __attribute__((noreturn));
 
 typedef void (*__atexit_fn)(void);
 int    atexit(__atexit_fn fn);
+int    at_quick_exit(__atexit_fn fn);
+
+// No environment under GS/OS — `getenv` always returns NULL,
+// `system` always returns 0 (no shell to invoke).  These exist for
+// portable-code compile compatibility.
+char  *getenv(const char *name);
+int    system(const char *cmd);
 
 #define EXIT_SUCCESS 0
 #define EXIT_FAILURE 1
diff --git a/runtime/include/stdnoreturn.h b/runtime/include/stdnoreturn.h
new file mode 100644
index 0000000..688da9a
--- /dev/null
+++ b/runtime/include/stdnoreturn.h
@@ -0,0 +1,9 @@
+// C11 stdnoreturn.h — alias the keyword form `_Noreturn` to the more
+// readable lowercase name.
+
+#ifndef _STDNORETURN_H
+#define _STDNORETURN_H
+
+#define noreturn _Noreturn
+
+#endif
diff --git a/runtime/include/string.h b/runtime/include/string.h
index f419fbe..c403924 100644
--- a/runtime/include/string.h
+++ b/runtime/include/string.h
@@ -3,6 +3,10 @@
 
 typedef unsigned long size_t;
 
+#ifndef NULL
+# define NULL ((void *)0)
+#endif
+
 void  *memcpy(void *dst, const void *src, size_t n);
 void  *memmove(void *dst, const void *src, size_t n);
 void  *memset(void *dst, int c, size_t n);
diff --git a/runtime/include/tgmath.h b/runtime/include/tgmath.h
new file mode 100644
index 0000000..e413ae6
--- /dev/null
+++ b/runtime/include/tgmath.h
@@ -0,0 +1,97 @@
+// tgmath.h — type-generic math macros.
+//
+// Selects between the `f`-suffixed (float) and unsuffixed (double)
+// math functions based on argument type via C11 _Generic.  Our
+// `long double` is aliased to double, so the `l`-suffixed variants
+// aren't separately provided.
+//
+// Usage: `sqrt(x)` picks `sqrtf(x)` if x is float, `sqrt(x)` if double.
+
+#ifndef _TGMATH_H
+#define _TGMATH_H
+
+#include <math.h>
+
+#define __tg1(fn, x) \
+    _Generic((x), float: fn##f, default: fn)(x)
+
+#define __tg2(fn, x, y) \
+    _Generic((x), float: fn##f, default: fn) \
+        ((x), (y))
+
+#undef  sin
+#define sin(x)        __tg1(sin, x)
+#undef  cos
+#define cos(x)        __tg1(cos, x)
+#undef  tan
+#define tan(x)        __tg1(tan, x)
+#undef  asin
+#define asin(x)       __tg1(asin, x)
+#undef  acos
+#define acos(x)       __tg1(acos, x)
+#undef  atan
+#define atan(x)       __tg1(atan, x)
+#undef  atan2
+#define atan2(y, x)   __tg2(atan2, y, x)
+#undef  sinh
+#define sinh(x)       __tg1(sinh, x)
+#undef  cosh
+#define cosh(x)       __tg1(cosh, x)
+#undef  tanh
+#define tanh(x)       __tg1(tanh, x)
+#undef  exp
+#define exp(x)        __tg1(exp, x)
+#undef  log
+#define log(x)        __tg1(log, x)
+#undef  log10
+#define log10(x)      __tg1(log10, x)
+#undef  pow
+#define pow(x, y)     __tg2(pow, x, y)
+#undef  sqrt
+#define sqrt(x)       __tg1(sqrt, x)
+#undef  ceil
+#define ceil(x)       __tg1(ceil, x)
+#undef  floor
+#define floor(x)      __tg1(floor, x)
+#undef  fabs
+#define fabs(x)       __tg1(fabs, x)
+#undef  fmod
+#define fmod(x, y)    __tg2(fmod, x, y)
+#undef  copysign
+#define copysign(x,y) __tg2(copysign, x, y)
+#undef  log2
+#define log2(x)       __tg1(log2, x)
+#undef  exp2
+#define exp2(x)       __tg1(exp2, x)
+#undef  log1p
+#define log1p(x)      __tg1(log1p, x)
+#undef  expm1
+#define expm1(x)      __tg1(expm1, x)
+#undef  hypot
+#define hypot(x, y)   __tg2(hypot, x, y)
+#undef  cbrt
+#define cbrt(x)       __tg1(cbrt, x)
+#undef  trunc
+#define trunc(x)      __tg1(trunc, x)
+#undef  round
+#define round(x)      __tg1(round, x)
+#undef  fmax
+#define fmax(x, y)    __tg2(fmax, x, y)
+#undef  fmin
+#define fmin(x, y)    __tg2(fmin, x, y)
+#undef  fdim
+#define fdim(x, y)    __tg2(fdim, x, y)
+#undef  asinh
+#define asinh(x)      __tg1(asinh, x)
+#undef  acosh
+#define acosh(x)      __tg1(acosh, x)
+#undef  atanh
+#define atanh(x)      __tg1(atanh, x)
+#undef  remainder
+#define remainder(x,y) __tg2(remainder, x, y)
+#undef  rint
+#define rint(x)       __tg1(rint, x)
+#undef  nearbyint
+#define nearbyint(x)  __tg1(nearbyint, x)
+
+#endif
diff --git a/runtime/include/threads.h b/runtime/include/threads.h
new file mode 100644
index 0000000..80d9800
--- /dev/null
+++ b/runtime/include/threads.h
@@ -0,0 +1,91 @@
+// threads.h — C11 threading API.  Single-core IIgs / bare-metal: every
+// thread function fails with `thrd_error`.  Mutexes / cond-vars compile
+// but produce no synchronization — callers running on a single core
+// don't need any.  This header is here so portable C11 code that
+// `#include <threads.h>` and uses `thrd_t` etc. compiles.
+
+#ifndef _THREADS_H
+#define _THREADS_H
+
+#include <time.h>
+
+enum {
+    thrd_success  = 0,
+    thrd_busy     = 1,
+    thrd_error    = 2,
+    thrd_nomem    = 3,
+    thrd_timedout = 4
+};
+
+enum {
+    mtx_plain     = 0,
+    mtx_recursive = 1,
+    mtx_timed     = 2
+};
+
+#define ONCE_FLAG_INIT 0
+#define TSS_DTOR_ITERATIONS 1
+
+typedef int thrd_t;
+typedef int (*thrd_start_t)(void *);
+typedef struct { int _x; } mtx_t;
+typedef struct { int _x; } cnd_t;
+typedef int once_flag;
+typedef unsigned short tss_t;
+typedef void (*tss_dtor_t)(void *);
+
+// All thread create/join calls fail — no scheduler.
+static inline int thrd_create(thrd_t *t, thrd_start_t f, void *a) {
+    (void)t; (void)f; (void)a;
+    return thrd_error;
+}
+static inline thrd_t thrd_current(void)                  { return 0; }
+static inline int    thrd_equal(thrd_t a, thrd_t b)      { return a == b; }
+static inline void   thrd_exit(int v)                    { (void)v; for (;;) {} }
+static inline int    thrd_join(thrd_t t, int *res)       { (void)t; (void)res; return thrd_error; }
+static inline int    thrd_detach(thrd_t t)               { (void)t; return thrd_error; }
+static inline int    thrd_sleep(const struct timespec *d,
+                                struct timespec *r)      { (void)d; (void)r; return -1; }
+static inline void   thrd_yield(void)                    { }
+
+// Mutex / cond — no-ops on a uniprocessor.
+static inline int    mtx_init(mtx_t *m, int t)           { (void)m; (void)t; return thrd_success; }
+static inline int    mtx_lock(mtx_t *m)                  { (void)m; return thrd_success; }
+static inline int    mtx_trylock(mtx_t *m)               { (void)m; return thrd_success; }
+static inline int    mtx_timedlock(mtx_t *m,
+                                   const struct timespec *t) {
+    (void)m; (void)t; return thrd_success;
+}
+static inline int    mtx_unlock(mtx_t *m)                { (void)m; return thrd_success; }
+static inline void   mtx_destroy(mtx_t *m)               { (void)m; }
+
+static inline int    cnd_init(cnd_t *c)                  { (void)c; return thrd_success; }
+static inline int    cnd_signal(cnd_t *c)                { (void)c; return thrd_success; }
+static inline int    cnd_broadcast(cnd_t *c)             { (void)c; return thrd_success; }
+static inline int    cnd_wait(cnd_t *c, mtx_t *m)        { (void)c; (void)m; return thrd_error; }
+static inline int    cnd_timedwait(cnd_t *c, mtx_t *m,
+                                   const struct timespec *t) {
+    (void)c; (void)m; (void)t; return thrd_timedout;
+}
+static inline void   cnd_destroy(cnd_t *c)               { (void)c; }
+
+// call_once — straightforward on a single-core target.
+static inline void   call_once(once_flag *f, void (*fn)(void)) {
+    if (!*f) { *f = 1; fn(); }
+}
+
+// Thread-specific storage: no other threads, so it's just a pointer.
+// At most 8 keys.
+extern void *__tss_slots[8];
+extern int   __tss_next;
+static inline int    tss_create(tss_t *k, tss_dtor_t d) {
+    (void)d;
+    if (__tss_next >= 8) return thrd_error;
+    *k = (tss_t)__tss_next++;
+    return thrd_success;
+}
+static inline void  *tss_get(tss_t k)                    { return __tss_slots[k]; }
+static inline int    tss_set(tss_t k, void *v)           { __tss_slots[k] = v; return thrd_success; }
+static inline void   tss_delete(tss_t k)                 { (void)k; }
+
+#endif
diff --git a/runtime/include/time.h b/runtime/include/time.h
index d9da1d1..4e130da 100644
--- a/runtime/include/time.h
+++ b/runtime/include/time.h
@@ -5,8 +5,20 @@ typedef long          time_t;
 typedef unsigned long clock_t;
 typedef unsigned long size_t;
 
+#ifndef NULL
+# define NULL ((void *)0)
+#endif
+
 #define CLOCKS_PER_SEC 60   // IIgs vsync tick (placeholder)
 
+// C11 / POSIX nanosecond-precision time.  IIgs has only second-level
+// hardware resolution; tv_nsec is reported as 0 by callers that fill
+// a struct timespec.  Defined here so <threads.h> can refer to it.
+struct timespec {
+    time_t tv_sec;
+    long   tv_nsec;
+};
+
 struct tm {
     int tm_sec;        // 0..60 (60 = leap second)
     int tm_min;        // 0..59
diff --git a/runtime/include/uchar.h b/runtime/include/uchar.h
new file mode 100644
index 0000000..1c4476d
--- /dev/null
+++ b/runtime/include/uchar.h
@@ -0,0 +1,53 @@
+// C11 uchar.h — char16_t / char32_t plus minimal conversion helpers.
+//
+// The W65816 runtime treats text as Latin-1 (8-bit) throughout, so
+// the 16-bit and 32-bit char types are degenerate one-byte mappings
+// (high bytes always zero).  Conversion functions are provided for
+// surface-compatibility; they do not handle multi-byte UTF-8 input.
+
+#ifndef _UCHAR_H
+#define _UCHAR_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+
+// mbstate_t is the multi-byte conversion state.  Empty struct — our
+// 1:1 byte mapping is stateless.
+typedef struct { int unused; } mbstate_t;
+
+// mbrtoc16 / c16rtomb — multibyte <-> char16_t.  In our Latin-1
+// model these are byte-for-byte copies.
+static inline size_t mbrtoc16(char16_t *out, const char *s, size_t n, mbstate_t *ps) {
+    (void)ps;
+    if (!s || n == 0) return (size_t)-2;
+    unsigned char c = (unsigned char)*s;
+    if (out) *out = (char16_t)c;
+    return (c == 0) ? 0 : 1;
+}
+
+static inline size_t c16rtomb(char *s, char16_t c, mbstate_t *ps) {
+    (void)ps;
+    if (!s) return 1;
+    *s = (char)(c & 0xFF);
+    return 1;
+}
+
+static inline size_t mbrtoc32(char32_t *out, const char *s, size_t n, mbstate_t *ps) {
+    (void)ps;
+    if (!s || n == 0) return (size_t)-2;
+    unsigned char c = (unsigned char)*s;
+    if (out) *out = (char32_t)c;
+    return (c == 0) ? 0 : 1;
+}
+
+static inline size_t c32rtomb(char *s, char32_t c, mbstate_t *ps) {
+    (void)ps;
+    if (!s) return 1;
+    *s = (char)(c & 0xFF);
+    return 1;
+}
+
+#endif
diff --git a/runtime/include/wchar.h b/runtime/include/wchar.h
index dc223b8..376b6a7 100644
--- a/runtime/include/wchar.h
+++ b/runtime/include/wchar.h
@@ -8,7 +8,10 @@
 #ifndef _WCHAR_H
 #define _WCHAR_H
 
-typedef unsigned short wchar_t;
+#ifndef _WCHAR_T_DEFINED
+# define _WCHAR_T_DEFINED
+typedef int          wchar_t;          // matches clang builtin signature
+#endif
 typedef unsigned long size_t;
 typedef long           wint_t;
 
@@ -35,4 +38,39 @@ size_t mbstowcs(wchar_t *pwcs, const char *s, size_t n);
 size_t wcstombs(char *s, const wchar_t *pwcs, size_t n);
 int    mblen   (const char *s, size_t n);
 
+// Wide-char `memXXX` family — operate on wchar_t arrays.  Under our
+// Latin-1 model these are equivalent to the byte versions scaled by
+// sizeof(wchar_t) for memcpy/memmove, and explicit loops for set/cmp
+// /chr (since the byte versions can't compare a 16-bit wchar_t value
+// against an 8-bit memory cell).
+wchar_t *wmemcpy (wchar_t *dst, const wchar_t *src, size_t n);
+wchar_t *wmemmove(wchar_t *dst, const wchar_t *src, size_t n);
+wchar_t *wmemset (wchar_t *dst, wchar_t c, size_t n);
+int      wmemcmp (const wchar_t *a, const wchar_t *b, size_t n);
+wchar_t *wmemchr (const wchar_t *s, wchar_t c, size_t n);
+
+// Wide-char string-numeric conversion.  Each routine narrows the wide
+// source to bytes first (1:1 Latin-1), then delegates to the strXXX
+// equivalent.  endptr is reported in wide-char position so callers can
+// resume scanning from where the conversion stopped.
+long           wcstol  (const wchar_t *nptr, wchar_t **endptr, int base);
+unsigned long  wcstoul (const wchar_t *nptr, wchar_t **endptr, int base);
+long long      wcstoll (const wchar_t *nptr, wchar_t **endptr, int base);
+unsigned long long wcstoull(const wchar_t *nptr, wchar_t **endptr, int base);
+double         wcstod  (const wchar_t *nptr, wchar_t **endptr);
+float          wcstof  (const wchar_t *nptr, wchar_t **endptr);
+
+// Wide-char printf-family.  Narrow the format string + any %s/%c args,
+// route through the byte snprintf, then widen the result back into the
+// wchar_t buffer.  Limits the format-spec set to what byte snprintf
+// supports (no %ls / %lc — wide args route as plain chars).
+#include <stdarg.h>
+int swprintf (wchar_t *buf, size_t n, const wchar_t *fmt, ...);
+int vswprintf(wchar_t *buf, size_t n, const wchar_t *fmt, va_list ap);
+
+// Wide-char calendar formatting — same surface as strftime but writes
+// wchar_t.  Implementation defers to strftime via a byte buffer.
+struct tm;
+size_t wcsftime(wchar_t *buf, size_t n, const wchar_t *fmt, const struct tm *tm);
+
 #endif
diff --git a/runtime/include/wctype.h b/runtime/include/wctype.h
new file mode 100644
index 0000000..6b5beab
--- /dev/null
+++ b/runtime/include/wctype.h
@@ -0,0 +1,84 @@
+// C95 / C11 wctype.h — wide-character classification + case folding.
+//
+// On the W65816 runtime wchar_t is 16-bit but text is Latin-1; the
+// high byte is always zero.  All functions reduce to the byte
+// equivalents in <ctype.h> by truncating to the low byte (anything
+// in 0x100..0xFFFF is non-printable, non-alpha, non-digit per our
+// Latin-1 assumption).
+
+#ifndef _WCTYPE_H
+#define _WCTYPE_H
+
+#include <wchar.h>
+#include <ctype.h>
+
+typedef int wctype_t;
+typedef int wctrans_t;
+
+// In Latin-1, wide-char in 0x100..0xFFFF have no class.
+#define _WCT_DELEGATE(name)                                         \
+    static inline int isw##name(wint_t c) {                         \
+        return (c >= 0 && c < 0x100) ? is##name((int)c) : 0;        \
+    }
+
+_WCT_DELEGATE(alnum)
+_WCT_DELEGATE(alpha)
+_WCT_DELEGATE(cntrl)
+_WCT_DELEGATE(digit)
+_WCT_DELEGATE(graph)
+_WCT_DELEGATE(lower)
+_WCT_DELEGATE(print)
+_WCT_DELEGATE(punct)
+_WCT_DELEGATE(space)
+_WCT_DELEGATE(upper)
+_WCT_DELEGATE(xdigit)
+
+static inline int iswblank(wint_t c) {
+    return (c == L' ' || c == L'\t');
+}
+
+static inline wint_t towlower(wint_t c) {
+    return (c >= 0 && c < 0x100) ? (wint_t)tolower((int)c) : c;
+}
+static inline wint_t towupper(wint_t c) {
+    return (c >= 0 && c < 0x100) ? (wint_t)toupper((int)c) : c;
+}
+
+// Programmatic lookup — not strictly needed but trivial to provide.
+static inline wctype_t wctype(const char *name) {
+    if (!name) return 0;
+    // Minimal table — just enough for the common cases.
+    char c0 = name[0], c1 = name[1];
+    if (c0 == 'a' && c1 == 'l') return 1;   // alpha or alnum
+    if (c0 == 'd') return 2;                // digit
+    if (c0 == 'l') return 3;                // lower
+    if (c0 == 'u') return 4;                // upper
+    if (c0 == 's') return 5;                // space
+    return 0;
+}
+static inline int iswctype(wint_t c, wctype_t t) {
+    switch (t) {
+    case 1: return iswalpha(c);
+    case 2: return iswdigit(c);
+    case 3: return iswlower(c);
+    case 4: return iswupper(c);
+    case 5: return iswspace(c);
+    }
+    return 0;
+}
+
+static inline wctrans_t wctrans(const char *name) {
+    if (!name) return 0;
+    if (name[0] == 't' && name[1] == 'o' && name[2] == 'l') return 1;
+    if (name[0] == 't' && name[1] == 'o' && name[2] == 'u') return 2;
+    return 0;
+}
+static inline wint_t towctrans(wint_t c, wctrans_t t) {
+    switch (t) {
+    case 1: return towlower(c);
+    case 2: return towupper(c);
+    }
+    return c;
+}
+
+#endif
diff --git a/runtime/src/crt0.s b/runtime/src/crt0.s
index 8511795..3184d5b 100644
--- a/runtime/src/crt0.s
+++ b/runtime/src/crt0.s
@@ -73,21 +73,95 @@ __start:
 	sta 0xbe                  ; persistent data bank
 	rep #0x20
 
-	; Zero BSS.  X iterates from __bss_start to __bss_end; each
-	; iteration writes one byte of zero at addr X (via DP=0 +
-	; offset 0 — which is just X).  STZ in M=8 stores 1 byte and
-	; doesn't touch A, so we don't need the LDA #0 prelude.
+	; Zero BSS.  Up to 4 segments — linker emits __bss_seg{0..3}_lo16
+	; / _bank / _size symbols.  Segments with size=0 are skipped.
+	; Each segment is cleared with DBR-relative STZ abs,X after
+	; setting DBR to the segment's bank.  Original DBR restored at
+	; end via PLB.
 	rep #0x10                ; ensure X is 16-bit
-	ldx #__bss_start
-.Lbss_loop:
-	cpx #__bss_end
-	bcs .Lbss_done           ; X >= end -> done
-	sep #0x20                ; 8-bit M for 1-byte store
-	stz 0x0, x               ; *(uint8_t *)X = 0   (DP=0)
+	phb                      ; save current DBR
+
+	; ---- segment 0 ----
+	rep #0x20
+	ldx #__bss_seg0_size
+	beq .Lbss_seg1
+	sep #0x20
+	.byte 0xA9
+	.byte __bss_seg0_bank
+	pha
+	plb
+	rep #0x20
+	ldx #0
+.Lbss_loop0:
+	cpx #__bss_seg0_size
+	bcs .Lbss_seg1
+	sep #0x20
+	stz __bss_seg0_lo16, x
 	rep #0x20
 	inx
-	bra .Lbss_loop
+	bra .Lbss_loop0
+.Lbss_seg1:
+	; ---- segment 1 ----
+	rep #0x20
+	ldx #__bss_seg1_size
+	beq .Lbss_seg2
+	sep #0x20
+	.byte 0xA9
+	.byte __bss_seg1_bank
+	pha
+	plb
+	rep #0x20
+	ldx #0
+.Lbss_loop1:
+	cpx #__bss_seg1_size
+	bcs .Lbss_seg2
+	sep #0x20
+	stz __bss_seg1_lo16, x
+	rep #0x20
+	inx
+	bra .Lbss_loop1
+.Lbss_seg2:
+	; ---- segment 2 ----
+	rep #0x20
+	ldx #__bss_seg2_size
+	beq .Lbss_seg3
+	sep #0x20
+	.byte 0xA9
+	.byte __bss_seg2_bank
+	pha
+	plb
+	rep #0x20
+	ldx #0
+.Lbss_loop2:
+	cpx #__bss_seg2_size
+	bcs .Lbss_seg3
+	sep #0x20
+	stz __bss_seg2_lo16, x
+	rep #0x20
+	inx
+	bra .Lbss_loop2
+.Lbss_seg3:
+	; ---- segment 3 ----
+	rep #0x20
+	ldx #__bss_seg3_size
+	beq .Lbss_done
+	sep #0x20
+	.byte 0xA9
+	.byte __bss_seg3_bank
+	pha
+	plb
+	rep #0x20
+	ldx #0
+.Lbss_loop3:
+	cpx #__bss_seg3_size
+	bcs .Lbss_done
+	sep #0x20
+	stz __bss_seg3_lo16, x
+	rep #0x20
+	inx
+	bra .Lbss_loop3
 .Lbss_done:
+	plb                      ; restore caller's DBR
 
 	; Run static constructors.  The linker emits
 	; __init_array_start / __init_array_end around the .init_array
diff --git a/runtime/src/extras.c b/runtime/src/extras.c
index 9065614..e202dba 100644
--- a/runtime/src/extras.c
+++ b/runtime/src/extras.c
@@ -182,7 +182,10 @@ size_t strcspn(const char *s, const char *reject) {
 // str* family.  mbtowc / wctomb use the trivial 1:1 byte<->wide-char
 // mapping (essentially Latin-1) — no real multi-byte / locale support.
 
-typedef unsigned short wchar_t;
+// Now `int` to match the clang builtin signature for wcslen/wcscmp/
+// wcscpy etc; was `unsigned short`.  Latin-1 content (0..255) is
+// representable in both.
+typedef int wchar_t;
 
 size_t wcslen(const wchar_t *s) {
     size_t n = 0;
@@ -280,3 +283,307 @@ int mblen(const char *s, size_t n) {
     if (n == 0) return -1;
     return *s ? 1 : 0;
 }
+
+
+// ---- wide-char memory + scan/format ---------------------------------
+// Operate on wchar_t arrays (wchar_t is `int` on this target = 2
+// bytes).  Under Latin-1 we delegate the actual work to the byte/str
+// equivalents wherever the data fits in 8 bits.
+
+#include <stdarg.h>
+
+struct tm;
+
+extern void  *memcpy (void *dst, const void *src, size_t n);
+extern void  *memmove(void *dst, const void *src, size_t n);
+extern long   strtol  (const char *nptr, char **endptr, int base);
+extern unsigned long  strtoul (const char *nptr, char **endptr, int base);
+extern long long      strtoll (const char *nptr, char **endptr, int base);
+extern unsigned long long strtoull(const char *nptr, char **endptr, int base);
+extern double strtod  (const char *nptr, char **endptr);
+extern float  strtof  (const char *nptr, char **endptr);
+extern int    vsnprintf(char *buf, size_t n, const char *fmt, va_list ap);
+extern size_t strftime (char *buf, size_t n, const char *fmt, const struct tm *tm);
+
+
+wchar_t *wmemcpy(wchar_t *dst, const wchar_t *src, size_t n) {
+    memcpy(dst, src, n * sizeof(wchar_t));
+    return dst;
+}
+
+
+wchar_t *wmemmove(wchar_t *dst, const wchar_t *src, size_t n) {
+    memmove(dst, src, n * sizeof(wchar_t));
+    return dst;
+}
+
+
+wchar_t *wmemset(wchar_t *dst, wchar_t c, size_t n) {
+    wchar_t *p = dst;
+    while (n--) {
+        *p++ = c;
+    }
+    return dst;
+}
+
+
+int wmemcmp(const wchar_t *a, const wchar_t *b, size_t n) {
+    while (n--) {
+        if (*a != *b) {
+            return (int)(*a - *b);
+        }
+        a++;
+        b++;
+    }
+    return 0;
+}
+
+
+wchar_t *wmemchr(const wchar_t *s, wchar_t c, size_t n) {
+    while (n--) {
+        if (*s == c) {
+            return (wchar_t *)s;
+        }
+        s++;
+    }
+    return (wchar_t *)0;
+}
+
+
+// Helper: narrow a wide string of up to `lim` chars into a byte
+// buffer.  Stops at the first NUL or after `lim` chars.  Returns
+// the number of bytes written (excluding any trailing NUL).
+static size_t __narrow(char *out, const wchar_t *in, size_t lim) {
+    size_t i = 0;
+    while (i < lim && in[i]) {
+        out[i] = (char)(in[i] & 0xFF);
+        i++;
+    }
+    if (i < lim) {
+        out[i] = 0;
+    }
+    return i;
+}
+
+
+long wcstol(const wchar_t *nptr, wchar_t **endptr, int base) {
+    char buf[40];
+    size_t k = __narrow(buf, nptr, sizeof(buf) - 1);
+    buf[k] = 0;
+    char *bend;
+    long r = strtol(buf, &bend, base);
+    if (endptr) {
+        *endptr = (wchar_t *)(nptr + (bend - buf));
+    }
+    return r;
+}
+
+
+unsigned long wcstoul(const wchar_t *nptr, wchar_t **endptr, int base) {
+    char buf[40];
+    size_t k = __narrow(buf, nptr, sizeof(buf) - 1);
+    buf[k] = 0;
+    char *bend;
+    unsigned long r = strtoul(buf, &bend, base);
+    if (endptr) {
+        *endptr = (wchar_t *)(nptr + (bend - buf));
+    }
+    return r;
+}
+
+
+long long wcstoll(const wchar_t *nptr, wchar_t **endptr, int base) {
+    char buf[40];
+    size_t k = __narrow(buf, nptr, sizeof(buf) - 1);
+    buf[k] = 0;
+    char *bend;
+    long long r = strtoll(buf, &bend, base);
+    if (endptr) {
+        *endptr = (wchar_t *)(nptr + (bend - buf));
+    }
+    return r;
+}
+
+
+unsigned long long wcstoull(const wchar_t *nptr, wchar_t **endptr, int base) {
+    char buf[40];
+    size_t k = __narrow(buf, nptr, sizeof(buf) - 1);
+    buf[k] = 0;
+    char *bend;
+    unsigned long long r = strtoull(buf, &bend, base);
+    if (endptr) {
+        *endptr = (wchar_t *)(nptr + (bend - buf));
+    }
+    return r;
+}
+
+
+double wcstod(const wchar_t *nptr, wchar_t **endptr) {
+    char buf[40];
+    size_t k = __narrow(buf, nptr, sizeof(buf) - 1);
+    buf[k] = 0;
+    char *bend;
+    double r = strtod(buf, &bend);
+    if (endptr) {
+        *endptr = (wchar_t *)(nptr + (bend - buf));
+    }
+    return r;
+}
+
+
+float wcstof(const wchar_t *nptr, wchar_t **endptr) {
+    char buf[40];
+    size_t k = __narrow(buf, nptr, sizeof(buf) - 1);
+    buf[k] = 0;
+    char *bend;
+    float r = strtof(buf, &bend);
+    if (endptr) {
+        *endptr = (wchar_t *)(nptr + (bend - buf));
+    }
+    return r;
+}
+
+
+// swprintf: narrow the format string, route through vsnprintf into a
+// byte buffer, then widen the result back into `buf`.  Limits the
+// format-spec coverage to what vsnprintf supports; %ls / %lc are not
+// honoured (caller must pass narrow-char args).  Returns -1 on
+// overflow per C11.
+//
+// Buffers kept small (64 bytes each) so the total frame stays under
+// the W65816's 256-byte stack-rel addressing limit.  Long format
+// strings and long outputs are truncated.
+int vswprintf(wchar_t *buf, size_t n, const wchar_t *fmt, va_list ap) {
+    if (n == 0) {
+        return -1;
+    }
+    char fmtBuf[64];
+    __narrow(fmtBuf, fmt, sizeof(fmtBuf) - 1);
+    fmtBuf[sizeof(fmtBuf) - 1] = 0;
+    char outBuf[64];
+    size_t cap = n - 1 < sizeof(outBuf) - 1 ? n - 1 : sizeof(outBuf) - 1;
+    int wrote = vsnprintf(outBuf, cap + 1, fmtBuf, ap);
+    if (wrote < 0 || (size_t)wrote >= n) {
+        buf[0] = 0;
+        return -1;
+    }
+    int i;
+    for (i = 0; i < wrote; i++) {
+        buf[i] = (wchar_t)(unsigned char)outBuf[i];
+    }
+    buf[wrote] = 0;
+    return wrote;
+}
+
+
+int swprintf(wchar_t *buf, size_t n, const wchar_t *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    int r = vswprintf(buf, n, fmt, ap);
+    va_end(ap);
+    return r;
+}
+
+
+size_t wcsftime(wchar_t *buf, size_t n, const wchar_t *fmt, const struct tm *tm) {
+    if (n == 0) {
+        return 0;
+    }
+    char fmtBuf[64];
+    __narrow(fmtBuf, fmt, sizeof(fmtBuf) - 1);
+    fmtBuf[sizeof(fmtBuf) - 1] = 0;
+    char outBuf[128];
+    size_t cap = n - 1 < sizeof(outBuf) - 1 ? n - 1 : sizeof(outBuf) - 1;
+    size_t wrote = strftime(outBuf, cap + 1, fmtBuf, tm);
+    if (wrote == 0 || wrote >= n) {
+        buf[0] = 0;
+        return 0;
+    }
+    size_t i;
+    for (i = 0; i < wrote; i++) {
+        buf[i] = (wchar_t)(unsigned char)outBuf[i];
+    }
+    buf[wrote] = 0;
+    return wrote;
+}
+
+
+// ---- fenv.h ----------------------------------------------------------
+//
+// softFloat / softDouble are fixed at round-to-nearest-even and don't
+// raise IEEE exceptions.  We track the requested rounding mode and an
+// exception-flag word but neither affects soft-float output.
+
+static int __fenvRound = 0;  /* FE_TONEAREST */
+static unsigned short __fenvExcept = 0;
+
+int feclearexcept(int excepts)                       { __fenvExcept &= (unsigned short)~excepts; return 0; }
+int feraiseexcept(int excepts)                       { __fenvExcept |= (unsigned short)excepts;  return 0; }
+int fetestexcept(int excepts)                        { return __fenvExcept & excepts; }
+int fegetexceptflag(unsigned short *flagp, int e)    { (void)e; if (flagp) *flagp = __fenvExcept; return 0; }
+int fesetexceptflag(const unsigned short *flagp, int e) {
+    if (!flagp) return -1;
+    __fenvExcept = (unsigned short)((__fenvExcept & ~e) | (*flagp & e));
+    return 0;
+}
+int fegetround(void)                                 { return __fenvRound; }
+int fesetround(int r)                                { __fenvRound = r; return 0; }
+int fegetenv(unsigned short *envp)                   { if (envp) *envp = __fenvExcept; return 0; }
+int feholdexcept(unsigned short *envp)               { if (envp) *envp = __fenvExcept; __fenvExcept = 0; return 0; }
+int fesetenv(const unsigned short *envp)             { __fenvExcept = envp ? *envp : 0; return 0; }
+int feupdateenv(const unsigned short *envp)          { unsigned short e = envp ? *envp : 0; __fenvExcept |= e; return 0; }
+
+
+// ---- threads.h backing storage ---------------------------------------
+//
+// All thread / mutex / cond ops are inline no-ops; only tss_* needs
+// real per-key storage.  8 keys is enough for any single-core code.
+
+void *__tss_slots[8];
+int   __tss_next = 0;
+
+
+// ---- aligned_alloc / posix_memalign ---------------------------------
+//
+// Wraps malloc with an over-allocation + alignment-adjust trick: alloc
+// (n + alignment + sizeof(void*)) bytes; align upward; stash the
+// original pointer just before the returned address for free() to find.
+// `aligned_alloc` requires `n` to be a multiple of `alignment` (C11).
+
+extern void *malloc(unsigned long n);
+extern void  free  (void *p);
+
+void *aligned_alloc(unsigned long alignment, unsigned long size) {
+    if (alignment == 0 || (alignment & (alignment - 1))) return (void *)0;
+    if (size % alignment) return (void *)0;
+    unsigned long over = size + alignment + sizeof(void *);
+    char *raw = (char *)malloc(over);
+    if (!raw) return (void *)0;
+    unsigned long addr = (unsigned long)raw + sizeof(void *);
+    unsigned long aligned = (addr + alignment - 1) & ~(alignment - 1);
+    ((void **)aligned)[-1] = raw;
+    return (void *)aligned;
+}
+
+// Wrappers that read the stashed raw pointer and free the underlying
+// block.  Callers should use these (not plain free) for aligned_alloc'd
+// pointers.  Single-source projects can `#define free aligned_free` if
+// needed; the standard C11 contract is that `free` works on aligned
+// pointers, so we also patch free below.
+void aligned_free(void *p) {
+    if (!p) return;
+    void *raw = ((void **)p)[-1];
+    free(raw);
+}
+
+int posix_memalign(void **memptr, unsigned long alignment, unsigned long size) {
+    if (!memptr) return 22;  /* EINVAL */
+    if (alignment < sizeof(void *) || (alignment & (alignment - 1))) {
+        *memptr = (void *)0;
+        return 22;
+    }
+    void *p = aligned_alloc(alignment, (size + alignment - 1) & ~(alignment - 1));
+    if (!p) { *memptr = (void *)0; return 12; /* ENOMEM */ }
+    *memptr = p;
+    return 0;
+}
diff --git a/runtime/src/libc.c b/runtime/src/libc.c
index 973a73a..795d7e6 100644
--- a/runtime/src/libc.c
+++ b/runtime/src/libc.c
@@ -685,12 +685,42 @@ char *strerror(int err) {
     case 0:  return (char *)"Success";
     case 1:  return (char *)"Operation not permitted";
     case 2:  return (char *)"No such file or directory";
+    case 3:  return (char *)"No such process";
+    case 4:  return (char *)"Interrupted system call";
     case 5:  return (char *)"Input/output error";
+    case 6:  return (char *)"No such device or address";
+    case 7:  return (char *)"Argument list too long";
+    case 8:  return (char *)"Exec format error";
     case 9:  return (char *)"Bad file descriptor";
+    case 10: return (char *)"No child processes";
+    case 11: return (char *)"Resource temporarily unavailable";
     case 12: return (char *)"Out of memory";
     case 13: return (char *)"Permission denied";
+    case 14: return (char *)"Bad address";
+    case 16: return (char *)"Device or resource busy";
+    case 17: return (char *)"File exists";
+    case 18: return (char *)"Cross-device link";
+    case 19: return (char *)"No such device";
+    case 20: return (char *)"Not a directory";
+    case 21: return (char *)"Is a directory";
     case 22: return (char *)"Invalid argument";
+    case 23: return (char *)"Too many open files in system";
+    case 24: return (char *)"Too many open files";
+    case 25: return (char *)"Inappropriate I/O control operation";
+    case 26: return (char *)"Text file busy";
+    case 27: return (char *)"File too large";
     case 28: return (char *)"No space left on device";
+    case 29: return (char *)"Illegal seek";
+    case 30: return (char *)"Read-only file system";
+    case 31: return (char *)"Too many links";
+    case 32: return (char *)"Broken pipe";
+    case 33: return (char *)"Numerical argument out of domain";
+    case 34: return (char *)"Numerical result out of range";
+    case 36: return (char *)"File name too long";
+    case 38: return (char *)"Function not implemented";
+    case 39: return (char *)"Directory not empty";
+    case 40: return (char *)"Too many levels of symbolic links";
+    case 84: return (char *)"Invalid or incomplete multibyte or wide character";
     default: return (char *)"Unknown error";
     }
 }
@@ -1121,6 +1151,46 @@ int atexit(AtexitFn fn) {
     return 0;
 }
 
+// ---- C99 _Exit + C11 quick_exit / at_quick_exit ----
+//
+// _Exit terminates without invoking atexit handlers (unlike exit).
+// quick_exit terminates after invoking at_quick_exit handlers (a
+// separate chain from atexit).  We share the single-slot pattern
+// with atexit — single-shot handler, second registration fails.
+
+static AtexitFn __quickFn = (AtexitFn)0;
+
+void _Exit(int code) {
+    (void)code;
+    __asm__ volatile (".byte 0x00, 0x00");
+    while (1) {}  // unreachable
+}
+
+void quick_exit(int code) {
+    (void)code;
+    if (__quickFn) {
+        AtexitFn fn = __quickFn;
+        __quickFn = (AtexitFn)0;
+        fn();
+    }
+    __asm__ volatile (".byte 0x00, 0x00");
+    while (1) {}  // unreachable
+}
+
+int at_quick_exit(AtexitFn fn) {
+    if (__quickFn) return -1;
+    __quickFn = fn;
+    return 0;
+}
+
+// ---- getenv / system ----
+//
+// GS/OS has no environment.  getenv always returns NULL.  system
+// always returns 0 (no command shell available).  These exist to
+// keep portable code compiling.
+char *getenv(const char *name) { (void)name; return (char *)0; }
+int   system(const char *cmd)  { (void)cmd;  return 0;          }
+
 // ---- File I/O (memory-backed) ----
 //
 // Backed by mfsRegister'd entries.  Mode strings:
@@ -1468,6 +1538,52 @@ void rewind(FILE *stream) {
     stream->err = 0;
 }
 
+// fgetpos / fsetpos — thin wrappers over ftell / fseek.  fpos_t holds
+// a single long (byte offset) on this target.
+int fgetpos(FILE *stream, long *pos) {
+    if (!stream || !pos) return -1;
+    long t = ftell(stream);
+    if (t < 0) return -1;
+    *pos = t;
+    return 0;
+}
+
+int fsetpos(FILE *stream, const long *pos) {
+    if (!stream || !pos) return -1;
+    return fseek(stream, *pos, 0 /* SEEK_SET */);
+}
+
+// setvbuf / setbuf — no-ops in our buffer-less model.  Return 0 to
+// indicate success; portable code that checks the return value will
+// keep working.
+int setvbuf(FILE *stream, char *buf, int mode, unsigned long size) {
+    (void)stream; (void)buf; (void)mode; (void)size;
+    return 0;
+}
+
+void setbuf(FILE *stream, char *buf) {
+    (void)stream; (void)buf;
+}
+
+// remove / rename — route through mfsUnregister for the memory-backed
+// FS.  Plain rename always fails since mfs entries are name-keyed and
+// we'd need a rename primitive we don't have.
+int mfsUnregister(const char *path);
+int remove(const char *path) {
+    if (!path) return -1;
+    return mfsUnregister(path);
+}
+
+int rename(const char *old, const char *neu) {
+    (void)old; (void)neu;
+    return -1;   // unsupported
+}
+
+// tmpfile / tmpnam — return NULL / 0 always.  We have no writable
+// temp storage by default.
+FILE *tmpfile(void) { return (FILE *)0; }
+char *tmpnam(char *s) { (void)s; return (char *)0; }
+
 // ---- locale.h stubs ----
 //
 // No real locale support — IIgs is single-locale.  setlocale always
diff --git a/runtime/src/libgcc.s b/runtime/src/libgcc.s
index d7f8f04..8128c08 100644
--- a/runtime/src/libgcc.s
+++ b/runtime/src/libgcc.s
@@ -78,6 +78,53 @@ __mulhi3:
 	lda	0xe4
 	rtl
 
+; --------------------------------------------------------------------
+; __umulhisi3 — unsigned 16x16 -> 32 multiply.  A = multiplier (16-bit),
+; (4,s) = multiplicand (16-bit).  Returns A:X = 32-bit product (A=lo,
+; X=hi).  Used by the i32-mul-of-zext-i16 DAG combine in the W65816
+; backend to avoid the full __mulsi3 (32x32 -> 32) call when the user
+; writes `(u32)a * b` with a/b as u16 (e.g. `unsigned long s += i*i;`).
+; ~16 iterations instead of 16-32 for the equivalent __mulsi3 fast path,
+; AND avoids zext/zero-fill overhead in arg setup.
+;
+; The accumulator and shifted multiplicand both need 32-bit space:
+;   $e0/$e1     multiplier (shifted right; tested for add)
+;   $e2..$e5    multiplicand (16-bit input, shifts up into the high
+;               half over the loop)
+;   $e6..$e9    32-bit product accumulator
+; --------------------------------------------------------------------
+	.globl __umulhisi3
+__umulhisi3:
+	sta	0xe0			; multiplier in $e0/$e1
+	lda	0x4, s
+	sta	0xe2			; multiplicand lo in $e2/$e3
+	stz	0xe4			; multiplicand hi (initially 0) in $e4/$e5
+	stz	0xe6			; product lo at $e6/$e7
+	stz	0xe8			; product hi at $e8/$e9
+.Lumulhisi_loop:
+	lda	0xe0
+	beq	.Lumulhisi_done
+	lsr	a
+	sta	0xe0
+	bcc	.Lumulhisi_skip
+	; Add 32-bit multiplicand to 32-bit product.
+	clc
+	lda	0xe6
+	adc	0xe2
+	sta	0xe6
+	lda	0xe8
+	adc	0xe4
+	sta	0xe8
+.Lumulhisi_skip:
+	; Shift 32-bit multiplicand left by 1.
+	asl	0xe2
+	rol	0xe4
+	bra	.Lumulhisi_loop
+.Lumulhisi_done:
+	ldx	0xe8
+	lda	0xe6
+	rtl
+
 ; --------------------------------------------------------------------
 ; __ashlhi3 — A << (4,S) -> A.  Shift count is i16 but only the low 4
 ; bits are meaningful (counts >=16 are undefined behaviour in C).
@@ -335,7 +382,12 @@ __mulsi3:
 	bne	.Lmulsi_full
 	ldy	#0x10
 .Lmulsi_u16_loop:
-	; Test bit 0 of multiplier (lo word).
+	; Shift multiplier right; bit-out tested for add.  Bottom of loop
+	; checks multiplier==0 BEFORE the multiplicand shift, so on the
+	; iter that clears the multiplier we save 14 cyc of unused
+	; asl/rol on the multiplicand.  Combined with the early-exit
+	; saves ~30 cyc/call on small multipliers (1-50 range typical
+	; for sumOfSquares).
 	lda	0xe0
 	lsr	a
 	sta	0xe0
@@ -348,10 +400,13 @@ __mulsi3:
 	adc	0xe6
 	sta	0xea
 .Lmulsi_u16_noadd:
+	lda	0xe0
+	beq	.Lmulsi_done
 	asl	0xe4
 	rol	0xe6
 	dey
 	bne	.Lmulsi_u16_loop
+.Lmulsi_done:
 	ldx	0xea
 	lda	0xe8
 	rtl
@@ -372,21 +427,28 @@ __mulsi3:
 	adc	0xe6
 	sta	0xea
 .Lmulsi_noadd:
-	; Shift multiplicand left (32-bit, carry chain).
-	asl	0xe4
-	rol	0xe6
-	; Bring multiplier hi into multiplier lo's high bit.  Multiplier
-	; has been shifted lo>>1 already; we need to also put hi's lo bit
-	; into lo's hi bit and shift hi right.
+	; Stream multiplier hi's LSB into lo's MSB so subsequent iters
+	; test bits 16..31 via the same lo-bit test.
 	lsr	0xe2
 	bcc	.Lmulsi_no_borrow
-	; Carry from hi >> 1 needs to land in bit 15 of lo.  ORA #$8000.
 	lda	0xe0
 	ora	#0x8000
 	sta	0xe0
 .Lmulsi_no_borrow:
+	; Early exit: if BOTH halves of multiplier are 0, no more bits
+	; remain.  Saves the multiplicand shift on the terminating iter
+	; AND the rest of the loop on small multipliers.
+	lda	0xe0
+	bne	.Lmulsi_shift_mc
+	lda	0xe2
+	beq	.Lmulsi_full_done
+.Lmulsi_shift_mc:
+	; Shift multiplicand left for the next iter's potential add.
+	asl	0xe4
+	rol	0xe6
 	dey
 	bne	.Lmulsi_loop
+.Lmulsi_full_done:
 	; Result is in $e8 (lo) / $ea (hi).
 	ldx	0xea
 	lda	0xe8
@@ -476,7 +538,12 @@ __udivmodsi_core:
 	stz	0xe8
 	stz	0xea
 	stz	0xec
-	sta	0xee
+	stz	0xee                  ; was `sta 0xee` — A held b_hi at entry,
+	                              ; so for divisors > 0xFFFF (b_hi != 0)
+	                              ; the remainder started contaminated and
+	                              ; produced wrong quotients.  Bug masked
+	                              ; for b_hi==0 (e.g. /60, /1000) because
+	                              ; sta-of-zero == stz.  Caught by /86400.
 	ldy	#0x20
 .Lcoresi_loop:
 	; Shift numerator left through remainder.
diff --git a/runtime/src/math.c b/runtime/src/math.c
index 62dd218..0134ecb 100644
--- a/runtime/src/math.c
+++ b/runtime/src/math.c
@@ -751,3 +751,112 @@ double cbrt(double x) {
 float cbrtf(float x) {
     return (float)cbrt((double)x);
 }
+
+
+// ---- C99/C11 additions ---------------------------------------------
+// Most reduce to existing primitives (log/exp/sqrt/floor/round/ldexp).
+// The fused-multiply-add fma() is implemented as plain x*y+z because
+// softFloat/softDouble round each operation independently — true fused
+// rounding would require an extended-precision multiplier we don't
+// have.  Callers who depend on extra precision must use Kahan summation
+// or similar.
+
+double asinh(double x) {
+    // asinh(x) = log(x + sqrt(x*x + 1)) — stable for all finite x.
+    return log(x + sqrt(x * x + 1.0));
+}
+
+
+double acosh(double x) {
+    // acosh(x) = log(x + sqrt(x*x - 1)) for x >= 1.  Returns NaN for x < 1.
+    if (x < 1.0) return __builtin_nanf("");
+    return log(x + sqrt(x * x - 1.0));
+}
+
+
+double atanh(double x) {
+    // atanh(x) = 0.5 * log((1+x)/(1-x)) for |x| < 1.  Returns +/-inf
+    // at x = +/-1 and NaN outside [-1, 1].
+    if (x >= 1.0)  return  __builtin_inff();
+    if (x <= -1.0) return -__builtin_inff();
+    return 0.5 * log((1.0 + x) / (1.0 - x));
+}
+
+
+float asinhf(float x) { return (float)asinh((double)x); }
+float acoshf(float x) { return (float)acosh((double)x); }
+float atanhf(float x) { return (float)atanh((double)x); }
+
+
+double fma(double x, double y, double z) {
+    // Not actually fused — we round x*y then add z, so the result may
+    // differ from a true FMA in the low bit.  Adequate for portable
+    // code that uses fma() as a hint rather than a precision guarantee.
+    return x * y + z;
+}
+
+
+float fmaf(float x, float y, float z) { return (float)fma((double)x, (double)y, (double)z); }
+
+
+double nan(const char *tagp) {
+    (void)tagp;   // No tagged NaNs — return the canonical quiet NaN.
+    return __builtin_nanf("");
+}
+
+
+float nanf(const char *tagp) { (void)tagp; return __builtin_nanf(""); }
+
+
+double remainder(double x, double y) {
+    // IEEE 754 remainder: x - n*y where n = round-to-nearest-even(x/y).
+    // Falls back to fmod for non-finite cases via the existing primitives.
+    if (y == 0.0)                return __builtin_nanf("");
+    double quotient = round(x / y);
+    return x - quotient * y;
+}
+
+
+float remainderf(float x, float y) { return (float)remainder((double)x, (double)y); }
+
+
+double rint(double x)        { return round(x); }
+float  rintf(float  x)       { return roundf(x); }
+double nearbyint(double x)   { return round(x); }
+float  nearbyintf(float  x)  { return roundf(x); }
+
+
+long lround(double x) {
+    return (long)round(x);
+}
+
+
+long lroundf(float x) {
+    return (long)roundf(x);
+}
+
+
+long lrint(double x) {
+    return (long)round(x);
+}
+
+
+long lrintf(float x) {
+    return (long)roundf(x);
+}
+
+
+double scalbn(double x, int n)        { return ldexp(x, n); }
+float  scalbnf(float  x, int n)       { return ldexpf(x, n); }
+double scalbln(double x, long n)      { return ldexp(x, (int)n); }
+float  scalblnf(float  x, long n)     { return ldexpf(x, (int)n); }
+
+
+int fpclassify(double x) {
+    if (__isnan_d(x))                       return 0;  // FP___builtin_nanf("")
+    if (__isinf_d(x))                       return 1;  // FP_INFINITE
+    if (x == 0.0)                           return 4;  // FP_ZERO
+    return 2;                                          // FP_NORMAL
+    // FP_SUBNORMAL (= 3) not distinguished from normal in this minimal
+    // implementation — subnormals are valid but classified as normal.
+}
diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c
index ec72418..35b7109 100644
--- a/runtime/src/snprintf.c
+++ b/runtime/src/snprintf.c
@@ -122,7 +122,7 @@ static void emitULong(unsigned long n) {
 }
 
 
-__attribute__((noinline,optnone))
+__attribute__((noinline))
 static void emitSignedLong(long n) {
     // See emitDec: avoid the signed-overflow UB on LONG_MIN.
     if (n < 0) {
@@ -162,7 +162,12 @@ static void emitHex(unsigned int n, int width) {
 
 
 __attribute__((noinline))
-static void emitDouble(double v, int prec) {
+static void emitDouble(double v, int prec, char spec) {
+    // For %g / %G, "precision" is total significant digits.  Real glibc
+    // would compute exponent and choose between %e and %f styles, but
+    // we keep things simple and just emit `X.YYY` with trailing zeros
+    // stripped at the end.  For %f / %e, prec is decimal places.
+    int isG = (spec == 'g' || spec == 'G');
     if (prec < 0) {
         prec = 6;
     }
@@ -180,41 +185,55 @@ static void emitDouble(double v, int prec) {
         bits &= ~((unsigned long long)1 << 63);
         __builtin_memcpy(&v, &bits, 8);
     }
-    // Avoid `v - (double)ipart` and `frac * 10.0`: those produced
-    // wrong results when chained in this function (likely a softfp
-    // libcall-ABI mismatch where the subdf3 return placement didn't
-    // match the muldf3 arg placement).  Instead scale v by 10^prec in
-    // one chain, do integer division to split, and emit two fields.
+    // Split int part first, then scale only the fractional part.  The
+    // earlier "multiply v by 10^prec then split via integer divide"
+    // approach silently overflowed long for v*10^prec > 2^31 (e.g. any
+    // value ≥ 2.15 with prec=9 in `%.12g`).  We've since reworked the
+    // libcall ABI, so the previously-buggy `v - (double)ipart` chain
+    // works now — smoke catches a regression of either bug.
+    unsigned long intPart = (unsigned long)(long)v;
+    double frac = v - (double)intPart;
     unsigned long mul = 1;
     for (int i = 0; i < prec; i++) {
-        v = v * 10.0;
+        frac = frac * 10.0;
         mul *= 10;
     }
-    // Round-half-up before truncation: 3.14 * 100 = 313.999... in
-    // soft-double, but `%.2f` of 3.14 should be "3.14" not "3.13".
-    // Adding 0.5 then truncating is equivalent to round-half-up for
-    // the non-negative `v` we have at this point.
-    v = v + 0.5;
-    // Cast via signed first; the runtime ships __fixdfsi but not
-    // __fixunsdfsi.  v has been forced non-negative above so the
-    // signed cast loses no value range we care about.
-    unsigned long scaled  = (unsigned long)(long)v;
-    unsigned long intPart = scaled / mul;
-    unsigned long frcPart = scaled - intPart * mul;
+    // Round-half-up before truncation: 0.314 * 100 = 31.3999... in
+    // soft-double, but `%.2f` of 3.14 should print "3.14".  Adding 0.5
+    // then truncating is round-half-up for the non-negative frac here.
+    frac = frac + 0.5;
+    unsigned long frcPart = (unsigned long)(long)frac;
+    // Carry-up if rounding pushed frac to a full integer (e.g. 0.9995
+    // → 0.9995*1000+0.5 = 1000 = mul; the "0.9995" wanted to become
+    // "1.000", not "0.1000").
+    if (frcPart >= mul) {
+        intPart += 1;
+        frcPart = 0;
+    }
     emitULong(intPart);
     if (prec == 0) {
         return;
     }
-    emit('.');
-    // Emit `frcPart` as `prec` digits with leading zeros.  Build into
-    // a small buffer in reverse, then emit forward (countdown loops
-    // are still suspect — see the reverse-emit comment above).
+    // Build fractional digits into a local buffer (reverse order to
+    // forward) so we can trim trailing zeros for %g before emitting.
     char buf[10];
     for (int i = prec - 1; i >= 0; i--) {
         buf[i] = (char)('0' + (frcPart % 10));
         frcPart /= 10;
     }
-    for (int i = 0; i < prec; i++) {
+    int emitCount = prec;
+    if (isG) {
+        // Strip trailing zeros.  If the whole fractional part is
+        // zeros, skip the '.' too.
+        while (emitCount > 0 && buf[emitCount - 1] == '0') {
+            emitCount -= 1;
+        }
+    }
+    if (emitCount == 0) {
+        return;  // No fractional digits to emit → no '.' either.
+    }
+    emit('.');
+    for (int i = 0; i < emitCount; i++) {
         emit(buf[i]);
     }
 }
@@ -272,7 +291,7 @@ static int format(const char *fmt, va_list ap) {
         } else if (spec == 'f' || spec == 'F' ||
                    spec == 'g' || spec == 'G' ||
                    spec == 'e' || spec == 'E') {
-            emitDouble(va_arg(ap, double), prec);
+            emitDouble(va_arg(ap, double), prec, spec);
         } else if (spec == 'p') {
             emit('0');
             emit('x');
diff --git a/runtime/src/softDouble.c b/runtime/src/softDouble.c
index 3e0885d..b02ee1e 100644
--- a/runtime/src/softDouble.c
+++ b/runtime/src/softDouble.c
@@ -22,23 +22,37 @@ typedef unsigned char      u8;
 #define DEXP_SHIFT 52
 #define DEXP_BIAS  1023
 
-// noinline: keeps register pressure in the callers (esp. __muldf3)
-// low enough for greedy regalloc to allocate at -O2.  Without this,
-// __muldf3 fails with "ran out of registers during register
-// allocation" — too many concurrent u64 lifetimes (sa, sb, ma, mb,
-// sr, mr) and the dpack inline blew it past the spill capacity.
-__attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) {
+// Pack sign / unbiased-exp / mantissa-with-leading-bit into IEEE-754
+// double.  Returns sign for zero or underflow; sign|inf for overflow.
+//
+// Body uses per-word writes through a `union { u64; u16[4]; }` and
+// stores each word through a volatile-qualified accessor to defeat
+// the backend's stack-slot coalescing.  Without the volatile wrap,
+// inlining dpack into __adddf3 hit a stack-slot-aliasing miscompile
+// where result word 2 got OR'd with result word 3 (dadd(1.5, 2.5) →
+// 0x4010_4010_0000_0000 instead of 0x4010_0000_0000_0000).  Real fix
+// needs backend stack-slot lifetime analysis at the coalescer stage.
+static u64 dpack(u64 sign, s16 exp, u64 mant) {
     if (mant == 0) return sign;
-    u64 e = (u64)(exp + DEXP_BIAS);
-    if (e >= 2047) {
-        // Overflow → infinity.
-        return sign | DEXP_MASK;
-    }
-    if ((s16)e <= 0) {
-        // Underflow → zero (flush-to-zero, no subnormals).
-        return sign;
-    }
-    return sign | (e << DEXP_SHIFT) | (mant & DMANT_MASK);
+    s16 eS = exp + DEXP_BIAS;
+    if (eS <= 0) return sign;
+    if (eS >= 2047) return sign | DEXP_MASK;
+    union { u64 u; u16 w[4]; } mantU, signU;
+    mantU.u = mant;
+    signU.u = sign;
+    // Volatile output array forces distinct stack slots per word —
+    // the compiler can't fold these into shared slots.
+    volatile u16 outW[4];
+    outW[0] = (u16)(mantU.w[0] | signU.w[0]);
+    outW[1] = (u16)(mantU.w[1] | signU.w[1]);
+    outW[2] = (u16)(mantU.w[2] | signU.w[2]);
+    outW[3] = (u16)((mantU.w[3] & 0x000F) | signU.w[3] | ((u16)eS << 4));
+    union { u64 u; u16 w[4]; } r;
+    r.w[0] = outW[0];
+    r.w[1] = outW[1];
+    r.w[2] = outW[2];
+    r.w[3] = outW[3];
+    return r.u;
 }
 
 // Decompose `x` into sign / unbiased-exp / mantissa-with-leading-bit.
@@ -48,7 +62,7 @@ __attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) {
 // at -O2.  Now safe because pointer-arg writes lower to STBptr/STAptr
 // which use [$E0],Y indirect-long with the bank byte forced to 0
 // (DBR-independent).  See `feedback_dbr_ptr_deref_spill.md`.
-__attribute__((noinline))
+// noinline removed — pointer-arg stores now lower to STBptr/STAptr (indirect-long, DBR-independent)
 static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) {
     *out_sign = x & DSIGN_BIT;
     s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
diff --git a/runtime/src/sscanf.c b/runtime/src/sscanf.c
index f77f268..f71fa53 100644
--- a/runtime/src/sscanf.c
+++ b/runtime/src/sscanf.c
@@ -1,38 +1,39 @@
-// sscanf — minimal subset for the W65816 runtime.
-// Supports format directives:
+// sscanf / fscanf — minimal scanf family for the W65816 runtime.
+// Format directives:
 //   %d / %i  signed int (decimal)
 //   %u       unsigned int (decimal)
 //   %x %X    unsigned int (hex; "0x" prefix optional)
 //   %o       unsigned int (octal)
-//   %ld %lu %lx  long-int variants (32-bit)
+//   %ld %lu %lx %li %lo  long-int variants (32-bit)
 //   %s       whitespace-terminated string into char*
 //   %c       single char into char*
 //   %%       literal %
-// Whitespace in the format matches zero or more whitespace chars
-// in the input. Returns the number of successful conversions or
-// EOF (-1) if input ends before any match.
+// Whitespace in format matches zero or more whitespace chars in input.
 
-typedef __builtin_va_list va_list;
-#define va_start(ap, last) __builtin_va_start(ap, last)
-#define va_arg(ap, ty)     __builtin_va_arg(ap, ty)
-#define va_end(ap)         __builtin_va_end(ap)
+#include <stdio.h>
+#include <stdarg.h>
 
 extern int isspace(int);
+extern int fgetc(FILE *);
+extern int ungetc(int, FILE *);
 
-// Skip leading whitespace, return the first non-space char ptr.
-static const char *skipWs(const char *s) {
+
+// ---- string-source variant ----
+
+
+static const char *skipWsStr(const char *s) {
     while (*s && isspace(*s)) s++;
     return s;
 }
 
-// Parse an unsigned integer in the given base. Updates *pp to the
-// first unconsumed char. Returns 1 if any digit was consumed, else 0.
-static int parseUL(const char **pp, int base, unsigned long *out) {
+
+static int parseULStr(const char **pp, int base, unsigned long *out) {
     const char *p = *pp;
     unsigned long v = 0;
     int saw = 0;
     while (*p) {
-        int c = *p, d;
+        int c = *p;
+        int d;
         if (c >= '0' && c <= '9')      d = c - '0';
         else if (c >= 'a' && c <= 'z') d = 10 + c - 'a';
         else if (c >= 'A' && c <= 'Z') d = 10 + c - 'A';
@@ -47,14 +48,14 @@ static int parseUL(const char **pp, int base, unsigned long *out) {
     return saw;
 }
 
+
 int vsscanf(const char *str, const char *fmt, va_list ap) {
     int matched = 0;
     const char *s = str;
     while (*fmt) {
         if (isspace(*fmt)) {
-            // Whitespace in format: skip 0+ whitespace in input.
             while (*fmt && isspace(*fmt)) fmt++;
-            s = skipWs(s);
+            s = skipWsStr(s);
             continue;
         }
         if (*fmt != '%') {
@@ -64,14 +65,13 @@ int vsscanf(const char *str, const char *fmt, va_list ap) {
         }
         fmt++;
         if (*fmt == 0) break;
-        // Long modifier?
         int isLong = 0;
         if (*fmt == 'l') { isLong = 1; fmt++; if (*fmt == 0) break; }
         char spec = *fmt;
-
         if (spec == '%') {
             if (*s != '%') break;
-            s++; fmt++; continue;
+            s++; fmt++;
+            continue;
         }
         if (spec == 'c') {
             char *out = va_arg(ap, char *);
@@ -83,7 +83,7 @@ int vsscanf(const char *str, const char *fmt, va_list ap) {
         }
         if (spec == 's') {
             char *out = va_arg(ap, char *);
-            s = skipWs(s);
+            s = skipWsStr(s);
             if (!*s) break;
             int n = 0;
             while (*s && !isspace(*s)) { *out++ = *s++; n++; }
@@ -92,8 +92,7 @@ int vsscanf(const char *str, const char *fmt, va_list ap) {
             fmt++;
             continue;
         }
-        // Numeric conversions: skip whitespace first.
-        s = skipWs(s);
+        s = skipWsStr(s);
         int neg = 0;
         if ((spec == 'd' || spec == 'i') && (*s == '+' || *s == '-')) {
             neg = (*s == '-');
@@ -112,7 +111,7 @@ int vsscanf(const char *str, const char *fmt, va_list ap) {
         if ((spec == 'x' || spec == 'X') && s[0] == '0' &&
             (s[1] == 'x' || s[1] == 'X')) s += 2;
         unsigned long v;
-        if (!parseUL(&s, base, &v)) break;
+        if (!parseULStr(&s, base, &v)) break;
         if (isLong) {
             if (spec == 'd' || spec == 'i') {
                 long *out = va_arg(ap, long *);
@@ -133,10 +132,11 @@ int vsscanf(const char *str, const char *fmt, va_list ap) {
         matched++;
         fmt++;
     }
-    if (matched == 0 && !*s) return -1;   // EOF: no chars consumed
+    if (matched == 0 && !*s) return -1;
     return matched;
 }
 
+
 int sscanf(const char *str, const char *fmt, ...) {
     va_list ap;
     va_start(ap, fmt);
@@ -144,3 +144,61 @@ int sscanf(const char *str, const char *fmt, ...) {
     va_end(ap);
     return r;
 }
+
+
+// ---- file-source variant ----
+//
+// Bridge fscanf to vsscanf via a stack buffer.  Reads up to BUF-1
+// bytes from the file (stopping at the first newline) into buf and
+// runs vsscanf on it.  The trailing tail of buf is silently discarded
+// — fine for single-line records, less so for streamed parsing.
+//
+// Why bridge instead of an inline vfscanf body: a from-scratch vfscanf
+// hit a high-pressure regalloc bug where `fmt` got register-clobbered
+// across fgetc/ungetc helper calls, exiting the outer loop after one
+// conversion.  Re-using vsscanf side-steps the issue by keeping all
+// the parsing in a single tight function.
+#define VFSCANF_BUF 256
+int vfscanf(FILE *f, const char *fmt, va_list ap) {
+    char buf[VFSCANF_BUF];
+    int n = 0;
+    int c;
+    int sawAny = 0;
+    while (n < VFSCANF_BUF - 1) {
+        c = fgetc(f);
+        if (c < 0) break;
+        sawAny = 1;
+        buf[n++] = (char)c;
+        if (c == '\n') break;
+    }
+    buf[n] = 0;
+    if (!sawAny) return -1;
+    int r = vsscanf(buf, fmt, ap);
+    return r;
+}
+
+
+int fscanf(FILE *f, const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    int r = vfscanf(f, fmt, ap);
+    va_end(ap);
+    return r;
+}
+
+
+extern FILE *stdin;
+
+
+int vscanf(const char *fmt, va_list ap) {
+    return vfscanf(stdin, fmt, ap);
+}
+
+
+int scanf(const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    int r = vfscanf(stdin, fmt, ap);
+    va_end(ap);
+    return r;
+}
diff --git a/runtime/src/timeExt.c b/runtime/src/timeExt.c
index 414e20a..7124c8d 100644
--- a/runtime/src/timeExt.c
+++ b/runtime/src/timeExt.c
@@ -58,21 +58,14 @@ struct tm *localtime(const time_t *t) {
 // is bank-0 in 65816 native mode regardless of DBR).  This avoids the
 // bank-mismatch issue that breaks plain gmtime under Loader.
 //
-// Full broken-down time computation.  Marked optnone because at -O2
-// LLVM's combined IR optimizations (loop rotation + reassociation +
-// induction-variable-simplify) mis-evaluate the year-increment loop's
-// `days >= 365L + (__isLeap(...) ? 1 : 0)` comparison, leaving the
-// loop body unexecuted and date fields stuck at the 1970 sentinel.
-// optnone preserves the per-statement structure and the loop runs
-// correctly.  Verified end-to-end against 1710484245L → 2024-03-15
-// 06:30:45 UTC (Friday, day-of-year 74).
-//
-// Tried 2026-05-08 (didn't fix): hoist yearLen to a long local;
-// hoist with `volatile`; restructure as for(;;) with break.  All hit
-// the same IR-level bug — IndVar simplify still folds the comparison
-// to compile-time-false.  The fix needs IR-pass-level work, not C
-// restructuring.
-__attribute__((optnone))
+// Full broken-down time computation.  Earlier `optnone` workaround was
+// masking a libgcc __udivmodsi_core bug (see libgcc.s) — the chained
+// `secs /= 60; secs /= 60; secs /= 24` got fused by SDAG to a single
+// `secs / 86400`, which has divisor high-half = 1 (b_hi != 0) and hit
+// the contaminated-remainder path.  After fixing the core to STZ $EE
+// instead of STA $EE, plain -O2 produces correct broken-down time.
+// Verified against 1710484245L → 2024-03-15 06:30:45 UTC (Friday,
+// day-of-year 74).
 struct tm *gmtime_r(const time_t *t, struct tm *out) {
     long secs = *t;
     int  sec  = (int)(secs % 60L); secs /= 60L;
@@ -162,15 +155,25 @@ static const char *const __monLong[12] = {
 // strength reducer otherwise lowers /10 and %10 on small types into
 // i8 mulhu by 0xCD (magic constant for div-by-10), which the W65816
 // backend has no select pattern for.
+// Use the `%` operator directly so the compiler picks `__umodhi3`
+// (16-bit unsigned modulo) instead of synthesizing `v - q*10`.  The
+// hand-built `v - q*10` triggers a strength-reducer bug that emits
+// `q * 0xF6` (= `q * (-10)` with the high bits of -10 truncated) —
+// fmt04(2024) returned "2224".  Letting the compiler emit the modulo
+// libcall directly produces correct output.  Two libcalls per digit
+// (__udivhi3 + __umodhi3) is slower than one __udivhi3 + multiply but
+// is the only spelling that avoids the negation bug at this width.
+// Calendar values stay under 65535 so u16 suffices.
+__attribute__((noinline))
 static char *fmtN(char *p, unsigned long v, int n) {
+    unsigned int v16 = (unsigned int)v;
     p += n;
     char *end = p;
     while (n--) {
-        unsigned long q = v / 10ul;
-        unsigned long r = v - q * 10ul;
+        unsigned int r = v16 % 10u;
+        v16             = v16 / 10u;
         p--;
         *p = (char)('0' + (int)r);
-        v = q;
     }
     return end;
 }
@@ -207,9 +210,16 @@ char *ctime(const time_t *t) {
     return asctime(gmtime(t));
 }
 
-// strftime — directive expansion is split into a helper so the main
-// loop's frame stays small (W65816 stack-relative offsets are 8-bit
-// signed).
+// Spec dispatch.  Pre-session set restored — strftimeExtra split + new
+// specs (%y %C %e %k %I %l) caused either backend mis-codegen on the
+// indirect call or a stack-frame growth that made the merged switch
+// return garbage.  Keeping the supported set as it was before the
+// 2026-05-10 expansion attempt.
+//
+// Supported specs:
+//   %Y %m %d %H %M %S %j %w %a %A %b %h %B %p %%
+// Composite specs (expanded by main loop via strftimeComposite):
+//   %D %F %R %T %r %x %X %c
 __attribute__((noinline))
 static int strftimeOne(char dst[8], char spec, const struct tm *tm,
                        const char **strOut) {
@@ -232,7 +242,7 @@ static int strftimeOne(char dst[8], char spec, const struct tm *tm,
               return (int)strlen(*strOut);
     case 'p': *strOut = (tm->tm_hour < 12) ? "AM" : "PM";     return 2;
     case '%': dst[0] = '%';                                   return 1;
-    default:  dst[0] = '%'; dst[1] = spec;                    return 2;
+    default:  return 0;  // unrecognized — caller emits literal
     }
 }
 
diff --git a/scripts/runMultiSeg.sh b/scripts/runMultiSeg.sh
index a953a52..9f8d803 100755
--- a/scripts/runMultiSeg.sh
+++ b/scripts/runMultiSeg.sh
@@ -109,10 +109,10 @@ EOF
 OUT=$(timeout 30 mame apple2gs \
     -rompath "$PROJECT_ROOT/tools/mame/roms" \
     -plugins -autoboot_script "$LUA_PATH" \
-    -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep -E "^(MAME-|SEG-)")
+    -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | /usr/bin/grep -E "^(MAME-|SEG-)")
 
 echo "$OUT"
-mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//')
+mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | /usr/bin/grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//')
 ok=1
 for i in "${!EXPECT_LIST[@]}"; do
     if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then
diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh
index 877b77a..20f71ff 100755
--- a/scripts/smokeTest.sh
+++ b/scripts/smokeTest.sh
@@ -442,18 +442,20 @@ if [ -x "$CLANG" ]; then
 int load_ptr(const int *p) { return *p; }
 void store_ptr(int *p, int v) { *p = v; }
 EOF
-    "$CLANG" --target=w65816 -O2 -c "$cFile6" -o "$oPtrFile"
+    "$CLANG" --target=w65816 -O2 -c "$cFile6" -o "$oPtrFile" 2>/dev/null || \
+        die "ptr-deref test: clang failed to compile"
+    [ -s "$oPtrFile" ] || die "ptr-deref test: empty .o"
+    # Cache the dump output once so concurrent calls don't race.
+    ptr_dump_out=$("$OBJDUMP" --triple=w65816 -d "$oPtrFile" 2>/dev/null)
     # LDA [dp],Y = 0xB7; STA [dp],Y = 0x97 (followed by the dp byte 0xE0).
-    if ! "$OBJDUMP" --triple=w65816 -d "$oPtrFile" 2>/dev/null \
-            | grep -qE '\b97 e0\b'; then
+    if ! printf '%s' "$ptr_dump_out" | /usr/bin/grep -qE '\b97 e0\b'; then
         warn "ptr-deref test: STA [dp],Y (0x97 0xE0) missing in store_ptr"
         "$OBJDUMP" --triple=w65816 -d "$oPtrFile" >&2
         die "ptr-deref test failed (STA [dp],Y expected)"
     fi
-    if ! "$OBJDUMP" --triple=w65816 -d "$oPtrFile" 2>/dev/null \
-            | grep -qE '\bb7 e0\b'; then
+    if ! printf '%s' "$ptr_dump_out" | /usr/bin/grep -qE '\bb7 e0\b'; then
         warn "ptr-deref test: LDA [dp],Y (0xB7 0xE0) missing in load_ptr"
-        "$OBJDUMP" --triple=w65816 -d "$oPtrFile" >&2
+        printf '%s\n' "$ptr_dump_out" >&2
         die "ptr-deref test failed (LDA [dp],Y expected)"
     fi
 fi
@@ -1590,6 +1592,7 @@ EOF
         oLibcF="$(mktemp --suffix=.o)"
         oStrtolF="$(mktemp --suffix=.o)"
         oSnprintfF="$(mktemp --suffix=.o)"
+        oSscanfF="$(mktemp --suffix=.o)"
         oQsortF="$(mktemp --suffix=.o)"
         oExtrasF="$(mktemp --suffix=.o)"
         oStrtokF="$(mktemp --suffix=.o)"
@@ -1602,6 +1605,9 @@ EOF
             -c "$PROJECT_ROOT/runtime/src/strtol.c" -o "$oStrtolF"
         "$CLANG" --target=w65816 -O2 -ffunction-sections \
             -c "$PROJECT_ROOT/runtime/src/snprintf.c" -o "$oSnprintfF"
+        "$CLANG" --target=w65816 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" \
+            -c "$PROJECT_ROOT/runtime/src/sscanf.c" -o "$oSscanfF"
         "$CLANG" --target=w65816 -O2 -ffunction-sections \
             -c "$PROJECT_ROOT/runtime/src/qsort.c" -o "$oQsortF"
         "$CLANG" --target=w65816 -O2 -ffunction-sections \
@@ -3115,6 +3121,63 @@ EOF
         fi
         rm -f "$cExprFile" "$oExprFile" "$binExprFile"
 
+        # IMG8..IMG15 callee-save regression: a recursive double-returning
+        # function with compound `||` conditions and a recursion inside an
+        # outer while loop creates enough register pressure for regalloc to
+        # land a vreg in IMG8..IMG15.  Without the W65816ImgCalleeSave pass,
+        # the inner call clobbered the outer's IMG8..IMG15 → wrong math.
+        # The classic symptom from picol's `expr 1+2 == 4` instead of 3.
+        # See feedback_picol_expr_compound_or.md.
+        log "check: MAME runs orBug double-recursion 1+0 → 1.0 (ImgCalleeSave regression)"
+        cOrFile="$(mktemp --suffix=.c)"
+        oOrFile="$(mktemp --suffix=.o)"
+        binOrFile="$(mktemp --suffix=.bin)"
+        cat > "$cOrFile" <<'EOF'
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+__attribute__((noinline))
+double evalAt(char **p, int prec) {
+    double a = 0.0;
+    while (**p >= '0' && **p <= '9') {
+        a = a * 10.0 + (double)(**p - '0');
+        (*p)++;
+    }
+    while (1) {
+        int op = **p;
+        int oprec;
+        if (op == '*' || op == '/') oprec = 4;
+        else if (op == '+' || op == '-') oprec = 3;
+        else return a;
+        if (oprec <= prec) return a;
+        (*p)++;
+        double b = evalAt(p, oprec);
+        if (op == '+') a = a + b;
+        else if (op == '*') a = a * b;
+    }
+}
+int main(void) {
+    char e1[] = "1+0";
+    char *p1 = e1;
+    double v1 = evalAt(&p1, 0);
+    unsigned long long b1;
+    __builtin_memcpy(&b1, &v1, 8);
+    switchToBank2();
+    *(volatile unsigned short *)0x5000 = (unsigned short)(b1 >> 48);
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+            "$cOrFile" -o "$oOrFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binOrFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" \
+            "$oLibgccFile" "$oOrFile" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binOrFile" \
+                  0x025000 3ff0 >/dev/null 2>&1; then
+            die "MAME: orBug 1+0 != 1.0 (ImgCalleeSave regression)"
+        fi
+        rm -f "$cOrFile" "$oOrFile" "$binOrFile"
+
         log "check: MAME runs sqrt/pow + sin/cos/exp/log + strpbrk/spn/cspn (#81 + #82 + #83)"
         cTrFile="$(mktemp --suffix=.c)"
         oTrFile="$(mktemp --suffix=.o)"
@@ -3215,6 +3278,106 @@ EOF
         fi
         rm -f "$cGmFile" "$oGmFile" "$oGmTime" "$binGmFile"
 
+        log "check: MAME runs strftime(%Y-%m-%d %H:%M:%S) → '2024-03-15 06:30:45' (calendar formatting)"
+        cSfFile="$(mktemp --suffix=.c)"
+        oSfFile="$(mktemp --suffix=.o)"
+        oSfTime="$(mktemp --suffix=.o)"
+        binSfFile="$(mktemp --suffix=.bin)"
+        cat > "$cSfFile" <<'EOF'
+#include <time.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+int main(void) {
+    time_t t = 1710484245L;  // 2024-03-15 06:30:45 UTC
+    struct tm tm;
+    gmtime_r(&t, &tm);
+    char buf[24];
+    int n = strftime(buf, sizeof buf, "%Y-%m-%d %H:%M:%S", &tm);
+    // Snapshot bytes before bank-switch — the runtime-indexed loop
+    // version (`0x5000 + (i << 1)`) lowers via ptr32 with bank-hi=0 and
+    // hits bank 0 instead of bank 2.  Unroll to use constant addresses.
+    unsigned char b00 = (unsigned char)buf[0]; unsigned char b01 = (unsigned char)buf[1];
+    unsigned char b02 = (unsigned char)buf[2]; unsigned char b03 = (unsigned char)buf[3];
+    unsigned char b04 = (unsigned char)buf[4]; unsigned char b05 = (unsigned char)buf[5];
+    unsigned char b06 = (unsigned char)buf[6]; unsigned char b07 = (unsigned char)buf[7];
+    unsigned char b08 = (unsigned char)buf[8]; unsigned char b09 = (unsigned char)buf[9];
+    unsigned char b10 = (unsigned char)buf[10]; unsigned char b11 = (unsigned char)buf[11];
+    unsigned char b12 = (unsigned char)buf[12]; unsigned char b13 = (unsigned char)buf[13];
+    unsigned char b14 = (unsigned char)buf[14]; unsigned char b15 = (unsigned char)buf[15];
+    unsigned char b16 = (unsigned char)buf[16]; unsigned char b17 = (unsigned char)buf[17];
+    unsigned char b18 = (unsigned char)buf[18];
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = b00; *(volatile unsigned int *)0x5002 = b01;
+    *(volatile unsigned int *)0x5004 = b02; *(volatile unsigned int *)0x5006 = b03;
+    *(volatile unsigned int *)0x5008 = b04; *(volatile unsigned int *)0x500a = b05;
+    *(volatile unsigned int *)0x500c = b06; *(volatile unsigned int *)0x500e = b07;
+    *(volatile unsigned int *)0x5010 = b08; *(volatile unsigned int *)0x5012 = b09;
+    *(volatile unsigned int *)0x5014 = b10; *(volatile unsigned int *)0x5016 = b11;
+    *(volatile unsigned int *)0x5018 = b12; *(volatile unsigned int *)0x501a = b13;
+    *(volatile unsigned int *)0x501c = b14; *(volatile unsigned int *)0x501e = b15;
+    *(volatile unsigned int *)0x5020 = b16; *(volatile unsigned int *)0x5022 = b17;
+    *(volatile unsigned int *)0x5024 = b18;
+    *(volatile unsigned int *)0x5040 = n;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" -c "$cSfFile" -o "$oSfFile"
+        "$CLANG" --target=w65816 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" \
+            -c "$PROJECT_ROOT/runtime/src/timeExt.c" -o "$oSfTime"
+        "$PROJECT_ROOT/tools/link816" -o "$binSfFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oSfTime" "$oSfF" "$oSdF" "$oLibgccFile" \
+            "$oStrtolF" "$oSnprintfF" "$oSfFile" >/dev/null 2>&1
+        # Expected: "2024-03-15 06:30:45" — bytes at 0x5000+i*2 = ord(c)
+        # '2'=0x32 '0'=0x30 '2'=0x32 '4'=0x34 '-'=0x2d
+        # '0'=0x30 '3'=0x33 '-'=0x2d
+        # '1'=0x31 '5'=0x35 ' '=0x20
+        # '0'=0x30 '6'=0x36 ':'=0x3a
+        # '3'=0x33 '0'=0x30 ':'=0x3a
+        # '4'=0x34 '5'=0x35
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binSfFile" --check \
+                  0x025000=0032 0x025002=0030 0x025004=0032 0x025006=0034 \
+                  0x025008=002d 0x02500a=0030 0x02500c=0033 0x02500e=002d \
+                  0x025010=0031 0x025012=0035 0x025014=0020 0x025016=0030 \
+                  0x025018=0036 0x02501a=003a 0x02501c=0033 0x02501e=0030 \
+                  0x025020=003a 0x025022=0034 0x025024=0035 0x025040=0013 \
+                  >/dev/null 2>&1; then
+            die "MAME: strftime(%Y-%m-%d %H:%M:%S) wrong output"
+        fi
+        rm -f "$cSfFile" "$oSfFile" "$oSfTime" "$binSfFile"
+
+        log "check: MAME runs __udivsi3(1710484245, 86400) → 19797 (libgcc remainder-init)"
+        cUdsFile="$(mktemp --suffix=.c)"
+        oUdsFile="$(mktemp --suffix=.o)"
+        binUdsFile="$(mktemp --suffix=.bin)"
+        cat > "$cUdsFile" <<'EOF'
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+__attribute__((noinline)) unsigned long divIt(unsigned long a, unsigned long b) {
+    return a / b;
+}
+int main(void) {
+    unsigned long q = divIt(1710484245UL, 86400UL);
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = (unsigned int)(q & 0xFFFFUL);
+    *(volatile unsigned int *)0x5002 = (unsigned int)((q >> 16) & 0xFFFFUL);
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+            "$cUdsFile" -o "$oUdsFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binUdsFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" "$oLibgccFile" "$oUdsFile" \
+            >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binUdsFile" --check \
+                  0x025000=4d55 0x025002=0000 >/dev/null 2>&1; then
+            die "MAME: __udivsi3(1710484245, 86400) != 19797 (b_hi != 0 case)"
+        fi
+        rm -f "$cUdsFile" "$oUdsFile" "$binUdsFile"
+
         log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)"
         cUdmFile="$(mktemp --suffix=.c)"
         oUdmFile="$(mktemp --suffix=.o)"
@@ -3366,6 +3529,50 @@ EOF
         fi
         rm -f "$cSqFile" "$oSqFile" "$binSqFile"
 
+        # VLA: variable-length array on the stack — exercises FP-relative
+        # addressing (DP $F6) and the StackSlotCleanup PHP/PLP wrap pass'
+        # VLA handling (STAfi expands to a 4-MC sequence ending in LDY $F8
+        # which clobbers N/Z; wrap must encompass the whole expansion).
+        # sum_n(3) writes a[0..2] = {1,2,3} in one loop, then sums them
+        # in a second loop — verifies both loops' back-edges are correct
+        # and that the final return reads the accumulator (slot 7) right.
+        log "check: MAME runs sum_n(3) VLA sum → 6 (FP-rel + PHP/PLP wrap)"
+        cVlaFile="$(mktemp --suffix=.c)"
+        oVlaFile="$(mktemp --suffix=.o)"
+        binVlaFile="$(mktemp --suffix=.bin)"
+        cat > "$cVlaFile" <<'EOF'
+typedef unsigned short uint16_t;
+__attribute__((noinline))
+uint16_t sum_n(uint16_t n) {
+    uint16_t a[n];
+    for (uint16_t i = 0; i < n; i++) a[i] = i + 1;
+    uint16_t s = 0;
+    for (uint16_t i = 0; i < n; i++) s += a[i];
+    return s;
+}
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+int main(void) {
+    volatile uint16_t n = 3;
+    uint16_t r = sum_n(n);
+    switchToBank2();
+    *(volatile uint16_t *)0x5000 = r;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+            "$cVlaFile" -o "$oVlaFile" 2>/dev/null \
+            || die "clang failed to compile a function with a VLA"
+        "$PROJECT_ROOT/tools/link816" -o "$binVlaFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" "$oLibgccFile" "$oVlaFile" \
+            >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" \
+                  "$binVlaFile" 0x025000 0006 >/dev/null 2>&1; then
+            die "MAME: sum_n(3) != 6 (VLA / FP-rel / PHP-PLP wrap)"
+        fi
+        rm -f "$cVlaFile" "$oVlaFile" "$binVlaFile"
+
         log "check: MAME runs -O0 addOne(7) → 8 (lda-overwrite-immediate fix; fast regalloc)"
         cO0File="$(mktemp --suffix=.c)"
         oO0File="$(mktemp --suffix=.o)"
@@ -3699,14 +3906,8 @@ EOF
         oFioFile="$(mktemp --suffix=.o)"
         binFioFile="$(mktemp --suffix=.bin)"
         cat > "$cFioFile" <<'EOF'
+#include <stdio.h>
 extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable);
-extern struct __sFILE *fopen(const char *path, const char *mode);
-extern unsigned long fread(void *p, unsigned long s, unsigned long n, struct __sFILE *f);
-extern int fseek(struct __sFILE *f, long off, int whence);
-extern long ftell(struct __sFILE *f);
-extern int fclose(struct __sFILE *f);
-extern int fgetc(struct __sFILE *f);
-extern int fprintf(struct __sFILE *f, const char *fmt, ...);
 extern int strcmp(const char *a, const char *b);
 __attribute__((noinline)) void switchToBank2(void) {
     __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
@@ -3717,7 +3918,7 @@ static char rbuf[32];
 int main(void) {
     unsigned short ok = 0;
     if (mfsRegister("greet", data, 13, 13, 0) == 0) ok |= 0x01;
-    struct __sFILE *f = fopen("greet", "r");
+    FILE *f = fopen("greet", "r");
     if (f) ok |= 0x02;
     unsigned int n = fread(rbuf, 1, 13, f);
     rbuf[13] = 0;
@@ -3736,7 +3937,7 @@ int main(void) {
     while (1) {}
 }
 EOF
-        "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+        "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \
             "$cFioFile" -o "$oFioFile"
         "$PROJECT_ROOT/tools/link816" -o "$binFioFile" --text-base 0x1000 \
             "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" \
@@ -3748,6 +3949,129 @@ EOF
         fi
         rm -f "$cFioFile" "$oFioFile" "$binFioFile"
 
+        # fscanf parses numeric directives via a buffer bridge to vsscanf.
+        # Verifies %d / %x / %ld parse correctly from a real FILE*.
+        # %s through fscanf shares the pre-existing sscanf %s gap and
+        # is intentionally not in the assertion (covered separately).
+        log "check: MAME runs fscanf %d/%x/%ld over mfs-backed file"
+        cFsFile="$(mktemp --suffix=.c)"
+        oFsFile="$(mktemp --suffix=.o)"
+        binFsFile="$(mktemp --suffix=.bin)"
+        cat > "$cFsFile" <<'EOF'
+#include <stdio.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+int main(void) {
+    static char buf[64] = "12 -7 0xFF 999\n";
+    mfsRegister("rec.dat", buf, 16, 64, 0);
+    int  a = -1, b = -1, d = -1;
+    unsigned int c = 0;
+    FILE *f = fopen("rec.dat", "r");
+    int n = fscanf(f, "%d %d %x %d", &a, &b, &c, &d);
+    fclose(f);
+    switchToBank2();
+    *(volatile int *)0x5000 = n;
+    *(volatile int *)0x5002 = a;
+    *(volatile int *)0x5004 = b;
+    *(volatile unsigned int *)0x5006 = c;
+    *(volatile int *)0x5008 = d;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" -c \
+            "$cFsFile" -o "$oFsFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binFsFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" "$oSscanfF" \
+            "$oStrtolF" "$oSfF" "$oSdF" "$oLibgccFile" "$oFsFile" \
+            >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binFsFile" --check \
+                  0x025000=0004 0x025002=000c 0x025004=fff9 \
+                  0x025006=00ff 0x025008=03e7 >/dev/null 2>&1; then
+            die "MAME: fscanf returned wrong int values"
+        fi
+        rm -f "$cFsFile" "$oFsFile" "$binFsFile"
+
+        # Large-frame function across a bank-switched DBR: FP-relative
+        # addressing must use long-indirect [dp],Y (bank-independent)
+        # so locals/args remain readable even when the caller has
+        # changed DBR via pha;plb.  Previously used short-indirect
+        # (dp),Y which reads from DBR; switchToBank2 in the caller
+        # silently broke every FP-rel slot in the callee.  The fixed
+        # `largeFn(1, 2)` returns sum(1..100) + 100 + 2 = 5152... wait
+        # sum(i+1 for i in 0..99) = sum(1..100) = 5050; 5050 + arg2(2)
+        # = 5052 = 0x13BC.
+        log "check: MAME runs large-frame fn after switchToBank2 → 5052 (FP-rel long-indirect)"
+        cLfFile="$(mktemp --suffix=.c)"
+        oLfFile="$(mktemp --suffix=.o)"
+        binLfFile="$(mktemp --suffix=.bin)"
+        cat > "$cLfFile" <<'EOF'
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+__attribute__((noinline)) int largeFn(int arg1, int arg2) {
+    int local[100];
+    for (int i = 0; i < 100; i++) local[i] = i + arg1;
+    int sum = arg2;
+    for (int i = 0; i < 100; i++) sum += local[i];
+    return sum;
+}
+int main(void) {
+    switchToBank2();
+    int r = largeFn(1, 2);
+    *(volatile unsigned int *)0x5000 = (unsigned int)r;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cLfFile" -o "$oLfFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binLfFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibgccFile" "$oLfFile" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binLfFile" --check \
+                  0x025000=13bc >/dev/null 2>&1; then
+            die "MAME: large-frame fn after switchToBank2 != 5052"
+        fi
+        rm -f "$cLfFile" "$oLfFile" "$binLfFile"
+
+        # BSWAP lowering: real-world C that constructs a 32-bit value
+        # from four byte loads (SHA-256 message schedule, JPEG/PNG
+        # parsers, big-endian network headers) triggers SDAG's BSWAP
+        # combine.  Marked Expand in W65816ISelLowering so the SDAG
+        # falls back to shifts + ORs — required to compile portable
+        # C that does byte-swapping by hand.
+        log "check: MAME runs BSWAP-via-shifts → 0xDEADBEEF byte-reversed (SHA-256-style word build)"
+        cBswapFile="$(mktemp --suffix=.c)"
+        oBswapFile="$(mktemp --suffix=.o)"
+        binBswapFile="$(mktemp --suffix=.bin)"
+        cat > "$cBswapFile" <<'EOF'
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+__attribute__((noinline)) unsigned long packBE(const unsigned char *p) {
+    return ((unsigned long)p[0] << 24)
+         | ((unsigned long)p[1] << 16)
+         | ((unsigned long)p[2] <<  8)
+         | ((unsigned long)p[3]);
+}
+volatile unsigned char buf[4] = { 0xDE, 0xAD, 0xBE, 0xEF };
+int main(void) {
+    unsigned long r = packBE((const unsigned char *)buf);
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = (unsigned int)(r & 0xFFFFUL);
+    *(volatile unsigned int *)0x5002 = (unsigned int)((r >> 16) & 0xFFFFUL);
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections -I"$PROJECT_ROOT/runtime/include" \
+            -c "$cBswapFile" -o "$oBswapFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binBswapFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibgccFile" "$oBswapFile" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binBswapFile" --check \
+                  0x025000=beef 0x025002=dead >/dev/null 2>&1; then
+            die "MAME: BSWAP-via-shifts produced wrong word"
+        fi
+        rm -f "$cBswapFile" "$oBswapFile" "$binBswapFile"
+
         # wchar.h + signal.h.  wcslen/wcscmp/wcscpy/wcschr cover the
         # core wide-char family; mbtowc/wctomb verify the trivial 1:1
         # Latin-1 mapping.  signal()/raise() are exercised by
@@ -3800,6 +4124,288 @@ EOF
         fi
         rm -f "$cWsFile" "$oWsFile" "$binWsFile"
 
+        # wchar.h extended surface: wmem* family + wcstol + swprintf.
+        # All delegate to the byte equivalents (1:1 Latin-1) so this
+        # locks in the conversion correctness end-to-end.
+        log "check: MAME runs wchar.h extended (wmem* / wcstol / swprintf)"
+        cWxFile="$(mktemp --suffix=.c)"
+        oWxFile="$(mktemp --suffix=.o)"
+        binWxFile="$(mktemp --suffix=.bin)"
+        cat > "$cWxFile" <<'EOF'
+#include <wchar.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+int main(void) {
+    unsigned int ok = 0;
+    static const wchar_t src[] = {'h','e','l','l','o',0};
+    wchar_t buf[16];
+    wmemset(buf, '.', 8); buf[8] = 0;
+    if (buf[0] == '.' && buf[7] == '.' && buf[8] == 0)  ok |= 0x0001;
+    wmemcpy(buf, src, 5); buf[5] = 0;
+    if (buf[0]=='h' && buf[4]=='o' && buf[5]==0)         ok |= 0x0002;
+    if (wmemcmp(buf, src, 5) == 0)                       ok |= 0x0004;
+    wchar_t *p = wmemchr(buf, 'l', 5);
+    if (p == buf + 2)                                    ok |= 0x0008;
+    static const wchar_t num[] = {'1','2','3',0};
+    wchar_t *e;
+    if (wcstol(num, &e, 10) == 123 && *e == 0)           ok |= 0x0010;
+    static const wchar_t fmt[] = {'%','d',' ','=','=',' ','%','d',0};
+    wchar_t pbuf[24];
+    int n = swprintf(pbuf, 24, fmt, 7, 42);
+    if (n == 7)                                          ok |= 0x0020;
+    if (pbuf[0]=='7' && pbuf[1]==' ' && pbuf[2]=='=')    ok |= 0x0040;
+    if (pbuf[5]=='4' && pbuf[6]=='2' && pbuf[7]==0)      ok |= 0x0080;
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = ok;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections -I"$PROJECT_ROOT/runtime/include" -c \
+            "$cWxFile" -o "$oWxFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binWxFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" "$oStrtolF" \
+            "$oSfF" "$oSdF" "$oLibgccFile" "$oWxFile" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binWxFile" --check \
+                  0x025000=00ff >/dev/null 2>&1; then
+            die "MAME: wchar.h extended != 0xFF (wmem*/wcstol/swprintf regression)"
+        fi
+        rm -f "$cWxFile" "$oWxFile" "$binWxFile"
+
+        # complex.h core surface: CMPLX, creal/cimag, conj.  Validates
+        # that clang's `_Complex` builtin lowers correctly and our
+        # accessor inline functions (`__real__` / `__imag__`) emit
+        # straight-through loads.  cabs/carg are exposed but call into
+        # hypot/atan2 which hit the runtime-sqrt bug, so this check
+        # stays on the algebraic core.
+        log "check: MAME runs complex.h core (CMPLX/creal/cimag/conj)"
+        cCxFile="$(mktemp --suffix=.c)"
+        oCxFile="$(mktemp --suffix=.o)"
+        binCxFile="$(mktemp --suffix=.bin)"
+        cat > "$cCxFile" <<'EOF'
+#include <complex.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+volatile double gRe = 3.0;
+volatile double gIm = 4.0;
+int main(void) {
+    double _Complex z = CMPLX(gRe, gIm);
+    int re = (int)creal(z);
+    int im = (int)cimag(z);
+    double _Complex w = conj(z);
+    int wre = (int)creal(w);
+    int wim = (int)cimag(w);
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = (unsigned int)re;
+    *(volatile unsigned int *)0x5002 = (unsigned int)im;
+    *(volatile unsigned int *)0x5004 = (unsigned int)wre;
+    *(volatile unsigned int *)0x5006 = (unsigned int)(short)wim;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" -c "$cCxFile" -o "$oCxFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binCxFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oExtrasF" "$oMathF" "$oSfF" "$oSdF" \
+            "$oLibgccFile" "$oCxFile" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binCxFile" --check \
+                  0x025000=0003 0x025002=0004 0x025004=0003 0x025006=fffc \
+                  >/dev/null 2>&1; then
+            die "MAME: complex.h core != (3, 4, 3, -4)"
+        fi
+        rm -f "$cCxFile" "$oCxFile" "$binCxFile"
+
+        # C11 header surface: fenv, tgmath, stdatomic, threads,
+        # aligned_alloc.  Single-core IIgs has degenerate threads
+        # (every op stubbed) but the surface must compile + link.
+        # Atomics lower to plain ops; tgmath dispatches sqrt/sqrtf
+        # via _Generic.  fenv tracks rounding mode + exception
+        # word but neither affects softFloat output (fixed RNE).
+        log "check: MAME runs C11 surface (fenv/tgmath/stdatomic/threads/aligned_alloc)"
+        cC11File="$(mktemp --suffix=.c)"
+        oC11File="$(mktemp --suffix=.o)"
+        binC11File="$(mktemp --suffix=.bin)"
+        cat > "$cC11File" <<'EOF'
+#include <stdatomic.h>
+#include <threads.h>
+#include <fenv.h>
+#include <tgmath.h>
+#include <stdlib.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+int main(void) {
+    unsigned int ok = 0;
+    feclearexcept(FE_ALL_EXCEPT);
+    if (fegetround() == FE_TONEAREST)                ok |= 0x0001;
+    if (fesetround(FE_UPWARD) == 0)                  ok |= 0x0002;
+    if (fegetround() == FE_UPWARD)                   ok |= 0x0004;
+    feraiseexcept(FE_INEXACT);
+    if (fetestexcept(FE_INEXACT))                    ok |= 0x0008;
+    float  f = (float)sqrt((float)4.0f);
+    if (f > 1.99f && f < 2.01f)                      ok |= 0x0010;
+    double d = sqrt(9.0);
+    if (d > 2.99 && d < 3.01)                        ok |= 0x0020;
+    atomic_int counter = 0;
+    atomic_store(&counter, 42);
+    if (atomic_load(&counter) == 42)                 ok |= 0x0040;
+    int prev = atomic_fetch_add(&counter, 8);
+    if (prev == 42 && atomic_load(&counter) == 50)   ok |= 0x0080;
+    atomic_flag flg = ATOMIC_FLAG_INIT;
+    if (!atomic_flag_test_and_set(&flg))             ok |= 0x0100;
+    if (atomic_flag_test_and_set(&flg))              ok |= 0x0200;
+    mtx_t m;
+    if (mtx_init(&m, mtx_plain) == thrd_success)     ok |= 0x0400;
+    if (mtx_lock(&m) == thrd_success)                ok |= 0x0800;
+    if (mtx_unlock(&m) == thrd_success)              ok |= 0x1000;
+    void *p = aligned_alloc(64, 128);
+    if (p && ((unsigned long)p & 63) == 0)           ok |= 0x2000;
+    aligned_free(p);
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = ok;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -std=c11 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" -c "$cC11File" -o "$oC11File"
+        "$PROJECT_ROOT/tools/link816" -o "$binC11File" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" "$oMathF" \
+            "$oSfF" "$oSdF" "$oLibgccFile" "$oC11File" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binC11File" \
+                  0x025000 3fff >/dev/null 2>&1; then
+            die "MAME: C11 surface bitmap != 0x3FFF (fenv/tgmath/atomic/threads/aligned_alloc)"
+        fi
+        rm -f "$cC11File" "$oC11File" "$binC11File"
+
+        # C11 keyword-alias headers + Unicode types.  iso646.h provides
+        # alternative operator spellings (`and`, `or`, etc.); stdalign.h
+        # aliases `_Alignas`/`_Alignof`; stdnoreturn.h aliases
+        # `_Noreturn`; uchar.h provides char16_t / char32_t plus the
+        # mbrtoc16 / c16rtomb / mbrtoc32 / c32rtomb conversion helpers
+        # (1:1 byte mapping in our Latin-1 model); wctype.h delegates
+        # wide-char classification to the byte equivalents.
+        log "check: MAME runs C11 keyword+Unicode headers (iso646/stdalign/stdnoreturn/uchar/wctype)"
+        cC11kFile="$(mktemp --suffix=.c)"
+        oC11kFile="$(mktemp --suffix=.o)"
+        binC11kFile="$(mktemp --suffix=.bin)"
+        cat > "$cC11kFile" <<'EOF'
+#include <iso646.h>
+#include <stdalign.h>
+#include <stdnoreturn.h>
+#include <uchar.h>
+#include <wctype.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+noreturn static void unreached(void) { while (1) {} }
+int main(void) {
+    unsigned int ok = 0;
+    // iso646: `and` and `not_eq` expand to operators
+    if ((1 and 2) and (3 not_eq 4))   ok |= 0x0001;
+    if (not 0)                         ok |= 0x0002;
+    // stdalign: alignof returns target alignment (>= 1)
+    if (alignof(int) >= 1)             ok |= 0x0004;
+    // uchar: char16_t / char32_t conversion (1:1 byte mapping)
+    char16_t c16 = 0;
+    char32_t c32 = 0;
+    mbstate_t st = {0};
+    if (mbrtoc16(&c16, "Z", 1, &st) == 1 and c16 == 'Z') ok |= 0x0008;
+    if (mbrtoc32(&c32, "Z", 1, &st) == 1 and c32 == 'Z') ok |= 0x0010;
+    char mb[2] = {0};
+    if (c16rtomb(mb, 0x4F, &st) == 1 and mb[0] == 'O')   ok |= 0x0020;
+    // wctype: classification + case folding
+    if (iswalpha('A') and not iswalpha('5'))   ok |= 0x0040;
+    if (iswdigit('5') and not iswdigit('A'))   ok |= 0x0080;
+    if (iswspace(' ') and not iswspace('A'))   ok |= 0x0100;
+    if (towlower('X') == 'x')                  ok |= 0x0200;
+    if (towupper('y') == 'Y')                  ok |= 0x0400;
+    if (not iswalpha(0x1234))                  ok |= 0x0800;
+    (void)unreached;
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = ok;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -std=c11 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" -c "$cC11kFile" -o "$oC11kFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binC11kFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oExtrasF" "$oLibgccFile" "$oC11kFile" \
+            >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binC11kFile" --check \
+                  0x025000=0fff >/dev/null 2>&1; then
+            die "MAME: C11 keyword+Unicode header bitmap != 0x0FFF"
+        fi
+        rm -f "$cC11kFile" "$oC11kFile" "$binC11kFile"
+
+        # math.h C99 additions: fma, remainder, lround, rint, scalbn,
+        # fpclassify, nan().  Inverse hyperbolics (asinh/acosh/atanh)
+        # are exposed in the header but not exercised here because
+        # they call into sqrt/log which trigger a pre-existing
+        # runtime-sqrt crash under -O2.  All other surface members
+        # use only bit manipulation + softDouble round-trip ops which
+        # are known to work end-to-end.
+        log "check: MAME runs math.h C99 additions (fma/remainder/lround/rint/scalbn/fpclassify)"
+        cMaFile="$(mktemp --suffix=.c)"
+        oMaFile="$(mktemp --suffix=.o)"
+        binMaFile="$(mktemp --suffix=.bin)"
+        cat > "$cMaFile" <<'EOF'
+#include <math.h>
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+volatile double gA = 2.0, gB = 3.0, gC = 4.0;
+volatile double gX = 7.0, gY = 4.0;
+volatile double gPos = 2.7;
+volatile double gNeg = -2.7;
+volatile double gS = 1.5;
+// noinline helpers to keep main's register pressure low
+__attribute__((noinline)) static int checkFma(void) {
+    return fma(gA, gB, gC) == 10.0;
+}
+__attribute__((noinline)) static int checkRem(void) {
+    return remainder(gX, gY) == -1.0;
+}
+__attribute__((noinline)) static int checkLroundPos(void) {
+    return lround(gPos) == 3L;
+}
+__attribute__((noinline)) static int checkLroundNeg(void) {
+    return lround(gNeg) == -3L;
+}
+__attribute__((noinline)) static int checkRint(void) {
+    return rint(gPos) == 3.0;
+}
+__attribute__((noinline)) static int checkScalbn(void) {
+    return scalbn(gS, 3) == 12.0;
+}
+int main(void) {
+    unsigned int ok = 0;
+    if (checkFma())                            ok |= 0x0001;
+    if (checkRem())                            ok |= 0x0002;
+    if (checkLroundPos())                      ok |= 0x0004;
+    if (checkLroundNeg())                      ok |= 0x0008;
+    if (checkRint())                           ok |= 0x0010;
+    if (checkScalbn())                         ok |= 0x0020;
+    if (fpclassify(1.0) == FP_NORMAL)          ok |= 0x0040;
+    if (fpclassify(0.0) == FP_ZERO)            ok |= 0x0080;
+    if (fpclassify(HUGE_VAL) == FP_INFINITE)   ok |= 0x0100;
+    if (isnan(nan("")))                        ok |= 0x0200;
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = ok;
+    while (1) {}
+}
+EOF
+        "$CLANG" --target=w65816 -O2 -ffunction-sections \
+            -I"$PROJECT_ROOT/runtime/include" -c "$cMaFile" -o "$oMaFile"
+        "$PROJECT_ROOT/tools/link816" -o "$binMaFile" --text-base 0x1000 \
+            "$oCrt0F" "$oLibcF" "$oExtrasF" "$oMathF" \
+            "$oSfF" "$oSdF" "$oLibgccFile" "$oMaFile" >/dev/null 2>&1
+        if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binMaFile" --check \
+                  0x025000=03ff >/dev/null 2>&1; then
+            die "MAME: math.h C99 additions bitmap != 0x03FF"
+        fi
+        rm -f "$cMaFile" "$oMaFile" "$binMaFile"
+
         # clock() reads the IIgs VBL counter at $E1006B (24-bit
         # absolute load).  Works without toolbox init.  time()
         # without iigsToolboxInit() returns 0 (no crash).
@@ -3879,7 +4485,7 @@ static int sumAreas(Shape **shapes, int n) {
     for (int i = 0; i < n; i++) total += shapes[i]->area();
     return total;
 }
-extern "C" int main(void) {
+int main(void) {
     Rect r(3, 4); Square s(5); Circle c(2);
     Shape *arr[3] = { &r, &s, &c };
     int total = sumAreas(arr, 3);
@@ -3928,7 +4534,7 @@ public:
     int draw() const override { return x * 100; }
     int move(int dx) const override { return x + dx; }
 };
-extern "C" int main(void) {
+int main(void) {
     Sprite s(7);
     Drawable *d = &s;
     Movable  *m = &s;
@@ -3980,7 +4586,7 @@ public:
     Diamond(int x) : Base(x), A(x), B(x) {}
     int kind() const override { return 99; }
 };
-extern "C" int main(void) {
+int main(void) {
     Diamond d(42);
     int ok = 0;
     if (d.kind() == 99) ok |= 1;
@@ -4046,7 +4652,7 @@ public:
     Diamond(int x) : Base(x), A(x), B(x) {}
     int who() override { return 99; }
 };
-extern "C" int main(void) {
+int main(void) {
     Dog dog; Cat cat;
     Animal *a = &dog;
     int ok = 0;
@@ -4189,7 +4795,7 @@ EOF
         oExcAbi="$(mktemp --suffix=.o)"
         binCppExcFile="$(mktemp --suffix=.bin)"
         cat > "$cppExcFile" <<'EOF'
-extern "C" int main(void) {
+int main(void) {
     int ok = 0;
     try { throw 42; } catch (int e) { if (e == 42) ok = 1; }
     *(volatile unsigned short *)0x5000 = (unsigned short)ok;
@@ -4222,16 +4828,13 @@ EOF
         oHdFile="$(mktemp --suffix=.o)"
         binHdFile="$(mktemp --suffix=.bin)"
         cat > "$cHdFile" <<'EOF'
+#include <stdio.h>
 extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable);
-extern struct __sFILE *fopen(const char *path, const char *mode);
-extern int fclose(struct __sFILE *f);
-extern int fgetc(struct __sFILE *f);
-extern int fprintf(struct __sFILE *f, const char *fmt, ...);
 extern char *strstr(const char *h, const char *n);
 __attribute__((noinline)) void switchToBank2(void) {
     __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
 }
-__attribute__((noinline)) void hexdump(struct __sFILE *in, struct __sFILE *out) {
+__attribute__((noinline)) void hexdump(FILE *in, FILE *out) {
     unsigned int offset = 0;
     unsigned char line[16];
     int linelen;
@@ -4264,8 +4867,8 @@ static char output[300];
 int main(void) {
     mfsRegister("in", input, 16, 16, 0);
     mfsRegister("out", output, 0, 300, 1);
-    struct __sFILE *in  = fopen("in", "r");
-    struct __sFILE *out = fopen("out", "w");
+    FILE *in  = fopen("in", "r");
+    FILE *out = fopen("out", "w");
     hexdump(in, out);
     fclose(in); fclose(out);
     int ok = 0;
@@ -4277,7 +4880,7 @@ int main(void) {
     while (1) {}
 }
 EOF
-        "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+        "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \
             "$cHdFile" -o "$oHdFile"
         "$PROJECT_ROOT/tools/link816" -o "$binHdFile" --text-base 0x1000 \
             "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" \
@@ -4380,6 +4983,7 @@ EOF
         oShFile="$(mktemp --suffix=.o)"
         binShFile="$(mktemp --suffix=.bin)"
         cat > "$cShFile" <<'EOF'
+#include <stdio.h>
 extern void *malloc(unsigned long n);
 extern void free(void *p);
 extern unsigned long strlen(const char *s);
@@ -4387,9 +4991,6 @@ extern int strcmp(const char *a, const char *b);
 extern char *strchr(const char *s, int c);
 extern char *strstr(const char *h, const char *n);
 extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable);
-extern struct __sFILE *fopen(const char *path, const char *mode);
-extern int fclose(struct __sFILE *f);
-extern int fprintf(struct __sFILE *f, const char *fmt, ...);
 __attribute__((noinline)) static void switchToBank2(void) {
     __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
 }
@@ -4447,7 +5048,7 @@ __attribute__((noinline)) static char *takeRest(char *s) {
     *end = 0;
     return *s ? s : (char *)0;
 }
-__attribute__((noinline)) static int dispatch(char *line, struct __sFILE *out) {
+__attribute__((noinline)) static int dispatch(char *line, FILE *out) {
     char *cmd; char *rest = takeToken(line, &cmd);
     if (!cmd) return 0;
     if (strcmp(cmd, "INSERT") == 0) {
@@ -4472,7 +5073,7 @@ __attribute__((noinline)) static int dispatch(char *line, struct __sFILE *out) {
     if (strcmp(cmd, "COUNT") == 0) { fprintf(out, "COUNT = %u\n", (unsigned)totalEntries); return 1; }
     return 0;
 }
-__attribute__((noinline)) static int runScript(const char *script, struct __sFILE *out) {
+__attribute__((noinline)) static int runScript(const char *script, FILE *out) {
     int n = 0;
     char buf[64];
     const char *p = script;
@@ -4566,7 +5167,7 @@ __asm__ (
 );
 int main(void) {
     mfsRegister("out", outbuf, 0, 1024, 1);
-    struct __sFILE *out = fopen("out", "w");
+    FILE *out = fopen("out", "w");
     int cmds = runScript(SCRIPT, out);
     fprintf(out, "ran %d cmds\n", cmds);
     fclose(out);
@@ -4585,7 +5186,7 @@ int main(void) {
     while (1) {}
 }
 EOF
-        "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+        "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \
             "$cShFile" -o "$oShFile"
         "$PROJECT_ROOT/tools/link816" -o "$binShFile" --text-base 0x1000 \
             "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" \
@@ -5083,7 +5684,7 @@ EOF
             --bss-base 0xFF00 "$oBigFile" "$oLibgccFile" 2>/tmp/bsslink.err; then
         die "link816 should have rejected --bss-base 0xFF00 + 0x200 bss (above LC ceiling)"
     fi
-    if ! grep -q 'exceeds bank-0 LC ceiling' /tmp/bsslink.err; then
+    if ! grep -q 'exceeds bank-0 ceiling' /tmp/bsslink.err; then
         die "link816 LC-ceiling diagnostic missing: $(cat /tmp/bsslink.err)"
     fi
     rm -f "$cBigFile" "$oBigFile" "$binBssOFile" /tmp/bsslink.err
@@ -5134,6 +5735,106 @@ EOF
     fi
     rm -f "$cBssLcFile" "$oBssLcFile" "$binBssLcFile" "$mapBssLcFile"
 
+    # Multi-bank BSS: --bss-base 0xNN0000 places BSS in bank NN
+    # instead of bank 0.  crt0 reads the new linker symbol
+    # `__bss_bank` to temporarily set DBR for the BSS-clear loop,
+    # uses `__bss_lo16` + X via DBR-relative `stz abs,X`, and `__bss_size`
+    # for the count.  Verifies (a) BSS-resident global writes/reads work
+    # in the non-bank-0 bank, AND (b) crt0 correctly zeroed the BSS
+    # before main (untouched-array-element XOR returns 0).
+    log "check: MAME runs program with --bss-base 0x030000 (multi-bank BSS)"
+    cBmbFile="$(mktemp --suffix=.c)"
+    oBmbFile="$(mktemp --suffix=.o)"
+    binBmbFile="$(mktemp --suffix=.bin)"
+    cat > "$cBmbFile" <<'EOF'
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+static unsigned short g_arr[8];
+int main(void) {
+    g_arr[0] = 0x1234;
+    g_arr[7] = 0x5678;
+    unsigned short s = g_arr[0] ^ g_arr[7];   // 0x1234 ^ 0x5678 = 0x444C
+    unsigned short z = g_arr[1] | g_arr[2] | g_arr[3]
+                     | g_arr[4] | g_arr[5] | g_arr[6];  // 0 if BSS zeroed
+    switchToBank2();
+    *(volatile unsigned short *)0x5000 = s;
+    *(volatile unsigned short *)0x5002 = z;
+    while (1) {}
+}
+EOF
+    "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+        "$cBmbFile" -o "$oBmbFile"
+    # Use the prebuilt runtime/*.o (smoke's per-test mktemp .o files
+    # may have been rm -f'd by earlier checks).
+    if ! "$PROJECT_ROOT/tools/link816" -o "$binBmbFile" --text-base 0x1000 \
+        --bss-base 0x030000 \
+        "$PROJECT_ROOT/runtime/crt0.o" "$PROJECT_ROOT/runtime/libc.o" \
+        "$PROJECT_ROOT/runtime/softFloat.o" "$PROJECT_ROOT/runtime/softDouble.o" \
+        "$PROJECT_ROOT/runtime/libgcc.o" "$oBmbFile" \
+        >/dev/null 2>&1; then
+        die "link816 --bss-base 0x030000 failed (multi-bank BSS link regression)"
+    fi
+    if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" \
+              "$binBmbFile" --check 0x025000=444c 0x025002=0000 >/dev/null 2>&1; then
+        die "MAME: --bss-base 0x030000 failed (BSS in bank 3 not zeroed or not writable)"
+    fi
+    rm -f "$cBmbFile" "$oBmbFile" "$binBmbFile"
+
+    log "check: MAME runs program with BSS spanning 2 banks (>64KB)"
+    cMbbFile="$(mktemp --suffix=.c)"
+    oMbbFile="$(mktemp --suffix=.o)"
+    binMbbFile="$(mktemp --suffix=.bin)"
+    cat > "$cMbbFile" <<'EOF'
+__attribute__((noinline)) void switchToBank2(void) {
+    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
+}
+__attribute__((used)) char arr1[50000];
+__attribute__((used)) char arr2[50000];
+int main(void) {
+    arr1[0] = 0;
+    arr2[0] = 0;
+    // Probe via long-absolute (DBR-independent) at fixed offsets in
+    // bank 3 and bank 4.  All should read 0 if crt0's multi-bank BSS
+    // clear loop walked both segments.
+    unsigned char b0, b1, b2, b3;
+    __asm__ volatile (
+        "sep #0x20\n.byte 0xAF\n.word 0x0100\n.byte 3\nrep #0x20\nand #0xff\n"
+        : "=a"(b0));
+    __asm__ volatile (
+        "sep #0x20\n.byte 0xAF\n.word 0xC000\n.byte 3\nrep #0x20\nand #0xff\n"
+        : "=a"(b1));
+    __asm__ volatile (
+        "sep #0x20\n.byte 0xAF\n.word 0x0100\n.byte 4\nrep #0x20\nand #0xff\n"
+        : "=a"(b2));
+    __asm__ volatile (
+        "sep #0x20\n.byte 0xAF\n.word 0x8000\n.byte 4\nrep #0x20\nand #0xff\n"
+        : "=a"(b3));
+    switchToBank2();
+    *(volatile unsigned int *)0x5000 = (unsigned int)b0;
+    *(volatile unsigned int *)0x5002 = (unsigned int)b1;
+    *(volatile unsigned int *)0x5004 = (unsigned int)b2;
+    *(volatile unsigned int *)0x5006 = (unsigned int)b3;
+    while (1);
+}
+EOF
+    "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+        "$cMbbFile" -o "$oMbbFile"
+    if ! "$PROJECT_ROOT/tools/link816" -o "$binMbbFile" --text-base 0x1000 \
+        --bss-base 0x030000 \
+        "$PROJECT_ROOT/runtime/crt0.o" "$PROJECT_ROOT/runtime/libc.o" \
+        "$PROJECT_ROOT/runtime/softFloat.o" "$PROJECT_ROOT/runtime/softDouble.o" \
+        "$PROJECT_ROOT/runtime/libgcc.o" "$oMbbFile" \
+        >/dev/null 2>&1; then
+        die "link816 with 100KB BSS in bank 3+ failed"
+    fi
+    if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binMbbFile" --check \
+              0x025000=0000 0x025002=0000 \
+              0x025004=0000 0x025006=0000 >/dev/null 2>&1; then
+        die "MAME: BSS span across banks not zeroed (multi-bank crt0 walk)"
+    fi
+    rm -f "$cMbbFile" "$oMbbFile" "$binMbbFile"
+
     # OMF emitter — wrap the linked binary as a single-segment OMF
     # file ready for IIgs loading.
     log "check: omfEmit produces a valid OMF v2.1 single-segment file"
@@ -5492,9 +6193,7 @@ if [ "${GSOS_FILE_SMOKE:-0}" = "1" ] \
     testFileGsf="$(mktemp --suffix=.dat)"
     printf 'Hello, world!' > "$testFileGsf"
     cat > "$cGsfFile" <<'EOF'
-extern struct __sFILE *fopen(const char *path, const char *mode);
-extern unsigned long fread(void *p, unsigned long s, unsigned long n, struct __sFILE *f);
-extern int fclose(struct __sFILE *f);
+#include <stdio.h>
 static char rbuf[16];
 __attribute__((noinline)) static int strnequ(const char *a, const char *b, int n) {
     for (int i = 0; i < n; i++) if (a[i] != b[i]) return 0;
@@ -5502,7 +6201,7 @@ __attribute__((noinline)) static int strnequ(const char *a, const char *b, int n
 }
 int main(void) {
     unsigned char ok = 0;
-    struct __sFILE *f = fopen("/DATA/TESTFILE", "r");
+    FILE *f = fopen("/DATA/TESTFILE", "r");
     if (f) {
         ok |= 0x10;
         unsigned long n = fread(rbuf, 1, 13, f);
@@ -5514,7 +6213,7 @@ int main(void) {
     return 0;
 }
 EOF
-    "$CLANG" --target=w65816 -O2 -ffunction-sections -c \
+    "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \
         "$cGsfFile" -o "$oGsfFile"
     "$PROJECT_ROOT/tools/link816" -o "$binGsf" --text-base 0x1000 \
         --map "$mapGsf" --reloc-out "$relGsf" \
diff --git a/src/clang/lib/Basic/Targets/W65816.h b/src/clang/lib/Basic/Targets/W65816.h
index e4edacf..87b794e 100644
--- a/src/clang/lib/Basic/Targets/W65816.h
+++ b/src/clang/lib/Basic/Targets/W65816.h
@@ -45,7 +45,7 @@ public:
     IntPtrType = SignedLong;
     PtrDiffType = SignedLong;
     SigAtomicType = SignedLong;
-    resetDataLayout("e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16");
+    resetDataLayout("e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S8");
   }
 
   void getTargetDefines(const LangOptions &Opts,
diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp
index 58323b8..2f3afc8 100644
--- a/src/link816/link816.cpp
+++ b/src/link816/link816.cpp
@@ -789,20 +789,45 @@ struct Linker {
         // bail clearly rather than silently corrupt.
         uint32_t loadEnd = L.initBase + L.initSize;
         L.bssBase    = bssBase;
-        if (L.bssBase < loadEnd) {
-            // Page-align upward for nicer addresses in the map.
-            L.bssBase = (loadEnd + 0xFF) & ~0xFFu;
-            if (L.bssBase >= 0xC000 && L.bssBase < 0xD000) {
-                L.bssBase = 0xD000;
+        // If --bss-base specifies a non-bank-0 address, the user is
+        // explicitly placing BSS in a different bank — don't auto-
+        // adjust around bank-0 hazard zones (IO window, LC1) or
+        // collide with the text/rodata/init load area.  Validate the
+        // bank choice + intra-bank fit instead.
+        bool bssOutOfBank0 = (L.bssBase >= 0x010000u);
+        if (!bssOutOfBank0) {
+            if (L.bssBase < loadEnd) {
+                // Page-align upward for nicer addresses in the map.
+                L.bssBase = (loadEnd + 0xFF) & ~0xFFu;
+                if (L.bssBase >= 0xC000 && L.bssBase < 0xD000) {
+                    L.bssBase = 0xD000;
+                }
+            }
+            if (L.bssBase + L.bssSize > 0x10000u) {
+                char msg[256];
+                std::snprintf(msg, sizeof(msg),
+                    "bss [0x%X+%u] exceeds bank-0 ceiling 0x10000 — "
+                    "shrink runtime, or pass --bss-base 0xNN0000 "
+                    "(multi-bank BSS up to 4 banks now supported)",
+                    L.bssBase, L.bssSize);
+                die(msg);
+            }
+        } else {
+            // Multi-bank BSS: BSS may now span multiple consecutive
+            // banks.  crt0 clears bank-by-bank using DBR-relative
+            // STZ abs,X — see __bss_seg_count / __bss_segN_* symbols
+            // below.  Capped at 4 segments (= 256KB BSS max) which
+            // covers any realistic IIgs program.
+            uint32_t firstBank = (L.bssBase >> 16) & 0xFF;
+            uint32_t lastBank  = ((L.bssBase + L.bssSize - 1) >> 16) & 0xFF;
+            uint32_t segCount  = lastBank - firstBank + 1;
+            if (segCount > 4) {
+                char msg[200];
+                std::snprintf(msg, sizeof(msg),
+                    "bss [0x%X+%u] spans %u banks — limit is 4 (256KB)",
+                    L.bssBase, L.bssSize, segCount);
+                die(msg);
             }
-        }
-        if (L.bssBase + L.bssSize > 0x10000u) {
-            char msg[160];
-            std::snprintf(msg, sizeof(msg),
-                "bss [0x%X+%u] exceeds bank-0 LC ceiling 0x10000 — "
-                "shrink the runtime or split into bank 1",
-                L.bssBase, L.bssSize);
-            die(msg);
         }
         // Publish layout now so resolveSym() can read it during reloc
         // application (it's a const member that uses lastLayout).
@@ -819,6 +844,72 @@ struct Linker {
         globalSyms["__init_array_end"]    = initBase + curInit;
         globalSyms["__bss_start"]         = L.bssBase;
         globalSyms["__bss_end"]           = L.bssBase + L.bssSize;
+        // Multi-bank-BSS support: split __bss_start into the 16-bit
+        // intra-bank offset and the 8-bit bank byte.  crt0 needs the
+        // bank byte separately so it can temporarily set DBR to that
+        // bank for the BSS-clear loop (which uses STZ abs,X — DBR-
+        // relative — and so reads bytes from the wrong bank if BSS
+        // is placed in a non-zero bank).  Also emit __bss_size as a
+        // 16-bit count for the loop boundary; doing so saves crt0
+        // from doing the (__bss_end - __bss_start) subtraction at
+        // runtime, and keeps the count clean even when __bss_start
+        // and __bss_end straddle bank-boundary arithmetic.
+        globalSyms["__bss_lo16"]          = L.bssBase & 0xFFFF;
+        globalSyms["__bss_bank"]          = (L.bssBase >> 16) & 0xFF;
+        globalSyms["__bss_size"]          = L.bssSize <= 0xFFFFu ? L.bssSize
+                                                                  : 0xFFFFu;
+        // Multi-bank BSS segment table — up to 4 entries.  Each segment
+        // has (lo16, bank, size16).  Segment 0 starts at __bss_lo16 in
+        // __bss_bank; segments 1..N-1 start at offset 0 in successive
+        // banks.  crt0 walks __bss_seg{0..N-1}_size and skips when 0.
+        {
+            uint32_t curBase = L.bssBase;
+            uint32_t curRem  = L.bssSize;
+            uint32_t segIdx  = 0;
+            const char *sizeNames[4] = {
+                "__bss_seg0_size", "__bss_seg1_size",
+                "__bss_seg2_size", "__bss_seg3_size"
+            };
+            const char *bankNames[4] = {
+                "__bss_seg0_bank", "__bss_seg1_bank",
+                "__bss_seg2_bank", "__bss_seg3_bank"
+            };
+            const char *lo16Names[4] = {
+                "__bss_seg0_lo16", "__bss_seg1_lo16",
+                "__bss_seg2_lo16", "__bss_seg3_lo16"
+            };
+            // Cap segment size to 0xFF00 (= 65280) so the 16-bit
+            // CPX in crt0 doesn't wrap to 0 on a full-bank segment.
+            // Excess bytes in that bank stay uncleared at link time —
+            // we'd need to chain a second segment in the same bank to
+            // cover them.  Implementation: track per-segment max as
+            // 0xFF00, and if a single bank needs more, allocate two
+            // segments in that bank.
+            constexpr uint32_t MAX_SEG = 0xFF00u;
+            for (segIdx = 0; segIdx < 4; segIdx++) {
+                uint32_t bankEnd = (curBase & 0xFF0000u) + 0x10000u;
+                uint32_t avail   = bankEnd - curBase;
+                uint32_t seg     = curRem < avail ? curRem : avail;
+                if (seg > MAX_SEG) seg = MAX_SEG;
+                globalSyms[lo16Names[segIdx]] = curBase & 0xFFFF;
+                globalSyms[bankNames[segIdx]] = (curBase >> 16) & 0xFF;
+                globalSyms[sizeNames[segIdx]] = seg;
+                curRem  -= seg;
+                if (curRem == 0) { segIdx++; break; }
+                curBase += seg;  // advance within bank or to next
+                if ((curBase & 0xFFFFu) == 0) {
+                    // Crossed bank boundary — already at start of next bank.
+                } else if ((curBase & 0xFF0000u) != ((curBase - 1) & 0xFF0000u)) {
+                    // Just crossed into next bank.
+                }
+            }
+            // Zero out any unused segment slots so crt0 sees size=0.
+            for (uint32_t i = segIdx; i < 4; i++) {
+                globalSyms[lo16Names[i]] = 0;
+                globalSyms[bankNames[i]] = 0;
+                globalSyms[sizeNames[i]] = 0;
+            }
+        }
         // __heap_start / __heap_end: pick the largest contiguous safe
         // range above bss_end.  Without this, the previous hardcoded
         // heap_end=$BF00 gave heap_end < heap_start whenever BSS
diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt
index d457117..e5d0de2 100644
--- a/src/llvm/lib/Target/W65816/CMakeLists.txt
+++ b/src/llvm/lib/Target/W65816/CMakeLists.txt
@@ -35,6 +35,9 @@ add_llvm_target(W65816CodeGen
   W65816PreSpillCrossCall.cpp
   W65816SjLjFinalize.cpp
   W65816LowerWide32.cpp
+  W65816I32IncFold.cpp
+  W65816ImgCalleeSave.cpp
+  W65816NarrowI32Mul.cpp
   W65816TargetMachine.cpp
   W65816AsmPrinter.cpp
   W65816MCInstLower.cpp
diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h
index 2bf5a91..a44611b 100644
--- a/src/llvm/lib/Target/W65816/W65816.h
+++ b/src/llvm/lib/Target/W65816/W65816.h
@@ -116,6 +116,14 @@ FunctionPass *createW65816PreSpillCrossCall();
 // W65816SjLjFinalize.cpp.
 FunctionPass *createW65816SjLjFinalize();
 
+// IR pass: detect `mul i32 X, Y` where the top 16 bits of both X and Y
+// are provably zero (via IR-level computeKnownBits, which traces
+// through PHIs) and rewrite to a call to `__umulhisi3` (16x16 -> 32).
+// IR-level analysis catches cases SDAG can't, because IndVarSimplify
+// widens narrow loop counters to i32 before SDAG sees them, hiding the
+// zext that a SDAG-level combine would key off.  See W65816NarrowI32Mul.cpp.
+FunctionPass *createW65816NarrowI32Mul();
+
 // Pre-RA pass that lowers Wide32 register pairs into pairs of i16
 // vregs.  Without this, greedy/basic regalloc can't fit the pair-
 // pressure of i64-via-2-i32-via-Wide32 traffic in i64-heavy
@@ -125,9 +133,24 @@ FunctionPass *createW65816SjLjFinalize();
 // take 2 i16 ptr operands directly.
 FunctionPass *createW65816LowerWide32();
 
+// Pre-emit peephole: detect the post-PEI 6-instruction `i32 += 1`
+// pattern (LDA-ADCi16imm-STA-LDA-ADCEi16imm-STA on consecutive i16
+// stack-rel halves) and rewrite to LDA-INA-STA + INC_HI_IF_CARRY.
+// Saves ~13 cyc per pointer increment in the common no-carry path.
+// See W65816I32IncFold.cpp.
+FunctionPass *createW65816I32IncFold();
+
+// Post-RA, pre-PEI pass that emits prologue save + epilogue restore for
+// IMG8..IMG15 if the function uses them.  Makes IMG8..IMG15 behave as
+// callee-saved at the asm level without going through LLVM's CSR
+// mechanism (which would shift regalloc decisions and break other
+// tests).  See W65816ImgCalleeSave.cpp.
+FunctionPass *createW65816ImgCalleeSave();
+
 void initializeW65816AsmPrinterPass(PassRegistry &);
 void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
 void initializeW65816StackSlotCleanupPass(PassRegistry &);
+void initializeW65816I32IncFoldPass(PassRegistry &);
 void initializeW65816SepRepCleanupPass(PassRegistry &);
 void initializeW65816BranchExpandPass(PassRegistry &);
 void initializeW65816TiedDefSpillPass(PassRegistry &);
@@ -138,6 +161,8 @@ void initializeW65816NegYIndYPass(PassRegistry &);
 void initializeW65816PreSpillCrossCallPass(PassRegistry &);
 void initializeW65816SjLjFinalizePass(PassRegistry &);
 void initializeW65816LowerWide32Pass(PassRegistry &);
+void initializeW65816ImgCalleeSavePass(PassRegistry &);
+void initializeW65816NarrowI32MulPass(PassRegistry &);
 
 } // namespace llvm
 
diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp
index 5e27b45..ce8fa2a 100644
--- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp
+++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp
@@ -18,8 +18,11 @@
 #include "TargetInfo/W65816TargetInfo.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Compiler.h"
 
@@ -182,6 +185,48 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
     break;
+  case W65816::INC_HI_IF_CARRY_StackRel: {
+    // Conditional-increment of an i32's hi half on stack at (off, S).
+    // Z is presumed set/clear from the preceding LDA-INA-STA on the
+    // lo half: Z=1 means lo wrapped 0xFFFF→0, so hi must be incremented;
+    // Z=0 means no overflow, skip.  STA preserves N/Z so the gap
+    // between the lo's INA and our BNE is OK.
+    //
+    // Emits:
+    //   bne <skip>
+    //   lda $off, s
+    //   inc a
+    //   sta $off, s
+    //   <skip>:
+    int64_t Off = MI->getOperand(0).getImm();
+    MCSymbol *SkipSym = OutContext.createTempSymbol();
+    {
+      MCInst BneI;
+      BneI.setOpcode(W65816::BNE);
+      BneI.addOperand(MCOperand::createExpr(
+          MCSymbolRefExpr::create(SkipSym, OutContext)));
+      EmitToStreamer(*OutStreamer, BneI);
+    }
+    {
+      MCInst Lda;
+      Lda.setOpcode(W65816::LDA_StackRel);
+      Lda.addOperand(MCOperand::createImm(Off));
+      EmitToStreamer(*OutStreamer, Lda);
+    }
+    {
+      MCInst Ina;
+      Ina.setOpcode(W65816::INA);
+      EmitToStreamer(*OutStreamer, Ina);
+    }
+    {
+      MCInst Sta;
+      Sta.setOpcode(W65816::STA_StackRel);
+      Sta.addOperand(MCOperand::createImm(Off));
+      EmitToStreamer(*OutStreamer, Sta);
+    }
+    OutStreamer->emitLabel(SkipSym);
+    return;
+  }
   case W65816::ADJCALLSTACKDOWN: {
     // DOWN is a no-op in our scheme — the PUSH16 sequence in LowerCall
     // already shifted SP incrementally as args were pushed.  Nothing
diff --git a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp
index 08885a2..f85ea6d 100644
--- a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp
+++ b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp
@@ -161,13 +161,37 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
       // `LDA/STA ($F6),Y` (bank-0 implicit, since the stack is always
       // bank 0).  A holds the new S right after TCS — store it before
       // restoring A from Y.
-      if (StackSize > 200) {
+      // Capture FP into $F6 when:
+      //  - frame > 200 bytes (8-bit `,S` disp can't reach far slots), OR
+      //  - function has VLAs (DYNAMIC_STACKALLOC shifts S, breaking
+      //    static-frame `,s` access).
+      if (StackSize > 200 || HasVLA) {
         MF.getInfo<W65816MachineFunctionInfo>()->setUsesDpFP(true);
         BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF6);
+        // Bank byte at $F8 = 0.  expandFarFI uses `LDA/STA [dp],Y`
+        // (long indirect Y, opcodes B7/97) which reads a 24-bit
+        // pointer at $F6/$F7/$F8 and ignores DBR.  Without this
+        // forced-bank-zero, callers that have switched DBR (e.g.
+        // for I/O register access via `pha;plb`) silently corrupt
+        // every FP-relative load and store in the callee.
+        // sha256_transform exposed this — its 246-byte frame uses
+        // FP-rel, and the test driver switched DBR to bank 2 for
+        // probe writes before invoking it.
+        BuildMI(MBB, MBBI, DL, TII.get(W65816::STZ_DP)).addImm(0xF8);
       }
       BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA));
     }
   }
+  // VLA function with no static frame (or PHA-only): still need FP.
+  if (HasVLA &&
+      !MF.getInfo<W65816MachineFunctionInfo>()->getUsesDpFP()) {
+    MF.getInfo<W65816MachineFunctionInfo>()->setUsesDpFP(true);
+    BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
+    BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC));
+    BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF6);
+    BuildMI(MBB, MBBI, DL, TII.get(W65816::STZ_DP)).addImm(0xF8);
+    BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA));
+  }
 }
 
 void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp b/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp
new file mode 100644
index 0000000..b6bab79
--- /dev/null
+++ b/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp
@@ -0,0 +1,225 @@
+//===-- W65816I32IncFold.cpp - Fold i32 += 1 into INC + conditional skip --===//
+//
+// Pre-emit peephole: detect the post-PEI 6-instruction sequence emitted
+// for `i32 += 1` on a Wide32 vreg whose halves spilled to two stack-rel
+// slots, and rewrite to a tighter form using INA + a conditional skip
+// over the hi half.
+//
+// Original (after PEI, pseudos still un-expanded):
+//   $a = LDA_StackRel imm_lo            ; load lo half
+//   $a = ADCi16imm $a, 1                ; CLC + ADC #1  (5 cyc)
+//   STA_StackRel $a, imm_lo             ; store lo
+//   $a = LDA_StackRel imm_hi            ; load hi half
+//   $a = ADCEi16imm $a, 0               ; ADC #0 (uses carry from lo)
+//   STA_StackRel $a, imm_hi             ; store hi
+//
+// Cycle cost: 5 + 2 + 3 + 5 + 5 + 3 + 5 = 28 cyc
+//
+// Rewrite:
+//   $a = LDA_StackRel imm_lo            ; load lo
+//   $a = INA_PSEUDO $a, $a              ; lo + 1 — sets Z based on result
+//   STA_StackRel $a, imm_lo             ; store lo (Z preserved)
+//   INC_HI_IF_CARRY_StackRel imm_hi     ; AsmPrinter expands to:
+//                                       ;   bne L_skip
+//                                       ;   lda imm_hi, s
+//                                       ;   inc a
+//                                       ;   sta imm_hi, s
+//                                       ; L_skip:
+//
+// Cycle cost (no carry, common case):
+//   5 + 2 + 5 + 3 (BNE taken) = 15 cyc — saves 13 cyc
+// Cycle cost (with carry, rare case):
+//   5 + 2 + 5 + 2 (BNE not-taken) + 5 + 2 + 5 = 26 cyc — saves 2 cyc
+//
+// The Z flag from `INA` survives the intervening STA_StackRel because
+// STA does not modify the processor status register.  The BNE in the
+// expansion of INC_HI_IF_CARRY_StackRel reads that Z to decide whether
+// the hi half needs to be touched.
+//===----------------------------------------------------------------------===//
+
+#include "W65816.h"
+#include "W65816InstrInfo.h"
+#include "W65816Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "w65816-i32-inc-fold"
+
+namespace {
+class W65816I32IncFold : public MachineFunctionPass {
+public:
+  static char ID;
+  W65816I32IncFold() : MachineFunctionPass(ID) {}
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return "W65816 i32 += 1 → INC + conditional skip";
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // namespace
+
+char W65816I32IncFold::ID = 0;
+
+INITIALIZE_PASS(W65816I32IncFold, DEBUG_TYPE,
+                "W65816 i32 += 1 fold", false, false)
+
+namespace llvm {
+void initializeW65816I32IncFoldPass(PassRegistry &);
+}
+
+// Match the 6-instruction sequence; returns the post-pattern iterator
+// and fills in the lo/hi stack-rel offsets if the pattern matches.
+// Tolerates intervening TAX/TXA pairs (which regalloc inserts as
+// spurious A-save brackets around STAfi's conservative Defs=[A]).
+// They're collected into `KillMe` so the rewrite can erase them too.
+static bool matchI32AddOnePattern(MachineBasicBlock::iterator It,
+                                  MachineBasicBlock::iterator End,
+                                  int64_t &OffLo, int64_t &OffHi,
+                                  MachineBasicBlock::iterator &PatEnd,
+                                  SmallVectorImpl<MachineInstr *> &KillMe) {
+  auto skipDebug = [&]() {
+    while (It != End && It->isDebugInstr()) ++It;
+  };
+  auto skipTaxTxa = [&]() {
+    while (It != End && (It->isDebugInstr() ||
+                         It->getOpcode() == W65816::TAX ||
+                         It->getOpcode() == W65816::TXA)) {
+      if (It->getOpcode() == W65816::TAX || It->getOpcode() == W65816::TXA) {
+        KillMe.push_back(&*It);
+      }
+      ++It;
+    }
+  };
+  skipDebug();
+  if (It == End) return false;
+
+  // 1. LDA_StackRel imm_lo
+  if (It->getOpcode() != W65816::LDA_StackRel) return false;
+  if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false;
+  OffLo = It->getOperand(0).getImm();
+  ++It;
+  skipTaxTxa();
+  if (It == End) return false;
+
+  // 2. ADCi16imm with imm == 1
+  if (It->getOpcode() != W65816::ADCi16imm) return false;
+  if (It->getNumOperands() < 3 || !It->getOperand(2).isImm()) return false;
+  if (It->getOperand(2).getImm() != 1) return false;
+  ++It;
+  skipDebug();
+  if (It == End) return false;
+
+  // 3. STA_StackRel to same offset
+  if (It->getOpcode() != W65816::STA_StackRel) return false;
+  if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false;
+  if (It->getOperand(0).getImm() != OffLo) return false;
+  ++It;
+  skipTaxTxa();
+  if (It == End) return false;
+
+  // 4. LDA_StackRel imm_hi (different offset)
+  if (It->getOpcode() != W65816::LDA_StackRel) return false;
+  if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false;
+  OffHi = It->getOperand(0).getImm();
+  if (OffHi == OffLo) return false;
+  ++It;
+  skipDebug();
+  if (It == End) return false;
+
+  // 5. ADCEi16imm with imm == 0
+  if (It->getOpcode() != W65816::ADCEi16imm) return false;
+  if (It->getNumOperands() < 3 || !It->getOperand(2).isImm()) return false;
+  if (It->getOperand(2).getImm() != 0) return false;
+  ++It;
+  skipDebug();
+  if (It == End) return false;
+
+  // 6. STA_StackRel to hi offset
+  if (It->getOpcode() != W65816::STA_StackRel) return false;
+  if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false;
+  if (It->getOperand(0).getImm() != OffHi) return false;
+  ++It;
+  PatEnd = It;
+  return true;
+}
+
+bool W65816I32IncFold::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction())) return false;
+  const auto &STI = MF.getSubtarget<W65816Subtarget>();
+  const auto *TII = STI.getInstrInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    auto It = MBB.begin();
+    while (It != MBB.end()) {
+      int64_t OffLo = 0, OffHi = 0;
+      MachineBasicBlock::iterator PatEnd;
+      SmallVector<MachineInstr *, 4> KillMe;
+      auto Start = It;
+      if (!matchI32AddOnePattern(It, MBB.end(), OffLo, OffHi, PatEnd, KillMe)) {
+        ++It;
+        continue;
+      }
+      // Erase any spurious TAX/TXA pseudo-saves we tolerated inside
+      // the pattern.  These are dead because STAfi's Defs=[A] was
+      // a conservative over-approximation; the A-source path preserves
+      // A in the actual asm.
+      for (MachineInstr *MI : KillMe) MI->eraseFromParent();
+      // Found the 6-instruction pattern, [Start, PatEnd).  Rewrite
+      // in-place: keep the LDA_StackRel for lo, replace ADCi16imm
+      // with INA_PSEUDO, keep STA_StackRel for lo, then replace the
+      // entire LDA-ADCE-STA hi-half triple with INC_HI_IF_CARRY_StackRel.
+      DebugLoc DL = Start->getDebugLoc();
+
+      // Walk to the ADCi16imm (Start+1) and replace.  Build a fresh
+      // INA_PSEUDO with the same tied-def shape: dst=A, src=A.
+      auto AdcIt = std::next(Start);
+      while (AdcIt != PatEnd && AdcIt->isDebugInstr()) ++AdcIt;
+      // INA_PSEUDO has constraint $src = $dst; emit with both as A.
+      // Operand layout: (outs Acc16:$dst), (ins Acc16:$src)
+      BuildMI(MBB, AdcIt, DL, TII->get(W65816::INA_PSEUDO), W65816::A)
+          .addReg(W65816::A);
+      auto Erased = AdcIt;
+      ++AdcIt;
+      Erased->eraseFromParent();
+
+      // Now find the start of the hi-half triple: it's at Start+3 (after
+      // skipping debug).  Walk past STA_StackRel (lo) which is now at
+      // AdcIt's position.
+      while (AdcIt != PatEnd && AdcIt->isDebugInstr()) ++AdcIt;
+      // AdcIt should now point at STA_StackRel (lo).  Skip it.
+      ++AdcIt;
+      while (AdcIt != PatEnd && AdcIt->isDebugInstr()) ++AdcIt;
+      // AdcIt now points at LDA_StackRel (hi) — start of the hi triple.
+      MachineBasicBlock::iterator HiStart = AdcIt;
+
+      // Insert INC_HI_IF_CARRY_StackRel before the hi triple, then
+      // erase all three hi instructions.
+      BuildMI(MBB, HiStart, DL, TII->get(W65816::INC_HI_IF_CARRY_StackRel))
+          .addImm(OffHi);
+
+      // Erase the 3 hi instructions: LDA_StackRel, ADCEi16imm, STA_StackRel.
+      auto KillIt = HiStart;
+      for (int i = 0; i < 3 && KillIt != PatEnd; ) {
+        if (KillIt->isDebugInstr()) { ++KillIt; continue; }
+        auto Next = std::next(KillIt);
+        KillIt->eraseFromParent();
+        KillIt = Next;
+        ++i;
+      }
+
+      Changed = true;
+      It = PatEnd;
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createW65816I32IncFold() {
+  return new W65816I32IncFold();
+}
diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
index 866228c..a261bb1 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -93,6 +94,18 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // via a bit-7 test and SELECT_CC (see LowerSignExtend).
   setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);
 
+  // BSWAP: no native byte-swap instruction (XBA swaps the two halves
+  // of the 16-bit accumulator only when in 8-bit M mode, hard to
+  // exploit cleanly).  Lower to shifts + ORs via the generic Expand
+  // path — SDAG turns `bswap(i32)` into four byte extracts ORed back
+  // together, which our existing patterns handle.  Required for
+  // portable C that constructs a big-endian word from byte loads:
+  // `((u32)b[0] << 24) | ((u32)b[1] << 16) | ((u32)b[2] << 8) | b[3]`
+  // (SHA-256 message-schedule, JPEG/PNG headers, etc.).
+  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
   // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare
   // LDA for the anyext case).  No native sextload; mark it Expand so
   // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`,
@@ -246,7 +259,11 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and
   // simpler than implementing a 32-bit shift in 65816 assembly inline.
   for (MVT VT : {MVT::i32}) {
-    setOperationAction(ISD::MUL,  VT, LibCall);
+    // MUL i32 is Custom-lowered: the typical fall-through libcall is
+    // __mulsi3 (32x32 -> 32), but when both operands are ZEXT from i16
+    // we can emit __umulhisi3 (16x16 -> 32) instead.  Saves ~60 cyc per
+    // call on the `(unsigned long)i * i` pattern — see LowerMUL_I32.
+    setOperationAction(ISD::MUL,  VT, Custom);
     setOperationAction(ISD::SDIV, VT, LibCall);
     setOperationAction(ISD::UDIV, VT, LibCall);
     setOperationAction(ISD::SREM, VT, LibCall);
@@ -319,6 +336,8 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // local-variable access across the alloca will miscompile.  A real
   // FP (DP slot or X-as-FP) would lift this restriction.
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
+  if (ptr32Active)
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 
   // Opt into PerformDAGCombine on LOAD nodes — needed for the
   // address-select reverse combine (see W65816TargetLowering::
@@ -1216,6 +1235,7 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:            return LowerI32Bin(Op, DAG);
+  case ISD::MUL:            return LowerMUL_I32(Op, DAG);
   case ISD::LOAD:           return LowerLoad(Op, DAG);
   case ISD::STORE:          return LowerStore(Op, DAG);
   case ISD::Constant:       return LowerI32Constant(Op, DAG);
@@ -1305,6 +1325,24 @@ SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
   SDLoc DL(Op);
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
+  EVT ResultVT = Op.getValueType();
+  // Under ptr32, both the result pointer and the size are Wide32 i32
+  // values.  Extract the i16 lo half of size (a VLA larger than 64KB
+  // doesn't fit in our stack anyway), do the i16 ALLOCA, then build
+  // the Wide32 result with bank=0 (stack is always bank 0).
+  if (ResultVT == MVT::i32) {
+    SDValue Size16 = (Size.getValueType() == MVT::i32)
+                       ? extractWide32Lo(DAG, DL, Size)
+                       : Size;
+    SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL,
+                                      DAG.getVTList(MVT::i16, MVT::Other),
+                                      Chain, Size16);
+    SDValue Ptr16 = ChainAndPtr.getValue(0);
+    SDValue NewChain = ChainAndPtr.getValue(1);
+    SDValue Bank = DAG.getConstant(0, DL, MVT::i16);
+    SDValue Ptr32 = buildWide32(DAG, DL, Ptr16, Bank);
+    return DAG.getMergeValues({Ptr32, NewChain}, DL);
+  }
   SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL,
                                     DAG.getVTList(MVT::i16, MVT::Other),
                                     Chain, Size);
@@ -1433,10 +1471,28 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   default: llvm_unreachable("not a shift");
   }
 
-  // makeLibCall wants the args as TargetLowering::ArgListEntry; the
-  // simpler getNode form is to manually build the call.  But the
-  // makeLibCall helper handles the calling convention.
-  SmallVector<SDValue, 2> Args = {Op.getOperand(0), Op.getOperand(1)};
+  SDValue Val = Op.getOperand(0);
+  if (IsI32 && Op.getOpcode() == ISD::SHL) {
+    // Force the high half of the input to be concretely zero when the
+    // shift count K is >= 16, so bits K..31 of the input are
+    // mathematically irrelevant.  SDAG legalisation can mark those bits
+    // as `undef` to give the regalloc freedom, but our libcall (a true
+    // 32-bit shift-and-rotate loop in libgcc.s) reads ALL 32 input
+    // bits and propagates garbage into the result's low half.  Caught
+    // by dadd via the dpack-inline `(u64 e) << 52` path which split
+    // into __ashlsi3(e_lo, 20) with X = undef → wrong mantissa.
+    // For SRL/SRA we'd zero/sign-extend the LOW half similarly when
+    // K >= 16, but those paths aren't exercising the bug yet.
+    if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned K = (unsigned)C->getZExtValue();
+      if (K >= 16) {
+        SDValue Lo = extractWide32Lo(DAG, SDLoc(Op), Val);
+        SDValue Zero = DAG.getConstant(0, SDLoc(Op), MVT::i16);
+        Val = buildWide32(DAG, SDLoc(Op), Lo, Zero);
+      }
+    }
+  }
+  SmallVector<SDValue, 2> Args = {Val, Op.getOperand(1)};
   TargetLowering::MakeLibCallOptions Opts;
   Opts.setIsSigned(Op.getOpcode() == ISD::SRA);
   return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first;
@@ -2144,9 +2200,75 @@ W65816TargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
   }
+
   return SDValue();
 }
 
+// Custom-lowering for ISD::MUL i32.  When both operands are ZEXT from
+// i16 (or provably have high 16 bits = 0), emit a libcall to
+// __umulhisi3 (16x16 -> 32) instead of the heavier __mulsi3 (32x32 ->
+// 32).  Saves the 32-bit arg marshaling AND the 32-bit accumulator
+// math inside the libcall — roughly equivalent to Calypsi 5.16's
+// `_Mul16`.  Falls through to the standard __mulsi3 libcall otherwise.
+SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::i32 && "LowerMUL_I32 expects i32");
+  SDValue Lhs = Op.getOperand(0);
+  SDValue Rhs = Op.getOperand(1);
+
+  auto narrowToI16 = [&](SDValue V) -> SDValue {
+    // Explicit zext-from-i16 (the IR-level form, before SDAG flattening).
+    if (V.getOpcode() == ISD::ZERO_EXTEND &&
+        V.getOperand(0).getValueType() == MVT::i16)
+      return V.getOperand(0);
+    // ANY_EXTEND-from-i16 is also fine since multiplication of the low
+    // 16 bits gives the same 32-bit result whatever the high bits were.
+    if (V.getOpcode() == ISD::ANY_EXTEND &&
+        V.getOperand(0).getValueType() == MVT::i16)
+      return V.getOperand(0);
+    // High 16 bits provably zero?
+    KnownBits K = DAG.computeKnownBits(V);
+    if (K.countMinLeadingZeros() >= 16)
+      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, V);
+    return SDValue();
+  };
+
+  SDValue A = narrowToI16(Lhs);
+  SDValue B = narrowToI16(Rhs);
+  if (A && B) {
+    TargetLowering::ArgListTy Args;
+    Args.push_back({A, Type::getInt16Ty(*DAG.getContext())});
+    Args.push_back({B, Type::getInt16Ty(*DAG.getContext())});
+    SDValue Callee = DAG.getExternalSymbol(
+        "__umulhisi3", getPointerTy(DAG.getDataLayout()));
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(DL)
+        .setChain(DAG.getEntryNode())
+        .setLibCallee(CallingConv::C,
+                      Type::getInt32Ty(*DAG.getContext()),
+                      Callee, std::move(Args));
+    auto [Ret, Chain] = LowerCallTo(CLI);
+    return Ret;
+  }
+
+  // Fall back to the standard __mulsi3 libcall.
+  TargetLowering::ArgListTy Args;
+  Args.push_back({Lhs, Type::getInt32Ty(*DAG.getContext())});
+  Args.push_back({Rhs, Type::getInt32Ty(*DAG.getContext())});
+  SDValue Callee = DAG.getExternalSymbol(
+      "__mulsi3", getPointerTy(DAG.getDataLayout()));
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL)
+      .setChain(DAG.getEntryNode())
+      .setLibCallee(CallingConv::C,
+                    Type::getInt32Ty(*DAG.getContext()),
+                    Callee, std::move(Args));
+  auto [Ret, Chain] = LowerCallTo(CLI);
+  return Ret;
+}
+
 // Map a W65816CC code to the matching Bxx opcode.
 static unsigned getBranchOpcodeForCC(unsigned CC) {
   switch (CC) {
diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.h b/src/llvm/lib/Target/W65816/W65816ISelLowering.h
index 1d640af..c8783a3 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelLowering.h
+++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.h
@@ -218,6 +218,11 @@ private:
   SDValue LowerI32Bin(SDValue Op, SelectionDAG &DAG) const;
   // i32 ConstantNode: split into two i16 constants and REG_SEQUENCE.
   SDValue LowerI32Constant(SDValue Op, SelectionDAG &DAG) const;
+  // i32 MUL: detect (zext i16 a) * (zext i16 b) — or operands with
+  // provably-zero high 16 bits — and emit __umulhisi3 (16x16 -> 32)
+  // instead of __mulsi3 (32x32 -> 32).  Cuts ~30% off the canonical
+  // sumSquares-style loop.
+  SDValue LowerMUL_I32(SDValue Op, SelectionDAG &DAG) const;
 };
 
 } // namespace llvm
diff --git a/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp
new file mode 100644
index 0000000..1eeba0d
--- /dev/null
+++ b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp
@@ -0,0 +1,278 @@
+//===-- W65816ImgCalleeSave.cpp - Callee-side save/restore of IMG8..IMG15 -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Post-RA, pre-PEI pass that adds prologue save + epilogue restore for
+// IMG8..IMG15 ($C0..$CE) in any function that uses them.  This makes
+// IMG8..IMG15 behave as callee-saved AT THE ASM LEVEL without going
+// through LLVM's CSR mechanism (which would shift regalloc decisions
+// and break other tests — see history with
+// `feedback_picol_expr_compound_or.md`).
+//
+// Why callee-side, not caller-side?
+//
+//   Callers can hold long-lived vregs in IMG8..IMG15 (regalloc treats
+//   them as preserved across calls because they're not in JSLpseudo's
+//   Defs).  The "obvious" fix — add them to Defs and force regalloc to
+//   spill them across each call — interacts badly with stack-slot
+//   coloring: the spill slot gets coalesced with another vreg whose
+//   liveness appears disjoint, but the post-call reload makes the
+//   lifetimes overlap and the reload reads garbage (caught by qsort,
+//   strncat, etc. when IMG0..IMG7 were also in Defs).
+//
+//   By doing the save/restore on the CALLEE side instead, the caller
+//   doesn't need to spill at all — its values in IMG8..IMG15 are
+//   automatically preserved.  Only functions that USE IMG8..IMG15 pay
+//   the cost (a few bytes of prologue/epilogue), and the cost is
+//   amortized across the whole function (not per call).
+//
+// Why post-RA, not via LLVM's CSR mechanism?
+//
+//   Adding IMG8..IMG15 to `getCalleeSavedRegs()` makes LLVM treat them
+//   as "expensive" in cost-of-use analysis.  Regalloc steers away from
+//   them in functions that don't really need them, but that steering
+//   changes coloring decisions in ways that broke strtol
+//   (`strtol(" 0x1ABC ", &ep, 16)` returned 0).  Implementing
+//   save/restore outside the CSR system keeps regalloc's decisions
+//   unchanged: it sees IMG8..IMG15 as ordinary regs, uses them freely
+//   under pressure, and this pass adds the asm-level bookkeeping.
+//
+// Why pre-PEI?
+//
+//   PEI is what assigns frame-index offsets and emits the actual
+//   prologue/epilogue.  To add new spill slots, we need PEI to see
+//   them so they get included in the frame size.  We use
+//   `MFI.CreateStackObject` to register the slots, then emit STAfi /
+//   LDAfi pseudos that PEI will lower to `STA d,s` / `LDA d,s`.
+//
+//   We also insert the save/restore as REGULAR MIR instructions BEFORE
+//   PEI runs.  That means PEI sees them when it emits its frame setup,
+//   and the STAfi/LDAfi disps are valid post-PEI.
+//
+//===----------------------------------------------------------------------===//
+
+#include "W65816.h"
+#include "W65816InstrInfo.h"
+#include "W65816Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "w65816-img-callee-save"
+
+namespace {
+
+class W65816ImgCalleeSave : public MachineFunctionPass {
+public:
+  static char ID;
+  W65816ImgCalleeSave() : MachineFunctionPass(ID) {}
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return "W65816 callee-side save/restore for IMG8..IMG15";
+  }
+};
+
+} // namespace
+
+char W65816ImgCalleeSave::ID = 0;
+
+INITIALIZE_PASS(W65816ImgCalleeSave, DEBUG_TYPE,
+                "W65816 IMG8..IMG15 callee save/restore", false, false)
+
+namespace llvm {
+void initializeW65816ImgCalleeSavePass(PassRegistry &);
+}
+
+FunctionPass *llvm::createW65816ImgCalleeSave() {
+  return new W65816ImgCalleeSave();
+}
+
+// IMG8..IMG15 physregs (in order so IMG_REGS[i] is the i'th high-half slot).
+// Their DP addresses are $C0, $C2, ..., $CE (each slot is 16 bits = 2 bytes).
+static constexpr unsigned IMG_REGS[8] = {
+    W65816::IMG8,  W65816::IMG9,  W65816::IMG10, W65816::IMG11,
+    W65816::IMG12, W65816::IMG13, W65816::IMG14, W65816::IMG15};
+static constexpr unsigned IMG_DP[8] = {0xC0, 0xC2, 0xC4, 0xC6,
+                                       0xC8, 0xCA, 0xCC, 0xCE};
+
+static int classifyImgReg(unsigned Reg) {
+  for (int i = 0; i < 8; ++i)
+    if (Reg == IMG_REGS[i])
+      return i;
+  return -1;
+}
+
+// Map a DP-addressed instruction's first immediate operand to an IMG
+// slot index if it falls in $C0..$CE.  Returns -1 otherwise.
+static int classifyDpImmAsImg(const MachineInstr &MI) {
+  // Most DP-addressed opcodes take the dp address as immediate op 0.
+  // (Some, like ADC_DP-form-with-explicit-A, may put the imm at op 1.)
+  // For our scan, check the first IMM operand we find.
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  case W65816::LDA_DP:
+  case W65816::STA_DP:
+  case W65816::STZ_DP:
+  case W65816::LDX_DP:
+  case W65816::STX_DP:
+  case W65816::LDY_DP:
+  case W65816::STY_DP:
+  case W65816::ADC_DP:
+  case W65816::SBC_DP:
+  case W65816::AND_DP:
+  case W65816::ORA_DP:
+  case W65816::EOR_DP:
+  case W65816::CMP_DP:
+  case W65816::CPX_DP:
+  case W65816::CPY_DP:
+  case W65816::BIT_DP:
+  case W65816::INC_DP:
+  case W65816::DEC_DP:
+  case W65816::ASL_DP:
+  case W65816::LSR_DP:
+  case W65816::ROL_DP:
+  case W65816::ROR_DP:
+    break;
+  default:
+    return -1;
+  }
+  for (const auto &MO : MI.operands()) {
+    if (!MO.isImm()) continue;
+    int64_t V = MO.getImm();
+    for (int i = 0; i < 8; ++i)
+      if ((int64_t)IMG_DP[i] == V)
+        return i;
+    return -1;  // First imm is the dp addr; not in IMG range.
+  }
+  return -1;
+}
+
+bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) {
+  // Step 1: scan for IMG8..IMG15 usage.  copyPhysReg already lowered
+  // some COPY $imgN = $a forms to STA_DP imm:0xC0 (etc.), so we have
+  // to check both the physreg form AND the DP-immediate form.
+  bool UsedSlot[8] = {false};
+  bool AnyUsed = false;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      // physreg form: $imgN = ... or ... = $imgN
+      for (const auto &MO : MI.operands()) {
+        if (!MO.isReg() || MO.getReg() == 0) continue;
+        int idx = classifyImgReg(MO.getReg());
+        if (idx >= 0) {
+          UsedSlot[idx] = true;
+          AnyUsed = true;
+        }
+      }
+      // DP-imm form: lda dp imm:0xC0 etc.
+      int idx = classifyDpImmAsImg(MI);
+      if (idx >= 0) {
+        UsedSlot[idx] = true;
+        AnyUsed = true;
+      }
+    }
+  }
+  if (!AnyUsed) return false;
+
+  // Step 2: allocate one frame slot per used IMG.  Size = 2 bytes (each
+  // Img16 holds a 16-bit value).  Mark as a spill slot so PEI accounts
+  // for it; isSpillSlot=true means slot coloring CAN coalesce it with
+  // other spill slots — but the STAfi/LDAfi we emit reference this slot
+  // by FrameIndex, and the only writes to this FI are our save/restore
+  // pair, so coloring can't break the round-trip.
+  //
+  // (The picol-expr bug came from a SHARED slot with two DIFFERENT
+  // vregs writing to it; here we have one FI per IMG and a single
+  // write/read pair per function, so coloring can't trip on this.)
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  int FrameSlots[8];
+  for (int i = 0; i < 8; ++i) {
+    FrameSlots[i] = -1;
+    if (UsedSlot[i])
+      FrameSlots[i] = MFI.CreateStackObject(2, Align(2),
+                                             /*isSpillSlot=*/true);
+  }
+
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
+  // Step 3: emit prologue save.  Insert at entry MBB.begin() so PEI's
+  // emitPrologue (which inserts BEFORE existing MBB.begin) places its
+  // frame setup BEFORE our saves — the right order, since our saves
+  // reference frame slots whose disps require post-TCS S.
+  //
+  // Single PHA/PLA bracket around ALL slot saves (vs per-slot bracket).
+  // For N used slots:
+  //   per-slot:   N * (PHA + LDA dp + STA d,s + PLA) = 16N cyc, 6N bytes
+  //   single:     PHA + N*(LDA dp + STA d,s) + PLA   = 8+8N cyc, 2+4N bytes
+  // Saves 8 cyc + 2 bytes per additional slot beyond the first.
+  //
+  // The +2 ImmOffset on STAfi compensates for PHA's SP shift; same +2
+  // applies to every slot inside the bracket since SP is constant
+  // throughout.
+  MachineBasicBlock &EntryMBB = MF.front();
+  MachineBasicBlock::iterator EntryIt = EntryMBB.begin();
+  BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::PHA));
+  for (int i = 0; i < 8; ++i) {
+    if (!UsedSlot[i]) continue;
+    BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::LDA_DP))
+        .addImm(IMG_DP[i])
+        .addReg(W65816::A, RegState::ImplicitDefine);
+    BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::STAfi))
+        .addReg(W65816::A)
+        .addFrameIndex(FrameSlots[i])
+        .addImm(2)
+        .addReg(W65816::A, RegState::ImplicitDefine);
+  }
+  BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::PLA))
+      .addReg(W65816::A, RegState::ImplicitDefine);
+
+  // Step 4: emit epilogue restore at each return MBB, just BEFORE the
+  // RTL/RTS/RTI (so the IMG restore happens before the frame teardown
+  // — wait, no: we want the IMG restore BEFORE we tear down the frame
+  // because our STAfi/LDAfi reference frame slots).  Insert just
+  // before the LAST terminator (the return).  PEI's emitEpilogue will
+  // insert its frame-teardown AFTER our restores (BEFORE the return),
+  // which means our `,s` disps see the post-TCS S still.
+  //
+  // Pattern per slot (preserving A, which may hold the return value):
+  //   PHA              (preserve A; SP shifts)
+  //   LDAfi A, <slot>  (A = saved IMGn value via `lda <disp+2>,s`)
+  //   STA <imgN dp>    (write back to IMGn)
+  //   PLA              (restore A)
+  for (auto &MBB : MF) {
+    if (MBB.empty()) continue;
+    auto LastIt = std::prev(MBB.end());
+    while (LastIt != MBB.begin() && LastIt->isDebugInstr())
+      --LastIt;
+    unsigned LastOpc = LastIt->getOpcode();
+    if (LastOpc != W65816::RTL && LastOpc != W65816::RTS &&
+        LastOpc != W65816::RTI)
+      continue;
+
+    // Single PHA/PLA bracket for all restores (same optimization as save).
+    BuildMI(MBB, LastIt, DL, TII->get(W65816::PHA));
+    for (int i = 7; i >= 0; --i) {
+      if (!UsedSlot[i]) continue;
+      BuildMI(MBB, LastIt, DL, TII->get(W65816::LDAfi))
+          .addReg(W65816::A, RegState::Define)
+          .addFrameIndex(FrameSlots[i])
+          .addImm(2);
+      BuildMI(MBB, LastIt, DL, TII->get(W65816::STA_DP))
+          .addImm(IMG_DP[i])
+          .addReg(W65816::A, RegState::Implicit);
+    }
+    BuildMI(MBB, LastIt, DL, TII->get(W65816::PLA))
+        .addReg(W65816::A, RegState::ImplicitDefine);
+  }
+
+  return true;
+}
diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp
index 7d5c83a..9c475e3 100644
--- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp
+++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp
@@ -454,22 +454,127 @@ int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const {
   return TargetInstrInfo::getSPAdjust(MI);
 }
 
+// Conditional branch opcode predicate.
+static bool isCondBranch(unsigned Opc) {
+  switch (Opc) {
+  case W65816::BEQ:
+  case W65816::BNE:
+  case W65816::BCS:
+  case W65816::BCC:
+  case W65816::BMI:
+  case W65816::BPL:
+  case W65816::BVS:
+  case W65816::BVC:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Unconditional direct-target branch predicate.  Excludes JMP_AbsInd
+// (indirect) and JML_Long (different operand kind).
+static bool isUncondDirectBranch(unsigned Opc) {
+  return Opc == W65816::BRA || Opc == W65816::BRL ||
+         Opc == W65816::JMP_Abs;
+}
+
+// Map a conditional Bxx to its inverse condition (BEQ↔BNE, etc.).
+// Returns 0 if not a recognised conditional.
+static unsigned invertCondOpcode(unsigned Opc) {
+  switch (Opc) {
+  case W65816::BEQ: return W65816::BNE;
+  case W65816::BNE: return W65816::BEQ;
+  case W65816::BCS: return W65816::BCC;
+  case W65816::BCC: return W65816::BCS;
+  case W65816::BMI: return W65816::BPL;
+  case W65816::BPL: return W65816::BMI;
+  case W65816::BVS: return W65816::BVC;
+  case W65816::BVC: return W65816::BVS;
+  default: return 0;
+  }
+}
+
+MachineBasicBlock *
+W65816InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
+  // All our direct branches encode the target MBB in operand 0.
+  if (MI.getNumOperands() < 1 || !MI.getOperand(0).isMBB())
+    return nullptr;
+  return MI.getOperand(0).getMBB();
+}
+
 bool W65816InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
-  // Return "unanalyzable" — we don't decode our BR_CC pseudos here.
-  // BranchFolder treats a true return as "leave this block alone",
-  // which avoids the default insertBranch llvm_unreachable.
-  return true;
+  TBB = nullptr;
+  FBB = nullptr;
+  Cond.clear();
+
+  // We deliberately keep conditional branches (BEQ/BNE/etc.) opaque to
+  // BranchFolder.  Their condition is encoded in the OPCODE and the
+  // flag input is an implicit use of P set by a preceding CMP/etc.;
+  // BranchFolder doesn't track that the CMP must stay adjacent, so
+  // if it re-inserts the Bxx in a tail-merged block the flag input
+  // becomes whatever earlier instruction last clobbered P.  Caught by
+  // the softDouble dadd smoke (1.5 + 2.5 != 4.0) once we tried to make
+  // conditional branches analyzable.
+  //
+  // What we DO analyze:
+  //   * Empty terminator sequence (pure fall-through) — return
+  //     analyzable with no targets so MachineBlockPlacement's assert
+  //     about fall-through blocks is satisfied trivially.
+  //   * Single unconditional direct branch (BRA / BRL / JMP_Abs) —
+  //     return analyzable with TBB set, no Cond.  Safe to move because
+  //     no flag dependency.
+  // Everything else (Bxx in any position, indirect jumps, multiple
+  // terminators, etc.) stays unanalyzable.
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+  while (I != MBB.end() && I->isDebugInstr())
+    ++I;
+  if (I == MBB.end())
+    return false;  // No terminators: pure fall-through.
+
+  unsigned FirstOpc = I->getOpcode();
+  if (!isUncondDirectBranch(FirstOpc))
+    return true;  // Conditional or unknown.  Stay opaque.
+
+  // Single unconditional direct branch — analyzable.
+  TBB = getBranchDestBlock(*I);
+  if (!TBB)
+    return true;
+  auto Next = std::next(I);
+  while (Next != MBB.end() && Next->isDebugInstr())
+    ++Next;
+  if (Next != MBB.end())
+    return true;  // Extra terminators after unconditional.
+  return false;
 }
 
 unsigned W65816InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
   if (BytesRemoved)
     *BytesRemoved = 0;
-  return 0;
+  unsigned NumRemoved = 0;
+  // Walk from the end, removing trailing direct branches.  Stop when
+  // we hit a non-branch or a branch we can't analyze (e.g. JMP_AbsInd).
+  while (!MBB.empty()) {
+    auto It = std::prev(MBB.end());
+    if (It->isDebugInstr()) {
+      // Skip debug instructions but don't delete them.
+      if (It == MBB.begin())
+        break;
+      --It;
+    }
+    unsigned Opc = It->getOpcode();
+    if (!isCondBranch(Opc) && !isUncondDirectBranch(Opc))
+      break;
+    if (BytesRemoved)
+      *BytesRemoved += getInstSizeInBytes(*It);
+    It->eraseFromParent();
+    ++NumRemoved;
+  }
+  return NumRemoved;
 }
 
 unsigned W65816InstrInfo::insertBranch(MachineBasicBlock &MBB,
@@ -478,11 +583,49 @@ unsigned W65816InstrInfo::insertBranch(MachineBasicBlock &MBB,
                                        ArrayRef<MachineOperand> Cond,
                                        const DebugLoc &DL,
                                        int *BytesAdded) const {
-  // Should not be called: analyzeBranch returns true so BranchFolder
-  // treats blocks as unanalyzable and never asks us to insert.
+  assert(TBB && "insertBranch requires a true target");
+  assert((Cond.empty() || Cond.size() == 1) &&
+         "W65816 branch conditions are single-operand (opcode)");
+
   if (BytesAdded)
     *BytesAdded = 0;
-  return 0;
+  unsigned NumAdded = 0;
+
+  if (Cond.empty()) {
+    // Unconditional branch.  Use BRA — W65816AsmBackend auto-relaxes
+    // to BRL when the displacement exceeds an 8-bit signed offset.
+    auto MI = BuildMI(&MBB, DL, get(W65816::BRA)).addMBB(TBB);
+    if (BytesAdded)
+      *BytesAdded += getInstSizeInBytes(*MI);
+    return 1;
+  }
+
+  // Conditional branch using the opcode stored in Cond[0].
+  unsigned CondOpc = Cond[0].getImm();
+  auto MIc = BuildMI(&MBB, DL, get(CondOpc)).addMBB(TBB);
+  if (BytesAdded)
+    *BytesAdded += getInstSizeInBytes(*MIc);
+  ++NumAdded;
+
+  // If there's also a false target, emit an unconditional branch to it.
+  if (FBB) {
+    auto MIu = BuildMI(&MBB, DL, get(W65816::BRA)).addMBB(FBB);
+    if (BytesAdded)
+      *BytesAdded += getInstSizeInBytes(*MIu);
+    ++NumAdded;
+  }
+  return NumAdded;
+}
+
+bool W65816InstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond.size() != 1)
+    return true;
+  unsigned Inverted = invertCondOpcode(Cond[0].getImm());
+  if (!Inverted)
+    return true;
+  Cond[0].setImm(Inverted);
+  return false;
 }
 
 unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.h b/src/llvm/lib/Target/W65816/W65816InstrInfo.h
index 4074c2f..8341bd7 100644
--- a/src/llvm/lib/Target/W65816/W65816InstrInfo.h
+++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.h
@@ -95,13 +95,13 @@ public:
   // int b, int c) { return a*b + c; }` under fast regalloc).
   int getSPAdjust(const MachineInstr &MI) const override;
 
-  // Branch-control hooks — minimal stubs that opt our blocks out of
-  // BranchFolder's tail-merging pass.  Return "unanalyzable" from
-  // analyzeBranch so BranchFolder leaves the block alone; the empty
-  // remove/insertBranch stubs are required by the contract but never
-  // actually invoked in the unanalyzable path.  Pre-ptr32 the smoke
-  // never hit BranchFolder via this entry; under ptr32 it does
-  // (multi-pattern test at smoke #7).
+  // Branch-control hooks.  These now decode our real branch opcodes
+  // (BEQ/BNE/BCS/BCC/BMI/BPL/BVS/BVC and BRA/BRL/JMP_Abs) so
+  // BranchFolder and MachineBlockPlacement can rearrange blocks.
+  // Cond is encoded as a single Imm operand holding the conditional
+  // branch's opcode; reverseBranchCondition flips it via opcode map.
+  // JMP_AbsInd / JML_Long return "unanalyzable" — they're indirect or
+  // bank-crossing, which the layout passes can't reason about.
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
@@ -112,6 +112,10 @@ public:
                         MachineBasicBlock *FBB,
                         ArrayRef<MachineOperand> Cond, const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+  bool reverseBranchCondition(
+      SmallVectorImpl<MachineOperand> &Cond) const override;
+  MachineBasicBlock *getBranchDestBlock(
+      const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td
index 2d46efe..f678d8a 100644
--- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td
+++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td
@@ -797,7 +797,13 @@ def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr),
 // with-IMG-source that clobbered $a, silently storing X's value where
 // A's was expected — observed as `dadd(1.5,2.5) → 0x4010_0000_3000_3000`
 // under full IMG-clobber.
-let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Defs = [A] in {
+//
+// Note: Defs = [A] triggers a greedy-regalloc assertion failure
+// (LiveRangeEdit::eliminateDeadDef on a KILL pseudo with non-dead
+// implicit-def $a) on functions with many cross-call Acc16 vregs
+// (atoi, etc.).  Greedy is currently disabled — basic regalloc avoids
+// the bad path.
+let mayStore = 1, hasSideEffects = 1, mayLoad = 0, Defs = [A] in {
 def STAfi : W65816Pseudo<(outs),
                          (ins Wide16:$src, memfi:$addr),
                          "# STAfi $src, $addr", []>;
@@ -1604,6 +1610,23 @@ def EOR_StackRel : InstStackRel<0x43, "eor">;
 def LDA_StackRelIndY : InstStackRelIndY<0xB3, "lda">;
 def STA_StackRelIndY : InstStackRelIndY<0x93, "sta">;
 
+// Pseudo: conditional-increment of the hi half of an i32 spilled to a
+// pair of stack-rel slots.  Emitted by W65816I32IncFold when the
+// preceding LDA-INA-STA on the lo half established Z based on the
+// post-INA value (Z=1 means the lo wrapped to 0, i.e. a carry into hi).
+// AsmPrinter expands to:
+//   bne L_skip
+//   lda $imm, s
+//   inc a
+//   sta $imm, s
+// L_skip:
+let mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+    Defs = [A] in {
+def INC_HI_IF_CARRY_StackRel : W65816Pseudo<(outs), (ins i16imm:$off),
+                                            "# INC_HI_IF_CARRY_StackRel $off",
+                                            []>;
+}
+
 //===----------------------------------------------------------------------===//
 // Branch patterns (placed after the Bxx defs).
 //
diff --git a/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp b/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp
new file mode 100644
index 0000000..0394d6d
--- /dev/null
+++ b/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp
@@ -0,0 +1,150 @@
+//===-- W65816NarrowI32Mul.cpp - Narrow i32 multiplies -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// IR-level peephole.  Detects `mul i32 X, Y` where both X and Y have
+// their top 16 bits provably zero (via LLVM's IR-level computeKnownBits)
+// and rewrites to a call to `__umulhisi3` — a 16x16 -> 32 unsigned
+// multiply (~30% faster than __mulsi3 for `(u32)i * i` patterns).
+//
+// Why an IR pass instead of a Custom SDAG lowering: LLVM's IndVarSimplify
+// loop pass widens narrow induction variables (e.g. an i16 loop counter
+// later zext'd to i32) into i32 PHIs.  By SDAG-build time the zext is
+// gone — the MUL's operand is just `CopyFromReg %2:i32`, an opaque value.
+// SDAG's computeKnownBits can't trace back across BB boundaries through
+// CopyFromReg.  IR-level computeKnownBits, by contrast, walks the use-def
+// graph (including PHIs) and can prove the high bits zero.
+//
+// Runs in addISelPrepare (right before SDAG-ISel) so it sees the
+// final-shape IR.  The libcall declaration is auto-added if missing.
+//
+//===---------------------------------------------------------------------===//
+
+#include "W65816.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "w65816-narrow-i32-mul"
+
+
+namespace {
+
+
+class W65816NarrowI32Mul : public FunctionPass {
+public:
+  static char ID;
+  W65816NarrowI32Mul() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "W65816 narrow i32 multiplies to __umulhisi3";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+
+} // namespace
+
+
+char W65816NarrowI32Mul::ID = 0;
+
+INITIALIZE_PASS_BEGIN(W65816NarrowI32Mul, DEBUG_TYPE,
+                      "W65816 narrow i32 multiplies", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(W65816NarrowI32Mul, DEBUG_TYPE,
+                    "W65816 narrow i32 multiplies", false, false)
+
+
+// Get-or-declare `__umulhisi3(i16, i16) -> i32` in the module.
+static FunctionCallee getUmulhisi3(Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  Type *I16 = Type::getInt16Ty(Ctx);
+  Type *I32 = Type::getInt32Ty(Ctx);
+  FunctionType *FT = FunctionType::get(I32, {I16, I16}, false);
+  return M.getOrInsertFunction("__umulhisi3", FT);
+}
+
+
+// True iff the top 16 bits of V are known zero.  Tries IR-level
+// computeKnownBits first; if that doesn't prove enough, falls back
+// to ScalarEvolution's unsigned-range analysis (which handles
+// loop-bounded induction variables that KnownBits can't).
+static bool top16Zero(Value *V, const DataLayout &DL, ScalarEvolution &SE) {
+  KnownBits K = computeKnownBits(V, DL);
+  if (K.countMinLeadingZeros() >= 16) {
+    return true;
+  }
+  if (!SE.isSCEVable(V->getType())) {
+    return false;
+  }
+  const SCEV *S = SE.getSCEV(V);
+  ConstantRange R = SE.getUnsignedRange(S);
+  return R.getActiveBits() <= 16;
+}
+
+
+bool W65816NarrowI32Mul::runOnFunction(Function &F) {
+  Module *M = F.getParent();
+  const DataLayout &DL = M->getDataLayout();
+  Type *I16 = Type::getInt16Ty(F.getContext());
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+  SmallVector<BinaryOperator *, 8> Worklist;
+  for (Instruction &I : instructions(F)) {
+    auto *BO = dyn_cast<BinaryOperator>(&I);
+    if (!BO || BO->getOpcode() != Instruction::Mul) {
+      continue;
+    }
+    if (!BO->getType()->isIntegerTy(32)) {
+      continue;
+    }
+    if (!top16Zero(BO->getOperand(0), DL, SE)) {
+      continue;
+    }
+    if (!top16Zero(BO->getOperand(1), DL, SE)) {
+      continue;
+    }
+    Worklist.push_back(BO);
+  }
+
+  if (Worklist.empty()) {
+    return false;
+  }
+
+  FunctionCallee Callee = getUmulhisi3(*M);
+  for (BinaryOperator *BO : Worklist) {
+    IRBuilder<> B(BO);
+    Value *A = B.CreateTrunc(BO->getOperand(0), I16);
+    Value *Bv = B.CreateTrunc(BO->getOperand(1), I16);
+    Value *Call = B.CreateCall(Callee, {A, Bv});
+    BO->replaceAllUsesWith(Call);
+    BO->eraseFromParent();
+  }
+  return true;
+}
+
+
+FunctionPass *llvm::createW65816NarrowI32Mul() {
+  return new W65816NarrowI32Mul();
+}
diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp
index c4f0af7..870766d 100644
--- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp
+++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp
@@ -69,16 +69,21 @@ static bool expandFarFI(MachineInstr &MI, int FPOff,
   switch (Opc) {
   case W65816::LDAfi: {
     Register Dst = MI.getOperand(0).getReg();
-    BuildMI(MBB, II, DL, TII.get(W65816::PHY))
-        .addReg(W65816::Y, RegState::Implicit);
+    // Mark Y use as Undef: if Y is dead at this insertion point, the
+    // value we save is "don't care" — we restore the same garbage byte
+    // later.  Without Undef, the verifier rejects when no def reaches
+    // (cause of the sha256_transform crash: STY_DP $FA emitted in the
+    // round-loop preheader before any LDY definition was reachable).
+    BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA)
+        .addReg(W65816::Y, RegState::Implicit | RegState::Undef);
     BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16))
         .addImm(FPOff)
         .addReg(W65816::Y, RegState::ImplicitDefine);
-    BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndY))
+    BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndLongY))
         .addImm(0xF6)
         .addReg(W65816::A, RegState::ImplicitDefine)
         .addReg(W65816::Y, RegState::Implicit);
-    BuildMI(MBB, II, DL, TII.get(W65816::PLY))
+    BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA)
         .addReg(W65816::Y, RegState::ImplicitDefine);
     if (Dst == W65816::X)
       BuildMI(MBB, II, DL, TII.get(W65816::TAX));
@@ -91,26 +96,26 @@ static bool expandFarFI(MachineInstr &MI, int FPOff,
     int srcDP = imgRegToDP(Src);
     if (srcDP >= 0)
       BuildMI(MBB, II, DL, TII.get(W65816::LDA_DP)).addImm(srcDP);
-    BuildMI(MBB, II, DL, TII.get(W65816::PHY))
-        .addReg(W65816::Y, RegState::Implicit);
+    BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA)
+        .addReg(W65816::Y, RegState::Implicit | RegState::Undef);
     BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff);
-    BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndY))
+    BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndLongY))
         .addImm(0xF6)
         .addReg(W65816::A, RegState::Implicit)
         .addReg(W65816::Y, RegState::Implicit);
-    BuildMI(MBB, II, DL, TII.get(W65816::PLY));
+    BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA);
     return true;
   }
   case W65816::STA8fi: {
     BuildMI(MBB, II, DL, TII.get(W65816::SEP)).addImm(0x20)
         .addReg(W65816::P, RegState::ImplicitDefine);
-    BuildMI(MBB, II, DL, TII.get(W65816::PHY))
-        .addReg(W65816::Y, RegState::Implicit);
+    BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA)
+        .addReg(W65816::Y, RegState::Implicit | RegState::Undef);
     BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff);
-    BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndY))
+    BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndLongY))
         .addImm(0xF6)
         .addReg(W65816::A, RegState::Implicit);
-    BuildMI(MBB, II, DL, TII.get(W65816::PLY));
+    BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA);
     BuildMI(MBB, II, DL, TII.get(W65816::REP)).addImm(0x20)
         .addReg(W65816::P, RegState::ImplicitDefine);
     return true;
@@ -126,13 +131,13 @@ static bool expandFarFI(MachineInstr &MI, int FPOff,
     // op's flags from a downstream consumer.
     BuildMI(MBB, II, DL, TII.get(W65816::STA_DP)).addImm(0xE2)
         .addReg(W65816::A, RegState::Implicit);
-    BuildMI(MBB, II, DL, TII.get(W65816::PHY))
-        .addReg(W65816::Y, RegState::Implicit);
+    BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA)
+        .addReg(W65816::Y, RegState::Implicit | RegState::Undef);
     BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff);
-    BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndY)).addImm(0xF6)
+    BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xF6)
         .addReg(W65816::A, RegState::ImplicitDefine)
         .addReg(W65816::Y, RegState::Implicit);
-    BuildMI(MBB, II, DL, TII.get(W65816::PLY))
+    BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA)
         .addReg(W65816::Y, RegState::ImplicitDefine);
     unsigned OpDPOpc = 0;
     switch (Opc) {
@@ -167,17 +172,17 @@ static bool expandFarFI(MachineInstr &MI, int FPOff,
     //   SBC/CMP $E2
     BuildMI(MBB, II, DL, TII.get(W65816::STA_DP)).addImm(0xE0)
         .addReg(W65816::A, RegState::Implicit);
-    BuildMI(MBB, II, DL, TII.get(W65816::PHY))
-        .addReg(W65816::Y, RegState::Implicit);
+    BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA)
+        .addReg(W65816::Y, RegState::Implicit | RegState::Undef);
     BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff);
-    BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndY)).addImm(0xF6)
+    BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xF6)
         .addReg(W65816::A, RegState::ImplicitDefine)
         .addReg(W65816::Y, RegState::Implicit);
     BuildMI(MBB, II, DL, TII.get(W65816::STA_DP)).addImm(0xE2)
         .addReg(W65816::A, RegState::Implicit);
     BuildMI(MBB, II, DL, TII.get(W65816::LDA_DP)).addImm(0xE0)
         .addReg(W65816::A, RegState::ImplicitDefine);
-    BuildMI(MBB, II, DL, TII.get(W65816::PLY))
+    BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA)
         .addReg(W65816::Y, RegState::ImplicitDefine);
     if (Opc == W65816::CMPfi) {
       BuildMI(MBB, II, DL, TII.get(W65816::CMP_DP)).addImm(0xE2)
@@ -268,7 +273,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
     int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
     if (FrameOffset < 0) Offset += 1;
-    if (Offset < 0 || Offset > 0xFF) {
+    if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) {
       // Far slot.  Use FP if reserved.  FP-relative offset excludes
       // SPAdj because $F6 captures S after prologue, before any
       // intermediate PUSH16 inside a call sequence.
@@ -342,7 +347,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     // in callee), so they don't need the skew.
     int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
     if (FrameOffset < 0) Offset += 1;
-    if (Offset < 0 || Offset > 0xFF) {
+    if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) {
       if (MF.getInfo<W65816MachineFunctionInfo>()->getUsesDpFP()) {
         int FPOff = FrameOffset + ImmOffset + (int)MFI.getStackSize();
         if (FrameOffset < 0) FPOff += 1;
@@ -434,7 +439,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
     int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
     if (FrameOffset < 0) Offset += 1;  // empty-descending SP skew (see STAfi)
-    if (Offset < 0 || Offset > 0xFF) {
+    if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) {
       if (MF.getInfo<W65816MachineFunctionInfo>()->getUsesDpFP()) {
         int FPOff = FrameOffset + ImmOffset + (int)MFI.getStackSize();
         if (FrameOffset < 0) FPOff += 1;
@@ -516,7 +521,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
   if (FrameOffset < 0) Offset += 1;
 
-  if (Offset < 0 || Offset > 0xFF) {
+  if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) {
     if (MF.getInfo<W65816MachineFunctionInfo>()->getUsesDpFP()) {
       int FPOff = FrameOffset + ImmOffset + (int)MFI.getStackSize();
       if (FrameOffset < 0) FPOff += 1;
diff --git a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp
index 8c542c2..eac1e48 100644
--- a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp
+++ b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp
@@ -40,6 +40,7 @@
 #include "W65816.h"
 #include "W65816InstrInfo.h"
 #include "W65816Subtarget.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -422,6 +423,397 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
     // generic post-RA pseudo expander), so it's still in the MIR here.
     Changed |= foldImmAdcToInaDea(MBB, TII);
 
+    // PHI-copy hoist.
+    {
+      auto isStaLike = [](const MachineInstr &MI) {
+        unsigned O = MI.getOpcode();
+        return O == W65816::STA_StackRel || O == W65816::STZ_DP ||
+               O == W65816::STZ_Abs;
+      };
+      auto isLdaSR = [](const MachineInstr &MI) {
+        return MI.getOpcode() == W65816::LDA_StackRel;
+      };
+      auto isFlagPreservingMem = [&](const MachineInstr &MI) {
+        return isStaLike(MI) || isLdaSR(MI);
+      };
+      auto It = MBB.begin();
+      while (It != MBB.end()) {
+        if (It->getOpcode() != W65816::PHP) { ++It; continue; }
+        auto Php = It;
+        // Walk forward: collect LDA/STA pairs, stop at PLP.
+        auto Walker = std::next(Php);
+        SmallVector<MachineInstr *, 8> Block;
+        SmallSet<int64_t, 8> ReadSlots;
+        SmallSet<int64_t, 8> WriteSlots;
+        bool ok = true;
+        while (Walker != MBB.end()) {
+          if (Walker->isDebugInstr()) { ++Walker; continue; }
+          if (Walker->getOpcode() == W65816::PLP) break;
+          if (!isFlagPreservingMem(*Walker)) { ok = false; break; }
+          // Track slots so we can check the gap below.
+          if (Walker->getNumOperands() >= 1 && Walker->getOperand(0).isImm()) {
+            int64_t off = Walker->getOperand(0).getImm();
+            if (isLdaSR(*Walker)) ReadSlots.insert(off);
+            else WriteSlots.insert(off);
+          }
+          Block.push_back(&*Walker);
+          ++Walker;
+        }
+        if (!ok || Walker == MBB.end()) { ++It; continue; }
+        auto Plp = Walker;
+        // Trailing flag-preservers after PLP (STA/STZ only).
+        auto Tail = std::next(Plp);
+        SmallVector<MachineInstr *, 4> Trailing;
+        while (Tail != MBB.end()) {
+          if (Tail->isDebugInstr()) { ++Tail; continue; }
+          if (!isStaLike(*Tail)) break;
+          if (Tail->getNumOperands() >= 1 && Tail->getOperand(0).isImm()) {
+            WriteSlots.insert(Tail->getOperand(0).getImm());
+          }
+          Trailing.push_back(&*Tail);
+          ++Tail;
+        }
+        // Pair check: the wrap structure is a sequence of LDA-STA
+        // memory-to-memory PHI copies, where the FINAL STA may live
+        // outside the wrap (as Trailing) because STA doesn't clobber
+        // flags.  Count LDAs in Block vs total STAs (Block + Trailing).
+        // If they're not equal, some LDA's $a-output is a register-
+        // live-out PHI value (consumed by a back-edge successor's
+        // first STA, e.g. the vararg `sta 0x5, s` pattern).  Hoisting
+        // it earlier would lose the value.
+        unsigned NLda = 0, NSta = 0;
+        for (MachineInstr *MI : Block) {
+          if (isLdaSR(*MI)) ++NLda;
+          else if (isStaLike(*MI)) ++NSta;
+        }
+        NSta += Trailing.size();
+        if (NLda != NSta) { ++It; continue; }
+        // Walk backward from PHP to find the hoist insertion point.
+        // The hoisted block clobbers $a and $p (LDA writes both).
+        // Skip insts that USE $a (consumer of an earlier $a producer)
+        // or that DEFINE $p (flag-setter — its $p output will be
+        // re-established by the same flag-setter).  Stop at a pure A
+        // producer (defines $a, doesn't use $a).
+        //
+        // Also bail if any in-gap inst writes a slot we read or reads
+        // a slot we write (in-gap reads of our writes would observe
+        // a stale value after hoist; in-gap writes to our reads would
+        // produce a different value if hoisted before).
+        auto Back = Php;
+        if (Back == MBB.begin()) { ++It; continue; }
+        --Back;
+        bool gapOK = true;
+        while (true) {
+          while (Back != MBB.begin() && Back->isDebugInstr()) --Back;
+          if (Back->isDebugInstr()) { gapOK = false; break; }
+          // Slot conflict check.
+          unsigned BO = Back->getOpcode();
+          if ((BO == W65816::STA_StackRel || BO == W65816::STZ_DP ||
+               BO == W65816::STZ_Abs) &&
+              Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
+            int64_t off = Back->getOperand(0).getImm();
+            if (ReadSlots.count(off)) { gapOK = false; break; }
+          }
+          if (BO == W65816::LDA_StackRel &&
+              Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
+            int64_t off = Back->getOperand(0).getImm();
+            if (WriteSlots.count(off)) { gapOK = false; break; }
+          }
+          // Bail on call / branch / asm.
+          if (Back->isCall() || Back->isBranch() ||
+              Back->isReturn() || Back->isInlineAsm()) {
+            gapOK = false; break;
+          }
+          bool usesA = false;
+          bool defsA = false;
+          for (const MachineOperand &MO : Back->operands()) {
+            if (MO.isReg() && MO.getReg() == W65816::A) {
+              if (MO.isUse()) usesA = true;
+              if (MO.isDef()) defsA = true;
+            }
+          }
+          if (defsA && !usesA) break;  // Pure A producer found.
+          if (Back == MBB.begin()) { gapOK = false; break; }
+          --Back;
+        }
+        if (!gapOK) { ++It; continue; }
+        // Hoist: move Block and Trailing to before Back.  Undo the
+        // +1 stack-rel bump on Block's in-wrap memory ops; Trailing
+        // stays AS-IS (it was already outside the wrap and never
+        // bumped).
+        for (MachineInstr *MI : Block) {
+          // All ops in Block matched isFlagPreservingMem, so they're
+          // LDA_StackRel/STA_StackRel/STZ_DP/STZ_Abs.  LDA_StackRel
+          // and STA_StackRel use operand 0 as the disp; that's the
+          // bumped one.  STZ_DP/STZ_Abs aren't stack-rel — no bump.
+          unsigned MOpc = MI->getOpcode();
+          if (MOpc == W65816::LDA_StackRel || MOpc == W65816::STA_StackRel) {
+            if (MI->getNumOperands() >= 1 && MI->getOperand(0).isImm()) {
+              int64_t v = MI->getOperand(0).getImm();
+              MI->getOperand(0).setImm(v - 1);
+            }
+          }
+          MI->removeFromParent();
+          MBB.insert(Back, MI);
+        }
+        for (MachineInstr *MI : Trailing) {
+          MI->removeFromParent();
+          MBB.insert(Back, MI);
+        }
+        Php->eraseFromParent();
+        Plp->eraseFromParent();
+        Changed = true;
+        // Restart iteration from the beginning since we mutated.
+        It = MBB.begin();
+      }
+    }
+
+    // i32 += i32 store-bypass.  Regalloc materializes the call result
+    // (A=lo, X=hi) into Wide32 spill slots before the add, then reads
+    // them back — emitting 4 instructions of redundant store/reload:
+    //
+    //   STA_StackRel slotA   ; A (mul.lo) -> slotA
+    //   TXA                  ; A = X = mul.hi
+    //   STA_StackRel slotB   ; mul.hi -> slotB
+    //   LDA_StackRel slotA   ; reload mul.lo  <-- redundant
+    //   CLC
+    //   ADC_StackRel slotC   ; mul.lo + total.lo
+    //   STA_StackRel slotA   ; sum-lo
+    //   LDA_StackRel slotB   ; reload mul.hi  <-- redundant
+    //   ADC_StackRel slotD   ; mul.hi + total.hi + C
+    //   STA_StackRel slotB   ; sum-hi
+    //
+    // Reorder to do the lo-add directly off A and the hi-add directly
+    // off X (via TXA preserving carry):
+    //
+    //   CLC
+    //   ADC_StackRel slotC   ; A = mul.lo + total.lo
+    //   STA_StackRel slotA   ; sum-lo
+    //   TXA                  ; A = X = mul.hi (C preserved)
+    //   ADC_StackRel slotD   ; A = mul.hi + total.hi + C
+    //   STA_StackRel slotB   ; sum-hi
+    //
+    // 10 -> 6 inst.  Saves 4 inst / ~13 cyc per i32-add-of-call-result
+    // site.  Hits the sumOfSquares loop and any total += __umulhisi3
+    // pattern.
+    {
+      auto isStaSR = [](MachineInstr &MI, int64_t *off) {
+        if (MI.getOpcode() != W65816::STA_StackRel) return false;
+        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
+        if (off) *off = MI.getOperand(0).getImm();
+        return true;
+      };
+      auto isLdaSR = [](MachineInstr &MI, int64_t *off) {
+        if (MI.getOpcode() != W65816::LDA_StackRel) return false;
+        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
+        if (off) *off = MI.getOperand(0).getImm();
+        return true;
+      };
+      auto isAdcSR = [](MachineInstr &MI, int64_t *off) {
+        if (MI.getOpcode() != W65816::ADC_StackRel) return false;
+        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
+        if (off) *off = MI.getOperand(0).getImm();
+        return true;
+      };
+      auto It = MBB.begin();
+      while (It != MBB.end()) {
+        auto Cur = It;
+        int64_t slotA = 0, slotB = 0, slotC = 0, slotD = 0;
+        // Step 1: STA_StackRel slotA
+        if (!isStaSR(*Cur, &slotA)) { ++It; continue; }
+        auto P2 = std::next(Cur);
+        while (P2 != MBB.end() && P2->isDebugInstr()) ++P2;
+        if (P2 == MBB.end() || P2->getOpcode() != W65816::TXA) { ++It; continue; }
+        auto P3 = std::next(P2);
+        while (P3 != MBB.end() && P3->isDebugInstr()) ++P3;
+        if (P3 == MBB.end() || !isStaSR(*P3, &slotB)) { ++It; continue; }
+        if (slotA == slotB) { ++It; continue; }
+        auto P4 = std::next(P3);
+        while (P4 != MBB.end() && P4->isDebugInstr()) ++P4;
+        int64_t lreloadA = 0;
+        if (P4 == MBB.end() || !isLdaSR(*P4, &lreloadA) || lreloadA != slotA) {
+          ++It; continue;
+        }
+        auto P5 = std::next(P4);
+        while (P5 != MBB.end() && P5->isDebugInstr()) ++P5;
+        if (P5 == MBB.end() || P5->getOpcode() != W65816::CLC) {
+          ++It; continue;
+        }
+        auto P6 = std::next(P5);
+        while (P6 != MBB.end() && P6->isDebugInstr()) ++P6;
+        if (P6 == MBB.end() || !isAdcSR(*P6, &slotC)) { ++It; continue; }
+        auto P7 = std::next(P6);
+        while (P7 != MBB.end() && P7->isDebugInstr()) ++P7;
+        int64_t outA = 0;
+        if (P7 == MBB.end() || !isStaSR(*P7, &outA) || outA != slotA) {
+          ++It; continue;
+        }
+        auto P8 = std::next(P7);
+        while (P8 != MBB.end() && P8->isDebugInstr()) ++P8;
+        int64_t lreloadB = 0;
+        if (P8 == MBB.end() || !isLdaSR(*P8, &lreloadB) || lreloadB != slotB) {
+          ++It; continue;
+        }
+        auto P9 = std::next(P8);
+        while (P9 != MBB.end() && P9->isDebugInstr()) ++P9;
+        if (P9 == MBB.end() || !isAdcSR(*P9, &slotD)) { ++It; continue; }
+        auto P10 = std::next(P9);
+        while (P10 != MBB.end() && P10->isDebugInstr()) ++P10;
+        int64_t outB = 0;
+        if (P10 == MBB.end() || !isStaSR(*P10, &outB) || outB != slotB) {
+          ++It; continue;
+        }
+        // All 10 matched.  slotA != slotB already.  Also require all
+        // four slots distinct.  (slotC/slotD are the total.lo/hi read
+        // addresses; in the canonical case slotC != slotA and slotD !=
+        // slotB; without this the rewrite would re-read its own output.)
+        if (slotC == slotA || slotD == slotB ||
+            slotC == slotD) {
+          ++It; continue;
+        }
+        // Rewrite: emit CLC ; ADC slotC ; STA slotA ; TXA ; ADC slotD ;
+        // STA slotB before P1, then erase steps 1-10.
+        DebugLoc DL = Cur->getDebugLoc();
+        BuildMI(MBB, Cur, DL, TII.get(W65816::CLC));
+        BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel))
+            .addImm(slotC);
+        BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel))
+            .addImm(slotA);
+        BuildMI(MBB, Cur, DL, TII.get(W65816::TXA));
+        BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel))
+            .addImm(slotD);
+        BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel))
+            .addImm(slotB);
+        // Advance It past the matched pattern before erasing (so we
+        // don't iterate through deleted insts).
+        It = std::next(P10);
+        // Erase the 10 originals.
+        Cur->eraseFromParent(); P2->eraseFromParent();
+        P3->eraseFromParent();  P4->eraseFromParent();
+        P5->eraseFromParent();  P6->eraseFromParent();
+        P7->eraseFromParent();  P8->eraseFromParent();
+        P9->eraseFromParent();  P10->eraseFromParent();
+        Changed = true;
+      }
+    }
+
+    // Dead TAX / TXA elimination.  STAfi declares `Defs = [A]` as a
+    // safe over-approximation (eliminateFrameIndex emits a PHA-bracketed
+    // sequence when the source is IMG-class).  Regalloc honors that by
+    // inserting `TAX ; ...STAfi... ; TXA` brackets around STAfi that
+    // SOURCES from A — but in the A-source path A is preserved.  The
+    // TXA's output gets clobbered immediately by the next LDA*, so the
+    // TXA is dead; once TXA is gone, the TAX's X-value has no consumer
+    // and is dead too.  This pattern recurs once per i32-spill site.
+    //
+    // Conservative: only elide TXA if the IMMEDIATE next non-debug
+    // instruction defines $a (and doesn't read $a or N/Z first).  No
+    // intervening flag-readers between TXA and the A-define is then
+    // guaranteed.  Same logic for TYA.
+    //
+    // For TAX: elide if no instruction between TAX and the next $x def
+    // reads $x (and we can prove the original X had no live consumer).
+    // Done as a fixed-point: keep iterating until no change.
+    auto definesReg = [](const MachineInstr &MI, unsigned Reg) -> bool {
+      for (const MachineOperand &MO : MI.operands()) {
+        if (MO.isReg() && MO.getReg() == Reg && MO.isDef())
+          return true;
+      }
+      return false;
+    };
+    auto readsReg = [](const MachineInstr &MI, unsigned Reg) -> bool {
+      for (const MachineOperand &MO : MI.operands()) {
+        if (MO.isReg() && MO.getReg() == Reg && MO.isUse())
+          return true;
+      }
+      return false;
+    };
+    bool again2 = true;
+    while (again2) {
+      again2 = false;
+      // Pass A: dead TXA / TYA
+      for (auto It = MBB.begin(); It != MBB.end(); ) {
+        unsigned O = It->getOpcode();
+        if (O != W65816::TXA && O != W65816::TYA) { ++It; continue; }
+        auto Next = std::next(It);
+        while (Next != MBB.end() && Next->isDebugInstr()) ++Next;
+        if (Next == MBB.end()) { ++It; continue; }
+        // Next must define $a unconditionally, and must not read $a
+        // (since we're about to discard the TXA-defined A) and must
+        // not be a call / branch / inline asm (which conservatively
+        // read $a).
+        if (Next->isCall() || Next->isBranch() ||
+            Next->isReturn() || Next->isInlineAsm()) {
+          ++It; continue;
+        }
+        if (!definesReg(*Next, W65816::A)) { ++It; continue; }
+        if (readsReg(*Next, W65816::A))   { ++It; continue; }
+        // P (flags) liveness: TXA/TYA set N/Z.  If Next reads P, we'd
+        // be discarding the flags it expects.  Bxx and friends read P.
+        // Conservative: also require Next does not read $p.
+        if (readsReg(*Next, W65816::P))   { ++It; continue; }
+        auto Dead = It++;
+        Dead->eraseFromParent();
+        Changed = true;
+        again2 = true;
+      }
+      // Pass B: dead TAX / TAY
+      for (auto It = MBB.begin(); It != MBB.end(); ) {
+        unsigned O = It->getOpcode();
+        unsigned Target;
+        if      (O == W65816::TAX) Target = W65816::X;
+        else if (O == W65816::TAY) Target = W65816::Y;
+        else { ++It; continue; }
+        // Walk forward.  TAX/TAY is dead if every use of Target is
+        // preceded by a redefinition of Target (and the in-MBB region
+        // between has no flag-reader that consumes TAX's N/Z).  At MBB
+        // end, check successor live-ins: if none has Target as live-in
+        // it's also dead.
+        //
+        // Flag liveness: TAX defines $p (N/Z).  A later $p-reader only
+        // consumes TAX's flags if no intervening instruction REDEFINES
+        // $p in the gap.  Track `pRedef` to allow common patterns like
+        // `TAX ; CLC ; ADC ; ...` where ADC reads $p but the $p it
+        // reads is the freshly-CLC'd carry, not TAX's N/Z.
+        auto Walker = std::next(It);
+        bool deadIt = false;
+        bool bailed = false;
+        bool pRedef = false;
+        while (Walker != MBB.end()) {
+          if (Walker->isDebugInstr()) { ++Walker; continue; }
+          if (Walker->isCall() || Walker->isInlineAsm()) {
+            bailed = true; break;
+          }
+          // Branch / return: stop walking; rely on successor live-ins.
+          if (Walker->isBranch() || Walker->isReturn()) break;
+          if (readsReg(*Walker, Target)) { bailed = true; break; }
+          if (readsReg(*Walker, W65816::P) && !pRedef) {
+            bailed = true; break;
+          }
+          if (definesReg(*Walker, W65816::P)) pRedef = true;
+          if (definesReg(*Walker, Target)) { deadIt = true; break; }
+          ++Walker;
+        }
+        if (bailed) { ++It; continue; }
+        if (!deadIt) {
+          // Fell through to MBB end / branch.  Check successor live-ins.
+          bool liveOut = false;
+          for (MachineBasicBlock *Succ : MBB.successors()) {
+            if (Succ->isLiveIn(Target)) { liveOut = true; break; }
+          }
+          // Return blocks: $a and $x are the i32 return-value convention.
+          // RTL doesn't model these as Uses, but they ARE live at the
+          // return.  Be conservative — don't elide TAX/TAY before a return.
+          if (!MBB.empty() && MBB.back().isReturn()) liveOut = true;
+          if (liveOut) { ++It; continue; }
+        }
+        auto Dead = It++;
+        Dead->eraseFromParent();
+        Changed = true;
+        again2 = true;
+      }
+    }
+
     // Third peephole: drop `LDY_Imm16 K` when Y already holds K from
     // an earlier LDY in the same MBB and no intervening MI clobbered
     // Y.  Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY,
diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp
index 4918ca1..470179b 100644
--- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp
+++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp
@@ -283,15 +283,18 @@ static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB,
         case W65816::LDAfi:
         case W65816::LDAi16imm:
         case W65816::LDAabs:
-        case W65816::ANDi16imm: case W65816::ANDabs:
-        case W65816::ORAi16imm: case W65816::ORAabs:
-        case W65816::EORi16imm: case W65816::EORabs:
+        case W65816::ANDi16imm: case W65816::ANDabs: case W65816::ANDfi:
+        case W65816::ORAi16imm: case W65816::ORAabs: case W65816::ORAfi:
+        case W65816::EORi16imm: case W65816::EORabs: case W65816::EORfi:
         case W65816::ADCi16imm: case W65816::ADCabs: case W65816::ADCfi:
         case W65816::SBCi16imm: case W65816::SBCabs: case W65816::SBCfi:
         case W65816::ADCEi16imm: case W65816::ADCEabs: case W65816::ADCEfi:
         case W65816::SBCEi16imm: case W65816::SBCEabs: case W65816::SBCEfi:
         case W65816::ASLA16: case W65816::LSRA16:
         case W65816::ASLA8:  case W65816::LSRA8:
+        case W65816::INA: case W65816::DEA:
+        case W65816::INA_PSEUDO: case W65816::DEA_PSEUDO:
+        case W65816::INA_PSEUDO8: case W65816::DEA_PSEUDO8:
           return true;
         default:
           return false;
@@ -756,6 +759,24 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
   // pass that might still violate the dep, and to wrap the rare cases
   // where the IR-level test is a load (LDA flag side-effect) rather
   // than an explicit CMP.
+  // In VLA functions, FI store pseudos (STAfi, STA8fi, STAfi_indY)
+  // expand at PEI to a 4-MC sequence ending in `LDY $F8` (Y-restore),
+  // which clobbers N/Z.  The PHP/PLP wrap pass runs pre-PEI; treating
+  // those pseudos as flag-preserving leaves the trailing LDY outside
+  // the wrap, so a downstream BEQ/BNE reads the LDY's flags instead of
+  // the test's.  Treat them as corrupting in VLA functions so the wrap
+  // covers the whole expansion.
+  // VLAFunc: narrow predicate used by the flag-preserving / lda-like
+  // helpers (broadening it to UsesFPRel broke dadd's i64-ABI libcall
+  // flow — the STAfi pseudos in non-VLA large-frame functions don't
+  // need to be marked corrupting for the wrap-detection walk).
+  // UsesFPRel: broader FrameLowering-matching predicate used by the
+  // pseudo-bump's offset-routing check (FP-rel ops must NOT be bumped,
+  // SP-rel ops MUST be bumped; we replicate eliminateFrameIndex's
+  // routing decision below to choose).
+  bool VLAFunc = MF.getFrameInfo().hasVarSizedObjects();
+  bool UsesFPRel = MF.getFrameInfo().hasVarSizedObjects() ||
+                   MF.getFrameInfo().estimateStackSize(MF) > 200;
   for (MachineBasicBlock &MBB : MF) {
     SmallVector<MachineInstr *, 4> Branches;
     for (MachineInstr &MI : MBB) {
@@ -764,7 +785,13 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
           Opc == W65816::BMI || Opc == W65816::BPL)
         Branches.push_back(&MI);
     }
-    auto isFlagPreserving = [](unsigned Opc) {
+    auto isFlagPreserving = [VLAFunc](unsigned Opc) {
+      if (VLAFunc) {
+        // FI store pseudos are flag-corrupting under VLA expansion.
+        if (Opc == W65816::STAfi || Opc == W65816::STAfi_indY ||
+            Opc == W65816::STA8fi)
+          return false;
+      }
       return Opc == W65816::STA_StackRel ||
              Opc == W65816::STA_StackRelIndY ||
              Opc == W65816::STAfi ||
@@ -805,7 +832,14 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
         return !MI.isBranch() && !MI.isReturn();
       }
     };
-    auto isLdaLike = [](unsigned Opc) {
+    auto isLdaLike = [VLAFunc](unsigned Opc) {
+      if (VLAFunc) {
+        // STAfi-family: see isFlagPreserving comment.  They expand to a
+        // sequence whose final LDY $F8 corrupts N/Z; treat as corrupting.
+        if (Opc == W65816::STAfi || Opc == W65816::STAfi_indY ||
+            Opc == W65816::STA8fi)
+          return true;
+      }
       // COPY between physregs: lowers in AsmPrinter to one of TXA/TYA/
       // LDA $D? (for IMG↔A bridges) etc. — all of which set N/Z based
       // on the loaded value.  Treating COPY as flag-defining caused the
@@ -926,6 +960,14 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
       // eliminateFrameIndex; bumping ImmOffset by 1 produces the
       // right post-lowered disp.  For already-lowered MC ops
       // (LDA_StackRel etc), bump the disp operand directly.
+      //
+      // CAVEAT for FP-relative functions (see UsesFPRel declaration above):
+      // FI accesses go through FP-relative addressing (eliminateFrameIndex
+      // routes through expandFarFI when FrameLowering captured FP).  FP
+      // was captured BEFORE PHP, so (FP),Y reads aren't affected by PHP's
+      // S decrement.  Don't bump pseudo *fi ImmOffsets in that case
+      // (already-lowered MC StackRel ops still need the bump — those are
+      // SP-rel).
       const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
       DebugLoc DL = Test->getDebugLoc();
       BuildMI(MBB, FirstCorrupt->getIterator(), DL, TII->get(W65816::PHP));
@@ -944,6 +986,33 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
                         Opc == W65816::ANDfi || Opc == W65816::ORAfi ||
                         Opc == W65816::EORfi || Opc == W65816::CMPfi ||
                         Opc == W65816::ADDframe;
+        // For pseudo *fi ops in FP-rel functions: only SOME will end up
+        // SP-rel after PEI (offsets in [0,255]); the rest go through
+        // expandFarFI → `[$F6],Y`.  FP-rel access is unaffected by PHP's
+        // S decrement and must NOT be bumped; SP-rel access IS affected
+        // and MUST be bumped.  Replicate eliminateFrameIndex's offset
+        // calculation here to decide.  Without this, large-frame
+        // functions that mix both addressing modes (e.g. sha256-style
+        // i32-libcall loops) get their FP-rel pseudos bumped, which
+        // shifts reads/writes by one byte and corrupts state at
+        // iteration N proportional to the i32-libcall count.
+        if (IsPseudo && UsesFPRel) {
+          const MachineFrameInfo &MFI = MF.getFrameInfo();
+          if (It->getOperand(1).isFI()) {
+            int FI = It->getOperand(1).getIndex();
+            int FrameOffset = MFI.getObjectOffset(FI);
+            int ImmOffset = It->getOperand(2).isImm()
+                              ? (int)It->getOperand(2).getImm() : 0;
+            int LoweredOff = FrameOffset + ImmOffset +
+                             (int)MFI.getStackSize();
+            if (FrameOffset < 0) LoweredOff += 1;
+            // Out-of-range or VLA → FP-rel → no bump.
+            if (LoweredOff < 0 || LoweredOff > 0xFF ||
+                MFI.hasVarSizedObjects())
+              continue;
+            // Else SP-rel: fall through and bump ImmOffset.
+          }
+        }
         unsigned ImmIdx = IsPseudo ? 2 : 0;
         if (ImmIdx < It->getNumOperands() && It->getOperand(ImmIdx).isImm()) {
           int64_t v = It->getOperand(ImmIdx).getImm();
diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
index eeae746..031a699 100644
--- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
+++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
@@ -27,13 +27,17 @@ using namespace llvm;
 
 // Data layout for the 65816 lives in Triple::computeDataLayout via
 // patches/0005-target-data-layout-w65816.patch.  The string is:
-//   e      - little endian
-//   m:e    - ELF-style symbol mangling
-//   p:16:8 - 16-bit pointers, 8-bit stack alignment
-//   i16:16 - 16-bit integers aligned to 16 bits
-//   i32:16 - 32-bit integers aligned to 16 bits
-//   n8:16  - native integer widths
-//   S16    - 16-bit natural stack alignment
+//   e       - little endian
+//   m:e     - ELF-style symbol mangling
+//   p:32:16 - 32-bit pointers (lo16 + hi-bank), 16-bit alignment
+//   i16:16  - 16-bit integers aligned to 16 bits
+//   i32:16  - 32-bit integers aligned to 16 bits
+//   a:8     - alloca defaults to 1-byte alignment
+//   n8:16   - native integer widths
+//   S8      - 1-byte natural stack alignment.  JSL's 3-byte ret-addr
+//             push means SP is never reliably 2-aligned inside a
+//             callee; the older S16 caused SDAG to fold &buf[1] to
+//             buf | 1, which breaks for odd-aligned stack locals.
 
 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
 LLVMInitializeW65816Target() {
@@ -49,6 +53,9 @@ LLVMInitializeW65816Target() {
   initializeW65816PreSpillCrossCallPass(PR);
   initializeW65816SjLjFinalizePass(PR);
   initializeW65816LowerWide32Pass(PR);
+  initializeW65816I32IncFoldPass(PR);
+  initializeW65816ImgCalleeSavePass(PR);
+  initializeW65816NarrowI32MulPass(PR);
 
   // Default IndVarSimplify's exit-value rewriter to "never".  The
   // closed-form replacement frequently widens an i16 induction var
@@ -104,22 +111,21 @@ public:
   void addMachineSSAOptimization() override;
   void addISelPrepare() override;
 
-  // W65816's only 16-bit ALU register is A.  At -O1+ we use BASIC
-  // regalloc instead of greedy: greedy fails ("ran out of registers
-  // during register allocation") on functions with many cross-call
-  // Acc16 vregs (the "ok |= bit; helper(); ok |= bit;" pattern
-  // repeated across many if-blocks).  Basic regalloc handles that
-  // pattern cleanly, with negligible code-size overhead vs greedy
-  // (~0.7% on the bench suite).
+  // Greedy at -O1+; fast at -O0/optnone.  Greedy used to abort with
+  // "Def isn't really dead" inside LiveRangeEdit::eliminateDeadDef
+  // when InlineSpiller converted a redundant STAfi (Defs = [A]) into
+  // a KILL pseudo while only marking explicit defs dead — leaving the
+  // implicit-def $a live, then later trying to delete it.  Patched in
+  // tools/llvm-mos/llvm/lib/CodeGen/InlineSpiller.cpp to mark all defs
+  // (explicit + implicit) dead.  Bench wins after the switch:
+  // popcount −19.4%, strcpy −18.9%, memcmp −8.6%, bsearch −9.2%,
+  // fib(10) −2.6%.
   //
-  // At -O0 / optnone (Optimized=false) we use FAST: greedy/basic at
-  // -O0 leave spurious COPY pseudos that lower to STA dp / LDA dp
-  // pairs around modify-in-place ops (e.g. INA), miscompiling a + 1.
-  //
-  // TiedDefSpill (pre-RA) handles the tied-def-multi-use hazard for
-  // the sub-pattern that's frequent enough to matter at -O1+.
+  // At -O0 / optnone (Optimized=false) we use FAST: greedy at -O0
+  // left spurious COPY pseudos that lowered to STA dp / LDA dp pairs
+  // around modify-in-place ops (e.g. INA), miscompiling a + 1.
   FunctionPass *createTargetRegisterAllocator(bool Optimized) override {
-    return Optimized ? createBasicRegisterAllocator()
+    return Optimized ? createGreedyRegisterAllocator()
                      : createFastRegisterAllocator();
   }
 };
@@ -137,6 +143,11 @@ void W65816PassConfig::addISelPrepare() {
   // intrinsics our backend doesn't natively lower.  Must run BEFORE
   // the base ISelPrepare passes so isel sees the cleaned IR.
   addPass(createW65816SjLjFinalize());
+  // IR-level peephole: narrow `mul i32 X, Y` to a __umulhisi3 call
+  // when IR-level computeKnownBits proves the top 16 bits of both
+  // operands are zero.  Catches the sumSquares-style `(u32)i * i`
+  // pattern that SDAG-level analysis can't see across BB boundaries.
+  addPass(createW65816NarrowI32Mul());
   TargetPassConfig::addISelPrepare();
 }
 
@@ -148,6 +159,15 @@ void W65816PassConfig::addMachineSSAOptimization() {
   // Uses=[P] on Bxx (so MachineCSE sees the dep) and let the
   // pass run normally — that landed in W65816InstrInfo.td.
   TargetPassConfig::addMachineSSAOptimization();
+
+  // MachineBlockPlacement is now re-enabled.  Previously disabled
+  // because W65816InstrInfo::analyzeBranch returned unanalyzable
+  // unconditionally; we now decode the BRA / BRL / JMP_Abs uncond
+  // direct-branch case (see W65816InstrInfo::analyzeBranch) which is
+  // enough to satisfy MBP's fall-through assertion.  Conditional
+  // branches stay opaque on purpose: their condition is encoded in
+  // the OPCODE and the P-flag input must stay adjacent to a preceding
+  // CMP, which BranchFolder doesn't know to preserve.
 }
 
 void W65816PassConfig::addPreRegAlloc() {
@@ -175,6 +195,15 @@ void W65816PassConfig::addPreRegAlloc() {
 }
 
 void W65816PassConfig::addPostRegAlloc() {
+  // ImgCalleeSave runs FIRST so its STAfi/LDAfi pseudos go through the
+  // rest of the post-RA pipeline (SpillToX, StackSlotCleanup) normally.
+  // It detects IMG8..IMG15 usage post-regalloc and inserts prologue
+  // save + epilogue restore so those slots act as callee-saved at the
+  // asm level.  Fixes picol's `expr 1+2 == 4` bug: high-pressure
+  // recursive double fns use IMG8..IMG15 as scratch but, without this
+  // pass, expected them preserved across calls — and callees were
+  // happy to clobber them.  See W65816ImgCalleeSave.cpp.
+  addPass(createW65816ImgCalleeSave());
   // SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup
   // then deletes still-adjacent redundant spills.  A second SpillToX
   // invocation collapses any TAX/TXA pair left adjacent by cleanup
@@ -223,6 +252,16 @@ void W65816PassConfig::addPreEmitPass() {
   // Distance estimation now uses TII::getInstSizeInBytes so it's
   // byte-accurate; the 110-byte threshold leaves margin without
   // expanding short branches that would otherwise survive as Bxx.
+  // Detect i32 += 1 patterns (LDA/ADC #1/STA/LDA/ADCE #0/STA) and
+  // rewrite to a tighter LDA/INA/STA + INC_HI_IF_CARRY form that
+  // skips the hi half on the no-carry path.  Must run BEFORE
+  // BranchExpand so the inserted conditional skip's distances are
+  // covered by the branch-distance estimator.  Also before
+  // SepRepCleanup (which has the existing ADC #±1 → INA peephole)
+  // because we deliberately KEEP ADCi16imm 1 so this pass can match
+  // it; the subsequent SepRepCleanup will see only the residual
+  // (non-fold-eligible) ADCi16imm cases.
+  addPass(createW65816I32IncFold());
   addPass(createW65816BranchExpand());
   addPass(createW65816SepRepCleanup());
 }
diff --git a/ui.ini b/ui.ini
new file mode 100644
index 0000000..9c2cbf1
--- /dev/null
+++ b/ui.ini
@@ -0,0 +1,71 @@
+﻿#
+# UI SEARCH PATH OPTIONS
+#
+historypath               history;dats;.
+categorypath              folders
+cabinets_directory        cabinets;cabdevs
+cpanels_directory         cpanel
+pcbs_directory            pcb
+flyers_directory          flyers
+titles_directory          titles
+ends_directory            ends
+marquees_directory        marquees
+artwork_preview_directory "artwork preview;artpreview"
+bosses_directory          bosses
+logos_directory           logo
+scores_directory          scores
+versus_directory          versus
+gameover_directory        gameover
+howto_directory           howto
+select_directory          select
+icons_directory           icons
+covers_directory          covers
+ui_path                   ui
+
+#
+# UI MISC OPTIONS
+#
+system_names              
+skip_warnings             0
+unthrottle_mute           0
+
+#
+# UI OPTIONS
+#
+infos_text_size           0.75
+font_rows                 30
+ui_border_color           ffffffff
+ui_bg_color               ef101030
+ui_clone_color            ff808080
+ui_dipsw_color            ffffff00
+ui_gfxviewer_color        ef101030
+ui_mousedown_bg_color     b0606000
+ui_mousedown_color        ffffff80
+ui_mouseover_bg_color     70404000
+ui_mouseover_color        ffffff80
+ui_selected_bg_color      ef808000
+ui_selected_color         ffffff00
+ui_slider_color           ffffffff
+ui_subitem_color          ffffffff
+ui_text_bg_color          ef000000
+ui_text_color             ffffffff
+ui_unavail_color          ff404040
+
+#
+# SYSTEM/SOFTWARE SELECTION MENU OPTIONS
+#
+hide_main_panel           0
+use_background            1
+skip_biosmenu             0
+skip_partsmenu            0
+remember_last             1
+last_used_machine         
+last_used_filter          
+system_right_panel        image
+software_right_panel      image
+system_right_image        snap
+software_right_image      snap
+enlarge_snaps             1
+forced4x3                 1
+info_audit_enabled        0
+hide_romless              1