From 6bff7bea3fe94341f627df8419a6ac67a6723ef4 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Thu, 14 May 2026 11:23:00 -0500 Subject: [PATCH] Docs! --- README.md | 102 ++ STATUS.md | 12 +- compare/evalAt.calypsi.lst | 2 +- compare/mul16to32.calypsi.lst | 2 +- compare/mul16to32.ours.s | 5 +- compare/sumSquares.calypsi.lst | 2 +- compare/sumSquares.ours.s | 61 +- docs/INSTALL.md | 168 +++ docs/USAGE.md | 391 ++++++ scripts/smokeTest.sh | 18 +- src/llvm/lib/Target/W65816/CMakeLists.txt | 1 + src/llvm/lib/Target/W65816/W65816.h | 7 + .../lib/Target/W65816/W65816AsmPrinter.cpp | 9 +- .../lib/Target/W65816/W65816ImgCalleeSave.cpp | 4 - .../Target/W65816/W65816PromoteFiToImg.cpp | 156 ++- .../lib/Target/W65816/W65816StackRelToImg.cpp | 1220 +++++++++++++++++ .../Target/W65816/W65816StackSlotMerge.cpp | 53 +- .../lib/Target/W65816/W65816TargetMachine.cpp | 2 + 18 files changed, 2100 insertions(+), 115 deletions(-) create mode 100644 README.md create mode 100644 docs/INSTALL.md create mode 100644 docs/USAGE.md create mode 100644 src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp diff --git a/README.md b/README.md new file mode 100644 index 0000000..3e9c5d4 --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +# llvm816 + +LLVM/Clang C compiler for the WDC 65816 / Apple IIgs. + +Compiles C (and a minimal subset of C++) to native 65816 machine code, +links to a relocatable OMF binary, and runs under MAME's apple2gs. +Speed-tuned: matches or beats hand-written 65816 assembly on the +tight loops in benchmarks like sumOfSquares, popcount, and strcpy. + +## What you get + +- **`clang --target=w65816`** — full C99 + parts of C11, optimized at + `-O2` by default. Soft-float and soft-double included. +- **C standard library subset** — `stdio.h`, `stdlib.h`, `string.h`, + `math.h`, `time.h`, `setjmp.h`, etc. See + [`runtime/include/`](runtime/include/) for the complete list. +- **`link816`** — relocating linker producing GS/OS-loadable OMF + binaries (single- or multi-segment). 
+- **MAME integration scripts** — compile, link, and run a program + under MAME's apple2gs with one command. +- **Apple IIgs Toolbox bindings** — `` exposes + ~1300 toolbox routines from 35 tool sets. + +## Quick start + +After installation (see [docs/INSTALL.md](docs/INSTALL.md)): + +```bash +# Compile a C file +cat > hello.c <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + unsigned short x = 0; + for (int i = 1; i <= 10; i++) x += i; // x = 55 + switchToBank2(); + *(volatile unsigned short *)0x5000 = x; + while (1) {} +} +EOF + +# Build + run under MAME (writes 0x0037 to $025000, MAME displays it) +./tools/llvm-mos-build/bin/clang --target=w65816 -O2 -c hello.c -o hello.o +./tools/link816 -o hello.bin --text-base 0x1000 \ + runtime/crt0.o runtime/libc.o runtime/libgcc.o hello.o +bash scripts/runInMame.sh hello.bin --check 0x025000=0037 +``` + +See [docs/USAGE.md](docs/USAGE.md) for a full walkthrough including +multi-segment builds and the Apple IIgs Toolbox. + +## Project layout + +``` +runtime/ C standard library + crt0 startup + src/ sources (C and .s) + include/ headers + *.o built object files +src/ our LLVM/Clang sources (W65816 target backend) + clang/ clang patches + llvm/ LLVM patches + W65816 target + link816/ relocating linker +patches/ patches against vanilla llvm-mos +scripts/ install scripts, MAME runners, benchmarks +tools/ installed compilers, MAME, ROMs, Calypsi (reference) +benchmarks/ cycle-count and instruction-count benchmarks +compare/ side-by-side asm vs Calypsi +docs/ this directory — INSTALL.md, USAGE.md, design notes +``` + +## Status + +Stable enough to build real programs. 
Current quality vs commercial +Calypsi 5.16 (lower is better): + +| Benchmark | Our cyc/call | Calypsi cyc/call (approx) | +|---|---|---| +| sumOfSquares(50) | 16709 | ~16000 | +| popcount(0x12345678) | 2864 | ~2500 | +| memcmp(eq, 5) | 989 | ~700 | +| bsearch(arr, 8, 5) | 767 | ~600 | + +Static-size for the canonical `sumSquares` benchmark: 27 inst (ours) +vs 31 inst (Calypsi) — **0.87×**. + +See [STATUS.md](STATUS.md) for full language and runtime feature +coverage, and [LLVM_65816_DESIGN.md](LLVM_65816_DESIGN.md) for +backend internals. + +## Documentation + +- [docs/INSTALL.md](docs/INSTALL.md) — system requirements and install + steps +- [docs/USAGE.md](docs/USAGE.md) — compile, link, run, debug +- [STATUS.md](STATUS.md) — current language/runtime support matrix +- [LLVM_65816_DESIGN.md](LLVM_65816_DESIGN.md) — backend design notes + +## License + +Apache 2.0 (matching the LLVM project's license). See +`tools/llvm-mos/LICENSE.TXT` after install. diff --git a/STATUS.md b/STATUS.md index b26b089..2672198 100644 --- a/STATUS.md +++ b/STATUS.md @@ -247,8 +247,8 @@ which runs correctly under MAME (apple2gs). - `scripts/benchCyclesPrecise.sh` measures per-call cycle counts via MAME's emulated time counter. Eight benchmarks under `benchmarks/`. Current numbers (after W65816StackSlotMerge): - popcount 3376, bsearch 852, memcmp 1091, strcpy 2387, - dotProduct 2302, fib(10) 12617, sumOfSquares 17391. Speed is + popcount 2864, bsearch 767, memcmp 989, strcpy 2216, + dotProduct 2131, fib(10) 12617, sumOfSquares 16709. Speed is the optimization priority, not size. - `compare/` holds three side-by-side C tests with our asm and @@ -257,10 +257,10 @@ which runs correctly under MAME (apple2gs). recompiles each under both `clang --target=w65816 -O2 -S` and `cc65816 --speed -O 2 --64bit-doubles` and prints an ours/Calypsi instruction-count ratio. 
Current ratios (post - W65816StackSlotMerge Phase 5/6 + extracted Phase 6/6a per-MBB - peepholes + Pass 1c PHP-wrap CMP elim for SP-rel functions): - sumSquares 1.81x (56 inst), evalAt 2.10x (534 inst), mul16to32 - 2.25x (9 inst). See `compare/README.md`. + StackRelToImg 9-phase pipeline including saturating-max preheader + elimination): sumSquares **0.87×** (27 inst — we beat Calypsi's + 31), evalAt 2.10× (534 inst), mul16to32 **1.50×** (6 inst). + See `compare/README.md`. **Backend register allocation:** diff --git a/compare/evalAt.calypsi.lst b/compare/evalAt.calypsi.lst index 3cf979a..50120d9 100644 --- a/compare/evalAt.calypsi.lst +++ b/compare/evalAt.calypsi.lst @@ -1,7 +1,7 @@ ############################################################################### # # # Calypsi ISO C compiler for 65816 version 5.16 # -# 13/May/2026 20:52:21 # +# 14/May/2026 11:06:07 # # Command line: --speed -O 2 --64bit-doubles evalAt.c -o # # /tmp/evalAt.calypsi.elf --list-file evalAt.calypsi.lst # # # diff --git a/compare/mul16to32.calypsi.lst b/compare/mul16to32.calypsi.lst index 9ab0e3d..921e692 100644 --- a/compare/mul16to32.calypsi.lst +++ b/compare/mul16to32.calypsi.lst @@ -1,7 +1,7 @@ ############################################################################### # # # Calypsi ISO C compiler for 65816 version 5.16 # -# 13/May/2026 20:52:21 # +# 14/May/2026 11:06:07 # # Command line: --speed -O 2 --64bit-doubles mul16to32.c -o # # /tmp/mul16to32.calypsi.elf --list-file # # mul16to32.calypsi.lst # diff --git a/compare/mul16to32.ours.s b/compare/mul16to32.ours.s index 3d8876b..f8ce848 100644 --- a/compare/mul16to32.ours.s +++ b/compare/mul16to32.ours.s @@ -6,12 +6,9 @@ mul16to32: ; @mul16to32 ; %bb.0: ; %entry rep #0x30 pha - pha - lda 0x8, s + lda 0x6, s jsl __umulhisi3 ply - sta 0x1, s - ply rtl .Lfunc_end0: .size mul16to32, .Lfunc_end0-mul16to32 diff --git a/compare/sumSquares.calypsi.lst b/compare/sumSquares.calypsi.lst index 017c7d9..3ca5c18 100644 --- 
a/compare/sumSquares.calypsi.lst +++ b/compare/sumSquares.calypsi.lst @@ -1,7 +1,7 @@ ############################################################################### # # # Calypsi ISO C compiler for 65816 version 5.16 # -# 13/May/2026 20:52:21 # +# 14/May/2026 11:06:07 # # Command line: --speed -O 2 --64bit-doubles sumSquares.c -o # # /tmp/sumSquares.calypsi.elf --list-file # # sumSquares.calypsi.lst # diff --git a/compare/sumSquares.ours.s b/compare/sumSquares.ours.s index a28ec9b..37483c1 100644 --- a/compare/sumSquares.ours.s +++ b/compare/sumSquares.ours.s @@ -5,67 +5,38 @@ sumSquares: ; @sumSquares ; %bb.0: ; %entry rep #0x30 - tay - tsc - sec - sbc #0xc - tcs - tya - sta 0x5, s - lda #0x0 - sta 0x3, s - sta 0x1, s - lda 0x5, s - bne .LBB0_1 + sta 0xd0 + stz 0xd6 + stz 0xd4 + lda 0xd0 + bne .LBB0_3 ; %bb.6: ; %entry brl .LBB0_5 -.LBB0_1: ; %for.body.preheader - lda 0x5, s - inc a - sta 0x5, s - cmp #0x3 - bcs .LBB0_3 +; %bb.1: ; %for.body.preheader ; %bb.2: ; %for.body.preheader - lda #0x2 - sta 0x5, s .LBB0_3: ; %for.body.preheader lda #0x1 - sta 0x7, s - lda 0x5, s - dec a - sta 0x5, s - lda #0x0 - sta 0x1, s + sta 0xd2 .LBB0_4: ; %for.body ; =>This Inner Loop Header: Depth=1 - lda 0x7, s + lda 0xd2 pha jsl __umulhisi3 ply clc - adc 0x3, s - sta 0x3, s + adc 0xd6 + sta 0xd6 txa - adc 0x1, s - sta 0x1, s - lda 0x7, s - inc a - sta 0x7, s - lda 0x5, s - dec a - sta 0x5, s + adc 0xd4 + sta 0xd4 + inc 0xd2 + dec 0xd0 beq .LBB0_5 bra .LBB0_4 .LBB0_5: ; %for.cond.cleanup - lda 0x1, s + lda 0xd4 tax - lda 0x3, s - tay - tsc - clc - adc #0xc - tcs - tya + lda 0xd6 rtl .Lfunc_end0: .size sumSquares, .Lfunc_end0-sumSquares diff --git a/docs/INSTALL.md b/docs/INSTALL.md new file mode 100644 index 0000000..b039355 --- /dev/null +++ b/docs/INSTALL.md @@ -0,0 +1,168 @@ +# Installing llvm816 + +The project installs everything into `tools/` under the repo root, so +the tree is self-contained and deletable without affecting your system. 
+ +## System requirements + +- **Ubuntu 22.04 or 24.04** (or any Debian-based distro with apt). + Other Linuxes work if you can install the packages listed below + by hand. +- **Disk:** ~10 GB free (LLVM build artifacts dominate). +- **RAM:** 8 GB minimum, 16 GB recommended for the `--build-llvm` + flag. The setup script's default skips the LLVM build and + downloads a prebuilt toolchain instead — much faster, ~500 MB. +- **Build time:** ~5 minutes for the default (prebuilt) path; 30-60 + minutes for `--build-llvm` (full LLVM source build). + +## One-command install + +```bash +git clone <repo-url> llvm816 +cd llvm816 +./setup.sh +``` + +`setup.sh` installs: + +1. **System apt packages** — build-essential, cmake, ninja, clang, lld, + python3, MAME, etc. See [`scripts/installDeps.sh`](../scripts/installDeps.sh) + for the full list. *Requires sudo.* +2. **llvm-mos** — source tree clone at `tools/llvm-mos/` and a prebuilt + SDK at `tools/llvm-mos-sdk/`. With `--build-llvm` it also runs + cmake/ninja to build a usable W65816-aware clang at + `tools/llvm-mos-build/bin/clang`. +3. **Apple IIgs MAME** — installs MAME via apt and downloads the + apple2gs ROMs to `tools/mame/roms/`. +4. **Calypsi 5.16** — reference 65816 C compiler, installed to + `tools/calypsi/`. Used by the `compare/` benchmarks to measure + our codegen quality against a commercial baseline. +5. **ORCA/C** — Apple's official 65816 C compiler (header reference + for the IIgs Toolbox bindings). 
+ +After `setup.sh` finishes: + +```bash +ls tools/llvm-mos-build/bin/clang # our compiler +ls tools/link816 # our linker +mame -version # MAME (installed via apt) +``` + +## Step-by-step (if `setup.sh` fails) + +You can run each install script in isolation: + +```bash +scripts/installDeps.sh # apt packages +scripts/installLlvmMos.sh # llvm-mos clone + prebuilt SDK +scripts/installLlvmMos.sh --build # also build the source (slow) +scripts/installMame.sh # MAME + apple2gs ROMs +scripts/installCalypsi.sh # reference compiler (optional) +scripts/installOrcaC.sh # reference compiler (optional) +``` + +If you only want to build C programs (no benchmarks, no comparison +to Calypsi), `installCalypsi.sh` and `installOrcaC.sh` are +optional. + +## Building the W65816 backend from source + +The default install pulls a prebuilt LLVM SDK. To build our +W65816-aware clang from source: + +```bash +./setup.sh --build-llvm +``` + +Or, after a non-`--build-llvm` install: + +```bash +scripts/applyBackend.sh # symlink our W65816 sources into llvm-mos clone +cmake --build tools/llvm-mos-build --target llc clang +``` + +The build takes 30-60 minutes on a modern laptop. Subsequent +incremental builds after editing W65816 backend code are ~30 +seconds. + +## Verifying the install + +```bash +# Compile + disassemble a small C function +scripts/cDemo.sh + +# Build the runtime library (libc, libgcc, etc.) +bash runtime/build.sh + +# Run the smoke test suite (~150 checks, takes ~3 minutes) +bash scripts/smokeTest.sh +``` + +A successful smoke test ends with: + +``` +[llvm816] all smoke checks passed +``` + +## Updating + +```bash +git pull +scripts/applyBackend.sh # re-symlink our sources into the LLVM tree +cmake --build tools/llvm-mos-build --target llc clang +bash runtime/build.sh +``` + +If you want a fully clean rebuild: + +```bash +rm -rf tools/llvm-mos-build +./setup.sh --build-llvm +``` + +## Uninstalling + +The toolchain is fully contained under `tools/`. 
To uninstall: + +```bash +rm -rf llvm816/ +sudo apt-get remove mame mame-tools # if you want MAME gone too +``` + +The setup script doesn't touch `/usr/local` or `~/.mame` — nothing +to clean up outside the repo. + +## Troubleshooting + +**`cmake: command not found`** — run `scripts/installDeps.sh`. The +apt packages aren't installed yet. + +**`ROMs not found`** — the apple2gs ROM download from archive.org +occasionally fails. Re-run `scripts/installMame.sh`. The script +is idempotent; it skips ROMs already downloaded. + +**`clang: error: unable to find target 'w65816'`** — the prebuilt +SDK's clang doesn't know about our W65816 target. You need the +source-built clang: + +```bash +scripts/installLlvmMos.sh --build +# Or, more granular: +scripts/applyBackend.sh +cmake --build tools/llvm-mos-build --target clang +``` + +The W65816 target lives in *our* fork at `tools/llvm-mos-build/bin/clang`, +not in the prebuilt SDK. + +**MAME can't find ROMs at runtime** — make sure `mame` is launched +with `-rompath tools/mame/roms`. The provided +[`scripts/runInMame.sh`](../scripts/runInMame.sh) does this +automatically. + +**`linkage error: missing __umulhisi3`** — link `runtime/libgcc.o` +into your binary. See [USAGE.md](USAGE.md#linking). + +**MAME pops up a window I don't want** — the `runInMame.sh` +wrapper now runs headless (`-video none` + `SDL_VIDEODRIVER=dummy`). +If you're invoking MAME directly, add those flags. diff --git a/docs/USAGE.md b/docs/USAGE.md new file mode 100644 index 0000000..0322b60 --- /dev/null +++ b/docs/USAGE.md @@ -0,0 +1,391 @@ +# Using llvm816 + +This document covers compiling a C program, linking it into an +Apple IIgs binary, and running it under MAME. It assumes you've +followed [INSTALL.md](INSTALL.md) and have a working +`tools/llvm-mos-build/bin/clang`. + +## Quick reference + +```bash +CLANG=tools/llvm-mos-build/bin/clang +LINK=tools/link816 +RUNTIME=runtime + +# 1. 
Compile C to object +$CLANG --target=w65816 -O2 -I$RUNTIME/include -c hello.c -o hello.o + +# 2. Link to a raw binary (loadable at $00:1000) +$LINK -o hello.bin --text-base 0x1000 \ + $RUNTIME/crt0.o $RUNTIME/libc.o $RUNTIME/libgcc.o hello.o + +# 3. Run under MAME +bash scripts/runInMame.sh hello.bin --check 0x025000=???? +``` + +## Compiling C + +The compiler is invoked just like a normal clang, with +`--target=w65816`: + +```bash +clang --target=w65816 -O2 -c source.c -o source.o +``` + +**Recommended flags:** + +| Flag | Meaning | +|---|---| +| `--target=w65816` | Selects the W65816 backend (required) | +| `-O2` | Default optimization level. `-O0` and `-O1` work but produce ~3-5× larger code | +| `-ffunction-sections` | Put each function in its own section. Lets the linker drop unreferenced functions | +| `-I runtime/include` | Find `<stdio.h>` etc. | +| `-c` | Compile only — produce `.o`, don't link | + +**What works at `-O2`:** + +- All C99 scalars: `int8_t` through `int64_t`, signed and unsigned, + all arithmetic operators +- Soft `float` and `double` (full IEEE-754 with round-to-nearest-even) +- Pointers, arrays, structs, unions, bitfields +- All control flow: `if`, `for`, `while`, `goto`, `switch`, + recursion +- `<stdarg.h>` varargs +- `<setjmp.h>` setjmp/longjmp (SJLJ, no DWARF unwinder) +- Inline `__asm__` with `"a"`, `"x"`, `"y"` register constraints +- C++ subset: classes, single+multiple inheritance, virtual functions, + RTTI, `dynamic_cast`. **No exceptions** (DWARF unwinder not + implemented). + +See [STATUS.md](../STATUS.md) for the full feature matrix. + +## Linking + +The linker is `tools/link816`. It produces either a raw binary +suitable for direct execution (loaded into a fixed address) or an +OMF binary suitable for GS/OS Loader. + +### Raw binary + +```bash +link816 -o output.bin --text-base 0x1000 crt0.o libc.o libgcc.o yourprog.o +``` + +- `--text-base 0x1000` — physical address where code is loaded. 
+ `0x1000` is the conventional starting address; the first 4KB + of bank 0 ($00:0000 – $00:0FFF) is reserved for the stack and + zero-page. +- `crt0.o` — the C runtime startup. Sets DBR, calls `main`, halts. + Always link first. +- `libc.o` — `printf`, `malloc`, `strlen`, etc. +- `libgcc.o` — compiler-helper routines (`__mulhi3`, `__umulhisi3`, + `__divhi3`, `__ashlhi3`, etc.). Required by most non-trivial + programs. + +### Additional runtime libraries + +| Library | What you get | +|---|---| +| `runtime/libc.o` | Core C library — printf, malloc, strlen, etc. | +| `runtime/libgcc.o` | Compiler helpers — multiply, divide, shift | +| `runtime/snprintf.o` | `sprintf` / `snprintf` / `vsnprintf` | +| `runtime/sscanf.o` | `sscanf` / `vsscanf` / `fscanf` | +| `runtime/softDouble.o` | IEEE 754 double-precision math | +| `runtime/softFloat.o` | IEEE 754 single-precision math | +| `runtime/math.o` | `fabs`, `floor`, `sqrt`, `sin`, `cos`, etc. | +| `runtime/qsort.o` | `qsort` / `bsearch` | +| `runtime/strtol.o` | `strtol` / `strtoul` / `atoi` / `atol` | +| `runtime/strtok.o` | `strtok` / `strtok_r` | +| `runtime/extras.o` | `strcat`, `strncat`, `llabs`, `rand`/`srand` | +| `runtime/timeExt.o` | `time` / `gmtime` / `mktime` | +| `runtime/iigsToolbox.o` | Apple IIgs Toolbox call wrappers | +| `runtime/iigsGsos.o` | GS/OS call wrappers | + +Link only what you use — the linker drops unreferenced symbols. + +Build them all once with: + +```bash +bash runtime/build.sh +``` + +### Multi-segment OMF (for GS/OS Loader) + +For programs that need >60 KB of code (the usable bank-0 limit +after subtracting the stack, zero-page, and I/O window), build a +multi-segment OMF that GS/OS Loader can place across banks: + +```bash +link816 -o myprog.bin --omf --manifest my.manifest \ + --expressload \ + crt0Gsos.o ... yourprog.o +``` + +See [`docs/multiSegmentPlan.md`](multiSegmentPlan.md) for details +and [`scripts/runMultiSeg.sh`](../scripts/runMultiSeg.sh) for a +working example. 
+ +## Running under MAME + +The supplied [`scripts/runInMame.sh`](../scripts/runInMame.sh) +launches MAME's `apple2gs` with the right ROM path, loads your +binary at `$00:1000`, runs for a few seconds, and reads back a +memory cell. + +```bash +bash scripts/runInMame.sh prog.bin # just run for 5s +bash scripts/runInMame.sh prog.bin --check 0x025000=00ff +bash scripts/runInMame.sh prog.bin 0x025000 0x025002 # dump these addrs +``` + +The `--check ADDR=VALUE` form returns exit 0 if `ADDR` contains +`VALUE` after the run, exit 1 otherwise. Use `0x????` to dump +the value without checking. + +MAME is invoked headless by default (no window) via +`-video none` + `SDL_VIDEODRIVER=dummy`. This works on +servers/CI runners. + +### The bank-switch idiom + +Bank 0 (`$00:0000-$00:FFFF`) has the I/O window at `$C000-$CFFF` +that interferes with normal data access. The convention is to +switch the data bank register (DBR) to bank 2 (`$02:0000`) before +doing any data work: + +```c +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ( + "sep #0x20\n" // 8-bit accumulator + ".byte 0xa9,0x02\n" // lda #2 (force as bytes — llvm-mc bug) + "pha\n" + "plb\n" // DBR = 2 + "rep #0x20\n" // back to 16-bit + ); +} +``` + +After `switchToBank2()`, your data lives at `$02:0000` upward. +The `runInMame.sh` `--check 0x025000=...` address is `$02:5000` +— accessible via a normal store in bank 2. 
+ +## Examples + +### Hello, integer + +```c +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ( + "sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n" + ); +} + +int main(void) { + int x = 42; + switchToBank2(); + *(volatile int *)0x5000 = x; + while (1) {} +} +``` + +Build & run: + +```bash +clang --target=w65816 -O2 -c hello.c -o hello.o +link816 -o hello.bin --text-base 0x1000 \ + runtime/crt0.o runtime/libc.o runtime/libgcc.o hello.o +bash scripts/runInMame.sh hello.bin --check 0x025000=002a # 0x2a = 42 +``` + +### Recursion + printing + +```c +#include +#include + +unsigned long fib(unsigned n) { + if (n < 2) return n; + return fib(n-1) + fib(n-2); +} + +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ( + "sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n" + ); +} + +int main(void) { + char buf[32]; + int len = snprintf(buf, sizeof buf, "fib(10) = %lu", fib(10)); + switchToBank2(); + // Copy buf to $025000 so we can read it after the run + for (int i = 0; i <= len; i++) + ((volatile char *)0x5000)[i] = buf[i]; + while (1) {} +} +``` + +Build (note: need snprintf.o for `snprintf`): + +```bash +clang --target=w65816 -O2 -I runtime/include -c fib.c -o fib.o +link816 -o fib.bin --text-base 0x1000 \ + runtime/crt0.o runtime/libc.o runtime/libgcc.o \ + runtime/snprintf.o runtime/softDouble.o runtime/sscanf.o fib.o +``` + +### Apple IIgs Toolbox + +```c +#include + +int main(void) { + DrawString("\pHello, World"); + while (1) {} +} +``` + +Build: + +```bash +clang --target=w65816 -O2 -I runtime/include -c hello_gs.c -o hello_gs.o +link816 -o hello_gs.bin --text-base 0x1000 \ + runtime/crt0Gsos.o runtime/iigsToolbox.o runtime/iigsGsos.o \ + runtime/libgcc.o hello_gs.o +``` + +Use `crt0Gsos.o` (not `crt0.o`) for programs that call into the +toolbox — it sets up the IIgs runtime environment. 
+ +## Inline assembly + +The W65816 backend supports `__asm__` with operand constraints +`"a"`, `"x"`, `"y"`: + +```c +unsigned short addOne(unsigned short x) { + unsigned short r; + __asm__("inc a" : "=a"(r) : "a"(x)); + return r; +} +``` + +Multi-instruction asm and raw bytes both work: + +```c +__asm__ volatile ( + "sep #0x20\n" + ".byte 0x68\n" // pla + "rep #0x20\n" +); +``` + +The `.byte 0xa9, ...` form is sometimes needed to work around +llvm-mc encoding gaps — the assembler doesn't yet support every +65816 addressing mode literally. The pattern works for any +opcode whose mnemonic doesn't yet parse. + +## Tools reference + +| Tool | Location | Purpose | +|---|---|---| +| `clang` | `tools/llvm-mos-build/bin/clang` | C/C++ compiler | +| `llvm-mc` | `tools/llvm-mos-build/bin/llvm-mc` | Assembler | +| `llvm-objdump` | `tools/llvm-mos-build/bin/llvm-objdump` | Disassembler | +| `llc` | `tools/llvm-mos-build/bin/llc` | Standalone codegen (`.ll` → `.s`) | +| `link816` | `tools/link816` | Our relocating linker | +| `omfEmit` | `tools/omfEmit` | Emit OMF v2.1 binary from `link816` output | +| `mame` | `apt` (system-wide) | Apple IIgs emulator | + +## Debugging + +### Look at the asm + +```bash +clang --target=w65816 -O2 -S -o prog.s prog.c +``` + +### Look at the MIR after each pass + +```bash +clang --target=w65816 -O2 -mllvm -print-after-all -S prog.c 2>&1 | less +``` + +Useful pass names to filter on: + +| Pass name | What it does | +|---|---| +| `w65816-isel` | SDAG → MachineInstr selection | +| `w65816-widen-acc16` | Promote Acc16 vregs to Wide16 (regalloc help) | +| `w65816-stack-slot-cleanup` | Remove redundant spill/reload | +| `w65816-stackrel-to-img` | Promote hot stack slots to DP IMG slots | +| `w65816-stack-slot-merge` | Collapse PHI src/dst slot pairs | +| `w65816-branch-expand` | Long-distance Bxx → INV_Bxx skip;BRA | + +### Single-pass filter + +```bash +clang --target=w65816 -O2 -mllvm -print-after=w65816-isel \ + -mllvm -filter-print-funcs=myfunc 
-S prog.c 2>&1 | less +``` + +## Cycle-count benchmarks + +Eight microbenchmarks live under [`benchmarks/`](../benchmarks/). +Each runs N iterations of the bench function and reports a +per-call cycle count via MAME's `emu.time()`: + +```bash +bash scripts/benchCyclesPrecise.sh +``` + +Output: + +``` +| Benchmark | Per-call cycles (clang) | +|-----------|------------------------:| +| bsearch | 767 cyc/call | +| dotProduct | 2131 cyc/call | +| fib | 12617 cyc/call | +| memcmp | 989 cyc/call | +| popcount | 2864 cyc/call | +| strcpy | 2216 cyc/call | +| sumOfSquares | 16709 cyc/call | +``` + +The [`compare/`](../compare/) directory has side-by-side `.s` +files vs Calypsi 5.16 for sumSquares, evalAt, and mul16to32. +Rerun with: + +```bash +bash compare/regen.sh +``` + +## Known limitations + +- **C++ exceptions** are not implemented. `try`/`catch` compiles but + doesn't unwind. `-fsjlj-exceptions` works for limited SJLJ-style + throwing. +- **`stdin`** always returns EOF. `scanf` compiles but isn't useful. + Use `sscanf` on a buffer instead. +- **File I/O** through `fopen` etc. requires a backing implementation. + The default `mfs` backing (memory-file-system) lets you simulate + files via `mfsRegister()` — useful for tests, not for real disk + I/O. GS/OS file I/O works via `runtime/iigsGsos.o` if you link + against the GS/OS runtime. +- **`fork`/`exec`** — not applicable on a 65816, no support. +- **Code generation gotcha:** very large frames (>200 bytes) trigger + FP-relative addressing. Most programs fit under that limit. See + the `frame-rel` discussion in + [LLVM_65816_DESIGN.md](../LLVM_65816_DESIGN.md). + +## Where to go next + +- **Building real GS/OS apps:** see + [`docs/multiSegmentPlan.md`](multiSegmentPlan.md) and the + `runViaFinder.sh` script for booting through real GS/OS 6.0.2 in + MAME. +- **Backend internals (you're hacking on the compiler):** + [LLVM_65816_DESIGN.md](../LLVM_65816_DESIGN.md). 
+- **Smoke tests:** `scripts/smokeTest.sh` runs ~150 end-to-end checks. + Read it for examples of every feature in action. diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 20f71ff..488e30d 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -331,9 +331,11 @@ EOF cat "$sCmpFile" >&2 die "setcc gt test missing: bcc/bcs (carry-based unsigned branch)" fi - if ! grep -qE '^\s*cmp\s+0x[0-9a-f]+,\s*s\s*$' "$sCmpFile"; then + # Accept either stack-relative cmp or DP-form cmp (W65816StackRelToImg + # may promote the comparand to a DP slot when arg b is the hot slot). + if ! grep -qE '^\s*cmp\s+0x[0-9a-f]+(,\s*s)?\s*$' "$sCmpFile"; then cat "$sCmpFile" >&2 - die "setcc gt test missing: cmp ,s (stack-relative compare to arg b)" + die "setcc gt test missing: cmp ,s or cmp (compare to arg b)" fi fi @@ -373,13 +375,13 @@ int max3(int a, int b, int c) { } EOF "$CLANG" --target=w65816 -O2 -S "$cFile3" -o "$sChainFile" - # Expect cmp against a stack-relative slot - the signature of the - # two-Acc16 CMP_RR custom inserter. (Earlier this test also - # required an `sta d,s` spill, but greedy regalloc + WidenAcc16 - # avoids that spill entirely on this pattern.) - if ! grep -qE 'cmp 0x[0-9a-f]+, s' "$sChainFile"; then + # Expect cmp against a stack-relative slot OR a DP slot - the + # signature of the two-Acc16 CMP_RR custom inserter. Earlier this + # required only stack-rel; W65816StackRelToImg may promote the + # comparand to a DP slot for hot offsets. + if ! 
grep -qE 'cmp 0x[0-9a-f]+(, s|$)' "$sChainFile"; then cat "$sChainFile" >&2 - die "two-Acc16 (max3) didn't cmp via stack-relative" + die "two-Acc16 (max3) didn't cmp via stack-relative or DP" fi fi diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index 2a3de56..b192f7e 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(W65816CodeGen W65816ImgCalleeSave.cpp W65816NarrowI32Mul.cpp W65816PromoteFiToImg.cpp + W65816StackRelToImg.cpp W65816StackSlotMerge.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index c328dd3..62f9086 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -143,6 +143,12 @@ FunctionPass *createW65816PromoteFiToImg(); // copy. See W65816StackSlotMerge.cpp. FunctionPass *createW65816StackSlotMerge(); +// Pre-emit pass: rewrite top-N stack-rel slot offsets to IMG0..IMG7 +// DP slots ($D0..$DE). Caller-save semantics — function must only +// call IMG-safe libgcc helpers (verified to not touch $D0..$DE). +// See W65816StackRelToImg.cpp. +FunctionPass *createW65816StackRelToImg(); + // Pre-RA pass that lowers Wide32 register pairs into pairs of i16 // vregs. 
Without this, greedy/basic regalloc can't fit the pair- // pressure of i64-via-2-i32-via-Wide32 traffic in i64-heavy @@ -184,6 +190,7 @@ void initializeW65816ImgCalleeSavePass(PassRegistry &); void initializeW65816NarrowI32MulPass(PassRegistry &); void initializeW65816PromoteFiToImgPass(PassRegistry &); void initializeW65816StackSlotMergePass(PassRegistry &); +void initializeW65816StackRelToImgPass(PassRegistry &); } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp index ce8fa2a..d13d0dd 100644 --- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp +++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp @@ -485,7 +485,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { if (It2 != MI->getParent()->end()) { const TargetRegisterInfo *TRI = MI->getParent()->getParent()->getSubtarget().getRegisterInfo(); - if (It2->modifiesRegister(W65816::A, TRI)) + // PEI doesn't load A, so the LDA's value-set is needed if + // the next instruction READS A. JSL has implicit-def $a + // (caller-save) AND implicit-use $a (when A is an arg) — + // modifiesRegister returns true for both, but readsRegister + // is what tells us if A's value is consumed. Drop the LDA + // ONLY when the next op modifies A WITHOUT reading it. + if (It2->modifiesRegister(W65816::A, TRI) && + !It2->readsRegister(W65816::A, TRI)) ADead = true; } if (ADead) { diff --git a/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp index 1eeba0d..7af5379 100644 --- a/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp +++ b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp @@ -188,10 +188,6 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) { // other spill slots — but the STAfi/LDAfi we emit reference this slot // by FrameIndex, and the only writes to this FI are our save/restore // pair, so coloring can't break the round-trip. 
- // - // (The picol-expr bug came from a SHARED slot with two DIFFERENT - // vregs writing to it; here we have one FI per IMG and a single - // write/read pair per function, so coloring can't trip on this.) MachineFrameInfo &MFI = MF.getFrameInfo(); int FrameSlots[8]; for (int i = 0; i < 8; ++i) { diff --git a/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp b/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp index 131b8cc..8810bc2 100644 --- a/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp +++ b/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp @@ -52,8 +52,11 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" using namespace llvm; @@ -70,6 +73,11 @@ public: StringRef getPassName() const override { return "W65816 promote FrameIndex to IMG8..15 DP slot"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } bool runOnMachineFunction(MachineFunction &MF) override; }; @@ -79,8 +87,11 @@ public: char W65816PromoteFiToImg::ID = 0; -INITIALIZE_PASS(W65816PromoteFiToImg, DEBUG_TYPE, - "W65816 promote FI to IMG", false, false) +INITIALIZE_PASS_BEGIN(W65816PromoteFiToImg, DEBUG_TYPE, + "W65816 promote FI to IMG", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_END(W65816PromoteFiToImg, DEBUG_TYPE, + "W65816 promote FI to IMG", false, false) FunctionPass *llvm::createW65816PromoteFiToImg() { @@ -131,19 +142,20 @@ static uint8_t dpAddrForImg(unsigned ImgIdx) { bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) { - // DISABLED: pass produces verifier errors ("Using an undefined physical - // register") on the kill-flag bookkeeping when 
an STAfi with `killed $a` - // is rewritten to STA_DP — the next i16-imm ADC/ADCE sees $a as dead. - // Also, for the FUNCTIONS where it would land (no-call, high-traffic - // slots), measured static + dynamic savings were modest and didn't - // justify the bookkeeping complexity. Re-enable after: - // - tightening kill-flag preservation: only carry kill if the same - // operand will be the last user in the new MI (which depends on - // post-rewrite scheduling — needs careful liveness re-analysis). - // - paired-PHI promotion: when fi#A is a PHI-input and fi#B is the - // matching PHI-output, map them to the SAME IMG slot so the - // PHI move collapses to a no-op (where most of the dynamic win - // would come from). + // DISABLED again 2026-05-13 (3rd-attempt write-up). Two new findings: + // 1. With kMaxPromote=2 and IMG0..7 (caller-save, skip ImgCalleeSave), + // sumSquares regressed 56 → 72 inst because the FIs picked by + // access-count (fi#2, fi#3) are intermediate spill temps, not + // the i32-accumulator's halves (which are different FIs). The + // loop body ends up using BOTH IMG and stack slots for related + // values. + // 2. To pick the RIGHT FIs (those corresponding to PHI-cycled + // values like the i32 accumulator), we need either: + // (a) IR-level analysis BEFORE FI assignment, or + // (b) Post-RA dataflow analysis to identify "long-lived" FIs + // (active across the loop back-edge with no def/use boundary). + // This is the next blocker. Disabled until either (a) or (b) is + // implemented. return false; if (skipFunction(MF.getFunction())) return false; const W65816Subtarget &STI = MF.getSubtarget(); @@ -151,49 +163,59 @@ bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) { MachineFrameInfo &MFI = MF.getFrameInfo(); // 1. Walk all instructions, count FI accesses for promotable opcodes. 
+ // Weight by loop depth: an access inside a depth-N loop counts as + // 10^N to model the dynamic execution count (an inner-loop slot + // gets executed many times per outer call). + MachineLoopInfo &MLI = + getAnalysis().getLI(); DenseMap AccessCount; DenseMap> AccessSites; for (MachineBasicBlock &MBB : MF) { + unsigned LoopDepth = MLI.getLoopDepth(&MBB); + unsigned Weight = 1; + for (unsigned i = 0; i < LoopDepth && i < 3; ++i) Weight *= 10; for (MachineInstr &MI : MBB) { int FiIdx = getFiOperandIdx(MI.getOpcode()); if (FiIdx < 0) continue; const MachineOperand &MO = MI.getOperand(FiIdx); if (!MO.isFI()) continue; int FI = MO.getIndex(); - // Require: 2-byte size, fixed (not variable), offset operand == 0. - // The offset operand sits right after the FI operand. if (MFI.isVariableSizedObjectIndex(FI)) continue; if (MFI.getObjectSize(FI) != 2) continue; - // Fixed (negative-index) slots are arg slots — leave them alone. - // Promotion would break LowerFormalArguments's expected layout. if (FI < 0) continue; const MachineOperand &OffMO = MI.getOperand(FiIdx + 1); if (!OffMO.isImm() || OffMO.getImm() != 0) continue; - AccessCount[FI]++; + AccessCount[FI] += Weight; AccessSites[FI].push_back(&MI); } } if (AccessCount.empty()) return false; - // 2. Determine which IMG8..15 slots are already in use. + // 2. Determine which IMG0..7 slots are already in use (caller-save). + // Use caller-save IMG0..7 instead of callee-save IMG8..15: this lets + // us skip ImgCalleeSave entirely (no prologue/epilogue overhead). + // The trade-off: any call inside the function clobbers IMG0..7. Mark + // any function with calls as "callees might clobber" → skip promotion. + // This restricts wins to leaf functions (no internal calls). 
BitVector UsedImg(8, false); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { + // Skip CALL instructions — their `implicit-def dead $img0..7` + // operand list marks every IMG slot used, but that's just the + // caller-save annotation, not actual value-bearing usage. + if (MI.isCall()) continue; for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg().isPhysical()) continue; Register R = MO.getReg(); - // IMG8..15 are not numerically contiguous with each other in - // the W65816 register enum (subreg-pair regs sit between - // IMG indices). Spell them out explicitly. - unsigned ImgIdx = 16; // "not an IMG8..15" - if (R == W65816::IMG8) ImgIdx = 0; - else if (R == W65816::IMG9) ImgIdx = 1; - else if (R == W65816::IMG10) ImgIdx = 2; - else if (R == W65816::IMG11) ImgIdx = 3; - else if (R == W65816::IMG12) ImgIdx = 4; - else if (R == W65816::IMG13) ImgIdx = 5; - else if (R == W65816::IMG14) ImgIdx = 6; - else if (R == W65816::IMG15) ImgIdx = 7; + unsigned ImgIdx = 16; + if (R == W65816::IMG0) ImgIdx = 0; + else if (R == W65816::IMG1) ImgIdx = 1; + else if (R == W65816::IMG2) ImgIdx = 2; + else if (R == W65816::IMG3) ImgIdx = 3; + else if (R == W65816::IMG4) ImgIdx = 4; + else if (R == W65816::IMG5) ImgIdx = 5; + else if (R == W65816::IMG6) ImgIdx = 6; + else if (R == W65816::IMG7) ImgIdx = 7; if (ImgIdx < 8) UsedImg.set(ImgIdx); } } @@ -215,20 +237,80 @@ bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) { // save/restore cost compounds with recursion / call frequency // in ways the static access count can't capture). bool HasCalls = false; + bool IsRecursive = false; + StringRef SelfName = MF.getName(); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (MI.isCall()) { HasCalls = true; break; } + if (MI.isCall()) { + HasCalls = true; + // Check for self-call (recursive). 
+ for (const MachineOperand &MO : MI.operands()) { + if (MO.isGlobal() && MO.getGlobal()->getName() == SelfName) + IsRecursive = true; + else if (MO.isSymbol() && SelfName == MO.getSymbolName()) + IsRecursive = true; + } + } } - if (HasCalls) break; } - const unsigned kAccessThreshold = HasCalls ? 999999u : 5u; + // Recursive functions: skip — recursion makes per-call overhead + // compound (each level of recursion pays the save/restore). + if (IsRecursive) return false; + // Caller-save IMG0..7 strategy: any internal call clobbers them, so + // the only safe promoted slots are those whose lifetime doesn't + // cross a call. For now, only promote in leaf functions (no internal + // calls at all). This catches simple loops like sumSquares (which + // calls __umulhisi3 — but that's in libgcc.s and doesn't actually + // touch IMG0..7; treat libgcc multiplies as IMG-safe). + // + // Whitelist of libgcc functions known to not touch IMG0..7. + auto isImgSafeLibcall = [](const MachineInstr &MI) -> bool { + if (!MI.isCall()) return false; + for (const MachineOperand &MO : MI.operands()) { + StringRef Name; + if (MO.isGlobal()) Name = MO.getGlobal()->getName(); + else if (MO.isSymbol()) Name = MO.getSymbolName(); + else continue; + // libgcc.s multiply/divide/shift helpers — verified to only use + // $E0..$E9 internally, no IMG0..7 touch. 
+ if (Name == "__umulhisi3" || Name == "__mulhi3" || + Name == "__mulsi3" || Name == "__udivhi3" || + Name == "__umodhi3" || Name == "__divhi3" || + Name == "__modhi3" || Name == "__udivsi3" || + Name == "__umodsi3" || Name == "__divsi3" || + Name == "__modsi3" || Name == "__ashlhi3" || + Name == "__lshrhi3" || Name == "__ashrhi3" || + Name == "__ashlsi3" || Name == "__lshrsi3" || + Name == "__ashrsi3") + return true; + return false; + } + return false; + }; + bool AllCallsImgSafe = true; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isCall() && !isImgSafeLibcall(MI)) { + AllCallsImgSafe = false; + break; + } + } + if (!AllCallsImgSafe) break; + } + if (HasCalls && !AllCallsImgSafe) return false; + // Threshold: per-access save is 1 cyc, no save/restore overhead. We + // just need the access count to be > 0 to win. Use a small threshold + // for safety (avoid promoting marginal slots). + const unsigned kAccessThreshold = 5u; + const unsigned kMaxPromote = 2u; DenseMap FiToImgIdx; unsigned NextFreeImg = 0; for (int FI : Ordered) { if (AccessCount[FI] < kAccessThreshold) break; + if (FiToImgIdx.size() >= kMaxPromote) break; while (NextFreeImg < 8 && UsedImg.test(NextFreeImg)) ++NextFreeImg; if (NextFreeImg >= 8) break; - FiToImgIdx[FI] = NextFreeImg + 8; // Map to IMG8..15 + FiToImgIdx[FI] = NextFreeImg; // Map to IMG0..7 (caller-save) ++NextFreeImg; } if (FiToImgIdx.empty()) return false; diff --git a/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp b/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp new file mode 100644 index 0000000..d78d6ed --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp @@ -0,0 +1,1220 @@ +//===-- W65816StackRelToImg.cpp - Rewrite stack-rel to IMG DP slots -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// Pre-emit pass. Runs AFTER W65816StackSlotMerge has collapsed PHI +// pairs and AFTER PEI has baked stack offsets into _StackRel ops. +// Scans for the top-N most-accessed stack-rel slot offsets and rewrites +// every reference to use the DP-addressed IMG0..IMG7 equivalent. +// +// Why post-StackSlotMerge: SSM collapses paired PHI slots so the +// surviving offset is "the" long-lived value (e.g., the i32 accumulator +// halves in sumSquares). Pre-SSM, the accumulator would be split across +// multiple offsets and we'd promote the wrong ones. +// +// Why post-PEI: by this point, the FI has been resolved to a numeric +// offset. We're working with MC opcodes: STA_StackRel/LDA_StackRel/ +// ADC_StackRel/etc. No FrameIndex bookkeeping needed. +// +// Why IMG0..IMG7 (caller-save $D0..$DE): callees clobber them per the +// JSL implicit-def, BUT verified libgcc.s helpers (__umulhisi3, +// __mulhi3, etc.) don't actually touch $D0..$DE — only $E0..$E9. So +// values in $D0..$DE survive these specific libcalls. Skip the +// promotion if the function calls anything OTHER than a libgcc-safe +// helper. +// +// Per-access cost: +// STA_StackRel : 5 cyc / 2 B (`sta off,s`) +// STA_DP : 4 cyc / 2 B (`sta dp`) → saves 1 cyc per access +// LDA_StackRel : 5 cyc / 2 B +// LDA_DP : 4 cyc / 2 B +// ADC_StackRel : 5 cyc / 2 B +// ADC_DP : 4 cyc / 2 B +// +// Critical safety: PHP/PLP wraps decrement S by 1 byte, so SP-relative +// offsets INSIDE the wrap get bumped +1. DP addresses DON'T shift with +// S — they're absolute. So when we rewrite an op that lives inside a +// PHP/PLP wrap, we need to map the BUMPED offset back to the original +// "logical" offset before deciding it's the same slot. 
+// +//===---------------------------------------------------------------------===// + +#include "W65816.h" +#include "W65816InstrInfo.h" +#include "W65816Subtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-stackrel-to-img" + + +namespace { + + +class W65816StackRelToImg : public MachineFunctionPass { +public: + static char ID; + W65816StackRelToImg() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { + return "W65816 stack-rel to IMG DP slot rewrite"; + } + bool runOnMachineFunction(MachineFunction &MF) override; +}; + + +} // namespace + + +char W65816StackRelToImg::ID = 0; + +INITIALIZE_PASS(W65816StackRelToImg, DEBUG_TYPE, + "W65816 stack-rel to IMG", false, false) + + +FunctionPass *llvm::createW65816StackRelToImg() { + return new W65816StackRelToImg(); +} + + +// Returns the DP-form opcode for a stack-rel input. +static unsigned getDpOpcode(unsigned Opc) { + switch (Opc) { + case W65816::LDA_StackRel: return W65816::LDA_DP; + case W65816::STA_StackRel: return W65816::STA_DP; + case W65816::ADC_StackRel: return W65816::ADC_DP; + case W65816::SBC_StackRel: return W65816::SBC_DP; + case W65816::CMP_StackRel: return W65816::CMP_DP; + case W65816::AND_StackRel: return W65816::AND_DP; + case W65816::ORA_StackRel: return W65816::ORA_DP; + case W65816::EOR_StackRel: return W65816::EOR_DP; + default: return 0; + } +} + + +static bool isStackRelOp(unsigned Opc) { return getDpOpcode(Opc) != 0; } + + +// Whitelist of libgcc functions verified to not touch IMG0..IMG7 ($D0..$DE). 
+static bool isImgSafeCall(const MachineInstr &MI) { + if (!MI.isCall()) return false; + for (const MachineOperand &MO : MI.operands()) { + StringRef Name; + if (MO.isGlobal()) Name = MO.getGlobal()->getName(); + else if (MO.isSymbol()) Name = MO.getSymbolName(); + else continue; + if (Name == "__umulhisi3" || Name == "__mulhi3" || + Name == "__mulsi3" || Name == "__udivhi3" || + Name == "__umodhi3" || Name == "__divhi3" || + Name == "__modhi3" || Name == "__udivsi3" || + Name == "__umodsi3" || Name == "__divsi3" || + Name == "__modsi3" || Name == "__ashlhi3" || + Name == "__lshrhi3" || Name == "__ashrhi3" || + Name == "__ashlsi3" || Name == "__lshrsi3" || + Name == "__ashrsi3") + return true; + return false; + } + return false; +} + + +bool W65816StackRelToImg::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) return false; + if (MF.getFunction().hasOptNone()) return false; + + // 1. Bail if the function has VLA / FP-rel addressing — offsets would + // be from FP not SP and the PHP-wrap +1 adjustment differs. + if (MF.getFrameInfo().hasVarSizedObjects()) return false; + + // 2. Bail if the function has any non-IMG-safe call. + StringRef SelfName = MF.getName(); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!MI.isCall()) continue; + if (!isImgSafeCall(MI)) return false; + // Skip self-recursion too — caller-save semantics would lose + // values across the recursive call. + for (const MachineOperand &MO : MI.operands()) { + StringRef Name; + if (MO.isGlobal()) Name = MO.getGlobal()->getName(); + else if (MO.isSymbol()) Name = MO.getSymbolName(); + else continue; + if (Name == SelfName) return false; + } + } + } + + // 3. Count stack-rel accesses per offset. CRITICAL: the stack + // pointer shifts during the function due to PHP/PLP (+1 byte) and + // PHA/PUSH16/PEA/PLA/PLY/PLX/PHX/PHY (±2 bytes). ADJCALLSTACK + // pseudos and JSL/RTL also shift SP. 
An access at numerical offset + // O when SP has shifted by K maps to LOGICAL offset (O - K). + // + // We compute SpAdj at every program point and use it to canonicalize + // offsets. If our dataflow can't determine SpAdj at some MBB entry + // (multiple predecessors with different shifts), we bail. + // + // Per-MBB analysis: walk forward, tracking running SpAdj. + // Inter-MBB: we require ALL paths to a MBB have the same SpAdj. + DenseMap MBBEntrySpAdj; + DenseMap MBBExitSpAdj; + MBBEntrySpAdj[&MF.front()] = 0; + bool SpAdjValid = true; + SmallVector Worklist; + Worklist.push_back(&MF.front()); + SmallPtrSet Visited; + auto miSpDelta = [](const MachineInstr &MI) -> int { + switch (MI.getOpcode()) { + case W65816::PHA: case W65816::PHX: case W65816::PHY: + case W65816::PUSH16: case W65816::PEA: + return -2; + case W65816::PLA: case W65816::PLX: case W65816::PLY: + return 2; + case W65816::PHP: + return -1; + case W65816::PLP: + return 1; + default: + break; + } + // ADJCALLSTACK* / TCS / TSC / SEC...SBC...TCS sequences are + // unhandled — they can shift SP arbitrarily. Caller must bail. + return 0; + }; + auto miBailsAnalysis = [](const MachineInstr &MI) -> bool { + // We don't bail on TCS or ADJCALLSTACK*. TCS in prologue/epilogue + // resets SP to a known value (the "canonical" SP for that region); + // since stack-rel accesses don't span TCS in well-formed code (the + // prologue allocates, body uses, epilogue deallocates), treating + // SP as continuing across TCS gives correct relative offsets for + // accesses inside each region. ADJCALLSTACK* aren't usually + // present at pre-emit time (PEI eliminates them or AsmPrinter + // handles). If they're still present, treat as 0 SP-shift — + // the actual PUSH16 ops carry the real shift. 
+ return false; + }; + auto miSpDeltaWithAdj = [&](const MachineInstr &MI) -> int { + if (MI.getOpcode() == W65816::ADJCALLSTACKDOWN || + MI.getOpcode() == W65816::ADJCALLSTACKUP) { + // Skip — the actual PUSH16/PEA/PHA ops inside the call seq + // carry the SP delta. + return 0; + } + if (MI.getOpcode() == W65816::TCS) { + // TCS sets SP; we treat it as a "reset to canonical SP" point. + // Return 0 here; the calling code can do the reset. + return 0; + } + return 0; + }; + (void)miSpDeltaWithAdj; + while (!Worklist.empty() && SpAdjValid) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + if (!Visited.insert(MBB).second) continue; + int Sp = MBBEntrySpAdj[MBB]; + for (MachineInstr &MI : *MBB) { + // TCS resets SP to the canonical value for the region. In + // well-formed code, the prologue's TCS sets SP for the body + // and the epilogue's TCS undoes it. Since stack-rel accesses + // don't span the prologue/epilogue boundaries (the SP-shifts + // they imply happen ONLY within prologue or epilogue, not + // body), treat TCS as "Sp = 0" (canonical body SP). + if (MI.getOpcode() == W65816::TCS) { + Sp = 0; + continue; + } + Sp += miSpDelta(MI); + } + MBBExitSpAdj[MBB] = Sp; + for (MachineBasicBlock *Succ : MBB->successors()) { + auto It = MBBEntrySpAdj.find(Succ); + if (It == MBBEntrySpAdj.end()) { + MBBEntrySpAdj[Succ] = Sp; + Worklist.push_back(Succ); + } else if (It->second != Sp) { + // Conflicting SP at successor entry → bail. + SpAdjValid = false; + break; + } + } + } + if (!SpAdjValid) return false; + + DenseMap AccessCount; + DenseMap ReadCount; + DenseMap WriteCount; + DenseMap> AccessSites; + // Also need to remember the SpAdj at each access site so the rewrite + // can compute the right DP address (DP doesn't shift, but we need + // to know which logical slot the access refers to). 
+ DenseMap SiteSpAdj; + for (MachineBasicBlock &MBB : MF) { + int Sp = MBBEntrySpAdj[&MBB]; + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == W65816::TCS) { + Sp = 0; + continue; + } + if (isStackRelOp(MI.getOpcode())) { + if (MI.getNumOperands() >= 1 && MI.getOperand(0).isImm()) { + int64_t Off = MI.getOperand(0).getImm(); + int64_t LogicalOff = Off + Sp; // canonical; Sp = signed SP shift (pushes negative) + AccessCount[LogicalOff]++; + if (MI.getOpcode() == W65816::STA_StackRel) + WriteCount[LogicalOff]++; + else + ReadCount[LogicalOff]++; + AccessSites[LogicalOff].push_back(&MI); + SiteSpAdj[&MI] = Sp; + } + } + Sp += miSpDelta(MI); + } + } + + if (AccessCount.empty()) return false; + + // 3b. Scan for existing DP-immediate usage in $D0..$DE. Regalloc / + // backend may have already used these slots (via STX_DP / STA_DP / + // LDA_DP with imm in 0xD0..0xDE) for spills or COPY-via-DP patterns. + // Don't double-use those slots. + auto dpImmInRange = [](const MachineInstr &MI) -> int { + unsigned Op = MI.getOpcode(); + switch (Op) { + case W65816::LDA_DP: case W65816::STA_DP: + case W65816::ADC_DP: case W65816::SBC_DP: + case W65816::CMP_DP: case W65816::AND_DP: + case W65816::ORA_DP: case W65816::EOR_DP: + case W65816::STX_DP: case W65816::STY_DP: + case W65816::STZ_DP: + if (MI.getNumOperands() >= 1 && MI.getOperand(0).isImm()) { + int64_t I = MI.getOperand(0).getImm(); + if (I >= 0xD0 && I <= 0xDE && (I & 1) == 0) + return static_cast((I - 0xD0) / 2); // 0..7 + } + break; + } + return -1; + }; + BitVector UsedDp(8, false); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + int idx = dpImmInRange(MI); + if (idx >= 0) UsedDp.set(idx); + } + } + + // 4. Sort offsets by access count, hottest first. + SmallVector Ordered; + for (auto &P : AccessCount) Ordered.push_back(P.first); + std::sort(Ordered.begin(), Ordered.end(), + [&](int64_t A, int64_t B) { + return AccessCount[A] > AccessCount[B]; + }); + + // 5. Promote the hottest offsets (up to kMaxPromote) to free $D0..$DE slots.
+ // + // Heuristic: only promote when the function has FEW hot slots (a hot + // slot has >= kThreshold accesses; the cutoff is kMaxHotSlots). + // Functions with many hot slots tend to be using stack slots as + // SCRATCH temps; promoting them doesn't help and creates many + // STA/LDA pairs to the SAME DP slot that don't optimize well + // (bsearch regression observed at 5+ hot slots). Functions with + // 1-4 hot slots tend to have a long-lived accumulator-like value + // where promotion IS a win (memcmp/strcpy/sumOfSquares observed + // wins with 1-4 hot slots). + const unsigned kThreshold = 3u; + const unsigned kMaxPromote = 8u; + const unsigned kMaxHotSlots = 16u; + unsigned HotCount = 0; + for (int64_t Off : Ordered) { + if (AccessCount[Off] >= kThreshold) ++HotCount; + } + if (HotCount > kMaxHotSlots) return false; + DenseMap OffsetToDp; // logical offset -> DP byte + unsigned NextDpIdx = 0; + // Caller-passed arg slots live ABOVE the return address on the stack; + // post-frame-alloc, they're at offsets >= frame_size + 4. Don't + // promote them — caller wrote to the stack location, not to DP. + int FrameSize = static_cast(MF.getFrameInfo().getStackSize()); + int ArgSlotMinOff = FrameSize + 4; + for (int64_t Off : Ordered) { + if (AccessCount[Off] < kThreshold) break; + if (OffsetToDp.size() >= kMaxPromote) break; + // Skip arg slots (offset >= frame_size + 4 from canonical SP). + if (Off >= ArgSlotMinOff) continue; + // Skip already-used DP slots ($D0..$DE). + while (NextDpIdx < 8 && UsedDp.test(NextDpIdx)) ++NextDpIdx; + if (NextDpIdx >= 8) break; + OffsetToDp[Off] = static_cast(0xD0 + 2 * NextDpIdx); + ++NextDpIdx; + } + // 6. Rewrite each access site. (OffsetToDp may be empty — the later + // phases, 2 onward, still run as standalone peepholes.)
+ const W65816Subtarget &STI = MF.getSubtarget(); + const W65816InstrInfo *TII = STI.getInstrInfo(); + bool Changed = false; + for (auto &P : OffsetToDp) { + int64_t LogicalOff = P.first; + uint8_t DpAddr = P.second; + for (MachineInstr *MI : AccessSites[LogicalOff]) { + unsigned NewOpc = getDpOpcode(MI->getOpcode()); + if (!NewOpc) continue; + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + MachineInstrBuilder NewMI = + BuildMI(*MBB, MI, DL, TII->get(NewOpc)).addImm(DpAddr); + // Carry implicit operands from the original MI. + for (const MachineOperand &MO : MI->operands()) { + if (MO.isImm()) continue; // the offset, already replaced + if (MO.isReg() && !MO.isImplicit()) { + // shouldn't happen for stack-rel ops (only 1 explicit operand) + continue; + } + if (MO.isReg() && MO.isImplicit()) { + RegState Flags = MO.isDef() ? RegState::ImplicitDefine + : RegState::Implicit; + if (MO.isKill()) Flags = Flags | RegState::Kill; + if (MO.isDead()) Flags = Flags | RegState::Dead; + NewMI.addReg(MO.getReg(), Flags); + } + } + MI->eraseFromParent(); + Changed = true; + } + } + + // Phase 2: after promotion, look for `LDA_DP X ; INA ; STA_DP X` + // (or with DEA) patterns and fold to `INC_DP X` (DEC_DP X). These + // are very common in loop body: `i++` and `counter--` both produce + // LDA-modify-STA round trips. INC dp = 7 cyc (16-bit M), 2 bytes; the + // chain = 10 cyc, 5 bytes. Saves 3 cyc and 3 bytes per occurrence. + // + // Safety: INC/DEC dp set N/Z based on the NEW memory value (same as + // INA/DEA on A). Any consumer expecting N/Z still works. The + // A-clobber DOES differ: after LDA-INA-STA, A holds the new value; + // after INC dp, A is unchanged. So only fire when A is dead after + // the STA (= next instruction redefines A or A is killed).
+ for (MachineBasicBlock &MBB : MF) { + SmallVector Dead; + for (auto It = MBB.begin(); It != MBB.end(); ++It) { + if (It->getOpcode() != W65816::LDA_DP) continue; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; + int64_t LdaAddr = It->getOperand(0).getImm(); + auto Next = std::next(It); + while (Next != MBB.end() && Next->isDebugInstr()) ++Next; + if (Next == MBB.end()) continue; + bool isInc = Next->getOpcode() == W65816::INA || + Next->getOpcode() == W65816::INA_PSEUDO; + bool isDec = Next->getOpcode() == W65816::DEA || + Next->getOpcode() == W65816::DEA_PSEUDO; + if (!isInc && !isDec) continue; + auto Sta = std::next(Next); + while (Sta != MBB.end() && Sta->isDebugInstr()) ++Sta; + if (Sta == MBB.end()) continue; + if (Sta->getOpcode() != W65816::STA_DP) continue; + if (Sta->getNumOperands() < 1 || !Sta->getOperand(0).isImm()) continue; + int64_t StaAddr = Sta->getOperand(0).getImm(); + if (LdaAddr != StaAddr) continue; + // Check A is dead after the STA: next instruction must redefine + // A (LDA, transfer, PLA, etc.) or be a terminator (function exit). + // Helper: scan forward from It, looking for A-redef. Returns + // true if A is dead (next operation that touches A defines, not + // uses). Recurses into successor MBBs (one level only). + auto isADeadFromHere = [](MachineBasicBlock::iterator It, + MachineBasicBlock &CurrMBB, + int depth) -> bool { + if (depth > 1) return false; // cap recursion + auto Cur = It; + while (Cur != CurrMBB.end()) { + if (Cur->isDebugInstr()) { ++Cur; continue; } + bool usesA = false, defsA = false; + for (const MachineOperand &MO : Cur->operands()) { + if (MO.isReg() && MO.getReg() == W65816::A) { + if (MO.isDef()) defsA = true; + else if (MO.isUse()) usesA = true; + } + } + if (Cur->isCall()) defsA = true; // caller-save + if (defsA) return true; + if (usesA) return false; + ++Cur; + } + return false; + }; + // Scan in current MBB. 
+ auto After = std::next(Sta); + bool aDead = isADeadFromHere(After, MBB, 0); + // If not resolved in current MBB, check successors. ALL + // successor entries must define A as their first A-touch AND + // not also have any A-using terminator in CurrMBB before the + // jump (because A-use in the branch would consume our value). + if (!aDead) { + // Walk to end of MBB; if any A-use is encountered, bail. + bool foundAUse = false; + for (auto It2 = After; It2 != MBB.end(); ++It2) { + if (It2->isDebugInstr()) continue; + for (const MachineOperand &MO : It2->operands()) { + if (MO.isReg() && MO.getReg() == W65816::A && + MO.isUse() && !MO.isDef()) { + foundAUse = true; + break; + } + } + if (foundAUse) break; + } + if (!foundAUse) { + bool allSuccDeadA = !MBB.succ_empty(); + for (MachineBasicBlock *Succ : MBB.successors()) { + if (!isADeadFromHere(Succ->begin(), *Succ, 1)) { + allSuccDeadA = false; + break; + } + } + if (allSuccDeadA) aDead = true; + } + } + if (!aDead) continue; + // Emit INC_DP / DEC_DP. + DebugLoc DL = It->getDebugLoc(); + BuildMI(MBB, It, DL, TII->get(isInc ? W65816::INC_DP : W65816::DEC_DP)) + .addImm(LdaAddr); + Dead.push_back(&*It); + Dead.push_back(&*Next); + Dead.push_back(&*Sta); + } + for (MachineInstr *MI : Dead) { + MI->eraseFromParent(); + Changed = true; + } + } + + // Phase 3: epilogue TAY/TYA bracket elimination when the bracketed + // load is from DP (which survives TCS). Pattern: + // LDA_DP + // TAY ; preserve A across TCS + // TSC ; CLC ; ADC ; TCS ; frame teardown + // TYA ; restore A + // RTL (or RTS/RTI) + // Rewrite to: + // TSC ; CLC ; ADC ; TCS + // LDA_DP ; load after teardown — DP unaffected + // RTL + // Saves 2 inst (TAY + TYA) per epilogue. Cycle save: ~4 cyc. + for (MachineBasicBlock &MBB : MF) { + // Find the RTL/RTS/RTI terminator. 
+ auto Last = MBB.getLastNonDebugInstr(); + if (Last == MBB.end()) continue; + unsigned LastOpc = Last->getOpcode(); + if (LastOpc != W65816::RTL && LastOpc != W65816::RTS && + LastOpc != W65816::RTI) + continue; + // Walk back looking for the pattern. + auto Tya = Last; + if (Tya == MBB.begin()) continue; + --Tya; + while (Tya != MBB.begin() && Tya->isDebugInstr()) --Tya; + if (Tya->getOpcode() != W65816::TYA) continue; + auto Tcs = Tya; + if (Tcs == MBB.begin()) continue; + --Tcs; + while (Tcs != MBB.begin() && Tcs->isDebugInstr()) --Tcs; + if (Tcs->getOpcode() != W65816::TCS) continue; + auto Adc = Tcs; + if (Adc == MBB.begin()) continue; + --Adc; + while (Adc != MBB.begin() && Adc->isDebugInstr()) --Adc; + if (Adc->getOpcode() != W65816::ADC_Imm16 && + Adc->getOpcode() != W65816::ADCi16imm) + continue; + auto Clc = Adc; + if (Clc == MBB.begin()) continue; + --Clc; + while (Clc != MBB.begin() && Clc->isDebugInstr()) --Clc; + if (Clc->getOpcode() != W65816::CLC) continue; + auto Tsc = Clc; + if (Tsc == MBB.begin()) continue; + --Tsc; + while (Tsc != MBB.begin() && Tsc->isDebugInstr()) --Tsc; + if (Tsc->getOpcode() != W65816::TSC) continue; + auto Tay = Tsc; + if (Tay == MBB.begin()) continue; + --Tay; + while (Tay != MBB.begin() && Tay->isDebugInstr()) --Tay; + if (Tay->getOpcode() != W65816::TAY) continue; + auto LdaLo = Tay; + if (LdaLo == MBB.begin()) continue; + --LdaLo; + while (LdaLo != MBB.begin() && LdaLo->isDebugInstr()) --LdaLo; + if (LdaLo->getOpcode() != W65816::LDA_DP) continue; + if (LdaLo->getNumOperands() < 1 || !LdaLo->getOperand(0).isImm()) + continue; + int64_t LoAddr = LdaLo->getOperand(0).getImm(); + // Build new LDA_DP after TCS, before TYA→RTL (will replace TYA). + DebugLoc DL = LdaLo->getDebugLoc(); + BuildMI(MBB, Tya, DL, TII->get(W65816::LDA_DP)) + .addImm(LoAddr) + .addReg(W65816::A, RegState::ImplicitDefine); + // Erase: old LDA_DP, TAY, TYA. 
+ LdaLo->eraseFromParent(); + Tay->eraseFromParent(); + Tya->eraseFromParent(); + Changed = true; + } + + // Phase 4: prologue TAY/TYA bracket elimination when the bracketed + // store is to DP. Pattern at function entry: + // REP 48 + // TAY ; preserve A across TSC + // TSC ; SEC ; SBC ; TCS ; frame allocation + // TYA ; restore A + // STA_DP ; save A (arg) to DP slot + // Rewrite to: + // REP 48 + // STA_DP ; save A BEFORE TSC clobbers it + // TSC ; SEC ; SBC ; TCS + // Saves 2 inst (TAY + TYA). + { + MachineBasicBlock &EntryMBB = MF.front(); + auto Rep = EntryMBB.getFirstNonDebugInstr(); + if (Rep != EntryMBB.end() && Rep->getOpcode() == W65816::REP) { + auto Tay = std::next(Rep); + while (Tay != EntryMBB.end() && Tay->isDebugInstr()) ++Tay; + if (Tay != EntryMBB.end() && Tay->getOpcode() == W65816::TAY) { + auto Tsc = std::next(Tay); + while (Tsc != EntryMBB.end() && Tsc->isDebugInstr()) ++Tsc; + if (Tsc != EntryMBB.end() && Tsc->getOpcode() == W65816::TSC) { + auto Sec = std::next(Tsc); + while (Sec != EntryMBB.end() && Sec->isDebugInstr()) ++Sec; + if (Sec != EntryMBB.end() && Sec->getOpcode() == W65816::SEC) { + auto Sbc = std::next(Sec); + while (Sbc != EntryMBB.end() && Sbc->isDebugInstr()) ++Sbc; + if (Sbc != EntryMBB.end() && + (Sbc->getOpcode() == W65816::SBC_Imm16 || + Sbc->getOpcode() == W65816::SBCi16imm)) { + auto Tcs = std::next(Sbc); + while (Tcs != EntryMBB.end() && Tcs->isDebugInstr()) ++Tcs; + if (Tcs != EntryMBB.end() && Tcs->getOpcode() == W65816::TCS) { + auto Tya = std::next(Tcs); + while (Tya != EntryMBB.end() && Tya->isDebugInstr()) ++Tya; + if (Tya != EntryMBB.end() && Tya->getOpcode() == W65816::TYA) { + auto Sta = std::next(Tya); + while (Sta != EntryMBB.end() && Sta->isDebugInstr()) ++Sta; + if (Sta != EntryMBB.end() && + Sta->getOpcode() == W65816::STA_DP && + Sta->getNumOperands() >= 1 && + Sta->getOperand(0).isImm()) { + int64_t StaAddr = Sta->getOperand(0).getImm(); + // Build new STA_DP between REP and TSC. 
+ DebugLoc DL = Sta->getDebugLoc(); + BuildMI(EntryMBB, Tsc, DL, TII->get(W65816::STA_DP)) + .addImm(StaAddr) + .addReg(W65816::A, RegState::Implicit); + // Erase: TAY, TYA, old STA_DP. + Tay->eraseFromParent(); + Tya->eraseFromParent(); + Sta->eraseFromParent(); + Changed = true; + } + } + } + } + } + } + } + } + } + + // Phase 5: dead STA before pop. Pattern: + // STA_StackRel ; writes to SP+off + // PLY/PLA/PLX ; pops SP+1..SP+2 (SP += 2) + // ... + // If `off == 1` (or off is within the popped range), the STA writes + // to memory that's about to be popped/overwritten with no read in + // between. STA itself preserves A and P, so erasing is safe IF + // nothing between STA and the pop reads the same memory. + for (MachineBasicBlock &MBB : MF) { + SmallVector Dead; + for (auto It = MBB.begin(); It != MBB.end(); ++It) { + if (It->getOpcode() != W65816::STA_StackRel) continue; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; + int64_t StaOff = It->getOperand(0).getImm(); + // Walk forward: track running SP delta (just due to pops; if we + // hit any other op that uses StaOff's memory, bail). + int popDelta = 0; + bool dead = false; + auto Fwd = std::next(It); + while (Fwd != MBB.end()) { + if (Fwd->isDebugInstr()) { ++Fwd; continue; } + unsigned O = Fwd->getOpcode(); + if (O == W65816::PLA || O == W65816::PLX || O == W65816::PLY) { + // SP += 2. If the STA's offset falls within the popped + // range (StaOff <= popDelta + 2 AND > popDelta), the STA + // wrote memory that's now being popped — dead. + if (StaOff > popDelta && StaOff <= popDelta + 2) { + dead = true; + break; + } + popDelta += 2; + ++Fwd; + continue; + } + if (O == W65816::PLP) { + // SP += 1. + if (StaOff > popDelta && StaOff <= popDelta + 1) { + dead = true; + break; + } + popDelta += 1; + ++Fwd; + continue; + } + // Conservative: bail on any other instruction (could read the + // slot or shift SP in a way we don't model). 
+ break; + } + if (dead) Dead.push_back(&*It); + } + for (MachineInstr *MI : Dead) { + MI->eraseFromParent(); + Changed = true; + } + } + + // Phase 6: `LDA #0 ; STA_DP X ; STA_DP Y ; ...` → `STZ_DP X ; STZ_DP Y`. + // STZ stores zero without going through A and without setting flags. + // Eliminates the LDA when: + // (a) Flags from LDA #0 aren't needed downstream (walk forward from + // last STA, require flag-clobber/uncond-terminator before any + // flag-use). + // (b) A's value (0 from LDA) isn't needed downstream (walk forward + // from last STA, require A-redef before any A-use). + for (MachineBasicBlock &MBB : MF) { + SmallVector Dead; + for (auto It = MBB.begin(); It != MBB.end(); ++It) { + if (It->getOpcode() != W65816::LDA_Imm16 && + It->getOpcode() != W65816::LDAi16imm) + continue; + if (It->getNumOperands() < 2 || !It->getOperand(1).isImm()) continue; + if (It->getOperand(1).getImm() != 0) continue; + // Collect contiguous STA_DP/STA_Abs ops. + SmallVector Stas; + auto Fwd = std::next(It); + while (Fwd != MBB.end()) { + if (Fwd->isDebugInstr()) { ++Fwd; continue; } + if (Fwd->getOpcode() == W65816::STA_DP || + Fwd->getOpcode() == W65816::STAabs || + Fwd->getOpcode() == W65816::STA_Abs) { + Stas.push_back(Fwd); + ++Fwd; + continue; + } + break; + } + if (Stas.empty()) continue; + // Flag-safety: walk past the last STA looking for a flag-clobber + // or unconditional terminator before any flag-use. 
+ bool flagsSafe = false; + auto After = std::next(Stas.back()); + while (After != MBB.end()) { + if (After->isDebugInstr()) { ++After; continue; } + if (After->isConditionalBranch()) break; // unsafe + bool flagUse = false, flagDef = false; + for (const MachineOperand &MO : After->operands()) { + if (MO.isReg() && MO.getReg() == W65816::P) { + if (MO.isUse() && !MO.isDef()) flagUse = true; + if (MO.isDef()) flagDef = true; + } + } + if (flagUse) break; + if (After->isTerminator() && !After->isConditionalBranch()) { + flagsSafe = true; break; + } + if (After->isCall()) { flagsSafe = true; break; } + if (flagDef) { flagsSafe = true; break; } + // Many ops set P implicitly (LDA, ADC, INA, etc.) without an + // explicit P-def operand. Use opcode whitelist. + unsigned O = After->getOpcode(); + if (O == W65816::LDA_DP || O == W65816::LDA_Abs || + O == W65816::LDA_StackRel || O == W65816::LDA_Imm16 || + O == W65816::LDAi16imm || + O == W65816::ADC_DP || O == W65816::ADC_StackRel || + O == W65816::SBC_DP || O == W65816::SBC_StackRel || + O == W65816::CMP_DP || O == W65816::CMP_StackRel || + O == W65816::INA || O == W65816::DEA || + O == W65816::INC_DP || O == W65816::DEC_DP || + O == W65816::TAX || O == W65816::TAY || + O == W65816::TXA || O == W65816::TYA || + O == W65816::PLA || O == W65816::PLX || O == W65816::PLY || + O == W65816::INA_PSEUDO || O == W65816::DEA_PSEUDO) { + flagsSafe = true; break; + } + ++After; + } + if (!flagsSafe) continue; + // ALSO: check A is dead after Stas.back(). Any A-use before A-redef + // would see A=0 from the LDA; without the LDA, A is undef. 
+ bool aDead = false; + auto AScan = std::next(Stas.back()); + while (AScan != MBB.end()) { + if (AScan->isDebugInstr()) { ++AScan; continue; } + bool usesA = false, defsA = false; + for (const MachineOperand &MO : AScan->operands()) { + if (MO.isReg() && MO.getReg() == W65816::A) { + if (MO.isDef()) defsA = true; + else if (MO.isUse()) usesA = true; + } + } + if (usesA) break; + if (AScan->isCall()) defsA = true; + if (defsA) { aDead = true; break; } // use is tested first: an op that reads and writes A keeps A live + ++AScan; + } + if (!aDead) continue; + // Rewrite all STAs to STZ. + for (auto Sta : Stas) { + unsigned NewOp; + if (Sta->getOpcode() == W65816::STA_DP) + NewOp = W65816::STZ_DP; + else + NewOp = W65816::STZ_Abs; + DebugLoc DL = Sta->getDebugLoc(); + // Locate the address operand. STA_DP keeps it at operand 0 + // (it is `InstDP<0x85, "sta">`, the same operand layout as + // LDA_DP, whose operand 0 is the address immediate). A + // store whose operand 0 is a register instead carries its + // address at operand 1. + int AddrIdx = 0; + if (Sta->getNumOperands() >= 1 && Sta->getOperand(0).isReg()) + AddrIdx = 1; // STAabs has explicit $a first + BuildMI(MBB, Sta, DL, TII->get(NewOp)) + .add(Sta->getOperand(AddrIdx)); + Dead.push_back(&*Sta); + } + Dead.push_back(&*It); + } + for (MachineInstr *MI : Dead) { + MI->eraseFromParent(); + Changed = true; + } + } + + // Phase 7: dead PHA before single-arg libcall. Pattern: + // PHA ; dead allocation (the bottom of 2 pushes) + // PUSH16 ; actual arg push (becomes 4,s inside callee) + // LDA_StackRel ; reads caller stack (after both pushes) + // JSLpseudo32 ; helper reads ONLY 4,s + // PLY ; pop + // PLY ; pop the dead PHA value + // (terminator: RTL/RTS/RTI) + // + // If the JSL target is in our "reads only 4,s" libgcc whitelist, + // we can drop the PHA + one PLY + adjust LDA_StackRel offset by -2. + // Saves 2 inst per occurrence.
+ auto isOneArgLibcall = [](const MachineInstr &MI) -> bool { + if (!MI.isCall()) return false; + for (const MachineOperand &MO : MI.operands()) { + StringRef Name; + if (MO.isGlobal()) Name = MO.getGlobal()->getName(); + else if (MO.isSymbol()) Name = MO.getSymbolName(); + else continue; + // Helpers that take one i16 arg in A and one i16 arg at 4,s, + // returning i32 in A:X. __mulhi3 returns i16 not i32 but reads + // same arg layout. + if (Name == "__umulhisi3" || Name == "__mulhi3") + return true; + return false; + } + return false; + }; + for (MachineBasicBlock &MBB : MF) { + SmallVector Dead; + SmallVector, 4> AdjustOff; + for (auto It = MBB.begin(); It != MBB.end(); ++It) { + if (It->getOpcode() != W65816::PHA) continue; + auto Push = std::next(It); + while (Push != MBB.end() && Push->isDebugInstr()) ++Push; + if (Push == MBB.end() || Push->getOpcode() != W65816::PUSH16) continue; + // Walk forward collecting stack-rel accesses and the call. + MachineBasicBlock::iterator CallIt = MBB.end(); + SmallVector StackRelOps; + auto Fwd = std::next(Push); + while (Fwd != MBB.end()) { + if (Fwd->isDebugInstr()) { ++Fwd; continue; } + if (Fwd->isCall()) { CallIt = Fwd; break; } + if (isStackRelOp(Fwd->getOpcode())) { + StackRelOps.push_back(Fwd); + } else if (Fwd->getOpcode() == W65816::PHA || + Fwd->getOpcode() == W65816::PHX || + Fwd->getOpcode() == W65816::PHY || + Fwd->getOpcode() == W65816::PUSH16 || + Fwd->getOpcode() == W65816::PEA || + Fwd->getOpcode() == W65816::PLA || + Fwd->getOpcode() == W65816::PLX || + Fwd->getOpcode() == W65816::PLY) { + // Additional stack ops between push and call — bail. + CallIt = MBB.end(); + break; + } + ++Fwd; + } + if (CallIt == MBB.end()) continue; + if (!isOneArgLibcall(*CallIt)) continue; + // Find the two PLYs after the call. 
+ auto Ply1 = std::next(CallIt); + while (Ply1 != MBB.end() && Ply1->isDebugInstr()) ++Ply1; + if (Ply1 == MBB.end() || Ply1->getOpcode() != W65816::PLY) continue; + auto Ply2 = std::next(Ply1); + while (Ply2 != MBB.end() && Ply2->isDebugInstr()) ++Ply2; + if (Ply2 == MBB.end() || Ply2->getOpcode() != W65816::PLY) continue; + // Verify all StackRel ops have offset > 4 (i.e., they read + // CALLER's stack, not the pushed words; our PHA's slot is at 3..4). + bool safe = true; + for (auto SrIt : StackRelOps) { + if (SrIt->getNumOperands() < 1 || !SrIt->getOperand(0).isImm()) { + safe = false; break; + } + int64_t O = SrIt->getOperand(0).getImm(); + if (O <= 4) { safe = false; break; } // would alias the PHA push + } + if (!safe) continue; + // All clear. Erase PHA, one PLY, adjust stack-rel offsets by -2. + Dead.push_back(&*It); + Dead.push_back(&*Ply2); + for (auto SrIt : StackRelOps) { + AdjustOff.push_back({&*SrIt, -2}); + } + } + for (auto P : AdjustOff) { + P.first->getOperand(0).setImm(P.first->getOperand(0).getImm() + P.second); + } + for (MachineInstr *MI : Dead) { + MI->eraseFromParent(); + Changed = true; + } + } + + // Phase 8: dead frame elimination. After IMG promotion, some + // functions have NO stack-rel access — all locals live in IMG slots. + // The frame alloc (TSC/SEC/SBC/TCS in prologue) and dealloc (TSC/ + // CLC/ADC/TCS in epilogue) are then dead overhead. Detect and elide. + // + // Saves 8 inst (4 in prologue + 4 in epilogue) when conditions met. + // + // Conditions: + // 1. Entry MBB has REP ; [STA_DP ...] ; TSC ; SEC ; SBC #N ; TCS + // 2. Every return MBB has TSC ; CLC ; ADC #N ; TCS ; [LDA_DP] ; RTL/RTS/RTI + // 3. No stack-rel access remains anywhere in the function + // 4. (Not enforced below.) Intended: no PUSH16/PHA/PEA that leaves + // SP unbalanced, i.e. every push has a matching pop. + // NOTE(review): the code in this phase only checks conditions + // 1-3; no push/pop counting is performed — confirm intent. + { + // Check no stack-rel access.
+ bool hasStackRel = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (isStackRelOp(MI.getOpcode())) { + hasStackRel = true; + break; + } + } + if (hasStackRel) break; + } + if (!hasStackRel) { + // Find prologue TSC sequence in entry MBB. + MachineBasicBlock &EntryMBB = MF.front(); + auto Rep = EntryMBB.getFirstNonDebugInstr(); + MachineBasicBlock::iterator Tsc = EntryMBB.end(); + MachineBasicBlock::iterator Sec, Sbc, Tcs2; + if (Rep != EntryMBB.end() && Rep->getOpcode() == W65816::REP) { + auto It = std::next(Rep); + while (It != EntryMBB.end() && It->isDebugInstr()) ++It; + // Skip an optional STA_DP between REP and TSC (the prologue + // optimization in Phase 4 may have moved one there). + while (It != EntryMBB.end() && It->getOpcode() == W65816::STA_DP) + ++It; + while (It != EntryMBB.end() && It->isDebugInstr()) ++It; + if (It != EntryMBB.end() && It->getOpcode() == W65816::TSC) { + Tsc = It++; + while (It != EntryMBB.end() && It->isDebugInstr()) ++It; + if (It != EntryMBB.end() && It->getOpcode() == W65816::SEC) { + Sec = It++; + while (It != EntryMBB.end() && It->isDebugInstr()) ++It; + if (It != EntryMBB.end() && + (It->getOpcode() == W65816::SBC_Imm16 || + It->getOpcode() == W65816::SBCi16imm)) { + Sbc = It++; + while (It != EntryMBB.end() && It->isDebugInstr()) ++It; + if (It != EntryMBB.end() && It->getOpcode() == W65816::TCS) { + Tcs2 = It; + } else { + Tsc = EntryMBB.end(); + } + } else { + Tsc = EntryMBB.end(); + } + } else { + Tsc = EntryMBB.end(); + } + } + } + if (Tsc != EntryMBB.end()) { + // Get N from SBC. + int64_t N = 0; + for (const MachineOperand &MO : Sbc->operands()) { + if (MO.isImm()) { N = MO.getImm(); break; } + } + // For each return MBB, find matching epilogue. 
+ struct EpiInfo { + MachineBasicBlock::iterator Tsc, Clc, Adc, Tcs; + }; + SmallVector Epis; + bool allMatched = true; + for (MachineBasicBlock &MBB : MF) { + auto Last = MBB.getLastNonDebugInstr(); + if (Last == MBB.end()) continue; + unsigned LastOpc = Last->getOpcode(); + if (LastOpc != W65816::RTL && LastOpc != W65816::RTS && + LastOpc != W65816::RTI) + continue; + // Walk back from terminator looking for TSC/CLC/ADC/TCS. + auto Cur = Last; + if (Cur == MBB.begin()) { allMatched = false; break; } + --Cur; + while (Cur != MBB.begin() && Cur->isDebugInstr()) --Cur; + // Skip an optional LDA_DP after TCS (epilogue opt may have put one). + if (Cur != MBB.begin() && Cur->getOpcode() == W65816::LDA_DP) { + --Cur; + while (Cur != MBB.begin() && Cur->isDebugInstr()) --Cur; + } + if (Cur->getOpcode() != W65816::TCS) { + allMatched = false; break; + } + auto Tcs_e = Cur; + if (Cur == MBB.begin()) { allMatched = false; break; } + --Cur; + while (Cur != MBB.begin() && Cur->isDebugInstr()) --Cur; + if (Cur->getOpcode() != W65816::ADC_Imm16 && + Cur->getOpcode() != W65816::ADCi16imm) { + allMatched = false; break; + } + auto Adc_e = Cur; + int64_t AdcN = 0; + for (const MachineOperand &MO : Adc_e->operands()) { + if (MO.isImm()) { AdcN = MO.getImm(); break; } + } + if (AdcN != N) { allMatched = false; break; } + if (Cur == MBB.begin()) { allMatched = false; break; } + --Cur; + while (Cur != MBB.begin() && Cur->isDebugInstr()) --Cur; + if (Cur->getOpcode() != W65816::CLC) { + allMatched = false; break; + } + auto Clc_e = Cur; + if (Cur == MBB.begin()) { allMatched = false; break; } + --Cur; + while (Cur != MBB.begin() && Cur->isDebugInstr()) --Cur; + if (Cur->getOpcode() != W65816::TSC) { + allMatched = false; break; + } + Epis.push_back({Cur, Clc_e, Adc_e, Tcs_e}); + } + if (allMatched && !Epis.empty()) { + // Eliminate prologue TSC/SEC/SBC/TCS. 
+ Tsc->eraseFromParent(); + Sec->eraseFromParent(); + Sbc->eraseFromParent(); + Tcs2->eraseFromParent(); + // Eliminate each epilogue's TSC/CLC/ADC/TCS. + for (auto &E : Epis) { + E.Tsc->eraseFromParent(); + E.Clc->eraseFromParent(); + E.Adc->eraseFromParent(); + E.Tcs->eraseFromParent(); + } + Changed = true; + } + } + } + } + + // Phase 9: saturating-max preheader elimination. + // + // LLVM's loop induction simplification transforms + // for (i = 1; i <= n; i++) { ... } + // into a count-down loop whose preheader saturates `n+1` to at least + // 2, then decrements to get max(n, 1). This protects against the + // `n == 0` edge case at the IR level. But codegen ALSO emits an + // entry check `if (n == 0) goto exit`, making the saturation + // redundant — `n` is already >= 1 in the preheader. + // + // Pattern: + // bb.0 (entry): + // LDA_DP $X ; X = some DP slot holding n + // BNE %preheader + // (fallthrough to brl-exit MBB) + // bb.preheader: + // LDA_DP $X + // INA ; X+1 + // STA_DP $X + // CMP #N ; N = 3 (= 2+1) for the standard pattern + // BCS %merge + // bb.saturate: + // LDA #N-1 ; N-1 = 2 + // STA_DP $X + // bb.merge: + // ... LDA #1 ; STA $Y (i slot) ... + // DEC_DP $X ; X = max(X, 1) - 1 (canceling +1 from INA) + // (or DEC may be elsewhere in bb.merge) + // + // After elimination: + // bb.0 ends with `BNE %merge` (skips preheader+saturate) + // bb.preheader, bb.saturate, and the DEC_DP $X in bb.merge are gone + // Net: counter $X stays at n (no saturate needed, no DEC needed) + // + // For sumSquares: saves 8 inst. + { + // Look for the entry MBB pattern: ends with LDA_DP $X ; BNE %bb. + for (MachineBasicBlock &EntryMBB : MF) { + // Only consider MBBs whose successors look like the pattern. + if (EntryMBB.succ_size() != 2) continue; + auto Last = EntryMBB.getLastNonDebugInstr(); + if (Last == EntryMBB.end()) continue; + if (Last->getOpcode() != W65816::BNE) continue; + // Walk back to find the LDA_DP feeding BNE. 
+ auto LdaIt = Last; + if (LdaIt == EntryMBB.begin()) continue; + --LdaIt; + while (LdaIt != EntryMBB.begin() && LdaIt->isDebugInstr()) --LdaIt; + if (LdaIt->getOpcode() != W65816::LDA_DP) continue; + if (LdaIt->getNumOperands() < 1 || !LdaIt->getOperand(0).isImm()) + continue; + int64_t XAddr = LdaIt->getOperand(0).getImm(); + // BNE's target MBB is the preheader; it gets emptied, so it must have no other predecessor. + MachineBasicBlock *Preheader = nullptr; + for (MachineOperand &MO : Last->operands()) { + if (MO.isMBB()) { Preheader = MO.getMBB(); break; } + } + if (!Preheader || Preheader->pred_size() != 1) continue; + // Check preheader's first 5 ops: LDA_DP $X ; INA_PSEUDO ; STA_DP $X + // ; CMPi16imm $a, N ; BCS %merge. Operand 0 is checked with isImm() before getImm(). + auto P = Preheader->getFirstNonDebugInstr(); + if (P == Preheader->end()) continue; + if (P->getOpcode() != W65816::LDA_DP) continue; + if (P->getNumOperands() < 1 || !P->getOperand(0).isImm() || P->getOperand(0).getImm() != XAddr) continue; + auto P2 = std::next(P); + while (P2 != Preheader->end() && P2->isDebugInstr()) ++P2; + if (P2 == Preheader->end() || + (P2->getOpcode() != W65816::INA && + P2->getOpcode() != W65816::INA_PSEUDO)) + continue; + auto P3 = std::next(P2); + while (P3 != Preheader->end() && P3->isDebugInstr()) ++P3; + if (P3 == Preheader->end() || P3->getOpcode() != W65816::STA_DP) + continue; + if (P3->getNumOperands() < 1 || !P3->getOperand(0).isImm() || P3->getOperand(0).getImm() != XAddr) continue; + auto P4 = std::next(P3); + while (P4 != Preheader->end() && P4->isDebugInstr()) ++P4; + if (P4 == Preheader->end() || + (P4->getOpcode() != W65816::CMP_Imm16 && + P4->getOpcode() != W65816::CMPi16imm)) + continue; + int64_t CmpN = 0; + for (const MachineOperand &MO : P4->operands()) { + if (MO.isImm()) { CmpN = MO.getImm(); break; } + } + // Only handle the loop-start=1 case (CMP #3, saturate to 2). + if (CmpN != 3) continue; + auto P5 = std::next(P4); + while (P5 != Preheader->end() && P5->isDebugInstr()) ++P5; + if (P5 == Preheader->end() || P5->getOpcode() != W65816::BCS) + continue; + // BCS's target is the merge MBB.
+ MachineBasicBlock *Merge = nullptr; + for (MachineOperand &MO : P5->operands()) { + if (MO.isMBB()) { Merge = MO.getMBB(); break; } + } + if (!Merge) continue; + // The fall-through from preheader is the saturate MBB; it gets emptied, so it must have no other predecessor. + MachineBasicBlock *Saturate = nullptr; + for (MachineBasicBlock *S : Preheader->successors()) { + if (S != Merge) { Saturate = S; break; } + } + if (!Saturate || Saturate->pred_size() != 1) continue; + // Saturate MBB: must be `LDA #2 ; STA_DP $X` and fall through to + // merge. + auto S1 = Saturate->getFirstNonDebugInstr(); + if (S1 == Saturate->end()) continue; + if (S1->getOpcode() != W65816::LDA_Imm16 && + S1->getOpcode() != W65816::LDAi16imm) + continue; + int64_t SatVal = 0; + for (const MachineOperand &MO : S1->operands()) { + if (MO.isImm()) { SatVal = MO.getImm(); break; } + } + if (SatVal != CmpN - 1) continue; // expect 2 (since CMP was 3) + auto S2 = std::next(S1); + while (S2 != Saturate->end() && S2->isDebugInstr()) ++S2; + if (S2 == Saturate->end() || S2->getOpcode() != W65816::STA_DP) + continue; + if (S2->getNumOperands() < 1 || !S2->getOperand(0).isImm() || S2->getOperand(0).getImm() != XAddr) continue; + // Find DEC_DP $X in merge MBB. + MachineBasicBlock::iterator DecIt = Merge->end(); + for (auto It = Merge->begin(); It != Merge->end(); ++It) { + if (It->getOpcode() == W65816::DEC_DP && + It->getNumOperands() >= 1 && It->getOperand(0).isImm() && + It->getOperand(0).getImm() == XAddr) { + DecIt = It; + break; + } + } + if (DecIt == Merge->end()) continue; + // All pattern-matched. Rewrite: + // 1. Change entry's BNE %preheader to BNE %merge. + // 2. Erase the DEC_DP in merge. + // 3. Erase every instruction in the preheader and saturate + // MBBs and drop their successor edges. + // + // After the redirect the entry MBB no longer branches to the + // preheader, so the two blocks stay behind only as empty shells + // (their contents would otherwise be emitted as dead asm). + // NOTE(review): removing the DEC_DP assumes every path into merge + // came through the preheader's INA — confirm merge has no other preds.
+ // + // Redirect BNE's target. + for (MachineOperand &MO : Last->operands()) { + if (MO.isMBB() && MO.getMBB() == Preheader) MO.setMBB(Merge); + } + // Update CFG. + EntryMBB.removeSuccessor(Preheader); + EntryMBB.addSuccessor(Merge); + // Erase the DEC_DP in merge. + DecIt->eraseFromParent(); + // Empty the now-unreachable Preheader and Saturate MBBs (their + // contents would otherwise be emitted as dead asm). Also drop + // their successor links. + while (!Preheader->empty()) Preheader->begin()->eraseFromParent(); + while (!Saturate->empty()) Saturate->begin()->eraseFromParent(); + while (!Preheader->succ_empty()) + Preheader->removeSuccessor(*Preheader->succ_begin()); + while (!Saturate->succ_empty()) + Saturate->removeSuccessor(*Saturate->succ_begin()); + Changed = true; + } + } + + return Changed; +} diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp index 7d55677..6e193f5 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp @@ -599,20 +599,31 @@ bool W65816StackSlotMerge::runOnMachineFunction(MachineFunction &MF) { } return 0; }; - // Collect `LDA #K ; STA_StackRel Y` pairs, grouped by Y. + // Collect `LDA #K ; STA_StackRel Y` pairs, grouped by Y. Also + // handles consolidated `LDA #K ; STA Y1 ; STA Y2 ; ...` where the + // LDA is shared (Phase 6 collapsing): A stays at K across STAs. DenseMap, 4>> ConstStas; for (MachineBasicBlock &MBB : MF) { for (auto It = MBB.begin(); It != MBB.end(); ++It) { if (!isLdaImm(*It)) continue; int64_t K = immValue(*It); + // Walk forward through STA_StackRel ops; collect each as an + // init of K (A is preserved across STA). Stop on anything + // that modifies A. 
auto NextIt = std::next(It); - while (NextIt != MBB.end() && NextIt->isDebugInstr()) ++NextIt; - if (NextIt == MBB.end()) continue; - if (NextIt->getOpcode() != W65816::STA_StackRel) continue; - int64_t Y; - if (!srAccess(*NextIt, Y)) continue; - ConstStas[Y].push_back({&*NextIt, K}); + while (NextIt != MBB.end()) { + if (NextIt->isDebugInstr()) { ++NextIt; continue; } + if (NextIt->getOpcode() == W65816::STA_StackRel) { + int64_t Y; + if (srAccess(*NextIt, Y)) { + ConstStas[Y].push_back({&*NextIt, K}); + } + ++NextIt; + continue; + } + break; // any other op — stop (might change A or flags) + } } } // For each slot Y with at least two const-init STAs, check for @@ -692,6 +703,7 @@ bool W65816StackSlotMerge::runOnMachineFunction(MachineFunction &MF) { // flag-use (unsafe). MachineBasicBlock *MBB = DominatedSta->getParent(); bool flagsSafeP5 = false; + bool reachedMBBEnd = false; for (auto Fwd = std::next(DominatedSta->getIterator()); Fwd != MBB->end(); ++Fwd) { if (Fwd->isDebugInstr()) continue; @@ -701,6 +713,33 @@ bool W65816StackSlotMerge::runOnMachineFunction(MachineFunction &MF) { } if (clobbersFlagsP(*Fwd)) { flagsSafeP5 = true; break; } } + // If we walked off the end of MBB, recurse one level into + // successors. The fall-through code is in a successor MBB + // (e.g., bb.3's preheader -> bb.4's loop body which starts + // with an LDA, a flag-clobberer). Require ALL successors + // to clobber flags before any flag-use. + if (!flagsSafeP5) { + // Did the loop exit via fall-through (no break)? + // Check by walking the same loop again, simpler check. + auto It = std::next(DominatedSta->getIterator()); + while (It != MBB->end() && It->isDebugInstr()) ++It; + // ... too brittle to track via prev loop; just recurse for + // every case where flagsSafeP5 is false. Conservative. 
+ bool allSuccClobber = !MBB->succ_empty(); + for (MachineBasicBlock *Succ : MBB->successors()) { + bool succClobbers = false; + for (auto SIt = Succ->begin(); SIt != Succ->end(); ++SIt) { + if (SIt->isDebugInstr()) continue; + if (usesFlagsP(*SIt)) break; + if (clobbersFlagsP(*SIt)) { succClobbers = true; break; } + if (SIt->isTerminator() && !SIt->isConditionalBranch()) { + succClobbers = true; break; + } + } + if (!succClobbers) { allSuccClobber = false; break; } + } + if (allSuccClobber) flagsSafeP5 = true; + } if (!flagsSafeP5) continue; // Erase DominatedSta and its preceding LDA #K. auto Prev = DominatedSta->getIterator(); diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp index 433319b..c6281ac 100644 --- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp +++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp @@ -58,6 +58,7 @@ LLVMInitializeW65816Target() { initializeW65816NarrowI32MulPass(PR); initializeW65816PromoteFiToImgPass(PR); initializeW65816StackSlotMergePass(PR); + initializeW65816StackRelToImgPass(PR); // Default IndVarSimplify's exit-value rewriter to "never". The // closed-form replacement frequently widens an i16 induction var @@ -279,6 +280,7 @@ void W65816PassConfig::addPreEmitPass() { // collapses when X and Y are renamed to the same slot). See // W65816StackSlotMerge.cpp. addPass(createW65816StackSlotMerge()); + addPass(createW65816StackRelToImg()); } MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(