Install cleaned up.
This commit is contained in:
parent
2deaba9c29
commit
2bec85ffa5
26 changed files with 1010 additions and 96 deletions
50
README.md
50
README.md
|
|
@ -26,21 +26,20 @@ tight loops in benchmarks like sumOfSquares, popcount, and strcpy.
|
||||||
After installation (see [docs/INSTALL.md](docs/INSTALL.md)):
|
After installation (see [docs/INSTALL.md](docs/INSTALL.md)):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Compile a C file
|
# Write a tiny C program that computes 1+2+...+10 = 55 and stores it.
|
||||||
cat > hello.c <<'EOF'
|
cat > hello.c <<'EOF'
|
||||||
__attribute__((noinline)) void switchToBank2(void) {
|
|
||||||
__asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
|
|
||||||
}
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
unsigned short x = 0;
|
unsigned short x = 0;
|
||||||
for (int i = 1; i <= 10; i++) x += i; // x = 55
|
for (int i = 1; i <= 10; i++) x += i; // x = 55 = 0x37
|
||||||
switchToBank2();
|
// Write to a known 24-bit absolute address. The compiler lowers
|
||||||
*(volatile unsigned short *)0x5000 = x;
|
// this to `sta long $025000` — no bank switching needed. The MAME
|
||||||
|
// test harness reads this cell to verify the program ran.
|
||||||
|
*(volatile unsigned short *)0x025000 = x;
|
||||||
while (1) {}
|
while (1) {}
|
||||||
}
|
}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
# Build + run under MAME (writes 0x0037 to $025000, MAME displays it)
|
# Compile, link, run under MAME, check the result.
|
||||||
./tools/llvm-mos-build/bin/clang --target=w65816 -O2 -c hello.c -o hello.o
|
./tools/llvm-mos-build/bin/clang --target=w65816 -O2 -c hello.c -o hello.o
|
||||||
./tools/link816 -o hello.bin --text-base 0x1000 \
|
./tools/link816 -o hello.bin --text-base 0x1000 \
|
||||||
runtime/crt0.o runtime/libc.o runtime/libgcc.o hello.o
|
runtime/crt0.o runtime/libc.o runtime/libgcc.o hello.o
|
||||||
|
|
@ -71,21 +70,32 @@ docs/ this directory — INSTALL.md, USAGE.md, design notes
|
||||||
|
|
||||||
## Status
|
## Status
|
||||||
|
|
||||||
Stable enough to build real programs. Static instruction-count
|
Stable enough to build real programs. Per-call cycle measurements
|
||||||
ratio against commercial Calypsi 5.16 (lower is better):
|
against commercial Calypsi 5.16, measured under MAME via `emu.time()`
|
||||||
|
(IIgs slow-mode 1.023 MHz, `-mllvm -w65816-dbr-safe-ptrs` enabled):
|
||||||
|
|
||||||
| Benchmark | Ours (inst) | Calypsi (inst) | Ratio |
|
| Benchmark | Ours | Calypsi | Ratio |
|
||||||
|---|---:|---:|---:|
|
|---|---:|---:|---:|
|
||||||
| sumSquares | 26 | 31 | **0.84×** ✓ |
|
| bsearch | 682 | 2,387 | **0.29×** ✓ |
|
||||||
| evalAt | 472 | 254 | 1.86× |
|
| dotProduct | 1,534 | 5,712 | **0.27×** ✓ |
|
||||||
| mul16to32 | 1 | 4 | **0.25×** ✓ |
|
| sumOfSquares | 6,820 | 16,368 | **0.42×** ✓ |
|
||||||
|
| bubbleSort | 11,594 | 17,050 | **0.68×** ✓ |
|
||||||
|
| djb2Hash | 2,387 | 2,643 | **0.90×** ✓ |
|
||||||
|
| memcmp | 716 | 716 | **1.00×** |
|
||||||
|
| strcpy | 1,279 | 1,194 | 1.07× |
|
||||||
|
| popcount | 1,705 | 1,534 | 1.11× |
|
||||||
|
| fib | 12,106 | 10,912 | 1.11× |
|
||||||
|
| strLen | 1,876 | 1,023 | 1.83× |
|
||||||
|
|
||||||
Per-iteration cycle measurements (via MAME's HBL counter, 2026-05-20):
|
**Geomean: 0.74× Calypsi** across this suite. Five of ten benches beat
|
||||||
bsearch 127, dotProduct 144, fib 97, memcmp 113, popcount 93,
|
Calypsi outright; one ties exactly. Run `scripts/benchCyclesPrecise.sh`
|
||||||
strcpy 91, sumOfSquares 126 cyc/iter (100 iters);
|
(ours) and `scripts/benchCyclesCalypsi.sh` (Calypsi) to reproduce.
|
||||||
dadd 1157, ddiv 1261, dmul 1033 cyc/iter (10 iters);
|
|
||||||
particles 2253 (3 iters — 32-particle physics tick);
|
On real programs:
|
||||||
mandelbrot 11570 (1 iter — 4×4 fixed-point tile).
|
- **Lua 5.1.5** (17K LoC, 24 source files) compiles + links clean.
|
||||||
|
Object total 0.93× Calypsi.
|
||||||
|
- **CoreMark 1.0** (EEMBC standard benchmark) compiles + links clean.
|
||||||
|
Object total 0.80× Calypsi.
|
||||||
|
|
||||||
See [STATUS.md](STATUS.md) for full language and runtime feature
|
See [STATUS.md](STATUS.md) for full language and runtime feature
|
||||||
coverage, and [LLVM_65816_DESIGN.md](LLVM_65816_DESIGN.md) for
|
coverage, and [LLVM_65816_DESIGN.md](LLVM_65816_DESIGN.md) for
|
||||||
|
|
|
||||||
|
|
@ -46,14 +46,15 @@ everywhere.
|
||||||
distro-neutral.
|
distro-neutral.
|
||||||
- **CPU:** any 64-bit x86 or ARM Linux machine. We're cross-compiling,
|
- **CPU:** any 64-bit x86 or ARM Linux machine. We're cross-compiling,
|
||||||
so the host CPU only matters for build speed.
|
so the host CPU only matters for build speed.
|
||||||
- **Disk:** ~10 GB free total (~5 GB during build, ~7 GB after install
|
- **Disk:** ~20 GB free during install (~12 GB peak for LLVM's cmake
|
||||||
with all reference compilers). If you skip Calypsi (`--skip-calypsi`),
|
intermediates, ~7 GB resident after install once you delete the
|
||||||
knock 580 MB off.
|
intermediates). If you skip Calypsi (`--skip-calypsi`), knock
|
||||||
- **RAM:** 8 GB minimum for the default install (downloads a prebuilt
|
580 MB off the resident size.
|
||||||
llvm-mos SDK). 16 GB recommended if you use `--build-llvm` (compiles
|
- **RAM:** 16 GB recommended (LLVM's link step is the memory-heavy
|
||||||
LLVM from source).
|
one). 8 GB works but the linker may swap, doubling build time.
|
||||||
- **Time:** ~5 minutes for the default (prebuilt) path; 30-60 minutes
|
- **Time:** 30-60 minutes end-to-end (LLVM is the long pole). After
|
||||||
for `--build-llvm` on a modern laptop (depends on core count).
|
the first build, incremental edits to the W65816 backend rebuild
|
||||||
|
in ~30 seconds.
|
||||||
- **Network:** the install pulls ~500 MB of binaries from GitHub,
|
- **Network:** the install pulls ~500 MB of binaries from GitHub,
|
||||||
archive.org, and the Calypsi releases page. No proxy support
|
archive.org, and the Calypsi releases page. No proxy support
|
||||||
baked in — set `http_proxy` / `https_proxy` if you need one.
|
baked in — set `http_proxy` / `https_proxy` if you need one.
|
||||||
|
|
@ -89,7 +90,8 @@ running, see `scripts/installDeps.sh` for the exact list.
|
||||||
## One-command install
|
## One-command install
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone <this-repo-url> llvm816
|
# Replace <REPO-URL> with the actual git URL for this repository.
|
||||||
|
git clone <REPO-URL> llvm816
|
||||||
cd llvm816
|
cd llvm816
|
||||||
./setup.sh
|
./setup.sh
|
||||||
```
|
```
|
||||||
|
|
@ -99,33 +101,40 @@ That's it. `setup.sh` runs five stages in order:
|
||||||
| Stage | Script | What it does | Time |
|
| Stage | Script | What it does | Time |
|
||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
| 1/5 | `installDeps.sh` | `sudo apt-get install` the packages listed above. | ~1 min |
|
| 1/5 | `installDeps.sh` | `sudo apt-get install` the packages listed above. | ~1 min |
|
||||||
| 2/5 | `installLlvmMos.sh` | Clone `llvm-mos` source (5 GB), download prebuilt llvm-mos SDK (400 MB), build our W65816 clang under `tools/llvm-mos-build/`. Without `--build-llvm`, downloads the prebuilt SDK only; clang for our target is then built incrementally. | ~5 min (no source build) or 30-60 min (with `--build-llvm`) |
|
| 2/5 | `installLlvmMos.sh` | Clone `llvm-mos` source (5 GB), download prebuilt llvm-mos SDK (400 MB), **apply our W65816 backend** (symlinks + patches), **build clang/llc/llvm-mc with W65816 target enabled**, **build `link816` + `omfEmit`**, and **build the runtime libraries** (`libc.o`, `crt0.o`, `libgcc.o`, soft-float, etc.). After this stage you have a working W65816 toolchain end-to-end. | ~30-60 min (first time; LLVM build is the long pole) |
|
||||||
| 3/5 | `installMame.sh` | Install MAME via apt, download `apple2gs.zip` (ROM 03) and `apple2gsr1.zip` (ROM 01) into `tools/mame/roms/`. | ~30 s |
|
| 3/5 | `installMame.sh` | Install MAME via apt, download `apple2gs.zip` (ROM 03) and `apple2gsr1.zip` (ROM 01) into `tools/mame/roms/`. | ~30 s |
|
||||||
| 4/5 | `installCalypsi.sh` | Download Calypsi 5.16 .deb, extract its payload into `tools/calypsi/` (no system-wide install). | ~30 s |
|
| 4/5 | `installCalypsi.sh` | Download Calypsi 5.16 .deb, extract its payload into `tools/calypsi/` (no system-wide install). | ~30 s |
|
||||||
| 5/5 | `installOrcaC.sh` | Shallow clone of byteworksinc's ORCA/C repo into `tools/orca-c/` for toolbox header reference. | ~15 s |
|
| 5/5 | `installOrcaC.sh` | Shallow clone of byteworksinc's ORCA/C repo into `tools/orca-c/` for toolbox header reference. | ~15 s |
|
||||||
|
|
||||||
After each stage, the script prints `=== N/5 stage-name ===` so you
|
After each stage, the script prints `=== N/5 stage-name ===` so you
|
||||||
can follow progress. At the end it runs `verify.sh` which sanity-
|
can follow progress. At the end it runs `verify.sh` which sanity-
|
||||||
checks every tool was installed.
|
checks every tool was installed AND end-to-end compiles a tiny C
|
||||||
|
program to confirm `clang` actually produces W65816 machine code.
|
||||||
|
|
||||||
A successful install ends with:
|
A successful install ends with:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
[llvm816] all checks passed
|
||||||
[llvm816] setup complete
|
[llvm816] setup complete
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If `verify.sh` reports failures, the most common cause is that the
|
||||||
|
LLVM build didn't include the W65816 target. Re-run
|
||||||
|
`scripts/applyBackend.sh` followed by
|
||||||
|
`ninja -C tools/llvm-mos-build clang llc llvm-mc llvm-objdump`.
|
||||||
|
|
||||||
### `setup.sh` flags
|
### `setup.sh` flags
|
||||||
|
|
||||||
| Flag | Effect |
|
| Flag | Effect |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `--build-llvm` | Build clang from source (30-60 min) instead of using the prebuilt SDK. Required if you plan to modify the W65816 backend. |
|
|
||||||
| `--skip-deps` | Don't run apt (use if you've already installed the system packages). |
|
| `--skip-deps` | Don't run apt (use if you've already installed the system packages). |
|
||||||
| `--skip-llvm` | Skip the LLVM clone + build. Useful for iterating on other parts. |
|
| `--skip-llvm` | Skip the LLVM clone + build + runtime. Useful for iterating on other parts. |
|
||||||
| `--skip-mame` | Skip MAME + ROM download. |
|
| `--skip-mame` | Skip MAME + ROM download. |
|
||||||
| `--skip-calypsi` | Skip Calypsi (saves 580 MB if you don't need the comparison benchmarks). |
|
| `--skip-calypsi` | Skip Calypsi (saves 580 MB if you don't need the comparison benchmarks). |
|
||||||
| `--skip-orca` | Skip ORCA/C (saves ~10 MB; only needed if you regenerate `iigs/toolbox.h`). |
|
| `--skip-orca` | Skip ORCA/C (saves ~10 MB; only needed if you regenerate `iigs/toolbox.h`). |
|
||||||
| `--skip-verify` | Don't run the post-install verification check. |
|
| `--skip-verify` | Don't run the post-install verification check. |
|
||||||
| `--verify-only` | Just run the verification check, don't install anything. |
|
| `--verify-only` | Just run the verification check, don't install anything. |
|
||||||
|
| `--build-llvm` | Deprecated alias — the LLVM build is now always part of stage 2/5 (without it we wouldn't have a usable W65816 compiler). |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -240,19 +249,11 @@ If you only want to *build* C programs (no benchmarks, no comparisons),
|
||||||
|
|
||||||
### Building W65816 clang from source
|
### Building W65816 clang from source
|
||||||
|
|
||||||
The default install pulls a *prebuilt* llvm-mos SDK but builds our
|
`setup.sh` always builds clang from source — that's the only way to
|
||||||
W65816 backend incrementally on top. If you want to build everything
|
get a `clang` that actually targets W65816 (the prebuilt llvm-mos SDK
|
||||||
from source (recommended for backend development):
|
in `tools/llvm-mos-sdk/` only knows about the 6502 MOS target). The
|
||||||
|
initial build takes 30-60 minutes depending on core count; after that
|
||||||
```bash
|
incremental rebuilds are ~30 seconds:
|
||||||
./setup.sh --build-llvm
|
|
||||||
```
|
|
||||||
|
|
||||||
This adds about 30-60 minutes to install time but means you can edit
|
|
||||||
files under `src/llvm/lib/Target/W65816/` and rebuild quickly.
|
|
||||||
|
|
||||||
After the initial source build, incremental rebuilds after editing
|
|
||||||
backend code take ~30 seconds:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ninja -C tools/llvm-mos-build llc clang
|
ninja -C tools/llvm-mos-build llc clang
|
||||||
|
|
@ -314,7 +315,7 @@ If you want a fully clean rebuild (e.g., to chase a "stale .o" bug):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
rm -rf tools/llvm-mos-build
|
rm -rf tools/llvm-mos-build
|
||||||
./setup.sh --build-llvm
|
./setup.sh --skip-deps --skip-mame --skip-calypsi --skip-orca
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
@ -417,11 +418,11 @@ bash scripts/updateLlvmMos.sh
|
||||||
|
|
||||||
That script handles the symlinks safely.
|
That script handles the symlinks safely.
|
||||||
|
|
||||||
### Disk fills up during `--build-llvm`
|
### Disk fills up during the LLVM build
|
||||||
|
|
||||||
A full LLVM build needs ~12 GB of temporary build artifacts (cmake's
|
A full LLVM build needs ~12 GB of temporary build artifacts (cmake's
|
||||||
intermediate `.o` files, .a archives, etc.) on top of the 5 GB source
|
intermediate `.o` files, .a archives, etc.) on top of the 5 GB source
|
||||||
tree. Free ~15 GB before running `--build-llvm`.
|
tree. Free ~20 GB before running `setup.sh`.
|
||||||
|
|
||||||
Once the build completes, the *intermediate* artifacts under
|
Once the build completes, the *intermediate* artifacts under
|
||||||
`tools/llvm-mos-build/CMakeFiles/` can be deleted — the binaries
|
`tools/llvm-mos-build/CMakeFiles/` can be deleted — the binaries
|
||||||
|
|
@ -431,7 +432,7 @@ under `tools/llvm-mos-build/bin/` are self-contained:
|
||||||
rm -rf tools/llvm-mos-build/CMakeFiles tools/llvm-mos-build/lib
|
rm -rf tools/llvm-mos-build/CMakeFiles tools/llvm-mos-build/lib
|
||||||
```
|
```
|
||||||
|
|
||||||
But this disables incremental rebuilds. Re-running `--build-llvm`
|
But this disables incremental rebuilds. Deleting `tools/llvm-mos-build` and re-running `setup.sh`
|
||||||
recreates everything.
|
recreates everything.
|
||||||
|
|
||||||
### Calypsi install fails / I don't want it
|
### Calypsi install fails / I don't want it
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,21 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# Install llvm-mos: clone source tree for backend development, plus
|
# Install + build the W65816 toolchain on top of llvm-mos:
|
||||||
# download prebuilt SDK for reference/smoke-testing existing 6502 targets.
|
# 1. Clone llvm-mos source.
|
||||||
|
# 2. Download the prebuilt llvm-mos-sdk (reference baseline).
|
||||||
|
# 3. Apply our W65816 backend INTO the clone (symlinks + patches).
|
||||||
|
# 4. Configure + build clang + llc + llvm-mc with the W65816 target.
|
||||||
|
# 5. Build link816 + omfEmit (the linker).
|
||||||
|
# 6. Build the runtime (libc.o, crt0.o, libgcc.o, etc.).
|
||||||
#
|
#
|
||||||
# Flags:
|
# Flags:
|
||||||
# --build also build the source tree with cmake/ninja (slow, ~30-60 min)
|
# --build (no-op; retained for backward compat — we always build)
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
source "$(dirname "$0")/common.sh"
|
source "$(dirname "$0")/common.sh"
|
||||||
|
|
||||||
doBuild=0
|
|
||||||
for arg in "$@"; do
|
for arg in "$@"; do
|
||||||
case "$arg" in
|
case "$arg" in
|
||||||
--build) doBuild=1 ;;
|
--build) ;; # no-op; we always build now (see step 4)
|
||||||
*) die "unknown flag: $arg" ;;
|
*) die "unknown flag: $arg" ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
@ -44,7 +48,9 @@ else
|
||||||
git clone --depth=1 https://github.com/llvm-mos/llvm-mos.git "$LLVM_SRC"
|
git clone --depth=1 https://github.com/llvm-mos/llvm-mos.git "$LLVM_SRC"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 2. Prebuilt SDK for testing/reference.
|
# 2. Prebuilt SDK for testing/reference (smoke tests against the
|
||||||
|
# vanilla 6502 MOS target; mostly unused once you have a W65816
|
||||||
|
# build).
|
||||||
if [ -x "$LLVM_SDK/bin/mos-common-clang" ] || [ -x "$LLVM_SDK/bin/clang" ]; then
|
if [ -x "$LLVM_SDK/bin/mos-common-clang" ] || [ -x "$LLVM_SDK/bin/clang" ]; then
|
||||||
log "llvm-mos-sdk already extracted"
|
log "llvm-mos-sdk already extracted"
|
||||||
else
|
else
|
||||||
|
|
@ -54,30 +60,57 @@ else
|
||||||
tar -xJf "$archive" -C "$LLVM_SDK" --strip-components=1
|
tar -xJf "$archive" -C "$LLVM_SDK" --strip-components=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 3. Optional source build.
|
# 3. Apply our W65816 backend INTO the clone (symlinks + patches).
|
||||||
if [ "$doBuild" -eq 1 ]; then
|
# Must run BEFORE cmake configure so the W65816 target dir + cmake
|
||||||
needCmd cmake
|
# patch are present.
|
||||||
needCmd ninja
|
log "applying W65816 backend (symlinks + patches)"
|
||||||
log "configuring llvm-mos build (this takes a while)"
|
bash "$(dirname "$0")/applyBackend.sh"
|
||||||
|
|
||||||
|
# 4. Configure + build LLVM with W65816 enabled. We always build —
|
||||||
|
# without a built clang the rest of the toolchain (runtime, link816)
|
||||||
|
# can't produce any usable output. --build is kept as a no-op flag
|
||||||
|
# for backward compat.
|
||||||
|
needCmd cmake
|
||||||
|
needCmd ninja
|
||||||
|
if [ -x "$LLVM_BUILD/bin/clang" ] && \
|
||||||
|
[ -x "$LLVM_BUILD/bin/llc" ] && \
|
||||||
|
"$LLVM_BUILD/bin/llc" --version 2>/dev/null | grep -q "^[[:space:]]*w65816[[:space:]]"; then
|
||||||
|
log "llvm-mos-build/bin/clang already exists and supports w65816"
|
||||||
|
else
|
||||||
|
log "configuring llvm-mos build (LLVM + clang + lld; ~5 min after the first cmake)"
|
||||||
install -d "$LLVM_BUILD"
|
install -d "$LLVM_BUILD"
|
||||||
cmake -S "$LLVM_SRC/llvm" -B "$LLVM_BUILD" -G Ninja \
|
cmake -S "$LLVM_SRC/llvm" -B "$LLVM_BUILD" -G Ninja \
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
-DLLVM_TARGETS_TO_BUILD="" \
|
-DLLVM_TARGETS_TO_BUILD="" \
|
||||||
-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="MOS" \
|
-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="MOS;W65816" \
|
||||||
-DLLVM_ENABLE_PROJECTS="clang;lld" \
|
-DLLVM_ENABLE_PROJECTS="clang;lld" \
|
||||||
-DLLVM_PARALLEL_LINK_JOBS=1 \
|
-DLLVM_PARALLEL_LINK_JOBS=1 \
|
||||||
-DLLVM_USE_LINKER=lld \
|
-DLLVM_USE_LINKER=lld \
|
||||||
-DLLVM_INCLUDE_TESTS=OFF \
|
-DLLVM_INCLUDE_TESTS=OFF \
|
||||||
-DLLVM_INCLUDE_EXAMPLES=OFF \
|
-DLLVM_INCLUDE_EXAMPLES=OFF \
|
||||||
-DLLVM_INCLUDE_BENCHMARKS=OFF
|
-DLLVM_INCLUDE_BENCHMARKS=OFF
|
||||||
log "building llvm-mos (background-friendly: use --build only when you have time)"
|
log "building clang, llc, llvm-mc, llvm-objdump (the tools we actually use)"
|
||||||
ninja -C "$LLVM_BUILD"
|
ninja -C "$LLVM_BUILD" clang llc llvm-mc llvm-objdump llvm-readobj
|
||||||
log "llvm-mos build complete: $LLVM_BUILD/bin/clang"
|
log "llvm build done: $LLVM_BUILD/bin/clang"
|
||||||
else
|
|
||||||
log "skipped source build; rerun with --build when ready (cmake+ninja)"
|
|
||||||
fi
|
fi
|
||||||
|
# Sanity check: llc must list w65816 as a registered target.
|
||||||
|
if ! "$LLVM_BUILD/bin/llc" --version 2>/dev/null | grep -q "^[[:space:]]*w65816[[:space:]]"; then
|
||||||
|
"$LLVM_BUILD/bin/llc" --version 2>/dev/null | head -20
|
||||||
|
warn "llc built but does NOT list w65816 as a target. Backend symlinks/patches may have failed. Re-run scripts/applyBackend.sh and ninja -C tools/llvm-mos-build."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 5. Build link816 + omfEmit.
|
||||||
|
log "building link816 + omfEmit"
|
||||||
|
make -C "$PROJECT_ROOT/src/link816" all
|
||||||
|
|
||||||
|
# 6. Build the runtime (libc.o, crt0.o, libgcc.o, etc.).
|
||||||
|
log "building runtime"
|
||||||
|
bash "$PROJECT_ROOT/runtime/build.sh"
|
||||||
|
|
||||||
log "llvm-mos install done"
|
log "llvm-mos install done"
|
||||||
log " source: $LLVM_SRC"
|
log " source: $LLVM_SRC"
|
||||||
log " sdk: $LLVM_SDK"
|
log " sdk: $LLVM_SDK"
|
||||||
[ "$doBuild" -eq 1 ] && log " build: $LLVM_BUILD"
|
log " build: $LLVM_BUILD"
|
||||||
|
log " clang: $LLVM_BUILD/bin/clang"
|
||||||
|
log " link816: $PROJECT_ROOT/tools/link816"
|
||||||
|
log " omfEmit: $PROJECT_ROOT/tools/omfEmit"
|
||||||
|
|
|
||||||
|
|
@ -28,17 +28,61 @@ check "git" git --version
|
||||||
check "llvm-mos source tree" test -d "$TOOLS_DIR/llvm-mos/.git"
|
check "llvm-mos source tree" test -d "$TOOLS_DIR/llvm-mos/.git"
|
||||||
check "llvm-mos-sdk extracted" test -x "$TOOLS_DIR/llvm-mos-sdk/bin/mos-common-clang"
|
check "llvm-mos-sdk extracted" test -x "$TOOLS_DIR/llvm-mos-sdk/bin/mos-common-clang"
|
||||||
|
|
||||||
|
# W65816 backend integration
|
||||||
|
check "W65816 source symlinked into llvm-mos" \
|
||||||
|
test -L "$TOOLS_DIR/llvm-mos/llvm/lib/Target/W65816/W65816ISelLowering.cpp"
|
||||||
|
check "W65816 clang built" test -x "$TOOLS_DIR/llvm-mos-build/bin/clang"
|
||||||
|
check "W65816 llc built" test -x "$TOOLS_DIR/llvm-mos-build/bin/llc"
|
||||||
|
check "W65816 llvm-mc built" test -x "$TOOLS_DIR/llvm-mos-build/bin/llvm-mc"
|
||||||
|
check "llc lists w65816 target" \
|
||||||
|
bash -c "'$TOOLS_DIR/llvm-mos-build/bin/llc' --version 2>/dev/null | grep -q '^[[:space:]]*w65816[[:space:]]'"
|
||||||
|
|
||||||
|
# link816 + omfEmit
|
||||||
|
check "link816 binary" test -x "$TOOLS_DIR/link816"
|
||||||
|
check "omfEmit binary" test -x "$TOOLS_DIR/omfEmit"
|
||||||
|
|
||||||
|
# Runtime libraries (built objects we link into every program)
|
||||||
|
check "runtime/crt0.o" test -s "$PROJECT_ROOT/runtime/crt0.o"
|
||||||
|
check "runtime/libc.o" test -s "$PROJECT_ROOT/runtime/libc.o"
|
||||||
|
check "runtime/libgcc.o" test -s "$PROJECT_ROOT/runtime/libgcc.o"
|
||||||
|
check "runtime/softFloat.o" test -s "$PROJECT_ROOT/runtime/softFloat.o"
|
||||||
|
check "runtime/softDouble.o" test -s "$PROJECT_ROOT/runtime/softDouble.o"
|
||||||
|
|
||||||
# MAME + ROMs
|
# MAME + ROMs
|
||||||
check "mame binary" command -v mame
|
check "mame binary" command -v mame
|
||||||
check "mame Lua console support" bash -c "mame -showusage 2>&1 | grep -q -- '-console'"
|
check "mame Lua console support" bash -c "mame -showusage 2>&1 | grep -q -- '-console'"
|
||||||
check "rom: apple2gs.zip" test -s "$TOOLS_DIR/mame/roms/apple2gs.zip"
|
check "rom: apple2gs.zip" test -s "$TOOLS_DIR/mame/roms/apple2gs.zip"
|
||||||
check "rom: apple2gsr1.zip" test -s "$TOOLS_DIR/mame/roms/apple2gsr1.zip"
|
check "rom: apple2gsr1.zip" test -s "$TOOLS_DIR/mame/roms/apple2gsr1.zip"
|
||||||
|
|
||||||
# Calypsi benchmark
|
# Calypsi benchmark (optional; --skip-calypsi at install skips this).
|
||||||
check "calypsi cc65816" test -x "$TOOLS_DIR/calypsi/bin/cc65816"
|
if [ -d "$TOOLS_DIR/calypsi" ]; then
|
||||||
|
check "calypsi cc65816" \
|
||||||
|
test -x "$TOOLS_DIR/calypsi/usr/local/lib/calypsi-65816-5.16/bin/cc65816"
|
||||||
|
fi
|
||||||
|
|
||||||
# ORCA/C reference
|
# ORCA/C reference (optional; --skip-orca skips this).
|
||||||
check "orca-c source" test -d "$TOOLS_DIR/orca-c/.git"
|
if [ -d "$TOOLS_DIR/orca-c" ]; then
|
||||||
|
check "orca-c source" test -d "$TOOLS_DIR/orca-c/.git"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# End-to-end smoke: compile a tiny C program for W65816 to prove the
|
||||||
|
# toolchain actually produces machine code.
|
||||||
|
echo
|
||||||
|
log "end-to-end compile check"
|
||||||
|
tmp=$(mktemp -d -t llvm816-verify.XXXXXX)
|
||||||
|
trap "rm -rf '$tmp'" EXIT
|
||||||
|
cat > "$tmp/hello.c" <<'EOF'
|
||||||
|
int answer(void) { return 42; }
|
||||||
|
EOF
|
||||||
|
if "$TOOLS_DIR/llvm-mos-build/bin/clang" --target=w65816 -O2 -c "$tmp/hello.c" \
|
||||||
|
-o "$tmp/hello.o" 2>/dev/null \
|
||||||
|
&& "$TOOLS_DIR/llvm-mos-build/bin/llvm-objdump" --triple=w65816 -d "$tmp/hello.o" 2>/dev/null \
|
||||||
|
| grep -q "lda"; then
|
||||||
|
printf ' [ OK ] C -> w65816 .o compile produces lda instruction\n'
|
||||||
|
else
|
||||||
|
printf ' [FAIL] end-to-end compile failed\n'
|
||||||
|
fails=$((fails + 1))
|
||||||
|
fi
|
||||||
|
|
||||||
echo
|
echo
|
||||||
if [ "$fails" -eq 0 ]; then
|
if [ "$fails" -eq 0 ]; then
|
||||||
|
|
|
||||||
15
setup.sh
15
setup.sh
|
|
@ -2,15 +2,24 @@
|
||||||
# Top-level installer for the llvm816 project. Installs everything into
|
# Top-level installer for the llvm816 project. Installs everything into
|
||||||
# ./tools/ so the tree is self-contained and deletable.
|
# ./tools/ so the tree is self-contained and deletable.
|
||||||
#
|
#
|
||||||
|
# Stages (5/5):
|
||||||
|
# 1. apt dependencies
|
||||||
|
# 2. llvm-mos clone + W65816 backend apply + cmake/ninja build of
|
||||||
|
# clang/llc/llvm-mc + build link816/omfEmit + build runtime (.o)
|
||||||
|
# 3. MAME + apple2gs ROMs
|
||||||
|
# 4. Calypsi 5.16 (reference compiler — optional)
|
||||||
|
# 5. ORCA/C source (header reference — optional)
|
||||||
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# ./setup.sh # install everything (no llvm-mos source build)
|
# ./setup.sh # install + build everything (~30-60 min)
|
||||||
# ./setup.sh --build-llvm # also cmake+ninja build llvm-mos (slow)
|
|
||||||
# ./setup.sh --skip-deps # skip apt packages
|
# ./setup.sh --skip-deps # skip apt packages
|
||||||
# ./setup.sh --skip-llvm # skip llvm-mos
|
# ./setup.sh --skip-llvm # skip stage 2 (clang/runtime/link816)
|
||||||
# ./setup.sh --skip-mame # skip MAME + ROM fetch
|
# ./setup.sh --skip-mame # skip MAME + ROM fetch
|
||||||
# ./setup.sh --skip-calypsi # skip Calypsi
|
# ./setup.sh --skip-calypsi # skip Calypsi
|
||||||
# ./setup.sh --skip-orca # skip ORCA/C reference
|
# ./setup.sh --skip-orca # skip ORCA/C reference
|
||||||
|
# ./setup.sh --skip-verify # skip the post-install verification
|
||||||
# ./setup.sh --verify-only # run verification only
|
# ./setup.sh --verify-only # run verification only
|
||||||
|
# ./setup.sh --build-llvm # deprecated alias for the default behavior
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1731,28 +1731,28 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
||||||
SDValue X = Op.getOperand(0);
|
SDValue X = Op.getOperand(0);
|
||||||
SDValue Lo = extractWide32Lo(DAG, DL, X);
|
SDValue Lo = extractWide32Lo(DAG, DL, X);
|
||||||
SDValue Hi = extractWide32Hi(DAG, DL, X);
|
SDValue Hi = extractWide32Hi(DAG, DL, X);
|
||||||
SDValue One = DAG.getConstant(1, DL, MVT::i16);
|
SDValue ShN = DAG.getConstant(N, DL, MVT::i16);
|
||||||
SDValue Fifteen = DAG.getConstant(15, DL, MVT::i16);
|
SDValue ShCo = DAG.getConstant(16 - N, DL, MVT::i16);
|
||||||
for (unsigned i = 0; i < N; i++) {
|
if (Op0 == ISD::SHL) {
|
||||||
if (Op0 == ISD::SHL) {
|
// (Hi:Lo) << N == ((Hi << N) | (Lo >> (16-N))) : (Lo << N)
|
||||||
// (Hi:Lo) << 1: carry = Lo bit15 → into Hi bit0.
|
// 4 SDAG ops instead of N iterations of 4 ops. Lets the
|
||||||
SDValue NewLo = DAG.getNode(ISD::SHL, DL, MVT::i16, Lo, One);
|
// combiner / isel produce ASLA16-cascade + SRL8A+LSRA16-
|
||||||
SDValue HiBit0 = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, Fifteen);
|
// cascade + single OR, avoiding the bit-by-bit OR cascade
|
||||||
SDValue HiShl = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, One);
|
// that the unrolled form produced.
|
||||||
SDValue NewHi = DAG.getNode(ISD::OR, DL, MVT::i16, HiShl, HiBit0);
|
SDValue NewLo = DAG.getNode(ISD::SHL, DL, MVT::i16, Lo, ShN);
|
||||||
Lo = NewLo; Hi = NewHi;
|
SDValue HiTop = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, ShCo);
|
||||||
} else {
|
SDValue HiShl = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, ShN);
|
||||||
// SRL/SRA: Hi shifts (logical or arithmetic), Lo gets the
|
SDValue NewHi = DAG.getNode(ISD::OR, DL, MVT::i16, HiShl, HiTop);
|
||||||
// low bit of pre-shift Hi inserted at bit 15.
|
return buildWide32(DAG, DL, NewLo, NewHi);
|
||||||
SDValue NewHi = DAG.getNode(Op0, DL, MVT::i16, Hi, One);
|
} else {
|
||||||
SDValue HiLow = DAG.getNode(ISD::AND, DL, MVT::i16, Hi, One);
|
// SRL/SRA by N: NewHi = Hi >> N (logical or arithmetic);
|
||||||
SDValue LoTop = DAG.getNode(ISD::SHL, DL, MVT::i16, HiLow, Fifteen);
|
// NewLo = (Lo >> N) | (Hi << (16-N)).
|
||||||
SDValue LoSrl = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, One);
|
SDValue NewHi = DAG.getNode(Op0, DL, MVT::i16, Hi, ShN);
|
||||||
SDValue NewLo = DAG.getNode(ISD::OR, DL, MVT::i16, LoSrl, LoTop);
|
SDValue LoTop = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, ShCo);
|
||||||
Lo = NewLo; Hi = NewHi;
|
SDValue LoSrl = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, ShN);
|
||||||
}
|
SDValue NewLo = DAG.getNode(ISD::OR, DL, MVT::i16, LoSrl, LoTop);
|
||||||
|
return buildWide32(DAG, DL, NewLo, NewHi);
|
||||||
}
|
}
|
||||||
return buildWide32(DAG, DL, Lo, Hi);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2663,6 +2663,32 @@ SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op,
|
||||||
return SDValue();
|
return SDValue();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Mul-by-constant strength reduction: (X * K) where K-1 or K+1 is
|
||||||
|
// a small power of 2 (shift count 1..5, matching our inlined i32
|
||||||
|
// SHL range) expands to (X << N) +/- X — saves a __mulsi3 libcall
|
||||||
|
// (~250 cyc) for ~70 cyc of inlined shift+add. Catches djb2Hash's
|
||||||
|
// `h * 33` = (h << 5) + h.
|
||||||
|
//
|
||||||
|
// Patterns covered:
|
||||||
|
// K = 2^N + 1 in {3,5,9,17,33} → (X << N) + X
|
||||||
|
// K = 2^N - 1 in {7,15,31} → (X << N) - X
|
||||||
|
// Larger N hits the i32 SHL libcall path (no longer profitable).
|
||||||
|
if (auto *CN = dyn_cast<ConstantSDNode>(Rhs)) {
|
||||||
|
int64_t K = CN->getSExtValue();
|
||||||
|
for (unsigned N = 1; N <= 5; N++) {
|
||||||
|
int64_t Pow = int64_t{1} << N;
|
||||||
|
SDValue ShAmt = DAG.getConstant(N, DL, MVT::i16);
|
||||||
|
if (K == Pow + 1) {
|
||||||
|
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs, ShAmt);
|
||||||
|
return DAG.getNode(ISD::ADD, DL, MVT::i32, Shl, Lhs);
|
||||||
|
}
|
||||||
|
if (K == Pow - 1) {
|
||||||
|
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs, ShAmt);
|
||||||
|
return DAG.getNode(ISD::SUB, DL, MVT::i32, Shl, Lhs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
SDValue A = narrowToI16(Lhs);
|
SDValue A = narrowToI16(Lhs);
|
||||||
SDValue B = narrowToI16(Rhs);
|
SDValue B = narrowToI16(Rhs);
|
||||||
if (A && B) {
|
if (A && B) {
|
||||||
|
|
|
||||||
|
|
@ -1414,8 +1414,10 @@ def PEI_DP : InstDP<0xD4, "pei">;
|
||||||
// AsmParser has no way to know the current M/X bits, so it always
|
// AsmParser has no way to know the current M/X bits, so it always
|
||||||
// reaches for the _Imm16 form. Codegen can still select _Imm8
|
// reaches for the _Imm16 form. Codegen can still select _Imm8
|
||||||
// explicitly once we have 8-bit patterns.
|
// explicitly once we have 8-bit patterns.
|
||||||
|
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
|
||||||
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; let Defs = [A]; }
|
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; let Defs = [A]; }
|
||||||
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; let Defs = [A]; }
|
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; let Defs = [A]; }
|
||||||
|
}
|
||||||
// Same opcode/encoding as LDA_Imm16, but the operand emits a fixup_bank16
|
// Same opcode/encoding as LDA_Imm16, but the operand emits a fixup_bank16
|
||||||
// so the linker / OMF Loader fills in (bankByte, 0) of the symbol.
|
// so the linker / OMF Loader fills in (bankByte, 0) of the symbol.
|
||||||
// Used by the LDAi16imm_bank pseudo for materializing the high half of
|
// Used by the LDAi16imm_bank pseudo for materializing the high half of
|
||||||
|
|
@ -1455,8 +1457,10 @@ def STA_DPIndLongY : InstDPIndLongY<0x97, "sta"> { let Uses = [A, Y]; }
|
||||||
def STA_LongX : InstAbsLongX<0x9F, "sta">;
|
def STA_LongX : InstAbsLongX<0x9F, "sta">;
|
||||||
|
|
||||||
//---------------------------------------------------------------- LDX (load X)
|
//---------------------------------------------------------------- LDX (load X)
|
||||||
|
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
|
||||||
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [X]; }
|
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [X]; }
|
||||||
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; let Defs = [X]; }
|
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; let Defs = [X]; }
|
||||||
|
}
|
||||||
def LDX_DP : InstDP<0xA6, "ldx">;
|
def LDX_DP : InstDP<0xA6, "ldx">;
|
||||||
def LDX_Abs : InstAbs<0xAE, "ldx">;
|
def LDX_Abs : InstAbs<0xAE, "ldx">;
|
||||||
def LDX_DPY : InstDPY<0xB6, "ldx">;
|
def LDX_DPY : InstDPY<0xB6, "ldx">;
|
||||||
|
|
@ -1468,8 +1472,10 @@ def STX_Abs : InstAbs<0x8E, "stx">;
|
||||||
def STX_DPY : InstDPY<0x96, "stx">;
|
def STX_DPY : InstDPY<0x96, "stx">;
|
||||||
|
|
||||||
//---------------------------------------------------------------- LDY (load Y)
|
//---------------------------------------------------------------- LDY (load Y)
|
||||||
|
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
|
||||||
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [Y]; }
|
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [Y]; }
|
||||||
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; let Defs = [Y]; }
|
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; let Defs = [Y]; }
|
||||||
|
}
|
||||||
def LDY_DP : InstDP<0xA4, "ldy">;
|
def LDY_DP : InstDP<0xA4, "ldy">;
|
||||||
def LDY_Abs : InstAbs<0xAC, "ldy">;
|
def LDY_Abs : InstAbs<0xAC, "ldy">;
|
||||||
def LDY_DPX : InstDPX<0xB4, "ldy">;
|
def LDY_DPX : InstDPX<0xB4, "ldy">;
|
||||||
|
|
|
||||||
|
|
@ -2051,6 +2051,396 @@ bool W65816StackRelToImg::runOnMachineFunction(MachineFunction &MF) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Shift-cascade dead-store elimination. Greedy regalloc sometimes
|
||||||
|
// emits `LDA dp; (ASLA16; STA dp){×N}; ...` where each intermediate
|
||||||
|
// STA_DP is dead — the next ASLA16 reads $a (still holding the value)
|
||||||
|
// and shifts again, then stores again. Only the final STA matters.
|
||||||
|
//
|
||||||
|
// Pattern: LDA_DP X ; (ASLA16; STA_DP X){×N+1}
|
||||||
|
// where every STA writes to the same DP slot the LDA read
|
||||||
|
// from, and nothing in between reads $a or DP[X] except
|
||||||
|
// the cascade itself.
|
||||||
|
// Rewrite: LDA_DP X ; ASLA16{×N+1} ; STA_DP X
|
||||||
|
// (final STA only; intermediate STAs erased.)
|
||||||
|
//
|
||||||
|
// For N=4 (i.e. 5 shifts), saves 4 STA_DPs = 12 cyc. Hits djb2Hash's
|
||||||
|
// `Hi << 5` cascade (where greedy spills the intermediate vregs).
|
||||||
|
for (MachineBasicBlock &MBB : MF) {
|
||||||
|
SmallVector<MachineInstr *, 8> ToErase;
|
||||||
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
||||||
|
if (It->getOpcode() != W65816::LDA_DP) continue;
|
||||||
|
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
|
||||||
|
int64_t Slot = It->getOperand(0).getImm();
|
||||||
|
auto Cur = std::next(It);
|
||||||
|
// Track intermediate STAs in this cascade.
|
||||||
|
SmallVector<MachineInstr *, 6> Intermediates;
|
||||||
|
bool sawAny = false;
|
||||||
|
while (Cur != MBB.end()) {
|
||||||
|
if (Cur->isDebugInstr()) { ++Cur; continue; }
|
||||||
|
unsigned Op = Cur->getOpcode();
|
||||||
|
if (Op != W65816::ASLA16 && Op != W65816::LSRA16) break;
|
||||||
|
auto Sta = std::next(Cur);
|
||||||
|
while (Sta != MBB.end() && Sta->isDebugInstr()) ++Sta;
|
||||||
|
if (Sta == MBB.end()) break;
|
||||||
|
if (Sta->getOpcode() != W65816::STA_DP) break;
|
||||||
|
if (Sta->getNumOperands() < 1 || !Sta->getOperand(0).isImm()) break;
|
||||||
|
if (Sta->getOperand(0).getImm() != Slot) break;
|
||||||
|
sawAny = true;
|
||||||
|
Intermediates.push_back(&*Sta);
|
||||||
|
Cur = std::next(Sta);
|
||||||
|
}
|
||||||
|
if (!sawAny || Intermediates.size() < 2) continue;
|
||||||
|
// The LAST STA is the real one; mark everything before it for erase.
|
||||||
|
Intermediates.pop_back();
|
||||||
|
for (MachineInstr *MI : Intermediates)
|
||||||
|
ToErase.push_back(MI);
|
||||||
|
}
|
||||||
|
for (MachineInstr *MI : ToErase) {
|
||||||
|
MI->eraseFromParent();
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// i32-SRL-by-1 fold: detects the SDAG expansion for `(SRL i32 X, 1)`
|
||||||
|
// and rewrites to the two-instruction `LSR Hi ; ROR Lo` pair when
|
||||||
|
// both halves live in DP.
|
||||||
|
//
|
||||||
|
// Input pattern (after the DP-shift-fold above runs on the trailing
|
||||||
|
// `LDA Hi ; LSRA16 ; STA Hi` triplet):
|
||||||
|
// LDA_DP Hi
|
||||||
|
// SHL15A ; A = (Hi & 1) << 15
|
||||||
|
// STA_DP Yc ; carry slot
|
||||||
|
// LDA_DP Lo
|
||||||
|
// LSRA16 ; Lo >>= 1
|
||||||
|
// STA_DP Lo
|
||||||
|
// ORA_DP Yc ; combine with carry-at-bit-15
|
||||||
|
// STA_DP Lo
|
||||||
|
// LSR_DP Hi ; (folded from the trailing triplet)
|
||||||
|
//
|
||||||
|
// Output:
|
||||||
|
// LSR_DP Hi ; Hi >>= 1, C = old bit 0 of Hi
|
||||||
|
// ROR_DP Lo ; Lo = (C, Lo >> 1)
|
||||||
|
//
|
||||||
|
// Same semantics, ~30 cyc saved per iter. popcount measured at 2728
|
||||||
|
// cyc; expected post-fold ~1858 cyc (-32%) due to 29 iters of i32
|
||||||
|
// SRL by 1.
|
||||||
|
for (MachineBasicBlock &MBB : MF) {
|
||||||
|
SmallVector<MachineInstr *, 8> ToErase;
|
||||||
|
for (auto It = MBB.begin(); It != MBB.end();) {
|
||||||
|
auto LdaHi = It++;
|
||||||
|
if (LdaHi->getOpcode() != W65816::LDA_DP) continue;
|
||||||
|
if (LdaHi->getNumOperands() < 1 || !LdaHi->getOperand(0).isImm())
|
||||||
|
continue;
|
||||||
|
int64_t HiAddr = LdaHi->getOperand(0).getImm();
|
||||||
|
auto P = std::next(LdaHi);
|
||||||
|
auto skipDbg = [&](auto &P) {
|
||||||
|
while (P != MBB.end() && P->isDebugInstr()) ++P;
|
||||||
|
};
|
||||||
|
skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::SHL15A) continue;
|
||||||
|
auto Shl = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::STA_DP) continue;
|
||||||
|
if (P->getNumOperands() < 1 || !P->getOperand(0).isImm()) continue;
|
||||||
|
int64_t YcAddr = P->getOperand(0).getImm();
|
||||||
|
auto StaYc = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::LDA_DP) continue;
|
||||||
|
if (P->getNumOperands() < 1 || !P->getOperand(0).isImm()) continue;
|
||||||
|
int64_t LoAddr = P->getOperand(0).getImm();
|
||||||
|
auto LdaLo = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::LSRA16) continue;
|
||||||
|
auto LsrA1 = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::STA_DP) continue;
|
||||||
|
if (P->getOperand(0).getImm() != LoAddr) continue;
|
||||||
|
auto StaLo1 = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::ORA_DP) continue;
|
||||||
|
if (P->getOperand(0).getImm() != YcAddr) continue;
|
||||||
|
auto OraYc = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::STA_DP) continue;
|
||||||
|
if (P->getOperand(0).getImm() != LoAddr) continue;
|
||||||
|
auto StaLo2 = P; ++P; skipDbg(P);
|
||||||
|
if (P == MBB.end() || P->getOpcode() != W65816::LSR_DP) continue;
|
||||||
|
if (P->getOperand(0).getImm() != HiAddr) continue;
|
||||||
|
auto LsrHi = P;
|
||||||
|
// Check that YcAddr is not READ after LsrHi before being
|
||||||
|
// overwritten. If the next op touching YcAddr is a STA (write),
|
||||||
|
// the carry-slot value is dead — safe to drop our STA Yc. If
|
||||||
|
// it's a load-style op (LDA/ORA/AND/etc.) before any STA, then
|
||||||
|
// some downstream code is consuming our stored carry — bail.
|
||||||
|
bool YcReadBeforeWrite = false;
|
||||||
|
for (auto Q = std::next(LsrHi); Q != MBB.end(); ++Q) {
|
||||||
|
if (Q->isDebugInstr()) continue;
|
||||||
|
bool touchesYc = false;
|
||||||
|
bool isWriteOfYc = false;
|
||||||
|
for (const MachineOperand &MO : Q->operands()) {
|
||||||
|
if (MO.isImm() && MO.getImm() == YcAddr) {
|
||||||
|
unsigned QO = Q->getOpcode();
|
||||||
|
if (QO == W65816::STA_DP || QO == W65816::STZ_DP ||
|
||||||
|
QO == W65816::STX_DP || QO == W65816::STY_DP) {
|
||||||
|
touchesYc = true; isWriteOfYc = true;
|
||||||
|
} else if (QO == W65816::LDA_DP || QO == W65816::ORA_DP ||
|
||||||
|
QO == W65816::AND_DP || QO == W65816::EOR_DP ||
|
||||||
|
QO == W65816::ADC_DP || QO == W65816::SBC_DP ||
|
||||||
|
QO == W65816::CMP_DP || QO == W65816::LSR_DP ||
|
||||||
|
QO == W65816::ROR_DP || QO == W65816::ASL_DP ||
|
||||||
|
QO == W65816::ROL_DP || QO == W65816::INC_DP ||
|
||||||
|
QO == W65816::DEC_DP) {
|
||||||
|
touchesYc = true; // any of these is a read or RMW
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (touchesYc) {
|
||||||
|
if (!isWriteOfYc) YcReadBeforeWrite = true;
|
||||||
|
break; // first touch decides
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (YcReadBeforeWrite) continue;
|
||||||
|
// Apply: replace the 8-op sequence with LSR_DP Hi ; ROR_DP Lo.
|
||||||
|
// The LSR_DP Hi already exists (at LsrHi); just insert ROR_DP Lo
|
||||||
|
// immediately after it and erase the rest.
|
||||||
|
BuildMI(MBB, std::next(MachineBasicBlock::iterator(LsrHi)),
|
||||||
|
LsrHi->getDebugLoc(), TII->get(W65816::ROR_DP))
|
||||||
|
.addImm(LoAddr);
|
||||||
|
ToErase.push_back(&*LdaHi);
|
||||||
|
ToErase.push_back(&*Shl);
|
||||||
|
ToErase.push_back(&*StaYc);
|
||||||
|
ToErase.push_back(&*LdaLo);
|
||||||
|
ToErase.push_back(&*LsrA1);
|
||||||
|
ToErase.push_back(&*StaLo1);
|
||||||
|
ToErase.push_back(&*OraYc);
|
||||||
|
ToErase.push_back(&*StaLo2);
|
||||||
|
It = std::next(MachineBasicBlock::iterator(LsrHi));
|
||||||
|
}
|
||||||
|
for (MachineInstr *MI : ToErase) {
|
||||||
|
MI->eraseFromParent();
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DP dead-store elimination — runs LAST (after the i32-SRL-by-1
|
||||||
|
// fold, which depends on the `STA_DP Lo` between LSRA16 and ORA_DP
|
||||||
|
// staying intact). When two STA_DP X stores write the same DP slot
|
||||||
|
// with no intervening read or write of that slot, the first is dead.
|
||||||
|
// Emerges from popcount's `bit = x & 1` pattern at end of body.
|
||||||
|
// Saves 5 cyc per match.
|
||||||
|
//
|
||||||
|
// Conservative: branches / calls / inline asm and all DP-indirect
|
||||||
|
// addressing modes (which use the slot AS a pointer) block the
|
||||||
|
// elim — those reads must see the intended store value.
|
||||||
|
{
|
||||||
|
auto touchesDpSlot = [](const MachineInstr &MI, int64_t Addr) {
|
||||||
|
unsigned Op = MI.getOpcode();
|
||||||
|
switch (Op) {
|
||||||
|
// Direct DP ops.
|
||||||
|
case W65816::LDA_DP: case W65816::STA_DP: case W65816::STZ_DP:
|
||||||
|
case W65816::LDX_DP: case W65816::STX_DP:
|
||||||
|
case W65816::LDY_DP: case W65816::STY_DP:
|
||||||
|
case W65816::ADC_DP: case W65816::SBC_DP:
|
||||||
|
case W65816::AND_DP: case W65816::ORA_DP:
|
||||||
|
case W65816::EOR_DP: case W65816::CMP_DP:
|
||||||
|
case W65816::CPX_DP: case W65816::CPY_DP:
|
||||||
|
case W65816::LSR_DP: case W65816::ROR_DP:
|
||||||
|
case W65816::ASL_DP: case W65816::ROL_DP:
|
||||||
|
case W65816::INC_DP: case W65816::DEC_DP:
|
||||||
|
case W65816::BIT_DP: case W65816::TSB_DP:
|
||||||
|
case W65816::TRB_DP:
|
||||||
|
// DP-indexed ops.
|
||||||
|
case W65816::LDA_DPX: case W65816::STA_DPX: case W65816::STZ_DPX:
|
||||||
|
case W65816::LDY_DPX: case W65816::STY_DPX:
|
||||||
|
case W65816::ADC_DPX: case W65816::SBC_DPX:
|
||||||
|
case W65816::AND_DPX: case W65816::ORA_DPX:
|
||||||
|
case W65816::EOR_DPX: case W65816::CMP_DPX:
|
||||||
|
case W65816::LDX_DPY: case W65816::STX_DPY:
|
||||||
|
case W65816::LSR_DPX: case W65816::ROR_DPX:
|
||||||
|
case W65816::ASL_DPX: case W65816::ROL_DPX:
|
||||||
|
case W65816::BIT_DPX:
|
||||||
|
// DP-indirect ops — read the slot AS a pointer.
|
||||||
|
case W65816::LDA_DPInd: case W65816::STA_DPInd:
|
||||||
|
case W65816::LDA_DPIndY: case W65816::STA_DPIndY:
|
||||||
|
case W65816::LDA_DPIndX: case W65816::STA_DPIndX:
|
||||||
|
case W65816::LDA_DPIndLong: case W65816::STA_DPIndLong:
|
||||||
|
case W65816::LDA_DPIndLongY: case W65816::STA_DPIndLongY:
|
||||||
|
case W65816::ADC_DPInd: case W65816::SBC_DPInd:
|
||||||
|
case W65816::AND_DPInd: case W65816::ORA_DPInd:
|
||||||
|
case W65816::EOR_DPInd: case W65816::CMP_DPInd:
|
||||||
|
case W65816::ADC_DPIndY: case W65816::SBC_DPIndY:
|
||||||
|
case W65816::AND_DPIndY: case W65816::ORA_DPIndY:
|
||||||
|
case W65816::EOR_DPIndY: case W65816::CMP_DPIndY:
|
||||||
|
case W65816::ADC_DPIndLong: case W65816::SBC_DPIndLong:
|
||||||
|
case W65816::AND_DPIndLong: case W65816::ORA_DPIndLong:
|
||||||
|
case W65816::EOR_DPIndLong: case W65816::CMP_DPIndLong:
|
||||||
|
case W65816::ADC_DPIndLongY: case W65816::SBC_DPIndLongY:
|
||||||
|
case W65816::AND_DPIndLongY: case W65816::ORA_DPIndLongY:
|
||||||
|
case W65816::EOR_DPIndLongY: case W65816::CMP_DPIndLongY:
|
||||||
|
if (MI.getNumOperands() >= 1 && MI.getOperand(0).isImm() &&
|
||||||
|
MI.getOperand(0).getImm() == Addr)
|
||||||
|
return true;
|
||||||
|
// Indirect ops read addr AND addr+1 (16-bit ptr) or addr+1,2
|
||||||
|
// (24-bit ptr). Bail when the candidate dead-store target is
|
||||||
|
// within those bytes.
|
||||||
|
if (MI.getNumOperands() >= 1 && MI.getOperand(0).isImm()) {
|
||||||
|
int64_t IndAddr = MI.getOperand(0).getImm();
|
||||||
|
if (IndAddr == Addr - 1 || IndAddr == Addr - 2)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
for (MachineBasicBlock &MBB : MF) {
|
||||||
|
SmallVector<MachineInstr *, 8> ToErase;
|
||||||
|
SmallPtrSet<MachineInstr *, 8> ErasedSet;
|
||||||
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
||||||
|
if (ErasedSet.count(&*It)) continue;
|
||||||
|
if (It->getOpcode() != W65816::STA_DP) continue;
|
||||||
|
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm())
|
||||||
|
continue;
|
||||||
|
int64_t Sta1Addr = It->getOperand(0).getImm();
|
||||||
|
auto Walk = std::next(It);
|
||||||
|
while (Walk != MBB.end()) {
|
||||||
|
if (Walk->isDebugInstr()) { ++Walk; continue; }
|
||||||
|
if (Walk->isBranch() || Walk->isCall() || Walk->isReturn() ||
|
||||||
|
Walk->isInlineAsm()) break;
|
||||||
|
if (Walk->getOpcode() == W65816::STA_DP &&
|
||||||
|
Walk->getNumOperands() >= 1 && Walk->getOperand(0).isImm() &&
|
||||||
|
Walk->getOperand(0).getImm() == Sta1Addr) {
|
||||||
|
ToErase.push_back(&*It);
|
||||||
|
ErasedSet.insert(&*It);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (touchesDpSlot(*Walk, Sta1Addr)) break;
|
||||||
|
++Walk;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (MachineInstr *MI : ToErase) {
|
||||||
|
MI->eraseFromParent();
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DP-slot zero-check bridge via X. Pattern:
|
||||||
|
// [op that sets Z on A]
|
||||||
|
// STA_DP slot
|
||||||
|
// [ops that don't read/write slot, don't touch X, don't branch/call]
|
||||||
|
// LDA_DP slot
|
||||||
|
// Bcond (BNE/BEQ)
|
||||||
|
//
|
||||||
|
// The STA/LDA round-trip exists purely to preserve A's value across
|
||||||
|
// the clobbers. TAX/TXA does the same job in 4 cyc instead of 8.
|
||||||
|
// Saves 4 cyc/match. Hits popcount's `x_lo | x_hi ; (work) ; bne`.
|
||||||
|
//
|
||||||
|
// Safety: X register must be dead. Conservative check — fires only
|
||||||
|
// when the entire MBB doesn't reference X register except as our new
|
||||||
|
// TAX/TXA, AND all MBB successors don't have X as live-in.
|
||||||
|
for (MachineBasicBlock &MBB : MF) {
|
||||||
|
// First: does this MBB reference X at all? If yes, bail. This is
|
||||||
|
// conservative — refining would need full liveness.
|
||||||
|
bool MbbTouchesX = false;
|
||||||
|
for (const MachineInstr &MI : MBB) {
|
||||||
|
for (const MachineOperand &MO : MI.operands()) {
|
||||||
|
if (MO.isReg() && MO.getReg() == W65816::X) {
|
||||||
|
MbbTouchesX = true; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (MbbTouchesX) break;
|
||||||
|
// Also handle InstImplied ops that don't list X explicitly.
|
||||||
|
switch (MI.getOpcode()) {
|
||||||
|
case W65816::TAX: case W65816::TYX: case W65816::TSX:
|
||||||
|
case W65816::PLX: case W65816::TXA: case W65816::TXY:
|
||||||
|
case W65816::TXS: case W65816::PHX: case W65816::INX:
|
||||||
|
case W65816::DEX:
|
||||||
|
MbbTouchesX = true; break;
|
||||||
|
}
|
||||||
|
if (MI.isCall()) { MbbTouchesX = true; break; }
|
||||||
|
if (MbbTouchesX) break;
|
||||||
|
}
|
||||||
|
if (MbbTouchesX) continue;
|
||||||
|
// Successors with X as live-in?
|
||||||
|
bool SuccUsesX = false;
|
||||||
|
for (MachineBasicBlock *Succ : MBB.successors()) {
|
||||||
|
if (Succ->isLiveIn(W65816::X)) { SuccUsesX = true; break; }
|
||||||
|
}
|
||||||
|
if (SuccUsesX) continue;
|
||||||
|
// Walk forward looking for STA_DP / ... / LDA_DP / Bcond patterns.
|
||||||
|
SmallVector<MachineInstr *, 4> ToErase;
|
||||||
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
||||||
|
if (It->getOpcode() != W65816::STA_DP) continue;
|
||||||
|
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
|
||||||
|
int64_t StaAddr = It->getOperand(0).getImm();
|
||||||
|
auto Walk = std::next(It);
|
||||||
|
MachineInstr *LdaMI = nullptr;
|
||||||
|
MachineInstr *BcondMI = nullptr;
|
||||||
|
bool blocked = false;
|
||||||
|
while (Walk != MBB.end()) {
|
||||||
|
if (Walk->isDebugInstr()) { ++Walk; continue; }
|
||||||
|
unsigned WO = Walk->getOpcode();
|
||||||
|
if (Walk->isBranch() || Walk->isCall() || Walk->isReturn() ||
|
||||||
|
Walk->isInlineAsm()) {
|
||||||
|
// If this is the Bcond AFTER our LDA, capture it.
|
||||||
|
if (LdaMI && (WO == W65816::BNE || WO == W65816::BEQ)) {
|
||||||
|
BcondMI = &*Walk;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (WO == W65816::LDA_DP &&
|
||||||
|
Walk->getNumOperands() >= 1 && Walk->getOperand(0).isImm() &&
|
||||||
|
Walk->getOperand(0).getImm() == StaAddr) {
|
||||||
|
LdaMI = &*Walk;
|
||||||
|
// Next non-debug must be BNE/BEQ.
|
||||||
|
auto Next = std::next(Walk);
|
||||||
|
while (Next != MBB.end() && Next->isDebugInstr()) ++Next;
|
||||||
|
if (Next != MBB.end()) {
|
||||||
|
unsigned NO = Next->getOpcode();
|
||||||
|
if (NO == W65816::BNE || NO == W65816::BEQ) BcondMI = &*Next;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Anything touching our DP slot bails.
|
||||||
|
for (const MachineOperand &MO : Walk->operands()) {
|
||||||
|
if (MO.isImm() && MO.getImm() == StaAddr) {
|
||||||
|
// Conservatively assume DP-op refs StaAddr unless we know
|
||||||
|
// it's a different opcode entirely. The dead-store-elim
|
||||||
|
// has similar logic but more refined; here we keep it
|
||||||
|
// simple: bail on any imm-matching op.
|
||||||
|
blocked = true; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (blocked) break;
|
||||||
|
++Walk;
|
||||||
|
}
|
||||||
|
if (blocked || !LdaMI || !BcondMI) continue;
|
||||||
|
// Global-use check: if any OTHER MBB references the DP slot, the
|
||||||
|
// STA we'd erase may be initializing it for a later use. Bail.
|
||||||
|
// Caught by sumOfSquares' counter at $D0 — entry-BB's STA_DP 208
|
||||||
|
// initializes the countdown counter that bb.4 reads via DEC_DP.
|
||||||
|
bool referencedElsewhere = false;
|
||||||
|
for (MachineBasicBlock &OtherMBB : MF) {
|
||||||
|
if (&OtherMBB == &MBB) continue;
|
||||||
|
for (const MachineInstr &OtherMI : OtherMBB) {
|
||||||
|
for (const MachineOperand &MO : OtherMI.operands()) {
|
||||||
|
if (MO.isImm() && MO.getImm() == StaAddr) {
|
||||||
|
referencedElsewhere = true; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (referencedElsewhere) break;
|
||||||
|
}
|
||||||
|
if (referencedElsewhere) break;
|
||||||
|
}
|
||||||
|
if (referencedElsewhere) continue;
|
||||||
|
// Replace STA_DP with TAX, LDA_DP with TXA.
|
||||||
|
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
|
||||||
|
BuildMI(MBB, It, It->getDebugLoc(), TII->get(W65816::TAX));
|
||||||
|
BuildMI(MBB, LdaMI, LdaMI->getDebugLoc(), TII->get(W65816::TXA));
|
||||||
|
ToErase.push_back(&*It);
|
||||||
|
ToErase.push_back(LdaMI);
|
||||||
|
}
|
||||||
|
for (MachineInstr *MI : ToErase) {
|
||||||
|
MI->eraseFromParent();
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Run elideStoreForwarding at the very end, AFTER IMG promotion has
|
// Run elideStoreForwarding at the very end, AFTER IMG promotion has
|
||||||
// committed slot assignments. Running this peephole earlier (with
|
// committed slot assignments. Running this peephole earlier (with
|
||||||
// the other early peepholes) cascades into different IMG-promotion
|
// the other early peepholes) cascades into different IMG-promotion
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,7 @@
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
#include "W65816.h"
|
#include "W65816.h"
|
||||||
|
#include "llvm/ADT/DenseMap.h"
|
||||||
#include "llvm/ADT/SmallPtrSet.h"
|
#include "llvm/ADT/SmallPtrSet.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
#include "llvm/Analysis/LoopInfo.h"
|
#include "llvm/Analysis/LoopInfo.h"
|
||||||
|
|
@ -82,6 +83,8 @@ public:
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool processLoop(Loop *L);
|
bool processLoop(Loop *L);
|
||||||
|
bool processCounterToPtrPHIs(Loop *L);
|
||||||
|
bool processReturnedCounter(Loop *L);
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
@ -103,17 +106,409 @@ bool W65816UnLSR::runOnFunction(Function &F) {
|
||||||
bool Changed = false;
|
bool Changed = false;
|
||||||
for (Loop *L : LI) {
|
for (Loop *L : LI) {
|
||||||
Changed |= processLoop(L);
|
Changed |= processLoop(L);
|
||||||
|
Changed |= processCounterToPtrPHIs(L);
|
||||||
|
// NOTE: processReturnedCounter (strLen-shape counter → ptr-difference
|
||||||
|
// at exit) is correct but produces a NET LOSS on strLen: without the
|
||||||
|
// counter PHI, the i32 pointer arithmetic falls back to clc+adc
|
||||||
|
// chains (16+ cyc/iter) instead of inc-A on the lo half (5 cyc/iter
|
||||||
|
// for ptr update + 5 for counter inc). See feedback memory.
|
||||||
|
// Disabled until codegen can use inc-DP for the lo half of a pointer
|
||||||
|
// PHI's increment without the SDAG materializing a full i32 add.
|
||||||
// Recurse into nested loops.
|
// Recurse into nested loops.
|
||||||
SmallVector<Loop *, 4> Worklist(L->begin(), L->end());
|
SmallVector<Loop *, 4> Worklist(L->begin(), L->end());
|
||||||
while (!Worklist.empty()) {
|
while (!Worklist.empty()) {
|
||||||
Loop *Sub = Worklist.pop_back_val();
|
Loop *Sub = Worklist.pop_back_val();
|
||||||
Changed |= processLoop(Sub);
|
Changed |= processLoop(Sub);
|
||||||
|
Changed |= processCounterToPtrPHIs(Sub);
|
||||||
Worklist.append(Sub->begin(), Sub->end());
|
Worklist.append(Sub->begin(), Sub->end());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Changed;
|
return Changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// strLen-style undo: LSR converts `return p - s` into a counter PHI
|
||||||
|
// `%lsr.iv` that increments per iter and is returned directly:
|
||||||
|
// %lsr.iv = phi i16 [-1, %entry], [%lsr.iv.next, %latch]
|
||||||
|
// %p.0 = phi ptr [%s, %entry], [%incdec.ptr, %latch]
|
||||||
|
// %incdec.ptr = getelementptr i8, %p.0, i32 1
|
||||||
|
// %lsr.iv.next = add i16 %lsr.iv, 1
|
||||||
|
// br ..., %exit, %loop
|
||||||
|
// %exit:
|
||||||
|
// ret i16 %lsr.iv.next
|
||||||
|
//
|
||||||
|
// LSR's reasoning: cheaper to maintain a counter than compute (p - s)
|
||||||
|
// at exit. On W65816 the opposite is true: counter inc per iter costs
|
||||||
|
// 5 cyc/iter * N iters; one-time sub at exit costs ~10 cyc total.
|
||||||
|
//
|
||||||
|
// This undo finds the counter PHI, verifies its only out-of-loop use
|
||||||
|
// is via LCSSA → return, finds the sibling pointer PHI with the same
|
||||||
|
// stride, and replaces the return value with
|
||||||
|
// `(i16)(p_lcssa - base) + (K_init + 1)`. Erases the counter PHI.
|
||||||
|
//
|
||||||
|
// Saves ~5 cyc/iter on strLen-shape loops with a returned counter.
|
||||||
|
bool W65816UnLSR::processReturnedCounter(Loop *L) {
|
||||||
|
BasicBlock *Header = L->getHeader();
|
||||||
|
BasicBlock *Latch = L->getLoopLatch();
|
||||||
|
BasicBlock *Preheader = L->getLoopPreheader();
|
||||||
|
if (!Latch || !Preheader) return false;
|
||||||
|
|
||||||
|
// Single-exit loop.
|
||||||
|
SmallVector<BasicBlock *, 2> ExitBlocks;
|
||||||
|
L->getExitBlocks(ExitBlocks);
|
||||||
|
if (ExitBlocks.size() != 1) return false;
|
||||||
|
BasicBlock *Exit = ExitBlocks[0];
|
||||||
|
|
||||||
|
// Find a candidate counter PHI: integer, init=ConstantInt, step=+1.
|
||||||
|
PHINode *CounterPHI = nullptr;
|
||||||
|
ConstantInt *KInit = nullptr;
|
||||||
|
BinaryOperator *CounterStep = nullptr;
|
||||||
|
for (PHINode &PN : Header->phis()) {
|
||||||
|
if (!PN.getType()->isIntegerTy()) continue;
|
||||||
|
if (PN.getNumIncomingValues() != 2) continue;
|
||||||
|
Value *Init = nullptr, *Step = nullptr;
|
||||||
|
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) {
|
||||||
|
BasicBlock *Pred = PN.getIncomingBlock(i);
|
||||||
|
if (L->contains(Pred)) Step = PN.getIncomingValue(i);
|
||||||
|
else Init = PN.getIncomingValue(i);
|
||||||
|
}
|
||||||
|
if (!Init || !Step) continue;
|
||||||
|
auto *InitC = dyn_cast<ConstantInt>(Init);
|
||||||
|
if (!InitC) continue;
|
||||||
|
auto *StepBO = dyn_cast<BinaryOperator>(Step);
|
||||||
|
if (!StepBO || StepBO->getOpcode() != Instruction::Add) continue;
|
||||||
|
Value *Other = nullptr;
|
||||||
|
if (StepBO->getOperand(0) == &PN) Other = StepBO->getOperand(1);
|
||||||
|
else if (StepBO->getOperand(1) == &PN) Other = StepBO->getOperand(0);
|
||||||
|
if (!Other) continue;
|
||||||
|
auto *StepCI = dyn_cast<ConstantInt>(Other);
|
||||||
|
if (!StepCI || !StepCI->isOne()) continue;
|
||||||
|
CounterPHI = &PN;
|
||||||
|
KInit = InitC;
|
||||||
|
CounterStep = StepBO;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!CounterPHI) return false;
|
||||||
|
|
||||||
|
// The counter PHI must be used INSIDE the loop only by its increment
|
||||||
|
// and OUTSIDE the loop only via an LCSSA PHI in the exit block that
|
||||||
|
// feeds a return. Same for the increment.
|
||||||
|
auto isOnlyInLoopUseTheStep = [&](Value *V) {
|
||||||
|
for (User *U : V->users()) {
|
||||||
|
auto *UI = dyn_cast<Instruction>(U);
|
||||||
|
if (!UI) return false;
|
||||||
|
if (!L->contains(UI)) continue; // out-of-loop is handled separately
|
||||||
|
if (UI == CounterStep) continue;
|
||||||
|
// The PHI itself is allowed (V might be CounterStep, used by
|
||||||
|
// CounterPHI's back-edge incoming).
|
||||||
|
if (UI == CounterPHI) continue;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
if (!isOnlyInLoopUseTheStep(CounterPHI)) return false;
|
||||||
|
if (!isOnlyInLoopUseTheStep(CounterStep)) return false;
|
||||||
|
|
||||||
|
// Find a use of CounterPHI or CounterStep that's a ReturnInst.
|
||||||
|
// The use might be DIRECT (no LCSSA — common after LCSSA cleanup)
|
||||||
|
// or via an LCSSA PHI in the exit block.
|
||||||
|
ReturnInst *Ret = nullptr;
|
||||||
|
Value *RetSource = nullptr; // the value the ret reads
|
||||||
|
PHINode *ExitLCSSA = nullptr; // optional LCSSA PHI to erase
|
||||||
|
bool fromNext = false; // true if return source is CounterStep
|
||||||
|
auto findRet = [&](Value *V, bool isNext) -> bool {
|
||||||
|
for (User *U : V->users()) {
|
||||||
|
auto *UI = dyn_cast<Instruction>(U);
|
||||||
|
if (!UI) continue;
|
||||||
|
// Skip in-loop uses (those are the counter increment chain).
|
||||||
|
if (L->contains(UI->getParent())) continue;
|
||||||
|
if (auto *R = dyn_cast<ReturnInst>(UI)) {
|
||||||
|
if (R->getReturnValue() != V) continue;
|
||||||
|
Ret = R; RetSource = V; fromNext = isNext; return true;
|
||||||
|
}
|
||||||
|
// LCSSA PHI in the exit block?
|
||||||
|
if (auto *PN = dyn_cast<PHINode>(UI)) {
|
||||||
|
if (PN->getParent() != Exit) continue;
|
||||||
|
if (PN->getNumIncomingValues() != 1) continue;
|
||||||
|
if (PN->getIncomingValue(0) != V) continue;
|
||||||
|
if (!PN->hasOneUse()) continue;
|
||||||
|
auto *R = dyn_cast<ReturnInst>(PN->user_back());
|
||||||
|
if (!R || R->getReturnValue() != PN) continue;
|
||||||
|
Ret = R; RetSource = V; fromNext = isNext; ExitLCSSA = PN;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
if (!findRet(CounterStep, true) && !findRet(CounterPHI, false))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Find a sibling pointer PHI: init=Base, latch incoming is a
|
||||||
|
// `getelementptr i8, %ptr, 1` of itself.
|
||||||
|
PHINode *PtrPHI = nullptr;
|
||||||
|
Value *Base = nullptr;
|
||||||
|
GetElementPtrInst *PtrStep = nullptr;
|
||||||
|
for (PHINode &PN : Header->phis()) {
|
||||||
|
if (!PN.getType()->isPointerTy()) continue;
|
||||||
|
if (PN.getNumIncomingValues() != 2) continue;
|
||||||
|
Value *Init = nullptr, *Step = nullptr;
|
||||||
|
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) {
|
||||||
|
BasicBlock *Pred = PN.getIncomingBlock(i);
|
||||||
|
if (L->contains(Pred)) Step = PN.getIncomingValue(i);
|
||||||
|
else Init = PN.getIncomingValue(i);
|
||||||
|
}
|
||||||
|
if (!Init || !Step) continue;
|
||||||
|
auto *StepGEP = dyn_cast<GetElementPtrInst>(Step);
|
||||||
|
if (!StepGEP) continue;
|
||||||
|
if (StepGEP->getPointerOperand() != &PN) continue;
|
||||||
|
if (StepGEP->getNumIndices() != 1) continue;
|
||||||
|
if (!StepGEP->getSourceElementType()->isIntegerTy(8)) continue;
|
||||||
|
auto *StrideCI = dyn_cast<ConstantInt>(StepGEP->getOperand(1));
|
||||||
|
if (!StrideCI || !StrideCI->isOne()) continue;
|
||||||
|
PtrPHI = &PN;
|
||||||
|
Base = Init;
|
||||||
|
PtrStep = StepGEP;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!PtrPHI) return false;
|
||||||
|
|
||||||
|
// The pointer-PHI must have an LCSSA in the exit (so we can compute
|
||||||
|
// p_lcssa - base). Find it or create one.
|
||||||
|
PHINode *PtrLCSSA = nullptr;
|
||||||
|
for (PHINode &EPN : Exit->phis()) {
|
||||||
|
if (EPN.getNumIncomingValues() != 1) continue;
|
||||||
|
if (EPN.getIncomingValue(0) == PtrPHI) {
|
||||||
|
PtrLCSSA = &EPN; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!PtrLCSSA) {
|
||||||
|
// Create LCSSA for PtrPHI.
|
||||||
|
IRBuilder<> B(&Exit->front());
|
||||||
|
PtrLCSSA = B.CreatePHI(PtrPHI->getType(), 1, "unlsr.p.lcssa");
|
||||||
|
PtrLCSSA->addIncoming(PtrPHI, Latch);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build replacement value: (i16)(p_lcssa - base) + (K_init + (fromNext ? 1 : 0))
|
||||||
|
// For fromNext=true (returning %counter.next): value = K_init + iters
|
||||||
|
// p_lcssa - base = iters (in bytes, stride 1) → value = K_init + (p_lcssa - base)
|
||||||
|
// But we want: counter.next at exit = K_init + iters; and p_lcssa - base = iters.
|
||||||
|
// So replacement = (i16)(p_lcssa - base) + K_init.
|
||||||
|
// For strLen: K_init = -1; iters at exit = K (where ret = K - 1 + 1 = K)
|
||||||
|
// Wait let me re-derive. counter init = -1. iter 1 entry: counter = -1.
|
||||||
|
// iter 1 exit: counter.next = 0. Suppose exit-iter is iter K. Then at
|
||||||
|
// iter K's icmp-true, counter.next = -1 + K.
|
||||||
|
// And p_lcssa = base + (K - 1) (since iter K had p.0 = base + K-1).
|
||||||
|
// So p_lcssa - base = K - 1.
|
||||||
|
// We want counter.next = K - 1 (because exit-iter is iter K, but counter.next
|
||||||
|
// was computed before icmp tested 0 - so it's K - 1 (with K iters = K decisions))
|
||||||
|
// Hmm, off-by-one is tricky. Let me just test empirically.
|
||||||
|
|
||||||
|
// The "return value type" we'll cast to.
|
||||||
|
Type *RetTy = Ret->getReturnValue()->getType();
|
||||||
|
if (!RetTy->isIntegerTy()) return false;
|
||||||
|
Instruction *InsertPt = ExitLCSSA ? ExitLCSSA->getNextNode() : Ret;
|
||||||
|
IRBuilder<> B(InsertPt);
|
||||||
|
// (p_lcssa - base) as integer.
|
||||||
|
Value *PLcssaInt = B.CreatePtrToInt(PtrLCSSA, Type::getInt32Ty(Header->getContext()), "unlsr.plcssa.i");
|
||||||
|
Value *BaseInt = B.CreatePtrToInt(Base, Type::getInt32Ty(Header->getContext()), "unlsr.base.i");
|
||||||
|
Value *Diff = B.CreateSub(PLcssaInt, BaseInt, "unlsr.diff");
|
||||||
|
// Truncate to counter type.
|
||||||
|
Value *DiffI = B.CreateTrunc(Diff, CounterPHI->getType(), "unlsr.diff.trunc");
|
||||||
|
// For fromNext (returning %counter.next): replacement = diff + (K_init + 1).
|
||||||
|
// At exit, counter.next = K_init + iters.
|
||||||
|
// p_lcssa - base = iters (in bytes; stride 1). Wait but iters is the iter count.
|
||||||
|
// Let me re-check with concrete example.
|
||||||
|
// strLen("a\0"): iter 1: p.0 = s, *p='a'!=0, p++, counter=-1, counter.next=0.
|
||||||
|
// iter 2: p.0 = s+1, *p=0, exit. counter=0, counter.next=1.
|
||||||
|
// At exit: counter.next = 1. iters before exit-iter's icmp-true = 2.
|
||||||
|
// p_lcssa = s+1 (the iter-2 entry value). p_lcssa - base = 1.
|
||||||
|
// counter.next = 1 = K_init + 2 = -1 + 2 = 1. ✓
|
||||||
|
// p_lcssa - base = 1. So counter.next = p_lcssa - base + 0.
|
||||||
|
// (K_init + iters - (iters - (p_lcssa - base))) = K_init + (p_lcssa - base) = K_init + 1.
|
||||||
|
// Wait: counter.next = K_init + iters; p_lcssa - base = iters - 1.
|
||||||
|
// So counter.next = K_init + (p_lcssa - base) + 1.
|
||||||
|
// For K_init = -1: counter.next = -1 + 1 + 1 = 1 if iters=2. ✓
|
||||||
|
// So replacement = diff + (K_init + 1).
|
||||||
|
int64_t Adjust = KInit->getSExtValue() + (fromNext ? 1 : 0);
|
||||||
|
Value *Result = DiffI;
|
||||||
|
if (Adjust != 0) {
|
||||||
|
Result = B.CreateAdd(DiffI,
|
||||||
|
ConstantInt::get(CounterPHI->getType(), Adjust),
|
||||||
|
"unlsr.result");
|
||||||
|
}
|
||||||
|
// Cast to return type if different.
|
||||||
|
if (Result->getType() != RetTy) {
|
||||||
|
if (CounterPHI->getType()->getIntegerBitWidth() <
|
||||||
|
RetTy->getIntegerBitWidth())
|
||||||
|
Result = B.CreateZExt(Result, RetTy);
|
||||||
|
else
|
||||||
|
Result = B.CreateTrunc(Result, RetTy);
|
||||||
|
}
|
||||||
|
// Replace the return. If there's an LCSSA PHI, replace it. Otherwise
|
||||||
|
// replace the direct use in `ret`.
|
||||||
|
if (ExitLCSSA) {
|
||||||
|
ExitLCSSA->replaceAllUsesWith(Result);
|
||||||
|
ExitLCSSA->eraseFromParent();
|
||||||
|
} else {
|
||||||
|
Ret->setOperand(0, Result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Erase the counter PHI and its increment.
|
||||||
|
CounterStep->replaceAllUsesWith(UndefValue::get(CounterPHI->getType()));
|
||||||
|
CounterPHI->replaceAllUsesWith(UndefValue::get(CounterPHI->getType()));
|
||||||
|
CounterStep->eraseFromParent();
|
||||||
|
CounterPHI->eraseFromParent();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// strcpy-style undo: LSR converts two pointer PHIs (`src.addr.0` and
|
||||||
|
// `d.0` each stepping by 1) into a single counter PHI (`lsr.iv`) plus
|
||||||
|
// GEPs `(base, counter)` per iter. On 65816 the counter+GEP form
|
||||||
|
// each iter does i32 (base + counter) on each pointer — much more
|
||||||
|
// expensive than just incrementing two i16 pointer PHIs.
|
||||||
|
//
|
||||||
|
// Pattern (post-LSR):
|
||||||
|
// %lsr.iv = phi i32 [0, %entry], [%lsr.iv.next, %latch]
|
||||||
|
// %scevgep_i = getelementptr i8, ptr %base_i, i32 %lsr.iv (for each base_i)
|
||||||
|
// ... loads/stores via %scevgep_i ...
|
||||||
|
// %lsr.iv.next = add i32 %lsr.iv, 1
|
||||||
|
//
|
||||||
|
// Where each %base_i is loop-invariant (typically a function arg).
|
||||||
|
//
|
||||||
|
// Rewrite: for each base_i, introduce a pointer PHI that strides by 1
|
||||||
|
// per iter. Replace %scevgep_i with the new pointer PHI. If counter
|
||||||
|
// has no other uses, eliminate it.
|
||||||
|
bool W65816UnLSR::processCounterToPtrPHIs(Loop *L) {
|
||||||
|
BasicBlock *Header = L->getHeader();
|
||||||
|
BasicBlock *Latch = L->getLoopLatch();
|
||||||
|
BasicBlock *Preheader = L->getLoopPreheader();
|
||||||
|
if (!Latch || !Preheader) return false;
|
||||||
|
|
||||||
|
// Find an integer counter PHI starting at 0 with step +1.
|
||||||
|
PHINode *Counter = nullptr;
|
||||||
|
Value *CounterNext = nullptr;
|
||||||
|
for (PHINode &PN : Header->phis()) {
|
||||||
|
if (!PN.getType()->isIntegerTy()) continue;
|
||||||
|
if (PN.getNumIncomingValues() != 2) continue;
|
||||||
|
Value *Init = nullptr, *Step = nullptr;
|
||||||
|
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) {
|
||||||
|
BasicBlock *Pred = PN.getIncomingBlock(i);
|
||||||
|
if (L->contains(Pred)) Step = PN.getIncomingValue(i);
|
||||||
|
else Init = PN.getIncomingValue(i);
|
||||||
|
}
|
||||||
|
if (!Init || !Step) continue;
|
||||||
|
auto *InitCI = dyn_cast<ConstantInt>(Init);
|
||||||
|
if (!InitCI || !InitCI->isZero()) continue;
|
||||||
|
auto *StepBO = dyn_cast<BinaryOperator>(Step);
|
||||||
|
if (!StepBO || StepBO->getOpcode() != Instruction::Add) continue;
|
||||||
|
Value *Other = nullptr;
|
||||||
|
if (StepBO->getOperand(0) == &PN) Other = StepBO->getOperand(1);
|
||||||
|
else if (StepBO->getOperand(1) == &PN) Other = StepBO->getOperand(0);
|
||||||
|
if (!Other) continue;
|
||||||
|
auto *StepCI = dyn_cast<ConstantInt>(Other);
|
||||||
|
if (!StepCI || !StepCI->isOne()) continue;
|
||||||
|
Counter = &PN;
|
||||||
|
CounterNext = StepBO;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!Counter) return false;
|
||||||
|
|
||||||
|
// Find GEPs `getelementptr i8, %base, %counter` (or %counter.next)
|
||||||
|
// where base is loop-invariant. Collect them and verify the counter
|
||||||
|
// has no OTHER uses outside this pattern.
|
||||||
|
SmallVector<GetElementPtrInst *, 4> GEPs;
|
||||||
|
for (User *U : Counter->users()) {
|
||||||
|
if (U == CounterNext) continue;
|
||||||
|
auto *GEP = dyn_cast<GetElementPtrInst>(U);
|
||||||
|
if (!GEP) return false;
|
||||||
|
if (GEP->getNumIndices() != 1) return false;
|
||||||
|
if (GEP->getOperand(1) != Counter) return false;
|
||||||
|
Value *Base = GEP->getPointerOperand();
|
||||||
|
// base must be loop-invariant. Instructions inside the loop fail;
|
||||||
|
// arguments and globals are always invariant.
|
||||||
|
if (auto *BaseI = dyn_cast<Instruction>(Base))
|
||||||
|
if (L->contains(BaseI)) return false;
|
||||||
|
if (!Base->getType()->isPointerTy()) return false;
|
||||||
|
// Only handle the i8 element type (byte stride). Other strides
|
||||||
|
// would need different ptr-PHI step values.
|
||||||
|
if (!GEP->getSourceElementType()->isIntegerTy(8)) return false;
|
||||||
|
GEPs.push_back(GEP);
|
||||||
|
}
|
||||||
|
// Also accept if CounterNext is used as a GEP index (sometimes LSR
|
||||||
|
// uses the post-increment value). Walk those too.
|
||||||
|
for (User *U : CounterNext->users()) {
|
||||||
|
if (U == Counter) continue;
|
||||||
|
auto *GEP = dyn_cast<GetElementPtrInst>(U);
|
||||||
|
if (GEP) {
|
||||||
|
// Bail if CounterNext is used as a GEP index — we'd need to add
|
||||||
|
// a +1 offset to the new pointer PHI to match. Keep this simple
|
||||||
|
// for now: only handle uses of Counter, not CounterNext.
|
||||||
|
if (GEP->getNumIndices() == 1 && GEP->getOperand(1) == CounterNext)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Allow icmp / branch / other non-GEP uses of CounterNext — those
|
||||||
|
// are the loop's exit test, fine to leave alone.
|
||||||
|
}
|
||||||
|
if (GEPs.empty()) return false;
|
||||||
|
|
||||||
|
// For each unique base, build a pointer PHI.
|
||||||
|
LLVMContext &Ctx = Header->getContext();
|
||||||
|
Type *I8 = Type::getInt8Ty(Ctx);
|
||||||
|
DenseMap<Value *, PHINode *> BasePhis;
|
||||||
|
for (GetElementPtrInst *GEP : GEPs) {
|
||||||
|
Value *Base = GEP->getPointerOperand();
|
||||||
|
if (BasePhis.count(Base)) continue;
|
||||||
|
IRBuilder<> B(&Header->front());
|
||||||
|
PHINode *PtrPHI = B.CreatePHI(Base->getType(), 2, "unlsr.ptr");
|
||||||
|
PtrPHI->addIncoming(Base, Preheader);
|
||||||
|
// Build the step GEP in the latch (just before terminator).
|
||||||
|
IRBuilder<> BL(Latch->getTerminator());
|
||||||
|
Value *PtrNext = BL.CreateGEP(I8, PtrPHI,
|
||||||
|
ConstantInt::get(Type::getInt16Ty(Ctx), 1),
|
||||||
|
"unlsr.ptr.next");
|
||||||
|
PtrPHI->addIncoming(PtrNext, Latch);
|
||||||
|
BasePhis[Base] = PtrPHI;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace each GEP's uses with the corresponding pointer PHI.
|
||||||
|
for (GetElementPtrInst *GEP : GEPs) {
|
||||||
|
GEP->replaceAllUsesWith(BasePhis[GEP->getPointerOperand()]);
|
||||||
|
}
|
||||||
|
// Erase the now-dead GEPs.
|
||||||
|
for (GetElementPtrInst *GEP : GEPs) {
|
||||||
|
if (GEP->use_empty()) GEP->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
// If counter has no other uses (besides CounterNext and the latch
|
||||||
|
// incoming), eliminate it. CounterNext might still be used by the
|
||||||
|
// exit test — leave that alone.
|
||||||
|
bool counterDead = true;
|
||||||
|
for (User *U : Counter->users()) {
|
||||||
|
if (U == CounterNext) continue;
|
||||||
|
counterDead = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (counterDead) {
|
||||||
|
// CounterNext might be used by other PHIs / icmp. Don't erase if so.
|
||||||
|
bool counterNextHasOtherUses = false;
|
||||||
|
for (User *U : CounterNext->users()) {
|
||||||
|
if (U == Counter) continue;
|
||||||
|
counterNextHasOtherUses = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!counterNextHasOtherUses) {
|
||||||
|
Type *IntT = Counter->getType();
|
||||||
|
cast<Instruction>(CounterNext)->replaceAllUsesWith(
|
||||||
|
UndefValue::get(IntT));
|
||||||
|
Counter->replaceAllUsesWith(UndefValue::get(IntT));
|
||||||
|
cast<Instruction>(CounterNext)->eraseFromParent();
|
||||||
|
Counter->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool W65816UnLSR::processLoop(Loop *L) {
|
bool W65816UnLSR::processLoop(Loop *L) {
|
||||||
BasicBlock *Header = L->getHeader();
|
BasicBlock *Header = L->getHeader();
|
||||||
BasicBlock *Latch = L->getLoopLatch();
|
BasicBlock *Latch = L->getLoopLatch();
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Add table
Reference in a new issue