From 55c1ae1c3ee398ad2ad561579557d2d72088656f Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Tue, 28 Apr 2026 16:49:41 -0500 Subject: [PATCH] Initial check in. Lots of work yet to do. --- .gitignore | 4 + SESSION_STATE.md | 146 +++ patches/0006-runtime-libcalls-w65816.patch | 20 + runtime/build.sh | 18 + runtime/src/libgcc.s | 640 ++++++++++++ scripts/safeCC.sh | 33 + scripts/smokeTest.sh | 567 ++++++++++- .../W65816/AsmParser/W65816AsmParser.cpp | 8 +- src/llvm/lib/Target/W65816/CMakeLists.txt | 1 + .../W65816/MCTargetDesc/W65816AsmBackend.cpp | 16 + src/llvm/lib/Target/W65816/W65816.h | 22 +- .../lib/Target/W65816/W65816AsmPrinter.cpp | 153 +++ .../lib/Target/W65816/W65816CallingConv.td | 6 +- .../lib/Target/W65816/W65816FrameLowering.cpp | 206 +++- .../lib/Target/W65816/W65816ISelDAGToDAG.cpp | 39 +- .../lib/Target/W65816/W65816ISelLowering.cpp | 924 ++++++++++++++++-- .../lib/Target/W65816/W65816ISelLowering.h | 41 + .../lib/Target/W65816/W65816InstrFormats.td | 17 + .../lib/Target/W65816/W65816InstrInfo.cpp | 71 +- src/llvm/lib/Target/W65816/W65816InstrInfo.h | 23 + src/llvm/lib/Target/W65816/W65816InstrInfo.td | 503 +++++++++- .../lib/Target/W65816/W65816RegisterInfo.cpp | 81 +- .../Target/W65816/W65816StackSlotCleanup.cpp | 355 +++++++ .../lib/Target/W65816/W65816TargetMachine.cpp | 6 + 24 files changed, 3776 insertions(+), 124 deletions(-) create mode 100644 patches/0006-runtime-libcalls-w65816.patch create mode 100755 runtime/build.sh create mode 100644 runtime/src/libgcc.s create mode 100755 scripts/safeCC.sh create mode 100644 src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp diff --git a/.gitignore b/.gitignore index cdd6ac6..9abef7d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,10 @@ tools/ # Claude Code tool state .claude/ +# Runtime build artifacts: regenerable via runtime/build.sh from +# runtime/src/*.s. The source files (.s, build.sh) are tracked. +runtime/*.o + # Editor / OS *.swp *.swo diff --git a/SESSION_STATE.md b/SESSION_STATE.md index 1953623..9af65c4 100644 --- a/SESSION_STATE.md +++ b/SESSION_STATE.md @@ -219,6 +219,152 @@ Design doc section 7 lists a 12-step implementation order. We are at: scheduling pass.** The prologue `REP #$30` is unconditional; the REP/SEP pass will remove it when redundant. +### Where we actually got to (current state, 2026-04-27) + +The "open codegen gaps" list above is mostly resolved. Status of the +seven sub-items at line 192: + +1. **Multi-arg call lowering (caller side)** — done. `LowerCall` + pushes args 1..N-1 right-to-left via `W65816ISD::PUSH`, + `ADJCALLSTACKUP` unwinds with `tsc;clc;adc #N;tcs`. +2. **Frame-reserved scratch space** — done. `emitPrologue` / + `emitEpilogue` use `tsc;sec;sbc #N;tcs` and the inverse. +3. **Mixed-mode i8/i16** — partial. Per-function mode based on IR + scan; full REP/SEP scheduling pass still TODO (Step 4). +4. **Signed `(a - b)` overflow in compares** — handled for i8/i16 + via the signed-CC promote-to-i16 path. Still has the BMI/BPL + correctness caveat at INT16_MIN/MAX boundaries. +5. **`mul var, var` and friends** — done via libcalls; runtime stubs + live in `runtime/src/libgcc.s` (__mulhi3, __mulsi3, __ashlhi3, + __ashrhi3, __lshrhi3, __ashlsi3, __ashrsi3, __lshrsi3, __udivhi3, + __divhi3, __umodhi3, __modhi3, __udivsi3, __divsi3, __umodsi3, + __modsi3). +6. **SETCC and SELECT_CC i16** — done via custom inserter and the + `W65816cmp + W65816selectcc` SDNode pair. +7. **Library functions** — done; see #5 above. 
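+
+A minimal illustration of the item-4 caveat (not the backend's actual
+lowering, just the failure mode it has to dodge): deciding `a < b` from
+the sign of `a - b` goes wrong exactly when the subtraction overflows
+16 bits. The helper name below is illustrative only.
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+
+/* Naive BMI/BPL-style compare: test the sign of (a - b). */
+static int lt_via_sub_sign(int16_t a, int16_t b) {
+    int16_t diff = (int16_t)((uint16_t)a - (uint16_t)b); /* wraps mod 2^16 */
+    return diff < 0;
+}
+
+int main(void) {
+    /* INT16_MIN - 1 wraps to +32767, so the sign bit reads "not less". */
+    printf("%d\n", lt_via_sub_sign(-32768, 1)); /* prints 0 */
+    printf("%d\n", -32768 < 1);                 /* prints 1 (the true answer) */
+    return 0;
+}
+```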
+ +### i32 (long) support — landed (2026-04-26..28) + +- Type legalization splits i32 into two i16 halves. +- ABI: i32 first-arg lives in A:X (lo:hi), matching the return + ABI; subsequent i32 args go on stack 2 bytes per half. + `RetCC_W65816` assigns `[A, X]` for two i16 returns so + `__mulsi3` / `__divsi3` libcall returns work. +- ADD/SUB use the native ADC carry chain via ISD::ADDC/ADDE/SUBC/ + SUBE Legal: `ADCi16imm` etc. mark `Defs = [P]` and pattern-match + `addc`; new `ADCEi16imm` / `ADCEabs` / `ADCEfi` (and SBC/E + variants) mark `Uses = [P], Defs = [P]` for `adde`/`sube`. + `ADDE_RR` / `SUBE_RR` have the inserter equivalent for two-Acc16 + chains (e.g. fib32's loop). Net: an i32 add went from ~25 insns + (manual UADDO + SETCC + add-of-bool) to ~17 incl. prologue/epilogue, + with the core 8 being the optimal `clc;adc;sta;lda;adc;tax;lda;rtl`. +- NEGC16 / NEGE16 lower `(subc/sube 0, x)` for i32 negate via the + ADD chain (`EOR #$FFFF; CLC; ADC #1` lo, `EOR #$FFFF; ADC #0` hi). +- MUL/DIV/MOD/SHL/SHR/USHR all libcalled; preferredShift­Legalization­ + Strategy returns `LowerToLibcall` for i32 to keep LLVM from emitting + SHL_PARTS we'd have no pattern for. +- `BuildSDIVPow2` / `BuildSREMPow2` overrides return SDValue() to + block the magic-constant pow2 expansion that emits unsupported + BUILD_VECTOR. + +### Other recent work + +- `i1` `sext_inreg` lowered as `(sub 0, (and x, 1))`. +- `i8` `sext_inreg` and `sextload-i8` go through the existing + branchless `((x & 0xFF) ^ 0x80) - 0x80` sequence (SEXTLOAD i8 set + to Expand, sext_inreg pattern added). +- `extloadi8` from an `Acc16` register pointer maps to `LDAptr` (16- + bit load; consumer ignores high byte). +- Bare `ISD::FrameIndex` selected as `ADDframe (FI, 0)` for + alloca'd-array address-of; `eliminateFrameIndex` expands ADDframe + into `tsc;clc;adc #disp` (LEA equivalent). +- **Indirect calls** (function pointers): `LowerCall` redirects + through `__jsl_indir` in `runtime/src/libgcc.s` — caller stores + the dynamic target to global `__indirTarget` then JSLs the + trampoline, which does `JMP (__indirTarget)`. Target's RTL pops + the original JSL frame and returns directly to the caller. + Single-bank only (JMP indirect is bank-local). +- **Code-quality cleanup pass** (`W65816StackSlotCleanup`, + addPostRegAlloc): + - Removes redundant `LDAfi slot` after `STAfi reg, slot` when the + LDA's destination matches and nothing in between clobbers + either reg or slot. Catches the regalloc spill+reload cycle + around COPY $a → vreg. + - Removes dead `STAfi reg, slot` when a subsequent `STAfi` + overwrites the same slot before any read, OR when the function + returns without reading the slot (catches result-spill-before- + return that the libcall return ABI makes redundant). + - Combined with `isReMaterializable` on LDAfi from fixed FIs, the + i32 add went from 17 → 11 instructions. +- **i32 shift-by-1 inline** (task #59). The type-legalizer's + SHL_PARTS / SRL_PARTS expansion of `i32 << 1` / `>> 1` emits a + `(srl x, 15)` or `(shl x, 15)` for the carry-cross-halves slot. + Previously routed through __lshrhi3 / __ashlhi3 libcalls. Added + SRL15A pseudo (`ASL A; LDA #0; ROL A`, 3 bytes) and SHL15A + (`LSR A; LDA #0; ROR A`). i32 shl-by-1 went 33 → 26 insns; + shr-by-1 29 → 23. +- **i16 shift-by-8 inline** (task #60). Same idea for `(srl x, 8)` + and `(shl x, 8)` — used by i32 shift-by-8 type-legalization. + XBA swaps the two bytes of A in 16-bit M; AND clears the half + we don't want. 4 bytes per shift. 
i32 shl/shr-by-8 went + 39/35 → 27/24 insns. +- **PUSH16X for direct X-push** (task #61). When LowerCall sees + an outgoing arg whose SDValue is `CopyFromReg` of a vreg that's + live-in from $x (i.e. the i32-first-arg-in-A:X hi half), emit + `phx` directly instead of `txa; pha` (which also requires + spilling $a to preserve it). mul32 went 19 → 13 insns. +- **Dead frame-slot trimming** (task #62). Extended W65816Stack­ + SlotCleanup to scan MIR for unreferenced (post-cleanup) local + frame indices and zero-size them so PrologueEpilogue trims the + prologue PHA/TSC reservation. Combined with the spill cleanup, + shrinks frames in many functions by 2-4 bytes (one fewer + PHA + PLY pair). +- **i32 first-arg in A:X (task #50)**. When the first original + argument is i32 (LowerFormalArguments / LowerCall detect via + `Outs[0..1].OrigArgIndex == 0` on i16 halves), pass it lo:hi in + A:X — matching the i32 return ABI. Saves one stack slot per + i32 arg. Required updating libgcc.s helpers (`__mulsi3`, + `__udivsi3`, `__umodsi3`, `__divsi3`, `__modsi3`, `__ashlsi3`, + `__lshrsi3`, `__ashrsi3`, `__divmodsi_setup`) to read arg0_hi + from X (and shifted arg1 offsets). +- **Implicit Defs/Uses on stack-rel MC instructions**: was a + pre-existing latent bug — `eliminateFrameIndex` strips the + implicit A/P def/use info when it converts ADCfi/STAfi/etc. to + the MC form (ADC d,S, STA d,S etc.). Machine Copy Propagation + then sees stale dataflow and elides necessary TAX/TXA copies. + Fixed by re-attaching `RegState::Implicit` operands on each + expanded MC instruction in W65816RegisterInfo::eliminateFrame­ + Index. Without this, the i32-A:X ABI miscompiles return values + (TAX gets elided, X retains arg0_hi instead of result_hi). + The fix also benefits the existing single-A path; before it, + certain Machine Copy Propagation choices were unsafe but + happened not to trigger. Now they're also safe. + +### Currently still pending + +- **REP/SEP scheduling pass** (Step 4) — per-function mode only; + mixed-mode functions don't work. +- **Vararg functions** — `LowerFormalArguments` reports a fatal + error. +- **i32 comparison** — uses SETCC+ADD-of-bool instead of a CMP+SBC + chain (analogous to the ADC chain we landed for add/sub). +- **Regalloc** (#56) — heapify-style functions with 4+ live i16 + values run out of A. + +### Smoke-test coverage (31 checks as of 2026-04-28) + +`scripts/smokeTest.sh` covers: target registration, llvm-mc encode/ +disassemble, end-to-end IR→ELF, multi-pattern function, single-arg +call, 3-arg stack reads, pure-i8 SEP prologue, multi-branch SETCC, +SELECT_CC, two-Acc16 spill, libcall emission (__mulhi3/__ashlhi3), +pointer load/store, runtime/build.sh, real-world program, +libcall-symbol coverage, signed/eq i8 compare, -O2 tiny C, i32 add +end-to-end, i32 carry-chain shape (1 clc + 2 adc + 0 bcc), i32 +A:X first-arg ABI (1 txa), 32-bit fib loop (ADDE_RR inserter), +__mulsi3 libcall, alloca'd-array LEA, signed-byte strcmp +(sextload + sext_inreg + extload-via-ptr), indirect call via +__jsl_indir trampoline, i32 shift-by-1 inline (no hi3 libcall). + ## 3. 
What is installed and where All under `/home/scott/claude/llvm816/tools/`: diff --git a/patches/0006-runtime-libcalls-w65816.patch b/patches/0006-runtime-libcalls-w65816.patch new file mode 100644 index 0000000..8df049c --- /dev/null +++ b/patches/0006-runtime-libcalls-w65816.patch @@ -0,0 +1,20 @@ +diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td +index 0000000..0000000 100644 +--- a/llvm/include/llvm/IR/RuntimeLibcalls.td ++++ b/llvm/include/llvm/IR/RuntimeLibcalls.td +@@ -3620,6 +3620,15 @@ def MOSSystemLibrary + __memset, + abort)>; + ++// W65816 (WDC 65816) - integer libcalls only. Multiply, divide, modulo ++// and shifts go through the standard compiler-rt names (__mulhi3, ++// __divhi3 etc.). No floating point yet. ++def isW65816 : RuntimeLibcallPredicate<"TT.getArch() == Triple::w65816">; ++ ++def W65816SystemLibrary ++ : SystemRuntimeLibrary; ++ + //===----------------------------------------------------------------------===// + // Legacy Default Runtime Libcalls + //===----------------------------------------------------------------------===// diff --git a/runtime/build.sh b/runtime/build.sh new file mode 100755 index 0000000..11f2747 --- /dev/null +++ b/runtime/build.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Assemble the W65816 runtime library to runtime/libgcc.o. +# Run after editing runtime/src/*.s. + +set -euo pipefail +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" + +[ -x "$LLVM_MC" ] || { + echo "llvm-mc not found at $LLVM_MC" >&2 + exit 1 +} + +"$LLVM_MC" -arch=w65816 -filetype=obj \ + "$PROJECT_ROOT/runtime/src/libgcc.s" \ + -o "$PROJECT_ROOT/runtime/libgcc.o" + +echo "built runtime/libgcc.o" diff --git a/runtime/src/libgcc.s b/runtime/src/libgcc.s new file mode 100644 index 0000000..ad6a680 --- /dev/null +++ b/runtime/src/libgcc.s @@ -0,0 +1,640 @@ +; Minimal libgcc-equivalent runtime for the W65816 / Apple IIgs. +; Provides the helpers that the LLVM backend lowers integer multiply, +; shift, divide, and modulo operations to. Implementations are +; correct-but-unoptimised; they exist to unblock end-to-end testing, +; not to compete with hand-tuned 65816 math libraries. +; +; Calling convention (matches W65816ISelLowering::LowerCall): +; - Arg 0 in A (16-bit M). +; - Arg 1 pushed via PHA before the JSL. Reads as (4,S) inside the +; callee (3-byte JSL return address sits at 1..3,S). +; - Return value in A. Caller releases pushed args. +; - Routines run in 16-bit M, 16-bit X (REP #$30 by convention). +; +; Direct-page scratch lives at DP+$E0..DP+$EF (16 bytes). Programs +; that use this runtime must keep DP=0 or remap accordingly. +; +; Assembled with: tools/llvm-mos-build/bin/llvm-mc -arch=w65816 \ +; -filetype=obj +; runtime/src/libgcc.s +; -o runtime/libgcc.o + + .text + +; -------------------------------------------------------------------- +; Indirect-call trampoline. An indirect call (function pointer) stores +; the target's 16-bit address to __indirTarget before JSL'ing here. +; This routine does a JMP indirect through that variable: control +; transfers to the target with the original caller's JSL frame still +; on the stack, so target's RTL returns to the original caller (one +; frame, no double-RTL). +; +; Caller emit sequence in W65816ISelLowering::LowerCall: +; sta __indirTarget ; store ptr (must precede any A clobber for args) +; ... arg pushes ... 
+; jsl __jsl_indir +; +; Single-bank only (the IIgs convention assumes code in bank 0/1 +; via JSL — JMP indirect is bank-local). +; -------------------------------------------------------------------- + .globl __indirTarget + .bss +__indirTarget: + .zero 2 + + .text + .globl __jsl_indir +__jsl_indir: + ; Hand-encoded JMP (__indirTarget): 6C is "jmp (a)" — the assembler + ; doesn't yet parse the `(abs)` syntax, so emit the bytes directly + ; with a 16-bit relocation against the variable. Effective transfer: + ; PC <- mem[__indirTarget]. + .byte 0x6C + .word __indirTarget + +; -------------------------------------------------------------------- +; __mulhi3 — 16-bit multiply. A * (4,S) -> A. +; Signed and unsigned share an implementation: only the low 16 bits of +; the product are returned, which is identical for both. Uses +; shift-and-add over the multiplier bits. +; -------------------------------------------------------------------- + .globl __mulhi3 +__mulhi3: + sta 0xe0 ; multiplier + lda 0x4, s + sta 0xe2 ; multiplicand + lda #0x0 + sta 0xe4 ; running product +.Lmul_loop: + lda 0xe0 + beq .Lmul_done + lsr a + sta 0xe0 + bcc .Lmul_skip + lda 0xe4 + clc + adc 0xe2 + sta 0xe4 +.Lmul_skip: + asl 0xe2 + bra .Lmul_loop +.Lmul_done: + lda 0xe4 + rtl + +; -------------------------------------------------------------------- +; __ashlhi3 — A << (4,S) -> A. Shift count is i16 but only the low 4 +; bits are meaningful (counts >=16 are undefined behaviour in C). +; -------------------------------------------------------------------- + .globl __ashlhi3 +__ashlhi3: + pha ; save value on stack so we can free A + lda 0x6, s ; arg 1 sits at 6,s now (PHA shifted by 2) + tax + pla ; restore value +.Lashl_loop: + cpx #0x0 + beq .Lashl_done + asl a + dex + bra .Lashl_loop +.Lashl_done: + rtl + +; -------------------------------------------------------------------- +; __lshrhi3 — A logical >> (4,S) -> A. Same shape as __ashlhi3 with +; LSR instead of ASL. +; -------------------------------------------------------------------- + .globl __lshrhi3 +__lshrhi3: + pha + lda 0x6, s + tax + pla +.Llshr_loop: + cpx #0x0 + beq .Llshr_done + lsr a + dex + bra .Llshr_loop +.Llshr_done: + rtl + +; -------------------------------------------------------------------- +; __ashrhi3 — A arithmetic >> (4,S) -> A. Sign bit is preserved by +; copying it into carry before each ROR via CMP #$8000 (which sets +; carry exactly when the sign bit is set on a 16-bit unsigned compare). +; -------------------------------------------------------------------- + .globl __ashrhi3 +__ashrhi3: + pha + lda 0x6, s + tax + pla +.Lashr_loop: + cpx #0x0 + beq .Lashr_done + cmp #0x8000 + ror a + dex + bra .Lashr_loop +.Lashr_done: + rtl + +; -------------------------------------------------------------------- +; __udivhi3 — A unsigned / (4,S) -> A. +; Restoring shift-subtract division. Common helper; __umodhi3 reuses +; the algorithm and returns the remainder instead. +; Scratch: $e6 = numerator, $e8 = denominator, +; $ea = quotient, $ec = remainder. +; -------------------------------------------------------------------- + .globl __udivhi3 +__udivhi3: + ; Public entry: A=dividend, (4,S)=divisor. Set up scratch and + ; call the same JSR-based core used by signed divide. 
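+	; Note: divide-by-zero is not trapped anywhere in this file; the
+	; restoring core below always runs its 16 iterations regardless, so
+	; callers that can see a zero divisor must guard it themselves.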
+ sta 0xe6 + lda 0x4, s + sta 0xe8 + jsr __udivmod_core + lda 0xea + rtl + + .globl __umodhi3 +__umodhi3: + sta 0xe6 + lda 0x4, s + sta 0xe8 + jsr __udivmod_core + lda 0xec + rtl + +; -------------------------------------------------------------------- +; __divhi3 / __modhi3 — signed 16-bit divide and modulo. Strategy: +; - Stash sign of dividend in $ee bit 0 (used by modulo). +; - Stash result sign of quotient (sign(a) XOR sign(b)) in $ee bit 1 +; (used by divide). +; - Take absolute values, run the unsigned core, then negate the +; appropriate result if its sign bit is set. +; C99: quotient truncates toward zero; remainder takes the sign of the +; dividend. +; -------------------------------------------------------------------- + .globl __divhi3 +__divhi3: + jsr __divmod_setup + jsr __udivmod_core + ; Quotient is in $ea. Negate if bit 1 of $ee is set. + lda 0xea + pha + lda 0xee + and #0x2 + beq .Ldiv_pos + pla + eor #0xffff + clc + adc #0x1 + rtl +.Ldiv_pos: + pla + rtl + + .globl __modhi3 +__modhi3: + jsr __divmod_setup + jsr __udivmod_core + ; Remainder is in $ec. Negate if bit 0 of $ee is set (dividend + ; was negative). + lda 0xec + pha + lda 0xee + and #0x1 + beq .Lmod_pos + pla + eor #0xffff + clc + adc #0x1 + rtl +.Lmod_pos: + pla + rtl + +; -------------------------------------------------------------------- +; __divmod_setup — common prologue for __divhi3/__modhi3. Reads +; A=dividend and (4,S)=divisor (the public-entry stack frame is intact +; because we used JSR not JSL, so (4,S) still points to the user's +; pushed arg1 relative to the original JSL). Computes |a| -> $e6, +; |b| -> $e8, and sign tracker -> $ee: +; bit 0 = 1 if dividend was negative (modulo result sign) +; bit 1 = 1 if dividend XOR divisor signs differ (quotient sign) +; Uses JSR/RTS, same bank. +; -------------------------------------------------------------------- +__divmod_setup: + ; Sign tracker. We don't have STZ in our instruction set yet, so + ; clear via PHA/LDA #0/STA/PLA to avoid trashing A. + pha + lda #0x0 + sta 0xee + pla + ; Dividend sign + abs value. + cmp #0x8000 + bcc .Lset_a_pos + ; Negative: set bits 0 and 1 (dividend sign, result sign so far). + pha + lda 0xee + ora #0x3 + sta 0xee + pla + eor #0xffff + clc + adc #0x1 +.Lset_a_pos: + sta 0xe6 + ; Divisor sign + abs value. After our JSR (pushed 2 bytes of + ; near-return), the user's arg1 has shifted up by 2 from (4,S) + ; to (6,S). + lda 0x6, s + cmp #0x8000 + bcc .Lset_b_pos + ; Negative: flip bit 1 of $ee (XOR with sign of dividend). + pha + lda 0xee + eor #0x2 + sta 0xee + pla + eor #0xffff + clc + adc #0x1 +.Lset_b_pos: + sta 0xe8 + rts + +; -------------------------------------------------------------------- +; __udivmod_core — internal restoring divide. Inputs at $e6/$e8, +; outputs quotient at $ea, remainder at $ec. JSR/RTS local helper. +; -------------------------------------------------------------------- +__udivmod_core: + lda #0x0 + sta 0xea + sta 0xec + ldx #0x10 +.Lcore_loop: + asl 0xe6 + rol 0xec + asl 0xea + lda 0xec + cmp 0xe8 + bcc .Lcore_skip + sec + sbc 0xe8 + sta 0xec + inc 0xea +.Lcore_skip: + dex + bne .Lcore_loop + rts + +; ==================================================================== +; 32-bit (long / si) helpers. 
+; +; ABI for these is the natural extension of the i16 libcalls: +; - arg0_lo in A +; - arg0_hi at (4,s) +; - arg1_lo at (6,s) (or shift count, for the shift helpers) +; - arg1_hi at (8,s) +; - return: result_lo in A, result_hi in X +; +; All are correct-but-unoptimised; goal is unblocking end-to-end builds, +; not winning a 65816 codegolf. +; +; Direct-page scratch for these: +; $e0..$e3 = a (lo, hi) [renamed from $e0/$e2 for the i16 ones] +; $e4..$e7 = b (lo, hi) +; $e8..$eb = result / quotient (lo, hi) +; $ec..$ef = remainder (lo, hi) +; ==================================================================== + +; -------------------------------------------------------------------- +; __mulsi3 — 32-bit multiply. Shift-and-add over 32 bits of the +; multiplier. Result = (a * b) mod 2^32. +; +; ABI: A = a_lo, X = a_hi (the i32-first-arg in A:X convention), +; (4,s) = b_lo, (6,s) = b_hi. Result returned in A:X (lo:hi). +; -------------------------------------------------------------------- + .globl __mulsi3 +__mulsi3: + ; Stash a (multiplier) into $e0/$e2. + sta 0xe0 + stx 0xe2 + ; Stash b (multiplicand) into $e4/$e6. + lda 0x4, s + sta 0xe4 + lda 0x6, s + sta 0xe6 + ; Clear running product at $e8/$ea. + lda #0x0 + sta 0xe8 + sta 0xea + ; Loop 32 times: examine LSB of multiplier, conditionally add + ; multiplicand to product, then shift multiplier right and + ; multiplicand left. Use Y as a 16-bit counter (X mode = 16). + ldy #0x20 +.Lmulsi_loop: + ; Test bit 0 of multiplier (lo word). + lda 0xe0 + lsr a + sta 0xe0 + bcc .Lmulsi_noadd + ; Add multiplicand to product (32-bit). + clc + lda 0xe8 + adc 0xe4 + sta 0xe8 + lda 0xea + adc 0xe6 + sta 0xea +.Lmulsi_noadd: + ; Shift multiplier right (32-bit, hi-into-lo) — we already shifted + ; the lo half above, but the bit shifted out went to carry. We + ; need to also bring the lo bit of the hi half into bit 15 of lo, + ; and shift hi right. Simpler: do a full 32-bit shift right + ; before the LSR. Restructure: + ; + ; Shift multiplicand left (32-bit, carry chain). + asl 0xe4 + rol 0xe6 + ; Bring multiplier hi into multiplier lo's high bit. Multiplier + ; has been shifted lo>>1 already; we need to also put hi's lo bit + ; into lo's hi bit and shift hi right. + lsr 0xe2 + bcc .Lmulsi_no_borrow + ; Carry from hi >> 1 needs to land in bit 15 of lo. ORA #$8000. + lda 0xe0 + ora #0x8000 + sta 0xe0 +.Lmulsi_no_borrow: + dey + bne .Lmulsi_loop + ; Result is in $e8 (lo) / $ea (hi). + ldx 0xea + lda 0xe8 + rtl + +; -------------------------------------------------------------------- +; __ashlsi3 — (A:X) << (4,s) -> A:X. Shift count is i16 in low byte; +; counts >= 32 are UB in C. Uses a per-bit loop (cheap on 65816 — one +; ASL + ROL per bit). +; +; ABI: A = a_lo, X = a_hi (i32-first-arg in A:X), (4,s) = count. +; -------------------------------------------------------------------- + .globl __ashlsi3 +__ashlsi3: + sta 0xe0 ; lo + stx 0xe2 ; hi + lda 0x4, s + tay ; count -> Y +.Lashlsi_loop: + cpy #0x0 + beq .Lashlsi_done + asl 0xe0 + rol 0xe2 + dey + bra .Lashlsi_loop +.Lashlsi_done: + ldx 0xe2 + lda 0xe0 + rtl + +; -------------------------------------------------------------------- +; __lshrsi3 — logical >> shift. LSR hi, ROR lo: hi gets a 0, lo gets +; hi's old bit 0. Per-bit loop. 
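+; ABI: A = a_lo, X = a_hi (i32-first-arg in A:X), (4,s) = shift count.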
+; -------------------------------------------------------------------- + .globl __lshrsi3 +__lshrsi3: + sta 0xe0 + stx 0xe2 + lda 0x4, s + tay +.Llshrsi_loop: + cpy #0x0 + beq .Llshrsi_done + lsr 0xe2 + ror 0xe0 + dey + bra .Llshrsi_loop +.Llshrsi_done: + ldx 0xe2 + lda 0xe0 + rtl + +; -------------------------------------------------------------------- +; __ashrsi3 — arithmetic >> shift. Sign bit must be preserved on each +; iteration: copy bit 15 of hi into carry (via CMP #$8000), then ROR +; hi, ROR lo. Per-bit loop. +; -------------------------------------------------------------------- + .globl __ashrsi3 +__ashrsi3: + sta 0xe0 + stx 0xe2 + lda 0x4, s + tay +.Lashrsi_loop: + cpy #0x0 + beq .Lashrsi_done + ; CMP #$8000 sets C iff the unsigned value >= 0x8000, i.e. bit 15 + ; is set — exactly the sign bit. + lda 0xe2 + cmp #0x8000 + ror 0xe2 + ror 0xe0 + dey + bra .Lashrsi_loop +.Lashrsi_done: + ldx 0xe2 + lda 0xe0 + rtl + +; -------------------------------------------------------------------- +; __udivmodsi_core — internal 32-bit unsigned divide. Inputs in +; $e0/$e2 (numerator) and $e4/$e6 (denominator); outputs quotient in +; $e8/$ea and remainder in $ec/$ee. 32-iteration restoring divide. +; JSR/RTS local helper. +; -------------------------------------------------------------------- +__udivmodsi_core: + lda #0x0 + sta 0xe8 + sta 0xea + sta 0xec + sta 0xee + ldy #0x20 +.Lcoresi_loop: + ; Shift numerator left through remainder. + asl 0xe0 + rol 0xe2 + rol 0xec + rol 0xee + ; Shift quotient left. + asl 0xe8 + rol 0xea + ; Compare remainder to denominator (32-bit). + lda 0xee + cmp 0xe6 + bcc .Lcoresi_skip + bne .Lcoresi_take + lda 0xec + cmp 0xe4 + bcc .Lcoresi_skip +.Lcoresi_take: + ; Remainder >= denominator: subtract and set quotient bit 0. + sec + lda 0xec + sbc 0xe4 + sta 0xec + lda 0xee + sbc 0xe6 + sta 0xee + inc 0xe8 +.Lcoresi_skip: + dey + bne .Lcoresi_loop + rts + +; -------------------------------------------------------------------- +; __udivsi3 — unsigned 32/32 -> 32 divide. +; -------------------------------------------------------------------- + .globl __udivsi3 +__udivsi3: + ; ABI: A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi. + sta 0xe0 + stx 0xe2 + lda 0x4, s + sta 0xe4 + lda 0x6, s + sta 0xe6 + jsr __udivmodsi_core + ldx 0xea + lda 0xe8 + rtl + +; -------------------------------------------------------------------- +; __umodsi3 — unsigned 32/32 -> 32 modulo. +; -------------------------------------------------------------------- + .globl __umodsi3 +__umodsi3: + sta 0xe0 + stx 0xe2 + lda 0x4, s + sta 0xe4 + lda 0x6, s + sta 0xe6 + jsr __udivmodsi_core + ldx 0xee + lda 0xec + rtl + +; -------------------------------------------------------------------- +; __divsi3 / __modsi3 — signed 32-bit divide / modulo. Strategy mirrors +; the i16 helpers: stash signs, take abs, run unsigned core, negate +; result(s) as needed. Sign tracker bits in $f0: +; bit 0 = dividend was negative (modulo result sign) +; bit 1 = quotient sign (sign(a) XOR sign(b)) +; -------------------------------------------------------------------- + .globl __divsi3 +__divsi3: + jsr __divmodsi_setup + jsr __udivmodsi_core + ; Quotient at $e8/$ea. Negate if bit 1 of $f0 is set. + lda 0xf0 + and #0x2 + beq .Ldivsi_pos + ; 32-bit two's complement of quotient. + lda 0xe8 + eor #0xffff + clc + adc #0x1 + sta 0xe8 + lda 0xea + eor #0xffff + adc #0x0 + sta 0xea +.Ldivsi_pos: + ldx 0xea + lda 0xe8 + rtl + + .globl __modsi3 +__modsi3: + jsr __divmodsi_setup + jsr __udivmodsi_core + ; Remainder at $ec/$ee. 
Negate if bit 0 of $f0 set (dividend + ; was negative — C99 remainder takes dividend's sign). + lda 0xf0 + and #0x1 + beq .Lmodsi_pos + lda 0xec + eor #0xffff + clc + adc #0x1 + sta 0xec + lda 0xee + eor #0xffff + adc #0x0 + sta 0xee +.Lmodsi_pos: + ldx 0xee + lda 0xec + rtl + +; -------------------------------------------------------------------- +; __divmodsi_setup — common prologue for __divsi3 / __modsi3. +; Reads A=a_lo, X=a_hi (i32-first-arg ABI), (4,s)=b_lo, (6,s)=b_hi. +; Writes |a| to $e0/$e2, |b| to $e4/$e6, sign bits to $f0. JSR/RTS. +; After JSR's 2-byte ret push, callee-relative offsets are (6,s)=b_lo, +; (8,s)=b_hi. +; -------------------------------------------------------------------- +__divmodsi_setup: + ; Clear sign tracker. + pha + lda #0x0 + sta 0xf0 + pla + ; |a|: A=a_lo, X=a_hi. Save them first (we need a_hi for sign test). + sta 0xe0 ; tentative a_lo (may negate below) + stx 0xe2 ; tentative a_hi + cpx #0x8000 + bcc .Lsetsi_a_pos + ; a is negative. Set sign tracker bits 0+1 and negate. + lda 0xf0 + ora #0x3 + sta 0xf0 + ; 32-bit negate: invert + 1. + lda 0xe0 + eor #0xffff + clc + adc #0x1 + sta 0xe0 + lda 0xe2 + eor #0xffff + adc #0x0 + sta 0xe2 +.Lsetsi_a_pos: + ; |b|. Args shifted by 2 (the JSR ret push). + lda 0x6, s + sta 0xe4 + lda 0x8, s + sta 0xe6 + cmp #0x8000 + bcc .Lsetsi_b_pos + ; b is negative. Flip bit 1 of $f0. + lda 0xf0 + eor #0x2 + sta 0xf0 + lda 0xe4 + eor #0xffff + clc + adc #0x1 + sta 0xe4 + lda 0xe6 + eor #0xffff + adc #0x0 + sta 0xe6 +.Lsetsi_b_pos: + rts diff --git a/scripts/safeCC.sh b/scripts/safeCC.sh new file mode 100755 index 0000000..bc3344b --- /dev/null +++ b/scripts/safeCC.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Wrapper for ad-hoc invocations of the W65816 cross-compiler toolchain. +# Applies the same memory/CPU caps as smokeTest.sh so a runaway backend +# bug (infinite combine, runaway inserter) can't OOM-kill the whole tmux +# scope and take Claude Code down with it. +# +# Usage: +# scripts/safeCC.sh clang --target=w65816 -O2 -S foo.c -o foo.s +# scripts/safeCC.sh llc -march=w65816 foo.ll -o foo.s +# +# The first arg is resolved against tools/llvm-mos-build/bin/ if it isn't +# already an absolute or relative path containing a slash. + +set -euo pipefail + +ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory +ulimit -t 90 # 90 CPU-seconds + +if [ $# -lt 1 ]; then + printf 'usage: %s [args...]\n' "$0" >&2 + exit 2 +fi + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +BIN_DIR="$PROJECT_ROOT/tools/llvm-mos-build/bin" + +tool="$1" +shift + +case "$tool" in + /*|./*|*/*) exec "$tool" "$@" ;; + *) exec "$BIN_DIR/$tool" "$@" ;; +esac diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 8de958d..0b3c20d 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -11,6 +11,18 @@ set -euo pipefail source "$(dirname "$0")/common.sh" +# Resource caps for child compilers. A bug in the W65816 backend can send +# clang/llc into a runaway combine/inserter loop that allocates tens of GB +# of RAM. When that happens the kernel OOM-killer takes down the entire +# tmux scope (bash, the compiler, and the parent Claude Code session with +# it). Bounding virtual memory and CPU time here turns "OOM kills the +# terminal" into "compiler dies with SIGSEGV / SIGXCPU and we get a clean +# error." Numbers are well above what a healthy compile of these tiny +# test inputs needs (~200 MB / a few seconds), so legitimate work is +# unaffected. 
+ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory ceiling +ulimit -t 90 # 90 CPU-seconds per process + BUILD_DIR="$TOOLS_DIR/llvm-mos-build" LLC="$BUILD_DIR/bin/llc" LLVM_MC="$BUILD_DIR/bin/llvm-mc" @@ -249,7 +261,344 @@ EOF done fi -# 11. Real C through clang. Uses the clang front-end if it has been +# 11a. SETCC via clang: a > b returns 0/1. Exercises the multi-branch +# CC path (BEQ + BPL diamond, since SETGT can't be a single Bxx). +CLANG="$BUILD_DIR/bin/clang" +if [ -x "$CLANG" ]; then + log "check: clang compiles a > b via multi-branch SETCC" + cFile="$(mktemp --suffix=.c)" + sCmpFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile"' EXIT + cat > "$cFile" <<'EOF' +int gt(int a, int b) { return a > b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile" -o "$sCmpFile" + # Expect a CMP, then BEQ + BPL forming the multi-branch diamond. + for expect in "cmp 0x4, s" "lda #0x1" "beq" "bpl" "lda #0x0"; do + if ! grep -qF "$expect" "$sCmpFile"; then + warn "setcc gt test missing: $expect" + cat "$sCmpFile" >&2 + die "setcc gt test failed" + fi + done +fi + +# 11b. SELECT via clang: c ? a : b returns one of two constants. +if [ -x "$CLANG" ]; then + log "check: clang compiles c ? 100 : 200 via SELECT_CC" + cFile2="$(mktemp --suffix=.c)" + sSelFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile"' EXIT + cat > "$cFile2" <<'EOF' +int sel(int c) { return c ? 100 : 200; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile2" -o "$sSelFile" + for expect in "cmp #0x0" "lda #0xc8" "beq" "lda #0x64"; do + if ! grep -qF "$expect" "$sSelFile"; then + warn "select test missing: $expect" + cat "$sSelFile" >&2 + die "select test failed" + fi + done +fi + +# 11c. Two-Acc16 op via clang: a - b where both are non-foldable Acc16. +# Caller-side b lives in memory (FI), so this matches via SBCfi without +# the spill — but a + b + c chains through a true two-Acc16 add. +if [ -x "$CLANG" ]; then + log "check: clang compiles two-Acc16 ops via spill (chained add)" + cFile3="$(mktemp --suffix=.c)" + sChainFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile"' EXIT + cat > "$cFile3" <<'EOF' +// max3 forces two-Acc16: outer SELECT_CC compares one Acc16 PHI value +// to another Acc16 PHI value (m vs c, both computed values). +int max3(int a, int b, int c) { + int m = a > b ? a : b; + return m > c ? m : c; +} +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile3" -o "$sChainFile" + # Expect at least one sta-spill paired with cmp to a stack-relative + # slot - the signature of the two-Acc16 CMP_RR custom inserter. + if ! grep -qE 'sta 0x[0-9a-f]+, s' "$sChainFile" \ + || ! grep -qE 'cmp 0x[0-9a-f]+, s' "$sChainFile"; then + cat "$sChainFile" >&2 + die "two-Acc16 (max3) didn't spill+cmp via stack-relative" + fi +fi + +# 11d. Multiply via libcall. 
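+# (Callee is __mulhi3 in runtime/src/libgcc.s: arg0 in A, arg1 pushed
+# before the JSL, result returned in A.)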
+if [ -x "$CLANG" ]; then + log "check: clang emits __mulhi3 libcall for i16 multiply" + cFile4="$(mktemp --suffix=.c)" + sMulFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile"' EXIT + cat > "$cFile4" <<'EOF' +int mul(int a, int b) { return a * b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile4" -o "$sMulFile" + if ! grep -qF "jsl __mulhi3" "$sMulFile"; then + cat "$sMulFile" >&2 + die "expected jsl __mulhi3" + fi +fi + +# 11e. Variable shift via libcall. +if [ -x "$CLANG" ]; then + log "check: clang emits __ashlhi3 libcall for variable i16 shift" + cFile5="$(mktemp --suffix=.c)" + sShfFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile"' EXIT + cat > "$cFile5" <<'EOF' +int shf(int x, int n) { return x << n; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile5" -o "$sShfFile" + if ! grep -qF "jsl __ashlhi3" "$sShfFile"; then + cat "$sShfFile" >&2 + die "expected jsl __ashlhi3" + fi +fi + +# 11f. Pointer deref: *p loads via stack-relative-indirect-Y. +if [ -x "$CLANG" ]; then + log "check: clang compiles *p via LDA (slot,s),y" + cFile6="$(mktemp --suffix=.c)" + sPtrFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile"' EXIT + cat > "$cFile6" <<'EOF' +int load_ptr(const int *p) { return *p; } +void store_ptr(int *p, int v) { *p = v; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile6" -o "$sPtrFile" + for expect in "ldy #0x0" "lda (0x" "sta (0x"; do + if ! grep -qF "$expect" "$sPtrFile"; then + warn "ptr-deref test missing: $expect" + cat "$sPtrFile" >&2 + die "ptr-deref test failed" + fi + done +fi + +# 11g. i8 store via pointer: *p = v wraps the STA in SEP/REP so only +# 1 byte is written. Both load_byte and store_byte must compile. +if [ -x "$CLANG" ]; then + log "check: clang compiles *p = v with SEP/REP-wrapped STA (i8 store)" + cFile7="$(mktemp --suffix=.c)" + sBptrFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile"' EXIT + cat > "$cFile7" <<'EOF' +unsigned char loadb(const unsigned char *p) { return *p; } +void storeb(unsigned char *p, unsigned char v) { *p = v; } +unsigned char incb(unsigned char *p) { return ++*p; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile7" -o "$sBptrFile" + # storeb body should contain SEP #$20 ... STA (slot,s),y ... REP #$20. + if ! grep -qF "sep #0x20" "$sBptrFile" \ + || ! grep -qF "rep #0x20" "$sBptrFile" \ + || ! grep -qE 'sta \(0x[0-9a-f]+, s\), y' "$sBptrFile"; then + cat "$sBptrFile" >&2 + die "i8 ptr-store test missing SEP/STA/REP sequence" + fi + # All three functions must produce labels. + for sym in loadb storeb incb; do + if ! grep -qE "^${sym}:" "$sBptrFile"; then + cat "$sBptrFile" >&2 + die "i8 ptr test: missing function ${sym}" + fi + done + # Correctness check: storeb's prologue must NOT clobber A. 
A holds + # the pointer arg on entry; the first body op must spill A intact. + # The fixed prologue uses N/2 PHAs (small N) or TAY/TSC/.../TYA + # (large N). Either way, the first non-prologue op should be a + # `sta NN,s` that captures arg0=p. If we see TSC anywhere in the + # prologue WITHOUT a TAY before it, that's the broken form (A + # clobbered by TSC, then the spill stores garbage SP value as if + # it were the pointer). + storeb_body="$(sed -n '/^storeb:/,/^\.Lfunc_end/p' "$sBptrFile")" + if printf '%s\n' "$storeb_body" | grep -qE '^ tsc$' \ + && ! printf '%s\n' "$storeb_body" | grep -qE '^ tay$'; then + cat "$sBptrFile" >&2 + die "storeb prologue uses bare TSC without TAY — A (the pointer arg) gets clobbered before being spilled. Byte store writes to the wrong address. Use PHA-based prologue or TAY/TSC/.../TYA bracket." + fi + # Also: there must be at least one `sta NN,s` in the body (the spill + # of the pointer arg). + if ! printf '%s\n' "$storeb_body" | grep -qE '^ sta 0x[0-9a-f]+, s$'; then + cat "$sBptrFile" >&2 + die "storeb missing pointer-arg spill (sta NN,s)" + fi +fi + +# 11h. i8 global access stays in 8-bit M (no over-read). bump_gb must +# get the SEP #$20 prologue and emit a single-byte lda/inc/sta sequence. +if [ -x "$CLANG" ]; then + log "check: clang keeps pure-i8 global access in 8-bit M (no wide-read regression)" + cFile8="$(mktemp --suffix=.c)" + sGbFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile"' EXIT + cat > "$cFile8" <<'EOF' +unsigned char gb; +void bump_gb(void) { gb++; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile8" -o "$sGbFile" + # Must use 8-bit M prologue (sep #$20), not the 16-bit one. + if ! grep -qF "sep #0x20" "$sGbFile"; then + cat "$sGbFile" >&2 + die "bump_gb test: expected sep #\$20 prologue (got 16-bit M)" + fi +fi + +# 11j. Runtime library assembles and exports all expected libcalls. +# This is the destination of every __mulhi3/__ashlhi3/etc. that clang +# emits — without it, generated code links to nothing. +RUNTIME_SH="$PROJECT_ROOT/runtime/build.sh" +RUNTIME_OBJ="$PROJECT_ROOT/runtime/libgcc.o" +if [ -x "$RUNTIME_SH" ]; then + log "check: runtime/build.sh assembles libgcc.o with all libcall symbols" + "$RUNTIME_SH" >/dev/null + if [ ! -f "$RUNTIME_OBJ" ]; then + die "runtime/build.sh did not produce libgcc.o" + fi + syms="$("$BUILD_DIR/bin/llvm-objdump" -t "$RUNTIME_OBJ" 2>&1 | awk '{print $NF}')" + for need in __mulhi3 __ashlhi3 __ashrhi3 __lshrhi3 __divhi3 __udivhi3 __modhi3 __umodhi3; do + if ! printf '%s\n' "$syms" | grep -qx "$need"; then + printf '%s\n' "$syms" >&2 + die "runtime missing symbol: $need" + fi + done +fi + +# 11m. Real-world surface area: a non-trivial program that exercises +# struct-field deref, char* iteration, multiply, shift, and a bit-twiddle +# function. Validates the backend compiles a realistic C input +# end-to-end without crashing. Doesn't assert specific asm; just +# success and that the function bodies are non-empty. 
+if [ -x "$CLANG" ]; then + log "check: clang compiles a real-world multi-function program" + cFile12="$(mktemp --suffix=.c)" + sBigFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile" "$cFile10" "$sSgnFile" "$cFile11" "$sCallsFile" "$cFile12" "$sBigFile"' EXIT + cat > "$cFile12" <<'EOF' +typedef unsigned char u8; +typedef unsigned int u16; +struct Node { u16 data; struct Node *next; }; +u16 list_sum(const struct Node *h) { + u16 s=0; while(h){ s+=h->data; h=h->next; } return s; +} +int strcmp_test(const char *a, const char *b) { + while (*a && *a == *b) { a++; b++; } + return (unsigned char)*a - (unsigned char)*b; +} +u16 fnv16(const u8 *p, u16 n) { + u16 h=0x811C; for (u16 i=0;i>=8; } + if (!(x & 0x0F)) { n+=4; x>>=4; } + if (!(x & 0x03)) { n+=2; x>>=2; } + if (!(x & 0x01)) n+=1; + return n; +} +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile12" -o "$sBigFile" + for sym in list_sum strcmp_test fnv16 ctz16; do + if ! grep -qE "^${sym}:" "$sBigFile"; then + cat "$sBigFile" >&2 + die "real-world test missing function: $sym" + fi + done +fi + +# 11l. Linkage contract: every libcall clang generates from arithmetic +# ops must match a symbol provided by runtime/libgcc.o. We can't run a +# real link yet (no w65816-aware linker), but we can verify the symbol +# names line up — drift here would be a silent runtime crash. +if [ -x "$CLANG" ] && [ -f "$RUNTIME_OBJ" ]; then + log "check: every libcall clang emits has a matching definition in libgcc.o" + cFile11="$(mktemp --suffix=.c)" + sCallsFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile" "$cFile10" "$sSgnFile" "$cFile11" "$sCallsFile"' EXIT + cat > "$cFile11" <<'EOF' +int m1(int a, int b) { return a * b; } +unsigned int m2(unsigned int a, unsigned int b) { return a * b; } +int s1(int x, int n) { return x << n; } +unsigned int s2(unsigned int x, int n) { return x >> n; } +int s3(int x, int n) { return x >> n; } +int d1(int a, int b) { return a / b; } +unsigned int d2(unsigned int a, unsigned int b) { return a / b; } +int r1(int a, int b) { return a % b; } +unsigned int r2(unsigned int a, unsigned int b) { return a % b; } +long m3(long a, long b) { return a * b; } +unsigned long m4(unsigned long a, unsigned long b) { return a * b; } +long s4(long x, int n) { return x << n; } +long s5(long x, int n) { return x >> n; } +unsigned long s6(unsigned long x, int n) { return x >> n; } +long d3(long a, long b) { return a / b; } +unsigned long d4(unsigned long a, unsigned long b) { return a / b; } +long r3(long a, long b) { return a % b; } +unsigned long r4(unsigned long a, unsigned long b) { return a % b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile11" -o "$sCallsFile" + runtime_syms="$("$BUILD_DIR/bin/llvm-objdump" -t "$RUNTIME_OBJ" 2>&1 | awk '$2 == "g" {print $NF}')" + emitted="$(grep -oE 'jsl __[a-z0-9]+' "$sCallsFile" | awk '{print $2}' | sort -u)" + for sym in $emitted; do + if ! 
printf '%s\n' "$runtime_syms" | grep -qx "$sym"; then + warn "clang emitted libcall $sym but runtime/libgcc.o has no such symbol" + printf 'runtime exports:\n%s\n' "$runtime_syms" >&2 + printf 'clang emitted:\n%s\n' "$emitted" >&2 + die "libcall name drift: $sym missing from runtime" + fi + done +fi + +# 11k. signed i8 compare: forces 16-bit M prologue (instrLowersToWide) +# because the SEXT lowering needs i16 ops. Verifies both that the +# code compiles AND that the prologue is REP #$30 (not the 8-bit M +# fast path, which would silently corrupt the SEXT mask). +if [ -x "$CLANG" ]; then + log "check: signed i8 compare gets 16-bit M prologue + emits cmp" + cFile10="$(mktemp --suffix=.c)" + sSgnFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile" "$cFile10" "$sSgnFile"' EXIT + cat > "$cFile10" <<'EOF' +signed char sgnlt(signed char a, signed char b) { return a < b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile10" -o "$sSgnFile" + # Must use 16-bit M (rep #$30), not the 8-bit fast path. + if ! grep -qF "rep #0x30" "$sSgnFile"; then + cat "$sSgnFile" >&2 + die "sgnlt: expected rep #\$30 prologue (i8 signed cmp needs 16-bit M)" + fi + # Must NOT contain the 8-bit prologue, which would mean we never + # transitioned (the SEXT injection's ora #\$ff00 would silently + # truncate to ora #\$00 in 8-bit M). + if grep -qF "rep #0x10" "$sSgnFile" && ! grep -qF "rep #0x30" "$sSgnFile"; then + cat "$sSgnFile" >&2 + die "sgnlt: only saw 8-bit M prologue, SEXT high-byte mask would be dropped" + fi +fi + +# 11i. i8 equality compare on two stack args (eqbyte): exercises i8 +# SETCC promotion through Lower*CC. +if [ -x "$CLANG" ]; then + log "check: clang lowers i8 == i8 via promoted i16 cmp" + cFile9="$(mktemp --suffix=.c)" + sEqbFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile"' EXIT + cat > "$cFile9" <<'EOF' +unsigned char eqbyte(unsigned char a, unsigned char b) { return a == b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile9" -o "$sEqbFile" + # Must produce a cmp + beq (the eq diamond). + if ! grep -qE 'cmp ' "$sEqbFile" || ! grep -qF "beq" "$sEqbFile"; then + cat "$sEqbFile" >&2 + die "eqbyte test: expected cmp + beq sequence" + fi +fi + +# 12. Real C through clang. Uses the clang front-end if it has been # built; skipped otherwise (clang takes 15-30 minutes to build the # first time; afterwards rebuilds are fast). CLANG="$BUILD_DIR/bin/clang" @@ -270,6 +619,222 @@ EOF die "clang end-to-end test failed" fi done + + # 13. i32 (long) compile path. Type legalization splits i32 into + # two i16 halves; the high half flows through the (add FrameIndex, + # 2) shape, which previously crashed ISel with "Cannot select + # FrameIndex<-2>". SelectFrameIndex now folds (add FI, const) so + # the split loads land on a stack-relative addressing mode. + # Return ABI: low->A, high->X (TAX in the epilogue). 
+ # Also asserts the native ADC carry chain (CLC + ADC + ADC) is in + # place — task #49 replaced the bloated SETCC-based carry detect + # (lda;cmp;bcc;lda) with a direct ADDC/ADDE-pattern lowering that + # uses the C flag in P as a Glue-modeled physreg. + log "check: clang compiles a long add (i32 split + A:X return)" + cI32File="$(mktemp --suffix=.c)" + oI32File="$(mktemp --suffix=.o)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File"' EXIT + cat > "$cI32File" <<'EOF' +long add32(long a, long b) { return a + b; } +EOF + "$CLANG" --target=w65816 -O2 -c "$cI32File" -o "$oI32File" + disasmI32="$("$OBJDUMP" --triple=w65816 -d "$oI32File" 2>&1)" + # TAX confirms the high-half-into-X part of the return ABI fired. + # Without it, both halves would pile into A and one would be lost. + # Exactly one CLC and exactly two ADCs prove the native carry chain + # is wired (one CLC for lo, ADC lo, ADC hi-with-carry); a regression + # to the SETCC path would show two CLCs and a bcc/cmp. + for expect in "tax" "rtl" "clc" "adc"; do + if ! printf '%s\n' "$disasmI32" | grep -qF "$expect"; then + warn "i32 add test missing: $expect" + printf '%s\n' "$disasmI32" >&2 + die "i32 add end-to-end test failed" + fi + done + nClc="$(printf '%s\n' "$disasmI32" | grep -cE '\bclc\b' || true)" + nAdc="$(printf '%s\n' "$disasmI32" | grep -cE '\badc\b' || true)" + nBcc="$(printf '%s\n' "$disasmI32" | grep -cE '\bbcc\b' || true)" + if [ "$nClc" != "1" ] || [ "$nAdc" != "2" ] || [ "$nBcc" != "0" ]; then + warn "i32 add carry-chain shape wrong (clc=$nClc adc=$nAdc bcc=$nBcc, want 1/2/0)" + printf '%s\n' "$disasmI32" >&2 + die "i32 add carry-chain regression" + fi + # Lock the post-StackSlotCleanup instruction count: should be ~11 for + # add32 (rep + pha + clc + adc + sta + txa + adc + tax + lda + ply + rtl + # — i32-first-arg in A:X means arg0_hi loads as TXA, no LDAfi). If + # this regresses meaningfully (say >14) the cleanup pass, the + # rematerialization flag, or the A:X first-arg ABI has been broken. + nInsns="$(printf '%s\n' "$disasmI32" | grep -cE '^[0-9a-f]+:' || true)" + if [ "$nInsns" -gt 14 ]; then + warn "i32 add bloat (got $nInsns insns, want <=14 — was 25 pre-cleanup, 11 post)" + printf '%s\n' "$disasmI32" >&2 + die "i32 add code-quality regression" + fi + # The A:X arg0 ABI moves arg0_hi out of the stack slot, so the + # asm should contain TXA (X→A for the hi-half ADC tied input) + # exactly once. A regression to "load arg0_hi from stack" would + # remove the TXA and add an extra LDA. + nTxa="$(printf '%s\n' "$disasmI32" | grep -cE '\btxa\b' || true)" + if [ "$nTxa" != "1" ]; then + warn "i32 add: expected exactly 1 txa (i32-first-arg-in-A:X path); got $nTxa" + printf '%s\n' "$disasmI32" >&2 + die "i32 add A:X first-arg ABI regression" + fi + + # i32 carry chain on two-Acc16 (no foldable load): exercises the + # ADD_RR + ADDE_RR custom-inserter path. fib32 has live a/b values + # the inserter must spill to a fresh slot; pre-fix this crashed at + # ISel with "Cannot select: adde reg, reg". 
+ log "check: clang compiles a 32-bit fib loop (ADDE_RR inserter path)" + cFibFile="$(mktemp --suffix=.c)" + sFibFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile"' EXIT + cat > "$cFibFile" <<'EOF' +unsigned long fib32(unsigned long n) { + unsigned long a = 0, b = 1, t; + while (n > 0) { t = a + b; a = b; b = t; n--; } + return a; +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cFibFile" -o "$sFibFile" 2>&1 >/dev/null; then + die "i32 fib (ADDE_RR inserter) failed to compile" + fi + if ! grep -qE '\bclc\b' "$sFibFile" || ! grep -qE '\badc\b' "$sFibFile"; then + warn "i32 fib output missing clc/adc" + die "i32 fib carry-chain regression" + fi + + # i32 multiply via __mulsi3 libcall: tests the multi-i16-return path + # (RetCC_W65816 assigning A then X for 2 i16 returns) plus the i32 + # arg push side. Pre-fix this hit "multi-return calls not yet + # supported (Ins.size=4)" when LowerCallTo split the i32 return. + log "check: clang compiles a long multiply via __mulsi3 libcall" + cMulFile="$(mktemp --suffix=.c)" + sMulFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile"' EXIT + cat > "$cMulFile" <<'EOF' +unsigned long mul32(unsigned long a, unsigned long b) { return a * b; } +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cMulFile" -o "$sMulFile" 2>&1 >/dev/null; then + die "i32 mul via __mulsi3 failed to compile" + fi + if ! grep -q '__mulsi3' "$sMulFile"; then + die "i32 mul did not emit __mulsi3 libcall" + fi + + # i32 shift-by-1 (SHL/SRL): the type-legalizer's SHL_PARTS / SRL_PARTS + # expansion needs `(srl x, 15)` or `(shl x, 15)` for the carry-cross- + # halves slot. Without inline patterns those fall to __lshrhi3 / + # __ashlhi3 libcalls (~10 byte overhead per shift). SRL15A and + # SHL15A pseudos handle them inline (`ASL/LSR; LDA #0; ROL/ROR`, + # 3 bytes). Verify the shift-by-1 output doesn't contain a hi3 + # libcall. + log "check: clang i32 shift-by-1 stays inline (no __lshrhi3 / __ashlhi3 libcall)" + cSh1File="$(mktemp --suffix=.c)" + sSh1File="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cSh1File" "$sSh1File"' EXIT + cat > "$cSh1File" <<'EOF' +unsigned long shl1(unsigned long a) { return a << 1; } +unsigned long shr1(unsigned long a) { return a >> 1; } +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cSh1File" -o "$sSh1File" 2>&1 >/dev/null; then + die "i32 shift-by-1 failed to compile" + fi + if grep -qE '__lshrhi3|__ashlhi3' "$sSh1File"; then + warn "i32 shift-by-1 still calling i16 shift libcall — SRL15A/SHL15A pattern not firing" + die "i32 shift-by-1 regression" + fi + + # Varargs (): LowerFormalArguments creates a fixed FI + # for the first vararg slot when IsVarArg; LowerVASTART stores + # its address to the va_list pointer. VAARG/VACOPY/VAEND use + # default LLVM expansions. Pre-fix this hit + # "vararg functions not yet supported" fatal error. 
+ log "check: clang compiles a vararg function ()" + cVaFile="$(mktemp --suffix=.c)" + sVaFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cSh1File" "$sSh1File" "$cVaFile" "$sVaFile"' EXIT + cat > "$cVaFile" <<'EOF' +#include +int sumArgs(int n, ...) { + va_list args; + va_start(args, n); + int sum = 0; + for (int i = 0; i < n; i++) sum += va_arg(args, int); + va_end(args); + return sum; +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cVaFile" -o "$sVaFile" 2>&1 >/dev/null; then + die "vararg function failed to compile" + fi + + # Stack-array LEA: `char arr[16]; arr[i] = ...` needs the address + # of an alloca'd object as an i16 value. Pre-fix this hit "Cannot + # select: FrameIndex<0>" because addr_fi only matches in load/store + # contexts. W65816DAGToDAGISel::Select now lowers a bare + # ISD::FrameIndex to ADDframe (FI, 0); eliminateFrameIndex expands + # ADDframe into TSC + CLC + ADC #disp. + log "check: clang takes the address of a stack-allocated array" + cAllocaFile="$(mktemp --suffix=.c)" + sAllocaFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile"' EXIT + cat > "$cAllocaFile" <<'EOF' +void writeBytes(char *out, char v) { + char tmp[8]; + for (int i = 0; i < 8; i++) tmp[i] = v + i; + for (int i = 0; i < 8; i++) out[i] = tmp[i]; +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cAllocaFile" -o "$sAllocaFile" 2>&1 >/dev/null; then + die "alloca'd-array address failed to compile" + fi + # The TSC; CLC; ADC #disp triple is the LEA expansion of ADDframe; + # at least one occurrence proves the pseudo wired through. + if ! grep -qE '^\s*tsc' "$sAllocaFile"; then + die "alloca'd-array LEA missing TSC (ADDframe expansion broken)" + fi + + # signed-byte arithmetic (`(int)(*p) - (int)(*q)` style — strcmp). + # Exercises three formerly-missing patterns: SEXTLOAD i16 from i8 + # (we Expand it to (sext (load))), sext_inreg i16 from i8 (the + # `((x & 0xFF) ^ 0x80) - 0x80` tablegen Pat), and extloadi8 from + # an Acc16 register pointer (LDAptr / "high byte don't care"). + log "check: clang compiles a signed-byte strcmp (sextload + sext_inreg + extload-via-ptr)" + cStrFile="$(mktemp --suffix=.c)" + sStrFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile" "$cStrFile" "$sStrFile"' EXIT + cat > "$cStrFile" <<'EOF' +int strcmp32(const char *a, const char *b) { + while (*a && *a == *b) { a++; b++; } + return (int)(*a) - (int)(*b); +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cStrFile" -o "$sStrFile" 2>&1 >/dev/null; then + die "signed-byte strcmp failed to compile" + fi + + # Indirect calls (function pointers). Lowered via the runtime + # trampoline at runtime/src/libgcc.s::__jsl_indir, which does + # JMP (__indirTarget) — caller stores target to __indirTarget then + # JSL __jsl_indir. Pre-fix, LowerCall reported a fatal error. 
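+  # Expected shape (per runtime/src/libgcc.s): `sta __indirTarget` to
+  # stash the pointer, the usual arg pushes, then `jsl __jsl_indir`;
+  # the two greps below check for both symbols.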
+ log "check: clang compiles an indirect call (via __jsl_indir trampoline)" + cIndFile="$(mktemp --suffix=.c)" + sIndFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile" "$cStrFile" "$sStrFile" "$cIndFile" "$sIndFile"' EXIT + cat > "$cIndFile" <<'EOF' +typedef int (*BinOp)(int, int); +int doOp(BinOp op, int x, int y) { return op(x, y); } +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cIndFile" -o "$sIndFile" 2>&1 >/dev/null; then + die "indirect call failed to compile" + fi + if ! grep -q '__indirTarget' "$sIndFile"; then + die "indirect call missing __indirTarget store" + fi + if ! grep -q '__jsl_indir' "$sIndFile"; then + die "indirect call missing JSL to __jsl_indir trampoline" + fi fi log "all smoke checks passed" diff --git a/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp b/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp index 24c28bd..45d2d2b 100644 --- a/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp +++ b/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp @@ -200,7 +200,13 @@ public: } bool isPCRel8() const { - return Kind == k_Addr && isConstant(Addr) && constFitsUnsigned(Addr, 8); + // Branch targets are typically symbols (resolved by the assembler / + // linker into the final 8-bit signed offset). Accept any address + // expression — constant in-range, or symbolic. Constants outside + // 8 bits are rejected so they fall through to PCRel16 / longer + // forms instead of silently overflowing. + return Kind == k_Addr && + (!isConstant(Addr) || constFitsUnsigned(Addr, 8)); } bool isPCRel16() const { return Kind == k_Addr && diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index 2000020..dea260c 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_target(W65816CodeGen W65816RegisterInfo.cpp W65816SelectionDAGInfo.cpp W65816Subtarget.cpp + W65816StackSlotCleanup.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp W65816MCInstLower.cpp diff --git a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp index a2cf6a6..a637fd5 100644 --- a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp +++ b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp @@ -66,6 +66,22 @@ public: return; } + // PCRel8 (Bxx / BRA) takes a signed 8-bit offset. If the resolved + // displacement won't fit, the encoded byte is meaningless — the + // branch would land somewhere unintended. Diagnose explicitly + // instead of silently truncating. + if (Fixup.getKind() == W65816::fixup_8_pcrel) { + int64_t Signed = static_cast(Value); + if (Signed < -128 || Signed > 127) { + getContext().reportError( + Fixup.getLoc(), + "branch target out of range for 8-bit PC-relative branch " + "(offset " + Twine(Signed) + " bytes); use a long branch (BRL) " + "or restructure the code"); + return; // don't patch — leave zero, error already issued + } + } + // Little-endian patch. 
for (unsigned i = 0; i < Width; ++i) { Data[Offset + i] = static_cast((Value >> (8 * i)) & 0xff); diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index ae90fc2..6a3bed6 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -20,15 +20,26 @@ namespace W65816CC { // 65816 branch condition codes. Encoded as i8 immediate operands in // the BR_CC SDNode and tablegen patterns. +// +// 0..7 map to single Bxx instructions. 8..11 are pseudo codes that +// expand to a two-branch sequence — needed for SETGT/SETLE/SETUGT/ +// SETULE when the operand we'd swap to LHS is a load (no +// pattern-match for load on LHS without spilling A). Only used in +// SELECT_CC16's custom inserter; never reaches a single Bxx. enum CondCode { COND_EQ = 0, // BEQ COND_NE = 1, // BNE COND_HS = 2, // BCS (unsigned >=) COND_LO = 3, // BCC (unsigned <) - COND_MI = 4, // BMI (negative) - COND_PL = 5, // BPL (non-negative) + COND_MI = 4, // BMI (negative, signed <) + COND_PL = 5, // BPL (non-negative, signed >=) COND_VS = 6, // BVS (overflow) COND_VC = 7, // BVC (no overflow) + // Multi-branch pseudo codes (handled by SELECT_CC16 inserter): + COND_GT_MB = 8, // signed > : take if (PL && NE) + COND_LE_MB = 9, // signed <= : take if (MI || EQ) + COND_HI_MB = 10, // unsigned > : take if (HS && NE) + COND_LS_MB = 11, // unsigned <=: take if (LO || EQ) COND_INVALID = -1 }; } // namespace W65816CC @@ -42,8 +53,15 @@ class PassRegistry; FunctionPass *createW65816ISelDag(W65816TargetMachine &TM, CodeGenOptLevel OptLevel); +// Post-RA cleanup: removes redundant STAfi+LDAfi same-slot pairs that +// the greedy allocator emits when materialising a COPY $a -> vreg as +// a spill/reload cycle, even though A still holds the value. See +// W65816StackSlotCleanup.cpp. +FunctionPass *createW65816StackSlotCleanup(); + void initializeW65816AsmPrinterPass(PassRegistry &); void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &); +void initializeW65816StackSlotCleanupPass(PassRegistry &); } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp index e5c9a4f..1cdcfdc 100644 --- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp +++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp @@ -82,6 +82,13 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; + case W65816::LDXi16imm: { + MCInst Ldx; + Ldx.setOpcode(W65816::LDX_Imm16); + Ldx.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering)); + EmitToStreamer(*OutStreamer, Ldx); + return; + } case W65816::LDAi16imm: { MCInst Lda; Lda.setOpcode(W65816::LDA_Imm16); @@ -126,6 +133,18 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Op); return; } + case W65816::ADCEi16imm: + case W65816::SBCEi16imm: { + // Chained ADC/SBC: no CLC/SEC prefix — the carry/borrow from the + // previous addc/adde/subc/sube is already in P. See ADCi16imm + // comment in W65816InstrInfo.td. + bool IsSub = MI->getOpcode() == W65816::SBCEi16imm; + MCInst Op; + Op.setOpcode(IsSub ? 
W65816::SBC_Imm16 : W65816::ADC_Imm16); + Op.addOperand(lowerOperand(MI->getOperand(2), MCInstLowering)); + EmitToStreamer(*OutStreamer, Op); + return; + } case W65816::ADCi8imm: case W65816::SBCi8imm: { bool IsSub = MI->getOpcode() == W65816::SBCi8imm; @@ -185,6 +204,16 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Op); return; } + case W65816::ADCEabs: + case W65816::SBCEabs: { + // Chained variant — no CLC/SEC prefix. + bool IsSub = MI->getOpcode() == W65816::SBCEabs; + MCInst Op; + Op.setOpcode(IsSub ? W65816::SBC_Abs : W65816::ADC_Abs); + Op.addOperand(lowerOperand(MI->getOperand(2), MCInstLowering)); + EmitToStreamer(*OutStreamer, Op); + return; + } case W65816::CMPi16imm: { // CMPi16imm has (outs), (ins Acc16:$lhs, i16imm:$rhs); MC needs only // the immediate. @@ -248,6 +277,18 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Jsl); return; } + case W65816::PUSH16: { + MCInst Pha; + Pha.setOpcode(W65816::PHA); + EmitToStreamer(*OutStreamer, Pha); + return; + } + case W65816::PUSH16X: { + MCInst Phx; + Phx.setOpcode(W65816::PHX); + EmitToStreamer(*OutStreamer, Phx); + return; + } case W65816::ASLA16: { MCInst Asl; Asl.setOpcode(W65816::ASL_A); @@ -275,6 +316,12 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { MCInst ror; ror.setOpcode(W65816::ROR_A); EmitToStreamer(*OutStreamer, ror); return; } + case W65816::XBA16: { + MCInst Xba; + Xba.setOpcode(W65816::XBA); + EmitToStreamer(*OutStreamer, Xba); + return; + } case W65816::INA_PSEUDO: { MCInst In; In.setOpcode(W65816::INA); @@ -305,6 +352,112 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Inc); return; } + case W65816::NEGC16: { + // (subc 0, x) — lo half of multi-precision negate. + // EOR #$FFFF; CLC; ADC #1. C-out = 1 iff result = 0 (i.e. x was 0), + // matching SBC's "no borrow" convention. + MCInst Eor; + Eor.setOpcode(W65816::EOR_Imm16); + Eor.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Eor); + MCInst Clc; + Clc.setOpcode(W65816::CLC); + EmitToStreamer(*OutStreamer, Clc); + MCInst Adc; + Adc.setOpcode(W65816::ADC_Imm16); + Adc.addOperand(MCOperand::createImm(1)); + EmitToStreamer(*OutStreamer, Adc); + return; + } + case W65816::SRL15A: { + // ASL A; LDA #0; ROL A — extract bit 15 to bit 0. + MCInst Asl; + Asl.setOpcode(W65816::ASL_A); + EmitToStreamer(*OutStreamer, Asl); + MCInst Lda; + Lda.setOpcode(W65816::LDA_Imm16); + Lda.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Lda); + MCInst Rol; + Rol.setOpcode(W65816::ROL_A); + EmitToStreamer(*OutStreamer, Rol); + return; + } + case W65816::SHL15A: { + // LSR A; LDA #0; ROR A — move bit 0 to bit 15. + MCInst Lsr; + Lsr.setOpcode(W65816::LSR_A); + EmitToStreamer(*OutStreamer, Lsr); + MCInst Lda; + Lda.setOpcode(W65816::LDA_Imm16); + Lda.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Lda); + MCInst Ror; + Ror.setOpcode(W65816::ROR_A); + EmitToStreamer(*OutStreamer, Ror); + return; + } + case W65816::SRL8A: { + // XBA; AND #$00FF — high byte to low byte, zero high. + MCInst Xba; + Xba.setOpcode(W65816::XBA); + EmitToStreamer(*OutStreamer, Xba); + MCInst And; + And.setOpcode(W65816::AND_Imm16); + And.addOperand(MCOperand::createImm(0x00FF)); + EmitToStreamer(*OutStreamer, And); + return; + } + case W65816::SHL8A: { + // XBA; AND #$FF00 — low byte to high byte, zero low. 
+ MCInst Xba; + Xba.setOpcode(W65816::XBA); + EmitToStreamer(*OutStreamer, Xba); + MCInst And; + And.setOpcode(W65816::AND_Imm16); + And.addOperand(MCOperand::createImm(0xFF00)); + EmitToStreamer(*OutStreamer, And); + return; + } + case W65816::SRA15A: { + // ASL A; LDA #0; ADC #-1; EOR #-1 — sign-fill from bit 15. + // ASL: C = bit 15 of input (the sign). + // LDA #0: A = 0, C unchanged. + // ADC #-1: A = 0 + (-1) + C = -1 + C. If C=1 (neg): A = 0; if + // C=0 (pos): A = -1. Inverted from what we want. + // EOR #-1: flip bits — A = -1 (neg) or 0 (pos), correct. + MCInst Asl; + Asl.setOpcode(W65816::ASL_A); + EmitToStreamer(*OutStreamer, Asl); + MCInst Lda; + Lda.setOpcode(W65816::LDA_Imm16); + Lda.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Lda); + MCInst Adc; + Adc.setOpcode(W65816::ADC_Imm16); + Adc.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Adc); + MCInst Eor; + Eor.setOpcode(W65816::EOR_Imm16); + Eor.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Eor); + return; + } + case W65816::NEGE16: { + // (sube 0, x) — hi half of multi-precision negate. + // EOR #$FFFF; ADC #0. Carry-in from the previous subc/sube is + // already in P; ADC #0 propagates it as ~x + C, which matches + // 0 - x - !C in two's complement. + MCInst Eor; + Eor.setOpcode(W65816::EOR_Imm16); + Eor.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Eor); + MCInst Adc; + Adc.setOpcode(W65816::ADC_Imm16); + Adc.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Adc); + return; + } } MCInst TmpInst; diff --git a/src/llvm/lib/Target/W65816/W65816CallingConv.td b/src/llvm/lib/Target/W65816/W65816CallingConv.td index 7bf96fb..82fc165 100644 --- a/src/llvm/lib/Target/W65816/W65816CallingConv.td +++ b/src/llvm/lib/Target/W65816/W65816CallingConv.td @@ -18,8 +18,10 @@ def RetCC_W65816 : CallingConv<[ // i8 values are returned in the 8-bit accumulator. CCIfType<[i8], CCAssignToReg<[A]>>, - // i16 values are returned in the 16-bit accumulator (same physical reg). - CCIfType<[i16], CCAssignToReg<[A]>> + // i16 values are returned in A; for a split i32 (legalizer produces + // two i16 returns), the second slot lands in X. LowerReturn / + // LowerCall hardcode the same A,X order — keep them in sync. + CCIfType<[i16], CCAssignToReg<[A, X]>> ]>; //===----------------------------------------------------------------------===// diff --git a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp index c4346d1..cffa52f 100644 --- a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp +++ b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp @@ -19,11 +19,52 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; +// "Wide" = needs to live in a 16-bit register at some point during the +// function body. i8 and i1 are fine in 8-bit M. Pointer operands that +// are constant addresses (globals, externs) are fine too — they're +// immediate operands of LDA/STA, not values held in A. A non-constant +// pointer (function arg, computed value) does need to sit in A as 16 +// bits for stack-relative-indirect addressing. 
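+// For illustration (not exhaustive): `char add8(char a, char b)` has no
+// wide values and can stay in 8-bit M, while `int deref(int *p)` (a
+// non-constant pointer that must sit in A) or any i16 arithmetic forces
+// the 16-bit-M prologue.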
+static bool isWideTyForMode(Type *T, const llvm::Value *V) {
+  if (!T || T->isVoidTy()) return false;
+  if (T->isIntegerTy(8) || T->isIntegerTy(1)) return false;
+  if (T->isPointerTy() && V && (isa<GlobalValue>(V) || isa<ConstantExpr>(V)))
+    return false;
+  return true;
+}
+
+// Some IR ops, even when their visible types are all i8, lower to
+// sequences that need 16-bit M during execution: signed compares (via
+// SEXT to i16 + cmp), variable shifts (libcall via i16-promoted args),
+// constant shifts > 4 (also routed through i16 via LowerShift), and
+// any sext of an i8 (synthesized via i16 mask-and-subtract ops).
+// Detect those here so the prologue picks 16-bit M up front.
+static bool instrLowersToWide(const Instruction &I) {
+  if (auto *Cmp = dyn_cast<ICmpInst>(&I)) {
+    if (Cmp->isSigned() &&
+        Cmp->getOperand(0)->getType()->isIntegerTy(8))
+      return true;
+  }
+  if (isa<SExtInst>(&I) &&
+      I.getOperand(0)->getType()->isIntegerTy(8))
+    return true;
+  unsigned Op = I.getOpcode();
+  if ((Op == Instruction::Shl || Op == Instruction::LShr ||
+       Op == Instruction::AShr) &&
+      I.getType()->isIntegerTy(8))
+    return true;
+  return false;
+}
+
 W65816FrameLowering::W65816FrameLowering(const W65816Subtarget &STI)
     : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(1), 0,
                           Align(1)) {}
@@ -54,39 +95,33 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
-  // Heuristic: scan the function body for any value with i8 type.
-  // Captures both signature types and internal i8 ops (e.g. a void
-  // function that loads / stores bytes). An eventual full
-  // mode-dependence analysis (the REP/SEP pass) will replace this.
-  bool UsesAcc8 = false;
+  // Heuristic: choose 8-bit M (REP #$10 + SEP #$20) only for "pure-i8"
+  // functions — those whose signature and body use no type wider than
+  // i8 (no i16 ops, no pointers). Any wider type forces 16-bit M
+  // (REP #$30) since pointer dereferences and stack-relative addressing
+  // need M=1 to load/store 16 bits at a time. In 16-bit M functions,
+  // individual i8 ops are wrapped with SEP/REP at the pseudo level.
+  // A future REP/SEP scheduling pass (design doc 3.3) will replace
+  // this whole-function decision with a per-region one.
   const Function &F = MF.getFunction();
-  auto isI8 = [](Type *T) { return T && T->isIntegerTy(8); };
-  if (isI8(F.getReturnType()))
-    UsesAcc8 = true;
+  bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
   for (const Argument &Arg : F.args()) {
-    if (isI8(Arg.getType())) {
-      UsesAcc8 = true;
-      break;
-    }
+    if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
   }
-  if (!UsesAcc8) {
+  if (!HasWide) {
     for (const BasicBlock &BB : F) {
-      if (UsesAcc8) break;
+      if (HasWide) break;
       for (const Instruction &I : BB) {
-        if (isI8(I.getType())) {
-          UsesAcc8 = true;
-          break;
-        }
+        if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
+        if (instrLowersToWide(I)) { HasWide = true; break; }
         for (const Value *Op : I.operands()) {
-          if (isI8(Op->getType())) {
-            UsesAcc8 = true;
-            break;
-          }
+          if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
         }
-        if (UsesAcc8) break;
+        if (HasWide) break;
      }
    }
  }
+  bool UsesAcc8 = !HasWide;
   (void)MRI;
 
   if (UsesAcc8) {
@@ -96,17 +131,47 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
     BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x30);
   }
 
-  // Reserve stack space for locals/spills if any. Sequence is
-  // `TSC ; SEC ; SBC #N ; TCS` to subtract N from S in 16-bit mode.
- // Skipped for i8 functions for now since the stack adjustment uses - // the 16-bit accumulator (would need a save/restore around it). + // Reserve stack space for locals/spills. + // + // Critical: arg0 lives in A on entry, so the prologue MUST NOT + // clobber A. The naive `TSC; SEC; SBC #N; TCS` sequence destroys A + // (TSC overwrites A with SP) — used to silently corrupt arg0 in + // every function with a stack frame, until this fix. + // + // Strategy (16-bit M): + // - Small frames (N <= 14 bytes): use N/2 `PHA` instructions. PHA + // pushes A's value (whatever it is — including arg0) and only + // decrements S. A is not modified. N/2 bytes of code per call. + // Side-effect: the bytes pushed contain copies of arg0; the body's + // regalloc-inserted spills may overwrite them, which is fine. + // - Larger frames: TAY/TSC/.../TYA — 8 bytes total, preserves A + // through Y as a temporary. Y is caller-saved by our (loose) ABI. + // + // Strategy (8-bit M): PHA in 8-bit M pushes 1 byte, so N PHAs for + // N bytes. Without this, spills land on top of the return address + // and corrupt it (was a latent silent crash for 8-bit M functions + // that needed any spilling). uint64_t StackSize = MF.getFrameInfo().getStackSize(); - if (StackSize > 0 && !UsesAcc8) { - BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); - BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC)); - BuildMI(MBB, MBBI, DL, TII.get(W65816::SBC_Imm16)) - .addImm(StackSize); - BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS)); + if (StackSize > 0) { + if (UsesAcc8) { + // 8-bit M: 1 PHA per byte. Preserves A. + for (uint64_t i = 0; i < StackSize; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA)); + } else if (StackSize <= 14 && (StackSize % 2) == 0) { + // 16-bit M, small frame: N/2 PHAs. Preserves A. + for (uint64_t i = 0; i < StackSize / 2; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA)); + } else { + // 16-bit M, larger frame: TAY/TSC/.../TYA bracket. Preserves A + // via Y as a temp. + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::SBC_Imm16)) + .addImm(StackSize); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); + } } } @@ -124,25 +189,90 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF, // Insert before the terminator (the return). DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Mirror the prologue's pure-i8 detection: skip the 16-bit stack + // adjustment only if the function ran in 8-bit M (no wide types + // anywhere). const Function &F = MF.getFunction(); - bool UsesAcc8 = F.getReturnType()->isIntegerTy(8); - if (!UsesAcc8) { + bool HasWide = isWideTyForMode(F.getReturnType(), nullptr); + if (!HasWide) { for (const Argument &Arg : F.args()) { - if (Arg.getType()->isIntegerTy(8)) { UsesAcc8 = true; break; } + if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; } } } - if (UsesAcc8) return; // Cannot 16-bit math while in 8-bit mode. + if (!HasWide) { + for (const BasicBlock &BB : F) { + if (HasWide) break; + for (const Instruction &I : BB) { + if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; } + if (instrLowersToWide(I)) { HasWide = true; break; } + for (const Value *Op : I.operands()) { + if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; } + } + if (HasWide) break; + } + } + } + // 8-bit M epilogue. 
Save A in Y(low) via TAY, pop N bytes via N + // PLAs (each pops 1 byte in 8-bit M), restore A via TYA. Y is + // caller-saved by our ABI so we can use it freely. Total cost: + // N + 2 bytes per epilogue. + if (!HasWide) { + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A in Y + for (uint64_t i = 0; i < StackSize; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PLA)); // pop frame bytes + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A from Y + return; + } + // 16-bit M epilogue. Mirror the prologue: A holds the return value + // at this point and MUST be preserved. Small frames release via + // N/2 PLY (pop into Y, discard); larger frames use + // TAY/TSC/CLC/ADC #N/TCS/TYA. + if (StackSize <= 14 && (StackSize % 2) == 0) { + for (uint64_t i = 0; i < StackSize / 2; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PLY)); + return; + } + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); BuildMI(MBB, MBBI, DL, TII.get(W65816::CLC)); BuildMI(MBB, MBBI, DL, TII.get(W65816::ADC_Imm16)) .addImm(StackSize); BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); } MachineBasicBlock::iterator W65816FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - // Drop ADJCALLSTACKDOWN/UP with no replacement for now. + const W65816Subtarget &STI = MF.getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + + // ADJCALLSTACKDOWN does nothing — we push args via PUSH16/PHA which + // implicitly decrements SP, so no separate adjustment is needed. + // ADJCALLSTACKUP releases all the pushed bytes after a call. + // + // Critical: A holds the callee's return value here, so this MUST NOT + // clobber A. The naive `tsc;clc;adc #N;tcs` does (TSC overwrites A), + // which silently corrupts every call's return value. Same fix as the + // epilogue: small N via PLY (clobbers Y, preserves A); larger N via + // TAY/.../TYA bracket. + if (I->getOpcode() == W65816::ADJCALLSTACKUP) { + int N = I->getOperand(0).getImm(); + if (N > 0) { + DebugLoc DL = I->getDebugLoc(); + if (N <= 14 && (N % 2) == 0) { + for (int i = 0; i < N / 2; ++i) + BuildMI(MBB, I, DL, TII.get(W65816::PLY)); + } else { + BuildMI(MBB, I, DL, TII.get(W65816::TAY)); + BuildMI(MBB, I, DL, TII.get(W65816::TSC)); + BuildMI(MBB, I, DL, TII.get(W65816::CLC)); + BuildMI(MBB, I, DL, TII.get(W65816::ADC_Imm16)).addImm(N); + BuildMI(MBB, I, DL, TII.get(W65816::TCS)); + BuildMI(MBB, I, DL, TII.get(W65816::TYA)); + } + } + } return MBB.erase(I); } diff --git a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp index 108a36c..84c8bfe 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp +++ b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp @@ -71,17 +71,52 @@ void W65816DAGToDAGISel::Select(SDNode *Node) { return; } - // Defer to the auto-generated selector for everything else. Custom - // selection paths (frame-index, wrapper, etc.) will land here later. + // Custom selection: bare FrameIndex SDValue used as an i16 pointer + // value (e.g. `&arr[0]` for a stack-allocated array). The + // auto-generated selector has no pattern for `(i16 frameindex)` + // because tablegen doesn't expose FrameIndex as a leaf type — so + // ISel fails with "Cannot select: FrameIndex" before ever reaching + // a load/store-context fold. 
Convert it to ADDframe (FI, 0); the
+  // frame-index elimination pass turns ADDframe into TSC + CLC + ADC
+  // #(offset+stackSize), producing SP+offset in A.
+  if (Node->getOpcode() == ISD::FrameIndex) {
+    SDLoc DL(Node);
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+    SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i16);
+    CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero);
+    return;
+  }
+
+  // Defer to the auto-generated selector for everything else.
   SelectCode(Node);
 }
 
 bool W65816DAGToDAGISel::SelectFrameIndex(SDValue N, SDValue &Base,
                                           SDValue &Offset) {
+  // Bare FrameIndex: offset 0.
   if (auto *FIN = dyn_cast<FrameIndexSDNode>(N)) {
     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i16);
     Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i16);
     return true;
   }
+  // (add FrameIndex, const): fold the const into the memfi offset.
+  // Type legalization emits this shape when splitting a multi-byte
+  // load/store at a stack slot into multiple smaller loads (e.g. an
+  // i32 spill becomes two i16 loads, with the high load at FI+2).
+  // Without this, the bare FrameIndex inside the add is left as an
+  // unmatched i16 leaf and ISel reports "Cannot select FrameIndex".
+  if (N.getOpcode() == ISD::ADD) {
+    SDValue LHS = N.getOperand(0);
+    SDValue RHS = N.getOperand(1);
+    if (auto *FIN = dyn_cast<FrameIndexSDNode>(LHS)) {
+      if (auto *CN = dyn_cast<ConstantSDNode>(RHS)) {
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i16);
+        Offset = CurDAG->getTargetConstant(CN->getSExtValue(),
+                                           SDLoc(N), MVT::i16);
+        return true;
+      }
+    }
+  }
   return false;
 }
diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
index 7c5de40..7a7f379 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
@@ -12,10 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "W65816ISelLowering.h"
+#include "W65816InstrInfo.h"
+#include "W65816MachineFunctionInfo.h"
 #include "W65816SelectionDAGInfo.h"
 #include "W65816Subtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -53,20 +56,121 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 
-  // SELECT / SELECT_CC: leave as default for now. Expanding either
-  // currently infinite-loops because they cross-expand into each other
-  // without a base case. Custom lowering to a Bxx + branch + phi
-  // pattern is the right fix; tracked separately.
+  // SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC
+  // pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter
+  // expands into a Bxx + diamond CFG + PHI. SETCC funnels through the
+  // same path with TVal=1 / FVal=0. SELECT (no condition operand) is
+  // expanded to SELECT_CC by the legalizer using SETNE against zero.
+ setOperationAction(ISD::SETCC, MVT::i16, Custom); + setOperationAction(ISD::SETCC, MVT::i8, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); + setOperationAction(ISD::SELECT, MVT::i16, Expand); + setOperationAction(ISD::SELECT, MVT::i8, Expand); + // 65816 has no inline sign-extend instruction; synthesize i8 -> i16 + // via a bit-7 test and SELECT_CC (see LowerSignExtend). + setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom); + + // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare + // LDA for the anyext case). No native sextload; mark it Expand so + // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`, + // which then flows through LowerSignExtend's branchless 3-insn + // sequence (AND #$00FF; EOR #$0080; SEC; SBC #$0080). + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); + + // Vararg support: VASTART writes the address of the first vararg slot + // to the va_list pointer. VAARG/VACOPY/VAEND use the default + // expansions that load through that pointer and bump it. This makes + // -style functions (e.g. printf-likes) compile cleanly. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); // The 65816 has no hardware multiplier or divider. Multiply by a // power-of-two constant is auto-rewritten to shifts by the DAG - // combiner; arbitrary multiply / divide / mod fail to select today. - // Real support needs (a) library functions (`__mulhi3` etc.) and - // (b) multi-arg call lowering — both are tracked separately. + // combiner; arbitrary multiply / divide / mod go through libcalls + // (`__mulhi3` for i16 multiply etc.). The libcall expander emits a + // standard CALL node which flows through LowerCall, so multi-arg + // call lowering must be working first (it is, see task #26). setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); + setOperationAction(ISD::MUL, MVT::i16, LibCall); + setOperationAction(ISD::SDIV, MVT::i16, LibCall); + setOperationAction(ISD::UDIV, MVT::i16, LibCall); + setOperationAction(ISD::SREM, MVT::i16, LibCall); + setOperationAction(ISD::UREM, MVT::i16, LibCall); + setOperationAction(ISD::SDIVREM, MVT::i16, Expand); + setOperationAction(ISD::UDIVREM, MVT::i16, Expand); + + // Variable-amount and large-constant shifts. We have inline + // patterns for shift-by-1..4; everything else goes through + // __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom + // lets us return SDValue() for the fast cases and route everything + // else through the libcall lowering helper. + setOperationAction(ISD::SHL, MVT::i16, Custom); + setOperationAction(ISD::SRL, MVT::i16, Custom); + setOperationAction(ISD::SRA, MVT::i16, Custom); + // i8 shifts go through Custom too — LowerShift detects the i8 result + // and routes through trunc(i16-shift(zext_or_sext(lhs), amount)). + // Avoids needing a parallel set of qi3 libcalls. 
+ setOperationAction(ISD::SHL, MVT::i8, Custom); + setOperationAction(ISD::SRL, MVT::i8, Custom); + setOperationAction(ISD::SRA, MVT::i8, Custom); + + // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying + // the carry/borrow flag between the two halves of a multi-precision add or + // sub. Setting them Legal triggers the type legalizer's carry-chain split + // for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions) + // instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions). + // The matching tablegen pseudos add Defs/Uses on the P register, which + // tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically. + setOperationAction(ISD::ADDC, MVT::i16, Legal); + setOperationAction(ISD::ADDE, MVT::i16, Legal); + setOperationAction(ISD::SUBC, MVT::i16, Legal); + setOperationAction(ISD::SUBE, MVT::i16, Legal); + + // i32 (long). Type legalization splits i32 into two i16 halves; with + // ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain. + // AND/OR/XOR split cleanly into per-half ops with no carry to track. + // Multiply/divide/shift go through libcall stubs whose + // implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS + // / SRA_PARTS are the SDNodes the type legalizer emits when splitting + // a variable-amount shift; without an action they get "Cannot select". + // LibCall on the parent node routes the whole shift through one + // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and + // simpler than implementing a 32-bit shift in 65816 assembly inline. + for (MVT VT : {MVT::i32}) { + setOperationAction(ISD::MUL, VT, LibCall); + setOperationAction(ISD::SDIV, VT, LibCall); + setOperationAction(ISD::UDIV, VT, LibCall); + setOperationAction(ISD::SREM, VT, LibCall); + setOperationAction(ISD::UREM, VT, LibCall); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + // i32 shifts route through a libcall via the + // preferredShiftLegalizationStrategy override (see header). No + // explicit SHL/SHL_PARTS action needed — the override forces the + // type-legalizer's libcall path before SHL_PARTS would be emitted. + } + + // Disable jump tables. Generating them costs us BRIND (indirect + // branch via 16-bit pointer load), which we don't have. A long + // if-else chain compiles fine without them. Setting the threshold + // to UINT_MAX makes LLVM never form a jump table. + setMinimumJumpTableEntries(UINT_MAX); + + // Opt into PerformDAGCombine on LOAD nodes — needed for the + // address-select reverse combine (see W65816TargetLowering:: + // PerformDAGCombine). + setTargetDAGCombine(ISD::LOAD); } // Map an LLVM SETCC condition to a W65816 branch. Returns the condition @@ -94,14 +198,32 @@ static W65816CC::CondCode mapCC(ISD::CondCode CC) { } } -SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - SDLoc DL(Op); +// If both compare operands are i8, widen them to i16 so the existing +// i16 CMP path can handle them. Use ZEXT for unsigned/eq/ne CCs and +// SEXT for signed CCs — picking the wrong extension would invert the +// answer (e.g. 
-1i8 sext to 0xFFFF compares < 1 signed; zext to 0x00FF
+// compares > 1 unsigned, which would flip a signed less-than).
+static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
+                         SelectionDAG &DAG, const SDLoc &DL) {
+  if (LHS.getValueType() != MVT::i8) return;
+  unsigned Ext;
+  switch (CC) {
+  case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE:
+    Ext = ISD::SIGN_EXTEND; break;
+  default:
+    Ext = ISD::ZERO_EXTEND; break; // unsigned + eq/ne
+  }
+  LHS = DAG.getNode(Ext, DL, MVT::i16, LHS);
+  RHS = DAG.getNode(Ext, DL, MVT::i16, RHS);
+}
+
+// Normalize a (LHS, RHS, CC) triple so the result is something we can
+// emit with one CMP + Bxx. Returns the W65816 condition code; updates
+// LHS/RHS/CC in place. Returns COND_INVALID on failure.
+static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
+                                      ISD::CondCode &CC, SelectionDAG &DAG,
+                                      const SDLoc &DL) {
+  promoteI8Cmp(LHS, RHS, CC, DAG, DL);
   // CMP wants the comparand (constant or memory) on the right. If a DAG
   // pre-pass put the constant on the left, swap and flip the condition.
   if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
@@ -110,11 +232,10 @@ SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Rewrite SETULE / SETUGT / SETLE / SETGT to SETULT / SETUGE / SETLT /
-  // SETGE with constant +/- 1. This keeps the variable on the LHS (so
-  // our pattern matches) and lets us use the BCS / BCC / BMI / BPL
-  // mnemonics natively. Only valid when the constant is not at its
-  // signed/unsigned boundary; for now we just bail in that pathological
-  // case.
+  // SETGE with constant +/- 1. Keeps the variable on the LHS and lets
+  // us use BCS / BCC / BMI / BPL natively. Only valid when the constant
+  // is not at its signed/unsigned boundary; we bail in that pathological
+  // case for now.
   if (auto *RhsConst = dyn_cast<ConstantSDNode>(RHS)) {
     int64_t V = RhsConst->getSExtValue();
     if (CC == ISD::SETULE && (uint64_t)V < 0xffff) {
@@ -132,35 +253,214 @@ SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
-  // Final fallback: any condition we didn't handle yet might still be
-  // representable by swapping operands (e.g. SETUGT b a → SETULT a b).
-  // Try once if the direct map doesn't recognise it.
   W65816CC::CondCode TCC = mapCC(CC);
   if (TCC == W65816CC::COND_INVALID) {
-    std::swap(LHS, RHS);
-    CC = ISD::getSetCCSwappedOperands(CC);
-    TCC = mapCC(CC);
+    // Try swapping operands first — preferable since it leaves us with
+    // a single-Bxx form. But reject the swap if it would put a load on
+    // the LHS (we can't pattern-match cmp(load,reg) without spilling A).
+    bool RhsIsLoad = isa<LoadSDNode>(RHS.getNode());
+    bool LhsIsLoad = isa<LoadSDNode>(LHS.getNode());
+    bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad;
+    if (!SwapWouldHurt) {
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+      TCC = mapCC(CC);
+    }
   }
+  // Final fallback: GT/LE/UGT/ULE without a useful swap target. Use a
+  // multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it
+  // into a 3-BB diamond. Only valid for SELECT_CC, not for BR_CC —
+  // LowerBR_CC re-routes those through SETCC + BR_CC NE.
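+  // Rough example of when this fires: for `a > b` with both sides loaded
+  // from memory, the swap above is rejected (it would put a load on the
+  // LHS), so SETGT falls through to COND_GT_MB and the SELECT_CC16
+  // inserter emits the BEQ-then-BPL pair listed in getMultiBranch().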
+  if (TCC == W65816CC::COND_INVALID) {
+    switch (CC) {
+    case ISD::SETGT:  TCC = W65816CC::COND_GT_MB; break;
+    case ISD::SETLE:  TCC = W65816CC::COND_LE_MB; break;
+    case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break;
+    case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break;
+    default: break;
+    }
+  }
+  return TCC;
+}
+
+SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc DL(Op);
+  EVT VT = LHS.getValueType();
+
+  W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
   if (TCC == W65816CC::COND_INVALID)
     report_fatal_error("W65816: branch condition not yet implemented");
 
+  // Multi-branch CCs only have inserter support via SELECT_CC16. For
+  // BR_CC, reroute through SETCC: materialise the boolean to A, then
+  // branch on NE-vs-zero. One extra LDA but always works.
+  if (TCC >= W65816CC::COND_GT_MB) {
+    SDValue Bool = DAG.getNode(ISD::SETCC, DL, VT, LHS, RHS,
+                               DAG.getCondCode(CC));
+    SDValue Zero = DAG.getConstant(0, DL, VT);
+    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
+                       DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest);
+  }
+
   SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
   SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
   return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CCOp,
                      Glue);
 }
 
+SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  // setcc lhs, rhs, cc -> select_cc lhs, rhs, 1, 0, cc.
+  // The SELECT_CC then re-enters LowerOperation and we lower it via the
+  // diamond-CFG path. setBooleanContents(ZeroOrOne) means callers see
+  // the result as a clean 0/1 value.
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue One = DAG.getConstant(1, DL, VT);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, One, Zero,
+                     DAG.getCondCode(CC));
+}
+
+SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TVal = Op.getOperand(2);
+  SDValue FVal = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc DL(Op);
+
+  W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
+  if (TCC == W65816CC::COND_INVALID)
+    report_fatal_error("W65816: select_cc condition not yet implemented");
+
+  SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
+  SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {TVal, FVal, CCOp, Glue};
+  return DAG.getNode(W65816ISD::SELECT_CC, DL, VTs, Ops);
+}
+
+// i8 -> i16 sign extend. Branchless trick:
+//   sext(x) = ((x & 0xFF) ^ 0x80) - 0x80
+// Verify: x=0x00 -> 0x80 - 0x80 = 0x0000. x=0x7F -> 0xFF - 0x80 = 0x7F.
+// x=0x80 -> 0x00 - 0x80 = 0xFF80 (-128). x=0xFF -> 0x7F - 0x80
+// = 0xFFFF (-1).
+// Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080 (10 bytes total,
+// no branches, no temp slots — much cheaper than the SELECT_CC diamond
+// version that produced ~14 instructions plus stack spills).
+SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue X = Op.getOperand(0);
+  if (X.getValueType() != MVT::i8 || Op.getValueType() != MVT::i16)
+    return SDValue();
+  SDLoc DL(Op);
+  SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, X);
+  SDValue Sign = DAG.getConstant(0x0080, DL, MVT::i16);
+  SDValue Xor = DAG.getNode(ISD::XOR, DL, MVT::i16, Z, Sign);
+  return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
+}
+
+// VASTART: store the address of the first vararg slot (recorded by
+// LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer.
+// va_list is just `i16 *next` here — minimum implementation.
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
+  SDLoc DL(Op);
+  // Address of the first vararg slot.
+  SDValue VAFI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                   MVT::i16);
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAListPtr = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Chain, DL, VAFI, VAListPtr, MachinePointerInfo(SV));
+}
+
 SDValue W65816TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   case ISD::GlobalAddress:  return LowerGlobalAddress(Op, DAG);
   case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
   case ISD::BR_CC:          return LowerBR_CC(Op, DAG);
+  case ISD::SETCC:          return LowerSETCC(Op, DAG);
+  case ISD::SELECT_CC:      return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND:    return LowerSignExtend(Op, DAG);
+  case ISD::VASTART:        return LowerVASTART(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:            return LowerShift(Op, DAG);
   default:
     llvm_unreachable("W65816: unexpected operation in LowerOperation");
   }
 }
 
+SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+  // i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
+  // (preserves sign for arithmetic right shift); SHL/SRL via ZEXT
+  // (logical / left shifts don't care about high bits). This routes
+  // i8 shifts through the same i16 fast paths and libcalls — no
+  // parallel qi3 libcall set needed.
+  if (Op.getValueType() == MVT::i8) {
+    SDLoc DL(Op);
+    SDValue X = Op.getOperand(0);
+    SDValue N = Op.getOperand(1);
+    unsigned Ext = (Op.getOpcode() == ISD::SRA) ? ISD::SIGN_EXTEND
+                                                : ISD::ZERO_EXTEND;
+    SDValue X16 = DAG.getNode(Ext, DL, MVT::i16, X);
+    SDValue N16 = N.getValueType() == MVT::i16
+                      ? N
+                      : DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N);
+    SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
+  }
+  // Fast path: shift-by-{1,2,3,4} have inline tablegen patterns. Return
+  // Op (the unchanged node) so the legalizer leaves it alone — the
+  // pattern matcher catches it later. Returning SDValue() instead
+  // would fall through to the generic Expand path, which generates a
+  // BUILD_VECTOR-based magic-constant rewrite that we can't lower.
+  // Also allow `(srl x, 15)` through — pattern SRL15A handles it as
+  // `ASL A; LDA #0; ROL A` (3 instructions), much shorter than the
+  // libcall. The type-legalizer's i32-shift-by-1 expansion emits this
+  // exact node for the high-half "bit-from-low" slot.
+  // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3).
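+  // For example (assuming the inline patterns fire as described): `x << 3`
+  // is expected to stay inline as repeated ASL A, while `x << 5` or a
+  // variable-amount shift becomes an __ashlhi3 call.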
+ SDValue Amount = Op.getOperand(1); + if (auto *C = dyn_cast(Amount)) { + uint64_t N = C->getZExtValue(); + if (N >= 1 && N <= 4) + return Op; + if ((N == 15 || N == 8) && + (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL)) + return Op; + if (N == 15 && Op.getOpcode() == ISD::SRA) + return Op; + } + + RTLIB::Libcall LC; + switch (Op.getOpcode()) { + case ISD::SHL: LC = RTLIB::SHL_I16; break; + case ISD::SRL: LC = RTLIB::SRL_I16; break; + case ISD::SRA: LC = RTLIB::SRA_I16; break; + default: llvm_unreachable("not a shift"); + } + + // makeLibCall wants the args as TargetLowering::ArgListEntry; the + // simpler getNode form is to manually build the call. But the + // makeLibCall helper handles the calling convention. + SmallVector Args = {Op.getOperand(0), Op.getOperand(1)}; + TargetLowering::MakeLibCallOptions Opts; + Opts.setIsSigned(Op.getOpcode() == ISD::SRA); + return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first; +} + SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { auto *GA = cast(Op); @@ -195,13 +495,19 @@ SDValue W65816TargetLowering::LowerFormalArguments( // (low addr) <- (1,S) // // Each i16 stack arg occupies 2 bytes. arg 1 lives at (4,S). - if (IsVarArg) - report_fatal_error("W65816: vararg functions not yet supported"); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + // i32 first-arg ABI: if the first original argument is i32 (the + // type legalizer split it into two i16 InputArgs both with + // OrigArgIndex == 0), pass it in A:X (lo:hi) — matching the i32 + // return ABI (also A:X). Saves one stack slot for the i32 arg. + bool I32FirstArg = + Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && + Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0; + unsigned ArgIdx = 0; // Stack offset is measured from S+1 (the WDC convention) and grows // upward as we walk through the stack-passed args. @@ -217,31 +523,52 @@ SDValue W65816TargetLowering::LowerFormalArguments( VT == MVT::i16 ? &W65816::Acc16RegClass : &W65816::Acc8RegClass); MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); + } else if (ArgIdx == 1 && I32FirstArg) { + // i32 first-arg hi half: in X. + Register VReg = MRI.createVirtualRegister(&W65816::Idx16RegClass); + MRI.addLiveIn(W65816::X, VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); } else { - // Subsequent args are loaded from the stack. Use a fixed frame - // object positioned at the absolute stack offset; the - // eliminateFrameIndex pass turns it into LDA d,S. - unsigned ObjSize = (VT == MVT::i16) ? 2 : 1; + // Subsequent args are loaded from the stack. i8 args are + // promoted to i16 slots (matching CC_W65816's CCPromoteToType) + // so the load can run in the function's default 16-bit M mode + // without needing a per-byte SEP/REP wrap; we then truncate the + // i16 back to i8 for the IR. i16 args are loaded directly. 
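+      // Illustrative layout (following the (4,S) convention above): for
+      // `int f(int a, char b, int c)`, a arrives in A, b occupies a
+      // promoted 2-byte slot at (4,S), and c follows at (6,S).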
+      unsigned ObjSize = 2;
       int FI = MFI.CreateFixedObject(ObjSize, StackOffset,
                                      /*Immutable*/true);
       StackOffset += ObjSize;
       SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
-      InVals.push_back(DAG.getLoad(
-          VT, DL, Chain, FIN,
-          MachinePointerInfo::getFixedStack(MF, FI)));
+      SDValue Val = DAG.getLoad(
+          MVT::i16, DL, Chain, FIN,
+          MachinePointerInfo::getFixedStack(MF, FI));
+      if (VT == MVT::i8)
+        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
+      InVals.push_back(Val);
     }
     ++ArgIdx;
   }
+
+  // Vararg support: stash the FrameIndex of the next stack-arg slot
+  // (where the caller's first vararg lives) so VASTART can use it
+  // as the va_list start. StackOffset has been advanced past every
+  // named stack arg; the first vararg sits at SP + StackOffset.
+  if (IsVarArg) {
+    int FI = MFI.CreateFixedObject(2, StackOffset, /*Immutable=*/true);
+    auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
+    FuncInfo->setVarArgsFrameIndex(FI);
+  }
+
   return Chain;
 }
 
 SDValue
 W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
-  // Single-arg version: arg 0 in A; LowerFormalArguments accepts
-  // additional args via the stack, but this side doesn't yet emit the
-  // pushes. Multi-arg call lowering wants a PUSHA pseudo with proper
-  // SP unwinding via TSC/ADC #N/TCS in the ADJCALLSTACKUP pseudo —
-  // tracked separately.
+  // Multi-arg ABI: arg 0 in A, args 1..N-1 pushed in REVERSE order via
+  // PUSH16 (PHA) so the callee's `(4,S)` reads pick up arg 1, `(6,S)`
+  // gets arg 2, etc. CALLSEQ_START records the byte count;
+  // ADJCALLSTACKUP after the call releases the pushed bytes without
+  // clobbering A (PLY for small counts, a TAY/TSC/CLC/ADC/TCS/TYA
+  // bracket otherwise; see eliminateCallFramePseudoInstr).
   SelectionDAG &DAG = CLI.DAG;
   SDLoc &DL = CLI.DL;
   SDValue Chain = CLI.Chain;
@@ -252,16 +579,116 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (CLI.IsTailCall)
     CLI.IsTailCall = false;
 
-  if (Outs.size() > 1)
-    report_fatal_error("W65816: multi-argument calls not yet supported");
-  if (Ins.size() > 1)
+  // Up to 2 return values: i8/i16 in A, or split i32 in A:X. The
+  // result-read loop at the end of this function honors the same
+  // ordering as LowerReturn.
+  if (Ins.size() > 2)
     report_fatal_error("W65816: multi-return calls not yet supported");
 
-  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+  // Indirect calls (function pointers): redirect through the runtime
+  // trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
+  // we store the dynamic target to a global (`__indirTarget`), then
+  // JSL the trampoline, which immediately does `JMP (__indirTarget)`.
+  // The target's RTL pops the original JSL's return frame and returns
+  // straight back to the caller — no double-RTL or extra frame.
+  // Caveat: single-bank only (JMP indirect is bank-local).
+  bool IsIndirect = !isa<GlobalAddressSDNode>(Callee) &&
+                    !isa<ExternalSymbolSDNode>(Callee);
+  if (IsIndirect) {
+    // Store the dynamic target to __indirTarget *before* any other
+    // setup, since pushing args clobbers A. STAabs takes an
+    // ExternalSymbol-wrapped address operand.
+    SDValue TargetSym = DAG.getTargetExternalSymbol("__indirTarget",
+                                                    MVT::i16);
+    SDValue WrappedSym = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
                                      TargetSym);
+    Chain = DAG.getStore(Chain, DL, Callee, WrappedSym,
+                         MachinePointerInfo());
+    // Replace the callee with __jsl_indir for the actual JSL.
+ Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16); + } + for (const ISD::OutputArg &O : Outs) { + if (O.VT != MVT::i16 && O.VT != MVT::i8) + report_fatal_error("W65816: argument type not yet supported"); + } + + // i32 first-arg ABI: if Outs[0] and Outs[1] are halves of the same + // original i32 first arg (OrigArgIndex == 0), pass them in A:X. + bool I32FirstArg = + Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 && + Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0; + unsigned FirstStackArg = I32FirstArg ? 2 : 1; + + // i8 stack args are promoted to i16 (2-byte slots) so the callee can + // read them with a 16-bit M load — matches LowerFormalArguments and + // CC_W65816's CCPromoteToType. Arg 0 stays in A in its native + // width; only stack-passed args promote. + unsigned StackBytes = 2 * (Outs.size() > FirstStackArg + ? Outs.size() - FirstStackArg : 0); + + Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL); + + // Push stack-passed args in reverse so arg FirstStackArg ends up at + // the lowest post-JSL stack-relative offset (4,S). Each push uses A + // by default; if the value being pushed is already a `CopyFromReg X` + // (e.g. forwarding the i32-first-arg-in-A:X hi half), push directly + // from X via PHX — saves the TXA + A-spill round-trip that would + // otherwise be required. SDValue Glue; + for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) { + SDValue V = OutVals[i]; + if (Outs[i].VT == MVT::i8) + V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V); + // Detect "value is already in X" — either as a physreg + // CopyFromReg($x), or as a vreg in the Idx16 class that's + // live-in from $x. In the i32-first-arg-in-A:X path, + // LowerFormalArguments creates a vreg in Idx16 and addLiveIn's + // it to $x. + bool ViaX = false; + if (V.getOpcode() == ISD::CopyFromReg) { + auto *RegN = dyn_cast(V.getOperand(1).getNode()); + if (RegN) { + Register R = RegN->getReg(); + if (R.isPhysical() && R == W65816::X) { + ViaX = true; + } else if (R.isVirtual()) { + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + if (MRI.getRegClass(R) == &W65816::Idx16RegClass) { + for (auto &LI : MRI.liveins()) + if (LI.second == R && LI.first == W65816::X) { + ViaX = true; + break; + } + } + } + } + } + if (ViaX) { + // CopyToReg(X, X) is a no-op but it threads the Glue chain so the + // PUSH_X can be sequenced correctly relative to other pushes. + Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue); + Glue = Chain.getValue(1); + Chain = DAG.getNode(W65816ISD::PUSH_X, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue); + } else { + Chain = DAG.getCopyToReg(Chain, DL, W65816::A, V, Glue); + Glue = Chain.getValue(1); + Chain = DAG.getNode(W65816ISD::PUSH, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue); + } + Glue = Chain.getValue(1); + } + + // i32 first-arg hi half goes in X. Emit before the A copy so the + // CopyToReg for X is glued, then A's copy follows. + if (I32FirstArg) { + Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue); + Glue = Chain.getValue(1); + } + + // Arg 0 in A. 
if (!OutVals.empty()) { - MVT VT = Outs[0].VT; Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue); Glue = Chain.getValue(1); } @@ -274,6 +701,8 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector CallOps = {Chain, Callee}; if (!OutVals.empty()) CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT)); + if (I32FirstArg) + CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT)); if (Glue.getNode()) CallOps.push_back(Glue); @@ -281,14 +710,20 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getVTList(MVT::Other, MVT::Glue), CallOps); Glue = Chain.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Glue, DL); + Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL); Glue = Chain.getValue(1); - for (const ISD::InputArg &Arg : Ins) { - MVT VT = Arg.VT; + // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X. + if (Ins.size() > 2) + report_fatal_error("W65816: return type not yet supported"); + static constexpr Register RetRegs[2] = {W65816::A, W65816::X}; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + MVT VT = Ins[i].VT; if (VT != MVT::i16 && VT != MVT::i8) report_fatal_error("W65816: return type not yet supported"); - SDValue V = DAG.getCopyFromReg(Chain, DL, W65816::A, VT, Glue); + if (i == 1 && VT != MVT::i16) + report_fatal_error("W65816: split return must be i16"); + SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue); Chain = V.getValue(1); Glue = V.getValue(2); InVals.push_back(V); @@ -302,24 +737,39 @@ SDValue W65816TargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - // Copy scalar return values into A and emit a retglue chain. Supports - // one i16 return today; i8 would use the same A register in 8-bit mode, - // and larger returns (i32 A:X, structures via hidden pointer) are future - // work. - // Copy each scalar return value into A and reference A in the RET_GLUE - // operand list so the register allocator keeps the defining instructions - // alive (otherwise dead-MI elimination strips them — the physreg copy - // alone is not enough of a liveness signal). - SDValue Glue; - SmallVector RetOps(1, Chain); - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + // Return ABI: + // i8/i16: value in A. + // i32: low half (Outs[0]) in A, high half (Outs[1]) in X. + // wider: not yet supported. + // Type legalization splits an i32 return into 2 consecutive i16 Outs. + // Emission order matters: we copy the high half to X *first* so that + // the regalloc can place both halves through the only Acc16 reg (A) + // without conflict. The TAX in copyPhysReg preserves A, so the + // subsequent copy of the low half to A doesn't clobber the high. + // Emitting low->A first would force a spill since computing the high + // would overwrite A while the low is still live for RTL. 
+  if (Outs.size() > 2)
+    report_fatal_error("W65816: return type not yet supported");
+  for (unsigned i = 0; i != Outs.size(); ++i) {
     MVT VT = Outs[i].VT;
     if (VT != MVT::i16 && VT != MVT::i8)
       report_fatal_error("W65816: return type not yet supported");
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[i], Glue);
-    Glue = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(W65816::A, VT));
+    if (i == 1 && VT != MVT::i16)
+      report_fatal_error("W65816: split return must be i16");
   }
+  SDValue Glue;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  if (Outs.size() == 2) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
+    Glue = Chain.getValue(1);
+  }
+  if (!Outs.empty()) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
+  }
+  if (Outs.size() == 2)
+    RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
 
   RetOps[0] = Chain;
   if (Glue.getNode())
@@ -327,3 +777,353 @@ SDValue W65816TargetLowering::LowerReturn(
   return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
 }
+
+// DAG combine: undo clang's `load(SELECT_CC(fi, fi))` rewrite of
+// `c ? *p : *q` when both ptrs are FrameIndex. Without this, the
+// SELECT_CC matcher (which expects Acc16 inputs) fails to match the
+// FrameIndex tval/fval. We rewrite back to the original
+// `SELECT_CC(load(fi), load(fi))` shape — safe because both stack
+// slots are guaranteed valid memory. We deliberately do NOT do this
+// for arbitrary pointers, since reading from both branches could
+// touch invalid memory or memory-mapped IO with side effects.
+SDValue
+W65816TargetLowering::PerformDAGCombine(SDNode *N,
+                                        DAGCombinerInfo &DCI) const {
+  if (N->getOpcode() != ISD::LOAD)
+    return SDValue();
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  if (!Ld->isSimple())
+    return SDValue();
+  SDValue Ptr = Ld->getBasePtr();
+
+  // Pre-legalize SELECT (cond, T, F): undo the address-select if both
+  // pointer operands are FrameIndex.
+  if (Ptr.getOpcode() == ISD::SELECT) {
+    SDValue T = Ptr.getOperand(1);
+    SDValue F = Ptr.getOperand(2);
+    if (T.getOpcode() != ISD::FrameIndex ||
+        F.getOpcode() != ISD::FrameIndex)
+      return SDValue();
+    SelectionDAG &DAG = DCI.DAG;
+    EVT VT = N->getValueType(0);
+    SDLoc DL(N);
+    SDValue Chain = Ld->getChain();
+    MachineFunction &MF = DAG.getMachineFunction();
+    int TFI = cast<FrameIndexSDNode>(T)->getIndex();
+    int FFI = cast<FrameIndexSDNode>(F)->getIndex();
+    SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
+                                MachinePointerInfo::getFixedStack(MF, TFI));
+    SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
+                                MachinePointerInfo::getFixedStack(MF, FFI));
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                   LoadT.getValue(1), LoadF.getValue(1));
+    SDValue NewSel = DAG.getNode(ISD::SELECT, DL, VT,
+                                 Ptr.getOperand(0), LoadT, LoadF);
+    DCI.CombineTo(N, NewSel, NewChain);
+    return SDValue(N, 0);
+  }
+
+  // Match either pre-legalize ISD::SELECT_CC (LHS,RHS,T,F,CC) or our
+  // post-legalize W65816ISD::SELECT_CC (T,F,CC,glue). We only sink the
+  // load into both branches when both branch values are FrameIndex —
+  // safe because stack slots are guaranteed valid memory. For
+  // arbitrary pointers, side-effecting reads make this unsafe.
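+  // Hypothetical source shape that triggers this: `c ? x : y` where x and
+  // y are stack locals; clang forms load(select_cc(..., &x, &y)), which we
+  // rewrite back to select_cc(..., load(&x), load(&y)).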
+ if (Ptr.getOpcode() == ISD::SELECT_CC) { + SDValue T = Ptr.getOperand(2); + SDValue F = Ptr.getOperand(3); + if (T.getOpcode() != ISD::FrameIndex || + F.getOpcode() != ISD::FrameIndex) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue Chain = Ld->getChain(); + MachineFunction &MF = DAG.getMachineFunction(); + int TFI = cast(T)->getIndex(); + int FFI = cast(F)->getIndex(); + + SDValue LoadT = DAG.getLoad(VT, DL, Chain, T, + MachinePointerInfo::getFixedStack(MF, TFI)); + SDValue LoadF = DAG.getLoad(VT, DL, Chain, F, + MachinePointerInfo::getFixedStack(MF, FFI)); + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + LoadT.getValue(1), LoadF.getValue(1)); + + SDValue NewSel = DAG.getNode(ISD::SELECT_CC, DL, VT, + Ptr.getOperand(0), Ptr.getOperand(1), + LoadT, LoadF, Ptr.getOperand(4)); + DCI.CombineTo(N, NewSel, NewChain); + return SDValue(N, 0); + } + return SDValue(); +} + +// Map a W65816CC code to the matching Bxx opcode. +static unsigned getBranchOpcodeForCC(unsigned CC) { + switch (CC) { + case W65816CC::COND_EQ: return W65816::BEQ; + case W65816CC::COND_NE: return W65816::BNE; + case W65816CC::COND_HS: return W65816::BCS; + case W65816CC::COND_LO: return W65816::BCC; + case W65816CC::COND_MI: return W65816::BMI; + case W65816CC::COND_PL: return W65816::BPL; + case W65816CC::COND_VS: return W65816::BVS; + case W65816CC::COND_VC: return W65816::BVC; + } + llvm_unreachable("invalid W65816 condition code"); +} + +// For multi-branch CCs, return the (branchA, branchB, BothMeanTrue) tuple. +// branchA is tested first; if it takes, we go to TrueBB if BothMeanTrue +// (i.e. both branches are "take if true"), otherwise to FalseBB. branchB +// is tested next with the same semantic. +// +// GT : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB +// LE : (BMI || BEQ) → BEQ TrueBB; BMI TrueBB; fall-through FalseBB +// HI : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB +// LS : (BCC || BEQ) → BEQ TrueBB; BCC TrueBB; fall-through FalseBB +struct MultiBranch { + unsigned First, Second; + bool FirstToTrue, SecondToTrue; +}; +static MultiBranch getMultiBranch(unsigned CC) { + switch (CC) { + case W65816CC::COND_GT_MB: + return {W65816::BEQ, W65816::BPL, false, true}; + case W65816CC::COND_LE_MB: + return {W65816::BEQ, W65816::BMI, true, true}; + case W65816CC::COND_HI_MB: + return {W65816::BEQ, W65816::BCS, false, true}; + case W65816CC::COND_LS_MB: + return {W65816::BEQ, W65816::BCC, true, true}; + } + llvm_unreachable("not a multi-branch CC"); +} + +// Emit a two-Acc16 binary op as STAfi src2; OPfi dst, src1. Allocates +// a fresh 2-byte stack slot per call. For CMP (HasOut=false) there's +// no destination register, just the two src operands. Always spill +// the SECOND operand so non-commutative ops (sub, cmp) compute +// src1 OP src2 correctly via OPfi (which gives src1 OP load(spill)). +static MachineBasicBlock * +emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp, + unsigned OpFI, bool HasOut) { + MachineFunction *MF = BB->getParent(); + const W65816Subtarget &STI = MF->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + + unsigned LhsIdx = HasOut ? 1 : 0; + unsigned RhsIdx = HasOut ? 2 : 1; + Register Src1 = MI.getOperand(LhsIdx).getReg(); + Register Src2 = MI.getOperand(RhsIdx).getReg(); + + // Spill src2 (the rhs). 
Then OPfi computes src1 OP load(spill). + BuildMI(*BB, MI.getIterator(), DL, TII.get(StoreOp)) + .addReg(Src2) + .addFrameIndex(FI) + .addImm(0); + + if (HasOut) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI), Dst) + .addReg(Src1) + .addFrameIndex(FI) + .addImm(0); + } else { + BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI)) + .addReg(Src1) + .addFrameIndex(FI) + .addImm(0); + } + + MI.eraseFromParent(); + return BB; +} + +MachineBasicBlock * +W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { + // The only opcode we currently emit with usesCustomInserter=1 is + // SELECT_CC16. Expand it into a diamond CFG with a PHI. For + // single-branch CCs: + // + // thisMBB: + // ... CMP already emitted ... + // Bxx sinkMBB ; branch to "true" path + // ; fall through to copy0MBB + // copy0MBB: + // ; (no instructions; PHI picks fval here) + // sinkMBB: + // dst = PHI [tval, thisMBB], [fval, copy0MBB] + // + // For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a + // single Bxx isn't enough), insert two branches. Both target either + // sinkMBB or copy0MBB depending on the condition. + switch (MI.getOpcode()) { + default: + llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter"); + case W65816::ADD_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true); + case W65816::SUB_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true); + // Carry-chain variants for the hi half of an i32 split. STAfi doesn't + // touch P, so the carry from the previous addc/adde survives the + // spill and is consumed by ADCEfi/SBCEfi below. + case W65816::ADDE_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true); + case W65816::SUBE_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true); + case W65816::AND_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true); + case W65816::ORA_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true); + case W65816::EOR_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true); + case W65816::CMP_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false); + case W65816::LDAptrOff: + case W65816::STAptrOff: + case W65816::STBptrOff: { + // Pointer access with a constant offset folded into Y. Saves a + // CLC/ADC #off pair plus a spill/reload over computing + // `ptr + off` then doing LDAptr/STAptr. Since Y is 16-bit, any + // i16 offset fits. 
Operand layout: + // LDAptrOff: 0=dst, 1=ptr, 2=off + // STAptrOff / STBptrOff: 0=val, 1=ptr, 2=off + MachineFunction *MF = BB->getParent(); + const W65816Subtarget &STI = MF->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + bool IsLoad = MI.getOpcode() == W65816::LDAptrOff; + bool IsByteStore = MI.getOpcode() == W65816::STBptrOff; + Register Ptr = MI.getOperand(1).getReg(); + int64_t Off = MI.getOperand(2).getImm(); + int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) + .addReg(Ptr).addFrameIndex(FI).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)) + .addImm(Off); + if (IsLoad) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY), Dst) + .addFrameIndex(FI).addImm(0); + } else { + Register Val = MI.getOperand(0).getReg(); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY)) + .addReg(Val).addFrameIndex(FI).addImm(0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); + } + MI.eraseFromParent(); + return BB; + } + case W65816::LDAptr: + case W65816::STAptr: + case W65816::STBptr: { + // Spill the pointer to a fresh 2-byte stack slot. Then LDY #0 and + // emit LDAfi_indY / STAfi_indY against that slot. The (slot,S),Y + // addressing reads the pointer from the spill, adds Y (=0), and + // dereferences. STBptr (truncating i8 store) wraps the actual STA + // in SEP/REP so M=8 across the store and only one byte is written. + MachineFunction *MF = BB->getParent(); + const W65816Subtarget &STI = MF->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + bool IsLoad = MI.getOpcode() == W65816::LDAptr; + bool IsByteStore = MI.getOpcode() == W65816::STBptr; + + // Operand layout (explicit only; Defs=[Y] adds an implicit at the + // end which we don't read here): + // LDAptr: 0=dst, 1=ptr + // STAptr / STBptr: 0=val, 1=ptr + // The pointer operand is always at index 1. Earlier code reading + // operand 2 for stores hit the implicit Y def, not the pointer — + // which only "worked" because regalloc didn't notice and A + // happened to hold the right bytes by accident. + Register Ptr = MI.getOperand(1).getReg(); + int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + + // Spill ptr. + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) + .addReg(Ptr).addFrameIndex(FI).addImm(0); + // LDY #0. LDY_Imm16 has no output operand; Y is defined implicitly + // via the pseudo's Defs=[Y] marking. 
+ BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)) + .addImm(0); + + if (IsLoad) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY), Dst) + .addFrameIndex(FI).addImm(0); + } else { + Register Val = MI.getOperand(0).getReg(); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY)) + .addReg(Val).addFrameIndex(FI).addImm(0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); + } + MI.eraseFromParent(); + return BB; + } + case W65816::SELECT_CC16: { + const W65816Subtarget &STI = BB->getParent()->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, copy0MBB); + MF->insert(It, sinkMBB); + + // Move the rest of thisMBB after MI to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + unsigned CC = MI.getOperand(3).getImm(); + if (CC < W65816CC::COND_GT_MB) { + // Single-branch: Bxx sinkMBB. + unsigned BrOp = getBranchOpcodeForCC(CC); + BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB); + } else { + // Multi-branch: two Bxx. Each may target sinkMBB (true) or + // copy0MBB (false). Fall-through is the OTHER block. + MultiBranch MB = getMultiBranch(CC); + MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB; + MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB; + BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1); + BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2); + } + + // copy0MBB falls through to sinkMBB. + copy0MBB->addSuccessor(sinkMBB); + + // sinkMBB: dst = PHI [tval, thisMBB], [fval, copy0MBB]. + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI), + MI.getOperand(0).getReg()) + .addReg(MI.getOperand(1).getReg()).addMBB(thisMBB) + .addReg(MI.getOperand(2).getReg()).addMBB(copy0MBB); + + MI.eraseFromParent(); + return sinkMBB; + } + } +} diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.h b/src/llvm/lib/Target/W65816/W65816ISelLowering.h index 7755bf3..6c52639 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.h +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.h @@ -46,6 +46,10 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) const override; + // The 65816 has no alignment requirement on memory access — any // address is fine. Telling LLVM this lets it emit single 16-bit // loads/stores even when the IR alignment is 1, instead of @@ -59,10 +63,47 @@ public: return true; } + // Disable LLVM's magic-constant expansion of sdiv/srem by power-of-2. + // The default expansion generates BUILD_VECTOR (used as a "splat shifter" + // intermediate) which we can't lower; without an override, every sdiv/srem + // by a pow2 constant crashes ISel. 
Returning the original node leaves it + // intact for the libcall lowering path (SDIV/SREM are LibCall in our + // ctor — see setOperationAction calls above). + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const override { + return SDValue(N, 0); + } + SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const override { + return SDValue(N, 0); + } + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + // Force i32 / i64 shifts through a libcall (__ashlsi3 / __lshrsi3 / + // __ashrsi3) instead of LLVM's default ExpandToParts strategy, which + // emits an SHL_PARTS node we have no pattern for. ExpandToParts also + // produces a long select-based sequence; the libcall is both smaller + // and matches our existing libcall-based approach for i16 mul/div. + ShiftLegalizationStrategy + preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, + unsigned ExpansionFactor) const override { + if (N->getValueType(0).getSizeInBits() > 16) + return ShiftLegalizationStrategy::LowerToLibcall; + return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, + ExpansionFactor); + } + private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrFormats.td b/src/llvm/lib/Target/W65816/W65816InstrFormats.td index 30305f8..5083021 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrFormats.td +++ b/src/llvm/lib/Target/W65816/W65816InstrFormats.td @@ -258,6 +258,23 @@ class InstStackRel op, string mnem> let Inst{15-8} = off; } +// Stack-relative indirect indexed-Y: `LDA (off,S),Y`. Reads the 16-bit +// pointer stored at S+off, adds Y, then loads from that address. Used +// to dereference pointers spilled to a stack scratch slot — the only +// way the 65816 can deref a pointer not already in zero page. +// isCodeGenOnly because the asm-parser doesn't accept `(d,S),Y` syntax +// today; codegen builds these MIs directly. +class InstStackRelIndY op, string mnem> + : W65816Inst<(outs), (ins addrDP:$off), + !strconcat(mnem, "\t($off, s), y")> { + let Size = 2; + bits<8> off; + bits<16> Inst; + let Inst{7-0} = op; + let Inst{15-8} = off; + let isCodeGenOnly = 1; +} + class InstPCRel8 op, string mnem> : W65816Inst<(outs), (ins pcrel8:$dest), !strconcat(mnem, "\t$dest")> { let Size = 2; diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index d7708b1..607af09 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -14,6 +14,7 @@ #include "W65816InstrInfo.h" #include "W65816.h" #include "W65816Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/ErrorHandling.h" @@ -34,13 +35,28 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest, bool RenamableSrc) const { - // The only Acc16 register is A; copies between A and itself are no-ops. 
- // Cross-class copies (e.g. A → X) need TAX/TXA pairs which we don't - // need yet — bail loudly so we notice when the time comes. if (DestReg == SrcReg) return; - if (DestReg == W65816::A && SrcReg == W65816::A) + // A → X / X → A via TAX / TXA. Used by i32 return ABI (lo in A, hi + // in X) and by callers reading split-i32 results. Both instructions + // are 16-bit when M=0/X=0; that matches our default mode. + if (DestReg == W65816::X && SrcReg == W65816::A) { + BuildMI(MBB, I, DL, get(W65816::TAX)); return; + } + if (DestReg == W65816::A && SrcReg == W65816::X) { + BuildMI(MBB, I, DL, get(W65816::TXA)); + return; + } + // A → Y / Y → A via TAY / TYA. Same M/X width caveat. + if (DestReg == W65816::Y && SrcReg == W65816::A) { + BuildMI(MBB, I, DL, get(W65816::TAY)); + return; + } + if (DestReg == W65816::A && SrcReg == W65816::Y) { + BuildMI(MBB, I, DL, get(W65816::TYA)); + return; + } llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented"); } @@ -71,3 +87,50 @@ void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIdx) .addImm(0); } + +Register W65816InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + if (MI.getOpcode() != W65816::LDAfi) + return 0; + // memfi packs (FrameIndex, offset). Treat only offset==0 as a true + // stack-slot load — non-zero offset means we're addressing within + // the slot (e.g. the high half of an i32 spill), which the generic + // peephole/CSE machinery doesn't model. + if (MI.getNumOperands() < 3 || !MI.getOperand(1).isFI() || + !MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) + return 0; + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); +} + +Register W65816InstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + if (MI.getOpcode() != W65816::STAfi) + return 0; + // STAfi: (ins Acc16:$src, memfi:$addr) — op0 is src reg, op1 is + // FrameIndex, op2 is offset. + if (MI.getNumOperands() < 3 || !MI.getOperand(1).isFI() || + !MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) + return 0; + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); +} + +bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const { + // Only LDAfi is gated on this hook. We declare it + // isReMaterializable=1 in tablegen so the framework will *consider* + // re-emitting it instead of spilling, then call back here to confirm. + // The instruction is safely rematerializable iff it loads from a + // *fixed* (immutable) frame index — i.e. an arg slot. Loads from a + // regular spill slot read a computed value that may not be available + // at the rematerialization point. + if (MI.getOpcode() != W65816::LDAfi) + return TargetInstrInfo::isReMaterializableImpl(MI); + + // Operand 1 is the FrameIndex (operand 0 is the def). 
+ const MachineOperand &FIOp = MI.getOperand(1); + if (!FIOp.isFI()) + return false; + const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo(); + return MFI.isFixedObjectIndex(FIOp.getIndex()); +} diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.h b/src/llvm/lib/Target/W65816/W65816InstrInfo.h index 19fc860..8a3ba39 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.h +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.h @@ -46,6 +46,29 @@ public: int FrameIdx, const TargetRegisterClass *RC, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; + + // Override the default rematerializability check to recognise LDAfi + // from a *fixed* (immutable) frame index — i.e. an arg slot — as + // trivially rematerializable. Without this, the greedy allocator + // spills arg loads to a fresh local slot the moment A is needed for + // anything else, then reloads from the local slot at every use. + // With it, the allocator just re-emits `LDA arg_slot,S` at each use + // and the `STA local; LDA local; LDA local` cluster collapses to a + // single `LDA arg_slot,S`. Spill-slot LDAfi (regular FI) is *not* + // rematerializable — that loads a computed value. + bool isReMaterializableImpl(const MachineInstr &MI) const override; + + // Tell the framework which pseudos are direct stack-slot loads/stores. + // MachineCSE, machine-licm, and peephole-opt use these hooks to elide + // redundant store/load pairs and to hoist invariants. Without them, + // patterns like `STAfi A, slot; LDAfi slot, A` (introduced by the + // greedy allocator's COPY-of-physreg expansion) survive into final + // asm as `sta x,s; lda x,s` no-op pairs. + Register isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + Register isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 5db2373..db318c5 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -54,6 +54,31 @@ def W65816cmp : SDNode<"W65816ISD::CMP", SDT_W65816Cmp, [SDNPOutGlue]>; def W65816brcc : SDNode<"W65816ISD::BR_CC", SDT_W65816BrCC, [SDNPHasChain, SDNPInGlue]>; +// Push A onto the stack. Used by LowerCall to pass extra args. +// Takes Chain + Glue (with A pre-loaded via CopyToReg), produces +// Chain + Glue. Has a side effect (SP changes) and stores to +// memory. In 16-bit M mode, pushes 2 bytes and decrements SP by 2; +// the call's ADJCALLSTACKUP pseudo unwinds those bytes via +// tsc;clc;adc #N;tcs after the JSL returns. +def W65816push : SDNode<"W65816ISD::PUSH", SDTNone, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore]>; + +// Push X onto the stack. Same shape as W65816push but the value to +// push is glued from CopyToReg(X) instead of CopyToReg(A). +def W65816pushx : SDNode<"W65816ISD::PUSH_X", SDTNone, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore]>; + +// SELECT_CC: takes (TVal, FVal, CC) plus a glue value carrying the +// flags from a preceding W65816cmp. Lowered by EmitInstrWithCustomInserter +// into a CMP (already in the BB) + Bxx + diamond CFG + PHI. 
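+// Illustrative end-to-end result for `a < b ? x : y` (unsigned), once
+// the custom inserter has built the diamond (opcode and labels for
+// illustration only):
+//     cmp  <b>            ; W65816cmp sets the flags
+//     bcc  .Lsink         ; COND_LO: take tval
+//   .Lcopy0:              ; empty fall-through; PHI picks fval here
+//   .Lsink:
+//     ; dst = PHI [tval, entry], [fval, .Lcopy0]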
+def SDT_W65816SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVT<3, i8>]>; +def W65816selectcc : SDNode<"W65816ISD::SELECT_CC", SDT_W65816SelectCC, + [SDNPInGlue]>; + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// @@ -71,14 +96,51 @@ def ADJCALLSTACKUP : W65816Pseudo<(outs), timm:$amt2)]>; } -let isReMaterializable = 1 in -def ADDframe : W65816Pseudo<(outs PtrRegs:$dst), +// LEA-equivalent: compute the address (SP + frame_offset + offset) of a +// stack slot and place it in A. Selected from a bare ISD::FrameIndex +// SDValue in W65816DAGToDAGISel::Select; expanded by eliminateFrameIndex +// into TSC + CLC + ADC #disp. Output is Acc16 because the address ends +// up in A; PtrRegs (which only contains SP) is the wrong class. +let isReMaterializable = 1, hasSideEffects = 0, + mayLoad = 0, mayStore = 0 in +def ADDframe : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$base, i16imm:$offset), "# ADDframe PSEUDO", []>; // The retglue node lowers directly to RTL (see Returns section below). // No separate RET pseudo — the real MC instruction handles the pattern. +// Push A onto the stack. Expanded in AsmPrinter to MC `PHA`. Used by +// LowerCall to pass extra args; the matching `tsc;clc;adc #N;tcs` SP +// unwind happens in eliminateCallFramePseudoInstr for ADJCALLSTACKUP. +let Defs = [SP], Uses = [A, SP], mayStore = 1, hasSideEffects = 1 in { +def PUSH16 : W65816Pseudo<(outs), (ins), "# PUSH16", + [(W65816push)]>; +} +// Push X onto the stack. Used by LowerCall when an outgoing arg's +// SDValue is already in X (e.g. forwarding the i32-first-arg-in-A:X +// hi half). Saves a TXA+spill round-trip. Expansion: PHX. +let Defs = [SP], Uses = [X, SP], mayStore = 1, hasSideEffects = 1 in { +def PUSH16X : W65816Pseudo<(outs), (ins), "# PUSH16X", + [(W65816pushx)]>; +} + +// SELECT_CC16: implements (set Acc16:$dst, (W65816selectcc tval, fval, cc)) +// where the CMP that produced the flags has already been emitted (its +// glue is implicit via the P register). EmitInstrWithCustomInserter +// expands this into a Bxx + 2 BBs + PHI. Marked usesCustomInserter so +// the codegen invokes our hook; Uses=[P] so MachineSched keeps the CMP +// adjacent. +let usesCustomInserter = 1, Uses = [P], hasSideEffects = 1 in { +def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$tval, Acc16:$fval, i8imm:$cc), + "# SELECT_CC16 $dst, $tval, $fval, $cc", + [(set Acc16:$dst, + (W65816selectcc Acc16:$tval, + Acc16:$fval, + timm:$cc))]>; +} + //===----------------------------------------------------------------------===// // Codegen pseudos that expand to MC instructions in the AsmPrinter. // @@ -94,6 +156,15 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1, def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm), "# LDAi16imm $dst, $imm", [(set Acc16:$dst, (i16 imm:$imm))]>; +// Materialise an i16 constant directly in X (Idx16). Useful when the +// constant's only consumer is `CopyToReg($x)` — saves an LDA+TAX +// round-trip (and the A-clobber that round-trip implies). Common for +// the high half of `(zext i16 to i32)` returns, where hi=const-zero. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, hasSideEffects = 0, + mayLoad = 0, mayStore = 0 in +def LDXi16imm : W65816Pseudo<(outs Idx16:$dst), (ins i16imm:$imm), + "# LDXi16imm $dst, $imm", + [(set Idx16:$dst, (i16 imm:$imm))]>; def LDAi8imm : W65816Pseudo<(outs Acc8:$dst), (ins i8imm:$imm), "# LDAi8imm $dst, $imm", [(set Acc8:$dst, (i8 imm:$imm))]>; @@ -177,8 +248,13 @@ def : Pat<(store Acc16:$src, (W65816Wrapper texternalsym:$s)), // source and dest to A — there is only one Acc16 register so this is // implicit, but stating it lets the register allocator coalesce // without needing a COPY. +// +// Defs = [P] models the C-flag side-effect. Required so tablegen can +// connect this instruction to the SDNode `addc` / `subc` (SDNPOutGlue), +// which is what the type legalizer emits as the lo half of a multi- +// precision add/sub when ADDC/SUBC is Legal (see W65816ISelLowering ctor). let Constraints = "$src = $dst", - hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { def ADCi16imm : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm), "# ADCi16imm $dst, $src, $imm", @@ -191,10 +267,19 @@ def SBCi16imm : W65816Pseudo<(outs Acc16:$dst), (sub Acc16:$src, imm:$imm))]>; } +// addc/subc: same as add/sub on this target (CLC then ADC, SEC then SBC), +// but the SDNode produces a Glue carrying the post-op carry into a +// subsequent adde/sube. Tablegen wires the Glue to the P register +// because the instruction has Defs = [P]. +def : Pat<(addc Acc16:$src, imm:$imm), + (ADCi16imm Acc16:$src, imm:$imm)>; +def : Pat<(subc Acc16:$src, imm:$imm), + (SBCi16imm Acc16:$src, imm:$imm)>; + // ADC/SBC from a 16-bit absolute address. Folds a load on the // right-hand side of an add/sub into the carry-arithmetic op. let Constraints = "$src = $dst", - hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + hasSideEffects = 0, mayLoad = 1, mayStore = 0, Defs = [P] in { def ADCabs : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, i32imm:$addr), "# ADCabs $dst, $src, $addr", []>; @@ -214,6 +299,61 @@ def : Pat<(sub Acc16:$src, def : Pat<(sub Acc16:$src, (i16 (load (W65816Wrapper texternalsym:$s)))), (SBCabs Acc16:$src, texternalsym:$s)>; +def : Pat<(addc Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (ADCabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(addc Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (ADCabs Acc16:$src, texternalsym:$s)>; +def : Pat<(subc Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (SBCabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(subc Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (SBCabs Acc16:$src, texternalsym:$s)>; + +// adde/sube: the chained ADC/SBC for the hi half of a multi-precision +// add/sub. Reads the C flag from the previous addc/adde (Uses = [P]), +// produces a fresh carry/borrow (Defs = [P]). AsmPrinter expansion +// emits a bare ADC/SBC with no preceding CLC/SEC; eliminateFrameIndex +// for ADCEfi/SBCEfi skips the carry-prefix step that the standalone +// ADCfi/SBCfi rely on. 
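+// Illustrative i32 add `r = x + y` after type legalization (stack-slot
+// operands are hypothetical):
+//     lda  x_lo,s
+//     clc
+//     adc  y_lo,s      ; ADCfi  <- addc: produces the carry
+//     sta  r_lo,s
+//     lda  x_hi,s
+//     adc  y_hi,s      ; ADCEfi <- adde: consumes it, no CLC prefix
+//     sta  r_hi,s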
+let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, + Uses = [P], Defs = [P] in { +def ADCEi16imm : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i16imm:$imm), + "# ADCEi16imm $dst, $src, $imm", + [(set Acc16:$dst, + (adde Acc16:$src, imm:$imm))]>; +def SBCEi16imm : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i16imm:$imm), + "# SBCEi16imm $dst, $src, $imm", + [(set Acc16:$dst, + (sube Acc16:$src, imm:$imm))]>; +} +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 1, mayStore = 0, + Uses = [P], Defs = [P] in { +def ADCEabs : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i32imm:$addr), + "# ADCEabs $dst, $src, $addr", []>; +def SBCEabs : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i32imm:$addr), + "# SBCEabs $dst, $src, $addr", []>; +} +def : Pat<(adde Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (ADCEabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(adde Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (ADCEabs Acc16:$src, texternalsym:$s)>; +def : Pat<(sube Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (SBCEabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(sube Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (SBCEabs Acc16:$src, texternalsym:$s)>; // (add Acc16, Acc16) — same value added to itself, equivalent to a 1-bit // left shift. Pattern needs a tied input so the result lands in A. @@ -293,6 +433,27 @@ def NEGA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), [(set Acc16:$dst, (sub (i16 0), Acc16:$src))]>; } +// Multi-precision negation: lo + hi halves of `-x` where x is i32. +// LLVM splits `0 - x` into `(subc 0, x_lo)` and `(sube 0, x_hi)`. +// We implement both via the ADD chain `~x + carry` since INC doesn't +// touch C; the bit pattern of C from `~x + 1` matches what `subc 0, x` +// would set (C=1 iff x was 0, i.e. no borrow). +// NEGC16 matches subc → "EOR #$FFFF; CLC; ADC #1" (5 bytes) +// NEGE16 matches sube → "EOR #$FFFF; ADC #0" (4 bytes, uses C-in) +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { +def NEGC16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# NEGC16 $dst, $src", + [(set Acc16:$dst, (subc (i16 0), Acc16:$src))]>; +} +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, + Uses = [P], Defs = [P] in { +def NEGE16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# NEGE16 $dst, $src", + [(set Acc16:$dst, (sube (i16 0), Acc16:$src))]>; +} + // Bitwise NOT pattern moved below EORi16imm definition. // 16-bit bitwise ops: AND / OR / XOR against an immediate or memory @@ -340,6 +501,71 @@ def : Pat<(xor Acc16:$src, (i16 (load (W65816Wrapper tglobaladdr:$g)))), def : Pat<(xor Acc16:$src, (i16 -1)), (EORi16imm Acc16:$src, 0xFFFF)>; +// (srl x, 15): extract bit 15 to bit 0 (yields 0 or 1). The +// type-legalizer's SHL_PARTS expansion of `i32 << 1` needs this for +// the high-half "carry from low" slot, and routing it through the +// __lshrhi3 libcall costs ~10 bytes per i32 shift-by-1. Inline as +// `ASL A; LDA #0; ROL A` (3 bytes): ASL puts bit 15 into C and +// trashes A; LDA #0 doesn't touch C; ROL A folds C into bit 0. +// +// (shl x, 15): move bit 0 to bit 15 (yields 0 or 0x8000). Used by +// SRL_PARTS / SRA_PARTS expansion of `i32 >> 1` for the low-half +// "carry from hi" slot. Mirror sequence: `LSR A; LDA #0; ROR A`. 
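+// In formula form: for an i32 split into lo/hi i16 halves,
+// SHL_PARTS(amt=1) computes hi' = (hi << 1) | (lo srl 15) and
+// lo' = lo << 1; SRL15A covers the `(lo srl 15)` term inline.
+// SRL_PARTS(amt=1) mirrors this with lo' = (lo srl 1) | (hi shl 15),
+// which is the `(hi shl 15)` term SHL15A covers.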
+let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { +def SRL15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SRL15A $dst, $src", + [(set Acc16:$dst, (srl Acc16:$src, (i16 15)))]>; +def SHL15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SHL15A $dst, $src", + [(set Acc16:$dst, (shl Acc16:$src, (i16 15)))]>; +} +// (srl x, 8): high byte to low byte, zero high byte. XBA swaps the +// two bytes of A (in 16-bit M); AND #$00FF clears the new high byte. +// 4 bytes total — much shorter than the __lshrhi3 libcall path. Used +// by i32 shift-by-8 SHL_PARTS expansion for the cross-half slot. +// +// (shl x, 8): low byte to high byte, zero low byte. Mirror. +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +def SRL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SRL8A $dst, $src", + [(set Acc16:$dst, (srl Acc16:$src, (i16 8)))]>; +def SHL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SHL8A $dst, $src", + [(set Acc16:$dst, (shl Acc16:$src, (i16 8)))]>; +} +// (sra x, 15): sign-fill — yields $0000 if x is non-negative, $FFFF +// if negative. Used by i32 sext-from-i16 type-legalization for the +// hi half (avoids the __ashrhi3 libcall path). Sequence: +// `ASL A; LDA #0; SBC #0; EOR #-1` (when our SBCi16imm uses SEC + SBC, +// LDA #0; SBC #0 produces $FFFF if C=0, $0000 if C=1; EOR #-1 flips). +// Actually simpler since SBC sets carry differently: see AsmPrinter +// expansion for the exact 5-byte sequence. +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { +def SRA15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SRA15A $dst, $src", + [(set Acc16:$dst, (sra Acc16:$src, (i16 15)))]>; +} + +// sext_inreg from i1: broadcast bit 0 to all bits. LLVM emits this +// for `(c & 1) ? -1 : 0` patterns (e.g. CRC inner loops). The result +// is `-(x & 1)` — 0 if bit 0 was clear, 0xFFFF if set. Mask to bit +// 0 then two's-complement-negate. Three pseudos = ~7 bytes. +def : Pat<(sext_inreg Acc16:$src, i1), + (NEGA16 (ANDi16imm Acc16:$src, 1))>; + +// sext_inreg from i8: branchless `((x & 0xFF) ^ 0x80) - 0x80` trick +// (same sequence LowerSignExtend uses for ISD::SIGN_EXTEND i8->i16). +// LLVM emits this when expanding a sextload-i16-from-i8 (we set +// SEXTLOAD i8 to Expand in the lowering ctor) and for explicit +// `(int)(signed char)` casts. +def : Pat<(sext_inreg Acc16:$src, i8), + (SBCi16imm (EORi16imm + (ANDi16imm Acc16:$src, 0x00FF), 0x0080), + 0x0080)>; + // Frame-index loads/stores: take a FrameIndex + offset (packed into a // single MIOperandInfo) and expand (in eliminateFrameIndex) into an // LDA / STA d,S with the offset baked in. Used by LowerFormalArguments @@ -350,7 +576,12 @@ def memfi : Operand { let PrintMethod = "printFrameMem"; } -let mayLoad = 1, hasSideEffects = 0, mayStore = 0 in { +// LDAfi is rematerializable when the FI is a fixed (immutable) arg +// slot — see W65816InstrInfo::isReMaterializableImpl. Without this, +// greedy regalloc spills every arg load to a fresh local slot then +// reloads from there, ballooning every i32-arg function by 4-6 insns. +let mayLoad = 1, hasSideEffects = 0, mayStore = 0, + isReMaterializable = 1 in { def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr), "# LDAfi $dst, $addr", []>; } @@ -369,14 +600,37 @@ def : Pat<(i16 (load addr_fi:$addr)), def : Pat<(store Acc16:$src, addr_fi:$addr), (STAfi Acc16:$src, addr_fi:$addr)>; +// i8 access to a FrameIndex slot. 
The slots holding i8 values are +// allocated as 2 bytes (CC_W65816 promotes i8 args to i16; spills also +// align), so reading 2 bytes is safe even for an i8 value — we just +// narrow to Acc8. Extending loads mask the high byte (zext) or leave +// it (anyext). Truncating store writes the full i16 (overwrites the +// 2-byte slot's high byte with whatever sits in A's high byte; safe +// since the slot holds an i8 and no other consumer reads that high +// byte). +def : Pat<(i8 (load addr_fi:$addr)), + (COPY_TO_REGCLASS (LDAfi addr_fi:$addr), Acc8)>; +def : Pat<(i16 (zextloadi8 addr_fi:$addr)), + (ANDi16imm (LDAfi addr_fi:$addr), 0xFF)>; +def : Pat<(i16 (extloadi8 addr_fi:$addr)), + (LDAfi addr_fi:$addr)>; +def : Pat<(store Acc8:$src, addr_fi:$addr), + (STAfi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>; +def : Pat<(truncstorei8 Acc16:$src, addr_fi:$addr), + (STAfi Acc16:$src, addr_fi:$addr)>; + // Frame-index folding into ADC / SBC / AND / ORA / EOR / CMP. Same // shape as the *abs variants but the second operand is a stack slot. +// ADCfi/SBCfi mark P as Def so they can match `addc`/`subc` (the lo +// half of a multi-precision split — see ADCi16imm comment above). let Constraints = "$src = $dst", hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { +let Defs = [P] in { def ADCfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# ADCfi $dst, $src, $addr", []>; def SBCfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# SBCfi $dst, $src, $addr", []>; +} def ANDfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# ANDfi $dst, $src, $addr", []>; def ORAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), @@ -384,6 +638,16 @@ def ORAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), def EORfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# EORfi $dst, $src, $addr", []>; } +// ADCEfi / SBCEfi: chained ADC/SBC, hi half of a multi-precision split. +// Read carry from previous addc/adde/subc/sube via Uses = [P]. 
+let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 1, mayStore = 0, + Uses = [P], Defs = [P] in { +def ADCEfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), + "# ADCEfi $dst, $src, $addr", []>; +def SBCEfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), + "# SBCEfi $dst, $src, $addr", []>; +} let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Defs = [P] in { def CMPfi : W65816Pseudo<(outs), (ins Acc16:$lhs, memfi:$addr), "# CMPfi $lhs, $addr", []>; @@ -392,6 +656,14 @@ def : Pat<(add Acc16:$src, (i16 (load addr_fi:$addr))), (ADCfi Acc16:$src, addr_fi:$addr)>; def : Pat<(sub Acc16:$src, (i16 (load addr_fi:$addr))), (SBCfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(addc Acc16:$src, (i16 (load addr_fi:$addr))), + (ADCfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(subc Acc16:$src, (i16 (load addr_fi:$addr))), + (SBCfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(adde Acc16:$src, (i16 (load addr_fi:$addr))), + (ADCEfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(sube Acc16:$src, (i16 (load addr_fi:$addr))), + (SBCEfi Acc16:$src, addr_fi:$addr)>; def : Pat<(and Acc16:$src, (i16 (load addr_fi:$addr))), (ANDfi Acc16:$src, addr_fi:$addr)>; def : Pat<(or Acc16:$src, (i16 (load addr_fi:$addr))), @@ -433,11 +705,217 @@ def : Pat<(W65816cmp Acc16:$lhs, (i16 (load (W65816Wrapper texternalsym:$s)))), (CMPabs Acc16:$lhs, texternalsym:$s)>; -// Two-Acc16 ops: deferred — needs proper frame setup so the register -// allocator can spill one operand to a local stack slot. Without -// reserved frame space, the spill goes to a negative SP offset and -// eliminateFrameIndex bails. See SESSION_STATE §6 for the -// dependency chain. +// 16-bit byte swap: XBA exchanges A.high and A.low. Pattern matches +// the (bswap Acc16) SDNode emitted by clang for byte-reverse loops. +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +def XBA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# XBA16 $dst, $src", + [(set Acc16:$dst, (bswap Acc16:$src))]>; +} + +// Two-Acc16 binary ops. We have only one A register, so when both +// operands are computed values (neither a foldable load/imm/global) we +// must spill one to a stack slot. Each pseudo's custom inserter +// allocates a fresh slot and emits a STAfi+OPfi sequence; the +// register allocator handles the surrounding spills/reloads. +// hasSideEffects=1 tells the validator the pseudo may load/store +// without requiring a matching SDNode pattern (the stores are added +// by the inserter, not visible in the DAG pattern). +// +// Defs = [P] on ADD_RR/SUB_RR matches the C-flag side-effect of the +// underlying ADC/SBC, letting these pseudos serve `addc`/`subc` (the +// lo half of an i32 split) as well as plain `add`/`sub`. 
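+// Illustrative post-RA shape for ADD_RR (spill offset hypothetical;
+// regalloc placement of src1 may differ):
+//     sta  1,s        ; inserter spills src2 to a fresh slot
+//     ...             ; src1 ends up back in A
+//     clc
+//     adc  1,s        ; ADCfi: A = src1 + load(spill)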
+let usesCustomInserter = 1, hasSideEffects = 1 in { +let Defs = [P] in { +def ADD_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# ADD_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (add Acc16:$src1, Acc16:$src2))]>; +def SUB_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# SUB_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (sub Acc16:$src1, Acc16:$src2))]>; +} +def AND_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# AND_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (and Acc16:$src1, Acc16:$src2))]>; +def ORA_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# ORA_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (or Acc16:$src1, Acc16:$src2))]>; +def EOR_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# EOR_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (xor Acc16:$src1, Acc16:$src2))]>; +} +def : Pat<(addc Acc16:$src1, Acc16:$src2), + (ADD_RR Acc16:$src1, Acc16:$src2)>; +def : Pat<(subc Acc16:$src1, Acc16:$src2), + (SUB_RR Acc16:$src1, Acc16:$src2)>; + +// Chained-carry two-Acc16 add/sub for the hi half of i32 splits. +// Inserter mirrors ADD_RR (STAfi spill + ADCEfi load-fold) but emits +// the carry-chain pseudo so the previous addc/adde's C flag is +// consumed instead of overwritten by a CLC. Uses+Defs = [P] +// reflects the carry chain through the SDNode. +let usesCustomInserter = 1, hasSideEffects = 1, + Uses = [P], Defs = [P] in { +def ADDE_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# ADDE_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (adde Acc16:$src1, Acc16:$src2))]>; +def SUBE_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# SUBE_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (sube Acc16:$src1, Acc16:$src2))]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, Defs = [P] in { +def CMP_RR : W65816Pseudo<(outs), (ins Acc16:$lhs, Acc16:$rhs), + "# CMP_RR $lhs, $rhs", + [(W65816cmp Acc16:$lhs, Acc16:$rhs)]>; +} + +// Pointer dereference. The 65816 can't deref a register pointer +// directly — the indirect addressing modes all read the pointer from +// memory (DP or stack). These pseudos spill the Acc16 pointer to a +// fresh stack slot, set Y=0, and emit LDA/STA (slot,S),Y. Y gets +// clobbered as a side effect. hasSideEffects=1 covers the spill +// store the inserter adds, in addition to the deref. +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y] in { +def LDAptr : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$ptr), + "# LDAptr $dst, $ptr", + [(set Acc16:$dst, (load Acc16:$ptr))]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y] in { +def STAptr : W65816Pseudo<(outs), (ins Acc16:$val, Acc16:$ptr), + "# STAptr $val, $ptr", + [(store Acc16:$val, Acc16:$ptr)]>; +} + +// i8 zero-extending pointer load: do a 16-bit LDA (slot,s),y and mask +// the high byte. Reads one byte past the source — fine for byte-array +// iteration where the buffer is at least 2 bytes long. A future +// SEP/REP-aware mode pass could switch to a true 8-bit LDA. +def : Pat<(i16 (zextloadi8 Acc16:$ptr)), + (ANDi16imm (LDAptr Acc16:$ptr), 0xFF)>; +// Anyext byte load via pointer: consumer doesn't care about the high +// byte, so just LDA (16-bit). Same 1-byte-past-buffer caveat as +// zextloadi8. +def : Pat<(i16 (extloadi8 Acc16:$ptr)), + (LDAptr Acc16:$ptr)>; +// And the equivalent for absolute addresses (byte loads via global ptr). 
+// (Already covered for Wrapper(global) above; this catches the case +// where the ptr is materialised as a value.) + +// Intermediate pseudos used by the LDAptr/STAptr inserters. Each takes +// a memfi describing the slot containing the pointer; eliminateFrameIndex +// resolves it to LDA_StackRelIndY / STA_StackRelIndY with the right d-byte. +// Y must hold 0 at the issue point (the inserter emits LDY #0 first). +let mayLoad = 1, hasSideEffects = 0, mayStore = 0, Uses = [Y] in { +def LDAfi_indY : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr), + "# LDAfi_indY $dst, $addr", []>; +} +let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Uses = [Y] in { +def STAfi_indY : W65816Pseudo<(outs), (ins Acc16:$src, memfi:$addr), + "# STAfi_indY $src, $addr", []>; +} + +// i8 truncating store via Acc16 pointer. Same shape as STAptr but +// custom inserter wraps the actual STA in SEP/REP so the M-bit is 8 +// across the store and only one byte is written. Without the wrap the +// 16-bit STA would clobber the byte at ptr+1. Two patterns: the +// natural truncstorei8 from an i16 value (common with arg promotion), +// and a true i8 store (Acc8) that arises from i8-typed IR. +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y] in { +def STBptr : W65816Pseudo<(outs), (ins Acc16:$val, Acc16:$ptr), + "# STBptr $val, $ptr", + [(truncstorei8 Acc16:$val, Acc16:$ptr)]>; +} + +// Pointer access with constant offset. `(load (add ptr, $off))` and +// `(store val, (add ptr, $off))` come up for struct field access and +// array indexing with small constant offsets. Without these patterns, +// the offset becomes an explicit ADC #imm that has to spill A and +// recompute the pointer per access. With them, we just load Y with +// the offset in the inserter (Y is 16-bit so any i16 constant fits). +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y] in { +def LDAptrOff : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$ptr, i16imm:$off), + "# LDAptrOff $dst, $ptr, $off", []>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y] in { +def STAptrOff : W65816Pseudo<(outs), + (ins Acc16:$val, Acc16:$ptr, i16imm:$off), + "# STAptrOff $val, $ptr, $off", []>; +def STBptrOff : W65816Pseudo<(outs), + (ins Acc16:$val, Acc16:$ptr, i16imm:$off), + "# STBptrOff $val, $ptr, $off", []>; +} +def : Pat<(i16 (load (add Acc16:$ptr, (i16 imm:$off)))), + (LDAptrOff Acc16:$ptr, imm:$off)>; +def : Pat<(store Acc16:$val, (add Acc16:$ptr, (i16 imm:$off))), + (STAptrOff Acc16:$val, Acc16:$ptr, imm:$off)>; +def : Pat<(truncstorei8 Acc16:$val, (add Acc16:$ptr, (i16 imm:$off))), + (STBptrOff Acc16:$val, Acc16:$ptr, imm:$off)>; +def : Pat<(store Acc8:$val, (add Acc16:$ptr, (i16 imm:$off))), + (STBptrOff (COPY_TO_REGCLASS Acc8:$val, Acc16), + Acc16:$ptr, imm:$off)>; +def : Pat<(store Acc8:$val, Acc16:$ptr), + (STBptr (COPY_TO_REGCLASS Acc8:$val, Acc16), Acc16:$ptr)>; + +// i8 load via Acc16 pointer producing a true i8 (Acc8) result. Reuses +// the existing zextloadi8 16-bit-LDA-and-mask path: load 2 bytes, mask +// the high byte, then narrow to Acc8. COPY_TO_REGCLASS to Acc8 is a +// no-op at MC level (same physical A). Reads one byte past the source; +// fine for char-array iteration where the buffer is at least 2 bytes. +def : Pat<(i8 (load Acc16:$ptr)), + (COPY_TO_REGCLASS (ANDi16imm (LDAptr Acc16:$ptr), 0xFF), Acc8)>; + +// Acc8-to-Acc16 type conversions. Both Acc8 and Acc16 alias physical A, +// so COPY_TO_REGCLASS is a no-op at MC level. 
ZEXT additionally masks +// the high byte (which holds B from before any prior SEP). ANYEXT +// leaves the high byte untouched since the consumer doesn't care. +def : Pat<(i16 (anyext Acc8:$src)), + (COPY_TO_REGCLASS Acc8:$src, Acc16)>; +def : Pat<(i16 (zext Acc8:$src)), + (ANDi16imm (COPY_TO_REGCLASS Acc8:$src, Acc16), 0xFF)>; +def : Pat<(i8 (trunc Acc16:$src)), + (COPY_TO_REGCLASS Acc16:$src, Acc8)>; + +// Acc8 reg-reg arithmetic and bitwise ops, expanded through the Acc16 +// _RR pseudos. Cheap to do because Acc8 and Acc16 alias the same +// physical A — COPY_TO_REGCLASS is a no-op. Only the low byte +// matters; the high byte gets unrelated bits but is discarded by the +// final narrow-back to Acc8. This lets an i8 expression that wasn't +// promoted by legalization (e.g. an i8 XOR feeding only an i8 store) +// reuse the spill-and-OPfi inserter without needing dedicated Acc8 +// pseudos. +multiclass Acc8RR { + def : Pat<(i8 (op Acc8:$a, Acc8:$b)), + (COPY_TO_REGCLASS + (ri (COPY_TO_REGCLASS Acc8:$a, Acc16), + (COPY_TO_REGCLASS Acc8:$b, Acc16)), + Acc8)>; +} +defm : Acc8RR; +defm : Acc8RR; +defm : Acc8RR; +defm : Acc8RR; +defm : Acc8RR; // (memory inc/dec patterns moved below INC_Abs/DEC_Abs defs.) @@ -728,6 +1206,11 @@ def AND_StackRel : InstStackRel<0x23, "and">; def ORA_StackRel : InstStackRel<0x03, "ora">; def EOR_StackRel : InstStackRel<0x43, "eor">; +//---------------------------------------------------------------- Stack-ind-Y +// Stack-relative indirect indexed-Y: deref a pointer spilled at S+off. +def LDA_StackRelIndY : InstStackRelIndY<0xB3, "lda">; +def STA_StackRelIndY : InstStackRelIndY<0x93, "sta">; + //===----------------------------------------------------------------------===// // Branch patterns (placed after the Bxx defs). // diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp index 4e8f7f9..3ab6346 100644 --- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp @@ -77,10 +77,46 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, case W65816::STAfi: NewOpc = W65816::STA_StackRel; break; case W65816::ADCfi: NewOpc = W65816::ADC_StackRel; NeedsCarryPrefix = true; break; case W65816::SBCfi: NewOpc = W65816::SBC_StackRel; NeedsCarryPrefix = true; IsSub = true; break; + // ADCEfi / SBCEfi are the chained-carry variants used as the hi half of a + // multi-precision split. No CLC/SEC prefix — they read the carry left + // in P by the previous addc/adde/subc/sube. + case W65816::ADCEfi: NewOpc = W65816::ADC_StackRel; break; + case W65816::SBCEfi: NewOpc = W65816::SBC_StackRel; break; case W65816::ANDfi: NewOpc = W65816::AND_StackRel; break; case W65816::ORAfi: NewOpc = W65816::ORA_StackRel; break; case W65816::EORfi: NewOpc = W65816::EOR_StackRel; break; case W65816::CMPfi: NewOpc = W65816::CMP_StackRel; break; + case W65816::LDAfi_indY: NewOpc = W65816::LDA_StackRelIndY; break; + case W65816::STAfi_indY: NewOpc = W65816::STA_StackRelIndY; break; + case W65816::ADDframe: { + // LEA-equivalent: emit "TSC; CLC; ADC #disp" so A holds SP + disp, + // i.e. the address of the stack slot. TSC has no carry side-effect + // (it just transfers SP into A), so the CLC + ADC is needed for a + // clean unsigned add. Disp uses the same FrameOffset+ImmOffset+ + // StackSize formula as the load/store cases. 
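+    // Illustrative output for a (hypothetical) Disp of 5:
+    //     tsc             ; A = SP
+    //     clc
+    //     adc  #5         ; A = address of the stack slot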
+ int FI = MI.getOperand(FIOperandNum).getIndex(); + int FrameOffset = MFI.getObjectOffset(FI); + int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm(); + int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize(); + if (Disp < 0 || Disp > 0xFFFF) + report_fatal_error("W65816: frame offset out of i16 LEA range"); + // TSC: A = SP (implicit def of A, use of SP). + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::TSC)) + .addReg(W65816::A, RegState::ImplicitDefine) + .addReg(W65816::SP, RegState::Implicit); + // CLC: clears C. Models as P-def, P-use (preserves N/V/Z). + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::CLC)) + .addReg(W65816::P, RegState::ImplicitDefine); + // ADC #imm: reads A and P, writes A and P. + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::ADC_Imm16)) + .addImm(Disp) + .addReg(W65816::A, RegState::Implicit) + .addReg(W65816::A, RegState::ImplicitDefine) + .addReg(W65816::P, RegState::Implicit) + .addReg(W65816::P, RegState::ImplicitDefine); + MI.eraseFromParent(); + return true; + } default: llvm_unreachable("W65816: unhandled instruction in eliminateFrameIndex"); } @@ -108,8 +144,49 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(IsSub ? W65816::SEC : W65816::CLC)); } - BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(NewOpc)) - .addImm(Offset); + // The MC instructions (LDA_StackRel, STA_StackRel, ADC_StackRel, + // ADC_Imm16, etc.) don't have explicit Defs/Uses on the accumulator + // because that's an implicit hardware semantic of every 65816 + // arithmetic/load/store. Without an explicit Def/Use, post-RA + // passes (Machine Copy Propagation in particular) miss that an ADC + // d,S between a TXA and a TAX redefines $a, and elide the TAX as + // "redundant" — corrupting the return value. Add the implicit + // operands here so dataflow tracking is correct. Match the + // original pseudo's read/write semantics: LDA defs A only; STA uses + // A only; ADC/SBC/AND/ORA/EOR/CMP read A and write A (CMP only + // sets flags, but it still uses A — modelling it as Use is + // sufficient since it doesn't change A). 
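+  // Example of the miscompile this prevents (offsets illustrative):
+  //     txa
+  //     adc  3,s        ; redefines A, invisible without the implicit def
+  //     tax             ; wrongly judged a redundant copy and erased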
+ auto Builder = BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(NewOpc)).addImm(Offset); + switch (NewOpc) { + case W65816::LDA_StackRel: + case W65816::LDA_StackRelIndY: + Builder.addReg(W65816::A, RegState::ImplicitDefine); + break; + case W65816::STA_StackRel: + case W65816::STA_StackRelIndY: + Builder.addReg(W65816::A, RegState::Implicit); + break; + case W65816::ADC_StackRel: + case W65816::SBC_StackRel: + Builder.addReg(W65816::A, RegState::Implicit) + .addReg(W65816::A, RegState::ImplicitDefine) + .addReg(W65816::P, RegState::Implicit) + .addReg(W65816::P, RegState::ImplicitDefine); + break; + case W65816::AND_StackRel: + case W65816::ORA_StackRel: + case W65816::EOR_StackRel: + Builder.addReg(W65816::A, RegState::Implicit) + .addReg(W65816::A, RegState::ImplicitDefine); + break; + case W65816::CMP_StackRel: + Builder.addReg(W65816::A, RegState::Implicit) + .addReg(W65816::P, RegState::ImplicitDefine); + break; + default: + break; + } MI.eraseFromParent(); return true; } diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp new file mode 100644 index 0000000..c9272c4 --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -0,0 +1,355 @@ +//===-- W65816StackSlotCleanup.cpp - Remove redundant spill/reload pairs --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Post-RA cleanup that erases redundant STAfi+LDAfi pairs to the same +// stack slot when no instruction in between writes A or that slot. +// +// The greedy register allocator routinely emits this pattern when +// materialising a COPY of $a into a vreg that gets allocated back to +// $a — the spill+reload cycle is a no-op since A already holds the +// stored value. The standard MachineLateInstrsCleanup pass only +// detects identical instructions; it doesn't recognise that +// `LDAfi slot` after `STAfi $a, slot` is a no-op. We do the +// simple per-block scan here. +// +// Conservative: only matches adjacent STAfi+LDAfi pairs (no scan for +// instructions in between). In practice the greedy-allocator-emitted +// pattern is always adjacent or near-adjacent, and the scheduler keeps +// it that way because the LDAfi feeds the next instruction. If +// future codegen breaks this assumption, generalise to a longer scan +// with explicit clobber tracking. 
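+// Example of the pattern removed (offset illustrative):
+//     sta  3,s        ; STAfi $a, slot
+//     lda  3,s        ; LDAfi slot -> $a   (erased: A already holds it)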
+// +//===----------------------------------------------------------------------===// + +#include "W65816.h" +#include "W65816InstrInfo.h" +#include "W65816Subtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-stack-slot-cleanup" + +namespace { + +class W65816StackSlotCleanup : public MachineFunctionPass { +public: + static char ID; + + W65816StackSlotCleanup() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "W65816 redundant stack-slot spill/reload elimination"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // namespace + +char W65816StackSlotCleanup::ID = 0; + +INITIALIZE_PASS(W65816StackSlotCleanup, DEBUG_TYPE, + "W65816 redundant stack-slot spill/reload elimination", + false, false) + +FunctionPass *llvm::createW65816StackSlotCleanup() { + return new W65816StackSlotCleanup(); +} + +// Returns true if MI references frame index FI as one of its operands. +// Used to bail dead-store removal when an intervening instruction +// reads or writes the slot. +static bool referencesFrameIndex(const MachineInstr &MI, int FI) { + for (const MachineOperand &MO : MI.operands()) + if (MO.isFI() && MO.getIndex() == FI) + return true; + return false; +} + +// Match `STAfi reg1, FI, 0; ... ; STAfi reg2, FI, 0` (kill via overwrite) +// or `STAfi reg, FI, 0; ... ; (no read in between)` (dead store +// at function exit). Both mean the first STAfi is dead. Conservative: +// bails on anything that references the slot, calls, inline asm. The +// slot must be a *local* (non-fixed) FrameIndex — args live across the +// function so we can't kill stores to fixed slots. +static bool tryEliminateDeadStore(MachineBasicBlock &MBB, + MachineInstr &StaMI) { + if (StaMI.getOpcode() != W65816::STAfi) + return false; + if (StaMI.getNumOperands() < 3 || + !StaMI.getOperand(1).isFI() || + !StaMI.getOperand(2).isImm() || StaMI.getOperand(2).getImm() != 0) + return false; + int StoredFI = StaMI.getOperand(1).getIndex(); + + // Don't try to kill a store to a fixed (arg) slot — those are + // observable to the caller. Locals/spills are fair game. + const MachineFunction *MF = StaMI.getMF(); + if (MF->getFrameInfo().isFixedObjectIndex(StoredFI)) + return false; + + auto It = std::next(StaMI.getIterator()); + while (It != MBB.end()) { + MachineInstr &MI = *It; + if (MI.isDebugInstr()) { + ++It; + continue; + } + // A subsequent STAfi to the same slot, offset 0, kills our store. + if (MI.getOpcode() == W65816::STAfi && + MI.getNumOperands() >= 3 && + MI.getOperand(1).isFI() && + MI.getOperand(1).getIndex() == StoredFI && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + // Found the killing store. Erase the first. + StaMI.eraseFromParent(); + return true; + } + // A return that doesn't read the slot kills the store too — the + // local goes out of scope at function exit. + if (MI.isReturn() && !referencesFrameIndex(MI, StoredFI)) { + StaMI.eraseFromParent(); + return true; + } + // Anything else that touches the slot (load, ADC d,S, etc.) means + // the first store IS observed — bail. + if (referencesFrameIndex(MI, StoredFI)) + return false; + // Inline asm / branches: too tricky. 
+    // local (non-fixed) slots are addressed at offsets the callee
+    // can't reach (callee's S has been shifted down by JSL's
+    // 3-byte return frame and any of its own pha/tsc adjustments,
+    // so its `(4,s)` reads land above our locals). We've already
+    // bailed on fixed slots above, so reaching here means the slot
+    // is local and call-safe.
+    if (MI.isInlineAsm() || MI.isBranch())
+      return false;
+    ++It;
+  }
+  // Walked off the end of the BB without seeing a return/use. Bail
+  // (could fall through to a successor that reads the slot).
+  return false;
+}
+
+// Match `STAfi reg, FI, 0; ... ; LDAfi destReg, FI, 0` when reg == destReg
+// and nothing in between clobbers reg or the slot. Erase the LDAfi.
+static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB,
+                                       MachineInstr &StaMI,
+                                       const TargetRegisterInfo *TRI) {
+  if (StaMI.getOpcode() != W65816::STAfi)
+    return false;
+  if (StaMI.getNumOperands() < 3 ||
+      !StaMI.getOperand(0).isReg() ||
+      !StaMI.getOperand(1).isFI() ||
+      !StaMI.getOperand(2).isImm() || StaMI.getOperand(2).getImm() != 0)
+    return false;
+  Register StoredReg = StaMI.getOperand(0).getReg();
+  int StoredFI = StaMI.getOperand(1).getIndex();
+
+  // Walk forward looking for the matching LDAfi. Bail on any
+  // instruction that could clobber StoredReg or write the slot.
+  auto It = std::next(StaMI.getIterator());
+  while (It != MBB.end()) {
+    MachineInstr &MI = *It;
+    if (MI.isDebugInstr()) {
+      ++It;
+      continue;
+    }
+    if (MI.getOpcode() == W65816::LDAfi &&
+        MI.getNumOperands() >= 3 &&
+        MI.getOperand(1).isFI() &&
+        MI.getOperand(1).getIndex() == StoredFI &&
+        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0 &&
+        MI.getOperand(0).isReg() &&
+        MI.getOperand(0).getReg() == StoredReg) {
+      MI.eraseFromParent();
+      return true;
+    }
+    // Calls clobber A — be safe.
+    if (MI.isCall())
+      return false;
+    // Any other instruction that defines StoredReg or stores to the
+    // slot invalidates the redundancy — bail.
+    if (MI.modifiesRegister(StoredReg, TRI))
+      return false;
+    if (MI.getOpcode() == W65816::STAfi &&
+        MI.getNumOperands() >= 2 && MI.getOperand(1).isFI() &&
+        MI.getOperand(1).getIndex() == StoredFI)
+      return false;
+    ++It;
+  }
+  return false;
+}
+
+bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  bool Changed = false;
+
+  // Pass 0: rewrite `LDAi16imm $a, imm` immediately followed by
+  // `COPY $x = $a` (with no intervening A clobber) into
+  // `LDXi16imm $x, imm`. Run BEFORE the spill/reload cleanups so
+  // the disappearing A clobber unblocks subsequent STAfi+LDAfi
+  // pair removal.
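+  //
+  // Sketch of the rewrite (hypothetical MIR; the asm column assumes the
+  // usual lowering of these pseudos and is illustrative only):
+  //
+  //   LDAi16imm $a, 0            ; lda #0
+  //   $x = COPY $a               ; tax
+  //     ==>
+  //   LDXi16imm $x, 0            ; ldx #0
+  //
+  // A's previous value is no longer clobbered, so any STAfi/LDAfi pair
+  // that existed only to preserve A around this sequence becomes
+  // removable by Pass 1.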
+  for (MachineBasicBlock &MBB : MF) {
+    SmallVector<MachineInstr *, 8> Worklist;
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == W65816::LDAi16imm)
+        Worklist.push_back(&MI);
+    for (MachineInstr *Lda : Worklist) {
+      if (Lda->getNumOperands() < 2 || !Lda->getOperand(0).isReg() ||
+          Lda->getOperand(0).getReg() != W65816::A)
+        continue;
+      auto It = std::next(Lda->getIterator());
+      while (It != MBB.end() && It->isDebugInstr())
+        ++It;
+      if (It == MBB.end())
+        continue;
+      MachineInstr &Next = *It;
+      if (!Next.isCopy())
+        continue;
+      Register DstReg = Next.getOperand(0).getReg();
+      Register SrcReg = Next.getOperand(1).getReg();
+      if (DstReg != W65816::X || SrcReg != W65816::A)
+        continue;
+      const MachineOperand &ImmMO = Lda->getOperand(1);
+      const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+      MachineInstrBuilder Mib =
+          BuildMI(MBB, Lda->getIterator(), Lda->getDebugLoc(),
+                  TII->get(W65816::LDXi16imm), W65816::X);
+      if (ImmMO.isImm())
+        Mib.addImm(ImmMO.getImm());
+      else
+        Mib.add(ImmMO);
+      Lda->eraseFromParent();
+      Next.eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  // Pass 1: redundant LDAfi after STAfi (load-after-same-store with
+  // matching register). Two-pass over Stores worklist to avoid
+  // iterator invalidation when we erase the LDAfi mid-walk.
+  for (MachineBasicBlock &MBB : MF) {
+    SmallVector<MachineInstr *, 8> Stores;
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == W65816::STAfi)
+        Stores.push_back(&MI);
+    for (MachineInstr *StaMI : Stores)
+      if (tryEliminateLoadAfterStore(MBB, *StaMI, TRI))
+        Changed = true;
+  }
+
+  // Pass 2: dead stores (STAfi to slot followed by another STAfi to
+  // the same slot with no intervening read). This catches the
+  // arg0_lo "preserve" spill that the regalloc emits even though the
+  // value is consumed by the very next instruction.
+  for (MachineBasicBlock &MBB : MF) {
+    SmallVector<MachineInstr *, 8> Stores;
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == W65816::STAfi)
+        Stores.push_back(&MI);
+    for (MachineInstr *StaMI : Stores)
+      if (tryEliminateDeadStore(MBB, *StaMI))
+        Changed = true;
+  }
+
+  // Pass 2.5: deleted (logic moved to Pass 0 above). The rationale for
+  // that rewrite: without it, the regalloc materialises i16 constants
+  // via Acc16 (LDAi16imm) even when the only consumer is CopyToReg($x),
+  // forcing a TAX round-trip and (often) a spill+reload of A's previous
+  // value. Common case: the high half of `(zext i16 to i32)` returns,
+  // where hi = 0.
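+
+  // Worked example of what Passes 0-2 buy on a result spill before a
+  // return (hypothetical post-RA MIR; slot numbers made up, and "RTL"
+  // stands in for whatever return pseudo the function ends with):
+  //
+  //   STAfi $a, %stack.1, 0      ; spill the result
+  //   LDAfi $a, %stack.1, 0      ; Pass 1 erases this reload
+  //   RTL                        ; the return never reads the slot,
+  //                              ; so Pass 2 erases the STAfi as well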
+
+  // Pass 3: zero-size unused local frame objects so the
+  // PrologueEpilogue pass shrinks the prologue PHAs / TSC reservation.
+  // Walk the MIR collecting which FIs are still referenced; any
+  // *non-fixed* (local) FI with no remaining reference is dead. We
+  // can't safely remove it (RemoveStackObject can shift indexes); we
+  // just zero-size it via setObjectSize, which is enough for the
+  // frame layout pass to skip it.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MFI.getNumObjects() > 0) {
+    BitVector Used(MFI.getObjectIndexEnd() - MFI.getObjectIndexBegin());
+    auto Mark = [&](int FI) {
+      int Idx = FI - MFI.getObjectIndexBegin();
+      if (Idx >= 0 && Idx < (int)Used.size())
+        Used.set(Idx);
+    };
+    for (MachineBasicBlock &MBB : MF)
+      for (MachineInstr &MI : MBB)
+        for (MachineOperand &MO : MI.operands())
+          if (MO.isFI())
+            Mark(MO.getIndex());
+    for (int FI = MFI.getObjectIndexBegin();
+         FI < MFI.getObjectIndexEnd(); ++FI) {
+      // Skip fixed (arg) slots — those are "owned" by the caller.
+      if (MFI.isFixedObjectIndex(FI))
+        continue;
+      int Idx = FI - MFI.getObjectIndexBegin();
+      if (Idx < 0 || Idx >= (int)Used.size() || Used.test(Idx))
+        continue;
+      // Already zero-sized? Skip.
+      if (MFI.getObjectSize(FI) == 0)
+        continue;
+      // Don't touch dead-stripped objects either.
+      if (MFI.isDeadObjectIndex(FI))
+        continue;
+      MFI.setObjectSize(FI, 0);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
index e24f832..f93d608 100644
--- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
+++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
@@ -39,6 +39,7 @@ LLVMInitializeW65816Target() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeW65816AsmPrinterPass(PR);
   initializeW65816DAGToDAGISelLegacyPass(PR);
+  initializeW65816StackSlotCleanupPass(PR);
 }
 
 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
@@ -74,6 +75,7 @@ public:
   }
 
   bool addInstSelector() override;
+  void addPostRegAlloc() override;
 };
 
 } // namespace
@@ -82,6 +84,10 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new W65816PassConfig(*this, PM);
 }
 
+void W65816PassConfig::addPostRegAlloc() {
+  addPass(createW65816StackSlotCleanup());
+}
+
 MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
     BumpPtrAllocator &Allocator, const Function &F,
     const TargetSubtargetInfo *STI) const {