65816-llvm-mos/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp
Scott Duensing 09f7405362 Updates
2026-06-03 16:08:42 -05:00

1133 lines
49 KiB
C++

//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice
// versa) pairs that toggle the M-bit redundantly.
//
// The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits
// `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When
// two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between
// them), the post-PEI stream contains:
//
// SEP #$20
// STA d1, S
// REP #$20 <-- toggle
// SEP #$20 <-- toggle (cancels above)
// STA d2, S
// REP #$20
//
// The middle REP/SEP pair is a no-op: both stores can run in one M=1
// region. We drop them to leave:
//
// SEP #$20
// STA d1, S
// STA d2, S
// REP #$20
//
// Saves 4 bytes / 6 cycles per coalesced pair (REP and SEP are each
// 2 bytes / 3 cycles). Symmetric `SEP/REP` pairs (M=1 then M=0 with
// nothing in between) are also dropped — they can arise around
// inline-asm or hand-written assembly snippets.
//
// Runs at addPreEmitPass (after PEI has expanded STA8fi).
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-sep-rep-cleanup"
// W65816 processor status M-bit mask (set/clear via SEP/REP #$20).
static constexpr int kMBit = 0x20;
namespace {
/// Machine-function pass that removes redundant SEP/REP mode toggles and
/// applies the other W65816 peepholes implemented in runOnMachineFunction.
class W65816SepRepCleanup : public MachineFunctionPass {
public:
  /// Pass identification; defined out of line below.
  static char ID;

  W65816SepRepCleanup() : MachineFunctionPass(ID) {}

  /// Entry point: returns true when any instruction in MF was changed.
  bool runOnMachineFunction(MachineFunction &MF) override;

  /// Human-readable name used in pass listings and diagnostics.
  StringRef getPassName() const override {
    return "W65816 SEP/REP toggle coalescing";
  }
};
} // namespace
// Storage for the pass's static ID member (handed to the
// MachineFunctionPass constructor in W65816SepRepCleanup's ctor).
char W65816SepRepCleanup::ID = 0;
// Register the pass under the DEBUG_TYPE name with its description
// string. The two trailing `false` arguments are INITIALIZE_PASS's
// "CFG-only" and "is-analysis" flags.
INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE,
"W65816 SEP/REP toggle coalescing", false, false)
// Factory for the pass: heap-allocates a new W65816SepRepCleanup and
// returns it as a FunctionPass pointer.
FunctionPass *llvm::createW65816SepRepCleanup() {
  FunctionPass *Pass = new W65816SepRepCleanup();
  return Pass;
}
// Fetches operand 0 of MI as an immediate, provided MI's opcode equals
// `Opc` (callers pass W65816::SEP or W65816::REP) and operand 0 exists
// and is an immediate. Yields -1 when any of those conditions fails.
static int getSepRepImm(const MachineInstr &MI, unsigned Opc) {
  const bool HasImmOfWantedOpcode = MI.getOpcode() == Opc &&
                                    MI.getNumOperands() >= 1 &&
                                    MI.getOperand(0).isImm();
  if (!HasImmOfWantedOpcode)
    return -1;
  return MI.getOperand(0).getImm();
}
// Returns true if MI may consume the carry or overflow flag — these
// are the flags that ADC/SBC define but INA/DEA don't. Conservative:
// any branch that reads C or V counts, plus every ADC/SBC form (real
// opcode or pre-AsmPrinter pseudo) and the rotates, all of which take C
// as an input. Anything else (CMP, CLC, SEC, LDA, STA, AND, ORA, EOR,
// etc.) re-defines or doesn't read C/V.
//
// Returning true here only ever SUPPRESSES the ADC->INA/DEA fold, so
// listing an opcode that turns out not to matter costs a missed
// optimisation, never a miscompile.
static bool readsCarryOrV(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case W65816::BCS: // reads C
  case W65816::BCC: // reads C
  case W65816::BVS: // reads V
  case W65816::BVC: // reads V
  case W65816::ADC_StackRel: // reads C as carry-in
  case W65816::ADC_Imm16:
  case W65816::ADC_Imm8:
  case W65816::ADC_DP:
  case W65816::ADC_Abs:
  case W65816::SBC_StackRel:
  case W65816::SBC_Imm16:
  case W65816::SBC_Imm8:
  case W65816::SBC_DP:
  case W65816::SBC_Abs:
  // The `(d,S),Y` addressing forms take C as carry-in exactly like the
  // plain stack-relative forms above; omitting them would let the fold
  // drop a carry that the hi-half of a pointer-indirect add still needs.
  case W65816::ADC_StackRelIndY:
  case W65816::SBC_StackRelIndY:
  // Chained-carry pseudos. These run BEFORE AsmPrinter expansion so
  // we must whitelist them explicitly — they're the hi-half of any
  // multi-precision add/sub and read the lo-half's carry-out. Without
  // these, the INA/DEA peephole silently rewrites a lo-half
  // `ADCi16imm src, 2` to `INA; INA` (which DOES NOT set C), breaking
  // the i32 ADD carry chain. Caught as `arr[0] = arr[1]` writing to
  // wrong bank under ptr32 because the high half got a stale C.
  case W65816::ADCEi16imm:
  case W65816::SBCEi16imm:
  // The fi/abs/imm forms of ADC/SBC are also pre-AsmPrinter pseudos;
  // each expands to a real ADC_/SBC_ opcode that reads carry.
  case W65816::ADCi16imm: // lo-half (CLC + ADC_Imm16)
  case W65816::SBCi16imm: // lo-half (SEC + SBC_Imm16)
  case W65816::ADCfi:     // chained-carry stack form
  case W65816::SBCfi:
  case W65816::ADCEfi:
  case W65816::SBCEfi:
  case W65816::ADCabs:
  case W65816::SBCabs:
  case W65816::ROL_A: // rotates fold C in
  case W65816::ROR_A:
  case W65816::ROL_DP:
  case W65816::ROL_Abs:
  case W65816::ROR_DP:
  case W65816::ROR_Abs:
    return true;
  default:
    return false;
  }
}
// Classifies an opcode as a "flag redefiner": an instruction after
// which an earlier C/V value is treated as no longer observable. The
// set is CLC, SEC, the CMP/CPX/CPY families and REP/SEP. Pseudo CMP
// variants (CMPi16imm etc.) are included because this peephole runs at
// pre-emit, BEFORE the AsmPrinter expands them.
//
// Only the opcode is inspected, so REP/SEP qualify whatever their
// immediate mask is.
static bool isFlagRedefiner(unsigned Op) {
  switch (Op) {
  // Explicit carry clear / set.
  case W65816::CLC:
  case W65816::SEC:
  // Accumulator compares: real opcodes.
  case W65816::CMP_Imm8:
  case W65816::CMP_Imm16:
  case W65816::CMP_StackRel:
  case W65816::CMP_DP:
  case W65816::CMP_Abs:
  // Accumulator compares: pseudos still present at pre-emit.
  case W65816::CMPi16imm:
  case W65816::CMPi8imm:
  case W65816::CMPfi:
  case W65816::CMPabs:
  case W65816::CMP_RR:
  // Index-register compares.
  case W65816::CPX_Imm8:
  case W65816::CPX_Imm16:
  case W65816::CPX_DP:
  case W65816::CPX_Abs:
  case W65816::CPY_Imm8:
  case W65816::CPY_Imm16:
  case W65816::CPY_DP:
  case W65816::CPY_Abs:
  // Processor-status bit clear / set.
  case W65816::REP:
  case W65816::SEP:
    return true;
  }
  return false;
}
// Returns true if an instruction following `After` observes the C/V
// flags before something redefines them.
//
// Phase 1 scans the rest of MBB. A C/V reader means "live"; a flag
// redefiner or a call means "dead".
//
// Phase 2 runs only when Phase 1 reaches the end of MBB: it peeks at up
// to MaxPeek non-debug instructions at the head of each successor and
// reports "live" if any of them reads C/V before a redefiner/call. This
// is critical for loop bodies where the back-edge re-enters the same
// MBB at LDA/PHA (neither reads C/V), so a per-iteration `clc; adc #2`
// is foldable. Cross-MBB carry chains would normally use ADCEi16imm
// (not ADCi16imm), so hitting the peek cap is treated as "dead".
//
// REP/SEP only clear/set the P bits named in their immediate mask. The
// M-bit toggles this pass deals with (`SEP #$20` / `REP #$20`) leave C
// and V untouched, so they must NOT end the scan — otherwise a reader
// sitting behind an M-mode wrap would be missed and the ADC->INA fold
// would drop a carry that is still consumed. A REP/SEP counts as a
// redefiner only when its mask includes the C (0x01) or V (0x40) bit.
static bool carryFlagLiveAfter(MachineBasicBlock::iterator After,
                               MachineBasicBlock &MBB) {
  // Mask-aware wrapper around isFlagRedefiner().
  auto redefinesCarryOrV = [](const MachineInstr &MI) -> bool {
    const unsigned Opc = MI.getOpcode();
    if (Opc == W65816::REP || Opc == W65816::SEP) {
      // Processor-status bits: C = 0x01, V = 0x40.
      constexpr int64_t kCarryOverflowBits = 0x01 | 0x40;
      // A REP/SEP without a readable immediate is treated as
      // transparent: we cannot prove it rewrites C or V.
      return MI.getNumOperands() >= 1 && MI.getOperand(0).isImm() &&
             (MI.getOperand(0).getImm() & kCarryOverflowBits) != 0;
    }
    return isFlagRedefiner(Opc);
  };

  // Phase 1: scan within this MBB.
  for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) {
    if (Probe->isDebugInstr())
      continue;
    if (readsCarryOrV(*Probe))
      return true;
    if (redefinesCarryOrV(*Probe))
      return false;
    if (Probe->isCall())
      return false; // callee resets flags
  }

  // Phase 2: peek into each successor's first few MIs. We BAIL only on
  // a positive C/V read; reaching MBB end or peek-cap without finding
  // one is treated as "carry dead" — ADCi16imm's carry-out is never
  // used in carry chains (those use ADCEi16imm), so a stray carry
  // floating into RTL or an unrelated arithmetic op causes no harm.
  const unsigned MaxPeek = 6;
  for (MachineBasicBlock *Succ : MBB.successors()) {
    unsigned Peeked = 0;
    for (auto &MI : *Succ) {
      if (MI.isDebugInstr())
        continue;
      if (readsCarryOrV(MI))
        return true;
      if (redefinesCarryOrV(MI) || MI.isCall())
        break;
      if (++Peeked >= MaxPeek)
        break;
    }
  }
  return false;
}
// Rewrites `ADCi16imm dst, src, imm` / `SBCi16imm dst, src, imm` into
// one or two INA / DEA instructions when the net change is ±1 or ±2 and
// carryFlagLiveAfter() reports C/V as dead. ADCi16imm is a pseudo that
// expands to CLC+ADC_Imm16 (4B/5cyc); INA is 1B/2cyc, so ±1 saves
// 3B/3cyc and ±2 saves 2B/1cyc. SBCi16imm is handled by negating its
// immediate (SBC #1 → DEA, SBC #-1 → INA, ...).
//
// Returns true if at least one instruction in MBB was replaced.
static bool foldImmAdcToInaDea(MachineBasicBlock &MBB,
                               const W65816InstrInfo &TII) {
  bool Changed = false;
  for (auto Cursor = MBB.begin(); Cursor != MBB.end();) {
    // Step past the candidate up front so erasing it below leaves the
    // loop cursor valid.
    auto Candidate = Cursor++;

    const unsigned Opc = Candidate->getOpcode();
    const bool IsSub = (Opc == W65816::SBCi16imm);
    if (Opc != W65816::ADCi16imm && !IsSub)
      continue;
    if (Candidate->getNumOperands() < 3)
      continue;
    const auto &ImmOp = Candidate->getOperand(2);
    if (!ImmOp.isImm())
      continue;

    // Interpret the immediate as a signed 16-bit quantity; subtracting
    // N is the same as adding -N.
    const int64_t Raw = static_cast<int16_t>(ImmOp.getImm());
    const int64_t Delta = IsSub ? -Raw : Raw;
    if (Delta == 0 || Delta < -2 || Delta > 2)
      continue;

    // INA/DEA do not produce C/V, so the fold is only legal when no
    // later instruction consumes them.
    if (carryFlagLiveAfter(Candidate, MBB))
      continue;

    const bool Increment = Delta > 0;
    const unsigned StepOpc = Increment ? W65816::INA : W65816::DEA;
    DebugLoc DL = Candidate->getDebugLoc();
    for (int64_t Remaining = Increment ? Delta : -Delta; Remaining > 0;
         --Remaining)
      BuildMI(MBB, Candidate, DL, TII.get(StepOpc));

    Candidate->eraseFromParent();
    Changed = true;
  }
  return Changed;
}
// DBG_VALUE preservation in this pass:
//
// Every instruction this pass erases falls into one of these classes:
// * SEP/REP — MCInst-level mode-flag toggles, no value flow.
// * TAX/TXA/TAY/TYA — register transfers; the source value still
// exists in A and is followed by an A-redefining instruction that
// was the reason we identified the transfer as dead.
// * Redundant LDY_Imm16 — Y already holds the constant.
// * Redundant ADCi16imm/SBCi16imm rewritten to INA/DEA — same value,
// fewer cycles.
// * Lagged-ptr PHI-copy sink — relocates a `STA dst` from end-of-MBB
// to immediately after the iter-load. The destination slot is
// written earlier but with the same value at every read point
// because the iter's OLD value is what flowed through the
// PHP/PLP-wrapped tail copy.
// * i32-add store-bypass — reorders 10 instructions to 6 that
// compute the same lo/hi result into the same destination slots
// in the same order from the user's point of view.
//
// None of these change the user-visible value of a named variable at
// any PC where a DBG_VALUE could observe it. Hoisted/moved
// instructions write the same data at slightly earlier PCs in their
// MBB; a DBG_VALUE between the OLD and NEW write positions could read
// a slightly-fresher value (the next-iteration's prefetch instead of
// the current iteration's tail), but never a wrong value — the loop
// invariant guarantees both values agree at the moved boundary.
bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
const auto &STI = MF.getSubtarget<W65816Subtarget>();
const auto &TII = *STI.getInstrInfo();
for (MachineBasicBlock &MBB : MF) {
// Pre-pass: hoist LDAi8imm out of byte-store SEP/REP wraps.
// The post-RA scheduler can move LDAi8imm (which is marked
// hasSideEffects=0 at MIR but expands at AsmPrinter to its OWN
// SEP+LDA8+REP that toggles M) INSIDE an STBptr inserter's
// SEP/REP wrap. When that happens, the LDAi8imm's expansion
// REP fires BEFORE the byte STA, leaving the STA in M=16 — the
// store becomes a 16-bit zero write, corrupting the byte AFTER
// the intended target. Detect the pattern and hoist the
// LDAi8imm above the outer SEP. #107 strtok_r BB0_15 was this
// exact bug.
{
SmallVector<MachineInstr *, 8> SepHoists;
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
if (It->getOpcode() != W65816::SEP) continue;
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
if (It->getOperand(0).getImm() != kMBit) continue;
// Walk forward looking for LDAi8imm before any STAfi_indY
// or REP at this nesting level.
auto Walker = std::next(It);
MachineInstr *LdaToHoist = nullptr;
while (Walker != MBB.end()) {
if (Walker->isDebugInstr()) { ++Walker; continue; }
unsigned Opc = Walker->getOpcode();
// Hit a REP — wrap is closing without LDAi8imm inside.
if (Opc == W65816::REP) break;
// Hit a call / branch / asm — bail.
if (Walker->isCall() || Walker->isBranch() ||
Walker->isReturn() || Walker->isInlineAsm()) break;
// Hit an STAfi_indY — this is the byte store; an LDAi8imm
// before it would be the bug, but if we found one already
// we'd have hoisted it; nothing to do here, stop scanning.
if (Opc == W65816::STAfi_indY) break;
if (Opc == W65816::LDAi8imm) {
LdaToHoist = &*Walker;
break;
}
++Walker;
}
if (LdaToHoist)
SepHoists.push_back(LdaToHoist);
}
for (MachineInstr *Lda : SepHoists) {
// Find the SEP we entered before the LDA. Walk backward.
auto Back = Lda->getIterator();
MachineInstr *OuterSep = nullptr;
while (Back != MBB.begin()) {
--Back;
if (Back->isDebugInstr()) continue;
if (Back->getOpcode() == W65816::SEP &&
Back->getNumOperands() >= 1 &&
Back->getOperand(0).isImm() &&
Back->getOperand(0).getImm() == kMBit) {
OuterSep = &*Back;
break;
}
if (Back->isCall() || Back->isBranch() || Back->isInlineAsm())
break;
}
if (!OuterSep) continue;
Lda->removeFromParent();
MBB.insert(OuterSep->getIterator(), Lda);
Changed = true;
}
}
SmallVector<MachineInstr *, 8> Toggles;
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
if (Opc == W65816::REP || Opc == W65816::SEP)
Toggles.push_back(&MI);
}
SmallPtrSet<MachineInstr *, 8> Erased;
for (MachineInstr *First : Toggles) {
if (Erased.count(First)) continue;
// The next non-debug instruction must be the matching opposite
// toggle with the same imm.
auto It = std::next(First->getIterator());
while (It != MBB.end() && It->isDebugInstr()) ++It;
if (It == MBB.end()) continue;
MachineInstr &Next = *It;
// Look for REP-then-SEP or SEP-then-REP with matching imm.
unsigned FirstOpc = First->getOpcode();
unsigned WantOpc = (FirstOpc == W65816::REP) ? W65816::SEP : W65816::REP;
int FirstImm = getSepRepImm(*First, FirstOpc);
int NextImm = getSepRepImm(Next, WantOpc);
if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue;
Erased.insert(First);
Erased.insert(&Next);
First->eraseFromParent();
Next.eraseFromParent();
Changed = true;
}
// Extended toggle coalesce — REP/SEP scheduling.
//
// Walk the MBB looking for `T1 ; ...neutral... ; T2` where T1 and
// T2 are opposite-polarity SEP/REP toggles (T1=REP T2=SEP, or
// vice versa) with the same imm, and the gap contains only
// M-mode-neutral instructions (transfers/branches/X-flag-only
// index ops). In that case T1+T2 form a no-op pair around code
// that doesn't care about M, so both can be dropped. Equivalent
// to "moving the SEP/REP wrap inward to skip the neutral region".
//
// Saves 4 bytes / 6 cycles per gap collapsed (one REP plus one SEP,
// each 2 bytes / 3 cycles). The common trigger is two STA8 stores
// separated by an LDY for the second store's address — STA8fi each
// emit SEP/STA/REP, the existing adjacent coalesce can't see across
// the LDY, this pass can.
{
// Mode-neutral instruction set: don't touch the M-bit and
// don't depend on A's width. X-flag dependent ops (LDX/LDY/
// STX/STY/INX/DEX/INY/DEY/CPX/CPY/PHX/PHY/PLX/PLY) are
// independent of M. So are all branches, JMP/JSR/JSL/RTL/RTS,
// CLC/SEC/CLI/SEI/CLD/SED/CLV, NOP, and PHP/PLP (they push
// 8-bit P regardless of M).
auto isMNeutral = [](const MachineInstr &MI) -> bool {
if (MI.isDebugInstr()) return true;
if (MI.isBranch() || MI.isReturn()) return true;
unsigned O = MI.getOpcode();
switch (O) {
case W65816::LDX_Imm16: case W65816::LDX_DP: case W65816::LDX_Abs:
case W65816::LDX_DPY: case W65816::LDX_AbsY:
case W65816::LDY_Imm16: case W65816::LDY_DP: case W65816::LDY_Abs:
case W65816::LDY_DPX: case W65816::LDY_AbsX:
case W65816::STX_DP: case W65816::STX_Abs: case W65816::STX_DPY:
case W65816::STY_DP: case W65816::STY_Abs: case W65816::STY_DPX:
case W65816::INX: case W65816::DEX:
case W65816::INY: case W65816::DEY:
case W65816::CPX_Imm16: case W65816::CPX_DP: case W65816::CPX_Abs:
case W65816::CPY_Imm16: case W65816::CPY_DP: case W65816::CPY_Abs:
case W65816::PHX: case W65816::PHY:
case W65816::PLX: case W65816::PLY:
case W65816::CLC: case W65816::SEC:
case W65816::PHP: case W65816::PLP:
case W65816::NOP:
return true;
default:
return false;
}
};
bool again = true;
while (again) {
again = false;
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
unsigned Op1 = It->getOpcode();
if (Op1 != W65816::REP && Op1 != W65816::SEP) continue;
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
int Imm1 = It->getOperand(0).getImm();
if (Imm1 != kMBit) continue; // M-bit only
// Walk forward across mode-neutral ops looking for the matching
// opposite toggle. Bail at calls, asm, ALU ops on A, etc.
unsigned WantOp = (Op1 == W65816::REP) ? W65816::SEP : W65816::REP;
auto Walker = std::next(It);
MachineInstr *Match = nullptr;
while (Walker != MBB.end()) {
if (Walker->isDebugInstr()) { ++Walker; continue; }
unsigned WO = Walker->getOpcode();
if (WO == WantOp && Walker->getNumOperands() >= 1 &&
Walker->getOperand(0).isImm() &&
Walker->getOperand(0).getImm() == Imm1) {
Match = &*Walker;
break;
}
// Bail on anything that touches A or otherwise cares about M.
if (Walker->isCall() || Walker->isInlineAsm()) break;
if (!isMNeutral(*Walker)) break;
++Walker;
}
if (!Match) continue;
// Drop both toggles. Erasing changes iterator stability; restart.
MachineInstr *T1 = &*It;
T1->eraseFromParent();
Match->eraseFromParent();
Changed = true;
again = true;
break;
}
}
}
// Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm)
// into INA/DEA chains when the carry flag they would set is unused.
// ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it
// here BEFORE the AsmPrinter expansion runs. But this pass runs at
// pre-emit, AFTER post-RA pseudo expansion. ADCi16imm survives
// because its MCInst lowering is in W65816AsmPrinter (not in the
// generic post-RA pseudo expander), so it's still in the MIR here.
Changed |= foldImmAdcToInaDea(MBB, TII);
// PHI-copy hoist.
{
auto isStaLike = [](const MachineInstr &MI) {
unsigned O = MI.getOpcode();
return O == W65816::STA_StackRel || O == W65816::STZ_DP ||
O == W65816::STZ_Abs;
};
auto isLdaSR = [](const MachineInstr &MI) {
return MI.getOpcode() == W65816::LDA_StackRel;
};
// Accept LDA_Imm16 (MC) AND LDAi16imm (pseudo) inside the wrap —
// both are flag-clobbering A-loads of a 16-bit immediate, with
// no stack-rel offset to bump-undo and no memory operand to
// alias-check against the gap. Common in init blocks: `lda #0 ;
// sta slot,s` wrapped around the loop pre-test. Some functions
// still carry the pseudo LDAi16imm at SepRepCleanup time (post-RA
// pseudo expansion didn't lower it), so accept both spellings.
auto isImmLoad = [](const MachineInstr &MI) {
unsigned O = MI.getOpcode();
return O == W65816::LDA_Imm16 || O == W65816::LDAi16imm;
};
auto isFlagPreservingMem = [&](const MachineInstr &MI) {
return isStaLike(MI) || isLdaSR(MI) || isImmLoad(MI);
};
auto isLdaCount = [&](const MachineInstr &MI) {
return isLdaSR(MI) || isImmLoad(MI);
};
auto It = MBB.begin();
while (It != MBB.end()) {
if (It->getOpcode() != W65816::PHP) { ++It; continue; }
auto Php = It;
// Walk forward: collect LDA/STA pairs, stop at PLP.
auto Walker = std::next(Php);
SmallVector<MachineInstr *, 8> Block;
SmallSet<int64_t, 8> ReadSlots; // post-unbump slots (effective)
SmallSet<int64_t, 8> WriteSlots; // post-unbump slots (effective)
bool ok = true;
while (Walker != MBB.end()) {
if (Walker->isDebugInstr()) { ++Walker; continue; }
if (Walker->getOpcode() == W65816::PLP) break;
if (!isFlagPreservingMem(*Walker)) { ok = false; break; }
// Track stack-rel slots so we can check the gap below.
// Immediate loads have no stack-rel addr — skip.
// In-wrap LDA_StackRel / STA_StackRel slots are BUMPED by +1
// to compensate for PHP's S-decrement; on hoist out of the
// wrap we un-bump them. Record the POST-UNBUMP (effective)
// slot here so the gap conflict-check uses the addresses
// these ops will actually access in their new position.
// Without this, an outside-wrap LDA at slot N would not
// conflict with an in-wrap STA at slot N+1 even though the
// un-bumped STA writes the SAME memory address as the LDA
// reads — corrupting flag-test data flow. (bsearch's i32
// `lo < hi` termination compare under TTI-driven less-
// aggressive inlining: hoisting STA 6 -> STA 5 above LDA 5
// re-reads the just-overwritten value.)
unsigned WOpc = Walker->getOpcode();
bool isBumpedSR = (WOpc == W65816::LDA_StackRel ||
WOpc == W65816::STA_StackRel);
if (!isImmLoad(*Walker) &&
Walker->getNumOperands() >= 1 &&
Walker->getOperand(0).isImm()) {
int64_t off = Walker->getOperand(0).getImm();
int64_t effOff = isBumpedSR ? off - 1 : off;
if (isLdaSR(*Walker)) ReadSlots.insert(effOff);
else WriteSlots.insert(effOff);
}
Block.push_back(&*Walker);
++Walker;
}
if (!ok || Walker == MBB.end()) { ++It; continue; }
auto Plp = Walker;
// Trailing flag-preservers after PLP (STA/STZ only). These
// already live OUTSIDE the wrap so their slot operand is the
// effective (unbumped) value — no -1 adjustment.
auto Tail = std::next(Plp);
SmallVector<MachineInstr *, 4> Trailing;
while (Tail != MBB.end()) {
if (Tail->isDebugInstr()) { ++Tail; continue; }
if (!isStaLike(*Tail)) break;
if (Tail->getNumOperands() >= 1 && Tail->getOperand(0).isImm()) {
WriteSlots.insert(Tail->getOperand(0).getImm());
}
Trailing.push_back(&*Tail);
++Tail;
}
// Pair check: the wrap structure is a sequence of LDA-STA
// memory-to-memory PHI copies, where the FINAL STA may live
// outside the wrap (as Trailing) because STA doesn't clobber
// flags. Count LDAs in Block vs total STAs (Block + Trailing).
// If they're not equal, some LDA's $a-output is a register-
// live-out PHI value (consumed by a back-edge successor's
// first STA, e.g. the vararg `sta 0x5, s` pattern). Hoisting
// it earlier would lose the value.
unsigned NLda = 0, NSta = 0;
for (MachineInstr *MI : Block) {
if (isLdaCount(*MI)) ++NLda;
else if (isStaLike(*MI)) ++NSta;
}
NSta += Trailing.size();
if (NLda != NSta) { ++It; continue; }
// Even with paired LDA-STA, the LAST LDA's $a value can still
// be consumed downstream — by a successor's first STA — making
// it a fall-through register-PHI. If $a is live-out at MBB
// end (any successor has $a as live-in), bail. Caught by
// sumTable, where `lda #0` (wrap) feeds A into bb.2's `sta 0x1,
// s`, with `sta 0x9, s` (trailing) just happening to also store
// the same A — the pair count balances but A is still live-out.
bool aLiveOut = false;
for (MachineBasicBlock *Succ : MBB.successors()) {
if (Succ->isLiveIn(W65816::A)) { aLiveOut = true; break; }
}
if (aLiveOut) { ++It; continue; }
// Walk backward from PHP to find the hoist insertion point.
// The hoisted block clobbers $a and $p (LDA writes both).
// Skip insts that USE $a (consumer of an earlier $a producer)
// or that DEFINE $p (flag-setter — its $p output will be
// re-established by the same flag-setter). Stop at a pure A
// producer (defines $a, doesn't use $a).
//
// Also bail if any in-gap inst writes a slot we read or reads
// a slot we write (in-gap reads of our writes would observe
// a stale value after hoist; in-gap writes to our reads would
// produce a different value if hoisted before).
auto isStackRelIndYRead = [](unsigned O) {
switch (O) {
case W65816::LDA_StackRelIndY:
case W65816::ADC_StackRelIndY:
case W65816::SBC_StackRelIndY:
case W65816::CMP_StackRelIndY:
case W65816::AND_StackRelIndY:
case W65816::ORA_StackRelIndY:
case W65816::EOR_StackRelIndY:
case W65816::STA_StackRelIndY:
return true;
}
return false;
};
auto Back = Php;
if (Back == MBB.begin()) { ++It; continue; }
--Back;
bool gapOK = true;
while (true) {
while (Back != MBB.begin() && Back->isDebugInstr()) --Back;
if (Back->isDebugInstr()) { gapOK = false; break; }
// Slot conflict check.
unsigned BO = Back->getOpcode();
if ((BO == W65816::STA_StackRel || BO == W65816::STZ_DP ||
BO == W65816::STZ_Abs) &&
Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
int64_t off = Back->getOperand(0).getImm();
if (ReadSlots.count(off)) { gapOK = false; break; }
}
if (BO == W65816::LDA_StackRel &&
Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
int64_t off = Back->getOperand(0).getImm();
if (WriteSlots.count(off)) { gapOK = false; break; }
}
// *_StackRelIndY ops use their slot operand AS A POINTER for
// the `(d,S),Y` deref. Hoisting a STA WriteSlot above an
// IndY use of that slot changes which value the IndY reads
// through. Forbid the hoist in that case. Caught by Layer 2
// ptr32 sumByteToZero loop: PHP-wrapped `LDA stack.3, 1; STA
// stack.4` was being hoisted across `LDA_StackRelIndY stack.4`,
// making the deref use stack.3's NEW value instead of the
// LAGGED stack.4 value — off-by-one summing the byte stream.
if (isStackRelIndYRead(BO) &&
Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
int64_t off = Back->getOperand(0).getImm();
if (WriteSlots.count(off)) { gapOK = false; break; }
}
// Bail on call / branch / asm.
if (Back->isCall() || Back->isBranch() ||
Back->isReturn() || Back->isInlineAsm()) {
gapOK = false; break;
}
bool usesA = false;
bool defsA = false;
for (const MachineOperand &MO : Back->operands()) {
if (MO.isReg() && MO.getReg() == W65816::A) {
if (MO.isUse()) usesA = true;
if (MO.isDef()) defsA = true;
}
}
if (defsA && !usesA) break; // Pure A producer found.
if (Back == MBB.begin()) { gapOK = false; break; }
--Back;
}
if (!gapOK) { ++It; continue; }
// Hoist: move Block and Trailing to before Back. Undo the
// +1 stack-rel bump on Block's in-wrap memory ops; Trailing
// stays AS-IS (it was already outside the wrap and never
// bumped).
for (MachineInstr *MI : Block) {
// All ops in Block matched isFlagPreservingMem, so they're
// LDA_StackRel/STA_StackRel/STZ_DP/STZ_Abs. LDA_StackRel
// and STA_StackRel use operand 0 as the disp; that's the
// bumped one. STZ_DP/STZ_Abs aren't stack-rel — no bump.
unsigned MOpc = MI->getOpcode();
if (MOpc == W65816::LDA_StackRel || MOpc == W65816::STA_StackRel) {
if (MI->getNumOperands() >= 1 && MI->getOperand(0).isImm()) {
int64_t v = MI->getOperand(0).getImm();
MI->getOperand(0).setImm(v - 1);
}
}
MI->removeFromParent();
MBB.insert(Back, MI);
}
for (MachineInstr *MI : Trailing) {
MI->removeFromParent();
MBB.insert(Back, MI);
}
Php->eraseFromParent();
Plp->eraseFromParent();
Changed = true;
// Restart iteration from the beginning since we mutated.
It = MBB.begin();
}
}
// Lagged-ptr PHI-copy sink. In strLen / strcpy / sumByteToZero
// loop bodies, the deref reads slot B (the "lagged" PHI value)
// while slot A holds the just-incremented iter. At end of body,
// a PHP/PLP-wrapped `LDA slot A ; STA slot B` propagates the new
// iter to slot B for next iter. The wrap costs 8 cyc/iter (PHP +
// PLP) plus 8 cyc for the LDA/STA pair.
//
// Equivalent rewrite: at the start of the body, BEFORE the
// iter++, A already holds slot A's OLD value (loaded for the
// INA). Insert `STA slot B` THERE — it copies OLD iter to slot
// B, matching the lagged semantic. Slot B is no longer touched
// at end of body, so the PHP/PLP wrap (+ its LDA/PLP/STA tail)
// can be erased. Net: -11 cyc/iter on strLen (44 chars → -484
// cyc / -20%).
//
// Pattern at end of MBB (immediately before terminator):
// ANDi #imm ; flag-setter
// PHP
// LDA_StackRel SrcOff ; reload iter NEW (SrcOff is
// PHP-bumped: actually =
// IterSlotOff + 1)
// PLP
// STA_StackRel DstOff ; slot B = iter NEW
// Bxx ... ; conditional branch
//
// Earlier in MBB:
// LDA_StackRel IterSlotOff ; A = OLD iter
// INA_PSEUDO (or ADCi16imm 1) ; iter++
// STA_StackRel IterSlotOff ; iter = NEW
//
// Rewrite: insert `STA_StackRel DstOff` right after the LDA
// (between LDA and INA). Erase the PHP/LDA/PLP/STA + the
// ANDi-after-PHP wrap entirely. The ANDi at the front is kept
// since it's also the BNE's flag source.
{
auto isCondBranch = [](const MachineInstr &MI) {
unsigned O = MI.getOpcode();
return O == W65816::BNE || O == W65816::BEQ ||
O == W65816::BCC || O == W65816::BCS ||
O == W65816::BMI || O == W65816::BPL ||
O == W65816::BVC || O == W65816::BVS;
};
auto isFlagSetter = [](const MachineInstr &MI) {
unsigned O = MI.getOpcode();
return O == W65816::ANDi16imm || O == W65816::ANDi8imm ||
O == W65816::ORAi16imm || O == W65816::EORi16imm;
};
// Find Bxx terminator.
MachineInstr *Bxx = nullptr;
for (auto It = MBB.rbegin(); It != MBB.rend(); ++It) {
if (isCondBranch(*It)) { Bxx = &*It; break; }
if (It->isBranch()) break; // BRA etc. — skip past it
}
if (!Bxx) goto skip_lagged_sink;
{
// Walk backward from Bxx to find STA, PLP, LDA, PHP.
auto It2 = MachineBasicBlock::iterator(Bxx);
if (It2 == MBB.begin()) goto skip_lagged_sink;
--It2; // first non-branch
if (It2->getOpcode() != W65816::STA_StackRel ||
!It2->getOperand(0).isImm()) goto skip_lagged_sink;
MachineInstr *FinalSta = &*It2;
int64_t DstOff = FinalSta->getOperand(0).getImm();
if (It2 == MBB.begin()) goto skip_lagged_sink;
--It2;
if (It2->getOpcode() != W65816::PLP) goto skip_lagged_sink;
MachineInstr *Plp2 = &*It2;
if (It2 == MBB.begin()) goto skip_lagged_sink;
--It2;
if (It2->getOpcode() != W65816::LDA_StackRel ||
!It2->getOperand(0).isImm()) goto skip_lagged_sink;
MachineInstr *InnerLda = &*It2;
int64_t SrcOff = InnerLda->getOperand(0).getImm();
if (It2 == MBB.begin()) goto skip_lagged_sink;
--It2;
if (It2->getOpcode() != W65816::PHP) goto skip_lagged_sink;
MachineInstr *Php2 = &*It2;
if (It2 == MBB.begin()) goto skip_lagged_sink;
--It2;
if (!isFlagSetter(*It2)) goto skip_lagged_sink;
// The PHP-bumped SrcOff is the IterSlotOff + 1.
int64_t IterSlotOff = SrcOff - 1;
// Now find the iter++ sequence earlier in MBB: LDA IterSlotOff;
// INA_PSEUDO; STA IterSlotOff.
MachineInstr *IterLda = nullptr;
MachineInstr *IterIna = nullptr;
MachineInstr *IterSta = nullptr;
for (auto Walk = MBB.begin(); Walk != MachineBasicBlock::iterator(Php2); ++Walk) {
if (Walk->getOpcode() != W65816::LDA_StackRel) continue;
if (!Walk->getOperand(0).isImm() ||
Walk->getOperand(0).getImm() != IterSlotOff) continue;
auto N1 = std::next(Walk);
while (N1 != MBB.end() && N1->isDebugInstr()) ++N1;
if (N1 == MBB.end()) continue;
if (N1->getOpcode() != W65816::INA_PSEUDO &&
N1->getOpcode() != W65816::ADCi16imm) continue;
auto N2 = std::next(N1);
while (N2 != MBB.end() && N2->isDebugInstr()) ++N2;
if (N2 == MBB.end()) continue;
if (N2->getOpcode() != W65816::STA_StackRel) continue;
if (!N2->getOperand(0).isImm() ||
N2->getOperand(0).getImm() != IterSlotOff) continue;
IterLda = &*Walk;
IterIna = &*N1;
IterSta = &*N2;
break;
}
if (!IterLda) goto skip_lagged_sink;
// Safety: make sure DstOff isn't written between IterLda and
// the IndY use of DstOff. Walk forward from IterLda looking
// for STA DstOff (other than our FinalSta) — if found, bail.
for (auto Walk = std::next(MachineBasicBlock::iterator(IterSta));
Walk != MachineBasicBlock::iterator(Php2); ++Walk) {
if (Walk->getOpcode() == W65816::STA_StackRel &&
Walk->getOperand(0).isImm() &&
Walk->getOperand(0).getImm() == DstOff) {
goto skip_lagged_sink;
}
}
// Apply: insert STA_StackRel DstOff right after IterLda,
// BEFORE INA.
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
DebugLoc DL = IterLda->getDebugLoc();
BuildMI(MBB, std::next(MachineBasicBlock::iterator(IterLda)),
DL, TII->get(W65816::STA_StackRel))
.addImm(DstOff)
.addReg(W65816::A, RegState::Implicit);
// Erase PHP, InnerLda, PLP, FinalSta.
Php2->eraseFromParent();
InnerLda->eraseFromParent();
Plp2->eraseFromParent();
FinalSta->eraseFromParent();
Changed = true;
}
skip_lagged_sink:;
}
// i32 += i32 store-bypass. Regalloc materializes the call result
// (A=lo, X=hi) into Wide32 spill slots before the add, then reads
// them back — emitting 4 instructions of redundant store/reload:
//
// STA_StackRel slotA ; A (mul.lo) -> slotA
// TXA ; A = X = mul.hi
// STA_StackRel slotB ; mul.hi -> slotB
// LDA_StackRel slotA ; reload mul.lo <-- redundant
// CLC
// ADC_StackRel slotC ; mul.lo + total.lo
// STA_StackRel slotA ; sum-lo
// LDA_StackRel slotB ; reload mul.hi <-- redundant
// ADC_StackRel slotD ; mul.hi + total.hi + C
// STA_StackRel slotB ; sum-hi
//
// Reorder to do the lo-add directly off A and the hi-add directly
// off X (via TXA preserving carry):
//
// CLC
// ADC_StackRel slotC ; A = mul.lo + total.lo
// STA_StackRel slotA ; sum-lo
// TXA ; A = X = mul.hi (C preserved)
// ADC_StackRel slotD ; A = mul.hi + total.hi + C
// STA_StackRel slotB ; sum-hi
//
// 10 -> 6 inst. Saves 4 inst / ~13 cyc per i32-add-of-call-result
// site. Hits the sumOfSquares loop and any total += __umulhisi3
// pattern.
{
  // i32 += i32 store-bypass rewrite. Scans the block for the 10-instruction
  // spill/reload/add sequence
  //
  //   STA sA ; TXA ; STA sB ; LDA sA ; CLC ; ADC sC ; STA sA ;
  //   LDA sB ; ADC sD ; STA sB
  //
  // (debug instructions may be interleaved) and replaces it with
  //
  //   CLC ; ADC sC ; STA sA ; TXA ; ADC sD ; STA sB

  // True iff `MI` has opcode `Opc` and an immediate first operand; on
  // success the immediate (the stack-relative offset) is written to `Off`.
  auto matchSR = [](MachineInstr &MI, unsigned Opc, int64_t &Off) {
    if (MI.getOpcode() != Opc) return false;
    if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
    Off = MI.getOperand(0).getImm();
    return true;
  };
  // Returns the instruction following `I`, skipping debug instructions;
  // MBB.end() if the block runs out first.
  auto nextReal = [&](MachineBasicBlock::iterator I) {
    ++I;
    while (I != MBB.end() && I->isDebugInstr()) ++I;
    return I;
  };

  auto It = MBB.begin();
  while (It != MBB.end()) {
    auto Cur = It;
    int64_t slotA = 0, slotB = 0, slotC = 0, slotD = 0;
    // Step 1: STA_StackRel slotA
    if (!matchSR(*Cur, W65816::STA_StackRel, slotA)) { ++It; continue; }
    // Step 2: TXA
    auto P2 = nextReal(Cur);
    if (P2 == MBB.end() || P2->getOpcode() != W65816::TXA) { ++It; continue; }
    // Step 3: STA_StackRel slotB (a different slot from slotA)
    auto P3 = nextReal(P2);
    if (P3 == MBB.end() || !matchSR(*P3, W65816::STA_StackRel, slotB)) {
      ++It; continue;
    }
    if (slotA == slotB) { ++It; continue; }
    // Step 4: LDA_StackRel slotA (the redundant reload of the low half)
    auto P4 = nextReal(P3);
    int64_t lreloadA = 0;
    if (P4 == MBB.end() || !matchSR(*P4, W65816::LDA_StackRel, lreloadA) ||
        lreloadA != slotA) {
      ++It; continue;
    }
    // Step 5: CLC
    auto P5 = nextReal(P4);
    if (P5 == MBB.end() || P5->getOpcode() != W65816::CLC) {
      ++It; continue;
    }
    // Step 6: ADC_StackRel slotC
    auto P6 = nextReal(P5);
    if (P6 == MBB.end() || !matchSR(*P6, W65816::ADC_StackRel, slotC)) {
      ++It; continue;
    }
    // Step 7: STA_StackRel slotA (sum-lo written back)
    auto P7 = nextReal(P6);
    int64_t outA = 0;
    if (P7 == MBB.end() || !matchSR(*P7, W65816::STA_StackRel, outA) ||
        outA != slotA) {
      ++It; continue;
    }
    // Step 8: LDA_StackRel slotB (the redundant reload of the high half)
    auto P8 = nextReal(P7);
    int64_t lreloadB = 0;
    if (P8 == MBB.end() || !matchSR(*P8, W65816::LDA_StackRel, lreloadB) ||
        lreloadB != slotB) {
      ++It; continue;
    }
    // Step 9: ADC_StackRel slotD
    auto P9 = nextReal(P8);
    if (P9 == MBB.end() || !matchSR(*P9, W65816::ADC_StackRel, slotD)) {
      ++It; continue;
    }
    // Step 10: STA_StackRel slotB (sum-hi written back)
    auto P10 = nextReal(P9);
    int64_t outB = 0;
    if (P10 == MBB.end() || !matchSR(*P10, W65816::STA_StackRel, outB) ||
        outB != slotB) {
      ++It; continue;
    }
    // All 10 matched and slotA != slotB. Require all four slots to be
    // pairwise distinct before rewriting. The rewrite drops the initial
    // spills, so an ADC operand that aliases a spill slot would observe
    // a different value than in the original sequence:
    //   - slotC == slotA: original reads the just-spilled mul.lo.
    //   - slotC == slotB: original reads the just-spilled mul.hi, while
    //     the rewritten `ADC slotC` runs before any store to slotB.
    //   - slotD == slotB: original reads the just-spilled mul.hi.
    // slotD == slotA and slotC == slotD are rejected as well so the
    // guard is simply "all distinct".
    // NOTE(review): only exact offset equality is checked. If these
    // accesses are two bytes wide, offsets that differ by one would also
    // overlap — confirm slots are never allocated that way.
    if (slotC == slotA || slotC == slotB ||
        slotD == slotA || slotD == slotB ||
        slotC == slotD) {
      ++It; continue;
    }
    // Rewrite: emit CLC ; ADC slotC ; STA slotA ; TXA ; ADC slotD ;
    // STA slotB in front of the first matched instruction (Cur), then
    // erase steps 1-10.
    DebugLoc DL = Cur->getDebugLoc();
    BuildMI(MBB, Cur, DL, TII.get(W65816::CLC));
    BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel))
        .addImm(slotC);
    BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel))
        .addImm(slotA);
    BuildMI(MBB, Cur, DL, TII.get(W65816::TXA));
    BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel))
        .addImm(slotD);
    BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel))
        .addImm(slotB);
    // Advance It past the matched pattern before erasing (so we
    // don't iterate through deleted insts).
    It = std::next(P10);
    // Erase the 10 originals. Debug instructions that sat between them
    // are left in place.
    Cur->eraseFromParent(); P2->eraseFromParent();
    P3->eraseFromParent(); P4->eraseFromParent();
    P5->eraseFromParent(); P6->eraseFromParent();
    P7->eraseFromParent(); P8->eraseFromParent();
    P9->eraseFromParent(); P10->eraseFromParent();
    Changed = true;
  }
}
// Dead TAX / TXA elimination. STAfi declares `Defs = [A]` as a
// safe over-approximation (eliminateFrameIndex emits a PHA-bracketed
// sequence when the source is IMG-class). Regalloc honors that by
// inserting `TAX ; ...STAfi... ; TXA` brackets around STAfi that
// SOURCES from A — but in the A-source path A is preserved. The
// TXA's output gets clobbered immediately by the next LDA*, so the
// TXA is dead; once TXA is gone, the TAX's X-value has no consumer
// and is dead too. This pattern recurs once per i32-spill site.
//
// Conservative: only elide TXA if the IMMEDIATE next non-debug
// instruction defines $a without reading $a or $p. That guarantees
// no flag-reader sits between the TXA and the A-define. The same
// logic applies to TYA.
//
// For TAX: elide if no instruction between TAX and the next $x def
// reads $x (and we can prove the original X had no live consumer).
// Done as a fixed-point: keep iterating until no change.
// Returns true when any operand of `MI` (explicit or implicit) is a
// register operand that defines `Reg`.
auto definesReg = [](const MachineInstr &MI, unsigned Reg) -> bool {
  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx != End; ++Idx) {
    const MachineOperand &Opnd = MI.getOperand(Idx);
    if (!Opnd.isReg())
      continue;
    if (Opnd.getReg() == Reg && Opnd.isDef())
      return true;
  }
  return false;
};
// Returns true when any operand of `MI` (explicit or implicit) is a
// register operand that uses `Reg`.
auto readsReg = [](const MachineInstr &MI, unsigned Reg) -> bool {
  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx != End; ++Idx) {
    const MachineOperand &Opnd = MI.getOperand(Idx);
    if (!Opnd.isReg())
      continue;
    if (Opnd.getReg() == Reg && Opnd.isUse())
      return true;
  }
  return false;
};
// Fixed-point driver for dead-transfer elimination: repeat both passes
// until neither removes anything, because deleting a TXA can make an
// earlier TAX dead (and vice versa).
bool again2 = true;
while (again2) {
  again2 = false;
  // Pass A: dead TXA / TYA. The transfer is dropped when the very next
  // non-debug instruction overwrites $a without looking at $a or $p.
  // NOTE(review): this does not require Next to redefine $p as well. If
  // some A-defining instruction leaves N/Z untouched, the TXA/TYA flags
  // could still be observed later — confirm against the instruction
  // definitions.
  for (auto It = MBB.begin(); It != MBB.end(); ) {
    unsigned O = It->getOpcode();
    if (O != W65816::TXA && O != W65816::TYA) { ++It; continue; }
    auto Next = std::next(It);
    while (Next != MBB.end() && Next->isDebugInstr()) ++Next;
    if (Next == MBB.end()) { ++It; continue; }
    // Next must define $a unconditionally, and must not read $a
    // (since we're about to discard the TXA-defined A) and must
    // not be a call / branch / inline asm (which conservatively
    // read $a).
    if (Next->isCall() || Next->isBranch() ||
        Next->isReturn() || Next->isInlineAsm()) {
      ++It; continue;
    }
    if (!definesReg(*Next, W65816::A)) { ++It; continue; }
    if (readsReg(*Next, W65816::A)) { ++It; continue; }
    // P (flags) liveness: TXA/TYA set N/Z. If Next reads P, we'd
    // be discarding the flags it expects. Bxx and friends read P.
    // Conservative: also require Next does not read $p.
    if (readsReg(*Next, W65816::P)) { ++It; continue; }
    // Step the iterator off the instruction before deleting it.
    auto Dead = It++;
    Dead->eraseFromParent();
    Changed = true;
    again2 = true;
  }
  // Pass B: dead TAX / TAY
  for (auto It = MBB.begin(); It != MBB.end(); ) {
    unsigned O = It->getOpcode();
    unsigned Target;
    if (O == W65816::TAX) Target = W65816::X;
    else if (O == W65816::TAY) Target = W65816::Y;
    else { ++It; continue; }
    // Walk forward. TAX/TAY is dead if every use of Target is
    // preceded by a redefinition of Target (and the in-MBB region
    // between has no flag-reader that consumes TAX's N/Z). At MBB
    // end, check successor live-ins: if none has Target as live-in
    // it's also dead.
    //
    // Flag liveness: TAX defines $p (N/Z). A later $p-reader only
    // consumes TAX's flags if no intervening instruction REDEFINES
    // $p in the gap. Track `pRedef` to allow common patterns like
    // `TAX ; CLC ; ADC ; ...` where ADC reads $p but the $p it
    // reads is the freshly-CLC'd carry, not TAX's N/Z.
    auto Walker = std::next(It);
    bool deadIt = false;
    bool bailed = false;
    bool pRedef = false;
    while (Walker != MBB.end()) {
      if (Walker->isDebugInstr()) { ++Walker; continue; }
      if (Walker->isCall() || Walker->isInlineAsm()) {
        bailed = true; break;
      }
      // Branch / return: stop walking and rely on successor live-ins —
      // unless the terminator itself consumes what TAX/TAY produced.
      // A conditional branch may read the N/Z flags straight from the
      // transfer, and an indexed indirect jump may read Target; neither
      // shows up in the successors' live-in lists.
      if (Walker->isBranch() || Walker->isReturn()) {
        if (readsReg(*Walker, Target) ||
            (!pRedef && readsReg(*Walker, W65816::P)))
          bailed = true;
        break;
      }
      if (readsReg(*Walker, Target)) { bailed = true; break; }
      if (readsReg(*Walker, W65816::P) && !pRedef) {
        bailed = true; break;
      }
      if (definesReg(*Walker, W65816::P)) pRedef = true;
      if (definesReg(*Walker, Target)) { deadIt = true; break; }
      ++Walker;
    }
    if (bailed) { ++It; continue; }
    if (!deadIt) {
      // Fell through to MBB end / branch. Check successor live-ins.
      bool liveOut = false;
      for (MachineBasicBlock *Succ : MBB.successors()) {
        if (Succ->isLiveIn(Target)) { liveOut = true; break; }
      }
      // Return blocks: $a and $x are the i32 return-value convention.
      // RTL doesn't model these as Uses, but they ARE live at the
      // return. Be conservative — don't elide TAX/TAY before a return.
      if (!MBB.empty() && MBB.back().isReturn()) liveOut = true;
      if (liveOut) { ++It; continue; }
    }
    // Step the iterator off the instruction before deleting it.
    auto Dead = It++;
    Dead->eraseFromParent();
    Changed = true;
    again2 = true;
  }
}
// Third peephole: drop `LDY_Imm16 K` when Y already holds K from
// an earlier LDY in the same MBB and no intervening MI clobbered
// Y. Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY,
// even though Y already holds 0 from a previous emit — the
// redundant LDYs survive MachineLICM because Y is a phys reg and
// the inserter binds them tightly to each use.
// Redundant LDY_Imm16 elimination. Scans the block once, tracking the
// 16-bit immediate most recently loaded into Y by LDY_Imm16; a later
// LDY_Imm16 of the same value is erased as long as nothing in between
// may have changed Y.
int yKnown = -1; // -1 means unknown; otherwise the immediate
auto It2 = MBB.begin();
while (It2 != MBB.end()) {
  MachineInstr &MI = *It2;
  if (MI.isDebugInstr()) { ++It2; continue; }
  unsigned Op = MI.getOpcode();
  if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
      MI.getOperand(0).isImm()) {
    int K = MI.getOperand(0).getImm() & 0xFFFF;
    if (yKnown == K) {
      // Before erasing this redundant LDY: the prior LDY is still in
      // scope, so all of its Y-uses between the two LDYs are still
      // valid uses. But liveness already marked the LAST one (just
      // before the redundant LDY) as `implicit killed $y`, because
      // that LDY was about to redefine Y. After erasure, Y survives
      // through to the NEXT use, so the prior "kill" annotation is
      // wrong and the machine verifier rejects. Walk backward and
      // clear the kill flag on the most recent Y-using operand.
      //
      // The walk stops at the first instruction that defines Y: that
      // is the load whose value now stays live, so kill flags before
      // it belong to an older value and must be left alone. If that
      // def was flagged dead (no use before the erased LDY), the flag
      // is cleared too, since the value is now carried forward.
      // yKnown == K implies an earlier LDY in this block, so It2 is
      // never MBB.begin() here and std::prev is safe.
      for (auto Back = std::prev(It2);; --Back) {
        bool stopWalk = false;
        for (MachineOperand &MO : Back->operands()) {
          if (MO.isReg() && MO.getReg() == W65816::Y) {
            if (MO.isUse() && MO.isKill()) {
              MO.setIsKill(false);
              stopWalk = true;
            } else if (MO.isDef()) {
              if (MO.isDead())
                MO.setIsDead(false);
              stopWalk = true;
            }
          }
        }
        if (stopWalk) break;
        if (Back == MBB.begin()) break;
      }
      // Step the iterator off the instruction before deleting it.
      auto Erase = It2++;
      Erase->eraseFromParent();
      Changed = true;
      continue;
    }
    yKnown = K;
  } else {
    // Conservatively invalidate yKnown on anything that touches Y
    // or on calls / inline asm / any instruction that doesn't have
    // a clean "no Y effect" guarantee. Cheaper to underclaim than
    // miscompile.
    switch (Op) {
    case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown
    case W65816::STAfi_indY:
    case W65816::LDA_StackRelIndY:
    case W65816::STA_StackRelIndY:
      break;
    case W65816::TAY: case W65816::TXY:
    case W65816::INY: case W65816::DEY:
    case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
    case W65816::LDY_DPX: case W65816::LDY_AbsX:
      yKnown = -1; break;
    default:
      // Calls and inline asm may clobber Y without saying so. Any other
      // opcode that carries a Y def operand (explicit or implicit) is
      // also treated as a clobber, so Y-writing instructions missing
      // from the list above cannot slip through.
      if (MI.isCall() || MI.isInlineAsm() ||
          definesReg(MI, W65816::Y))
        yKnown = -1;
      break;
    }
  }
  ++It2;
}
}
// Three prototype peepholes were tried here and removed once shown
// to regress benchmarks; design notes in
// feedback_close_gap_attempts_round2.md / feedback_cmp_zero_elim.md:
// - PHI store-forwarding (CRC32 regression / memmove safety hole).
// - Redundant CMP #0 elimination (VLA sum_n carry-flag bookkeeping).
// - Narrow PHI-copy slot collapse (qsort regression).
return Changed;
}