//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice // versa) pairs that toggle the M-bit redundantly. // // The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits // `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When // two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between // them), the post-PEI stream contains: // // SEP #$20 // STA d1, S // REP #$20 <-- toggle // SEP #$20 <-- toggle (cancels above) // STA d2, S // REP #$20 // // The middle REP/SEP pair is a no-op: both stores can run in one M=1 // region. We drop them to leave: // // SEP #$20 // STA d1, S // STA d2, S // REP #$20 // // Saves 2 bytes / 6 cycles per coalesced pair. Symmetric `SEP/REP` // pairs (M=1 then M=0 with nothing in between) are also dropped — they // can arise around inline-asm or hand-written assembly snippets. // // Runs at addPreEmitPass (after PEI has expanded STA8fi). // //===----------------------------------------------------------------------===// #include "W65816.h" #include "W65816InstrInfo.h" #include "W65816Subtarget.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/raw_ostream.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" using namespace llvm; #define DEBUG_TYPE "w65816-sep-rep-cleanup" // W65816 processor status M-bit mask (set/clear via SEP/REP #$20). 
static constexpr int kMBit = 0x20; namespace { class W65816SepRepCleanup : public MachineFunctionPass { public: static char ID; W65816SepRepCleanup() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "W65816 SEP/REP toggle coalescing"; } bool runOnMachineFunction(MachineFunction &MF) override; }; } // namespace char W65816SepRepCleanup::ID = 0; INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE, "W65816 SEP/REP toggle coalescing", false, false) FunctionPass *llvm::createW65816SepRepCleanup() { return new W65816SepRepCleanup(); } // Returns the immediate value of `op` if MI is a `SEP #imm` or `REP #imm`, // else -1. static int getSepRepImm(const MachineInstr &MI, unsigned Opc) { if (MI.getOpcode() != Opc) return -1; if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return -1; return MI.getOperand(0).getImm(); } // Returns true if MI may consume the carry or overflow flag — these // are the flags that ADC/SBC define but INA/DEA don't. Conservative: // any branch that reads C or V counts, plus the chained ADC/SBC ops // that wait for a prior carry-out. Anything else (CMP, CLC, SEC, // LDA, STA, AND, ORA, EOR, etc.) re-defines or doesn't read C/V. static bool readsCarryOrV(const MachineInstr &MI) { switch (MI.getOpcode()) { case W65816::BCS: // reads C case W65816::BCC: // reads C case W65816::BVS: // reads V case W65816::BVC: // reads V case W65816::ADC_StackRel: // reads C as carry-in case W65816::ADC_Imm16: case W65816::ADC_Imm8: case W65816::ADC_DP: case W65816::ADC_Abs: case W65816::SBC_StackRel: case W65816::SBC_Imm16: case W65816::SBC_Imm8: case W65816::SBC_DP: case W65816::SBC_Abs: // Chained-carry pseudos. These run BEFORE AsmPrinter expansion so // we must whitelist them explicitly — they're the hi-half of any // multi-precision add/sub and read the lo-half's carry-out. 
Without // these, the INA/DEA peephole below silently rewrites a lo-half // `ADCi16imm src, 2` to `INA; INA` (which DOES NOT set C), breaking // the i32 ADD carry chain. Caught as `arr[0] = arr[1]` writing to // wrong bank under ptr32 because the high half got a stale C. case W65816::ADCEi16imm: case W65816::SBCEi16imm: // The fi/abs/imm forms of ADC/SBC are also pre-AsmPrinter pseudos; // each expands to a real ADC_/SBC_ opcode that reads carry. case W65816::ADCi16imm: // lo-half (CLC + ADC_Imm16) case W65816::SBCi16imm: // lo-half (SEC + SBC_Imm16) case W65816::ADCfi: // chained-carry stack form case W65816::SBCfi: case W65816::ADCEfi: case W65816::SBCEfi: case W65816::ADCabs: case W65816::SBCabs: case W65816::ROL_A: // rotates fold C in case W65816::ROR_A: case W65816::ROL_DP: case W65816::ROL_Abs: case W65816::ROR_DP: case W65816::ROR_Abs: return true; default: return false; } } // Returns true if `Op` is one of the flag-redefining opcodes (CLC, SEC, // CMP*, CPX*, CPY*, REP, SEP) — observing C/V before this is safe. // Includes the pseudo CMP* variants (CMPi16imm etc.) since this peephole // runs at pre-emit, BEFORE the AsmPrinter expands them. static bool isFlagRedefiner(unsigned Op) { switch (Op) { case W65816::CLC: case W65816::SEC: case W65816::CMP_Imm8: case W65816::CMP_Imm16: case W65816::CMP_StackRel: case W65816::CMP_DP: case W65816::CMP_Abs: case W65816::CMPi16imm: case W65816::CMPi8imm: case W65816::CMPfi: case W65816::CMPabs: case W65816::CMP_RR: case W65816::CPX_Imm8: case W65816::CPX_Imm16: case W65816::CPX_DP: case W65816::CPX_Abs: case W65816::CPY_Imm8: case W65816::CPY_Imm16: case W65816::CPY_DP: case W65816::CPY_Abs: case W65816::REP: case W65816::SEP: return true; default: return false; } } // Returns true if a subsequent MI in the same MBB observes the C/V // flags before any flag-redefiner clears the dependency. 
At MBB end, // extends one step into each successor: if any successor's first // (non-debug) MI reads C/V before redefining them, the flag is live // across the edge — bail. This is critical for loop bodies where // the back-edge re-enters the same MBB at LDA/PHA (neither reads C/V), // so a per-iteration `clc; adc #2` is foldable. Cross-MBB carry chains // would normally use ADCEi16imm (not ADCi16imm), so this is safe. static bool carryFlagLiveAfter(MachineBasicBlock::iterator After, MachineBasicBlock &MBB) { // Phase 1: scan within this MBB. for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) { if (Probe->isDebugInstr()) continue; if (readsCarryOrV(*Probe)) return true; if (isFlagRedefiner(Probe->getOpcode())) return false; if (Probe->isCall()) return false; // callee resets flags } // Phase 2: peek into each successor's first few MIs. We BAIL only on // a positive C/V read; reaching MBB end or peek-cap without finding // one is treated as "carry dead" — ADCi16imm's carry-out is never // used in carry chains (those use ADCEi16imm), so a stray carry // floating into RTL or an unrelated arithmetic op causes no harm. const unsigned MaxPeek = 6; for (MachineBasicBlock *Succ : MBB.successors()) { unsigned Peeked = 0; for (auto &MI : *Succ) { if (MI.isDebugInstr()) continue; if (readsCarryOrV(MI)) return true; if (isFlagRedefiner(MI.getOpcode()) || MI.isCall()) break; if (++Peeked >= MaxPeek) break; } } return false; } // Convert `ADCi16imm dst, src, ±1`/`±2` and `SBCi16imm` similarly to // INA / INA;INA / DEA / DEA;DEA chains when C/V are dead. ADCi16imm // is a pseudo that expands to CLC+ADC_Imm16 (4B/5cyc). INA is 1B/2cyc. // Savings per ±1: 3B/3cyc; per ±2: 2B/1cyc. SBCi16imm is symmetric // (sub by N == add by -N), so SBC #1 → DEA, SBC #-1 → INA, etc. 
static bool foldImmAdcToInaDea(MachineBasicBlock &MBB,
                               const W65816InstrInfo &TII) {
  bool Changed = false;

  for (auto Cursor = MBB.begin(); Cursor != MBB.end();) {
    // Step the cursor first: `Here` may be erased below, and instructions
    // inserted in front of it must not be revisited.
    const MachineBasicBlock::iterator Here = Cursor++;

    // +1 for an add, -1 for a subtract; anything else is not a candidate.
    int64_t Sign;
    switch (Here->getOpcode()) {
    case W65816::ADCi16imm:
      Sign = 1;
      break;
    case W65816::SBCi16imm:
      Sign = -1;
      break;
    default:
      continue;
    }

    // Operand 2 carries the immediate; skip anything shaped differently.
    if (Here->getNumOperands() < 3 || !Here->getOperand(2).isImm())
      continue;

    // Interpret the immediate as a signed 16-bit quantity, then fold the
    // add/sub direction into a single signed step count.
    const int64_t Delta =
        Sign * static_cast<int16_t>(Here->getOperand(2).getImm());
    if (Delta == 0 || Delta < -2 || Delta > 2)
      continue;

    // INA/DEA do not produce the carry/overflow the original would have, so
    // the rewrite is only valid when nothing downstream looks at them.
    if (carryFlagLiveAfter(Here, MBB))
      continue;

    const bool Increment = Delta > 0;
    const unsigned StepOpc = Increment ? W65816::INA : W65816::DEA;
    const DebugLoc DL = Here->getDebugLoc();
    for (int64_t Remaining = Increment ? Delta : -Delta; Remaining > 0;
         --Remaining)
      BuildMI(MBB, Here, DL, TII.get(StepOpc));

    Here->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

// DBG_VALUE preservation in this pass:
//
// Every instruction this pass erases falls into one of these classes:
//   * SEP/REP — MCInst-level mode-flag toggles, no value flow.
//   * TAX/TXA/TAY/TYA — register transfers; the source value still
//     exists in A and is followed by an A-redefining instruction that
//     was the reason we identified the transfer as dead.
//   * Redundant LDY_Imm16 — Y already holds the constant.
//   * Redundant ADCi16imm/SBCi16imm rewritten to INA/DEA — same value,
//     fewer cycles.
//   * Lagged-ptr PHI-copy sink — relocates a `STA dst` from end-of-MBB
//     to immediately after the iter-load. The destination slot is
//     written earlier but with the same value at every read point
//     because the iter's OLD value is what flowed through the
//     PHP/PLP-wrapped tail copy.
//   * i32-add store-bypass — reorders 10 instructions to 6 that
//     compute the same lo/hi result into the same destination slots
//     in the same order from the user's point of view.
// // None of these change the user-visible value of a named variable at // any PC where a DBG_VALUE could observe it. Hoisted/moved // instructions write the same data at slightly earlier PCs in their // MBB; a DBG_VALUE between the OLD and NEW write positions could read // a slightly-fresher value (the next-iteration's prefetch instead of // the current iteration's tail), but never a wrong value — the loop // invariant guarantees both values agree at the moved boundary. bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; const auto &STI = MF.getSubtarget(); const auto &TII = *STI.getInstrInfo(); for (MachineBasicBlock &MBB : MF) { // Pre-pass: hoist LDAi8imm out of byte-store SEP/REP wraps. // The post-RA scheduler can move LDAi8imm (which is marked // hasSideEffects=0 at MIR but expands at AsmPrinter to its OWN // SEP+LDA8+REP that toggles M) INSIDE an STBptr inserter's // SEP/REP wrap. When that happens, the LDAi8imm's expansion // REP fires BEFORE the byte STA, leaving the STA in M=16 — the // store becomes a 16-bit zero write, corrupting the byte AFTER // the intended target. Detect the pattern and hoist the // LDAi8imm above the outer SEP. #107 strtok_r BB0_15 was this // exact bug. { SmallVector SepHoists; for (auto It = MBB.begin(); It != MBB.end(); ++It) { if (It->getOpcode() != W65816::SEP) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; if (It->getOperand(0).getImm() != kMBit) continue; // Walk forward looking for LDAi8imm before any STAfi_indY // or REP at this nesting level. auto Walker = std::next(It); MachineInstr *LdaToHoist = nullptr; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } unsigned Opc = Walker->getOpcode(); // Hit a REP — wrap is closing without LDAi8imm inside. if (Opc == W65816::REP) break; // Hit a call / branch / asm — bail. 
if (Walker->isCall() || Walker->isBranch() || Walker->isReturn() || Walker->isInlineAsm()) break; // Hit an STAfi_indY — this is the byte store; an LDAi8imm // before it would be the bug, but if we found one already // we'd have hoisted it; nothing to do here, stop scanning. if (Opc == W65816::STAfi_indY) break; if (Opc == W65816::LDAi8imm) { LdaToHoist = &*Walker; break; } ++Walker; } if (LdaToHoist) SepHoists.push_back(LdaToHoist); } for (MachineInstr *Lda : SepHoists) { // Find the SEP we entered before the LDA. Walk backward. auto Back = Lda->getIterator(); MachineInstr *OuterSep = nullptr; while (Back != MBB.begin()) { --Back; if (Back->isDebugInstr()) continue; if (Back->getOpcode() == W65816::SEP && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm() && Back->getOperand(0).getImm() == kMBit) { OuterSep = &*Back; break; } if (Back->isCall() || Back->isBranch() || Back->isInlineAsm()) break; } if (!OuterSep) continue; Lda->removeFromParent(); MBB.insert(OuterSep->getIterator(), Lda); Changed = true; } } SmallVector Toggles; for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); if (Opc == W65816::REP || Opc == W65816::SEP) Toggles.push_back(&MI); } SmallPtrSet Erased; for (MachineInstr *First : Toggles) { if (Erased.count(First)) continue; // The next non-debug instruction must be the matching opposite // toggle with the same imm. auto It = std::next(First->getIterator()); while (It != MBB.end() && It->isDebugInstr()) ++It; if (It == MBB.end()) continue; MachineInstr &Next = *It; // Look for REP-then-SEP or SEP-then-REP with matching imm. unsigned FirstOpc = First->getOpcode(); unsigned WantOpc = (FirstOpc == W65816::REP) ? 
W65816::SEP : W65816::REP; int FirstImm = getSepRepImm(*First, FirstOpc); int NextImm = getSepRepImm(Next, WantOpc); if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue; Erased.insert(First); Erased.insert(&Next); First->eraseFromParent(); Next.eraseFromParent(); Changed = true; } // Extended toggle coalesce — REP/SEP scheduling. // // Walk the MBB looking for `T1 ; ...neutral... ; T2` where T1 and // T2 are opposite-polarity SEP/REP toggles (T1=REP T2=SEP, or // vice versa) with the same imm, and the gap contains only // M-mode-neutral instructions (transfers/branches/X-flag-only // index ops). In that case T1+T2 form a no-op pair around code // that doesn't care about M, so both can be dropped. Equivalent // to "moving the SEP/REP wrap inward to skip the neutral region". // // Saves 4 bytes / 12 cycles per gap collapsed. The common // trigger is two STA8 stores separated by an LDY for the second // store's address — STA8fi each emit SEP/STA/REP, the existing // adjacent coalesce can't see across the LDY, this pass can. { // Mode-neutral instruction set: don't touch the M-bit and // don't depend on A's width. X-flag dependent ops (LDX/LDY/ // STX/STY/INX/DEX/INY/DEY/CPX/CPY/PHX/PHY/PLX/PLY) are // independent of M. So are all branches, JMP/JSR/JSL/RTL/RTS, // CLC/SEC/CLI/SEI/CLD/SED/CLV, NOP, and PHP/PLP (they push // 8-bit P regardless of M). 
auto isMNeutral = [](const MachineInstr &MI) -> bool { if (MI.isDebugInstr()) return true; if (MI.isBranch() || MI.isReturn()) return true; unsigned O = MI.getOpcode(); switch (O) { case W65816::LDX_Imm16: case W65816::LDX_DP: case W65816::LDX_Abs: case W65816::LDX_DPY: case W65816::LDX_AbsY: case W65816::LDY_Imm16: case W65816::LDY_DP: case W65816::LDY_Abs: case W65816::LDY_DPX: case W65816::LDY_AbsX: case W65816::STX_DP: case W65816::STX_Abs: case W65816::STX_DPY: case W65816::STY_DP: case W65816::STY_Abs: case W65816::STY_DPX: case W65816::INX: case W65816::DEX: case W65816::INY: case W65816::DEY: case W65816::CPX_Imm16: case W65816::CPX_DP: case W65816::CPX_Abs: case W65816::CPY_Imm16: case W65816::CPY_DP: case W65816::CPY_Abs: case W65816::PHX: case W65816::PHY: case W65816::PLX: case W65816::PLY: case W65816::CLC: case W65816::SEC: case W65816::PHP: case W65816::PLP: case W65816::NOP: return true; default: return false; } }; bool again = true; while (again) { again = false; for (auto It = MBB.begin(); It != MBB.end(); ++It) { unsigned Op1 = It->getOpcode(); if (Op1 != W65816::REP && Op1 != W65816::SEP) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; int Imm1 = It->getOperand(0).getImm(); if (Imm1 != kMBit) continue; // M-bit only // Walk forward across mode-neutral ops looking for the matching // opposite toggle. Bail at calls, asm, ALU ops on A, etc. unsigned WantOp = (Op1 == W65816::REP) ? W65816::SEP : W65816::REP; auto Walker = std::next(It); MachineInstr *Match = nullptr; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } unsigned WO = Walker->getOpcode(); if (WO == WantOp && Walker->getNumOperands() >= 1 && Walker->getOperand(0).isImm() && Walker->getOperand(0).getImm() == Imm1) { Match = &*Walker; break; } // Bail on anything that touches A or otherwise cares about M. 
if (Walker->isCall() || Walker->isInlineAsm()) break; if (!isMNeutral(*Walker)) break; ++Walker; } if (!Match) continue; // Drop both toggles. Erasing changes iterator stability; restart. MachineInstr *T1 = &*It; T1->eraseFromParent(); Match->eraseFromParent(); Changed = true; again = true; break; } } } // Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm) // into INA/DEA chains when the carry flag they would set is unused. // ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it // here BEFORE the AsmPrinter expansion runs. But this pass runs at // pre-emit, AFTER post-RA pseudo expansion. ADCi16imm survives // because its MCInst lowering is in W65816AsmPrinter (not in the // generic post-RA pseudo expander), so it's still in the MIR here. Changed |= foldImmAdcToInaDea(MBB, TII); // PHI-copy hoist. { auto isStaLike = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::STA_StackRel || O == W65816::STZ_DP || O == W65816::STZ_Abs; }; auto isLdaSR = [](const MachineInstr &MI) { return MI.getOpcode() == W65816::LDA_StackRel; }; // Accept LDA_Imm16 (MC) AND LDAi16imm (pseudo) inside the wrap — // both are flag-clobbering A-loads of a 16-bit immediate, with // no stack-rel offset to bump-undo and no memory operand to // alias-check against the gap. Common in init blocks: `lda #0 ; // sta slot,s` wrapped around the loop pre-test. Some functions // still carry the pseudo LDAi16imm at SepRepCleanup time (post-RA // pseudo expansion didn't lower it), so accept both spellings. 
auto isImmLoad = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::LDA_Imm16 || O == W65816::LDAi16imm; }; auto isFlagPreservingMem = [&](const MachineInstr &MI) { return isStaLike(MI) || isLdaSR(MI) || isImmLoad(MI); }; auto isLdaCount = [&](const MachineInstr &MI) { return isLdaSR(MI) || isImmLoad(MI); }; auto It = MBB.begin(); while (It != MBB.end()) { if (It->getOpcode() != W65816::PHP) { ++It; continue; } auto Php = It; // Walk forward: collect LDA/STA pairs, stop at PLP. auto Walker = std::next(Php); SmallVector Block; SmallSet ReadSlots; // post-unbump slots (effective) SmallSet WriteSlots; // post-unbump slots (effective) bool ok = true; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } if (Walker->getOpcode() == W65816::PLP) break; if (!isFlagPreservingMem(*Walker)) { ok = false; break; } // Track stack-rel slots so we can check the gap below. // Immediate loads have no stack-rel addr — skip. // In-wrap LDA_StackRel / STA_StackRel slots are BUMPED by +1 // to compensate for PHP's S-decrement; on hoist out of the // wrap we un-bump them. Record the POST-UNBUMP (effective) // slot here so the gap conflict-check uses the addresses // these ops will actually access in their new position. // Without this, an outside-wrap LDA at slot N would not // conflict with an in-wrap STA at slot N+1 even though the // un-bumped STA writes the SAME memory address as the LDA // reads — corrupting flag-test data flow. (bsearch's i32 // `lo < hi` termination compare under TTI-driven less- // aggressive inlining: hoisting STA 6 -> STA 5 above LDA 5 // re-reads the just-overwritten value.) unsigned WOpc = Walker->getOpcode(); bool isBumpedSR = (WOpc == W65816::LDA_StackRel || WOpc == W65816::STA_StackRel); if (!isImmLoad(*Walker) && Walker->getNumOperands() >= 1 && Walker->getOperand(0).isImm()) { int64_t off = Walker->getOperand(0).getImm(); int64_t effOff = isBumpedSR ? 
off - 1 : off; if (isLdaSR(*Walker)) ReadSlots.insert(effOff); else WriteSlots.insert(effOff); } Block.push_back(&*Walker); ++Walker; } if (!ok || Walker == MBB.end()) { ++It; continue; } auto Plp = Walker; // Trailing flag-preservers after PLP (STA/STZ only). These // already live OUTSIDE the wrap so their slot operand is the // effective (unbumped) value — no -1 adjustment. auto Tail = std::next(Plp); SmallVector Trailing; while (Tail != MBB.end()) { if (Tail->isDebugInstr()) { ++Tail; continue; } if (!isStaLike(*Tail)) break; if (Tail->getNumOperands() >= 1 && Tail->getOperand(0).isImm()) { WriteSlots.insert(Tail->getOperand(0).getImm()); } Trailing.push_back(&*Tail); ++Tail; } // Pair check: the wrap structure is a sequence of LDA-STA // memory-to-memory PHI copies, where the FINAL STA may live // outside the wrap (as Trailing) because STA doesn't clobber // flags. Count LDAs in Block vs total STAs (Block + Trailing). // If they're not equal, some LDA's $a-output is a register- // live-out PHI value (consumed by a back-edge successor's // first STA, e.g. the vararg `sta 0x5, s` pattern). Hoisting // it earlier would lose the value. unsigned NLda = 0, NSta = 0; for (MachineInstr *MI : Block) { if (isLdaCount(*MI)) ++NLda; else if (isStaLike(*MI)) ++NSta; } NSta += Trailing.size(); if (NLda != NSta) { ++It; continue; } // Even with paired LDA-STA, the LAST LDA's $a value can still // be consumed downstream — by a successor's first STA — making // it a fall-through register-PHI. If $a is live-out at MBB // end (any successor has $a as live-in), bail. Caught by // sumTable, where `lda #0` (wrap) feeds A into bb.2's `sta 0x1, // s`, with `sta 0x9, s` (trailing) just happening to also store // the same A — the pair count balances but A is still live-out. 
bool aLiveOut = false; for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(W65816::A)) { aLiveOut = true; break; } } if (aLiveOut) { ++It; continue; } // Walk backward from PHP to find the hoist insertion point. // The hoisted block clobbers $a and $p (LDA writes both). // Skip insts that USE $a (consumer of an earlier $a producer) // or that DEFINE $p (flag-setter — its $p output will be // re-established by the same flag-setter). Stop at a pure A // producer (defines $a, doesn't use $a). // // Also bail if any in-gap inst writes a slot we read or reads // a slot we write (in-gap reads of our writes would observe // a stale value after hoist; in-gap writes to our reads would // produce a different value if hoisted before). auto isStackRelIndYRead = [](unsigned O) { switch (O) { case W65816::LDA_StackRelIndY: case W65816::ADC_StackRelIndY: case W65816::SBC_StackRelIndY: case W65816::CMP_StackRelIndY: case W65816::AND_StackRelIndY: case W65816::ORA_StackRelIndY: case W65816::EOR_StackRelIndY: case W65816::STA_StackRelIndY: return true; } return false; }; auto Back = Php; if (Back == MBB.begin()) { ++It; continue; } --Back; bool gapOK = true; while (true) { while (Back != MBB.begin() && Back->isDebugInstr()) --Back; if (Back->isDebugInstr()) { gapOK = false; break; } // Slot conflict check. unsigned BO = Back->getOpcode(); if ((BO == W65816::STA_StackRel || BO == W65816::STZ_DP || BO == W65816::STZ_Abs) && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { int64_t off = Back->getOperand(0).getImm(); if (ReadSlots.count(off)) { gapOK = false; break; } } if (BO == W65816::LDA_StackRel && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { int64_t off = Back->getOperand(0).getImm(); if (WriteSlots.count(off)) { gapOK = false; break; } } // *_StackRelIndY ops use their slot operand AS A POINTER for // the `(d,S),Y` deref. Hoisting a STA WriteSlot above an // IndY use of that slot changes which value the IndY reads // through. 
Forbid the hoist in that case. Caught by Layer 2 // ptr32 sumByteToZero loop: PHP-wrapped `LDA stack.3, 1; STA // stack.4` was being hoisted across `LDA_StackRelIndY stack.4`, // making the deref use stack.3's NEW value instead of the // LAGGED stack.4 value — off-by-one summing the byte stream. if (isStackRelIndYRead(BO) && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { int64_t off = Back->getOperand(0).getImm(); if (WriteSlots.count(off)) { gapOK = false; break; } } // Bail on call / branch / asm. if (Back->isCall() || Back->isBranch() || Back->isReturn() || Back->isInlineAsm()) { gapOK = false; break; } bool usesA = false; bool defsA = false; for (const MachineOperand &MO : Back->operands()) { if (MO.isReg() && MO.getReg() == W65816::A) { if (MO.isUse()) usesA = true; if (MO.isDef()) defsA = true; } } if (defsA && !usesA) break; // Pure A producer found. if (Back == MBB.begin()) { gapOK = false; break; } --Back; } if (!gapOK) { ++It; continue; } // Hoist: move Block and Trailing to before Back. Undo the // +1 stack-rel bump on Block's in-wrap memory ops; Trailing // stays AS-IS (it was already outside the wrap and never // bumped). for (MachineInstr *MI : Block) { // All ops in Block matched isFlagPreservingMem, so they're // LDA_StackRel/STA_StackRel/STZ_DP/STZ_Abs. LDA_StackRel // and STA_StackRel use operand 0 as the disp; that's the // bumped one. STZ_DP/STZ_Abs aren't stack-rel — no bump. unsigned MOpc = MI->getOpcode(); if (MOpc == W65816::LDA_StackRel || MOpc == W65816::STA_StackRel) { if (MI->getNumOperands() >= 1 && MI->getOperand(0).isImm()) { int64_t v = MI->getOperand(0).getImm(); MI->getOperand(0).setImm(v - 1); } } MI->removeFromParent(); MBB.insert(Back, MI); } for (MachineInstr *MI : Trailing) { MI->removeFromParent(); MBB.insert(Back, MI); } Php->eraseFromParent(); Plp->eraseFromParent(); Changed = true; // Restart iteration from the beginning since we mutated. It = MBB.begin(); } } // Lagged-ptr PHI-copy sink. 
In strLen / strcpy / sumByteToZero // loop bodies, the deref reads slot B (the "lagged" PHI value) // while slot A holds the just-incremented iter. At end of body, // a PHP/PLP-wrapped `LDA slot A ; STA slot B` propagates the new // iter to slot B for next iter. The wrap costs 8 cyc/iter (PHP + // PLP) plus 8 cyc for the LDA/STA pair. // // Equivalent rewrite: at the start of the body, BEFORE the // iter++, A already holds slot A's OLD value (loaded for the // INA). Insert `STA slot B` THERE — it copies OLD iter to slot // B, matching the lagged semantic. Slot B is no longer touched // at end of body, so the PHP/PLP wrap (+ its LDA/PLP/STA tail) // can be erased. Net: -11 cyc/iter on strLen (44 chars → -484 // cyc / -20%). // // Pattern at end of MBB (immediately before terminator): // ANDi #imm ; flag-setter // PHP // LDA_StackRel SrcOff ; reload iter NEW (SrcOff is // PHP-bumped: actually = // IterSlotOff + 1) // PLP // STA_StackRel DstOff ; slot B = iter NEW // Bxx ... ; conditional branch // // Earlier in MBB: // LDA_StackRel IterSlotOff ; A = OLD iter // INA_PSEUDO (or ADCi16imm 1) ; iter++ // STA_StackRel IterSlotOff ; iter = NEW // // Rewrite: insert `STA_StackRel DstOff` right after the LDA // (between LDA and INA). Erase the PHP/LDA/PLP/STA + the // ANDi-after-PHP wrap entirely. The ANDi at the front is kept // since it's also the BNE's flag source. { auto isCondBranch = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::BNE || O == W65816::BEQ || O == W65816::BCC || O == W65816::BCS || O == W65816::BMI || O == W65816::BPL || O == W65816::BVC || O == W65816::BVS; }; auto isFlagSetter = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::ANDi16imm || O == W65816::ANDi8imm || O == W65816::ORAi16imm || O == W65816::EORi16imm; }; // Find Bxx terminator. 
MachineInstr *Bxx = nullptr; for (auto It = MBB.rbegin(); It != MBB.rend(); ++It) { if (isCondBranch(*It)) { Bxx = &*It; break; } if (It->isBranch()) break; // BRA etc. — skip past it } if (!Bxx) goto skip_lagged_sink; { // Walk backward from Bxx to find STA, PLP, LDA, PHP. auto It2 = MachineBasicBlock::iterator(Bxx); if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; // first non-branch if (It2->getOpcode() != W65816::STA_StackRel || !It2->getOperand(0).isImm()) goto skip_lagged_sink; MachineInstr *FinalSta = &*It2; int64_t DstOff = FinalSta->getOperand(0).getImm(); if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (It2->getOpcode() != W65816::PLP) goto skip_lagged_sink; MachineInstr *Plp2 = &*It2; if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (It2->getOpcode() != W65816::LDA_StackRel || !It2->getOperand(0).isImm()) goto skip_lagged_sink; MachineInstr *InnerLda = &*It2; int64_t SrcOff = InnerLda->getOperand(0).getImm(); if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (It2->getOpcode() != W65816::PHP) goto skip_lagged_sink; MachineInstr *Php2 = &*It2; if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (!isFlagSetter(*It2)) goto skip_lagged_sink; // The PHP-bumped SrcOff is the IterSlotOff + 1. int64_t IterSlotOff = SrcOff - 1; // Now find the iter++ sequence earlier in MBB: LDA IterSlotOff; // INA_PSEUDO; STA IterSlotOff. 
MachineInstr *IterLda = nullptr; MachineInstr *IterIna = nullptr; MachineInstr *IterSta = nullptr; for (auto Walk = MBB.begin(); Walk != MachineBasicBlock::iterator(Php2); ++Walk) { if (Walk->getOpcode() != W65816::LDA_StackRel) continue; if (!Walk->getOperand(0).isImm() || Walk->getOperand(0).getImm() != IterSlotOff) continue; auto N1 = std::next(Walk); while (N1 != MBB.end() && N1->isDebugInstr()) ++N1; if (N1 == MBB.end()) continue; if (N1->getOpcode() != W65816::INA_PSEUDO && N1->getOpcode() != W65816::ADCi16imm) continue; auto N2 = std::next(N1); while (N2 != MBB.end() && N2->isDebugInstr()) ++N2; if (N2 == MBB.end()) continue; if (N2->getOpcode() != W65816::STA_StackRel) continue; if (!N2->getOperand(0).isImm() || N2->getOperand(0).getImm() != IterSlotOff) continue; IterLda = &*Walk; IterIna = &*N1; IterSta = &*N2; break; } if (!IterLda) goto skip_lagged_sink; // Safety: make sure DstOff isn't written between IterLda and // the IndY use of DstOff. Walk forward from IterLda looking // for STA DstOff (other than our FinalSta) — if found, bail. for (auto Walk = std::next(MachineBasicBlock::iterator(IterSta)); Walk != MachineBasicBlock::iterator(Php2); ++Walk) { if (Walk->getOpcode() == W65816::STA_StackRel && Walk->getOperand(0).isImm() && Walk->getOperand(0).getImm() == DstOff) { goto skip_lagged_sink; } } // Apply: insert STA_StackRel DstOff right after IterLda, // BEFORE INA. const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); DebugLoc DL = IterLda->getDebugLoc(); BuildMI(MBB, std::next(MachineBasicBlock::iterator(IterLda)), DL, TII->get(W65816::STA_StackRel)) .addImm(DstOff) .addReg(W65816::A, RegState::Implicit); // Erase PHP, InnerLda, PLP, FinalSta. Php2->eraseFromParent(); InnerLda->eraseFromParent(); Plp2->eraseFromParent(); FinalSta->eraseFromParent(); Changed = true; } skip_lagged_sink:; } // i32 += i32 store-bypass. 
Regalloc materializes the call result // (A=lo, X=hi) into Wide32 spill slots before the add, then reads // them back — emitting 4 instructions of redundant store/reload: // // STA_StackRel slotA ; A (mul.lo) -> slotA // TXA ; A = X = mul.hi // STA_StackRel slotB ; mul.hi -> slotB // LDA_StackRel slotA ; reload mul.lo <-- redundant // CLC // ADC_StackRel slotC ; mul.lo + total.lo // STA_StackRel slotA ; sum-lo // LDA_StackRel slotB ; reload mul.hi <-- redundant // ADC_StackRel slotD ; mul.hi + total.hi + C // STA_StackRel slotB ; sum-hi // // Reorder to do the lo-add directly off A and the hi-add directly // off X (via TXA preserving carry): // // CLC // ADC_StackRel slotC ; A = mul.lo + total.lo // STA_StackRel slotA ; sum-lo // TXA ; A = X = mul.hi (C preserved) // ADC_StackRel slotD ; A = mul.hi + total.hi + C // STA_StackRel slotB ; sum-hi // // 10 -> 6 inst. Saves 4 inst / ~13 cyc per i32-add-of-call-result // site. Hits the sumOfSquares loop and any total += __umulhisi3 // pattern. 
{ auto isStaSR = [](MachineInstr &MI, int64_t *off) { if (MI.getOpcode() != W65816::STA_StackRel) return false; if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; if (off) *off = MI.getOperand(0).getImm(); return true; }; auto isLdaSR = [](MachineInstr &MI, int64_t *off) { if (MI.getOpcode() != W65816::LDA_StackRel) return false; if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; if (off) *off = MI.getOperand(0).getImm(); return true; }; auto isAdcSR = [](MachineInstr &MI, int64_t *off) { if (MI.getOpcode() != W65816::ADC_StackRel) return false; if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; if (off) *off = MI.getOperand(0).getImm(); return true; }; auto It = MBB.begin(); while (It != MBB.end()) { auto Cur = It; int64_t slotA = 0, slotB = 0, slotC = 0, slotD = 0; // Step 1: STA_StackRel slotA if (!isStaSR(*Cur, &slotA)) { ++It; continue; } auto P2 = std::next(Cur); while (P2 != MBB.end() && P2->isDebugInstr()) ++P2; if (P2 == MBB.end() || P2->getOpcode() != W65816::TXA) { ++It; continue; } auto P3 = std::next(P2); while (P3 != MBB.end() && P3->isDebugInstr()) ++P3; if (P3 == MBB.end() || !isStaSR(*P3, &slotB)) { ++It; continue; } if (slotA == slotB) { ++It; continue; } auto P4 = std::next(P3); while (P4 != MBB.end() && P4->isDebugInstr()) ++P4; int64_t lreloadA = 0; if (P4 == MBB.end() || !isLdaSR(*P4, &lreloadA) || lreloadA != slotA) { ++It; continue; } auto P5 = std::next(P4); while (P5 != MBB.end() && P5->isDebugInstr()) ++P5; if (P5 == MBB.end() || P5->getOpcode() != W65816::CLC) { ++It; continue; } auto P6 = std::next(P5); while (P6 != MBB.end() && P6->isDebugInstr()) ++P6; if (P6 == MBB.end() || !isAdcSR(*P6, &slotC)) { ++It; continue; } auto P7 = std::next(P6); while (P7 != MBB.end() && P7->isDebugInstr()) ++P7; int64_t outA = 0; if (P7 == MBB.end() || !isStaSR(*P7, &outA) || outA != slotA) { ++It; continue; } auto P8 = std::next(P7); while (P8 != MBB.end() && P8->isDebugInstr()) ++P8; 
int64_t lreloadB = 0; if (P8 == MBB.end() || !isLdaSR(*P8, &lreloadB) || lreloadB != slotB) { ++It; continue; } auto P9 = std::next(P8); while (P9 != MBB.end() && P9->isDebugInstr()) ++P9; if (P9 == MBB.end() || !isAdcSR(*P9, &slotD)) { ++It; continue; } auto P10 = std::next(P9); while (P10 != MBB.end() && P10->isDebugInstr()) ++P10; int64_t outB = 0; if (P10 == MBB.end() || !isStaSR(*P10, &outB) || outB != slotB) { ++It; continue; } // All 10 matched. slotA != slotB already. Also require all // four slots distinct. (slotC/slotD are the total.lo/hi read // addresses; in the canonical case slotC != slotA and slotD != // slotB; without this the rewrite would re-read its own output.) if (slotC == slotA || slotD == slotB || slotC == slotD) { ++It; continue; } // Rewrite: emit CLC ; ADC slotC ; STA slotA ; TXA ; ADC slotD ; // STA slotB before P1, then erase steps 1-10. DebugLoc DL = Cur->getDebugLoc(); BuildMI(MBB, Cur, DL, TII.get(W65816::CLC)); BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel)) .addImm(slotC); BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel)) .addImm(slotA); BuildMI(MBB, Cur, DL, TII.get(W65816::TXA)); BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel)) .addImm(slotD); BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel)) .addImm(slotB); // Advance It past the matched pattern before erasing (so we // don't iterate through deleted insts). It = std::next(P10); // Erase the 10 originals. Cur->eraseFromParent(); P2->eraseFromParent(); P3->eraseFromParent(); P4->eraseFromParent(); P5->eraseFromParent(); P6->eraseFromParent(); P7->eraseFromParent(); P8->eraseFromParent(); P9->eraseFromParent(); P10->eraseFromParent(); Changed = true; } } // Dead TAX / TXA elimination. STAfi declares `Defs = [A]` as a // safe over-approximation (eliminateFrameIndex emits a PHA-bracketed // sequence when the source is IMG-class). Regalloc honors that by // inserting `TAX ; ...STAfi... 
; TXA` brackets around STAfi that // SOURCES from A — but in the A-source path A is preserved. The // TXA's output gets clobbered immediately by the next LDA*, so the // TXA is dead; once TXA is gone, the TAX's X-value has no consumer // and is dead too. This pattern recurs once per i32-spill site. // // Conservative: only elide TXA if the IMMEDIATE next non-debug // instruction defines $a (and doesn't read $a or N/Z first). No // intervening flag-readers between TXA and the A-define is then // guaranteed. Same logic for TYA. // // For TAX: elide if no instruction between TAX and the next $x def // reads $x (and we can prove the original X had no live consumer). // Done as a fixed-point: keep iterating until no change. auto definesReg = [](const MachineInstr &MI, unsigned Reg) -> bool { for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() == Reg && MO.isDef()) return true; } return false; }; auto readsReg = [](const MachineInstr &MI, unsigned Reg) -> bool { for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() == Reg && MO.isUse()) return true; } return false; }; bool again2 = true; while (again2) { again2 = false; // Pass A: dead TXA / TYA for (auto It = MBB.begin(); It != MBB.end(); ) { unsigned O = It->getOpcode(); if (O != W65816::TXA && O != W65816::TYA) { ++It; continue; } auto Next = std::next(It); while (Next != MBB.end() && Next->isDebugInstr()) ++Next; if (Next == MBB.end()) { ++It; continue; } // Next must define $a unconditionally, and must not read $a // (since we're about to discard the TXA-defined A) and must // not be a call / branch / inline asm (which conservatively // read $a). if (Next->isCall() || Next->isBranch() || Next->isReturn() || Next->isInlineAsm()) { ++It; continue; } if (!definesReg(*Next, W65816::A)) { ++It; continue; } if (readsReg(*Next, W65816::A)) { ++It; continue; } // P (flags) liveness: TXA/TYA set N/Z. If Next reads P, we'd // be discarding the flags it expects. 
Bxx and friends read P. // Conservative: also require Next does not read $p. if (readsReg(*Next, W65816::P)) { ++It; continue; } auto Dead = It++; Dead->eraseFromParent(); Changed = true; again2 = true; } // Pass B: dead TAX / TAY for (auto It = MBB.begin(); It != MBB.end(); ) { unsigned O = It->getOpcode(); unsigned Target; if (O == W65816::TAX) Target = W65816::X; else if (O == W65816::TAY) Target = W65816::Y; else { ++It; continue; } // Walk forward. TAX/TAY is dead if every use of Target is // preceded by a redefinition of Target (and the in-MBB region // between has no flag-reader that consumes TAX's N/Z). At MBB // end, check successor live-ins: if none has Target as live-in // it's also dead. // // Flag liveness: TAX defines $p (N/Z). A later $p-reader only // consumes TAX's flags if no intervening instruction REDEFINES // $p in the gap. Track `pRedef` to allow common patterns like // `TAX ; CLC ; ADC ; ...` where ADC reads $p but the $p it // reads is the freshly-CLC'd carry, not TAX's N/Z. auto Walker = std::next(It); bool deadIt = false; bool bailed = false; bool pRedef = false; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } if (Walker->isCall() || Walker->isInlineAsm()) { bailed = true; break; } // Branch / return: stop walking; rely on successor live-ins. if (Walker->isBranch() || Walker->isReturn()) break; if (readsReg(*Walker, Target)) { bailed = true; break; } if (readsReg(*Walker, W65816::P) && !pRedef) { bailed = true; break; } if (definesReg(*Walker, W65816::P)) pRedef = true; if (definesReg(*Walker, Target)) { deadIt = true; break; } ++Walker; } if (bailed) { ++It; continue; } if (!deadIt) { // Fell through to MBB end / branch. Check successor live-ins. bool liveOut = false; for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(Target)) { liveOut = true; break; } } // Return blocks: $a and $x are the i32 return-value convention. 
// RTL doesn't model these as Uses, but they ARE live at the // return. Be conservative — don't elide TAX/TAY before a return. if (!MBB.empty() && MBB.back().isReturn()) liveOut = true; if (liveOut) { ++It; continue; } } auto Dead = It++; Dead->eraseFromParent(); Changed = true; again2 = true; } } // Third peephole: drop `LDY_Imm16 K` when Y already holds K from // an earlier LDY in the same MBB and no intervening MI clobbered // Y. Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY, // even though Y already holds 0 from a previous emit — the // redundant LDYs survive MachineLICM because Y is a phys reg and // the inserter binds them tightly to each use. int yKnown = -1; // -1 means unknown; otherwise the immediate auto It2 = MBB.begin(); while (It2 != MBB.end()) { MachineInstr &MI = *It2; if (MI.isDebugInstr()) { ++It2; continue; } unsigned Op = MI.getOpcode(); if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 && MI.getOperand(0).isImm()) { int K = MI.getOperand(0).getImm() & 0xFFFF; if (yKnown == K) { // Before erasing this redundant LDY: the prior LDY is still in // scope, so all of its Y-uses between the two LDYs are still // valid uses. But liveness already marked the LAST one (just // before the redundant LDY) as `implicit killed $y`, because // that LDY was about to redefine Y. After erasure, Y survives // through to the NEXT use, so the prior "kill" annotation is // wrong and the machine verifier rejects. Walk backward and // clear the kill flag on the most recent Y-using operand. 
for (auto Back = std::prev(It2);; --Back) { bool clearedAny = false; for (MachineOperand &MO : Back->operands()) { if (MO.isReg() && MO.getReg() == W65816::Y && MO.isUse() && MO.isKill()) { MO.setIsKill(false); clearedAny = true; } } if (clearedAny) break; if (Back == MBB.begin()) break; } auto Erase = It2++; Erase->eraseFromParent(); Changed = true; continue; } yKnown = K; } else { // Conservatively invalidate yKnown on anything that touches Y // or on calls / inline asm / any instruction that doesn't have // a clean "no Y effect" guarantee. Cheaper to underclaim than // miscompile. switch (Op) { case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown case W65816::STAfi_indY: case W65816::LDA_StackRelIndY: case W65816::STA_StackRelIndY: break; case W65816::TAY: case W65816::TXY: case W65816::INY: case W65816::DEY: case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs: case W65816::LDY_DPX: case W65816::LDY_AbsX: yKnown = -1; break; default: if (MI.isCall()) yKnown = -1; break; } } ++It2; } } // Three prototype peepholes were tried here and removed once shown // to regress benchmarks; design notes in // feedback_close_gap_attempts_round2.md / feedback_cmp_zero_elim.md: // - PHI store-forwarding (CRC32 regression / memmove safety hole). // - Redundant CMP #0 elimination (VLA sum_n carry-flag bookkeeping). // - Narrow PHI-copy slot collapse (qsort regression). return Changed; }