//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice // versa) pairs that toggle the M-bit redundantly. // // The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits // `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When // two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between // them), the post-PEI stream contains: // // SEP #$20 // STA d1, S // REP #$20 <-- toggle // SEP #$20 <-- toggle (cancels above) // STA d2, S // REP #$20 // // The middle REP/SEP pair is a no-op: both stores can run in one M=1 // region. We drop them to leave: // // SEP #$20 // STA d1, S // STA d2, S // REP #$20 // // Saves 2 bytes / 6 cycles per coalesced pair. Symmetric `SEP/REP` // pairs (M=1 then M=0 with nothing in between) are also dropped — they // can arise around inline-asm or hand-written assembly snippets. // // Runs at addPreEmitPass (after PEI has expanded STA8fi). 
// //===----------------------------------------------------------------------===// #include "W65816.h" #include "W65816InstrInfo.h" #include "W65816Subtarget.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/raw_ostream.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" using namespace llvm; #define DEBUG_TYPE "w65816-sep-rep-cleanup" namespace { class W65816SepRepCleanup : public MachineFunctionPass { public: static char ID; W65816SepRepCleanup() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "W65816 SEP/REP toggle coalescing"; } bool runOnMachineFunction(MachineFunction &MF) override; }; } // namespace char W65816SepRepCleanup::ID = 0; INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE, "W65816 SEP/REP toggle coalescing", false, false) FunctionPass *llvm::createW65816SepRepCleanup() { return new W65816SepRepCleanup(); } // Returns the immediate value of `op` if MI is a `SEP #imm` or `REP #imm`, // else -1. static int getSepRepImm(const MachineInstr &MI, unsigned Opc) { if (MI.getOpcode() != Opc) return -1; if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return -1; return MI.getOperand(0).getImm(); } // Returns true if MI may consume the carry or overflow flag — these // are the flags that ADC/SBC define but INA/DEA don't. Conservative: // any branch that reads C or V counts, plus the chained ADC/SBC ops // that wait for a prior carry-out. Anything else (CMP, CLC, SEC, // LDA, STA, AND, ORA, EOR, etc.) re-defines or doesn't read C/V. 
static bool readsCarryOrV(const MachineInstr &MI) {
  // The list is deliberately conservative: a false "reads C/V" only blocks
  // the INA/DEA fold, while a missed reader lets the fold drop a carry that
  // is still needed.
  switch (MI.getOpcode()) {
  // Conditional branches that test C or V.
  case W65816::BCS: // reads C
  case W65816::BCC: // reads C
  case W65816::BVS: // reads V
  case W65816::BVC: // reads V
  // MC-level ADC/SBC: every addressing mode takes C as carry-in.  The
  // (d,S),Y forms are included for the same reason as the others.
  case W65816::ADC_StackRel:
  case W65816::ADC_StackRelIndY:
  case W65816::ADC_Imm16:
  case W65816::ADC_Imm8:
  case W65816::ADC_DP:
  case W65816::ADC_Abs:
  case W65816::SBC_StackRel:
  case W65816::SBC_StackRelIndY:
  case W65816::SBC_Imm16:
  case W65816::SBC_Imm8:
  case W65816::SBC_DP:
  case W65816::SBC_Abs:
  // Chained-carry pseudos.  These run BEFORE AsmPrinter expansion so
  // we must whitelist them explicitly — they're the hi-half of any
  // multi-precision add/sub and read the lo-half's carry-out.  Without
  // these, the INA/DEA peephole silently rewrites a lo-half
  // `ADCi16imm src, 2` to `INA; INA` (which DOES NOT set C), breaking
  // the i32 ADD carry chain.  Caught as `arr[0] = arr[1]` writing to
  // wrong bank under ptr32 because the high half got a stale C.
  case W65816::ADCEi16imm:
  case W65816::SBCEi16imm:
  // The fi/abs/imm forms of ADC/SBC are also pre-AsmPrinter pseudos;
  // each expands to a real ADC_/SBC_ opcode that reads carry.
  case W65816::ADCi16imm: // lo-half (CLC + ADC_Imm16)
  case W65816::SBCi16imm: // lo-half (SEC + SBC_Imm16)
  case W65816::ADCfi:     // chained-carry stack form
  case W65816::SBCfi:
  case W65816::ADCEfi:
  case W65816::SBCEfi:
  case W65816::ADCabs:
  case W65816::SBCabs:
  // Rotates shift C into the result.
  case W65816::ROL_A:
  case W65816::ROR_A:
  case W65816::ROL_DP:
  case W65816::ROL_Abs:
  case W65816::ROR_DP:
  case W65816::ROR_Abs:
    return true;
  default:
    return false;
  }
}

// Returns true if `Op` is one of the flag-redefining opcodes (CLC, SEC,
// CMP*, CPX*, CPY*, REP, SEP) — observing C/V before this is safe.
// Includes the pseudo CMP* variants (CMPi16imm etc.) since this peephole
// runs at pre-emit, BEFORE the AsmPrinter expands them.
static bool isFlagRedefiner(unsigned Op) { switch (Op) { case W65816::CLC: case W65816::SEC: case W65816::CMP_Imm8: case W65816::CMP_Imm16: case W65816::CMP_StackRel: case W65816::CMP_DP: case W65816::CMP_Abs: case W65816::CMPi16imm: case W65816::CMPi8imm: case W65816::CMPfi: case W65816::CMPabs: case W65816::CMP_RR: case W65816::CPX_Imm8: case W65816::CPX_Imm16: case W65816::CPX_DP: case W65816::CPX_Abs: case W65816::CPY_Imm8: case W65816::CPY_Imm16: case W65816::CPY_DP: case W65816::CPY_Abs: case W65816::REP: case W65816::SEP: return true; default: return false; } } // Returns true if a subsequent MI in the same MBB observes the C/V // flags before any flag-redefiner clears the dependency. At MBB end, // extends one step into each successor: if any successor's first // (non-debug) MI reads C/V before redefining them, the flag is live // across the edge — bail. This is critical for loop bodies where // the back-edge re-enters the same MBB at LDA/PHA (neither reads C/V), // so a per-iteration `clc; adc #2` is foldable. Cross-MBB carry chains // would normally use ADCEi16imm (not ADCi16imm), so this is safe. static bool carryFlagLiveAfter(MachineBasicBlock::iterator After, MachineBasicBlock &MBB) { // Phase 1: scan within this MBB. for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) { if (Probe->isDebugInstr()) continue; if (readsCarryOrV(*Probe)) return true; if (isFlagRedefiner(Probe->getOpcode())) return false; if (Probe->isCall()) return false; // callee resets flags } // Phase 2: peek into each successor's first few MIs. We BAIL only on // a positive C/V read; reaching MBB end or peek-cap without finding // one is treated as "carry dead" — ADCi16imm's carry-out is never // used in carry chains (those use ADCEi16imm), so a stray carry // floating into RTL or an unrelated arithmetic op causes no harm. 
const unsigned MaxPeek = 6; for (MachineBasicBlock *Succ : MBB.successors()) { unsigned Peeked = 0; for (auto &MI : *Succ) { if (MI.isDebugInstr()) continue; if (readsCarryOrV(MI)) return true; if (isFlagRedefiner(MI.getOpcode()) || MI.isCall()) break; if (++Peeked >= MaxPeek) break; } } return false; } // Convert `ADCi16imm dst, src, ±1`/`±2` and `SBCi16imm` similarly to // INA / INA;INA / DEA / DEA;DEA chains when C/V are dead. ADCi16imm // is a pseudo that expands to CLC+ADC_Imm16 (4B/5cyc). INA is 1B/2cyc. // Savings per ±1: 3B/3cyc; per ±2: 2B/1cyc. SBCi16imm is symmetric // (sub by N == add by -N), so SBC #1 → DEA, SBC #-1 → INA, etc. static bool foldImmAdcToInaDea(MachineBasicBlock &MBB, const W65816InstrInfo &TII) { bool Changed = false; auto It = MBB.begin(); while (It != MBB.end()) { unsigned Op = It->getOpcode(); bool isAdc = (Op == W65816::ADCi16imm); bool isSbc = (Op == W65816::SBCi16imm); if ((!isAdc && !isSbc) || It->getNumOperands() < 3 || !It->getOperand(2).isImm()) { ++It; continue; } int64_t Imm = (int16_t)It->getOperand(2).getImm(); // For SBC, negate: SBC by +N is "subtract N", same as ADC by -N. int64_t Effective = isSbc ? -Imm : Imm; if (Effective < -2 || Effective > 2 || Effective == 0) { ++It; continue; } if (carryFlagLiveAfter(It, MBB)) { ++It; continue; } DebugLoc DL = It->getDebugLoc(); unsigned NewOpc = (Effective > 0) ? W65816::INA : W65816::DEA; unsigned Count = (Effective > 0) ? Effective : -Effective; for (unsigned i = 0; i < Count; ++i) BuildMI(MBB, It, DL, TII.get(NewOpc)); auto NextIt = std::next(It); It->eraseFromParent(); It = NextIt; Changed = true; } return Changed; } bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; const auto &STI = MF.getSubtarget(); const auto &TII = *STI.getInstrInfo(); for (MachineBasicBlock &MBB : MF) { // Pre-pass: hoist LDAi8imm out of byte-store SEP/REP wraps. 
// The post-RA scheduler can move LDAi8imm (which is marked // hasSideEffects=0 at MIR but expands at AsmPrinter to its OWN // SEP+LDA8+REP that toggles M) INSIDE an STBptr inserter's // SEP/REP wrap. When that happens, the LDAi8imm's expansion // REP fires BEFORE the byte STA, leaving the STA in M=16 — the // store becomes a 16-bit zero write, corrupting the byte AFTER // the intended target. Detect the pattern and hoist the // LDAi8imm above the outer SEP. #107 strtok_r BB0_15 was this // exact bug. { SmallVector SepHoists; for (auto It = MBB.begin(); It != MBB.end(); ++It) { if (It->getOpcode() != W65816::SEP) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; if (It->getOperand(0).getImm() != 0x20) continue; // Walk forward looking for LDAi8imm before any STAfi_indY // or REP at this nesting level. auto Walker = std::next(It); MachineInstr *LdaToHoist = nullptr; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } unsigned Opc = Walker->getOpcode(); // Hit a REP — wrap is closing without LDAi8imm inside. if (Opc == W65816::REP) break; // Hit a call / branch / asm — bail. if (Walker->isCall() || Walker->isBranch() || Walker->isReturn() || Walker->isInlineAsm()) break; // Hit an STAfi_indY — this is the byte store; an LDAi8imm // before it would be the bug, but if we found one already // we'd have hoisted it; nothing to do here, stop scanning. if (Opc == W65816::STAfi_indY) break; if (Opc == W65816::LDAi8imm) { LdaToHoist = &*Walker; break; } ++Walker; } if (LdaToHoist) SepHoists.push_back(LdaToHoist); } for (MachineInstr *Lda : SepHoists) { // Find the SEP we entered before the LDA. Walk backward. 
auto Back = Lda->getIterator(); MachineInstr *OuterSep = nullptr; while (Back != MBB.begin()) { --Back; if (Back->isDebugInstr()) continue; if (Back->getOpcode() == W65816::SEP && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm() && Back->getOperand(0).getImm() == 0x20) { OuterSep = &*Back; break; } if (Back->isCall() || Back->isBranch() || Back->isInlineAsm()) break; } if (!OuterSep) continue; Lda->removeFromParent(); MBB.insert(OuterSep->getIterator(), Lda); Changed = true; } } SmallVector Toggles; for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); if (Opc == W65816::REP || Opc == W65816::SEP) Toggles.push_back(&MI); } SmallPtrSet Erased; for (MachineInstr *First : Toggles) { if (Erased.count(First)) continue; // The next non-debug instruction must be the matching opposite // toggle with the same imm. auto It = std::next(First->getIterator()); while (It != MBB.end() && It->isDebugInstr()) ++It; if (It == MBB.end()) continue; MachineInstr &Next = *It; // Look for REP-then-SEP or SEP-then-REP with matching imm. unsigned FirstOpc = First->getOpcode(); unsigned WantOpc = (FirstOpc == W65816::REP) ? W65816::SEP : W65816::REP; int FirstImm = getSepRepImm(*First, FirstOpc); int NextImm = getSepRepImm(Next, WantOpc); if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue; Erased.insert(First); Erased.insert(&Next); First->eraseFromParent(); Next.eraseFromParent(); Changed = true; } // Extended toggle coalesce — REP/SEP scheduling. // // Walk the MBB looking for `T1 ; ...neutral... ; T2` where T1 and // T2 are opposite-polarity SEP/REP toggles (T1=REP T2=SEP, or // vice versa) with the same imm, and the gap contains only // M-mode-neutral instructions (transfers/branches/X-flag-only // index ops). In that case T1+T2 form a no-op pair around code // that doesn't care about M, so both can be dropped. Equivalent // to "moving the SEP/REP wrap inward to skip the neutral region". // // Saves 4 bytes / 12 cycles per gap collapsed. 
The common // trigger is two STA8 stores separated by an LDY for the second // store's address — STA8fi each emit SEP/STA/REP, the existing // adjacent coalesce can't see across the LDY, this pass can. { // Mode-neutral instruction set: don't touch the M-bit and // don't depend on A's width. X-flag dependent ops (LDX/LDY/ // STX/STY/INX/DEX/INY/DEY/CPX/CPY/PHX/PHY/PLX/PLY) are // independent of M. So are all branches, JMP/JSR/JSL/RTL/RTS, // CLC/SEC/CLI/SEI/CLD/SED/CLV, NOP, and PHP/PLP (they push // 8-bit P regardless of M). auto isMNeutral = [](const MachineInstr &MI) -> bool { if (MI.isDebugInstr()) return true; if (MI.isBranch() || MI.isReturn()) return true; unsigned O = MI.getOpcode(); switch (O) { case W65816::LDX_Imm16: case W65816::LDX_DP: case W65816::LDX_Abs: case W65816::LDX_DPY: case W65816::LDX_AbsY: case W65816::LDY_Imm16: case W65816::LDY_DP: case W65816::LDY_Abs: case W65816::LDY_DPX: case W65816::LDY_AbsX: case W65816::STX_DP: case W65816::STX_Abs: case W65816::STX_DPY: case W65816::STY_DP: case W65816::STY_Abs: case W65816::STY_DPX: case W65816::INX: case W65816::DEX: case W65816::INY: case W65816::DEY: case W65816::CPX_Imm16: case W65816::CPX_DP: case W65816::CPX_Abs: case W65816::CPY_Imm16: case W65816::CPY_DP: case W65816::CPY_Abs: case W65816::PHX: case W65816::PHY: case W65816::PLX: case W65816::PLY: case W65816::CLC: case W65816::SEC: case W65816::PHP: case W65816::PLP: case W65816::NOP: return true; default: return false; } }; bool again = true; while (again) { again = false; for (auto It = MBB.begin(); It != MBB.end(); ++It) { unsigned Op1 = It->getOpcode(); if (Op1 != W65816::REP && Op1 != W65816::SEP) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; int Imm1 = It->getOperand(0).getImm(); if (Imm1 != 0x20) continue; // M-bit only // Walk forward across mode-neutral ops looking for the matching // opposite toggle. Bail at calls, asm, ALU ops on A, etc. unsigned WantOp = (Op1 == W65816::REP) ? 
W65816::SEP : W65816::REP; auto Walker = std::next(It); MachineInstr *Match = nullptr; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } unsigned WO = Walker->getOpcode(); if (WO == WantOp && Walker->getNumOperands() >= 1 && Walker->getOperand(0).isImm() && Walker->getOperand(0).getImm() == Imm1) { Match = &*Walker; break; } // Bail on anything that touches A or otherwise cares about M. if (Walker->isCall() || Walker->isInlineAsm()) break; if (!isMNeutral(*Walker)) break; ++Walker; } if (!Match) continue; // Drop both toggles. Erasing changes iterator stability; restart. MachineInstr *T1 = &*It; T1->eraseFromParent(); Match->eraseFromParent(); Changed = true; again = true; break; } } } // Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm) // into INA/DEA chains when the carry flag they would set is unused. // ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it // here BEFORE the AsmPrinter expansion runs. But this pass runs at // pre-emit, AFTER post-RA pseudo expansion. ADCi16imm survives // because its MCInst lowering is in W65816AsmPrinter (not in the // generic post-RA pseudo expander), so it's still in the MIR here. Changed |= foldImmAdcToInaDea(MBB, TII); // PHI-copy hoist. { auto isStaLike = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::STA_StackRel || O == W65816::STZ_DP || O == W65816::STZ_Abs; }; auto isLdaSR = [](const MachineInstr &MI) { return MI.getOpcode() == W65816::LDA_StackRel; }; // Accept LDA_Imm16 (MC) AND LDAi16imm (pseudo) inside the wrap — // both are flag-clobbering A-loads of a 16-bit immediate, with // no stack-rel offset to bump-undo and no memory operand to // alias-check against the gap. Common in init blocks: `lda #0 ; // sta slot,s` wrapped around the loop pre-test. Some functions // still carry the pseudo LDAi16imm at SepRepCleanup time (post-RA // pseudo expansion didn't lower it), so accept both spellings. 
auto isImmLoad = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::LDA_Imm16 || O == W65816::LDAi16imm; }; auto isFlagPreservingMem = [&](const MachineInstr &MI) { return isStaLike(MI) || isLdaSR(MI) || isImmLoad(MI); }; auto isLdaCount = [&](const MachineInstr &MI) { return isLdaSR(MI) || isImmLoad(MI); }; auto It = MBB.begin(); while (It != MBB.end()) { if (It->getOpcode() != W65816::PHP) { ++It; continue; } auto Php = It; // Walk forward: collect LDA/STA pairs, stop at PLP. auto Walker = std::next(Php); SmallVector Block; SmallSet ReadSlots; SmallSet WriteSlots; bool ok = true; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } if (Walker->getOpcode() == W65816::PLP) break; if (!isFlagPreservingMem(*Walker)) { ok = false; break; } // Track stack-rel slots so we can check the gap below. // Immediate loads have no stack-rel addr — skip. if (!isImmLoad(*Walker) && Walker->getNumOperands() >= 1 && Walker->getOperand(0).isImm()) { int64_t off = Walker->getOperand(0).getImm(); if (isLdaSR(*Walker)) ReadSlots.insert(off); else WriteSlots.insert(off); } Block.push_back(&*Walker); ++Walker; } if (!ok || Walker == MBB.end()) { ++It; continue; } auto Plp = Walker; // Trailing flag-preservers after PLP (STA/STZ only). auto Tail = std::next(Plp); SmallVector Trailing; while (Tail != MBB.end()) { if (Tail->isDebugInstr()) { ++Tail; continue; } if (!isStaLike(*Tail)) break; if (Tail->getNumOperands() >= 1 && Tail->getOperand(0).isImm()) { WriteSlots.insert(Tail->getOperand(0).getImm()); } Trailing.push_back(&*Tail); ++Tail; } // Pair check: the wrap structure is a sequence of LDA-STA // memory-to-memory PHI copies, where the FINAL STA may live // outside the wrap (as Trailing) because STA doesn't clobber // flags. Count LDAs in Block vs total STAs (Block + Trailing). // If they're not equal, some LDA's $a-output is a register- // live-out PHI value (consumed by a back-edge successor's // first STA, e.g. 
the vararg `sta 0x5, s` pattern). Hoisting // it earlier would lose the value. unsigned NLda = 0, NSta = 0; for (MachineInstr *MI : Block) { if (isLdaCount(*MI)) ++NLda; else if (isStaLike(*MI)) ++NSta; } NSta += Trailing.size(); if (NLda != NSta) { ++It; continue; } // Even with paired LDA-STA, the LAST LDA's $a value can still // be consumed downstream — by a successor's first STA — making // it a fall-through register-PHI. If $a is live-out at MBB // end (any successor has $a as live-in), bail. Caught by // sumTable, where `lda #0` (wrap) feeds A into bb.2's `sta 0x1, // s`, with `sta 0x9, s` (trailing) just happening to also store // the same A — the pair count balances but A is still live-out. bool aLiveOut = false; for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(W65816::A)) { aLiveOut = true; break; } } if (aLiveOut) { ++It; continue; } // Walk backward from PHP to find the hoist insertion point. // The hoisted block clobbers $a and $p (LDA writes both). // Skip insts that USE $a (consumer of an earlier $a producer) // or that DEFINE $p (flag-setter — its $p output will be // re-established by the same flag-setter). Stop at a pure A // producer (defines $a, doesn't use $a). // // Also bail if any in-gap inst writes a slot we read or reads // a slot we write (in-gap reads of our writes would observe // a stale value after hoist; in-gap writes to our reads would // produce a different value if hoisted before). 
auto isStackRelIndYRead = [](unsigned O) { switch (O) { case W65816::LDA_StackRelIndY: case W65816::ADC_StackRelIndY: case W65816::SBC_StackRelIndY: case W65816::CMP_StackRelIndY: case W65816::AND_StackRelIndY: case W65816::ORA_StackRelIndY: case W65816::EOR_StackRelIndY: case W65816::STA_StackRelIndY: return true; } return false; }; auto Back = Php; if (Back == MBB.begin()) { ++It; continue; } --Back; bool gapOK = true; while (true) { while (Back != MBB.begin() && Back->isDebugInstr()) --Back; if (Back->isDebugInstr()) { gapOK = false; break; } // Slot conflict check. unsigned BO = Back->getOpcode(); if ((BO == W65816::STA_StackRel || BO == W65816::STZ_DP || BO == W65816::STZ_Abs) && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { int64_t off = Back->getOperand(0).getImm(); if (ReadSlots.count(off)) { gapOK = false; break; } } if (BO == W65816::LDA_StackRel && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { int64_t off = Back->getOperand(0).getImm(); if (WriteSlots.count(off)) { gapOK = false; break; } } // *_StackRelIndY ops use their slot operand AS A POINTER for // the `(d,S),Y` deref. Hoisting a STA WriteSlot above an // IndY use of that slot changes which value the IndY reads // through. Forbid the hoist in that case. Caught by Layer 2 // ptr32 sumByteToZero loop: PHP-wrapped `LDA stack.3, 1; STA // stack.4` was being hoisted across `LDA_StackRelIndY stack.4`, // making the deref use stack.3's NEW value instead of the // LAGGED stack.4 value — off-by-one summing the byte stream. if (isStackRelIndYRead(BO) && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { int64_t off = Back->getOperand(0).getImm(); if (WriteSlots.count(off)) { gapOK = false; break; } } // Bail on call / branch / asm. 
if (Back->isCall() || Back->isBranch() || Back->isReturn() || Back->isInlineAsm()) { gapOK = false; break; } bool usesA = false; bool defsA = false; for (const MachineOperand &MO : Back->operands()) { if (MO.isReg() && MO.getReg() == W65816::A) { if (MO.isUse()) usesA = true; if (MO.isDef()) defsA = true; } } if (defsA && !usesA) break; // Pure A producer found. if (Back == MBB.begin()) { gapOK = false; break; } --Back; } if (!gapOK) { ++It; continue; } // Hoist: move Block and Trailing to before Back. Undo the // +1 stack-rel bump on Block's in-wrap memory ops; Trailing // stays AS-IS (it was already outside the wrap and never // bumped). for (MachineInstr *MI : Block) { // All ops in Block matched isFlagPreservingMem, so they're // LDA_StackRel/STA_StackRel/STZ_DP/STZ_Abs. LDA_StackRel // and STA_StackRel use operand 0 as the disp; that's the // bumped one. STZ_DP/STZ_Abs aren't stack-rel — no bump. unsigned MOpc = MI->getOpcode(); if (MOpc == W65816::LDA_StackRel || MOpc == W65816::STA_StackRel) { if (MI->getNumOperands() >= 1 && MI->getOperand(0).isImm()) { int64_t v = MI->getOperand(0).getImm(); MI->getOperand(0).setImm(v - 1); } } MI->removeFromParent(); MBB.insert(Back, MI); } for (MachineInstr *MI : Trailing) { MI->removeFromParent(); MBB.insert(Back, MI); } Php->eraseFromParent(); Plp->eraseFromParent(); Changed = true; // Restart iteration from the beginning since we mutated. It = MBB.begin(); } } // Lagged-ptr PHI-copy sink. In strLen / strcpy / sumByteToZero // loop bodies, the deref reads slot B (the "lagged" PHI value) // while slot A holds the just-incremented iter. At end of body, // a PHP/PLP-wrapped `LDA slot A ; STA slot B` propagates the new // iter to slot B for next iter. The wrap costs 8 cyc/iter (PHP + // PLP) plus 8 cyc for the LDA/STA pair. // // Equivalent rewrite: at the start of the body, BEFORE the // iter++, A already holds slot A's OLD value (loaded for the // INA). 
Insert `STA slot B` THERE — it copies OLD iter to slot // B, matching the lagged semantic. Slot B is no longer touched // at end of body, so the PHP/PLP wrap (+ its LDA/PLP/STA tail) // can be erased. Net: -11 cyc/iter on strLen (44 chars → -484 // cyc / -20%). // // Pattern at end of MBB (immediately before terminator): // ANDi #imm ; flag-setter // PHP // LDA_StackRel SrcOff ; reload iter NEW (SrcOff is // PHP-bumped: actually = // IterSlotOff + 1) // PLP // STA_StackRel DstOff ; slot B = iter NEW // Bxx ... ; conditional branch // // Earlier in MBB: // LDA_StackRel IterSlotOff ; A = OLD iter // INA_PSEUDO (or ADCi16imm 1) ; iter++ // STA_StackRel IterSlotOff ; iter = NEW // // Rewrite: insert `STA_StackRel DstOff` right after the LDA // (between LDA and INA). Erase the PHP/LDA/PLP/STA + the // ANDi-after-PHP wrap entirely. The ANDi at the front is kept // since it's also the BNE's flag source. { auto isCondBranch = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::BNE || O == W65816::BEQ || O == W65816::BCC || O == W65816::BCS || O == W65816::BMI || O == W65816::BPL || O == W65816::BVC || O == W65816::BVS; }; auto isFlagSetter = [](const MachineInstr &MI) { unsigned O = MI.getOpcode(); return O == W65816::ANDi16imm || O == W65816::ANDi8imm || O == W65816::ORAi16imm || O == W65816::EORi16imm; }; // Find Bxx terminator. MachineInstr *Bxx = nullptr; for (auto It = MBB.rbegin(); It != MBB.rend(); ++It) { if (isCondBranch(*It)) { Bxx = &*It; break; } if (It->isBranch()) break; // BRA etc. — skip past it } if (!Bxx) goto skip_lagged_sink; { // Walk backward from Bxx to find STA, PLP, LDA, PHP. 
auto It2 = MachineBasicBlock::iterator(Bxx); if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; // first non-branch if (It2->getOpcode() != W65816::STA_StackRel || !It2->getOperand(0).isImm()) goto skip_lagged_sink; MachineInstr *FinalSta = &*It2; int64_t DstOff = FinalSta->getOperand(0).getImm(); if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (It2->getOpcode() != W65816::PLP) goto skip_lagged_sink; MachineInstr *Plp2 = &*It2; if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (It2->getOpcode() != W65816::LDA_StackRel || !It2->getOperand(0).isImm()) goto skip_lagged_sink; MachineInstr *InnerLda = &*It2; int64_t SrcOff = InnerLda->getOperand(0).getImm(); if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (It2->getOpcode() != W65816::PHP) goto skip_lagged_sink; MachineInstr *Php2 = &*It2; if (It2 == MBB.begin()) goto skip_lagged_sink; --It2; if (!isFlagSetter(*It2)) goto skip_lagged_sink; // The PHP-bumped SrcOff is the IterSlotOff + 1. int64_t IterSlotOff = SrcOff - 1; // Now find the iter++ sequence earlier in MBB: LDA IterSlotOff; // INA_PSEUDO; STA IterSlotOff. 
MachineInstr *IterLda = nullptr; MachineInstr *IterIna = nullptr; MachineInstr *IterSta = nullptr; for (auto Walk = MBB.begin(); Walk != MachineBasicBlock::iterator(Php2); ++Walk) { if (Walk->getOpcode() != W65816::LDA_StackRel) continue; if (!Walk->getOperand(0).isImm() || Walk->getOperand(0).getImm() != IterSlotOff) continue; auto N1 = std::next(Walk); while (N1 != MBB.end() && N1->isDebugInstr()) ++N1; if (N1 == MBB.end()) continue; if (N1->getOpcode() != W65816::INA_PSEUDO && N1->getOpcode() != W65816::ADCi16imm) continue; auto N2 = std::next(N1); while (N2 != MBB.end() && N2->isDebugInstr()) ++N2; if (N2 == MBB.end()) continue; if (N2->getOpcode() != W65816::STA_StackRel) continue; if (!N2->getOperand(0).isImm() || N2->getOperand(0).getImm() != IterSlotOff) continue; IterLda = &*Walk; IterIna = &*N1; IterSta = &*N2; break; } if (!IterLda) goto skip_lagged_sink; // Safety: make sure DstOff isn't written between IterLda and // the IndY use of DstOff. Walk forward from IterLda looking // for STA DstOff (other than our FinalSta) — if found, bail. for (auto Walk = std::next(MachineBasicBlock::iterator(IterSta)); Walk != MachineBasicBlock::iterator(Php2); ++Walk) { if (Walk->getOpcode() == W65816::STA_StackRel && Walk->getOperand(0).isImm() && Walk->getOperand(0).getImm() == DstOff) { goto skip_lagged_sink; } } // Apply: insert STA_StackRel DstOff right after IterLda, // BEFORE INA. const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); DebugLoc DL = IterLda->getDebugLoc(); BuildMI(MBB, std::next(MachineBasicBlock::iterator(IterLda)), DL, TII->get(W65816::STA_StackRel)) .addImm(DstOff) .addReg(W65816::A, RegState::Implicit); // Erase PHP, InnerLda, PLP, FinalSta. Php2->eraseFromParent(); InnerLda->eraseFromParent(); Plp2->eraseFromParent(); FinalSta->eraseFromParent(); Changed = true; } skip_lagged_sink:; } // i32 += i32 store-bypass. 
Regalloc materializes the call result // (A=lo, X=hi) into Wide32 spill slots before the add, then reads // them back — emitting 4 instructions of redundant store/reload: // // STA_StackRel slotA ; A (mul.lo) -> slotA // TXA ; A = X = mul.hi // STA_StackRel slotB ; mul.hi -> slotB // LDA_StackRel slotA ; reload mul.lo <-- redundant // CLC // ADC_StackRel slotC ; mul.lo + total.lo // STA_StackRel slotA ; sum-lo // LDA_StackRel slotB ; reload mul.hi <-- redundant // ADC_StackRel slotD ; mul.hi + total.hi + C // STA_StackRel slotB ; sum-hi // // Reorder to do the lo-add directly off A and the hi-add directly // off X (via TXA preserving carry): // // CLC // ADC_StackRel slotC ; A = mul.lo + total.lo // STA_StackRel slotA ; sum-lo // TXA ; A = X = mul.hi (C preserved) // ADC_StackRel slotD ; A = mul.hi + total.hi + C // STA_StackRel slotB ; sum-hi // // 10 -> 6 inst. Saves 4 inst / ~13 cyc per i32-add-of-call-result // site. Hits the sumOfSquares loop and any total += __umulhisi3 // pattern. 
{
  // Matches MI against opcode Opc carrying an immediate stack-relative
  // displacement in operand 0.  On success stores the displacement in Off.
  auto matchStackRel = [](const MachineInstr &MI, unsigned Opc,
                          int64_t &Off) {
    if (MI.getOpcode() != Opc)
      return false;
    if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm())
      return false;
    Off = MI.getOperand(0).getImm();
    return true;
  };

  // Returns the next non-debug instruction after I, or MBB.end().
  auto nextReal = [&MBB](MachineBasicBlock::iterator I) {
    do {
      ++I;
    } while (I != MBB.end() && I->isDebugInstr());
    return I;
  };

  auto It = MBB.begin();
  while (It != MBB.end()) {
    // P[0..9] are the ten instructions of the pattern, in order:
    //   STA a ; TXA ; STA b ; LDA a ; CLC ; ADC c ; STA a ; LDA b ; ADC d ;
    //   STA b
    MachineBasicBlock::iterator P[10];
    P[0] = It;
    int64_t SlotA = 0, SlotB = 0, SlotC = 0, SlotD = 0;
    int64_t Off = 0; // scratch for the reload / write-back offsets

    // Advances to pattern position Idx; false when the block ends first.
    auto step = [&](unsigned Idx) {
      P[Idx] = nextReal(P[Idx - 1]);
      return P[Idx] != MBB.end();
    };

    const bool Matched =
        matchStackRel(*P[0], W65816::STA_StackRel, SlotA) &&
        step(1) && P[1]->getOpcode() == W65816::TXA &&
        step(2) && matchStackRel(*P[2], W65816::STA_StackRel, SlotB) &&
        SlotA != SlotB &&
        step(3) && matchStackRel(*P[3], W65816::LDA_StackRel, Off) &&
        Off == SlotA &&
        step(4) && P[4]->getOpcode() == W65816::CLC &&
        step(5) && matchStackRel(*P[5], W65816::ADC_StackRel, SlotC) &&
        step(6) && matchStackRel(*P[6], W65816::STA_StackRel, Off) &&
        Off == SlotA &&
        step(7) && matchStackRel(*P[7], W65816::LDA_StackRel, Off) &&
        Off == SlotB &&
        step(8) && matchStackRel(*P[8], W65816::ADC_StackRel, SlotD) &&
        step(9) && matchStackRel(*P[9], W65816::STA_StackRel, Off) &&
        Off == SlotB;

    // All four slots must be pairwise distinct (SlotA != SlotB is already
    // part of the match).  The rewrite drops the initial `STA a` / `STA b`
    // spills, so an ADC operand aliasing either spill slot would read a
    // different value than the original sequence did.  In particular
    // SlotC == SlotB reads mul.hi in the original (written by step 3) but
    // the stale pre-spill contents after the rewrite.
    const bool SlotsDistinct = SlotC != SlotA && SlotC != SlotB &&
                               SlotD != SlotA && SlotD != SlotB &&
                               SlotC != SlotD;
    if (!Matched || !SlotsDistinct) {
      ++It;
      continue;
    }

    // Rewrite: emit CLC ; ADC c ; STA a ; TXA ; ADC d ; STA b before the
    // first matched instruction, then erase the ten originals.  TXA does
    // not touch C, so the lo-half carry reaches the hi-half ADC.
    const DebugLoc DL = P[0]->getDebugLoc();
    BuildMI(MBB, P[0], DL, TII.get(W65816::CLC));
    BuildMI(MBB, P[0], DL, TII.get(W65816::ADC_StackRel)).addImm(SlotC);
    BuildMI(MBB, P[0], DL, TII.get(W65816::STA_StackRel)).addImm(SlotA);
    BuildMI(MBB, P[0], DL, TII.get(W65816::TXA));
    BuildMI(MBB, P[0], DL, TII.get(W65816::ADC_StackRel)).addImm(SlotD);
    BuildMI(MBB, P[0], DL, TII.get(W65816::STA_StackRel)).addImm(SlotB);

    // Advance It past the matched pattern before erasing (so we
    // don't iterate through deleted insts).
    It = std::next(P[9]);
    for (MachineBasicBlock::iterator Old : P)
      Old->eraseFromParent();
    Changed = true;
  }
}

// Dead TAX / TXA elimination. STAfi declares `Defs = [A]` as a
// safe over-approximation (eliminateFrameIndex emits a PHA-bracketed
// sequence when the source is IMG-class). Regalloc honors that by
// inserting `TAX ; ...STAfi... 
; TXA` brackets around STAfi that // SOURCES from A — but in the A-source path A is preserved. The // TXA's output gets clobbered immediately by the next LDA*, so the // TXA is dead; once TXA is gone, the TAX's X-value has no consumer // and is dead too. This pattern recurs once per i32-spill site. // // Conservative: only elide TXA if the IMMEDIATE next non-debug // instruction defines $a (and doesn't read $a or N/Z first). No // intervening flag-readers between TXA and the A-define is then // guaranteed. Same logic for TYA. // // For TAX: elide if no instruction between TAX and the next $x def // reads $x (and we can prove the original X had no live consumer). // Done as a fixed-point: keep iterating until no change. auto definesReg = [](const MachineInstr &MI, unsigned Reg) -> bool { for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() == Reg && MO.isDef()) return true; } return false; }; auto readsReg = [](const MachineInstr &MI, unsigned Reg) -> bool { for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() == Reg && MO.isUse()) return true; } return false; }; bool again2 = true; while (again2) { again2 = false; // Pass A: dead TXA / TYA for (auto It = MBB.begin(); It != MBB.end(); ) { unsigned O = It->getOpcode(); if (O != W65816::TXA && O != W65816::TYA) { ++It; continue; } auto Next = std::next(It); while (Next != MBB.end() && Next->isDebugInstr()) ++Next; if (Next == MBB.end()) { ++It; continue; } // Next must define $a unconditionally, and must not read $a // (since we're about to discard the TXA-defined A) and must // not be a call / branch / inline asm (which conservatively // read $a). if (Next->isCall() || Next->isBranch() || Next->isReturn() || Next->isInlineAsm()) { ++It; continue; } if (!definesReg(*Next, W65816::A)) { ++It; continue; } if (readsReg(*Next, W65816::A)) { ++It; continue; } // P (flags) liveness: TXA/TYA set N/Z. If Next reads P, we'd // be discarding the flags it expects. 
Bxx and friends read P. // Conservative: also require Next does not read $p. if (readsReg(*Next, W65816::P)) { ++It; continue; } auto Dead = It++; Dead->eraseFromParent(); Changed = true; again2 = true; } // Pass B: dead TAX / TAY for (auto It = MBB.begin(); It != MBB.end(); ) { unsigned O = It->getOpcode(); unsigned Target; if (O == W65816::TAX) Target = W65816::X; else if (O == W65816::TAY) Target = W65816::Y; else { ++It; continue; } // Walk forward. TAX/TAY is dead if every use of Target is // preceded by a redefinition of Target (and the in-MBB region // between has no flag-reader that consumes TAX's N/Z). At MBB // end, check successor live-ins: if none has Target as live-in // it's also dead. // // Flag liveness: TAX defines $p (N/Z). A later $p-reader only // consumes TAX's flags if no intervening instruction REDEFINES // $p in the gap. Track `pRedef` to allow common patterns like // `TAX ; CLC ; ADC ; ...` where ADC reads $p but the $p it // reads is the freshly-CLC'd carry, not TAX's N/Z. auto Walker = std::next(It); bool deadIt = false; bool bailed = false; bool pRedef = false; while (Walker != MBB.end()) { if (Walker->isDebugInstr()) { ++Walker; continue; } if (Walker->isCall() || Walker->isInlineAsm()) { bailed = true; break; } // Branch / return: stop walking; rely on successor live-ins. if (Walker->isBranch() || Walker->isReturn()) break; if (readsReg(*Walker, Target)) { bailed = true; break; } if (readsReg(*Walker, W65816::P) && !pRedef) { bailed = true; break; } if (definesReg(*Walker, W65816::P)) pRedef = true; if (definesReg(*Walker, Target)) { deadIt = true; break; } ++Walker; } if (bailed) { ++It; continue; } if (!deadIt) { // Fell through to MBB end / branch. Check successor live-ins. bool liveOut = false; for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(Target)) { liveOut = true; break; } } // Return blocks: $a and $x are the i32 return-value convention. 
// RTL doesn't model these as Uses, but they ARE live at the // return. Be conservative — don't elide TAX/TAY before a return. if (!MBB.empty() && MBB.back().isReturn()) liveOut = true; if (liveOut) { ++It; continue; } } auto Dead = It++; Dead->eraseFromParent(); Changed = true; again2 = true; } } // Third peephole: drop `LDY_Imm16 K` when Y already holds K from // an earlier LDY in the same MBB and no intervening MI clobbered // Y. Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY, // even though Y already holds 0 from a previous emit — the // redundant LDYs survive MachineLICM because Y is a phys reg and // the inserter binds them tightly to each use. int yKnown = -1; // -1 means unknown; otherwise the immediate auto It2 = MBB.begin(); while (It2 != MBB.end()) { MachineInstr &MI = *It2; if (MI.isDebugInstr()) { ++It2; continue; } unsigned Op = MI.getOpcode(); if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 && MI.getOperand(0).isImm()) { int K = MI.getOperand(0).getImm() & 0xFFFF; if (yKnown == K) { // Before erasing this redundant LDY: the prior LDY is still in // scope, so all of its Y-uses between the two LDYs are still // valid uses. But liveness already marked the LAST one (just // before the redundant LDY) as `implicit killed $y`, because // that LDY was about to redefine Y. After erasure, Y survives // through to the NEXT use, so the prior "kill" annotation is // wrong and the machine verifier rejects. Walk backward and // clear the kill flag on the most recent Y-using operand. 
for (auto Back = std::prev(It2);; --Back) { bool clearedAny = false; for (MachineOperand &MO : Back->operands()) { if (MO.isReg() && MO.getReg() == W65816::Y && MO.isUse() && MO.isKill()) { MO.setIsKill(false); clearedAny = true; } } if (clearedAny) break; if (Back == MBB.begin()) break; } auto Erase = It2++; Erase->eraseFromParent(); Changed = true; continue; } yKnown = K; } else { // Conservatively invalidate yKnown on anything that touches Y // or on calls / inline asm / any instruction that doesn't have // a clean "no Y effect" guarantee. Cheaper to underclaim than // miscompile. switch (Op) { case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown case W65816::STAfi_indY: case W65816::LDA_StackRelIndY: case W65816::STA_StackRelIndY: break; case W65816::TAY: case W65816::TXY: case W65816::INY: case W65816::DEY: case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs: case W65816::LDY_DPX: case W65816::LDY_AbsX: yKnown = -1; break; default: if (MI.isCall()) yKnown = -1; break; } } ++It2; } } // Store forwarding (disabled — CRC32 regressed and I couldn't // nail down the safety hole in time). Even with PHP-wrap guards // and SP-modifier bails, the first fire (in memmove) silently // miscompiles something that CRC32 later depends on. Pattern // is sound; safety analysis isn't complete. See // feedback_close_gap_attempts_round2.md for details. #if 0 // Store forwarding for PHI memory copies. Pattern (sumSquares // loop body): // // STA X,s ; A → slot X (some intermediate result) // [code that modifies A but doesn't touch slot X or slot Y] // LDA X,s ; reload A from slot X // STA Y,s ; A → slot Y (the PHI copy) // // Transform: insert `STA Y,s` right after the first `STA X,s` (A // still holds the same value at that point), then drop the LDA- // STA pair. Net: -1 inst per pattern occurrence. 
//
// Safety constraints (all between STA X and the LDA-STA pair, in
// the same MBB, in straight-line code):
//   - No instruction writes slot X (else the LDA would see a
//     different value than the original STA).
//   - No instruction reads OR writes slot Y (else our early STA Y
//     would be observed mid-flight with a different value than
//     before, or our inserted store would be overwritten and the
//     intervening read of Y in the original would have seen the
//     overwrite).
//   - No call / inline asm / branch (conservatively: those can
//     touch memory we don't model).
//
// NOTE(review): the code below erases `LDA X,s` but never checks what
// follows the erased pair. After the rewrite A is no longer reloaded from
// slot X and the LDA no longer sets N/Z, so any later reader of A or of
// those flags sees different values. Possibly the unexplained miscompile —
// confirm before re-enabling.
{
  // Opcodes treated as tracked stack-relative accesses.
  auto isStackRelMC2 = [](unsigned Op) {
    return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel ||
           Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel ||
           Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel ||
           Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel;
  };
  // For a tracked stack-relative op with an immediate operand 0, stores
  // that immediate in Off and returns true.
  auto srAccess2 = [&](const MachineInstr &MI, int64_t &Off) -> bool {
    if (!isStackRelMC2(MI.getOpcode())) return false;
    if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
    Off = MI.getOperand(0).getImm();
    return true;
  };
  auto isStaSr = [](const MachineInstr &MI) {
    return MI.getOpcode() == W65816::STA_StackRel;
  };
  auto isLdaSr = [](const MachineInstr &MI) {
    return MI.getOpcode() == W65816::LDA_StackRel;
  };

  // NOTE(review): the template argument lists on the next two declarations
  // look like they were lost (ToErase receives MachineInstr pointers,
  // ToInsert receives {MachineInstr *, int64_t} entries) — restore them
  // before re-enabling.
  SmallVector ToErase;
  SmallVector, 4> ToInsert;
  // Debug aid: STORE_FWD_LIMIT in the environment caps how many candidates
  // may fire (negative / unset means no cap).
  static int g_fireLimit = -1;
  static int g_fireCount = 0;
  static bool initd = false;
  if (!initd) {
    if (const char *e = getenv("STORE_FWD_LIMIT"))
      g_fireLimit = atoi(e);
    initd = true;
  }
  for (MachineBasicBlock &MBB : MF) {
    for (auto It = MBB.begin(); It != MBB.end(); ++It) {
      if (!isStaSr(*It)) continue;
      int64_t X;
      if (!srAccess2(*It, X)) continue;
      MachineInstr *StaX = &*It;
      // Check if StaX is INSIDE an open PHP/PLP wrap. In that case
      // its operand offset has been pre-bumped by +1, and inserting
      // a sibling STA Y immediately after writes at the WRONG slot
      // (the un-bumped Y). Walk backward: if we find a PHP without
      // a matching PLP first, bail.
      {
        bool insideWrap = false;
        int depth = 0; // PLPs seen so far that still await their PHP
        auto B = It;
        while (B != MBB.begin()) {
          --B;
          if (B->getOpcode() == W65816::PLP) depth++;
          else if (B->getOpcode() == W65816::PHP) {
            if (depth > 0) depth--;
            else { insideWrap = true; break; }
          }
        }
        if (insideWrap) continue;
      }
      // Walk forward looking for LDA X ; STA Y. Conservative bail
      // on any non-tracked memory op (indirect pointer access,
      // DP/abs ops, etc.) which could alias slot Y via memory.
      bool ok = true;
      int64_t Y = -1;
      MachineInstr *LdaX = nullptr;
      MachineInstr *StaY = nullptr;
      for (auto Walker = std::next(It); Walker != MBB.end(); ++Walker) {
        if (Walker->isDebugInstr()) continue;
        if (Walker->isCall() || Walker->isInlineAsm() ||
            Walker->isBranch() || Walker->isReturn()) {
          ok = false; break;
        }
        // Found LDA X?
        int64_t Off;
        if (isLdaSr(*Walker) && srAccess2(*Walker, Off) && Off == X) {
          LdaX = &*Walker;
          // The very next non-debug instruction must be STA Y with Y != X.
          auto Next = std::next(Walker);
          while (Next != MBB.end() && Next->isDebugInstr()) ++Next;
          if (Next == MBB.end() || !isStaSr(*Next) ||
              !srAccess2(*Next, Y) || Y == X) {
            ok = false;
          } else {
            StaY = &*Next;
          }
          break;
        }
        // Stack-rel access to X (write or read): bail.
        if (srAccess2(*Walker, Off) && Off == X) { ok = false; break; }
        // Any memory-touching op that's NOT a tracked stack-rel
        // access — bail. Indirect pointer stores/loads (DPIndY /
        // DPIndLong / abs / etc.) could alias slot Y via a pointer
        // we can't trace, and the safety check below would miss it.
        if ((Walker->mayLoad() || Walker->mayStore()) &&
            !isStackRelMC2(Walker->getOpcode())) {
          ok = false; break;
        }
        // SP-modifying ops shift the stack-rel addressing window —
        // a later `lda X, s` reads a DIFFERENT byte than the earlier
        // `sta X, s` (or worse, the new stack pointer points into
        // saved P/retaddr). Bail on TCS (direct SP write) and on
        // any stack push/pop (PHx/PLx/PEA/PEI/COP/BRK). Also bail
        // on PHP/PLP because the wrap pass already bumped in-wrap
        // stack-rel ops by +1 — our inserted STA after STA X writes
        // at the un-bumped offset which gets the WRONG slot.
        // NOTE(review): COP and BRK are named above but are not in the
        // opcode list below.
        {
          unsigned WO = Walker->getOpcode();
          if (WO == W65816::TCS || WO == W65816::PHA || WO == W65816::PLA ||
              WO == W65816::PHX || WO == W65816::PLX || WO == W65816::PHY ||
              WO == W65816::PLY || WO == W65816::PHP || WO == W65816::PLP ||
              WO == W65816::PHB || WO == W65816::PLB || WO == W65816::PHD ||
              WO == W65816::PLD || WO == W65816::PHK || WO == W65816::PEA ||
              WO == W65816::PEI_DP) {
            ok = false; break;
          }
        }
      }
      if (!ok || !LdaX || !StaY) continue;
      if (g_fireLimit >= 0 && g_fireCount >= g_fireLimit) continue;
      // NOTE(review): the counter and the trace line are emitted before
      // the slot-Y gap check below, so they also count candidates that are
      // then rejected.
      g_fireCount++;
      errs() << "SF FIRE " << g_fireCount << " in " << MF.getName()
             << " MBB " << MBB.getNumber() << " X=" << X
             << " Y=" << StaY->getOperand(0).getImm() << "\n";
      // Now re-walk from std::next(It) up to LdaX and verify no
      // access to slot Y in that gap.
      ok = true;
      for (auto W2 = std::next(It); W2 != LdaX->getIterator(); ++W2) {
        if (W2->isDebugInstr()) continue;
        int64_t Off;
        if (srAccess2(*W2, Off) && Off == Y) { ok = false; break; }
      }
      if (!ok) continue;
      // Safe to apply: schedule the StaY-after-StaX insert, and
      // erase LdaX and StaY.
      ToInsert.push_back({StaX, Y});
      ToErase.push_back(LdaX);
      ToErase.push_back(StaY);
      Changed = true;
    }
  }
  // Apply (insertions first; iterators stay valid through erase).
  for (auto &P : ToInsert) {
    MachineInstr *StaX = std::get<0>(P);
    int64_t Y = std::get<1>(P);
    MachineBasicBlock *MBB = StaX->getParent();
    DebugLoc DL = StaX->getDebugLoc();
    auto NextIt = std::next(StaX->getIterator());
    BuildMI(*MBB, NextIt, DL, TII.get(W65816::STA_StackRel))
        .addImm(Y);
  }
  for (MachineInstr *MI : ToErase)
    MI->eraseFromParent();
}
#endif

// (Redundant CMP #0 elimination — disabled, hit VLA sum_n
// regression. Carry-flag bookkeeping across the CMP turned out to
// have more cases than my forward-walk modeled. See
// feedback_cmp_zero_elim.md.)
#if 0
{
  // Opcodes accepted as "the producer of A already set N/Z on A", which
  // is what makes a following CMP #0 a candidate for removal.
  auto isNZSetOnA = [](unsigned Op) {
    switch (Op) {
    case W65816::DEA_PSEUDO:
    case W65816::INA_PSEUDO:
    case W65816::ADC_StackRel:
    case W65816::ADC_DP:
    case W65816::ADC_Imm16:
    case W65816::SBC_StackRel:
    case W65816::SBC_DP:
    case W65816::SBC_Imm16:
    case W65816::AND_StackRel:
    case W65816::AND_DP:
    case W65816::AND_Imm16:
    case W65816::ORA_StackRel:
    case W65816::ORA_DP:
    case W65816::ORA_Imm16:
    case W65816::EOR_StackRel:
    case W65816::EOR_DP:
    case W65816::EOR_Imm16:
    case W65816::LDA_StackRel:
    case W65816::LDA_DP:
    case W65816::LDAi16imm:
    case W65816::LDA_Imm16:
    case W65816::TXA:
    case W65816::TYA:
    case W65816::ADCi16imm:
    case W65816::ADCEi16imm:
    case W65816::SBCi16imm:
    case W65816::SBCEi16imm:
      return true;
    default:
      return false;
    }
  };
  // True for CMPi16imm whose first immediate operand is 0.
  auto isCmpZero = [](const MachineInstr &MI) {
    if (MI.getOpcode() != W65816::CMPi16imm) return false;
    // Operand layout: lhs (Acc16), imm. Find the imm.
    for (const MachineOperand &MO : MI.operands()) {
      if (MO.isImm()) return MO.getImm() == 0;
    }
    return false;
  };
  // True if MI has a def operand for $a.
  auto modifiesA = [](const MachineInstr &MI) {
    for (const MachineOperand &MO : MI.operands()) {
      if (MO.isReg() && MO.getReg() == W65816::A && MO.isDef())
        return true;
    }
    return false;
  };
  auto readsC = [](const MachineInstr &MI) {
    // We don't model individual flag bits; approximate by checking
    // if the MI reads $p AND is one of the carry-consuming ops.
    // NOTE(review): only the opcode is tested below; there is no check
    // that the MI actually has a $p use operand.
    unsigned Op = MI.getOpcode();
    switch (Op) {
    case W65816::ADC_StackRel:
    case W65816::ADC_DP:
    case W65816::ADC_Imm16:
    case W65816::SBC_StackRel:
    case W65816::SBC_DP:
    case W65816::SBC_Imm16:
    case W65816::ADCEi16imm:
    case W65816::SBCEi16imm:
    case W65816::BCC:
    case W65816::BCS:
    case W65816::ROL_A:
    case W65816::ROR_A:
      return true;
    default:
      return false;
    }
  };
  // NOTE(review): template arguments appear to be missing here as well
  // (the vector receives MachineInstr pointers).
  SmallVector CmpsToErase;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!isCmpZero(MI)) continue;
      // Walk backward, skipping flag-preserving instructions. The walk
      // ends at the first call / inline asm, $a def, or $p def; only a
      // $a def whose opcode is in isNZSetOnA counts as a producer.
      bool foundProducer = false;
      auto Back = MI.getIterator();
      while (Back != MBB.begin()) {
        --Back;
        if (Back->isDebugInstr()) continue;
        if (Back->isCall() || Back->isInlineAsm()) break;
        if (modifiesA(*Back)) {
          foundProducer = isNZSetOnA(Back->getOpcode());
          break;
        }
        bool defsP = false;
        for (const MachineOperand &MO : Back->operands()) {
          if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef()) {
            defsP = true; break;
          }
        }
        if (defsP) break;
      }
      if (!foundProducer) continue;
      // Walk FORWARD from CMP: until the next C-defining MI, no MI
      // reads C.
      // NOTE(review): two gaps are visible in this walk. (1) It stops at
      // ANY $p def, yet flag bits are not modelled individually, so a $p
      // def that leaves carry alone would end the walk while the CMP's
      // carry is still observable. (2) It also stops at MBB.end() without
      // consulting successors. Possibly related to the recorded
      // regression — confirm.
      bool cConsumed = false;
      for (auto Fwd = std::next(MI.getIterator()); Fwd != MBB.end(); ++Fwd) {
        if (Fwd->isDebugInstr()) continue;
        if (readsC(*Fwd)) { cConsumed = true; break; }
        // Next def of $p: subsequent reads aren't ours.
        bool defsP = false;
        for (const MachineOperand &MO : Fwd->operands()) {
          if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef()) {
            defsP = true; break;
          }
        }
        if (defsP) break;
      }
      if (cConsumed) continue;
      CmpsToErase.push_back(&MI);
    }
  }
  // Erasure is deferred so the range-for loops above never iterate over a
  // deleted instruction.
  for (MachineInstr *MI : CmpsToErase)
    MI->eraseFromParent();
  if (!CmpsToErase.empty()) Changed = true;
}
#endif

// (Narrow PHI-copy slot collapse — disabled, qsort regression.)
#if 0
{
  // Opcodes treated as tracked stack-relative accesses.
  auto isStackRelMC2 = [](unsigned Op) {
    return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel ||
           Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel ||
           Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel ||
           Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel;
  };
  // For a tracked stack-relative op with an immediate operand 0, stores
  // that immediate in Off and returns true.
  auto srAccess2 = [&](const MachineInstr &MI, int64_t &Off) {
    if (!isStackRelMC2(MI.getOpcode())) return false;
    if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
    Off = MI.getOperand(0).getImm();
    return true;
  };
  // Function-wide census per stack-relative offset: total references, the
  // STA/LDA counts, and the last STA / LDA instruction seen for it.
  // NOTE(review): key/value template arguments appear to be missing on
  // these declarations and on ToErase below — restore before re-enabling.
  DenseMap Refs;
  DenseMap StaInst, LdaInst;
  DenseMap NSta, NLda;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      int64_t Off;
      if (!srAccess2(MI, Off)) continue;
      Refs[Off]++;
      if (MI.getOpcode() == W65816::STA_StackRel) {
        NSta[Off]++;
        StaInst[Off] = &MI;
      } else if (MI.getOpcode() == W65816::LDA_StackRel) {
        NLda[Off]++;
        LdaInst[Off] = &MI;
      }
    }
  }
  SmallVector ToErase;
  for (auto &P : Refs) {
    int64_t X = P.first;
    if (P.second != 2) continue;  // exactly 2 references
    if (NSta[X] != 1 || NLda[X] != 1) continue;
    MachineInstr *Sta = StaInst[X];
    MachineInstr *Lda = LdaInst[X];
    if (Sta->getParent() != Lda->getParent()) continue;
    MachineBasicBlock *MBB = Sta->getParent();
    // Sta must be before Lda.
    bool staBefore = false;
    for (auto It = MBB->begin(); It != MBB->end(); ++It) {
      if (&*It == Sta) { staBefore = true; break; }
      if (&*It == Lda) break;
    }
    if (!staBefore) continue;
    // Next after Lda must be STA Y where Y != X.
    auto NextIt = std::next(Lda->getIterator());
    while (NextIt != MBB->end() && NextIt->isDebugInstr()) ++NextIt;
    if (NextIt == MBB->end()) continue;
    int64_t Y;
    if (NextIt->getOpcode() != W65816::STA_StackRel ||
        !srAccess2(*NextIt, Y) || Y == X) continue;
    // Between Sta and Lda, no read/write of slot Y, no call, no
    // anything that would re-set slot Y's value mid-flight.
    // NOTE(review): only tracked stack-relative opcodes, calls and inline
    // asm are examined here; other memory ops and stack-pointer changes
    // are not. As in the store-forwarding section, erasing the LDA also
    // changes what A and N/Z hold after the pair — confirm nothing reads
    // them.
    bool ok = true;
    for (auto It = std::next(Sta->getIterator());
         It != Lda->getIterator(); ++It) {
      if (It->isDebugInstr()) continue;
      if (It->isCall() || It->isInlineAsm()) { ok = false; break; }
      int64_t Off;
      if (srAccess2(*It, Off) && Off == Y) { ok = false; break; }
    }
    if (!ok) continue;
    // Redirect the original STA to write to Y; delete the LDA-STA pair.
    Sta->getOperand(0).setImm(Y);
    ToErase.push_back(Lda);
    ToErase.push_back(&*NextIt);
    Changed = true;
  }
  for (MachineInstr *MI : ToErase)
    MI->eraseFromParent();
}
#endif

// Report whether anything in the function was modified.
return Changed;
}