1133 lines
49 KiB
C++
1133 lines
49 KiB
C++
//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice
|
|
// versa) pairs that toggle the M-bit redundantly.
|
|
//
|
|
// The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits
|
|
// `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When
|
|
// two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between
|
|
// them), the post-PEI stream contains:
|
|
//
|
|
// SEP #$20
|
|
// STA d1, S
|
|
// REP #$20 <-- toggle
|
|
// SEP #$20 <-- toggle (cancels above)
|
|
// STA d2, S
|
|
// REP #$20
|
|
//
|
|
// The middle REP/SEP pair is a no-op: both stores can run in one M=1
|
|
// region. We drop them to leave:
|
|
//
|
|
// SEP #$20
|
|
// STA d1, S
|
|
// STA d2, S
|
|
// REP #$20
|
|
//
|
|
// Saves 2 bytes / 6 cycles per coalesced pair. Symmetric `SEP/REP`
|
|
// pairs (M=1 then M=0 with nothing in between) are also dropped — they
|
|
// can arise around inline-asm or hand-written assembly snippets.
|
|
//
|
|
// Runs at addPreEmitPass (after PEI has expanded STA8fi).
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "W65816.h"
|
|
#include "W65816InstrInfo.h"
|
|
#include "W65816Subtarget.h"
|
|
#include "llvm/ADT/SmallSet.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "w65816-sep-rep-cleanup"
|
|
|
|
// W65816 processor status M-bit mask (set/clear via SEP/REP #$20).
|
|
static constexpr int kMBit = 1 << 5; // == 0x20, the mask used by `SEP #$20` / `REP #$20`
|
|
|
|
namespace {
|
|
|
|
class W65816SepRepCleanup : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
W65816SepRepCleanup() : MachineFunctionPass(ID) {}
|
|
|
|
StringRef getPassName() const override {
|
|
return "W65816 SEP/REP toggle coalescing";
|
|
}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
char W65816SepRepCleanup::ID = 0;
|
|
|
|
INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE,
|
|
"W65816 SEP/REP toggle coalescing", false, false)
|
|
|
|
// Factory for the cleanup pass. Returns a freshly heap-allocated pass
// object; the caller receives the raw pointer.
FunctionPass *llvm::createW65816SepRepCleanup() {
  auto *Pass = new W65816SepRepCleanup();
  return Pass;
}
|
|
|
|
// Returns the immediate value of `op` if MI is a `SEP #imm` or `REP #imm`,
|
|
// else -1.
|
|
static int getSepRepImm(const MachineInstr &MI, unsigned Opc) {
|
|
if (MI.getOpcode() != Opc)
|
|
return -1;
|
|
if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm())
|
|
return -1;
|
|
return MI.getOperand(0).getImm();
|
|
}
|
|
|
|
// Returns true if MI may consume the carry or overflow flag — these
|
|
// are the flags that ADC/SBC define but INA/DEA don't. Conservative:
|
|
// any branch that reads C or V counts, plus the chained ADC/SBC ops
|
|
// that wait for a prior carry-out. Anything else (CMP, CLC, SEC,
|
|
// LDA, STA, AND, ORA, EOR, etc.) re-defines or doesn't read C/V.
|
|
static bool readsCarryOrV(const MachineInstr &MI) {
|
|
switch (MI.getOpcode()) {
|
|
case W65816::BCS: // reads C
|
|
case W65816::BCC: // reads C
|
|
case W65816::BVS: // reads V
|
|
case W65816::BVC: // reads V
|
|
case W65816::ADC_StackRel: // reads C as carry-in
|
|
case W65816::ADC_Imm16:
|
|
case W65816::ADC_Imm8:
|
|
case W65816::ADC_DP:
|
|
case W65816::ADC_Abs:
|
|
case W65816::SBC_StackRel:
|
|
case W65816::SBC_Imm16:
|
|
case W65816::SBC_Imm8:
|
|
case W65816::SBC_DP:
|
|
case W65816::SBC_Abs:
|
|
// Chained-carry pseudos. These run BEFORE AsmPrinter expansion so
|
|
// we must whitelist them explicitly — they're the hi-half of any
|
|
// multi-precision add/sub and read the lo-half's carry-out. Without
|
|
// these, the INA/DEA peephole below silently rewrites a lo-half
|
|
// `ADCi16imm src, 2` to `INA; INA` (which DOES NOT set C), breaking
|
|
// the i32 ADD carry chain. Caught as `arr[0] = arr[1]` writing to
|
|
// wrong bank under ptr32 because the high half got a stale C.
|
|
case W65816::ADCEi16imm:
|
|
case W65816::SBCEi16imm:
|
|
// The fi/abs/imm forms of ADC/SBC are also pre-AsmPrinter pseudos;
|
|
// each expands to a real ADC_/SBC_ opcode that reads carry.
|
|
case W65816::ADCi16imm: // lo-half (CLC + ADC_Imm16)
|
|
case W65816::SBCi16imm: // lo-half (SEC + SBC_Imm16)
|
|
case W65816::ADCfi: // chained-carry stack form
|
|
case W65816::SBCfi:
|
|
case W65816::ADCEfi:
|
|
case W65816::SBCEfi:
|
|
case W65816::ADCabs:
|
|
case W65816::SBCabs:
|
|
case W65816::ROL_A: // rotates fold C in
|
|
case W65816::ROR_A:
|
|
case W65816::ROL_DP:
|
|
case W65816::ROL_Abs:
|
|
case W65816::ROR_DP:
|
|
case W65816::ROR_Abs:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Returns true if `Op` is one of the flag-redefining opcodes (CLC, SEC,
|
|
// CMP*, CPX*, CPY*, REP, SEP) — observing C/V before this is safe.
|
|
// Includes the pseudo CMP* variants (CMPi16imm etc.) since this peephole
|
|
// runs at pre-emit, BEFORE the AsmPrinter expands them.
|
|
static bool isFlagRedefiner(unsigned Op) {
|
|
switch (Op) {
|
|
case W65816::CLC:
|
|
case W65816::SEC:
|
|
case W65816::CMP_Imm8: case W65816::CMP_Imm16:
|
|
case W65816::CMP_StackRel: case W65816::CMP_DP: case W65816::CMP_Abs:
|
|
case W65816::CMPi16imm: case W65816::CMPi8imm:
|
|
case W65816::CMPfi: case W65816::CMPabs:
|
|
case W65816::CMP_RR:
|
|
case W65816::CPX_Imm8: case W65816::CPX_Imm16:
|
|
case W65816::CPX_DP: case W65816::CPX_Abs:
|
|
case W65816::CPY_Imm8: case W65816::CPY_Imm16:
|
|
case W65816::CPY_DP: case W65816::CPY_Abs:
|
|
case W65816::REP: case W65816::SEP:
|
|
return true;
|
|
default: return false;
|
|
}
|
|
}
|
|
|
|
// Returns true if a subsequent MI in the same MBB observes the C/V
|
|
// flags before any flag-redefiner clears the dependency. At MBB end,
|
|
// extends one step into each successor: if any successor's first
|
|
// (non-debug) MI reads C/V before redefining them, the flag is live
|
|
// across the edge — bail. This is critical for loop bodies where
|
|
// the back-edge re-enters the same MBB at LDA/PHA (neither reads C/V),
|
|
// so a per-iteration `clc; adc #2` is foldable. Cross-MBB carry chains
|
|
// would normally use ADCEi16imm (not ADCi16imm), so this is safe.
|
|
static bool carryFlagLiveAfter(MachineBasicBlock::iterator After,
|
|
MachineBasicBlock &MBB) {
|
|
// Phase 1: scan within this MBB.
|
|
for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) {
|
|
if (Probe->isDebugInstr()) continue;
|
|
if (readsCarryOrV(*Probe)) return true;
|
|
if (isFlagRedefiner(Probe->getOpcode())) return false;
|
|
if (Probe->isCall()) return false; // callee resets flags
|
|
}
|
|
// Phase 2: peek into each successor's first few MIs. We BAIL only on
|
|
// a positive C/V read; reaching MBB end or peek-cap without finding
|
|
// one is treated as "carry dead" — ADCi16imm's carry-out is never
|
|
// used in carry chains (those use ADCEi16imm), so a stray carry
|
|
// floating into RTL or an unrelated arithmetic op causes no harm.
|
|
const unsigned MaxPeek = 6;
|
|
for (MachineBasicBlock *Succ : MBB.successors()) {
|
|
unsigned Peeked = 0;
|
|
for (auto &MI : *Succ) {
|
|
if (MI.isDebugInstr()) continue;
|
|
if (readsCarryOrV(MI)) return true;
|
|
if (isFlagRedefiner(MI.getOpcode()) || MI.isCall()) break;
|
|
if (++Peeked >= MaxPeek) break;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Convert `ADCi16imm dst, src, ±1`/`±2` and `SBCi16imm` similarly to
|
|
// INA / INA;INA / DEA / DEA;DEA chains when C/V are dead. ADCi16imm
|
|
// is a pseudo that expands to CLC+ADC_Imm16 (4B/5cyc). INA is 1B/2cyc.
|
|
// Savings per ±1: 3B/3cyc; per ±2: 2B/1cyc. SBCi16imm is symmetric
|
|
// (sub by N == add by -N), so SBC #1 → DEA, SBC #-1 → INA, etc.
|
|
static bool foldImmAdcToInaDea(MachineBasicBlock &MBB,
|
|
const W65816InstrInfo &TII) {
|
|
bool Changed = false;
|
|
auto It = MBB.begin();
|
|
while (It != MBB.end()) {
|
|
unsigned Op = It->getOpcode();
|
|
bool isAdc = (Op == W65816::ADCi16imm);
|
|
bool isSbc = (Op == W65816::SBCi16imm);
|
|
if ((!isAdc && !isSbc) || It->getNumOperands() < 3 ||
|
|
!It->getOperand(2).isImm()) { ++It; continue; }
|
|
int64_t Imm = (int16_t)It->getOperand(2).getImm();
|
|
// For SBC, negate: SBC by +N is "subtract N", same as ADC by -N.
|
|
int64_t Effective = isSbc ? -Imm : Imm;
|
|
if (Effective < -2 || Effective > 2 || Effective == 0) { ++It; continue; }
|
|
if (carryFlagLiveAfter(It, MBB)) { ++It; continue; }
|
|
|
|
DebugLoc DL = It->getDebugLoc();
|
|
unsigned NewOpc = (Effective > 0) ? W65816::INA : W65816::DEA;
|
|
unsigned Count = (Effective > 0) ? Effective : -Effective;
|
|
for (unsigned i = 0; i < Count; ++i)
|
|
BuildMI(MBB, It, DL, TII.get(NewOpc));
|
|
auto NextIt = std::next(It);
|
|
It->eraseFromParent();
|
|
It = NextIt;
|
|
Changed = true;
|
|
}
|
|
return Changed;
|
|
}
|
|
|
|
// DBG_VALUE preservation in this pass:
|
|
//
|
|
// Every instruction this pass erases falls into one of these classes:
|
|
// * SEP/REP — MCInst-level mode-flag toggles, no value flow.
|
|
// * TAX/TXA/TAY/TYA — register transfers; the source value still
|
|
// exists in A and is followed by an A-redefining instruction that
|
|
// was the reason we identified the transfer as dead.
|
|
// * Redundant LDY_Imm16 — Y already holds the constant.
|
|
// * Redundant ADCi16imm/SBCi16imm rewritten to INA/DEA — same value,
|
|
// fewer cycles.
|
|
// * Lagged-ptr PHI-copy sink — relocates a `STA dst` from end-of-MBB
|
|
// to immediately after the iter-load. The destination slot is
|
|
// written earlier but with the same value at every read point
|
|
// because the iter's OLD value is what flowed through the
|
|
// PHP/PLP-wrapped tail copy.
|
|
// * i32-add store-bypass — reorders 10 instructions to 6 that
|
|
// compute the same lo/hi result into the same destination slots
|
|
// in the same order from the user's point of view.
|
|
//
|
|
// None of these change the user-visible value of a named variable at
|
|
// any PC where a DBG_VALUE could observe it. Hoisted/moved
|
|
// instructions write the same data at slightly earlier PCs in their
|
|
// MBB; a DBG_VALUE between the OLD and NEW write positions could read
|
|
// a slightly-fresher value (the next-iteration's prefetch instead of
|
|
// the current iteration's tail), but never a wrong value — the loop
|
|
// invariant guarantees both values agree at the moved boundary.
|
|
bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|
bool Changed = false;
|
|
const auto &STI = MF.getSubtarget<W65816Subtarget>();
|
|
const auto &TII = *STI.getInstrInfo();
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
// Pre-pass: hoist LDAi8imm out of byte-store SEP/REP wraps.
|
|
// The post-RA scheduler can move LDAi8imm (which is marked
|
|
// hasSideEffects=0 at MIR but expands at AsmPrinter to its OWN
|
|
// SEP+LDA8+REP that toggles M) INSIDE an STBptr inserter's
|
|
// SEP/REP wrap. When that happens, the LDAi8imm's expansion
|
|
// REP fires BEFORE the byte STA, leaving the STA in M=16 — the
|
|
// store becomes a 16-bit zero write, corrupting the byte AFTER
|
|
// the intended target. Detect the pattern and hoist the
|
|
// LDAi8imm above the outer SEP. #107 strtok_r BB0_15 was this
|
|
// exact bug.
|
|
{
|
|
SmallVector<MachineInstr *, 8> SepHoists;
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
if (It->getOpcode() != W65816::SEP) continue;
|
|
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
|
|
if (It->getOperand(0).getImm() != kMBit) continue;
|
|
// Walk forward looking for LDAi8imm before any STAfi_indY
|
|
// or REP at this nesting level.
|
|
auto Walker = std::next(It);
|
|
MachineInstr *LdaToHoist = nullptr;
|
|
while (Walker != MBB.end()) {
|
|
if (Walker->isDebugInstr()) { ++Walker; continue; }
|
|
unsigned Opc = Walker->getOpcode();
|
|
// Hit a REP — wrap is closing without LDAi8imm inside.
|
|
if (Opc == W65816::REP) break;
|
|
// Hit a call / branch / asm — bail.
|
|
if (Walker->isCall() || Walker->isBranch() ||
|
|
Walker->isReturn() || Walker->isInlineAsm()) break;
|
|
// Hit an STAfi_indY — this is the byte store; an LDAi8imm
|
|
// before it would be the bug, but if we found one already
|
|
// we'd have hoisted it; nothing to do here, stop scanning.
|
|
if (Opc == W65816::STAfi_indY) break;
|
|
if (Opc == W65816::LDAi8imm) {
|
|
LdaToHoist = &*Walker;
|
|
break;
|
|
}
|
|
++Walker;
|
|
}
|
|
if (LdaToHoist)
|
|
SepHoists.push_back(LdaToHoist);
|
|
}
|
|
for (MachineInstr *Lda : SepHoists) {
|
|
// Find the SEP we entered before the LDA. Walk backward.
|
|
auto Back = Lda->getIterator();
|
|
MachineInstr *OuterSep = nullptr;
|
|
while (Back != MBB.begin()) {
|
|
--Back;
|
|
if (Back->isDebugInstr()) continue;
|
|
if (Back->getOpcode() == W65816::SEP &&
|
|
Back->getNumOperands() >= 1 &&
|
|
Back->getOperand(0).isImm() &&
|
|
Back->getOperand(0).getImm() == kMBit) {
|
|
OuterSep = &*Back;
|
|
break;
|
|
}
|
|
if (Back->isCall() || Back->isBranch() || Back->isInlineAsm())
|
|
break;
|
|
}
|
|
if (!OuterSep) continue;
|
|
Lda->removeFromParent();
|
|
MBB.insert(OuterSep->getIterator(), Lda);
|
|
Changed = true;
|
|
}
|
|
}
|
|
|
|
SmallVector<MachineInstr *, 8> Toggles;
|
|
for (MachineInstr &MI : MBB) {
|
|
unsigned Opc = MI.getOpcode();
|
|
if (Opc == W65816::REP || Opc == W65816::SEP)
|
|
Toggles.push_back(&MI);
|
|
}
|
|
SmallPtrSet<MachineInstr *, 8> Erased;
|
|
for (MachineInstr *First : Toggles) {
|
|
if (Erased.count(First)) continue;
|
|
// The next non-debug instruction must be the matching opposite
|
|
// toggle with the same imm.
|
|
auto It = std::next(First->getIterator());
|
|
while (It != MBB.end() && It->isDebugInstr()) ++It;
|
|
if (It == MBB.end()) continue;
|
|
MachineInstr &Next = *It;
|
|
// Look for REP-then-SEP or SEP-then-REP with matching imm.
|
|
unsigned FirstOpc = First->getOpcode();
|
|
unsigned WantOpc = (FirstOpc == W65816::REP) ? W65816::SEP : W65816::REP;
|
|
int FirstImm = getSepRepImm(*First, FirstOpc);
|
|
int NextImm = getSepRepImm(Next, WantOpc);
|
|
if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue;
|
|
Erased.insert(First);
|
|
Erased.insert(&Next);
|
|
First->eraseFromParent();
|
|
Next.eraseFromParent();
|
|
Changed = true;
|
|
}
|
|
|
|
// Extended toggle coalesce — REP/SEP scheduling.
|
|
//
|
|
// Walk the MBB looking for `T1 ; ...neutral... ; T2` where T1 and
|
|
// T2 are opposite-polarity SEP/REP toggles (T1=REP T2=SEP, or
|
|
// vice versa) with the same imm, and the gap contains only
|
|
// M-mode-neutral instructions (transfers/branches/X-flag-only
|
|
// index ops). In that case T1+T2 form a no-op pair around code
|
|
// that doesn't care about M, so both can be dropped. Equivalent
|
|
// to "moving the SEP/REP wrap inward to skip the neutral region".
|
|
//
|
|
// Saves 4 bytes / 12 cycles per gap collapsed. The common
|
|
// trigger is two STA8 stores separated by an LDY for the second
|
|
// store's address — STA8fi each emit SEP/STA/REP, the existing
|
|
// adjacent coalesce can't see across the LDY, this pass can.
|
|
{
|
|
// Mode-neutral instruction set: don't touch the M-bit and
|
|
// don't depend on A's width. X-flag dependent ops (LDX/LDY/
|
|
// STX/STY/INX/DEX/INY/DEY/CPX/CPY/PHX/PHY/PLX/PLY) are
|
|
// independent of M. So are all branches, JMP/JSR/JSL/RTL/RTS,
|
|
// CLC/SEC/CLI/SEI/CLD/SED/CLV, NOP, and PHP/PLP (they push
|
|
// 8-bit P regardless of M).
|
|
auto isMNeutral = [](const MachineInstr &MI) -> bool {
|
|
if (MI.isDebugInstr()) return true;
|
|
if (MI.isBranch() || MI.isReturn()) return true;
|
|
unsigned O = MI.getOpcode();
|
|
switch (O) {
|
|
case W65816::LDX_Imm16: case W65816::LDX_DP: case W65816::LDX_Abs:
|
|
case W65816::LDX_DPY: case W65816::LDX_AbsY:
|
|
case W65816::LDY_Imm16: case W65816::LDY_DP: case W65816::LDY_Abs:
|
|
case W65816::LDY_DPX: case W65816::LDY_AbsX:
|
|
case W65816::STX_DP: case W65816::STX_Abs: case W65816::STX_DPY:
|
|
case W65816::STY_DP: case W65816::STY_Abs: case W65816::STY_DPX:
|
|
case W65816::INX: case W65816::DEX:
|
|
case W65816::INY: case W65816::DEY:
|
|
case W65816::CPX_Imm16: case W65816::CPX_DP: case W65816::CPX_Abs:
|
|
case W65816::CPY_Imm16: case W65816::CPY_DP: case W65816::CPY_Abs:
|
|
case W65816::PHX: case W65816::PHY:
|
|
case W65816::PLX: case W65816::PLY:
|
|
case W65816::CLC: case W65816::SEC:
|
|
case W65816::PHP: case W65816::PLP:
|
|
case W65816::NOP:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
};
|
|
|
|
bool again = true;
|
|
while (again) {
|
|
again = false;
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
unsigned Op1 = It->getOpcode();
|
|
if (Op1 != W65816::REP && Op1 != W65816::SEP) continue;
|
|
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
|
|
int Imm1 = It->getOperand(0).getImm();
|
|
if (Imm1 != kMBit) continue; // M-bit only
|
|
// Walk forward across mode-neutral ops looking for the matching
|
|
// opposite toggle. Bail at calls, asm, ALU ops on A, etc.
|
|
unsigned WantOp = (Op1 == W65816::REP) ? W65816::SEP : W65816::REP;
|
|
auto Walker = std::next(It);
|
|
MachineInstr *Match = nullptr;
|
|
while (Walker != MBB.end()) {
|
|
if (Walker->isDebugInstr()) { ++Walker; continue; }
|
|
unsigned WO = Walker->getOpcode();
|
|
if (WO == WantOp && Walker->getNumOperands() >= 1 &&
|
|
Walker->getOperand(0).isImm() &&
|
|
Walker->getOperand(0).getImm() == Imm1) {
|
|
Match = &*Walker;
|
|
break;
|
|
}
|
|
// Bail on anything that touches A or otherwise cares about M.
|
|
if (Walker->isCall() || Walker->isInlineAsm()) break;
|
|
if (!isMNeutral(*Walker)) break;
|
|
++Walker;
|
|
}
|
|
if (!Match) continue;
|
|
// Drop both toggles. Erasing changes iterator stability; restart.
|
|
MachineInstr *T1 = &*It;
|
|
T1->eraseFromParent();
|
|
Match->eraseFromParent();
|
|
Changed = true;
|
|
again = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm)
|
|
// into INA/DEA chains when the carry flag they would set is unused.
|
|
// ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it
|
|
// here BEFORE the AsmPrinter expansion runs. But this pass runs at
|
|
// pre-emit, AFTER post-RA pseudo expansion. ADCi16imm survives
|
|
// because its MCInst lowering is in W65816AsmPrinter (not in the
|
|
// generic post-RA pseudo expander), so it's still in the MIR here.
|
|
Changed |= foldImmAdcToInaDea(MBB, TII);
|
|
|
|
// PHI-copy hoist.
|
|
{
|
|
auto isStaLike = [](const MachineInstr &MI) {
|
|
unsigned O = MI.getOpcode();
|
|
return O == W65816::STA_StackRel || O == W65816::STZ_DP ||
|
|
O == W65816::STZ_Abs;
|
|
};
|
|
auto isLdaSR = [](const MachineInstr &MI) {
|
|
return MI.getOpcode() == W65816::LDA_StackRel;
|
|
};
|
|
// Accept LDA_Imm16 (MC) AND LDAi16imm (pseudo) inside the wrap —
|
|
// both are flag-clobbering A-loads of a 16-bit immediate, with
|
|
// no stack-rel offset to bump-undo and no memory operand to
|
|
// alias-check against the gap. Common in init blocks: `lda #0 ;
|
|
// sta slot,s` wrapped around the loop pre-test. Some functions
|
|
// still carry the pseudo LDAi16imm at SepRepCleanup time (post-RA
|
|
// pseudo expansion didn't lower it), so accept both spellings.
|
|
auto isImmLoad = [](const MachineInstr &MI) {
|
|
unsigned O = MI.getOpcode();
|
|
return O == W65816::LDA_Imm16 || O == W65816::LDAi16imm;
|
|
};
|
|
auto isFlagPreservingMem = [&](const MachineInstr &MI) {
|
|
return isStaLike(MI) || isLdaSR(MI) || isImmLoad(MI);
|
|
};
|
|
auto isLdaCount = [&](const MachineInstr &MI) {
|
|
return isLdaSR(MI) || isImmLoad(MI);
|
|
};
|
|
auto It = MBB.begin();
|
|
while (It != MBB.end()) {
|
|
if (It->getOpcode() != W65816::PHP) { ++It; continue; }
|
|
auto Php = It;
|
|
// Walk forward: collect LDA/STA pairs, stop at PLP.
|
|
auto Walker = std::next(Php);
|
|
SmallVector<MachineInstr *, 8> Block;
|
|
SmallSet<int64_t, 8> ReadSlots; // post-unbump slots (effective)
|
|
SmallSet<int64_t, 8> WriteSlots; // post-unbump slots (effective)
|
|
bool ok = true;
|
|
while (Walker != MBB.end()) {
|
|
if (Walker->isDebugInstr()) { ++Walker; continue; }
|
|
if (Walker->getOpcode() == W65816::PLP) break;
|
|
if (!isFlagPreservingMem(*Walker)) { ok = false; break; }
|
|
// Track stack-rel slots so we can check the gap below.
|
|
// Immediate loads have no stack-rel addr — skip.
|
|
// In-wrap LDA_StackRel / STA_StackRel slots are BUMPED by +1
|
|
// to compensate for PHP's S-decrement; on hoist out of the
|
|
// wrap we un-bump them. Record the POST-UNBUMP (effective)
|
|
// slot here so the gap conflict-check uses the addresses
|
|
// these ops will actually access in their new position.
|
|
// Without this, an outside-wrap LDA at slot N would not
|
|
// conflict with an in-wrap STA at slot N+1 even though the
|
|
// un-bumped STA writes the SAME memory address as the LDA
|
|
// reads — corrupting flag-test data flow. (bsearch's i32
|
|
// `lo < hi` termination compare under TTI-driven less-
|
|
// aggressive inlining: hoisting STA 6 -> STA 5 above LDA 5
|
|
// re-reads the just-overwritten value.)
|
|
unsigned WOpc = Walker->getOpcode();
|
|
bool isBumpedSR = (WOpc == W65816::LDA_StackRel ||
|
|
WOpc == W65816::STA_StackRel);
|
|
if (!isImmLoad(*Walker) &&
|
|
Walker->getNumOperands() >= 1 &&
|
|
Walker->getOperand(0).isImm()) {
|
|
int64_t off = Walker->getOperand(0).getImm();
|
|
int64_t effOff = isBumpedSR ? off - 1 : off;
|
|
if (isLdaSR(*Walker)) ReadSlots.insert(effOff);
|
|
else WriteSlots.insert(effOff);
|
|
}
|
|
Block.push_back(&*Walker);
|
|
++Walker;
|
|
}
|
|
if (!ok || Walker == MBB.end()) { ++It; continue; }
|
|
auto Plp = Walker;
|
|
// Trailing flag-preservers after PLP (STA/STZ only). These
|
|
// already live OUTSIDE the wrap so their slot operand is the
|
|
// effective (unbumped) value — no -1 adjustment.
|
|
auto Tail = std::next(Plp);
|
|
SmallVector<MachineInstr *, 4> Trailing;
|
|
while (Tail != MBB.end()) {
|
|
if (Tail->isDebugInstr()) { ++Tail; continue; }
|
|
if (!isStaLike(*Tail)) break;
|
|
if (Tail->getNumOperands() >= 1 && Tail->getOperand(0).isImm()) {
|
|
WriteSlots.insert(Tail->getOperand(0).getImm());
|
|
}
|
|
Trailing.push_back(&*Tail);
|
|
++Tail;
|
|
}
|
|
// Pair check: the wrap structure is a sequence of LDA-STA
|
|
// memory-to-memory PHI copies, where the FINAL STA may live
|
|
// outside the wrap (as Trailing) because STA doesn't clobber
|
|
// flags. Count LDAs in Block vs total STAs (Block + Trailing).
|
|
// If they're not equal, some LDA's $a-output is a register-
|
|
// live-out PHI value (consumed by a back-edge successor's
|
|
// first STA, e.g. the vararg `sta 0x5, s` pattern). Hoisting
|
|
// it earlier would lose the value.
|
|
unsigned NLda = 0, NSta = 0;
|
|
for (MachineInstr *MI : Block) {
|
|
if (isLdaCount(*MI)) ++NLda;
|
|
else if (isStaLike(*MI)) ++NSta;
|
|
}
|
|
NSta += Trailing.size();
|
|
if (NLda != NSta) { ++It; continue; }
|
|
// Even with paired LDA-STA, the LAST LDA's $a value can still
|
|
// be consumed downstream — by a successor's first STA — making
|
|
// it a fall-through register-PHI. If $a is live-out at MBB
|
|
// end (any successor has $a as live-in), bail. Caught by
|
|
// sumTable, where `lda #0` (wrap) feeds A into bb.2's `sta 0x1,
|
|
// s`, with `sta 0x9, s` (trailing) just happening to also store
|
|
// the same A — the pair count balances but A is still live-out.
|
|
bool aLiveOut = false;
|
|
for (MachineBasicBlock *Succ : MBB.successors()) {
|
|
if (Succ->isLiveIn(W65816::A)) { aLiveOut = true; break; }
|
|
}
|
|
if (aLiveOut) { ++It; continue; }
|
|
// Walk backward from PHP to find the hoist insertion point.
|
|
// The hoisted block clobbers $a and $p (LDA writes both).
|
|
// Skip insts that USE $a (consumer of an earlier $a producer)
|
|
// or that DEFINE $p (flag-setter — its $p output will be
|
|
// re-established by the same flag-setter). Stop at a pure A
|
|
// producer (defines $a, doesn't use $a).
|
|
//
|
|
// Also bail if any in-gap inst writes a slot we read or reads
|
|
// a slot we write (in-gap reads of our writes would observe
|
|
// a stale value after hoist; in-gap writes to our reads would
|
|
// produce a different value if hoisted before).
|
|
auto isStackRelIndYRead = [](unsigned O) {
|
|
switch (O) {
|
|
case W65816::LDA_StackRelIndY:
|
|
case W65816::ADC_StackRelIndY:
|
|
case W65816::SBC_StackRelIndY:
|
|
case W65816::CMP_StackRelIndY:
|
|
case W65816::AND_StackRelIndY:
|
|
case W65816::ORA_StackRelIndY:
|
|
case W65816::EOR_StackRelIndY:
|
|
case W65816::STA_StackRelIndY:
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
auto Back = Php;
|
|
if (Back == MBB.begin()) { ++It; continue; }
|
|
--Back;
|
|
bool gapOK = true;
|
|
while (true) {
|
|
while (Back != MBB.begin() && Back->isDebugInstr()) --Back;
|
|
if (Back->isDebugInstr()) { gapOK = false; break; }
|
|
// Slot conflict check.
|
|
unsigned BO = Back->getOpcode();
|
|
if ((BO == W65816::STA_StackRel || BO == W65816::STZ_DP ||
|
|
BO == W65816::STZ_Abs) &&
|
|
Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
|
|
int64_t off = Back->getOperand(0).getImm();
|
|
if (ReadSlots.count(off)) { gapOK = false; break; }
|
|
}
|
|
if (BO == W65816::LDA_StackRel &&
|
|
Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
|
|
int64_t off = Back->getOperand(0).getImm();
|
|
if (WriteSlots.count(off)) { gapOK = false; break; }
|
|
}
|
|
// *_StackRelIndY ops use their slot operand AS A POINTER for
|
|
// the `(d,S),Y` deref. Hoisting a STA WriteSlot above an
|
|
// IndY use of that slot changes which value the IndY reads
|
|
// through. Forbid the hoist in that case. Caught by Layer 2
|
|
// ptr32 sumByteToZero loop: PHP-wrapped `LDA stack.3, 1; STA
|
|
// stack.4` was being hoisted across `LDA_StackRelIndY stack.4`,
|
|
// making the deref use stack.3's NEW value instead of the
|
|
// LAGGED stack.4 value — off-by-one summing the byte stream.
|
|
if (isStackRelIndYRead(BO) &&
|
|
Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) {
|
|
int64_t off = Back->getOperand(0).getImm();
|
|
if (WriteSlots.count(off)) { gapOK = false; break; }
|
|
}
|
|
// Bail on call / branch / asm.
|
|
if (Back->isCall() || Back->isBranch() ||
|
|
Back->isReturn() || Back->isInlineAsm()) {
|
|
gapOK = false; break;
|
|
}
|
|
bool usesA = false;
|
|
bool defsA = false;
|
|
for (const MachineOperand &MO : Back->operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::A) {
|
|
if (MO.isUse()) usesA = true;
|
|
if (MO.isDef()) defsA = true;
|
|
}
|
|
}
|
|
if (defsA && !usesA) break; // Pure A producer found.
|
|
if (Back == MBB.begin()) { gapOK = false; break; }
|
|
--Back;
|
|
}
|
|
if (!gapOK) { ++It; continue; }
|
|
// Hoist: move Block and Trailing to before Back. Undo the
|
|
// +1 stack-rel bump on Block's in-wrap memory ops; Trailing
|
|
// stays AS-IS (it was already outside the wrap and never
|
|
// bumped).
|
|
for (MachineInstr *MI : Block) {
|
|
// Everything in Block passed isFlagPreservingMem, i.e. it is one of
// LDA_StackRel / STA_StackRel / STZ_DP / STZ_Abs / LDA_Imm16 /
// LDAi16imm. Only LDA_StackRel and STA_StackRel use operand 0 as a
// stack-rel disp, and that disp is the bumped one. The STZ forms and
// the immediate loads aren't stack-rel — no bump to undo.
|
|
unsigned MOpc = MI->getOpcode();
|
|
if (MOpc == W65816::LDA_StackRel || MOpc == W65816::STA_StackRel) {
|
|
if (MI->getNumOperands() >= 1 && MI->getOperand(0).isImm()) {
|
|
int64_t v = MI->getOperand(0).getImm();
|
|
MI->getOperand(0).setImm(v - 1);
|
|
}
|
|
}
|
|
MI->removeFromParent();
|
|
MBB.insert(Back, MI);
|
|
}
|
|
for (MachineInstr *MI : Trailing) {
|
|
MI->removeFromParent();
|
|
MBB.insert(Back, MI);
|
|
}
|
|
Php->eraseFromParent();
|
|
Plp->eraseFromParent();
|
|
Changed = true;
|
|
// Restart iteration from the beginning since we mutated.
|
|
It = MBB.begin();
|
|
}
|
|
}
|
|
|
|
// Lagged-ptr PHI-copy sink. In strLen / strcpy / sumByteToZero
|
|
// loop bodies, the deref reads slot B (the "lagged" PHI value)
|
|
// while slot A holds the just-incremented iter. At end of body,
|
|
// a PHP/PLP-wrapped `LDA slot A ; STA slot B` propagates the new
|
|
// iter to slot B for next iter. The wrap costs 8 cyc/iter (PHP +
|
|
// PLP) plus 8 cyc for the LDA/STA pair.
|
|
//
|
|
// Equivalent rewrite: at the start of the body, BEFORE the
|
|
// iter++, A already holds slot A's OLD value (loaded for the
|
|
// INA). Insert `STA slot B` THERE — it copies OLD iter to slot
|
|
// B, matching the lagged semantic. Slot B is no longer touched
|
|
// at end of body, so the PHP/PLP wrap (+ its LDA/PLP/STA tail)
|
|
// can be erased. Net: -11 cyc/iter on strLen (44 chars → -484
|
|
// cyc / -20%).
|
|
//
|
|
// Pattern at end of MBB (immediately before terminator):
|
|
// ANDi #imm ; flag-setter
|
|
// PHP
|
|
// LDA_StackRel SrcOff ; reload iter NEW (SrcOff is
|
|
// PHP-bumped: actually =
|
|
// IterSlotOff + 1)
|
|
// PLP
|
|
// STA_StackRel DstOff ; slot B = iter NEW
|
|
// Bxx ... ; conditional branch
|
|
//
|
|
// Earlier in MBB:
|
|
// LDA_StackRel IterSlotOff ; A = OLD iter
|
|
// INA_PSEUDO (or ADCi16imm 1) ; iter++
|
|
// STA_StackRel IterSlotOff ; iter = NEW
|
|
//
|
|
// Rewrite: insert `STA_StackRel DstOff` right after the LDA
|
|
// (between LDA and INA). Erase the PHP/LDA/PLP/STA + the
|
|
// ANDi-after-PHP wrap entirely. The ANDi at the front is kept
|
|
// since it's also the BNE's flag source.
|
|
{
|
|
auto isCondBranch = [](const MachineInstr &MI) {
|
|
unsigned O = MI.getOpcode();
|
|
return O == W65816::BNE || O == W65816::BEQ ||
|
|
O == W65816::BCC || O == W65816::BCS ||
|
|
O == W65816::BMI || O == W65816::BPL ||
|
|
O == W65816::BVC || O == W65816::BVS;
|
|
};
|
|
auto isFlagSetter = [](const MachineInstr &MI) {
|
|
unsigned O = MI.getOpcode();
|
|
return O == W65816::ANDi16imm || O == W65816::ANDi8imm ||
|
|
O == W65816::ORAi16imm || O == W65816::EORi16imm;
|
|
};
|
|
// Find Bxx terminator.
|
|
MachineInstr *Bxx = nullptr;
|
|
for (auto It = MBB.rbegin(); It != MBB.rend(); ++It) {
|
|
if (isCondBranch(*It)) { Bxx = &*It; break; }
|
|
if (It->isBranch()) break; // BRA etc. — skip past it
|
|
}
|
|
if (!Bxx) goto skip_lagged_sink;
|
|
{
|
|
// Walk backward from Bxx to find STA, PLP, LDA, PHP.
|
|
auto It2 = MachineBasicBlock::iterator(Bxx);
|
|
if (It2 == MBB.begin()) goto skip_lagged_sink;
|
|
--It2; // first non-branch
|
|
if (It2->getOpcode() != W65816::STA_StackRel ||
|
|
!It2->getOperand(0).isImm()) goto skip_lagged_sink;
|
|
MachineInstr *FinalSta = &*It2;
|
|
int64_t DstOff = FinalSta->getOperand(0).getImm();
|
|
if (It2 == MBB.begin()) goto skip_lagged_sink;
|
|
--It2;
|
|
if (It2->getOpcode() != W65816::PLP) goto skip_lagged_sink;
|
|
MachineInstr *Plp2 = &*It2;
|
|
if (It2 == MBB.begin()) goto skip_lagged_sink;
|
|
--It2;
|
|
if (It2->getOpcode() != W65816::LDA_StackRel ||
|
|
!It2->getOperand(0).isImm()) goto skip_lagged_sink;
|
|
MachineInstr *InnerLda = &*It2;
|
|
int64_t SrcOff = InnerLda->getOperand(0).getImm();
|
|
if (It2 == MBB.begin()) goto skip_lagged_sink;
|
|
--It2;
|
|
if (It2->getOpcode() != W65816::PHP) goto skip_lagged_sink;
|
|
MachineInstr *Php2 = &*It2;
|
|
if (It2 == MBB.begin()) goto skip_lagged_sink;
|
|
--It2;
|
|
if (!isFlagSetter(*It2)) goto skip_lagged_sink;
|
|
// The PHP-bumped SrcOff is the IterSlotOff + 1.
|
|
int64_t IterSlotOff = SrcOff - 1;
|
|
// Now find the iter++ sequence earlier in MBB: LDA IterSlotOff;
|
|
// INA_PSEUDO; STA IterSlotOff.
|
|
MachineInstr *IterLda = nullptr;
|
|
MachineInstr *IterIna = nullptr;
|
|
MachineInstr *IterSta = nullptr;
|
|
for (auto Walk = MBB.begin(); Walk != MachineBasicBlock::iterator(Php2); ++Walk) {
|
|
if (Walk->getOpcode() != W65816::LDA_StackRel) continue;
|
|
if (!Walk->getOperand(0).isImm() ||
|
|
Walk->getOperand(0).getImm() != IterSlotOff) continue;
|
|
auto N1 = std::next(Walk);
|
|
while (N1 != MBB.end() && N1->isDebugInstr()) ++N1;
|
|
if (N1 == MBB.end()) continue;
|
|
if (N1->getOpcode() != W65816::INA_PSEUDO &&
|
|
N1->getOpcode() != W65816::ADCi16imm) continue;
|
|
auto N2 = std::next(N1);
|
|
while (N2 != MBB.end() && N2->isDebugInstr()) ++N2;
|
|
if (N2 == MBB.end()) continue;
|
|
if (N2->getOpcode() != W65816::STA_StackRel) continue;
|
|
if (!N2->getOperand(0).isImm() ||
|
|
N2->getOperand(0).getImm() != IterSlotOff) continue;
|
|
IterLda = &*Walk;
|
|
IterIna = &*N1;
|
|
IterSta = &*N2;
|
|
break;
|
|
}
|
|
if (!IterLda) goto skip_lagged_sink;
|
|
// Safety: make sure DstOff isn't written between IterLda and
|
|
// the IndY use of DstOff. Walk forward from IterLda looking
|
|
// for STA DstOff (other than our FinalSta) — if found, bail.
|
|
for (auto Walk = std::next(MachineBasicBlock::iterator(IterSta));
|
|
Walk != MachineBasicBlock::iterator(Php2); ++Walk) {
|
|
if (Walk->getOpcode() == W65816::STA_StackRel &&
|
|
Walk->getOperand(0).isImm() &&
|
|
Walk->getOperand(0).getImm() == DstOff) {
|
|
goto skip_lagged_sink;
|
|
}
|
|
}
|
|
// Apply: insert STA_StackRel DstOff right after IterLda,
|
|
// BEFORE INA.
|
|
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
|
|
DebugLoc DL = IterLda->getDebugLoc();
|
|
BuildMI(MBB, std::next(MachineBasicBlock::iterator(IterLda)),
|
|
DL, TII->get(W65816::STA_StackRel))
|
|
.addImm(DstOff)
|
|
.addReg(W65816::A, RegState::Implicit);
|
|
// Erase PHP, InnerLda, PLP, FinalSta.
|
|
Php2->eraseFromParent();
|
|
InnerLda->eraseFromParent();
|
|
Plp2->eraseFromParent();
|
|
FinalSta->eraseFromParent();
|
|
Changed = true;
|
|
}
|
|
skip_lagged_sink:;
|
|
}
|
|
|
|
// i32 += i32 store-bypass. Regalloc materializes the call result
|
|
// (A=lo, X=hi) into Wide32 spill slots before the add, then reads
|
|
// them back — emitting 4 instructions of redundant store/reload:
|
|
//
|
|
// STA_StackRel slotA ; A (mul.lo) -> slotA
|
|
// TXA ; A = X = mul.hi
|
|
// STA_StackRel slotB ; mul.hi -> slotB
|
|
// LDA_StackRel slotA ; reload mul.lo <-- redundant
|
|
// CLC
|
|
// ADC_StackRel slotC ; mul.lo + total.lo
|
|
// STA_StackRel slotA ; sum-lo
|
|
// LDA_StackRel slotB ; reload mul.hi <-- redundant
|
|
// ADC_StackRel slotD ; mul.hi + total.hi + C
|
|
// STA_StackRel slotB ; sum-hi
|
|
//
|
|
// Reorder to do the lo-add directly off A and the hi-add directly
|
|
// off X (via TXA preserving carry):
|
|
//
|
|
// CLC
|
|
// ADC_StackRel slotC ; A = mul.lo + total.lo
|
|
// STA_StackRel slotA ; sum-lo
|
|
// TXA ; A = X = mul.hi (C preserved)
|
|
// ADC_StackRel slotD ; A = mul.hi + total.hi + C
|
|
// STA_StackRel slotB ; sum-hi
|
|
//
|
|
// 10 -> 6 inst. Saves 4 inst / ~13 cyc per i32-add-of-call-result
|
|
// site. Hits the sumOfSquares loop and any total += __umulhisi3
|
|
// pattern.
|
|
{
  // i32 += i32 store-bypass.
  //
  // Matches, ignoring interleaved debug instructions, the sequence
  //   STA slotA ; TXA ; STA slotB ; LDA slotA ; CLC ; ADC slotC ;
  //   STA slotA ; LDA slotB ; ADC slotD ; STA slotB
  // and replaces it with
  //   CLC ; ADC slotC ; STA slotA ; TXA ; ADC slotD ; STA slotB
  // Reads the enclosing MBB and TII; sets Changed when a rewrite happens.

  // Shared matcher: true when MI has opcode `Opc` and an immediate first
  // operand. The immediate is reported through `off` when `off` is non-null.
  auto matchStackRel = [](MachineInstr &MI, unsigned Opc, int64_t *off) {
    if (MI.getOpcode() != Opc) return false;
    if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
    if (off) *off = MI.getOperand(0).getImm();
    return true;
  };
  auto isStaSR = [&](MachineInstr &MI, int64_t *off) {
    return matchStackRel(MI, W65816::STA_StackRel, off);
  };
  auto isLdaSR = [&](MachineInstr &MI, int64_t *off) {
    return matchStackRel(MI, W65816::LDA_StackRel, off);
  };
  auto isAdcSR = [&](MachineInstr &MI, int64_t *off) {
    return matchStackRel(MI, W65816::ADC_StackRel, off);
  };

  // First non-debug instruction strictly after I, or MBB.end() if none.
  // I must be dereferenceable (callers check against MBB.end() first).
  auto nextReal = [&](MachineBasicBlock::iterator I) {
    ++I;
    while (I != MBB.end() && I->isDebugInstr()) ++I;
    return I;
  };

  auto It = MBB.begin();
  while (It != MBB.end()) {
    auto Cur = It;
    int64_t slotA = 0, slotB = 0, slotC = 0, slotD = 0;

    // Step 1: STA_StackRel slotA (spill of the lo half held in A).
    if (!isStaSR(*Cur, &slotA)) { ++It; continue; }

    // Step 2: TXA (hi half moves from X to A).
    auto P2 = nextReal(Cur);
    if (P2 == MBB.end() || P2->getOpcode() != W65816::TXA) { ++It; continue; }

    // Step 3: STA_StackRel slotB (spill of the hi half), slotB != slotA.
    auto P3 = nextReal(P2);
    if (P3 == MBB.end() || !isStaSR(*P3, &slotB)) { ++It; continue; }
    if (slotA == slotB) { ++It; continue; }

    // Step 4: LDA_StackRel slotA (the redundant lo reload).
    auto P4 = nextReal(P3);
    int64_t lreloadA = 0;
    if (P4 == MBB.end() || !isLdaSR(*P4, &lreloadA) || lreloadA != slotA) {
      ++It; continue;
    }

    // Step 5: CLC.
    auto P5 = nextReal(P4);
    if (P5 == MBB.end() || P5->getOpcode() != W65816::CLC) {
      ++It; continue;
    }

    // Step 6: ADC_StackRel slotC.
    auto P6 = nextReal(P5);
    if (P6 == MBB.end() || !isAdcSR(*P6, &slotC)) { ++It; continue; }

    // Step 7: STA_StackRel slotA (sum-lo written back).
    auto P7 = nextReal(P6);
    int64_t outA = 0;
    if (P7 == MBB.end() || !isStaSR(*P7, &outA) || outA != slotA) {
      ++It; continue;
    }

    // Step 8: LDA_StackRel slotB (the redundant hi reload).
    auto P8 = nextReal(P7);
    int64_t lreloadB = 0;
    if (P8 == MBB.end() || !isLdaSR(*P8, &lreloadB) || lreloadB != slotB) {
      ++It; continue;
    }

    // Step 9: ADC_StackRel slotD.
    auto P9 = nextReal(P8);
    if (P9 == MBB.end() || !isAdcSR(*P9, &slotD)) { ++It; continue; }

    // Step 10: STA_StackRel slotB (sum-hi written back).
    auto P10 = nextReal(P9);
    int64_t outB = 0;
    if (P10 == MBB.end() || !isStaSR(*P10, &outB) || outB != slotB) {
      ++It; continue;
    }

    // All 10 matched and slotA != slotB. The rewrite removes the initial
    // spills to slotA and slotB, so an ADC operand that aliases one of those
    // spills would read stale memory instead of the just-spilled value:
    //   - slotC == slotA: original adds the spilled lo half to itself.
    //   - slotC == slotB: original adds the spilled hi half; this case was
    //     previously unchecked and would have been miscompiled.
    //   - slotD == slotB: original adds the spilled hi half to itself.
    // slotC == slotD is rejected as before (not the canonical pattern).
    // NOTE(review): only exact offset equality is tested; if slots can
    // partially overlap (offsets differing by one with 16-bit accesses), a
    // range-overlap test would be needed. Confirm against frame layout.
    if (slotC == slotA || slotC == slotB || slotD == slotB ||
        slotC == slotD) {
      ++It; continue;
    }

    // Emit the six-instruction replacement in front of the first matched
    // instruction. The hi add relies on the TXA between the two ADCs leaving
    // the carry from the lo add intact.
    DebugLoc DL = Cur->getDebugLoc();
    BuildMI(MBB, Cur, DL, TII.get(W65816::CLC));
    BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel))
        .addImm(slotC);
    BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel))
        .addImm(slotA);
    BuildMI(MBB, Cur, DL, TII.get(W65816::TXA));
    BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel))
        .addImm(slotD);
    BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel))
        .addImm(slotB);

    // Step the scan cursor past the match before erasing, so the loop never
    // dereferences a deleted instruction. Debug instructions that were
    // interleaved with the match are left in place.
    It = std::next(P10);
    MachineBasicBlock::iterator Matched[] = {Cur, P2, P3, P4, P5,
                                             P6,  P7, P8, P9, P10};
    for (MachineBasicBlock::iterator &Old : Matched)
      Old->eraseFromParent();
    Changed = true;
  }
}
|
|
|
|
// Dead TAX / TXA elimination. STAfi declares `Defs = [A]` as a
|
|
// safe over-approximation (eliminateFrameIndex emits a PHA-bracketed
|
|
// sequence when the source is IMG-class). Regalloc honors that by
|
|
// inserting `TAX ; ...STAfi... ; TXA` brackets around STAfi that
|
|
// SOURCES from A — but in the A-source path A is preserved. The
|
|
// TXA's output gets clobbered immediately by the next LDA*, so the
|
|
// TXA is dead; once TXA is gone, the TAX's X-value has no consumer
|
|
// and is dead too. This pattern recurs once per i32-spill site.
|
|
//
|
|
// Conservative: only elide TXA if the IMMEDIATE next non-debug
|
|
// instruction defines $a (and doesn't read $a or N/Z first). No
|
|
// intervening flag-readers between TXA and the A-define is then
|
|
// guaranteed. Same logic for TYA.
|
|
//
|
|
// For TAX: elide if no instruction between TAX and the next $x def
|
|
// reads $x (and we can prove the original X had no live consumer).
|
|
// Done as a fixed-point: keep iterating until no change.
|
|
// True when MI has at least one register operand (explicit or implicit) that
// is a def of Reg. Only exact register matches count; non-register operands
// are ignored.
auto definesReg = [](const MachineInstr &MI, unsigned Reg) -> bool {
  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx != End; ++Idx) {
    const MachineOperand &Opnd = MI.getOperand(Idx);
    if (!Opnd.isReg())
      continue;
    if (Opnd.getReg() == Reg && Opnd.isDef())
      return true;
  }
  return false;
};
|
|
// True when MI has at least one register operand (explicit or implicit) that
// is a use of Reg. Only exact register matches count; non-register operands
// are ignored.
auto readsReg = [](const MachineInstr &MI, unsigned Reg) -> bool {
  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx != End; ++Idx) {
    const MachineOperand &Opnd = MI.getOperand(Idx);
    if (!Opnd.isReg())
      continue;
    if (Opnd.getReg() == Reg && Opnd.isUse())
      return true;
  }
  return false;
};
|
|
bool again2 = true;
|
|
while (again2) {
|
|
again2 = false;
|
|
// Pass A: dead TXA / TYA
|
|
for (auto It = MBB.begin(); It != MBB.end(); ) {
|
|
unsigned O = It->getOpcode();
|
|
if (O != W65816::TXA && O != W65816::TYA) { ++It; continue; }
|
|
auto Next = std::next(It);
|
|
while (Next != MBB.end() && Next->isDebugInstr()) ++Next;
|
|
if (Next == MBB.end()) { ++It; continue; }
|
|
// Next must define $a unconditionally, and must not read $a
|
|
// (since we're about to discard the TXA-defined A) and must
|
|
// not be a call / branch / inline asm (which conservatively
|
|
// read $a).
|
|
if (Next->isCall() || Next->isBranch() ||
|
|
Next->isReturn() || Next->isInlineAsm()) {
|
|
++It; continue;
|
|
}
|
|
if (!definesReg(*Next, W65816::A)) { ++It; continue; }
|
|
if (readsReg(*Next, W65816::A)) { ++It; continue; }
|
|
// P (flags) liveness: TXA/TYA set N/Z. If Next reads P, we'd
|
|
// be discarding the flags it expects. Bxx and friends read P.
|
|
// Conservative: also require Next does not read $p.
|
|
if (readsReg(*Next, W65816::P)) { ++It; continue; }
|
|
auto Dead = It++;
|
|
Dead->eraseFromParent();
|
|
Changed = true;
|
|
again2 = true;
|
|
}
|
|
// Pass B: dead TAX / TAY
|
|
for (auto It = MBB.begin(); It != MBB.end(); ) {
|
|
unsigned O = It->getOpcode();
|
|
unsigned Target;
|
|
if (O == W65816::TAX) Target = W65816::X;
|
|
else if (O == W65816::TAY) Target = W65816::Y;
|
|
else { ++It; continue; }
|
|
// Walk forward. TAX/TAY is dead if every use of Target is
|
|
// preceded by a redefinition of Target (and the in-MBB region
|
|
// between has no flag-reader that consumes TAX's N/Z). At MBB
|
|
// end, check successor live-ins: if none has Target as live-in
|
|
// it's also dead.
|
|
//
|
|
// Flag liveness: TAX defines $p (N/Z). A later $p-reader only
|
|
// consumes TAX's flags if no intervening instruction REDEFINES
|
|
// $p in the gap. Track `pRedef` to allow common patterns like
|
|
// `TAX ; CLC ; ADC ; ...` where ADC reads $p but the $p it
|
|
// reads is the freshly-CLC'd carry, not TAX's N/Z.
|
|
auto Walker = std::next(It);
|
|
bool deadIt = false;
|
|
bool bailed = false;
|
|
bool pRedef = false;
|
|
while (Walker != MBB.end()) {
|
|
if (Walker->isDebugInstr()) { ++Walker; continue; }
|
|
if (Walker->isCall() || Walker->isInlineAsm()) {
|
|
bailed = true; break;
|
|
}
|
|
// Branch / return: stop walking; rely on successor live-ins.
|
|
if (Walker->isBranch() || Walker->isReturn()) break;
|
|
if (readsReg(*Walker, Target)) { bailed = true; break; }
|
|
if (readsReg(*Walker, W65816::P) && !pRedef) {
|
|
bailed = true; break;
|
|
}
|
|
if (definesReg(*Walker, W65816::P)) pRedef = true;
|
|
if (definesReg(*Walker, Target)) { deadIt = true; break; }
|
|
++Walker;
|
|
}
|
|
if (bailed) { ++It; continue; }
|
|
if (!deadIt) {
|
|
// Fell through to MBB end / branch. Check successor live-ins.
|
|
bool liveOut = false;
|
|
for (MachineBasicBlock *Succ : MBB.successors()) {
|
|
if (Succ->isLiveIn(Target)) { liveOut = true; break; }
|
|
}
|
|
// Return blocks: $a and $x are the i32 return-value convention.
|
|
// RTL doesn't model these as Uses, but they ARE live at the
|
|
// return. Be conservative — don't elide TAX/TAY before a return.
|
|
if (!MBB.empty() && MBB.back().isReturn()) liveOut = true;
|
|
if (liveOut) { ++It; continue; }
|
|
}
|
|
auto Dead = It++;
|
|
Dead->eraseFromParent();
|
|
Changed = true;
|
|
again2 = true;
|
|
}
|
|
}
|
|
|
|
// Third peephole: drop `LDY_Imm16 K` when Y already holds K from
|
|
// an earlier LDY in the same MBB and no intervening MI clobbered
|
|
// Y. Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY,
|
|
// even though Y already holds 0 from a previous emit — the
|
|
// redundant LDYs survive MachineLICM because Y is a phys reg and
|
|
// the inserter binds them tightly to each use.
|
|
int yKnown = -1; // -1 means unknown; otherwise the immediate
|
|
auto It2 = MBB.begin();
|
|
while (It2 != MBB.end()) {
|
|
MachineInstr &MI = *It2;
|
|
if (MI.isDebugInstr()) { ++It2; continue; }
|
|
unsigned Op = MI.getOpcode();
|
|
if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
|
|
MI.getOperand(0).isImm()) {
|
|
int K = MI.getOperand(0).getImm() & 0xFFFF;
|
|
if (yKnown == K) {
|
|
// Before erasing this redundant LDY: the prior LDY is still in
|
|
// scope, so all of its Y-uses between the two LDYs are still
|
|
// valid uses. But liveness already marked the LAST one (just
|
|
// before the redundant LDY) as `implicit killed $y`, because
|
|
// that LDY was about to redefine Y. After erasure, Y survives
|
|
// through to the NEXT use, so the prior "kill" annotation is
|
|
// wrong and the machine verifier rejects. Walk backward and
|
|
// clear the kill flag on the most recent Y-using operand.
|
|
for (auto Back = std::prev(It2);; --Back) {
|
|
bool clearedAny = false;
|
|
for (MachineOperand &MO : Back->operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::Y &&
|
|
MO.isUse() && MO.isKill()) {
|
|
MO.setIsKill(false);
|
|
clearedAny = true;
|
|
}
|
|
}
|
|
if (clearedAny) break;
|
|
if (Back == MBB.begin()) break;
|
|
}
|
|
auto Erase = It2++;
|
|
Erase->eraseFromParent();
|
|
Changed = true;
|
|
continue;
|
|
}
|
|
yKnown = K;
|
|
} else {
|
|
// Conservatively invalidate yKnown on anything that touches Y
|
|
// or on calls / inline asm / any instruction that doesn't have
|
|
// a clean "no Y effect" guarantee. Cheaper to underclaim than
|
|
// miscompile.
|
|
switch (Op) {
|
|
case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown
|
|
case W65816::STAfi_indY:
|
|
case W65816::LDA_StackRelIndY:
|
|
case W65816::STA_StackRelIndY:
|
|
break;
|
|
case W65816::TAY: case W65816::TXY:
|
|
case W65816::INY: case W65816::DEY:
|
|
case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
|
|
case W65816::LDY_DPX: case W65816::LDY_AbsX:
|
|
yKnown = -1; break;
|
|
default:
|
|
if (MI.isCall()) yKnown = -1;
|
|
break;
|
|
}
|
|
}
|
|
++It2;
|
|
}
|
|
}
|
|
|
|
// Three prototype peepholes were tried here and removed once shown
|
|
// to regress benchmarks; design notes in
|
|
// feedback_close_gap_attempts_round2.md / feedback_cmp_zero_elim.md:
|
|
// - PHI store-forwarding (CRC32 regression / memmove safety hole).
|
|
// - Redundant CMP #0 elimination (VLA sum_n carry-flag bookkeeping).
|
|
// - Narrow PHI-copy slot collapse (qsort regression).
|
|
|
|
return Changed;
|
|
}
|