733 lines
28 KiB
C++
733 lines
28 KiB
C++
//===-- W65816StackSlotMerge.cpp - Merge value-equivalent stack slots ----===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===---------------------------------------------------------------------===//
|
|
//
|
|
// Pre-emit pass that runs after PEI (eliminateFrameIndex) and merges
|
|
// pairs of stack-rel slots that hold the same value at every observable
|
|
// program point — typically the PHI src/dst pair PHI-elim leaves at
|
|
// the back-edge of a loop body.
|
|
//
|
|
// LLVM's StackSlotColoring merges slots with non-overlapping liveness.
|
|
// It can't merge slots that are simultaneously live but happen to hold
|
|
// the same value (which is what a PHI memory-copy creates). This pass
|
|
// catches that case via a stricter "value equivalence" check.
|
|
//
|
|
// Canonical pattern (sumSquares loop body):
|
|
//
|
|
// .LBB0_4:
|
|
// LDA 0x7, s ; PHA ; JSL __umulhisi3 ; PLY
|
|
// CLC ; ADC 0x3, s ; STA 0xb, s ; new total.lo (write X)
|
|
// TXA ; ADC 0x1, s ; STA 0x9, s
|
|
// LDA 0x7, s ; INC A ; STA 0x7, s
|
|
// LDA 0xb, s ; STA 0x3, s ; PHI copy: load X, store Y
|
|
// LDA 0x9, s ; STA 0x1, s
|
|
// ...
|
|
//
|
|
// The pair (0xb, 0x3) is the lo-half PHI memory copy. Slots 0xb and
|
|
// 0x3 always hold the same value at every read site:
|
|
// - Function entry: both initialized to 0 (`lda #0; sta 0xb, s` in
|
|
// entry, `lda #0; sta 0x3, s` in preheader).
|
|
// - Loop iteration: the PHI copy moves the new total.lo from 0xb to
|
|
// 0x3 at the end of every iteration.
|
|
// - Exit: only 0xb is read (return value), but its value equals 0x3's.
|
|
//
|
|
// Rename 0xb → 0x3 function-wide; the now self-copy `lda 0x3; sta 0x3`
|
|
// is dead and we erase it. Saves 2 inst per PHI copy occurrence (the
|
|
// memory copy round-trip). sumSquares loop body shrinks from 21 to
|
|
// 17 inst per iter.
|
|
//
|
|
// Safety check (sufficient condition for value equivalence):
|
|
// 1. Both slots have ≥1 STA in the function (skips arg slots passed
|
|
// by the caller — those have only LDA reads, no STAs, and renaming
|
|
// would change where we read the arg from).
|
|
// 2. For every STA X in the function, find a "twin" STA Y at a
|
|
// program point where the values match. Matching = either:
|
|
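//      (a) Same MBB: either no A-define sits between the two stores
//          (both store the same A value), or the later store is fed by
//          a reload `LDA <earlier slot>` and the earlier slot is not
//          re-written before that reload.  The reload form covers the
//          loop-body iter-end pattern: STA X then later LDA X ; STA Y.
//          The first form covers entry's `lda #N ; sta X` if the same
//          MBB also stores that A value with `sta Y`.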
|
|
// (b) Different MBBs, both preceded by `LDA #const` of the same
|
|
// constant. Covers entry-block STA X=0 paired with
|
|
// preheader STA Y=0.
|
|
// 3. Symmetric: for every STA Y, find a twin STA X.
|
|
// 4. No "orphan" STAs. If a STA X or STA Y has no twin, bail.
|
|
//
|
|
// When all checks pass, the rename function-wide preserves semantics:
|
|
// every read of slot X at program point P sees the same value that
|
|
// slot Y holds at P (and vice versa).
|
|
//
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include <iterator>
#include <optional>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "w65816-stack-slot-merge"
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
class W65816StackSlotMerge : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
W65816StackSlotMerge() : MachineFunctionPass(ID) {}
|
|
StringRef getPassName() const override {
|
|
return "W65816 merge value-equivalent stack slots (PHI-copy collapse)";
|
|
}
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<MachineDominatorTreeWrapperPass>();
|
|
AU.setPreservesCFG();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
};
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
char W65816StackSlotMerge::ID = 0;
|
|
|
|
INITIALIZE_PASS_BEGIN(W65816StackSlotMerge, DEBUG_TYPE,
|
|
"W65816 stack slot merge", false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
|
|
INITIALIZE_PASS_END(W65816StackSlotMerge, DEBUG_TYPE,
|
|
"W65816 stack slot merge", false, false)
|
|
|
|
|
|
FunctionPass *llvm::createW65816StackSlotMerge() {
|
|
return new W65816StackSlotMerge();
|
|
}
|
|
|
|
|
|
// Stack-relative MC opcodes — the ops that survive eliminateFrameIndex
|
|
// and reference a slot via an 8-bit SP-relative offset.
|
|
static bool isStackRelOp(unsigned Op) {
|
|
return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel ||
|
|
Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel ||
|
|
Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel ||
|
|
Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel;
|
|
}
|
|
|
|
|
|
// Returns true if MI is a stack-rel op; out-param Off receives the slot
|
|
// offset (operand 0).
|
|
static bool srAccess(const MachineInstr &MI, int64_t &Off) {
|
|
if (!isStackRelOp(MI.getOpcode())) return false;
|
|
if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
|
|
Off = MI.getOperand(0).getImm();
|
|
return true;
|
|
}
|
|
|
|
|
|
// True if the MI semantically defines A. Covers both the explicit
|
|
// case (operand has reg=A,isDef) AND the implicit case where the
|
|
// tablegen InstDP / InstAbs / etc. base classes omit the A-Def
|
|
// annotation despite LDA semantically writing A (a backend modelling
|
|
// gap — many `LDA_DP`, `LDA_Abs`, `LDA_LongX`, etc. are missing the
|
|
// implicit-def in the MIR even though they load into A). Opcode-
|
|
// based fallback catches all of them.
|
|
static bool semanticallyDefsA(const MachineInstr &MI) {
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::A && MO.isDef())
|
|
return true;
|
|
}
|
|
unsigned Op = MI.getOpcode();
|
|
switch (Op) {
|
|
case W65816::LDA_DP: case W65816::LDA_DPX:
|
|
case W65816::LDA_DPInd: case W65816::LDA_DPIndY:
|
|
case W65816::LDA_DPIndX:
|
|
case W65816::LDA_Abs: case W65816::LDA_AbsX:
|
|
case W65816::LDA_AbsY: case W65816::LDA_Long:
|
|
case W65816::LDA_LongX:
|
|
case W65816::PLA:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
|
|
// Walk backward from MI in its MBB looking for the most recent A-define.
|
|
// Returns the MI that defines A, or nullptr if none in the same MBB.
|
|
// Skips debug instructions. Stops at MBB boundary, calls, branches,
|
|
// inline asm.
|
|
static MachineInstr *findPriorADef(MachineInstr *MI) {
|
|
MachineBasicBlock *MBB = MI->getParent();
|
|
auto It = MI->getIterator();
|
|
while (It != MBB->begin()) {
|
|
--It;
|
|
if (It->isDebugInstr()) continue;
|
|
if (It->isCall() || It->isInlineAsm()) return nullptr;
|
|
if (semanticallyDefsA(*It)) return &*It;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
|
|
// Walk forward from `Start` (exclusive) up to (but not including) `End`
|
|
// in the same MBB, tracking whether slot `WatchSlot` is written.
|
|
// Returns true if slot `WatchSlot` is NOT written in the interval.
|
|
static bool slotNotWrittenBetween(MachineBasicBlock::iterator Start,
|
|
MachineBasicBlock::iterator End,
|
|
int64_t WatchSlot) {
|
|
for (auto It = std::next(Start); It != End; ++It) {
|
|
if (It->isDebugInstr()) continue;
|
|
int64_t Off;
|
|
if (It->getOpcode() == W65816::STA_StackRel && srAccess(*It, Off) &&
|
|
Off == WatchSlot) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
// Returns true if MI clobbers P (N/Z/C/V flags). Mirrors LLVM's
|
|
// operand-based check + an opcode whitelist for tablegen entries that
|
|
// omit `Defs = [P]` (InstImplied, InstStackRel, etc.).
|
|
static bool clobbersFlagsP(const MachineInstr &MI) {
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef())
|
|
return true;
|
|
}
|
|
if (MI.isCall() || MI.isInlineAsm()) return true;
|
|
unsigned Op = MI.getOpcode();
|
|
switch (Op) {
|
|
case W65816::PLA: case W65816::PLY: case W65816::PLX:
|
|
case W65816::PLP:
|
|
case W65816::INA: case W65816::DEA:
|
|
case W65816::INX: case W65816::DEX:
|
|
case W65816::INY: case W65816::DEY:
|
|
case W65816::TAX: case W65816::TAY:
|
|
case W65816::TYA: case W65816::TXA:
|
|
case W65816::TYX: case W65816::TXY:
|
|
case W65816::LDA_StackRel: case W65816::LDA_DP:
|
|
case W65816::LDA_DPX: case W65816::LDA_DPInd:
|
|
case W65816::LDA_DPIndY: case W65816::LDA_DPIndX:
|
|
case W65816::LDA_Abs: case W65816::LDA_AbsX:
|
|
case W65816::LDA_AbsY: case W65816::LDA_Long:
|
|
case W65816::LDA_LongX:
|
|
case W65816::ADC_StackRel: case W65816::SBC_StackRel:
|
|
case W65816::CMP_StackRel: case W65816::AND_StackRel:
|
|
case W65816::ORA_StackRel: case W65816::EOR_StackRel:
|
|
case W65816::ADC_DP: case W65816::ADC_Abs:
|
|
case W65816::SBC_DP: case W65816::SBC_Abs:
|
|
case W65816::CMP_DP: case W65816::CMP_Abs:
|
|
case W65816::AND_DP: case W65816::AND_Abs:
|
|
case W65816::ORA_DP: case W65816::ORA_Abs:
|
|
case W65816::EOR_DP: case W65816::EOR_Abs:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
|
|
// Returns true if MI reads P flags (conditional branches, PLP, etc.).
|
|
static bool usesFlagsP(const MachineInstr &MI) {
|
|
if (MI.isConditionalBranch()) return true;
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::P && MO.isUse() &&
|
|
!MO.isDef())
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
// Returns the MOST RECENT A-defining MI strictly before MI in its MBB,
|
|
// skipping debug instructions. Returns nullptr if none in the same MBB.
|
|
static MachineInstr *findMostRecentADef(MachineInstr *MI) {
|
|
MachineBasicBlock *MBB = MI->getParent();
|
|
auto It = MI->getIterator();
|
|
while (It != MBB->begin()) {
|
|
--It;
|
|
if (It->isDebugInstr()) continue;
|
|
if (semanticallyDefsA(*It)) return &*It;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
|
|
// "Twin" check. Given a STA X at position StaX and a candidate slot Y,
|
|
// scan the function's STA Y instances and return one that's value-
|
|
// equivalent under the rules described in the header comment.
|
|
//
|
|
// Source-value equivalence cases:
|
|
// (1) Same-MBB twin store: no A-define between StaX and the candidate
|
|
// StaY → both store the same A value. Pure twin pattern.
|
|
// (2) Same-MBB PHI-copy: the candidate StaY is preceded by
|
|
// `LDA_StackRel slotX` (PHI-copy reload). Even if many A-defines
|
|
// sit between StaX and StaY, the LDA X re-establishes A =
|
|
// slot[X] = value StaX wrote (assuming slot X wasn't re-written
|
|
// in the gap).
|
|
// (3) Different MBBs, both preceded by LDA_Imm16 / LDAi16imm of the
|
|
// same constant. Covers entry/preheader init parallel pair.
|
|
static MachineInstr *findTwin(MachineInstr *StaX,
                              ArrayRef<MachineInstr *> StasY) {
  // StaX  : a stack-relative store to one slot of the candidate pair.
  // StasY : every stack-relative store to the other slot.
  // Returns a member of StasY that provably stores the same value as
  // StaX, or nullptr when no such twin exists.
  MachineBasicBlock *MBBStaX = StaX->getParent();

  // Constant loaded by an `LDA #imm` (LDA_Imm16 / LDAi16imm).  Yields
  // nullopt for any other opcode AND for an immediate-load whose operand
  // is not a plain immediate, so that two loads of non-immediate
  // operands are never mistaken for "both load 0".
  auto loadedConst = [](const MachineInstr &MI) -> std::optional<int64_t> {
    unsigned Op = MI.getOpcode();
    if (Op != W65816::LDA_Imm16 && Op != W65816::LDAi16imm)
      return std::nullopt;
    for (const MachineOperand &MO : MI.operands()) {
      if (MO.isImm())
        return MO.getImm();
    }
    return std::nullopt;
  };

  // Cases (1) + (2): twin store in the same MBB.
  for (MachineInstr *StaY : StasY) {
    if (StaY->getParent() != MBBStaX)
      continue;

    // Establish which of the two stores comes first in the block.
    MachineInstr *Earlier = nullptr;
    MachineInstr *Later = nullptr;
    for (MachineInstr &MI : *MBBStaX) {
      if (&MI == StaX) { Earlier = StaX; Later = StaY; break; }
      if (&MI == StaY) { Earlier = StaY; Later = StaX; break; }
    }
    if (!Earlier || !Later)
      continue;
    int64_t EOff = Earlier->getOperand(0).getImm();

    // Find the last A-define strictly between the two stores.  Looking
    // only inside the interval guarantees that anything passed to
    // slotNotWrittenBetween below lies after Earlier, so that scan can
    // never run off the end of the block.
    MachineInstr *LastADef = nullptr;
    for (auto It = std::next(Earlier->getIterator());
         It != Later->getIterator(); ++It) {
      if (It->isDebugInstr())
        continue;
      if (semanticallyDefsA(*It))
        LastADef = &*It;
    }

    // Case (1): A is untouched between the stores, so both store the
    // same A value.
    if (!LastADef)
      return StaY;

    // Case (2): the later store is fed by a reload of the earlier slot,
    // and that slot was not re-written before the reload.
    int64_t Off;
    if (LastADef->getOpcode() == W65816::LDA_StackRel &&
        srAccess(*LastADef, Off) && Off == EOff &&
        slotNotWrittenBetween(Earlier->getIterator(),
                              LastADef->getIterator(), EOff)) {
      return StaY;
    }
  }

  // Case (3): stores in different MBBs, each fed by the same kind of
  // immediate load carrying the same constant.
  MachineInstr *PriorX = findPriorADef(StaX);
  if (!PriorX)
    return nullptr;
  std::optional<int64_t> XConst = loadedConst(*PriorX);
  if (!XConst)
    return nullptr;
  for (MachineInstr *StaY : StasY) {
    if (StaY->getParent() == MBBStaX)
      continue;
    MachineInstr *PriorY = findPriorADef(StaY);
    if (!PriorY)
      continue;
    // Deliberately strict: the two loads must use the same opcode.
    if (PriorY->getOpcode() != PriorX->getOpcode())
      continue;
    std::optional<int64_t> YConst = loadedConst(*PriorY);
    if (YConst && *YConst == *XConst)
      return StaY;
  }
  return nullptr;
}
|
|
|
|
|
|
// Run Phase 6a + Phase 6 (per-MBB peepholes) — independent of rename
|
|
// logic, so they fire on every function. Returns true if anything
|
|
// changed.
|
|
static bool runPerMBBPeepholes(MachineFunction &MF) {
|
|
bool Changed = false;
|
|
|
|
// Phase 6a: redundant `STA Y, s` immediately followed by `LDA Y, s`.
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
SmallVector<MachineInstr *, 4> Dead;
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
if (It->isDebugInstr()) continue;
|
|
if (It->getOpcode() != W65816::STA_StackRel) continue;
|
|
int64_t StaSlot;
|
|
if (!srAccess(*It, StaSlot)) continue;
|
|
auto NextIt = std::next(It);
|
|
while (NextIt != MBB.end() && NextIt->isDebugInstr()) ++NextIt;
|
|
if (NextIt == MBB.end()) continue;
|
|
if (NextIt->getOpcode() != W65816::LDA_StackRel) continue;
|
|
int64_t LdaSlot;
|
|
if (!srAccess(*NextIt, LdaSlot)) continue;
|
|
if (StaSlot != LdaSlot) continue;
|
|
bool flagsSafe = false;
|
|
bool aIsUsedBeforeClobber = false;
|
|
for (auto Fwd = std::next(NextIt); Fwd != MBB.end(); ++Fwd) {
|
|
if (Fwd->isDebugInstr()) continue;
|
|
// Calls/JSLs that take A as arg — even though clobbersFlagsP
|
|
// returns true for them, the elimination could mis-track A's
|
|
// live-in to the call. Bail.
|
|
if (Fwd->isCall()) break;
|
|
// Generic: any instr that has `implicit $a` as a USE — A is
|
|
// live going in. Bail to avoid live-range trouble.
|
|
for (const MachineOperand &MO : Fwd->operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::A && MO.isUse() &&
|
|
!MO.isDef()) {
|
|
aIsUsedBeforeClobber = true;
|
|
break;
|
|
}
|
|
}
|
|
if (aIsUsedBeforeClobber) break;
|
|
if (usesFlagsP(*Fwd)) break;
|
|
if (Fwd->isTerminator() && !Fwd->isConditionalBranch()) {
|
|
flagsSafe = true; break;
|
|
}
|
|
if (clobbersFlagsP(*Fwd)) { flagsSafe = true; break; }
|
|
}
|
|
if (!flagsSafe) continue;
|
|
Dead.push_back(&*NextIt);
|
|
}
|
|
for (MachineInstr *MI : Dead) {
|
|
MI->eraseFromParent();
|
|
Changed = true;
|
|
}
|
|
}
|
|
|
|
// Phase 6: per-MBB redundant `LDA #K` elimination.
|
|
auto isAandPPreserving = [](const MachineInstr &MI) -> bool {
|
|
unsigned Op = MI.getOpcode();
|
|
switch (Op) {
|
|
case W65816::STA_StackRel:
|
|
case W65816::STA_DP: case W65816::STA_DPX:
|
|
case W65816::STA_DPInd: case W65816::STA_DPIndY:
|
|
case W65816::STA_DPIndX:
|
|
case W65816::STA_Abs: case W65816::STA_AbsX:
|
|
case W65816::STA_AbsY: case W65816::STA_Long:
|
|
case W65816::STA_LongX:
|
|
case W65816::STX_DP: case W65816::STX_Abs:
|
|
case W65816::STY_DP: case W65816::STY_Abs: case W65816::STY_DPX:
|
|
case W65816::STZ_DP: case W65816::STZ_Abs:
|
|
case W65816::STZ_DPX: case W65816::STZ_AbsX:
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef())
|
|
return false;
|
|
}
|
|
if (MI.mayStore() && !MI.mayLoad() && !semanticallyDefsA(MI))
|
|
return true;
|
|
return false;
|
|
};
|
|
auto isLdaImmK = [](const MachineInstr &MI, int64_t &K) -> bool {
|
|
unsigned Op = MI.getOpcode();
|
|
if (Op != W65816::LDA_Imm16 && Op != W65816::LDAi16imm) return false;
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (MO.isImm()) { K = MO.getImm(); return true; }
|
|
}
|
|
return false;
|
|
};
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
std::optional<int64_t> KnownK;
|
|
SmallVector<MachineInstr *, 4> Dead;
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
if (It->isDebugInstr()) continue;
|
|
int64_t K;
|
|
if (isLdaImmK(*It, K)) {
|
|
if (KnownK && *KnownK == K) {
|
|
Dead.push_back(&*It);
|
|
continue;
|
|
}
|
|
KnownK = K;
|
|
continue;
|
|
}
|
|
if (isAandPPreserving(*It)) continue;
|
|
KnownK.reset();
|
|
}
|
|
for (MachineInstr *MI : Dead) {
|
|
MI->eraseFromParent();
|
|
Changed = true;
|
|
}
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
|
|
bool W65816StackSlotMerge::runOnMachineFunction(MachineFunction &MF) {
|
|
if (skipFunction(MF.getFunction())) return false;
|
|
if (MF.getFunction().hasOptNone()) return false;
|
|
|
|
// Run per-MBB peepholes first — independent of rename logic.
|
|
bool peepChanged = runPerMBBPeepholes(MF);
|
|
|
|
// Phase 1: index all stack-rel STA/LDA grouped by slot offset.
|
|
DenseMap<int64_t, SmallVector<MachineInstr *, 4>> Stas;
|
|
DenseMap<int64_t, SmallVector<MachineInstr *, 4>> Ldas;
|
|
DenseMap<int64_t, unsigned> AllRefs; // STA + LDA + ADC + ... count
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
for (MachineInstr &MI : MBB) {
|
|
int64_t Off;
|
|
if (!srAccess(MI, Off)) continue;
|
|
AllRefs[Off]++;
|
|
if (MI.getOpcode() == W65816::STA_StackRel) {
|
|
Stas[Off].push_back(&MI);
|
|
} else if (MI.getOpcode() == W65816::LDA_StackRel) {
|
|
Ldas[Off].push_back(&MI);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Phase 2: find PHI-copy site candidates. Pattern: LDA X ; STA Y
|
|
// in a LOOP BODY MBB (= the MBB has itself as a predecessor, i.e.
|
|
// a self-loop back-edge). Restricting to loop bodies distinguishes
|
|
// genuine PHI-cycle copies from one-shot temp transfers (where
|
|
// slot X is just a scratch register dropped on the way to slot Y
|
|
// for an unrelated purpose, like qsortIter's pointer-construction
|
|
// pattern `STA 5; ...; LDA 5; STA 39` followed by `LDA 39; STA dp`).
|
|
DenseMap<int64_t, int64_t> PhiCopyPair; // X -> Y
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
// Self-loop check: MBB must have itself as a predecessor.
|
|
bool selfLoop = false;
|
|
for (MachineBasicBlock *Pred : MBB.predecessors()) {
|
|
if (Pred == &MBB) { selfLoop = true; break; }
|
|
}
|
|
if (!selfLoop) continue;
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
if (It->getOpcode() != W65816::LDA_StackRel) continue;
|
|
int64_t X;
|
|
if (!srAccess(*It, X)) continue;
|
|
auto NextIt = std::next(It);
|
|
while (NextIt != MBB.end() && NextIt->isDebugInstr()) ++NextIt;
|
|
if (NextIt == MBB.end()) continue;
|
|
if (NextIt->getOpcode() != W65816::STA_StackRel) continue;
|
|
int64_t Y;
|
|
if (!srAccess(*NextIt, Y) || Y == X) continue;
|
|
if (PhiCopyPair.count(X)) continue;
|
|
PhiCopyPair[X] = Y;
|
|
}
|
|
}
|
|
|
|
// Phase 3: validate each pair and apply rename if safe.
|
|
// Track which slots have already been merged so we don't double-merge.
|
|
DenseMap<int64_t, int64_t> Renames; // X -> Y
|
|
for (auto &P : PhiCopyPair) {
|
|
int64_t X = P.first, Y = P.second;
|
|
// Don't re-merge an already-processed slot.
|
|
if (Renames.count(X) || Renames.count(Y)) continue;
|
|
// Arg-slot guard: skip slots with no STAs (caller-passed args).
|
|
if (Stas[X].empty() || Stas[Y].empty()) continue;
|
|
|
|
// Validate that every STA X has a twin STA Y.
|
|
bool allPaired = true;
|
|
for (MachineInstr *StaX : Stas[X]) {
|
|
if (!findTwin(StaX, Stas[Y])) { allPaired = false; break; }
|
|
}
|
|
if (!allPaired) continue;
|
|
|
|
// Symmetric: every STA Y must have a twin STA X.
|
|
for (MachineInstr *StaY : Stas[Y]) {
|
|
if (!findTwin(StaY, Stas[X])) { allPaired = false; break; }
|
|
}
|
|
if (!allPaired) continue;
|
|
|
|
LLVM_DEBUG(dbgs() << "StackSlotMerge: rename slot " << X
|
|
<< " -> " << Y << " in " << MF.getName() << "\n");
|
|
Renames[X] = Y;
|
|
}
|
|
if (Renames.empty()) return false;
|
|
|
|
// Phase 4: apply rename.
|
|
bool Changed = false;
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
SmallVector<MachineInstr *, 4> ToErase;
|
|
for (MachineInstr &MI : MBB) {
|
|
int64_t Off;
|
|
if (!srAccess(MI, Off)) continue;
|
|
auto It = Renames.find(Off);
|
|
if (It == Renames.end()) continue;
|
|
MI.getOperand(0).setImm(It->second);
|
|
Changed = true;
|
|
}
|
|
// After rename, look for now-redundant LDA-STA pairs to the same
|
|
// slot (the PHI-copy self-copy). Erase them.
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
if (It->getOpcode() != W65816::LDA_StackRel) continue;
|
|
int64_t LdaOff;
|
|
if (!srAccess(*It, LdaOff)) continue;
|
|
auto NextIt = std::next(It);
|
|
while (NextIt != MBB.end() && NextIt->isDebugInstr()) ++NextIt;
|
|
if (NextIt == MBB.end()) continue;
|
|
if (NextIt->getOpcode() != W65816::STA_StackRel) continue;
|
|
int64_t StaOff;
|
|
if (!srAccess(*NextIt, StaOff)) continue;
|
|
if (LdaOff != StaOff) continue;
|
|
ToErase.push_back(&*It);
|
|
ToErase.push_back(&*NextIt);
|
|
}
|
|
for (MachineInstr *MI : ToErase) MI->eraseFromParent();
|
|
if (!ToErase.empty()) Changed = true;
|
|
}
|
|
|
|
// Phase 5: redundant constant-init elimination. After rename, the
|
|
// Case (3) twin pairings leave us with TWO sites writing the same
|
|
// constant to the same slot (one renamed from X to Y, the other was
|
|
// already targeting Y). The dominated one is redundant — its slot
|
|
// already holds the constant from the dominating write.
|
|
//
|
|
// Generalize: scan post-rename for ALL `LDA_Imm16 K ; STA_StackRel Y`
|
|
// pairs (or LDAi16imm K; STA Y). For each pair, look for another
|
|
// such pair with the same (K, Y) where one DOMINATES the other AND
|
|
// no slot-Y access exists on any path between them. Erase the
|
|
// dominated STA + its preceding LDA (if A isn't otherwise consumed).
|
|
{
|
|
auto isLdaImm = [](const MachineInstr &MI) {
|
|
unsigned Op = MI.getOpcode();
|
|
return Op == W65816::LDA_Imm16 || Op == W65816::LDAi16imm;
|
|
};
|
|
auto immValue = [](const MachineInstr &MI) -> int64_t {
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (MO.isImm()) return MO.getImm();
|
|
}
|
|
return 0;
|
|
};
|
|
// Collect `LDA #K ; STA_StackRel Y` pairs, grouped by Y.
|
|
DenseMap<int64_t, SmallVector<std::pair<MachineInstr *, int64_t>, 4>>
|
|
ConstStas;
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
|
if (!isLdaImm(*It)) continue;
|
|
int64_t K = immValue(*It);
|
|
auto NextIt = std::next(It);
|
|
while (NextIt != MBB.end() && NextIt->isDebugInstr()) ++NextIt;
|
|
if (NextIt == MBB.end()) continue;
|
|
if (NextIt->getOpcode() != W65816::STA_StackRel) continue;
|
|
int64_t Y;
|
|
if (!srAccess(*NextIt, Y)) continue;
|
|
ConstStas[Y].push_back({&*NextIt, K});
|
|
}
|
|
}
|
|
// For each slot Y with at least two const-init STAs, check for
|
|
// dominator redundancy.
|
|
auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
|
|
// Check that no instruction WRITES slot Y on any path between
|
|
// From and To. Reads are fine because both From and To write
|
|
// the same constant K — any intermediate read would see K either
|
|
// way (since From dominates, From has already executed). Calls
|
|
// are bailout conditions: a call might write to the stack via
|
|
// address-taken locals or other side effects we don't model.
|
|
auto noSlotWriteOnPath = [&](MachineInstr *From, MachineInstr *To,
|
|
int64_t Y) -> bool {
|
|
MachineBasicBlock *FromMBB = From->getParent();
|
|
MachineBasicBlock *ToMBB = To->getParent();
|
|
auto opWritesY = [&](MachineInstr &MI) {
|
|
if (MI.isCall() || MI.isInlineAsm()) return true;
|
|
int64_t Off;
|
|
if (MI.getOpcode() == W65816::STA_StackRel &&
|
|
srAccess(MI, Off) && Off == Y) {
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
// (a) After From in its MBB.
|
|
for (auto It = std::next(From->getIterator()); It != FromMBB->end();
|
|
++It) {
|
|
if (It->isDebugInstr()) continue;
|
|
if (opWritesY(*It)) return false;
|
|
}
|
|
// (b) BFS forward from FromMBB's successors, stopping at ToMBB.
|
|
SmallPtrSet<MachineBasicBlock *, 8> Visited;
|
|
SmallVector<MachineBasicBlock *, 8> Stack;
|
|
for (auto *Succ : FromMBB->successors()) Stack.push_back(Succ);
|
|
while (!Stack.empty()) {
|
|
auto *MBB = Stack.pop_back_val();
|
|
if (MBB == ToMBB) continue; // checked separately in (c)
|
|
if (!Visited.insert(MBB).second) continue;
|
|
for (auto &MI : *MBB) {
|
|
if (MI.isDebugInstr()) continue;
|
|
if (opWritesY(MI)) return false;
|
|
}
|
|
for (auto *Succ : MBB->successors()) Stack.push_back(Succ);
|
|
}
|
|
// (c) In ToMBB, before To, any write of Y?
|
|
for (auto It = ToMBB->begin(); It != To->getIterator(); ++It) {
|
|
if (It->isDebugInstr()) continue;
|
|
if (opWritesY(*It)) return false;
|
|
}
|
|
return true;
|
|
};
|
|
SmallVector<MachineInstr *, 8> ToErase;
|
|
LLVM_DEBUG({
|
|
dbgs() << "Phase 5 in " << MF.getName() << ":\n";
|
|
for (auto &P : ConstStas) {
|
|
dbgs() << " slot " << P.first << " has " << P.second.size()
|
|
<< " const STAs\n";
|
|
}
|
|
});
|
|
for (auto &P : ConstStas) {
|
|
int64_t Y = P.first;
|
|
auto &stas = P.second;
|
|
if (stas.size() < 2) continue;
|
|
// For each pair (i, j) where i dominates j with same constant K:
|
|
for (auto &Sj : stas) {
|
|
MachineInstr *DominatedSta = Sj.first;
|
|
int64_t Kj = Sj.second;
|
|
for (auto &Si : stas) {
|
|
if (&Si == &Sj) continue;
|
|
if (Si.second != Kj) continue; // different K
|
|
MachineInstr *DominatorSta = Si.first;
|
|
if (!MDT.dominates(DominatorSta, DominatedSta)) continue;
|
|
if (!noSlotWriteOnPath(DominatorSta, DominatedSta, Y)) continue;
|
|
// Flag safety: erasing `LDA #K; STA Y` removes a flag-setting
|
|
// op (the LDA). Walk forward from the STA looking for next
|
|
// flag-clobber or unconditional terminator (safe) vs.
|
|
// flag-use (unsafe).
|
|
MachineBasicBlock *MBB = DominatedSta->getParent();
|
|
bool flagsSafeP5 = false;
|
|
for (auto Fwd = std::next(DominatedSta->getIterator());
|
|
Fwd != MBB->end(); ++Fwd) {
|
|
if (Fwd->isDebugInstr()) continue;
|
|
if (usesFlagsP(*Fwd)) break;
|
|
if (Fwd->isTerminator() && !Fwd->isConditionalBranch()) {
|
|
flagsSafeP5 = true; break;
|
|
}
|
|
if (clobbersFlagsP(*Fwd)) { flagsSafeP5 = true; break; }
|
|
}
|
|
if (!flagsSafeP5) continue;
|
|
// Erase DominatedSta and its preceding LDA #K.
|
|
auto Prev = DominatedSta->getIterator();
|
|
while (Prev != MBB->begin()) {
|
|
--Prev;
|
|
if (!Prev->isDebugInstr()) break;
|
|
}
|
|
if (Prev != DominatedSta->getIterator() && isLdaImm(*Prev) &&
|
|
immValue(*Prev) == Kj) {
|
|
// Verify A isn't consumed between LDA and STA — they're
|
|
// adjacent so no consumers exist; safe. Erase both.
|
|
ToErase.push_back(&*Prev);
|
|
}
|
|
ToErase.push_back(DominatedSta);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
// De-dup ToErase before erasing.
|
|
SmallPtrSet<MachineInstr *, 8> ErasedSet;
|
|
for (MachineInstr *MI : ToErase) {
|
|
if (ErasedSet.insert(MI).second) {
|
|
MI->eraseFromParent();
|
|
Changed = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return Changed || peepChanged;
|
|
}
|