460 lines
18 KiB
C++
460 lines
18 KiB
C++
//===-- W65816UnLSR.cpp - Undo LSR for global-array pointer-walks --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// Post-LSR IR pass.  LSR converts `for (i; i<n; i++) arr[i]` loops into
// pointer-walking:
//
//   %lsr.iv1 = phi ptr [@arr, ...], [%scevgep, ...]
//   %lsr.iv = phi i16 [%n, ...], [%lsr.iv.next, ...]
//   %v = load i16, ptr %lsr.iv1
//   %scevgep = getelementptr i8, ptr %lsr.iv1, i32 stride
//   %lsr.iv.next = add i16 %lsr.iv, -1
//
// On the W65816 this is a regression: pointer-deref needs the 24-bit
// `[dp],Y` addressing (sets up $E0/$E2 each iter), whereas indexed
// access via `lda <global>, X` is 3 bytes / 5 cyc DBR-relative.  The
// LDA_AbsX SDAG combine recognizes `(load (add Wrapper, idx))` but
// LSR has destroyed that shape.
//
// This pass detects the shape and converts back: introduce a forward
// counter PHI (0, 1, 2, ...), replace loads/stores through the pointer
// PHI with indexed GEPs `getelementptr T, @global, %i`.  The old pointer
// PHI becomes dead and is DCE'd.  The existing count-down counter PHI
// stays as the loop control unless its exit compare can be rewritten
// against the new forward counter, in which case it is removed as well.
//
// Restrictions:
//   * Pointer PHI's initial value must be a `GlobalAddress` (otherwise
//     it might point at a non-DBR-bank object, and our combine doesn't
//     fire / produces wrong code).
//   * Pointer PHI must have exactly two incoming values (header + latch).
//   * Pointer PHI's only uses must be the stride GEP and loads/stores.
//   * Stride must be a positive constant.
//   * The loop must have an existing trip-count counter so we don't
//     introduce extra register pressure.
//
// The file also implements the opposite rewrite for strcpy-style loops,
// where LSR folded several byte-stepping pointers into one counter plus a
// `getelementptr i8, %base, %counter` per pointer; see
// processCounterToPtrPHIs.
//
//===---------------------------------------------------------------------===//
|
|
|
|
#include "W65816.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/Analysis/LoopInfo.h"
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InstIterator.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "w65816-un-lsr"
|
|
|
|
namespace {
|
|
|
|
class W65816UnLSR : public FunctionPass {
|
|
public:
|
|
static char ID;
|
|
W65816UnLSR() : FunctionPass(ID) {}
|
|
|
|
StringRef getPassName() const override {
|
|
return "W65816 undo LSR pointer-walk for global-array access";
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<LoopInfoWrapperPass>();
|
|
AU.setPreservesCFG();
|
|
}
|
|
|
|
bool runOnFunction(Function &F) override;
|
|
|
|
private:
|
|
bool processLoop(Loop *L);
|
|
bool processCounterToPtrPHIs(Loop *L);
|
|
};
|
|
|
|
} // namespace
|
|
|
|
char W65816UnLSR::ID = 0;
|
|
|
|
INITIALIZE_PASS_BEGIN(W65816UnLSR, DEBUG_TYPE,
|
|
"W65816 undo LSR for global-array", false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
|
|
INITIALIZE_PASS_END(W65816UnLSR, DEBUG_TYPE,
|
|
"W65816 undo LSR for global-array", false, false)
|
|
|
|
FunctionPass *llvm::createW65816UnLSR() { return new W65816UnLSR(); }
|
|
|
|
/// Runs both rewrites on every loop of \p F: each top-level loop first,
/// then its sub-loops via an explicit stack.  Functions marked optnone
/// are left untouched.  Returns true if any loop was modified.
bool W65816UnLSR::runOnFunction(Function &F) {
  if (F.hasOptNone())
    return false;

  LoopInfo &Loops = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  bool Modified = false;

  for (Loop *TopLevel : Loops) {
    // Seed the stack with the top-level loop itself; children are pushed
    // after their parent has been handled and popped from the back.
    SmallVector<Loop *, 4> Pending;
    Pending.push_back(TopLevel);
    while (!Pending.empty()) {
      Loop *Current = Pending.pop_back_val();
      Modified |= processLoop(Current);
      Modified |= processCounterToPtrPHIs(Current);
      Pending.append(Current->begin(), Current->end());
    }
  }

  return Modified;
}
|
|
|
|
|
|
// strcpy-style undo: LSR converts two pointer PHIs (`src.addr.0` and
|
|
// `d.0` each stepping by 1) into a single counter PHI (`lsr.iv`) plus
|
|
// GEPs `(base, counter)` per iter. On 65816 the counter+GEP form
|
|
// each iter does i32 (base + counter) on each pointer — much more
|
|
// expensive than just incrementing two i16 pointer PHIs.
|
|
//
|
|
// Pattern (post-LSR):
|
|
// %lsr.iv = phi i32 [0, %entry], [%lsr.iv.next, %latch]
|
|
// %scevgep_i = getelementptr i8, ptr %base_i, i32 %lsr.iv (for each base_i)
|
|
// ... loads/stores via %scevgep_i ...
|
|
// %lsr.iv.next = add i32 %lsr.iv, 1
|
|
//
|
|
// Where each %base_i is loop-invariant (typically a function arg).
|
|
//
|
|
// Rewrite: for each base_i, introduce a pointer PHI that strides by 1
|
|
// per iter. Replace %scevgep_i with the new pointer PHI. If counter
|
|
// has no other uses, eliminate it.
|
|
bool W65816UnLSR::processCounterToPtrPHIs(Loop *L) {
|
|
BasicBlock *Header = L->getHeader();
|
|
BasicBlock *Latch = L->getLoopLatch();
|
|
BasicBlock *Preheader = L->getLoopPreheader();
|
|
if (!Latch || !Preheader) return false;
|
|
|
|
// Find an integer counter PHI starting at 0 with step +1.
|
|
PHINode *Counter = nullptr;
|
|
Value *CounterNext = nullptr;
|
|
for (PHINode &PN : Header->phis()) {
|
|
if (!PN.getType()->isIntegerTy()) continue;
|
|
if (PN.getNumIncomingValues() != 2) continue;
|
|
Value *Init = nullptr, *Step = nullptr;
|
|
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) {
|
|
BasicBlock *Pred = PN.getIncomingBlock(i);
|
|
if (L->contains(Pred)) Step = PN.getIncomingValue(i);
|
|
else Init = PN.getIncomingValue(i);
|
|
}
|
|
if (!Init || !Step) continue;
|
|
auto *InitCI = dyn_cast<ConstantInt>(Init);
|
|
if (!InitCI || !InitCI->isZero()) continue;
|
|
auto *StepBO = dyn_cast<BinaryOperator>(Step);
|
|
if (!StepBO || StepBO->getOpcode() != Instruction::Add) continue;
|
|
Value *Other = nullptr;
|
|
if (StepBO->getOperand(0) == &PN) Other = StepBO->getOperand(1);
|
|
else if (StepBO->getOperand(1) == &PN) Other = StepBO->getOperand(0);
|
|
if (!Other) continue;
|
|
auto *StepCI = dyn_cast<ConstantInt>(Other);
|
|
if (!StepCI || !StepCI->isOne()) continue;
|
|
Counter = &PN;
|
|
CounterNext = StepBO;
|
|
break;
|
|
}
|
|
if (!Counter) return false;
|
|
|
|
// Find GEPs `getelementptr i8, %base, %counter` (or %counter.next)
|
|
// where base is loop-invariant. Collect them and verify the counter
|
|
// has no OTHER uses outside this pattern.
|
|
SmallVector<GetElementPtrInst *, 4> GEPs;
|
|
for (User *U : Counter->users()) {
|
|
if (U == CounterNext) continue;
|
|
auto *GEP = dyn_cast<GetElementPtrInst>(U);
|
|
if (!GEP) return false;
|
|
if (GEP->getNumIndices() != 1) return false;
|
|
if (GEP->getOperand(1) != Counter) return false;
|
|
Value *Base = GEP->getPointerOperand();
|
|
// base must be loop-invariant. Instructions inside the loop fail;
|
|
// arguments and globals are always invariant.
|
|
if (auto *BaseI = dyn_cast<Instruction>(Base))
|
|
if (L->contains(BaseI)) return false;
|
|
if (!Base->getType()->isPointerTy()) return false;
|
|
// Only handle the i8 element type (byte stride). Other strides
|
|
// would need different ptr-PHI step values.
|
|
if (!GEP->getSourceElementType()->isIntegerTy(8)) return false;
|
|
GEPs.push_back(GEP);
|
|
}
|
|
// Also accept if CounterNext is used as a GEP index (sometimes LSR
|
|
// uses the post-increment value). Walk those too.
|
|
for (User *U : CounterNext->users()) {
|
|
if (U == Counter) continue;
|
|
auto *GEP = dyn_cast<GetElementPtrInst>(U);
|
|
if (GEP) {
|
|
// Bail if CounterNext is used as a GEP index — we'd need to add
|
|
// a +1 offset to the new pointer PHI to match. Keep this simple
|
|
// for now: only handle uses of Counter, not CounterNext.
|
|
if (GEP->getNumIndices() == 1 && GEP->getOperand(1) == CounterNext)
|
|
return false;
|
|
}
|
|
// Allow icmp / branch / other non-GEP uses of CounterNext — those
|
|
// are the loop's exit test, fine to leave alone.
|
|
}
|
|
if (GEPs.empty()) return false;
|
|
|
|
// For each unique base, build a pointer PHI.
|
|
LLVMContext &Ctx = Header->getContext();
|
|
Type *I8 = Type::getInt8Ty(Ctx);
|
|
DenseMap<Value *, PHINode *> BasePhis;
|
|
for (GetElementPtrInst *GEP : GEPs) {
|
|
Value *Base = GEP->getPointerOperand();
|
|
if (BasePhis.count(Base)) continue;
|
|
IRBuilder<> B(&Header->front());
|
|
PHINode *PtrPHI = B.CreatePHI(Base->getType(), 2, "unlsr.ptr");
|
|
PtrPHI->addIncoming(Base, Preheader);
|
|
// Build the step GEP in the latch (just before terminator).
|
|
IRBuilder<> BL(Latch->getTerminator());
|
|
Value *PtrNext = BL.CreateGEP(I8, PtrPHI,
|
|
ConstantInt::get(Type::getInt16Ty(Ctx), 1),
|
|
"unlsr.ptr.next");
|
|
PtrPHI->addIncoming(PtrNext, Latch);
|
|
BasePhis[Base] = PtrPHI;
|
|
}
|
|
|
|
// Replace each GEP's uses with the corresponding pointer PHI.
|
|
for (GetElementPtrInst *GEP : GEPs) {
|
|
GEP->replaceAllUsesWith(BasePhis[GEP->getPointerOperand()]);
|
|
}
|
|
// Erase the now-dead GEPs.
|
|
for (GetElementPtrInst *GEP : GEPs) {
|
|
if (GEP->use_empty()) GEP->eraseFromParent();
|
|
}
|
|
|
|
// If counter has no other uses (besides CounterNext and the latch
|
|
// incoming), eliminate it. CounterNext might still be used by the
|
|
// exit test — leave that alone.
|
|
bool counterDead = true;
|
|
for (User *U : Counter->users()) {
|
|
if (U == CounterNext) continue;
|
|
counterDead = false;
|
|
break;
|
|
}
|
|
if (counterDead) {
|
|
// CounterNext might be used by other PHIs / icmp. Don't erase if so.
|
|
bool counterNextHasOtherUses = false;
|
|
for (User *U : CounterNext->users()) {
|
|
if (U == Counter) continue;
|
|
counterNextHasOtherUses = true;
|
|
break;
|
|
}
|
|
if (!counterNextHasOtherUses) {
|
|
Type *IntT = Counter->getType();
|
|
cast<Instruction>(CounterNext)->replaceAllUsesWith(
|
|
UndefValue::get(IntT));
|
|
Counter->replaceAllUsesWith(UndefValue::get(IntT));
|
|
cast<Instruction>(CounterNext)->eraseFromParent();
|
|
Counter->eraseFromParent();
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool W65816UnLSR::processLoop(Loop *L) {
|
|
BasicBlock *Header = L->getHeader();
|
|
BasicBlock *Latch = L->getLoopLatch();
|
|
if (!Latch)
|
|
return false;
|
|
// Single-block loops are fine (header == latch).
|
|
|
|
// Find a pointer PHI whose initial value is a GlobalAddress and whose
|
|
// latch-incoming is a constant-stride GEP off itself.
|
|
SmallVector<PHINode *, 4> Candidates;
|
|
for (PHINode &PN : Header->phis()) {
|
|
if (!PN.getType()->isPointerTy()) continue;
|
|
if (PN.getNumIncomingValues() != 2) continue;
|
|
Candidates.push_back(&PN);
|
|
}
|
|
|
|
bool Changed = false;
|
|
for (PHINode *PtrPHI : Candidates) {
|
|
// Identify which incoming is from the preheader vs the latch.
|
|
Value *InitVal = nullptr;
|
|
Value *StepVal = nullptr;
|
|
for (unsigned i = 0; i < PtrPHI->getNumIncomingValues(); ++i) {
|
|
BasicBlock *Pred = PtrPHI->getIncomingBlock(i);
|
|
Value *Inc = PtrPHI->getIncomingValue(i);
|
|
if (L->contains(Pred))
|
|
StepVal = Inc;
|
|
else
|
|
InitVal = Inc;
|
|
}
|
|
if (!InitVal || !StepVal) continue;
|
|
|
|
// InitVal must be a GlobalAddress (GlobalVariable). Could relax to
|
|
// any constant base, but globals are the only case our combine
|
|
// handles correctly.
|
|
auto *InitGV = dyn_cast<GlobalVariable>(InitVal);
|
|
if (!InitGV) continue;
|
|
|
|
// StepVal must be a `getelementptr i8, %PtrPHI, ConstantInt`.
|
|
auto *StepGEP = dyn_cast<GetElementPtrInst>(StepVal);
|
|
if (!StepGEP) continue;
|
|
if (StepGEP->getPointerOperand() != PtrPHI) continue;
|
|
if (StepGEP->getNumIndices() != 1) continue;
|
|
auto *StrideCI = dyn_cast<ConstantInt>(StepGEP->getOperand(1));
|
|
if (!StrideCI) continue;
|
|
int64_t StrideBytes = StrideCI->getSExtValue();
|
|
if (StrideBytes <= 0) continue;
|
|
// Determine element type for the indexed GEP. Walk all loads/
|
|
// stores through PtrPHI; require they all access the same primitive
|
|
// type whose size equals StrideBytes (typical: i16 array → stride 2).
|
|
Type *ElemTy = nullptr;
|
|
SmallVector<Instruction *, 4> Accesses;
|
|
bool ok = true;
|
|
for (User *U : PtrPHI->users()) {
|
|
if (U == StepGEP) continue;
|
|
if (auto *LD = dyn_cast<LoadInst>(U)) {
|
|
if (LD->getPointerOperand() != PtrPHI) { ok = false; break; }
|
|
Type *T = LD->getType();
|
|
if (!T->isIntegerTy()) { ok = false; break; }
|
|
if (!ElemTy) ElemTy = T;
|
|
else if (ElemTy != T) { ok = false; break; }
|
|
Accesses.push_back(LD);
|
|
continue;
|
|
}
|
|
if (auto *ST = dyn_cast<StoreInst>(U)) {
|
|
if (ST->getPointerOperand() != PtrPHI) { ok = false; break; }
|
|
Type *T = ST->getValueOperand()->getType();
|
|
if (!T->isIntegerTy()) { ok = false; break; }
|
|
if (!ElemTy) ElemTy = T;
|
|
else if (ElemTy != T) { ok = false; break; }
|
|
Accesses.push_back(ST);
|
|
continue;
|
|
}
|
|
// Any other use → bail.
|
|
ok = false;
|
|
break;
|
|
}
|
|
if (!ok || !ElemTy || Accesses.empty()) continue;
|
|
Module &M = *Header->getModule();
|
|
const DataLayout &DL = M.getDataLayout();
|
|
uint64_t ElemBytes = DL.getTypeAllocSize(ElemTy);
|
|
if ((int64_t)ElemBytes != StrideBytes) continue;
|
|
|
|
// We have a clean global-array pointer-walk. Build a forward
|
|
// counter PHI starting at 0, incrementing by 1. Replace the
|
|
// pointer-PHI's loads/stores with `GEP ElemTy, @global, %i`.
|
|
LLVMContext &Ctx = Header->getContext();
|
|
IntegerType *I16 = Type::getInt16Ty(Ctx);
|
|
IRBuilder<> B(&Header->front());
|
|
PHINode *Counter = B.CreatePHI(I16, 2, "unlsr.i");
|
|
BasicBlock *Preheader = L->getLoopPreheader();
|
|
if (!Preheader) continue;
|
|
Counter->addIncoming(ConstantInt::get(I16, 0), Preheader);
|
|
// Build the counter increment in the latch, just before the
|
|
// terminator.
|
|
IRBuilder<> BL(Latch->getTerminator());
|
|
Value *CounterNext = BL.CreateAdd(Counter, ConstantInt::get(I16, 1),
|
|
"unlsr.i.next");
|
|
Counter->addIncoming(CounterNext, Latch);
|
|
|
|
// Replace each load/store with a GEP-then-load/store.
|
|
for (Instruction *Acc : Accesses) {
|
|
IRBuilder<> BA(Acc);
|
|
Value *Idx = Counter;
|
|
// GEP type is i16 elem, idx is i16.
|
|
Value *Addr = BA.CreateGEP(ElemTy, InitGV, Idx, "unlsr.addr");
|
|
if (auto *LD = dyn_cast<LoadInst>(Acc)) {
|
|
LD->setOperand(LoadInst::getPointerOperandIndex(), Addr);
|
|
} else if (auto *ST = dyn_cast<StoreInst>(Acc)) {
|
|
ST->setOperand(StoreInst::getPointerOperandIndex(), Addr);
|
|
}
|
|
}
|
|
// Erase the now-unused step GEP and pointer PHI.
|
|
if (StepGEP->use_empty()) StepGEP->eraseFromParent();
|
|
if (PtrPHI->use_empty()) PtrPHI->eraseFromParent();
|
|
|
|
// Try to eliminate the count-down LSR counter (`%lsr.iv = phi i16
|
|
// [%n, preheader], [%lsr.iv.next, latch]; %lsr.iv.next = add %lsr.iv,
|
|
// -1; %exitcond = icmp eq %lsr.iv.next, 0`). Replace the exit
|
|
// comparison with `icmp eq %unlsr.i.next, %n`, then erase the
|
|
// count-down chain. Saves one i16 PHI + dec + cmp per loop.
|
|
for (PHINode &LSR : Header->phis()) {
|
|
if (&LSR == Counter) continue;
|
|
if (!LSR.getType()->isIntegerTy(16)) continue;
|
|
if (LSR.getNumIncomingValues() != 2) continue;
|
|
// Identify preheader vs latch incomings.
|
|
Value *LsrInit = nullptr;
|
|
Value *LsrStep = nullptr;
|
|
for (unsigned i = 0; i < LSR.getNumIncomingValues(); ++i) {
|
|
BasicBlock *Pred = LSR.getIncomingBlock(i);
|
|
if (L->contains(Pred)) LsrStep = LSR.getIncomingValue(i);
|
|
else LsrInit = LSR.getIncomingValue(i);
|
|
}
|
|
if (!LsrInit || !LsrStep) continue;
|
|
// %lsr.iv.next must be `add %lsr.iv, -1`.
|
|
auto *LsrStepBO = dyn_cast<BinaryOperator>(LsrStep);
|
|
if (!LsrStepBO || LsrStepBO->getOpcode() != Instruction::Add) continue;
|
|
Value *OtherOp = nullptr;
|
|
if (LsrStepBO->getOperand(0) == &LSR) OtherOp = LsrStepBO->getOperand(1);
|
|
else if (LsrStepBO->getOperand(1) == &LSR) OtherOp = LsrStepBO->getOperand(0);
|
|
if (!OtherOp) continue;
|
|
auto *DecCI = dyn_cast<ConstantInt>(OtherOp);
|
|
if (!DecCI || !DecCI->isMinusOne()) continue;
|
|
// %lsr.iv.next typically has 2 uses: the icmp exit comparison and
|
|
// the PHI back-edge (LSR.users includes LsrStepBO as the latch
|
|
// incoming). Allow that; find the icmp among the uses.
|
|
ICmpInst *ExitCmp = nullptr;
|
|
bool extraUse = false;
|
|
for (User *U : LsrStepBO->users()) {
|
|
if (U == &LSR) continue;
|
|
if (auto *IC = dyn_cast<ICmpInst>(U)) {
|
|
if (ExitCmp) { extraUse = true; break; }
|
|
ExitCmp = IC;
|
|
continue;
|
|
}
|
|
extraUse = true;
|
|
break;
|
|
}
|
|
if (extraUse || !ExitCmp) continue;
|
|
if (ExitCmp->getPredicate() != ICmpInst::ICMP_EQ &&
|
|
ExitCmp->getPredicate() != ICmpInst::ICMP_NE) continue;
|
|
Value *CmpOther = nullptr;
|
|
if (ExitCmp->getOperand(0) == LsrStepBO)
|
|
CmpOther = ExitCmp->getOperand(1);
|
|
else if (ExitCmp->getOperand(1) == LsrStepBO)
|
|
CmpOther = ExitCmp->getOperand(0);
|
|
if (!CmpOther) continue;
|
|
auto *CmpZero = dyn_cast<ConstantInt>(CmpOther);
|
|
if (!CmpZero || !CmpZero->isZero()) continue;
|
|
bool lsrOnlyIncrUse = true;
|
|
for (User *U : LSR.users()) {
|
|
if (U == LsrStepBO) continue;
|
|
lsrOnlyIncrUse = false; break;
|
|
}
|
|
if (!lsrOnlyIncrUse) continue;
|
|
// CounterNext was inserted just before the latch terminator, but
|
|
// ExitCmp may live higher in the block (originally placed there
|
|
// by LSR). Move CounterNext to just before ExitCmp so it
|
|
// dominates the cmp's use.
|
|
if (auto *CounterNextI = dyn_cast<Instruction>(CounterNext))
|
|
CounterNextI->moveBefore(ExitCmp->getIterator());
|
|
// Replace exit comparison: icmp eq %unlsr.i.next, %lsrInit.
|
|
ExitCmp->setOperand(0, CounterNext);
|
|
ExitCmp->setOperand(1, LsrInit);
|
|
// RAUW each value with undef before erasing — they have a mutual
|
|
// reference (PHI → BO via latch-incoming, BO → PHI via add operand).
|
|
Type *I16T = LSR.getType();
|
|
LSR.replaceAllUsesWith(UndefValue::get(I16T));
|
|
LsrStepBO->replaceAllUsesWith(UndefValue::get(I16T));
|
|
LsrStepBO->eraseFromParent();
|
|
LSR.eraseFromParent();
|
|
break;
|
|
}
|
|
|
|
Changed = true;
|
|
}
|
|
return Changed;
|
|
}
|