//===-- W65816UnLSR.cpp - Undo LSR for global-array pointer-walks --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// Post-LSR IR pass.  LSR converts `for (i; i < N; i++)` loops that index a
// global array into pointer-walks.  On the W65816 the indexed form is the
// cheaper one: `LDA abs, X` is 3 bytes / 5 cyc DBR-relative.  The
// LDA_AbsX SDAG combine recognizes `(load (add Wrapper, idx))` but
// LSR has destroyed that shape.
//
// This pass detects the shape and converts back: introduce a forward
// counter PHI (0, 1, 2, ...), replace loads/stores through the pointer
// PHI with indexed GEPs `getelementptr T, @global, %i`.  The old pointer
// PHI becomes dead and is DCE'd.  The existing count-down counter PHI
// stays as the loop control.
//
// Restrictions:
//   * Pointer PHI's initial value must be a `GlobalAddress` (otherwise
//     it might point at a non-DBR-bank object, and our combine doesn't
//     fire / produces wrong code).
//   * Pointer PHI must have exactly two incoming values (header + latch).
//   * Pointer PHI's only uses must be the stride GEP and loads/stores.
//   * Stride must be a positive constant.
//   * The loop must have an existing trip-count counter so we don't
//     introduce extra register pressure.
// //===---------------------------------------------------------------------===// #include "W65816.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/InitializePasses.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "w65816-un-lsr" namespace { class W65816UnLSR : public FunctionPass { public: static char ID; W65816UnLSR() : FunctionPass(ID) {} StringRef getPassName() const override { return "W65816 undo LSR pointer-walk for global-array access"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; private: bool processLoop(Loop *L); bool processCounterToPtrPHIs(Loop *L); }; } // namespace char W65816UnLSR::ID = 0; INITIALIZE_PASS_BEGIN(W65816UnLSR, DEBUG_TYPE, "W65816 undo LSR for global-array", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(W65816UnLSR, DEBUG_TYPE, "W65816 undo LSR for global-array", false, false) FunctionPass *llvm::createW65816UnLSR() { return new W65816UnLSR(); } bool W65816UnLSR::runOnFunction(Function &F) { if (F.hasOptNone()) return false; LoopInfo &LI = getAnalysis().getLoopInfo(); bool Changed = false; for (Loop *L : LI) { Changed |= processLoop(L); Changed |= processCounterToPtrPHIs(L); SmallVector Worklist(L->begin(), L->end()); while (!Worklist.empty()) { Loop *Sub = Worklist.pop_back_val(); Changed |= processLoop(Sub); Changed |= processCounterToPtrPHIs(Sub); Worklist.append(Sub->begin(), Sub->end()); } } return Changed; } // strcpy-style undo: LSR converts two pointer PHIs (`src.addr.0` and // `d.0` each stepping by 1) into a single 
counter PHI (`lsr.iv`) plus // GEPs `(base, counter)` per iter. On 65816 the counter+GEP form // each iter does i32 (base + counter) on each pointer — much more // expensive than just incrementing two i16 pointer PHIs. // // Pattern (post-LSR): // %lsr.iv = phi i32 [0, %entry], [%lsr.iv.next, %latch] // %scevgep_i = getelementptr i8, ptr %base_i, i32 %lsr.iv (for each base_i) // ... loads/stores via %scevgep_i ... // %lsr.iv.next = add i32 %lsr.iv, 1 // // Where each %base_i is loop-invariant (typically a function arg). // // Rewrite: for each base_i, introduce a pointer PHI that strides by 1 // per iter. Replace %scevgep_i with the new pointer PHI. If counter // has no other uses, eliminate it. bool W65816UnLSR::processCounterToPtrPHIs(Loop *L) { BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); BasicBlock *Preheader = L->getLoopPreheader(); if (!Latch || !Preheader) return false; // Find an integer counter PHI starting at 0 with step +1. PHINode *Counter = nullptr; Value *CounterNext = nullptr; for (PHINode &PN : Header->phis()) { if (!PN.getType()->isIntegerTy()) continue; if (PN.getNumIncomingValues() != 2) continue; Value *Init = nullptr, *Step = nullptr; for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) { BasicBlock *Pred = PN.getIncomingBlock(i); if (L->contains(Pred)) Step = PN.getIncomingValue(i); else Init = PN.getIncomingValue(i); } if (!Init || !Step) continue; auto *InitCI = dyn_cast(Init); if (!InitCI || !InitCI->isZero()) continue; auto *StepBO = dyn_cast(Step); if (!StepBO || StepBO->getOpcode() != Instruction::Add) continue; Value *Other = nullptr; if (StepBO->getOperand(0) == &PN) Other = StepBO->getOperand(1); else if (StepBO->getOperand(1) == &PN) Other = StepBO->getOperand(0); if (!Other) continue; auto *StepCI = dyn_cast(Other); if (!StepCI || !StepCI->isOne()) continue; Counter = &PN; CounterNext = StepBO; break; } if (!Counter) return false; // Find GEPs `getelementptr i8, %base, %counter` (or %counter.next) 
// where base is loop-invariant. Collect them and verify the counter // has no OTHER uses outside this pattern. SmallVector GEPs; for (User *U : Counter->users()) { if (U == CounterNext) continue; auto *GEP = dyn_cast(U); if (!GEP) return false; if (GEP->getNumIndices() != 1) return false; if (GEP->getOperand(1) != Counter) return false; Value *Base = GEP->getPointerOperand(); // base must be loop-invariant. Instructions inside the loop fail; // arguments and globals are always invariant. if (auto *BaseI = dyn_cast(Base)) if (L->contains(BaseI)) return false; if (!Base->getType()->isPointerTy()) return false; // Only handle the i8 element type (byte stride). Other strides // would need different ptr-PHI step values. if (!GEP->getSourceElementType()->isIntegerTy(8)) return false; GEPs.push_back(GEP); } // Also accept if CounterNext is used as a GEP index (sometimes LSR // uses the post-increment value). Walk those too. for (User *U : CounterNext->users()) { if (U == Counter) continue; auto *GEP = dyn_cast(U); if (GEP) { // Bail if CounterNext is used as a GEP index — we'd need to add // a +1 offset to the new pointer PHI to match. Keep this simple // for now: only handle uses of Counter, not CounterNext. if (GEP->getNumIndices() == 1 && GEP->getOperand(1) == CounterNext) return false; } // Allow icmp / branch / other non-GEP uses of CounterNext — those // are the loop's exit test, fine to leave alone. } if (GEPs.empty()) return false; // For each unique base, build a pointer PHI. LLVMContext &Ctx = Header->getContext(); Type *I8 = Type::getInt8Ty(Ctx); DenseMap BasePhis; for (GetElementPtrInst *GEP : GEPs) { Value *Base = GEP->getPointerOperand(); if (BasePhis.count(Base)) continue; IRBuilder<> B(&Header->front()); PHINode *PtrPHI = B.CreatePHI(Base->getType(), 2, "unlsr.ptr"); PtrPHI->addIncoming(Base, Preheader); // Build the step GEP in the latch (just before terminator). 
IRBuilder<> BL(Latch->getTerminator()); Value *PtrNext = BL.CreateGEP(I8, PtrPHI, ConstantInt::get(Type::getInt16Ty(Ctx), 1), "unlsr.ptr.next"); PtrPHI->addIncoming(PtrNext, Latch); BasePhis[Base] = PtrPHI; } // Replace each GEP's uses with the corresponding pointer PHI. for (GetElementPtrInst *GEP : GEPs) { GEP->replaceAllUsesWith(BasePhis[GEP->getPointerOperand()]); } // Erase the now-dead GEPs. for (GetElementPtrInst *GEP : GEPs) { if (GEP->use_empty()) GEP->eraseFromParent(); } // If counter has no other uses (besides CounterNext and the latch // incoming), eliminate it. CounterNext might still be used by the // exit test — leave that alone. bool counterDead = true; for (User *U : Counter->users()) { if (U == CounterNext) continue; counterDead = false; break; } if (counterDead) { // CounterNext might be used by other PHIs / icmp. Don't erase if so. bool counterNextHasOtherUses = false; for (User *U : CounterNext->users()) { if (U == Counter) continue; counterNextHasOtherUses = true; break; } if (!counterNextHasOtherUses) { Type *IntT = Counter->getType(); cast(CounterNext)->replaceAllUsesWith( UndefValue::get(IntT)); Counter->replaceAllUsesWith(UndefValue::get(IntT)); cast(CounterNext)->eraseFromParent(); Counter->eraseFromParent(); } } return true; } bool W65816UnLSR::processLoop(Loop *L) { BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return false; // Single-block loops are fine (header == latch). // Find a pointer PHI whose initial value is a GlobalAddress and whose // latch-incoming is a constant-stride GEP off itself. SmallVector Candidates; for (PHINode &PN : Header->phis()) { if (!PN.getType()->isPointerTy()) continue; if (PN.getNumIncomingValues() != 2) continue; Candidates.push_back(&PN); } bool Changed = false; for (PHINode *PtrPHI : Candidates) { // Identify which incoming is from the preheader vs the latch. 
Value *InitVal = nullptr; Value *StepVal = nullptr; for (unsigned i = 0; i < PtrPHI->getNumIncomingValues(); ++i) { BasicBlock *Pred = PtrPHI->getIncomingBlock(i); Value *Inc = PtrPHI->getIncomingValue(i); if (L->contains(Pred)) StepVal = Inc; else InitVal = Inc; } if (!InitVal || !StepVal) continue; // InitVal must be a GlobalAddress (GlobalVariable). Could relax to // any constant base, but globals are the only case our combine // handles correctly. auto *InitGV = dyn_cast(InitVal); if (!InitGV) continue; // StepVal must be a `getelementptr i8, %PtrPHI, ConstantInt`. auto *StepGEP = dyn_cast(StepVal); if (!StepGEP) continue; if (StepGEP->getPointerOperand() != PtrPHI) continue; if (StepGEP->getNumIndices() != 1) continue; auto *StrideCI = dyn_cast(StepGEP->getOperand(1)); if (!StrideCI) continue; int64_t StrideBytes = StrideCI->getSExtValue(); if (StrideBytes <= 0) continue; // Determine element type for the indexed GEP. Walk all loads/ // stores through PtrPHI; require they all access the same primitive // type whose size equals StrideBytes (typical: i16 array → stride 2). Type *ElemTy = nullptr; SmallVector Accesses; bool ok = true; for (User *U : PtrPHI->users()) { if (U == StepGEP) continue; if (auto *LD = dyn_cast(U)) { if (LD->getPointerOperand() != PtrPHI) { ok = false; break; } Type *T = LD->getType(); if (!T->isIntegerTy()) { ok = false; break; } if (!ElemTy) ElemTy = T; else if (ElemTy != T) { ok = false; break; } Accesses.push_back(LD); continue; } if (auto *ST = dyn_cast(U)) { if (ST->getPointerOperand() != PtrPHI) { ok = false; break; } Type *T = ST->getValueOperand()->getType(); if (!T->isIntegerTy()) { ok = false; break; } if (!ElemTy) ElemTy = T; else if (ElemTy != T) { ok = false; break; } Accesses.push_back(ST); continue; } // Any other use → bail. 
ok = false; break; } if (!ok || !ElemTy || Accesses.empty()) continue; Module &M = *Header->getModule(); const DataLayout &DL = M.getDataLayout(); uint64_t ElemBytes = DL.getTypeAllocSize(ElemTy); if ((int64_t)ElemBytes != StrideBytes) continue; // We have a clean global-array pointer-walk. Build a forward // counter PHI starting at 0, incrementing by 1. Replace the // pointer-PHI's loads/stores with `GEP ElemTy, @global, %i`. LLVMContext &Ctx = Header->getContext(); IntegerType *I16 = Type::getInt16Ty(Ctx); IRBuilder<> B(&Header->front()); PHINode *Counter = B.CreatePHI(I16, 2, "unlsr.i"); BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) continue; Counter->addIncoming(ConstantInt::get(I16, 0), Preheader); // Build the counter increment in the latch, just before the // terminator. IRBuilder<> BL(Latch->getTerminator()); Value *CounterNext = BL.CreateAdd(Counter, ConstantInt::get(I16, 1), "unlsr.i.next"); Counter->addIncoming(CounterNext, Latch); // Replace each load/store with a GEP-then-load/store. for (Instruction *Acc : Accesses) { IRBuilder<> BA(Acc); Value *Idx = Counter; // GEP type is i16 elem, idx is i16. Value *Addr = BA.CreateGEP(ElemTy, InitGV, Idx, "unlsr.addr"); if (auto *LD = dyn_cast(Acc)) { LD->setOperand(LoadInst::getPointerOperandIndex(), Addr); } else if (auto *ST = dyn_cast(Acc)) { ST->setOperand(StoreInst::getPointerOperandIndex(), Addr); } } // Erase the now-unused step GEP and pointer PHI. if (StepGEP->use_empty()) StepGEP->eraseFromParent(); if (PtrPHI->use_empty()) PtrPHI->eraseFromParent(); // Try to eliminate the count-down LSR counter (`%lsr.iv = phi i16 // [%n, preheader], [%lsr.iv.next, latch]; %lsr.iv.next = add %lsr.iv, // -1; %exitcond = icmp eq %lsr.iv.next, 0`). Replace the exit // comparison with `icmp eq %unlsr.i.next, %n`, then erase the // count-down chain. Saves one i16 PHI + dec + cmp per loop. 
for (PHINode &LSR : Header->phis()) { if (&LSR == Counter) continue; if (!LSR.getType()->isIntegerTy(16)) continue; if (LSR.getNumIncomingValues() != 2) continue; // Identify preheader vs latch incomings. Value *LsrInit = nullptr; Value *LsrStep = nullptr; for (unsigned i = 0; i < LSR.getNumIncomingValues(); ++i) { BasicBlock *Pred = LSR.getIncomingBlock(i); if (L->contains(Pred)) LsrStep = LSR.getIncomingValue(i); else LsrInit = LSR.getIncomingValue(i); } if (!LsrInit || !LsrStep) continue; // %lsr.iv.next must be `add %lsr.iv, -1`. auto *LsrStepBO = dyn_cast(LsrStep); if (!LsrStepBO || LsrStepBO->getOpcode() != Instruction::Add) continue; Value *OtherOp = nullptr; if (LsrStepBO->getOperand(0) == &LSR) OtherOp = LsrStepBO->getOperand(1); else if (LsrStepBO->getOperand(1) == &LSR) OtherOp = LsrStepBO->getOperand(0); if (!OtherOp) continue; auto *DecCI = dyn_cast(OtherOp); if (!DecCI || !DecCI->isMinusOne()) continue; // %lsr.iv.next typically has 2 uses: the icmp exit comparison and // the PHI back-edge (LSR.users includes LsrStepBO as the latch // incoming). Allow that; find the icmp among the uses. 
ICmpInst *ExitCmp = nullptr; bool extraUse = false; for (User *U : LsrStepBO->users()) { if (U == &LSR) continue; if (auto *IC = dyn_cast(U)) { if (ExitCmp) { extraUse = true; break; } ExitCmp = IC; continue; } extraUse = true; break; } if (extraUse || !ExitCmp) continue; if (ExitCmp->getPredicate() != ICmpInst::ICMP_EQ && ExitCmp->getPredicate() != ICmpInst::ICMP_NE) continue; Value *CmpOther = nullptr; if (ExitCmp->getOperand(0) == LsrStepBO) CmpOther = ExitCmp->getOperand(1); else if (ExitCmp->getOperand(1) == LsrStepBO) CmpOther = ExitCmp->getOperand(0); if (!CmpOther) continue; auto *CmpZero = dyn_cast(CmpOther); if (!CmpZero || !CmpZero->isZero()) continue; bool lsrOnlyIncrUse = true; for (User *U : LSR.users()) { if (U == LsrStepBO) continue; lsrOnlyIncrUse = false; break; } if (!lsrOnlyIncrUse) continue; // CounterNext was inserted just before the latch terminator, but // ExitCmp may live higher in the block (originally placed there // by LSR). Move CounterNext to just before ExitCmp so it // dominates the cmp's use. if (auto *CounterNextI = dyn_cast(CounterNext)) CounterNextI->moveBefore(ExitCmp->getIterator()); // Replace exit comparison: icmp eq %unlsr.i.next, %lsrInit. ExitCmp->setOperand(0, CounterNext); ExitCmp->setOperand(1, LsrInit); // RAUW each value with undef before erasing — they have a mutual // reference (PHI → BO via latch-incoming, BO → PHI via add operand). Type *I16T = LSR.getType(); LSR.replaceAllUsesWith(UndefValue::get(I16T)); LsrStepBO->replaceAllUsesWith(UndefValue::get(I16T)); LsrStepBO->eraseFromParent(); LSR.eraseFromParent(); break; } Changed = true; } return Changed; }