//===-- W65816NarrowI32Mul.cpp - Narrow i32 multiplies -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// IR-level peephole. Detects `mul i32 X, Y` where both X and Y have
// their top 16 bits provably zero (via LLVM's IR-level computeKnownBits,
// falling back to ScalarEvolution's unsigned range) and rewrites to a
// call to `__umulhisi3` — a 16x16 -> 32 unsigned multiply (~30% faster
// than __mulsi3 for `(u32)i * i` patterns).
//
// Why an IR pass instead of a Custom SDAG lowering: LLVM's IndVarSimplify
// loop pass widens narrow induction variables (e.g. an i16 loop counter
// later zext'd to i32) into i32 PHIs. By SDAG-build time the zext is
// gone — the MUL's operand is just `CopyFromReg %2:i32`, an opaque value.
// SDAG's computeKnownBits can't trace back across BB boundaries through
// CopyFromReg. IR-level computeKnownBits, by contrast, walks the use-def
// graph (including PHIs) and can prove the high bits zero.
//
// Runs in addISelPrepare (right before SDAG-ISel) so it sees the
// final-shape IR. The libcall declaration is auto-added if missing.
// //===---------------------------------------------------------------------===// #include "W65816.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/InitializePasses.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" using namespace llvm; #define DEBUG_TYPE "w65816-narrow-i32-mul" namespace { class W65816NarrowI32Mul : public FunctionPass { public: static char ID; W65816NarrowI32Mul() : FunctionPass(ID) {} StringRef getPassName() const override { return "W65816 narrow i32 multiplies to __umulhisi3"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; }; } // namespace char W65816NarrowI32Mul::ID = 0; INITIALIZE_PASS_BEGIN(W65816NarrowI32Mul, DEBUG_TYPE, "W65816 narrow i32 multiplies", false, false) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(W65816NarrowI32Mul, DEBUG_TYPE, "W65816 narrow i32 multiplies", false, false) // Get-or-declare `__umulhisi3(i16, i16) -> i32` in the module. static FunctionCallee getUmulhisi3(Module &M) { LLVMContext &Ctx = M.getContext(); Type *I16 = Type::getInt16Ty(Ctx); Type *I32 = Type::getInt32Ty(Ctx); FunctionType *FT = FunctionType::get(I32, {I16, I16}, false); return M.getOrInsertFunction("__umulhisi3", FT); } // True iff the top 16 bits of V are known zero. Tries IR-level // computeKnownBits first; if that doesn't prove enough, falls back // to ScalarEvolution's unsigned-range analysis (which handles // loop-bounded induction variables that KnownBits can't). 
static bool top16Zero(Value *V, const DataLayout &DL, ScalarEvolution &SE) { KnownBits K = computeKnownBits(V, DL); if (K.countMinLeadingZeros() >= 16) { return true; } if (!SE.isSCEVable(V->getType())) { return false; } const SCEV *S = SE.getSCEV(V); ConstantRange R = SE.getUnsignedRange(S); return R.getActiveBits() <= 16; } bool W65816NarrowI32Mul::runOnFunction(Function &F) { Module *M = F.getParent(); const DataLayout &DL = M->getDataLayout(); Type *I16 = Type::getInt16Ty(F.getContext()); ScalarEvolution &SE = getAnalysis().getSE(); SmallVector Worklist; for (Instruction &I : instructions(F)) { auto *BO = dyn_cast(&I); if (!BO || BO->getOpcode() != Instruction::Mul) { continue; } if (!BO->getType()->isIntegerTy(32)) { continue; } if (!top16Zero(BO->getOperand(0), DL, SE)) { continue; } if (!top16Zero(BO->getOperand(1), DL, SE)) { continue; } Worklist.push_back(BO); } if (Worklist.empty()) { return false; } FunctionCallee Callee = getUmulhisi3(*M); for (BinaryOperator *BO : Worklist) { IRBuilder<> B(BO); Value *A = B.CreateTrunc(BO->getOperand(0), I16); Value *Bv = B.CreateTrunc(BO->getOperand(1), I16); Value *Call = B.CreateCall(Callee, {A, Bv}); BO->replaceAllUsesWith(Call); BO->eraseFromParent(); } return true; } FunctionPass *llvm::createW65816NarrowI32Mul() { return new W65816NarrowI32Mul(); }