150 lines
4.7 KiB
C++
150 lines
4.7 KiB
C++
//===-- W65816NarrowI32Mul.cpp - Narrow i32 multiplies -------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===---------------------------------------------------------------------===//
|
|
//
|
|
// IR-level peephole. Detects `mul i32 X, Y` where both X and Y have
// their top 16 bits provably zero (via LLVM's IR-level computeKnownBits,
// with a ScalarEvolution unsigned-range fallback) and rewrites it to a
// call to `__umulhisi3` — a 16x16 -> 32 unsigned multiply (~30% faster
// than __mulsi3 for `(u32)i * i` patterns).
|
|
//
|
|
// Why an IR pass instead of a Custom SDAG lowering: LLVM's IndVarSimplify
|
|
// loop pass widens narrow induction variables (e.g. an i16 loop counter
|
|
// later zext'd to i32) into i32 PHIs. By SDAG-build time the zext is
|
|
// gone — the MUL's operand is just `CopyFromReg %2:i32`, an opaque value.
|
|
// SDAG's computeKnownBits can't trace back across BB boundaries through
|
|
// CopyFromReg. IR-level computeKnownBits, by contrast, walks the use-def
|
|
// graph (including PHIs) and can prove the high bits zero.
|
|
//
|
|
// Runs in addISelPrepare (right before SDAG-ISel) so it sees the
|
|
// final-shape IR. The libcall declaration is auto-added if missing.
|
|
//
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
#include "W65816.h"
|
|
#include "llvm/Analysis/LoopInfo.h"
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InstIterator.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/KnownBits.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "w65816-narrow-i32-mul"
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
class W65816NarrowI32Mul : public FunctionPass {
|
|
public:
|
|
static char ID;
|
|
W65816NarrowI32Mul() : FunctionPass(ID) {}
|
|
|
|
StringRef getPassName() const override {
|
|
return "W65816 narrow i32 multiplies to __umulhisi3";
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<ScalarEvolutionWrapperPass>();
|
|
AU.setPreservesCFG();
|
|
}
|
|
|
|
bool runOnFunction(Function &F) override;
|
|
};
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
char W65816NarrowI32Mul::ID = 0;
|
|
|
|
INITIALIZE_PASS_BEGIN(W65816NarrowI32Mul, DEBUG_TYPE,
|
|
"W65816 narrow i32 multiplies", false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
|
|
INITIALIZE_PASS_END(W65816NarrowI32Mul, DEBUG_TYPE,
|
|
"W65816 narrow i32 multiplies", false, false)
|
|
|
|
|
|
// Get-or-declare `__umulhisi3(i16, i16) -> i32` in the module.
|
|
static FunctionCallee getUmulhisi3(Module &M) {
|
|
LLVMContext &Ctx = M.getContext();
|
|
Type *I16 = Type::getInt16Ty(Ctx);
|
|
Type *I32 = Type::getInt32Ty(Ctx);
|
|
FunctionType *FT = FunctionType::get(I32, {I16, I16}, false);
|
|
return M.getOrInsertFunction("__umulhisi3", FT);
|
|
}
|
|
|
|
|
|
// True iff the top 16 bits of V are known zero. Tries IR-level
|
|
// computeKnownBits first; if that doesn't prove enough, falls back
|
|
// to ScalarEvolution's unsigned-range analysis (which handles
|
|
// loop-bounded induction variables that KnownBits can't).
|
|
static bool top16Zero(Value *V, const DataLayout &DL, ScalarEvolution &SE) {
|
|
KnownBits K = computeKnownBits(V, DL);
|
|
if (K.countMinLeadingZeros() >= 16) {
|
|
return true;
|
|
}
|
|
if (!SE.isSCEVable(V->getType())) {
|
|
return false;
|
|
}
|
|
const SCEV *S = SE.getSCEV(V);
|
|
ConstantRange R = SE.getUnsignedRange(S);
|
|
return R.getActiveBits() <= 16;
|
|
}
|
|
|
|
|
|
// Rewrites every `mul i32` in F whose two operands both have their top 16
// bits provably zero into `call i32 @__umulhisi3(i16, i16)` on the truncated
// operands. Returns true iff at least one multiply was replaced.
bool W65816NarrowI32Mul::runOnFunction(Function &F) {
  Module *M = F.getParent();
  const DataLayout &DL = M->getDataLayout();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

  // Collect candidates before touching the IR: erasing instructions while
  // walking instructions(F) would invalidate the iterator.
  SmallVector<BinaryOperator *, 8> Worklist;
  for (Instruction &I : instructions(F)) {
    auto *BO = dyn_cast<BinaryOperator>(&I);
    if (!BO || BO->getOpcode() != Instruction::Mul)
      continue;
    if (!BO->getType()->isIntegerTy(32))
      continue;
    // Both operands must fit in 16 bits for the 16x16 -> 32 libcall to
    // compute the same product.
    if (!top16Zero(BO->getOperand(0), DL, SE) ||
        !top16Zero(BO->getOperand(1), DL, SE))
      continue;
    Worklist.push_back(BO);
  }

  if (Worklist.empty())
    return false;

  FunctionCallee Callee = getUmulhisi3(*M);

  // Never rewrite the libcall's own body. A C implementation of
  // `__umulhisi3` is itself a multiply of two zero-extended i16 values, so
  // it matches the pattern above; replacing that multiply would make the
  // function call itself forever. When F is the libcall, the lookup above
  // returned F itself and inserted nothing, so the module is unchanged.
  if (Callee.getCallee()->stripPointerCasts() == &F)
    return false;

  Type *I16 = Type::getInt16Ty(F.getContext());
  for (BinaryOperator *BO : Worklist) {
    // Insert the replacement immediately before the multiply it replaces.
    IRBuilder<> B(BO);
    Value *A = B.CreateTrunc(BO->getOperand(0), I16);
    Value *Bv = B.CreateTrunc(BO->getOperand(1), I16);
    Value *Call = B.CreateCall(Callee, {A, Bv});
    BO->replaceAllUsesWith(Call);
    BO->eraseFromParent();
  }
  return true;
}
|
|
|
|
|
|
// Factory for the pass: heap-allocates a new W65816NarrowI32Mul and returns
// it through the generic FunctionPass interface.
FunctionPass *llvm::createW65816NarrowI32Mul() {
  FunctionPass *NarrowMulPass = new W65816NarrowI32Mul();
  return NarrowMulPass;
}
|