65816-llvm-mos/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp
Scott Duensing e65fedc8e1 Checkpoint
2026-05-13 15:48:34 -05:00

150 lines
4.7 KiB
C++

//===-- W65816NarrowI32Mul.cpp - Narrow i32 multiplies -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// IR-level peephole. Detects `mul i32 X, Y` where both X and Y have
// their top 16 bits provably zero (via LLVM's IR-level computeKnownBits)
// and rewrites to a call to `__umulhisi3` — a 16x16 -> 32 unsigned
// multiply (~30% faster than __mulsi3 for `(u32)i * i` patterns).
//
// Why an IR pass instead of a Custom SDAG lowering: LLVM's IndVarSimplify
// loop pass widens narrow induction variables (e.g. an i16 loop counter
// later zext'd to i32) into i32 PHIs. By SDAG-build time the zext is
// gone — the MUL's operand is just `CopyFromReg %2:i32`, an opaque value.
// SDAG's computeKnownBits can't trace back across BB boundaries through
// CopyFromReg. IR-level computeKnownBits, by contrast, walks the use-def
// graph (including PHIs) and can prove the high bits zero.
//
// Runs in addISelPrepare (right before SDAG-ISel) so it sees the
// final-shape IR. The libcall declaration is auto-added if missing.
//
//===---------------------------------------------------------------------===//
#include "W65816.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/InitializePasses.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-narrow-i32-mul"
namespace {
class W65816NarrowI32Mul : public FunctionPass {
public:
static char ID;
W65816NarrowI32Mul() : FunctionPass(ID) {}
StringRef getPassName() const override {
return "W65816 narrow i32 multiplies to __umulhisi3";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.setPreservesCFG();
}
bool runOnFunction(Function &F) override;
};
} // namespace
char W65816NarrowI32Mul::ID = 0;
INITIALIZE_PASS_BEGIN(W65816NarrowI32Mul, DEBUG_TYPE,
"W65816 narrow i32 multiplies", false, false)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(W65816NarrowI32Mul, DEBUG_TYPE,
"W65816 narrow i32 multiplies", false, false)
// Get-or-declare `__umulhisi3(i16, i16) -> i32` in the module.
static FunctionCallee getUmulhisi3(Module &M) {
LLVMContext &Ctx = M.getContext();
Type *I16 = Type::getInt16Ty(Ctx);
Type *I32 = Type::getInt32Ty(Ctx);
FunctionType *FT = FunctionType::get(I32, {I16, I16}, false);
return M.getOrInsertFunction("__umulhisi3", FT);
}
// True iff the top 16 bits of V are known zero. Tries IR-level
// computeKnownBits first; if that doesn't prove enough, falls back
// to ScalarEvolution's unsigned-range analysis (which handles
// loop-bounded induction variables that KnownBits can't).
static bool top16Zero(Value *V, const DataLayout &DL, ScalarEvolution &SE) {
KnownBits K = computeKnownBits(V, DL);
if (K.countMinLeadingZeros() >= 16) {
return true;
}
if (!SE.isSCEVable(V->getType())) {
return false;
}
const SCEV *S = SE.getSCEV(V);
ConstantRange R = SE.getUnsignedRange(S);
return R.getActiveBits() <= 16;
}
bool W65816NarrowI32Mul::runOnFunction(Function &F) {
Module *M = F.getParent();
const DataLayout &DL = M->getDataLayout();
Type *I16 = Type::getInt16Ty(F.getContext());
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
SmallVector<BinaryOperator *, 8> Worklist;
for (Instruction &I : instructions(F)) {
auto *BO = dyn_cast<BinaryOperator>(&I);
if (!BO || BO->getOpcode() != Instruction::Mul) {
continue;
}
if (!BO->getType()->isIntegerTy(32)) {
continue;
}
if (!top16Zero(BO->getOperand(0), DL, SE)) {
continue;
}
if (!top16Zero(BO->getOperand(1), DL, SE)) {
continue;
}
Worklist.push_back(BO);
}
if (Worklist.empty()) {
return false;
}
FunctionCallee Callee = getUmulhisi3(*M);
for (BinaryOperator *BO : Worklist) {
IRBuilder<> B(BO);
Value *A = B.CreateTrunc(BO->getOperand(0), I16);
Value *Bv = B.CreateTrunc(BO->getOperand(1), I16);
Value *Call = B.CreateCall(Callee, {A, Bv});
BO->replaceAllUsesWith(Call);
BO->eraseFromParent();
}
return true;
}
FunctionPass *llvm::createW65816NarrowI32Mul() {
return new W65816NarrowI32Mul();
}