65816-llvm-mos/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp

//===-- W65816NarrowI32Mul.cpp - Narrow i32 multiplies -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// IR-level peephole.  Detects `mul i32 X, Y` where both X and Y have
// their top 16 bits provably zero (via LLVM's IR-level computeKnownBits,
// with ScalarEvolution's unsigned-range analysis as a fallback) and
// rewrites it to a call to `__umulhisi3` — a 16x16 -> 32 unsigned
// multiply (~30% faster than __mulsi3 for `(u32)i * i` patterns).
//
// Why an IR pass instead of a Custom SDAG lowering: LLVM's IndVarSimplify
// loop pass widens narrow induction variables (e.g. an i16 loop counter
// later zext'd to i32) into i32 PHIs.  By SDAG-build time the zext is
// gone — the MUL's operand is just `CopyFromReg %2:i32`, an opaque value.
// SDAG's computeKnownBits can't trace back across BB boundaries through
// CopyFromReg.  IR-level computeKnownBits, by contrast, walks the use-def
// graph (including PHIs) and can prove the high bits zero.
//
// Runs in addISelPrepare (right before SDAG-ISel) so it sees the
// final-shape IR.  The libcall declaration is auto-added if missing.
//
//===---------------------------------------------------------------------===//

#include "W65816.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/InitializePasses.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

#define DEBUG_TYPE "w65816-narrow-i32-mul"


namespace {


class W65816NarrowI32Mul : public FunctionPass {
public:
  static char ID;
  W65816NarrowI32Mul() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "W65816 narrow i32 multiplies to __umulhisi3";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.setPreservesCFG();
  }

  bool runOnFunction(Function &F) override;
};


} // namespace


// Out-of-class definition of the pass's static ID member, which the
// constructor passes to the FunctionPass base class.
char W65816NarrowI32Mul::ID = 0;

// Pass registration.  The pass is registered under DEBUG_TYPE
// ("w65816-narrow-i32-mul") and lists ScalarEvolutionWrapperPass as a
// dependency so that it is initialized along with this pass.  The two
// trailing `false` arguments are the macro's "CFG-only" and "is analysis"
// flags: this pass is neither.
INITIALIZE_PASS_BEGIN(W65816NarrowI32Mul, DEBUG_TYPE,
                      "W65816 narrow i32 multiplies", false, false)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(W65816NarrowI32Mul, DEBUG_TYPE,
                    "W65816 narrow i32 multiplies", false, false)


// Looks up `__umulhisi3` in module M, inserting a declaration if the
// module does not have one yet.  The requested signature is
// `i32 __umulhisi3(i16, i16)`, non-variadic.  Returns the callee handle
// to use when building calls to the helper.
static FunctionCallee getUmulhisi3(Module &M) {
  LLVMContext &Context = M.getContext();
  Type *const HalfWordTy = Type::getInt16Ty(Context);
  FunctionType *const Signature =
      FunctionType::get(Type::getInt32Ty(Context), {HalfWordTy, HalfWordTy},
                        /*isVarArg=*/false);
  return M.getOrInsertFunction("__umulhisi3", Signature);
}


// Decides whether V can be proven to have its top 16 bits clear.
//
// Two proofs are attempted, cheapest first:
//   1. IR-level computeKnownBits: succeeds when at least 16 leading bits
//      are known zero.
//   2. ScalarEvolution's unsigned range for V: succeeds when every value
//      in the range needs at most 16 active bits.  This covers
//      loop-bounded induction variables that KnownBits cannot prove.
//
// The two conditions mean the same thing only for a 32-bit V, which is
// what the caller in this file passes (operands of a `mul i32`).
// Returns false when neither proof succeeds.
static bool top16Zero(Value *V, const DataLayout &DL, ScalarEvolution &SE) {
  const KnownBits Known = computeKnownBits(V, DL);
  const bool ProvenByKnownBits = Known.countMinLeadingZeros() >= 16;
  if (ProvenByKnownBits) {
    return true;
  }

  // Fall back to SCEV only for types it can reason about.
  if (SE.isSCEVable(V->getType())) {
    const ConstantRange UnsignedRange = SE.getUnsignedRange(SE.getSCEV(V));
    return UnsignedRange.getActiveBits() <= 16;
  }
  return false;
}


// Rewrites every `mul i32 X, Y` in F whose operands both have their top
// 16 bits provably zero into
//     call i32 @__umulhisi3(i16 trunc(X), i16 trunc(Y))
//
// The work is done in two phases: candidates are collected first, while
// the IR is still unmodified, and only then rewritten.  This keeps the
// instruction iterator valid and ensures the analysis queries see the
// original IR.
//
// Returns true iff at least one multiply was replaced.
bool W65816NarrowI32Mul::runOnFunction(Function &F) {
  // This is purely an optimisation, so honour `optnone` and opt-bisect
  // like other optional codegen IR passes do.
  if (skipFunction(F)) {
    return false;
  }

  // Never rewrite inside the helper itself.  If `__umulhisi3` is
  // implemented in C as a widening multiply, turning that multiply into
  // a call to `__umulhisi3` would make the helper recurse forever.
  if (F.getName() == "__umulhisi3") {
    return false;
  }

  Module *M = F.getParent();
  const DataLayout &DL = M->getDataLayout();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

  // Phase 1: collect i32 multiplies whose operands both fit in 16 bits.
  SmallVector<BinaryOperator *, 8> Worklist;
  for (Instruction &I : instructions(F)) {
    auto *BO = dyn_cast<BinaryOperator>(&I);
    if (!BO || BO->getOpcode() != Instruction::Mul) {
      continue;
    }
    if (!BO->getType()->isIntegerTy(32)) {
      continue;
    }
    if (!top16Zero(BO->getOperand(0), DL, SE) ||
        !top16Zero(BO->getOperand(1), DL, SE)) {
      continue;
    }
    Worklist.push_back(BO);
  }

  if (Worklist.empty()) {
    return false;
  }

  // Phase 2: rewrite.  The helper is declared lazily so that modules
  // without any candidate never gain a `__umulhisi3` declaration.
  Type *I16 = Type::getInt16Ty(F.getContext());
  FunctionCallee Callee = getUmulhisi3(*M);
  for (BinaryOperator *BO : Worklist) {
    // Building at BO places the new instructions immediately before it.
    // Operands are read here rather than in phase 1 so that an operand
    // which was itself a rewritten multiply is seen as its replacement.
    IRBuilder<> B(BO);
    Value *A = B.CreateTrunc(BO->getOperand(0), I16);
    Value *Bv = B.CreateTrunc(BO->getOperand(1), I16);
    CallInst *Call = B.CreateCall(Callee, {A, Bv});
    // Keep the original value name so IR dumps stay readable.
    Call->takeName(BO);
    BO->replaceAllUsesWith(Call);
    BO->eraseFromParent();
  }
  return true;
}


// Factory for the pass: returns a freshly heap-allocated
// W65816NarrowI32Mul as a FunctionPass pointer.  This function does not
// keep the pointer.
FunctionPass *llvm::createW65816NarrowI32Mul() {
  return new W65816NarrowI32Mul;
}