// 65816-llvm-mos/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp

//===-- W65816TargetMachine.cpp - Define TargetMachine for W65816 ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the W65816 target.
//
//===----------------------------------------------------------------------===//

#include "W65816TargetMachine.h"
#include "W65816.h"
#include "W65816MachineFunctionInfo.h"
#include "TargetInfo/W65816TargetInfo.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include <optional>

using namespace llvm;

// Data layout for the 65816 lives in Triple::computeDataLayout via
// patches/0005-target-data-layout-w65816.patch.  The string is:
//   e       - little endian
//   m:e     - ELF-style symbol mangling
//   p:32:16 - 32-bit pointers (lo16 + hi-bank), 16-bit alignment
//   i16:16  - 16-bit integers aligned to 16 bits
//   i32:16  - 32-bit integers aligned to 16 bits
//   a:8     - aggregate types get 8-bit (1-byte) ABI alignment
//   n8:16   - native integer widths
//   S8      - 1-byte natural stack alignment.  JSL's 3-byte ret-addr
//             push means SP is never reliably 2-aligned inside a
//             callee; the older S16 caused SDAG to fold &buf[1] to
//             buf | 1, which breaks for odd-aligned stack locals.

/// Entry point used to initialize the W65816 backend.
///
/// Registers W65816TargetMachine for the W65816 target, initializes the
/// target's passes in the global PassRegistry, and installs a target-specific
/// default for the `replexitval` command-line option (see below).
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeW65816Target() {
  RegisterTargetMachine<W65816TargetMachine> X(getTheW65816Target());
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeW65816AsmPrinterPass(PR);
  initializeW65816DAGToDAGISelLegacyPass(PR);
  initializeW65816StackSlotCleanupPass(PR);
  initializeW65816ABridgeViaXPass(PR);
  initializeW65816WidenAcc16Pass(PR);
  initializeW65816SpillToXPass(PR);
  initializeW65816NegYIndYPass(PR);
  initializeW65816PreSpillCrossCallPass(PR);
  initializeW65816SjLjFinalizePass(PR);
  initializeW65816LowerWide32Pass(PR);
  initializeW65816I32IncFoldPass(PR);
  initializeW65816ImgCalleeSavePass(PR);
  initializeW65816NarrowI32MulPass(PR);
  initializeW65816PromoteFiToImgPass(PR);
  initializeW65816StackSlotMergePass(PR);

  // Default IndVarSimplify's exit-value rewriter to "never".  The
  // closed-form replacement frequently widens an i16 induction var
  // expression to i64 to avoid overflow proofs, then lowers the
  // multiply to __muldi3.  On a 16-bit target the libcall costs
  // dramatically more than the natural loop it replaces — sumOfSquares
  // shrinks from 335B (with __muldi3) to 128B (with __mulsi3 in the
  // loop) just by suppressing this rewrite, with no other benchmark
  // affected.  We do this by name through the cl::opt registry so
  // it doesn't require patching upstream llvm-mos.
  //
  // Only install the default while the option has not been seen yet.
  // If the command line was parsed before this initializer ran, or the
  // initializer runs a second time, an unconditional addOccurrence
  // would overwrite an explicit user-supplied -replexitval value.
  auto &Opts = cl::getRegisteredOptions();
  if (auto *Opt = Opts.lookup("replexitval")) {
    if (Opt->getNumOccurrences() == 0)
      Opt->addOccurrence(0, "replexitval", "never");
  }
}

/// Resolve the relocation model to use: the caller's explicit choice when one
/// was supplied, otherwise Reloc::Static.
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  if (RM.has_value())
    return *RM;
  return Reloc::Static;
}

/// Build the W65816 target machine.
///
/// The data layout is taken from the triple, the relocation model falls back
/// to Reloc::Static when unspecified, and the code model defaults to
/// CodeModel::Small.  Object-file lowering uses the ELF implementation.  The
/// JIT flag is accepted to match the registry's constructor signature but is
/// not consulted here.
W65816TargetMachine::W65816TargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OL, bool JIT)
    : CodeGenTargetMachineImpl(
          T, TT.computeDataLayout(), TT, CPU, FS, Options,
          getEffectiveRelocModel(RM),
          getEffectiveCodeModel(CM, CodeModel::Small), OL),
      TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
      // The subtarget receives owning copies of the CPU / feature strings.
      Subtarget(TT, CPU.str(), FS.str(), *this) {
  initAsmInfo();
}

// Out-of-line defaulted destructor: nothing to do beyond destroying the
// members.
W65816TargetMachine::~W65816TargetMachine() = default;

namespace {

/// Pass-pipeline configuration for the W65816 code generator.  The hook
/// overrides declared here are defined further down in this file; only the
/// register-allocator choice is implemented inline.
class W65816PassConfig : public TargetPassConfig {
public:
  W65816PassConfig(W65816TargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  /// The owning target machine, already cast to the W65816 type.
  W65816TargetMachine &getW65816TargetMachine() const {
    return getTM<W65816TargetMachine>();
  }

  // IR preparation and instruction selection.
  void addISelPrepare() override;
  bool addInstSelector() override;

  // Machine-level pipeline hooks, listed in the order their names suggest
  // they run.
  void addMachineSSAOptimization() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreEmitPass() override;

  /// Choose the register allocator: FAST when \p Optimized is false
  /// (-O0 / optnone), GREEDY otherwise (-O1 and above).
  ///
  /// Why GREEDY is usable now: it used to abort with "Def isn't really
  /// dead" inside LiveRangeEdit::eliminateDeadDef.  InlineSpiller turned a
  /// redundant STAfi (Defs = [A]) into a KILL pseudo but marked only the
  /// explicit defs dead, so the implicit-def $a stayed live and was later
  /// deleted anyway.  The fix lives in
  /// tools/llvm-mos/llvm/lib/CodeGen/InlineSpiller.cpp, which now marks
  /// every def (explicit + implicit) dead.  Measured wins after switching:
  /// popcount −19.4%, strcpy −18.9%, memcmp −8.6%, bsearch −9.2%,
  /// fib(10) −2.6%.
  ///
  /// Why FAST is kept for unoptimized builds: GREEDY at -O0 left spurious
  /// COPY pseudos that lowered to STA dp / LDA dp pairs around
  /// modify-in-place ops (e.g. INA), which miscompiled a + 1.
  FunctionPass *createTargetRegisterAllocator(bool Optimized) override {
    if (!Optimized)
      return createFastRegisterAllocator();
    return createGreedyRegisterAllocator();
  }
};

} // namespace

/// Allocate the W65816 pass configuration bound to this target machine and
/// the given pass manager.  The object is heap-allocated here and handed back
/// as a raw pointer; presumably the caller assumes ownership (confirm against
/// the TargetMachine::createPassConfig contract).
TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
  TargetPassConfig *Config = new W65816PassConfig(*this, PM);
  return Config;
}

/// IR-level preparation ahead of instruction selection.  Two target passes
/// are queued first, followed by the stock TargetPassConfig ISel-prepare
/// passes.  The order of the three calls is significant.
void W65816PassConfig::addISelPrepare() {
  // SjLjEHPrepare ran in addPassesToHandleExceptions just before this;
  // our finalize pass inserts an actual setjmp at function entry +
  // a switch-on-call_site dispatch block, and erases the eh.sjlj.*
  // intrinsics our backend doesn't natively lower.  Must run BEFORE
  // the base ISelPrepare passes so isel sees the cleaned IR.
  addPass(createW65816SjLjFinalize());
  // IR-level peephole: narrow `mul i32 X, Y` to a __umulhisi3 call
  // when IR-level computeKnownBits proves the top 16 bits of both
  // operands are zero.  Catches the sumSquares-style `(u32)i * i`
  // pattern that SDAG-level analysis can't see across BB boundaries.
  addPass(createW65816NarrowI32Mul());
  // Generic ISel-prepare passes go last so they operate on IR that the
  // two target passes above have already rewritten.
  TargetPassConfig::addISelPrepare();
}

/// Machine-SSA optimization hook.  This override adds no passes of its own:
/// it forwards to the stock TargetPassConfig pipeline and is kept as the
/// place where the history notes below are recorded.
void W65816PassConfig::addMachineSSAOptimization() {
  // MachineCSE used to be disabled here because it incorrectly
  // eliminated "redundant" CMP instructions: P was considered
  // "available" but on this target P is clobbered by every
  // intervening LDA/STA/ADC.  The principled fix is to model
  // Uses=[P] on Bxx (so MachineCSE sees the dep) and let the
  // pass run normally — that landed in W65816InstrInfo.td.
  TargetPassConfig::addMachineSSAOptimization();

  // MachineBlockPlacement is now re-enabled.  Previously disabled
  // because W65816InstrInfo::analyzeBranch returned unanalyzable
  // unconditionally; we now decode the BRA / BRL / JMP_Abs uncond
  // direct-branch case (see W65816InstrInfo::analyzeBranch) which is
  // enough to satisfy MBP's fall-through assertion.  Conditional
  // branches stay opaque on purpose: their condition is encoded in
  // the OPCODE and the P-flag input must stay adjacent to a preceding
  // CMP, which BranchFolder doesn't know to preserve.
  //
  // Note: nothing in this function enables or disables
  // MachineBlockPlacement; the paragraph above only records why the
  // earlier disable was removed.
}

/// Target passes queued just ahead of register allocation.  Four passes are
/// added, in this order: LowerWide32, ABridgeViaX, TiedDefSpill, WidenAcc16.
/// PreSpillCrossCall is built but intentionally left out (see below).
void W65816PassConfig::addPreRegAlloc() {
  // Decompose Wide32 vregs (i32 register pairs) into pairs of i16 vregs
  // BEFORE the other Acc16-targeting pre-RA passes run.  Each later
  // pass walks Acc16/Idx16/Img16 vregs; running this first means they
  // see the decomposed halves uniformly.
  addPass(createW65816LowerWide32());
  addPass(createW65816ABridgeViaX());
  addPass(createW65816TiedDefSpill());
  addPass(createW65816WidenAcc16());
  // DISABLED: pre-spill cross-call Acc16 vregs in high-call functions
  // to relieve greedy regalloc pressure.  The first cut creates too
  // many fresh stack slots and overflows the stack-relative addressing
  // range (frame > 256 bytes) on moderately-sized functions like the
  // soft-double routines.
  // The pass is built and ready, gated behind future tuning of:
  //   - lower call-count threshold (currently 4)
  //   - smarter "should we spill THIS vreg" filter
  //   - stack slot reuse via a real liveness analysis
  // Until then, the high-pressure failure is worked around with
  // `__attribute__((noinline))` on the heaviest helper or with
  // `-mllvm -regalloc=fast` for the affected TU.
  // addPass(createW65816PreSpillCrossCall());
}

/// Target passes queued right after register allocation, in this order:
/// PromoteFiToImg, ImgCalleeSave, SpillToX, StackSlotCleanup, SpillToX again.
/// The hook also disables the generic MachineCopyPropagation pass for any
/// instance added after this point.
void W65816PassConfig::addPostRegAlloc() {
  // FI→IMG promotion runs FIRST.  It scans for high-traffic i16
  // FrameIndex slots (LDAfi/STAfi/ADCfi/etc.) and rewrites them to
  // STA_DP/LDA_DP/ADC_DP/... pointed at free IMG8..IMG15 DP slots.
  // The introduced IMG8..15 references are then picked up by
  // ImgCalleeSave to get prologue save + epilogue restore.  See
  // W65816PromoteFiToImg.cpp.
  addPass(createW65816PromoteFiToImg());
  // ImgCalleeSave detects IMG8..IMG15 usage post-regalloc and inserts
  // prologue save + epilogue restore so those slots act as callee-
  // saved at the asm level.  Fixes picol's `expr 1+2 == 4` bug:
  // high-pressure recursive double fns use IMG8..IMG15 as scratch but,
  // without this pass, expected them preserved across calls — and
  // callees were happy to clobber them.  See W65816ImgCalleeSave.cpp.
  addPass(createW65816ImgCalleeSave());
  // SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup
  // then deletes still-adjacent redundant spills.  A second SpillToX
  // invocation collapses any TAX/TXA pair left adjacent by cleanup
  // (e.g. when an inner copy between bridge endpoints went away).
  addPass(createW65816SpillToX());
  addPass(createW65816StackSlotCleanup());
  addPass(createW65816SpillToX());
  // Disable MachineCopyPropagation: it eliminates `COPY $img = $a`
  // thinking the IMG dest is dead (no explicit physreg use of $img
  // remains after PEI expands STAfi-with-Img16-source into LDA_DP).
  // The COPY actually expands to STA_DP $D0 — a memory store to a
  // DP slot that libcalls (softDouble, softFloat) ALSO use as their
  // own arg-save scratch.  When MCP drops the COPY, the subsequent
  // LDA_DP $D0 reads stale memory.  Caught by `g = g/x` Newton loop:
  // iter-1's saved x_ml at $D0 was never actually written, so iter-2
  // read garbage.  The principled fix would mark IMG-targeted COPYs
  // as memory-side-effecting, but TII doesn't expose that hook;
  // disabling MCP loses some optimization but is safe.
  //
  // NOTE(review): disablePass presumably only suppresses instances
  // that are added AFTER this call.  If the generic pipeline already
  // queued a MachineCopyPropagation run as part of optimized register
  // allocation, that earlier instance would still execute — confirm
  // that only the later (post-PEI) run needs to be suppressed.
  disablePass(&llvm::MachineCopyPropagationID);
}

/// Final target passes queued before emission.  Active order: SpillToX,
/// I32IncFold, BranchExpand, SepRepCleanup, StackSlotMerge.  NegYIndY is kept
/// in the tree but is not added (see below).
void W65816PassConfig::addPreEmitPass() {
  // SpillToX one more time: now that postrapseudos has expanded
  // physreg-COPY pseudos into the real TAX/TXA opcodes, adjacent
  // TXA;TAX pairs (which the earlier SpillToX invocations couldn't
  // see in COPY form) become collapsable.
  addPass(createW65816SpillToX());
  // DISABLED — W65816NegYIndY: rewrite negative-Y indirect-Y stack-rel
  // ops.  It was a workaround for the (sr,s),Y bank-wrap on negative-Y
  // indirect-stack-rel loads.  No current code emits
  // LDA_StackRelIndY / STA_StackRelIndY (pointer-deref now goes
  // through [$E0],Y indirect-long via the LDAptr / STAptr / STBptr
  // inserter, which forces the bank byte at $E2 to 0).  Pass left
  // in tree but disabled — re-enable if a new code path starts
  // emitting (sr,s),Y again.
  //
  // If re-enabled it must run BEFORE BranchExpand because the rewrite
  // expands one instruction into several and shifts branch distances.
  // The pass internally checks X-liveness and saves/restores X via
  // DP $E0 when SpillToX has a value parked there; without that
  // check, the rewrite's TAX would clobber spill-bridged values
  // (caught by `addOff(p,i) { p[i-1] += p[i]; }` returning
  // p[i-1] + &p[i-1] instead of +b).
  // addPass(createW65816NegYIndY());
  // I32IncFold: detect i32 += 1 patterns (LDA/ADC #1/STA/LDA/ADCE #0/STA)
  // and rewrite to a tighter LDA/INA/STA + INC_HI_IF_CARRY form that
  // skips the hi half on the no-carry path.  Must run BEFORE
  // BranchExpand so the inserted conditional skip's distances are
  // covered by the branch-distance estimator.  Also before
  // SepRepCleanup (which has the existing ADC #±1 → INA peephole)
  // because we deliberately KEEP ADCi16imm 1 so this pass can match
  // it; the subsequent SepRepCleanup will see only the residual
  // (non-fold-eligible) ADCi16imm cases.
  addPass(createW65816I32IncFold());
  // Branch expansion runs before SepRepCleanup so the BRA introduced
  // for long conditional branches gets seen by SepRepCleanup (which
  // can coalesce SEP/REP brackets across the new bridge MBBs).
  // Distance estimation now uses TII::getInstSizeInBytes so it's
  // byte-accurate; the 110-byte threshold leaves margin without
  // expanding short branches that would otherwise survive as Bxx.
  addPass(createW65816BranchExpand());
  addPass(createW65816SepRepCleanup());
  // Merge value-equivalent stack slots last.  Runs AFTER SepRepCleanup's
  // PHI-copy hoist so the LDA-X ; STA-Y pair has been pulled out of
  // any PHP/PLP wrap — that way the stack-rel offsets on both ops are
  // the unbumped values and offset-based slot matching is stable.
  // Saves 2 inst per PHI-copy occurrence (the memory copy round-trip
  // collapses when X and Y are renamed to the same slot).  See
  // W65816StackSlotMerge.cpp.
  addPass(createW65816StackSlotMerge());
}

/// Create the per-function W65816MachineFunctionInfo for \p F, allocating it
/// from \p Allocator and passing \p STI through to its factory.
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  using FuncInfo = W65816MachineFunctionInfo;
  return FuncInfo::create<FuncInfo>(Allocator, F, STI);
}

bool W65816PassConfig::addInstSelector() {
  addPass(createW65816ISelDag(getW65816TargetMachine(), getOptLevel()));
  return false;
}