65816-llvm-mos/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
Scott Duensing 42f0d16d07 Checkpoint
2026-05-13 20:54:28 -05:00

294 lines
14 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//===-- W65816TargetMachine.cpp - Define TargetMachine for W65816 ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the W65816 target.
//
//===----------------------------------------------------------------------===//
#include "W65816TargetMachine.h"
#include "W65816.h"
#include "W65816MachineFunctionInfo.h"
#include "TargetInfo/W65816TargetInfo.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include <optional>
using namespace llvm;
// Data layout for the 65816 lives in Triple::computeDataLayout via
// patches/0005-target-data-layout-w65816.patch. The string is:
// e - little endian
// m:e - ELF-style symbol mangling
// p:32:16 - 32-bit pointers (lo16 + hi-bank), 16-bit alignment
// i16:16 - 16-bit integers aligned to 16 bits
// i32:16 - 32-bit integers aligned to 16 bits
// a:8 - alloca defaults to 1-byte alignment
// n8:16 - native integer widths
// S8 - 1-byte natural stack alignment. JSL's 3-byte ret-addr
// push means SP is never reliably 2-aligned inside a
// callee; the older S16 caused SDAG to fold &buf[1] to
// buf | 1, which breaks for odd-aligned stack locals.
/// Target initialization hook for the W65816 backend.
///
/// Registers the W65816TargetMachine factory for the W65816 target, runs the
/// initializer of every backend pass listed below against the global
/// PassRegistry, and finally records a "never" occurrence for the
/// `replexitval` option when such an option is registered.
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeW65816Target() {
  RegisterTargetMachine<W65816TargetMachine> RegisterTM(getTheW65816Target());

  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeW65816AsmPrinterPass(Registry);
  initializeW65816DAGToDAGISelLegacyPass(Registry);
  initializeW65816StackSlotCleanupPass(Registry);
  initializeW65816ABridgeViaXPass(Registry);
  initializeW65816WidenAcc16Pass(Registry);
  initializeW65816SpillToXPass(Registry);
  initializeW65816NegYIndYPass(Registry);
  initializeW65816PreSpillCrossCallPass(Registry);
  initializeW65816SjLjFinalizePass(Registry);
  initializeW65816LowerWide32Pass(Registry);
  initializeW65816I32IncFoldPass(Registry);
  initializeW65816ImgCalleeSavePass(Registry);
  initializeW65816NarrowI32MulPass(Registry);
  initializeW65816PromoteFiToImgPass(Registry);
  initializeW65816StackSlotMergePass(Registry);

  // Make "never" the effective default for IndVarSimplify's exit-value
  // rewriter. Its closed-form replacement frequently widens an i16
  // induction-variable expression to i64 (to avoid overflow proofs) and then
  // lowers the multiply to __muldi3. On a 16-bit target that libcall costs
  // dramatically more than the natural loop it replaces: sumOfSquares shrinks
  // from 335B (with __muldi3) to 128B (with __mulsi3 in the loop) just by
  // suppressing the rewrite, with no other benchmark affected. The option is
  // reached by name through the cl::opt registry so upstream llvm-mos does
  // not need patching; when no such option is registered the lookup yields
  // null and nothing is changed.
  auto &RegisteredOpts = cl::getRegisteredOptions();
  auto *ReplExitVal = RegisteredOpts.lookup("replexitval");
  if (ReplExitVal)
    ReplExitVal->addOccurrence(0, "replexitval", "never");
}
/// Resolve the relocation model: the caller's explicit choice when one was
/// supplied, otherwise Reloc::Static.
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  if (RM.has_value())
    return *RM;
  return Reloc::Static;
}
/// Construct the W65816 target machine.
///
/// The data layout is taken from TT.computeDataLayout(). An absent relocation
/// model resolves through getEffectiveRelocModel (Reloc::Static), and
/// CodeModel::Small is handed to getEffectiveCodeModel as the default code
/// model. Object-file lowering uses TargetLoweringObjectFileELF, and the
/// subtarget is built from owned copies of the CPU and feature strings.
/// The JIT parameter is accepted but not consulted here.
W65816TargetMachine::W65816TargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OL, bool JIT)
    : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
                               getEffectiveRelocModel(RM),
                               getEffectiveCodeModel(CM, CodeModel::Small), OL),
      TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
      Subtarget(TT, CPU.str(), FS.str(), *this) {
  initAsmInfo();
}
// Defaulted destructor, defined out of line in this translation unit.
W65816TargetMachine::~W65816TargetMachine() = default;
namespace {

/// Pass-pipeline configuration for the W65816 code generator.
///
/// Overrides the TargetPassConfig hooks through which the backend splices its
/// own passes into the codegen pipeline, and selects the register allocator.
class W65816PassConfig : public TargetPassConfig {
public:
  W65816PassConfig(W65816TargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  /// The target machine, typed as the W65816 subclass.
  W65816TargetMachine &getW65816TargetMachine() const {
    return getTM<W65816TargetMachine>();
  }

  // Pipeline hooks; each is defined out of line further down in this file.
  void addISelPrepare() override;
  bool addInstSelector() override;
  void addMachineSSAOptimization() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreEmitPass() override;

  /// Pick the register allocator: fast when \p Optimized is false
  /// (-O0 / optnone), greedy otherwise (-O1 and above).
  ///
  /// Why greedy is usable: it used to abort with "Def isn't really dead"
  /// inside LiveRangeEdit::eliminateDeadDef when InlineSpiller converted a
  /// redundant STAfi (Defs = [A]) into a KILL pseudo while marking only the
  /// explicit defs dead -- the implicit-def $a stayed live and was later
  /// deleted anyway. That is patched in
  /// tools/llvm-mos/llvm/lib/CodeGen/InlineSpiller.cpp, which now marks all
  /// defs (explicit + implicit) dead. Bench wins after the switch:
  ///   popcount 19.4%, strcpy 18.9%, memcmp 8.6%, bsearch 9.2%,
  ///   fib(10) 2.6%.
  ///
  /// Why fast at -O0: greedy at -O0 left spurious COPY pseudos that lowered
  /// to STA dp / LDA dp pairs around modify-in-place ops (e.g. INA),
  /// miscompiling a + 1.
  FunctionPass *createTargetRegisterAllocator(bool Optimized) override {
    if (!Optimized)
      return createFastRegisterAllocator();
    return createGreedyRegisterAllocator();
  }
};

} // namespace
/// Create the W65816 pass configuration bound to this target machine and the
/// given pass manager. The object is heap-allocated and handed back as a raw
/// pointer; presumably the caller (the pass manager) takes ownership, per the
/// usual LLVM convention -- confirm against the TargetMachine contract.
TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
  W65816PassConfig *Config = new W65816PassConfig(*this, PM);
  return Config;
}
/// IR-level preparation ahead of instruction selection: queues two W65816 IR
/// passes and then the base-class ISelPrepare passes.
void W65816PassConfig::addISelPrepare() {
  // SjLjEHPrepare has already run (in addPassesToHandleExceptions, just
  // before this hook). The finalize pass inserts an actual setjmp at function
  // entry plus a switch-on-call_site dispatch block, and erases the
  // eh.sjlj.* intrinsics this backend does not lower natively. It must come
  // BEFORE the base ISelPrepare passes so isel sees the cleaned IR.
  addPass(createW65816SjLjFinalize());

  // IR-level peephole: narrow `mul i32 X, Y` to a __umulhisi3 call when
  // IR-level computeKnownBits proves the top 16 bits of both operands are
  // zero. This catches the sumSquares-style `(u32)i * i` pattern, which
  // SDAG-level analysis cannot see across basic-block boundaries.
  addPass(createW65816NarrowI32Mul());

  TargetPassConfig::addISelPrepare();
}
/// Machine-SSA optimization stage. No target-specific passes are added; the
/// base-class pipeline runs unchanged. The comments record why two passes
/// that were once disabled for this target are allowed to run again.
void W65816PassConfig::addMachineSSAOptimization() {
  // History (MachineCSE): it used to be disabled here because it incorrectly
  // eliminated "redundant" CMP instructions. P was considered "available",
  // yet on this target P is clobbered by every intervening LDA/STA/ADC. The
  // principled fix was to model Uses=[P] on Bxx so MachineCSE sees the
  // dependency and can run normally; that landed in W65816InstrInfo.td.
  TargetPassConfig::addMachineSSAOptimization();

  // History (MachineBlockPlacement): re-enabled. It was previously disabled
  // because W65816InstrInfo::analyzeBranch returned "unanalyzable"
  // unconditionally. analyzeBranch now decodes the unconditional
  // direct-branch case (BRA / BRL / JMP_Abs), which is enough to satisfy
  // MBP's fall-through assertion. Conditional branches stay opaque on
  // purpose: their condition is encoded in the OPCODE, and the P-flag input
  // must stay adjacent to a preceding CMP, which BranchFolder does not know
  // to preserve.
}
/// Passes that run just before register allocation.
/// Order: LowerWide32, ABridgeViaX, TiedDefSpill, WidenAcc16.
void W65816PassConfig::addPreRegAlloc() {
  // Decompose Wide32 vregs (i32 register pairs) into pairs of i16 vregs
  // BEFORE any other Acc16-targeting pre-RA pass. The later passes walk
  // Acc16/Idx16/Img16 vregs, so running this first lets them see the
  // decomposed halves uniformly.
  addPass(createW65816LowerWide32());

  addPass(createW65816ABridgeViaX());
  addPass(createW65816TiedDefSpill());
  addPass(createW65816WidenAcc16());

  // PreSpillCrossCall (currently DISABLED): would pre-spill cross-call Acc16
  // vregs in high-call functions to relieve greedy regalloc pressure. The
  // first cut creates too many fresh stack slots and overflows the
  // stack-relative addressing range (frame > 256 bytes) on moderately-sized
  // functions such as the soft-double routines. The pass is built and
  // ready, gated behind future tuning of:
  //   - the call-count threshold (currently 4; should be lowered)
  //   - a smarter "should we spill THIS vreg" filter
  //   - stack slot reuse via a real liveness analysis
  // Until then the high-pressure failure is worked around with
  // `__attribute__((noinline))` on the heaviest helper, or with
  // `-mllvm -regalloc=fast` for the affected TU.
  // addPass(createW65816PreSpillCrossCall());
}
/// Passes that run right after register allocation.
/// Order: PromoteFiToImg, ImgCalleeSave, SpillToX, StackSlotCleanup,
/// SpillToX again; MachineCopyPropagation is then disabled.
void W65816PassConfig::addPostRegAlloc() {
  // FI->IMG promotion goes FIRST. It scans for high-traffic i16 FrameIndex
  // slots (LDAfi/STAfi/ADCfi/etc.) and rewrites them to
  // STA_DP/LDA_DP/ADC_DP/... aimed at free IMG8..IMG15 DP slots. The
  // IMG8..15 references it introduces are then picked up by ImgCalleeSave,
  // which gives them a prologue save + epilogue restore. See
  // W65816PromoteFiToImg.cpp.
  addPass(createW65816PromoteFiToImg());

  // ImgCalleeSave detects IMG8..IMG15 usage post-regalloc and inserts a
  // prologue save + epilogue restore so those slots behave as callee-saved
  // at the asm level. This fixed picol's `expr 1+2 == 4` bug: high-pressure
  // recursive double functions use IMG8..IMG15 as scratch but, without this
  // pass, expected them to survive calls -- and callees were happy to
  // clobber them. See W65816ImgCalleeSave.cpp.
  addPass(createW65816ImgCalleeSave());

  // SpillToX converts STA/LDA pairs into TAX/TXA bridges; StackSlotCleanup
  // then deletes still-adjacent redundant spills. The second SpillToX run
  // collapses any TAX/TXA pair that cleanup left adjacent (e.g. when an
  // inner copy between bridge endpoints went away).
  addPass(createW65816SpillToX());
  addPass(createW65816StackSlotCleanup());
  addPass(createW65816SpillToX());

  // Disable MachineCopyPropagation. It eliminates `COPY $img = $a`, taking
  // the IMG destination to be dead because no explicit physreg use of $img
  // remains after PEI expands STAfi-with-Img16-source into LDA_DP. The COPY
  // actually expands to STA_DP $D0 -- a memory store to a DP slot that
  // libcalls (softDouble, softFloat) ALSO use as their own arg-save scratch.
  // With the COPY dropped, the subsequent LDA_DP $D0 reads stale memory.
  // Caught by a `g = g/x` Newton loop: iter-1's saved x_ml at $D0 was never
  // actually written, so iter-2 read garbage. The principled fix would mark
  // IMG-targeted COPYs as memory-side-effecting, but TII does not expose
  // that hook; disabling MCP loses some optimization but is safe.
  // NOTE(review): disablePass presumably only affects MachineCopyPropagation
  // instances scheduled after this call -- confirm that the copy-propagation
  // run added during optimized register allocation is not also a problem.
  disablePass(&llvm::MachineCopyPropagationID);
}
/// Final machine passes before emission.
/// Order: SpillToX, I32IncFold, BranchExpand, SepRepCleanup, StackSlotMerge.
/// NegYIndY is intentionally left out (see the note below).
void W65816PassConfig::addPreEmitPass() {
  // SpillToX one more time: postrapseudos has by now expanded physreg-COPY
  // pseudos into the real TAX/TXA opcodes, so adjacent TXA;TAX pairs (which
  // the earlier SpillToX runs could not see in COPY form) become
  // collapsable.
  addPass(createW65816SpillToX());

  // NegYIndY (currently DISABLED): rewrites negative-Y indirect-Y stack-rel
  // ops. If re-enabled it must run BEFORE BranchExpand, because the rewrite
  // expands one instruction into several and shifts branch distances. The
  // pass internally checks X-liveness and saves/restores X via DP $E0 when
  // SpillToX has a value parked there; without that check the rewrite's TAX
  // would clobber spill-bridged values (caught by `addOff(p,i) {
  // p[i-1] += p[i]; }` returning p[i-1] + &p[i-1] instead of +b).
  // It was a workaround for the (sr,s),Y bank-wrap on negative-Y
  // indirect-stack-rel loads. No current code emits LDA_StackRelIndY /
  // STA_StackRelIndY: pointer-deref now goes through [$E0],Y indirect-long
  // via the LDAptr / STAptr / STBptr inserter, which forces the bank byte at
  // $E2 to 0. The pass stays in tree; re-enable it if a new code path starts
  // emitting (sr,s),Y again.
  // addPass(createW65816NegYIndY());

  // I32IncFold: detect i32 += 1 patterns (LDA/ADC #1/STA/LDA/ADCE #0/STA)
  // and rewrite them to a tighter LDA/INA/STA + INC_HI_IF_CARRY form that
  // skips the hi half on the no-carry path. Must run BEFORE BranchExpand so
  // the inserted conditional skip's distances are covered by the
  // branch-distance estimator. It also runs before SepRepCleanup (which has
  // the existing ADC #+/-1 -> INA peephole) because ADCi16imm 1 is
  // deliberately KEPT so this pass can match it; SepRepCleanup afterwards
  // sees only the residual (non-fold-eligible) ADCi16imm cases.
  addPass(createW65816I32IncFold());

  // BranchExpand runs ahead of SepRepCleanup so the BRA introduced for long
  // conditional branches gets seen by SepRepCleanup (which can coalesce
  // SEP/REP brackets across the new bridge MBBs). Distance estimation now
  // uses TII::getInstSizeInBytes, so it is byte-accurate; the 110-byte
  // threshold leaves margin without expanding short branches that would
  // otherwise survive as Bxx.
  addPass(createW65816BranchExpand());
  addPass(createW65816SepRepCleanup());

  // Merge value-equivalent stack slots last. This runs AFTER SepRepCleanup's
  // PHI-copy hoist so the LDA-X ; STA-Y pair has been pulled out of any
  // PHP/PLP wrap; the stack-rel offsets on both ops are then the unbumped
  // values and offset-based slot matching is stable. Saves 2 inst per
  // PHI-copy occurrence (the memory copy round-trip collapses when X and Y
  // are renamed to the same slot). See W65816StackSlotMerge.cpp.
  addPass(createW65816StackSlotMerge());
}
/// Create the per-function W65816MachineFunctionInfo for \p F, forwarding
/// the supplied allocator, function and subtarget to its create<> factory.
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  auto *FuncInfo =
      W65816MachineFunctionInfo::create<W65816MachineFunctionInfo>(Allocator,
                                                                   F, STI);
  return FuncInfo;
}
bool W65816PassConfig::addInstSelector() {
addPass(createW65816ISelDag(getW65816TargetMachine(), getOptLevel()));
return false;
}