294 lines
14 KiB
C++
294 lines
14 KiB
C++
//===-- W65816TargetMachine.cpp - Define TargetMachine for W65816 ---------===//
|
||
//
|
||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
// See https://llvm.org/LICENSE.txt for license information.
|
||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
//
|
||
// Top-level implementation for the W65816 target.
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
#include "W65816TargetMachine.h"
|
||
#include "W65816.h"
|
||
#include "W65816MachineFunctionInfo.h"
|
||
#include "TargetInfo/W65816TargetInfo.h"
|
||
#include "llvm/CodeGen/MachineCSE.h"
|
||
#include "llvm/CodeGen/Passes.h"
|
||
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||
#include "llvm/MC/TargetRegistry.h"
|
||
#include "llvm/Support/CommandLine.h"
|
||
#include "llvm/Support/Compiler.h"
|
||
#include <optional>
|
||
|
||
using namespace llvm;
|
||
|
||
// Data layout for the 65816 lives in Triple::computeDataLayout via
|
||
// patches/0005-target-data-layout-w65816.patch. The string is:
|
||
// e - little endian
|
||
// m:e - ELF-style symbol mangling
|
||
// p:32:16 - 32-bit pointers (lo16 + hi-bank), 16-bit alignment
|
||
// i16:16 - 16-bit integers aligned to 16 bits
|
||
// i32:16 - 32-bit integers aligned to 16 bits
|
||
// a:8 - aggregate types aligned to 8 bits (1 byte)
|
||
// n8:16 - native integer widths
|
||
// S8 - 1-byte natural stack alignment. JSL's 3-byte ret-addr
|
||
// push means SP is never reliably 2-aligned inside a
|
||
// callee; the older S16 caused SDAG to fold &buf[1] to
|
||
// buf | 1, which breaks for odd-aligned stack locals.
|
||
|
||
/// Target initialization entry point for the W65816 backend.
///
/// Registers W65816TargetMachine for the W65816 target, initializes the
/// backend's passes with the global PassRegistry, and installs a
/// target-specific default for the `replexitval` command-line option.
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeW65816Target() {
  RegisterTargetMachine<W65816TargetMachine> X(getTheW65816Target());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeW65816AsmPrinterPass(PR);
  initializeW65816DAGToDAGISelLegacyPass(PR);
  initializeW65816StackSlotCleanupPass(PR);
  initializeW65816ABridgeViaXPass(PR);
  initializeW65816WidenAcc16Pass(PR);
  initializeW65816SpillToXPass(PR);
  initializeW65816NegYIndYPass(PR);
  initializeW65816PreSpillCrossCallPass(PR);
  initializeW65816SjLjFinalizePass(PR);
  initializeW65816LowerWide32Pass(PR);
  initializeW65816I32IncFoldPass(PR);
  initializeW65816ImgCalleeSavePass(PR);
  initializeW65816NarrowI32MulPass(PR);
  initializeW65816PromoteFiToImgPass(PR);
  initializeW65816StackSlotMergePass(PR);

  // Default IndVarSimplify's exit-value rewriter to "never". The
  // closed-form replacement frequently widens an i16 induction-variable
  // expression to i64 to avoid overflow proofs, then lowers the multiply
  // to __muldi3. On a 16-bit target the libcall costs dramatically more
  // than the natural loop it replaces: sumOfSquares shrinks from 335B
  // (with __muldi3) to 128B (with __mulsi3 in the loop) just by
  // suppressing this rewrite, with no other benchmark affected. We do
  // this by name through the cl::opt registry so it doesn't require
  // patching upstream llvm-mos.
  //
  // Only install the default while the option has no recorded
  // occurrence. That keeps this function idempotent (a second call does
  // not stack another occurrence on the option) and avoids overwriting a
  // value that was already set explicitly before the target was
  // initialized.
  auto &Opts = cl::getRegisteredOptions();
  if (auto *Opt = Opts.lookup("replexitval")) {
    if (Opt->getNumOccurrences() == 0)
      Opt->addOccurrence(0, "replexitval", "never");
  }
}
|
||
|
||
/// Resolves the relocation model for this target: an explicitly requested
/// model is honoured, otherwise the target falls back to Reloc::Static.
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  if (RM.has_value())
    return *RM;
  return Reloc::Static;
}
|
||
|
||
/// Constructs the W65816 target machine.
///
/// The data layout string is taken from the triple (TT.computeDataLayout()).
/// A missing relocation model resolves to Reloc::Static through
/// getEffectiveRelocModel; the code model is resolved by
/// getEffectiveCodeModel with CodeModel::Small passed as its second
/// argument. Object-file lowering uses the ELF implementation, and the
/// subtarget is built from owning copies of the CPU and feature strings.
/// The JIT flag is accepted for interface compatibility and is not read
/// here.
W65816TargetMachine::W65816TargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OL, bool JIT)
    : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
                               getEffectiveRelocModel(RM),
                               getEffectiveCodeModel(CM, CodeModel::Small), OL),
      TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
      Subtarget(TT, CPU.str(), FS.str(), *this) {
  // Set up the MC asm info once the base class and members are built.
  initAsmInfo();
}
|
||
|
||
// Out-of-line defaulted destructor; members (including the TLOF object
// created in the constructor) are destroyed by their own destructors.
W65816TargetMachine::~W65816TargetMachine() = default;
|
||
|
||
namespace {
|
||
|
||
/// W65816 Code Generator Pass Configuration Options.
|
||
class W65816PassConfig : public TargetPassConfig {
|
||
public:
|
||
W65816PassConfig(W65816TargetMachine &TM, PassManagerBase &PM)
|
||
: TargetPassConfig(TM, PM) {}
|
||
|
||
W65816TargetMachine &getW65816TargetMachine() const {
|
||
return getTM<W65816TargetMachine>();
|
||
}
|
||
|
||
bool addInstSelector() override;
|
||
void addPreRegAlloc() override;
|
||
void addPostRegAlloc() override;
|
||
void addPreEmitPass() override;
|
||
void addMachineSSAOptimization() override;
|
||
void addISelPrepare() override;
|
||
|
||
// Greedy at -O1+; fast at -O0/optnone. Greedy used to abort with
|
||
// "Def isn't really dead" inside LiveRangeEdit::eliminateDeadDef
|
||
// when InlineSpiller converted a redundant STAfi (Defs = [A]) into
|
||
// a KILL pseudo while only marking explicit defs dead — leaving the
|
||
// implicit-def $a live, then later trying to delete it. Patched in
|
||
// tools/llvm-mos/llvm/lib/CodeGen/InlineSpiller.cpp to mark all defs
|
||
// (explicit + implicit) dead. Bench wins after the switch:
|
||
// popcount −19.4%, strcpy −18.9%, memcmp −8.6%, bsearch −9.2%,
|
||
// fib(10) −2.6%.
|
||
//
|
||
// At -O0 / optnone (Optimized=false) we use FAST: greedy at -O0
|
||
// left spurious COPY pseudos that lowered to STA dp / LDA dp pairs
|
||
// around modify-in-place ops (e.g. INA), miscompiling a + 1.
|
||
FunctionPass *createTargetRegisterAllocator(bool Optimized) override {
|
||
return Optimized ? createGreedyRegisterAllocator()
|
||
: createFastRegisterAllocator();
|
||
}
|
||
};
|
||
|
||
} // namespace
|
||
|
||
/// Creates the W65816 pass configuration bound to this target machine and
/// the given pass manager. The object is heap-allocated and handed back as
/// a raw pointer.
TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
  auto *Config = new W65816PassConfig(*this, PM);
  return Config;
}
|
||
|
||
/// Adds the target's IR-level passes that must run ahead of the generic
/// ISel-prepare passes, then delegates to the base implementation.
void W65816PassConfig::addISelPrepare() {
  // SjLjEHPrepare has already run (in addPassesToHandleExceptions, just
  // before this hook). The finalize pass inserts an actual setjmp at
  // function entry plus a switch-on-call_site dispatch block, and erases
  // the eh.sjlj.* intrinsics the backend doesn't natively lower. It must
  // come BEFORE the base ISelPrepare passes so isel sees the cleaned IR.
  auto *SjLjFinalize = createW65816SjLjFinalize();
  addPass(SjLjFinalize);

  // IR-level peephole: narrow `mul i32 X, Y` to a __umulhisi3 call when
  // IR-level computeKnownBits proves the top 16 bits of both operands are
  // zero. This catches the sumSquares-style `(u32)i * i` pattern that
  // SDAG-level analysis can't see across basic-block boundaries.
  auto *NarrowMul = createW65816NarrowI32Mul();
  addPass(NarrowMul);

  TargetPassConfig::addISelPrepare();
}
|
||
|
||
/// Runs the stock machine-SSA optimization pipeline unchanged. The override
/// is kept as the place where the history below is recorded.
void W65816PassConfig::addMachineSSAOptimization() {
  // MachineCSE: previously disabled here because it incorrectly removed
  // "redundant" CMP instructions. It treated P as "available", but on this
  // target P is clobbered by every intervening LDA/STA/ADC. The principled
  // fix, modelling Uses=[P] on Bxx so MachineCSE sees the dependency, has
  // landed in W65816InstrInfo.td, so the pass now runs normally.
  //
  // MachineBlockPlacement: re-enabled. It was previously disabled because
  // W65816InstrInfo::analyzeBranch returned "unanalyzable" unconditionally.
  // analyzeBranch now decodes the BRA / BRL / JMP_Abs unconditional
  // direct-branch case, which is enough to satisfy MBP's fall-through
  // assertion. Conditional branches stay opaque on purpose: their
  // condition is encoded in the OPCODE and the P-flag input must stay
  // adjacent to a preceding CMP, which BranchFolder doesn't know to
  // preserve.
  TargetPassConfig::addMachineSSAOptimization();
}
|
||
|
||
/// Adds the target's machine passes that run before register allocation.
void W65816PassConfig::addPreRegAlloc() {
  // Wide32 vregs (i32 register pairs) are split into pairs of i16 vregs
  // BEFORE any of the other Acc16-targeting pre-RA passes. Those later
  // passes walk Acc16/Idx16/Img16 vregs, so running this first lets them
  // see the decomposed halves uniformly.
  auto *LowerWide32 = createW65816LowerWide32();
  addPass(LowerWide32);

  auto *ABridgeViaX = createW65816ABridgeViaX();
  addPass(ABridgeViaX);

  auto *TiedDefSpill = createW65816TiedDefSpill();
  addPass(TiedDefSpill);

  auto *WidenAcc16 = createW65816WidenAcc16();
  addPass(WidenAcc16);

  // PreSpillCrossCall (currently DISABLED) would pre-spill cross-call
  // Acc16 vregs in high-call functions to relieve greedy regalloc
  // pressure. The first cut creates too many fresh stack slots and
  // overflows the stack-relative addressing range (frame > 256 bytes) on
  // moderately-sized functions like the soft-double routines.
  // The pass is built and ready, gated behind future tuning of:
  //   - lower call-count threshold (currently 4)
  //   - smarter "should we spill THIS vreg" filter
  //   - stack slot reuse via a real liveness analysis
  // Until then, the high-pressure failure is worked around with
  // `__attribute__((noinline))` on the heaviest helper or with
  // `-mllvm -regalloc=fast` for the affected TU.
  // addPass(createW65816PreSpillCrossCall());
}
|
||
|
||
void W65816PassConfig::addPostRegAlloc() {
|
||
// FI→IMG promotion runs FIRST. It scans for high-traffic i16
|
||
// FrameIndex slots (LDAfi/STAfi/ADCfi/etc.) and rewrites them to
|
||
// STA_DP/LDA_DP/ADC_DP/... pointed at free IMG8..IMG15 DP slots.
|
||
// The introduced IMG8..15 references are then picked up by
|
||
// ImgCalleeSave to get prologue save + epilogue restore. See
|
||
// W65816PromoteFiToImg.cpp.
|
||
addPass(createW65816PromoteFiToImg());
|
||
// ImgCalleeSave detects IMG8..IMG15 usage post-regalloc and inserts
|
||
// prologue save + epilogue restore so those slots act as callee-
|
||
// saved at the asm level. Fixes picol's `expr 1+2 == 4` bug:
|
||
// high-pressure recursive double fns use IMG8..IMG15 as scratch but,
|
||
// without this pass, expected them preserved across calls — and
|
||
// callees were happy to clobber them. See W65816ImgCalleeSave.cpp.
|
||
addPass(createW65816ImgCalleeSave());
|
||
// SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup
|
||
// then deletes still-adjacent redundant spills. A second SpillToX
|
||
// invocation collapses any TAX/TXA pair left adjacent by cleanup
|
||
// (e.g. when an inner copy between bridge endpoints went away).
|
||
addPass(createW65816SpillToX());
|
||
addPass(createW65816StackSlotCleanup());
|
||
addPass(createW65816SpillToX());
|
||
// Disable MachineCopyPropagation: it eliminates `COPY $img = $a`
|
||
// thinking the IMG dest is dead (no explicit physreg use of $img
|
||
// remains after PEI expands STAfi-with-Img16-source into LDA_DP).
|
||
// The COPY actually expands to STA_DP $D0 — a memory store to a
|
||
// DP slot that libcalls (softDouble, softFloat) ALSO use as their
|
||
// own arg-save scratch. When MCP drops the COPY, the subsequent
|
||
// LDA_DP $D0 reads stale memory. Caught by `g = g/x` Newton loop:
|
||
// iter-1's saved x_ml at $D0 was never actually written, so iter-2
|
||
// read garbage. The principled fix would mark IMG-targeted COPYs
|
||
// as memory-side-effecting, but TII doesn't expose that hook;
|
||
// disabling MCP loses some optimization but is safe.
|
||
disablePass(&llvm::MachineCopyPropagationID);
|
||
}
|
||
|
||
void W65816PassConfig::addPreEmitPass() {
|
||
// SpillToX one more time: now that postrapseudos has expanded
|
||
// physreg-COPY pseudos into the real TAX/TXA opcodes, adjacent
|
||
// TXA;TAX pairs (which the earlier SpillToX invocations couldn't
|
||
// see in COPY form) become collapsable.
|
||
addPass(createW65816SpillToX());
|
||
// Rewrite negative-Y indirect-Y stack-rel ops. Must run BEFORE
|
||
// BranchExpand because the rewrite expands one instruction into
|
||
// several and shifts branch distances. The pass internally checks
|
||
// X-liveness and saves/restores X via DP $E0 when SpillToX has
|
||
// a value parked there; without that check, the rewrite's TAX
|
||
// would clobber spill-bridged values (caught by `addOff(p,i) {
|
||
// p[i-1] += p[i]; }` returning p[i-1] + &p[i-1] instead of +b).
|
||
// W65816NegYIndY was a workaround for the (sr,s),Y bank-wrap on
|
||
// negative-Y indirect-stack-rel loads. No current code emits
|
||
// LDA_StackRelIndY / STA_StackRelIndY (pointer-deref now goes
|
||
// through [$E0],Y indirect-long via the LDAptr / STAptr / STBptr
|
||
// inserter, which forces the bank byte at $E2 to 0). Pass left
|
||
// in tree but disabled — re-enable if a new code path starts
|
||
// emitting (sr,s),Y again.
|
||
// addPass(createW65816NegYIndY());
|
||
// Branch expansion runs after that so the BRA introduced for long
|
||
// conditional branches gets seen by SepRepCleanup (which can
|
||
// coalesce SEP/REP brackets across the new bridge MBBs).
|
||
// Distance estimation now uses TII::getInstSizeInBytes so it's
|
||
// byte-accurate; the 110-byte threshold leaves margin without
|
||
// expanding short branches that would otherwise survive as Bxx.
|
||
// Detect i32 += 1 patterns (LDA/ADC #1/STA/LDA/ADCE #0/STA) and
|
||
// rewrite to a tighter LDA/INA/STA + INC_HI_IF_CARRY form that
|
||
// skips the hi half on the no-carry path. Must run BEFORE
|
||
// BranchExpand so the inserted conditional skip's distances are
|
||
// covered by the branch-distance estimator. Also before
|
||
// SepRepCleanup (which has the existing ADC #±1 → INA peephole)
|
||
// because we deliberately KEEP ADCi16imm 1 so this pass can match
|
||
// it; the subsequent SepRepCleanup will see only the residual
|
||
// (non-fold-eligible) ADCi16imm cases.
|
||
addPass(createW65816I32IncFold());
|
||
addPass(createW65816BranchExpand());
|
||
addPass(createW65816SepRepCleanup());
|
||
// Merge value-equivalent stack slots last. Runs AFTER SepRepCleanup's
|
||
// PHI-copy hoist so the LDA-X ; STA-Y pair has been pulled out of
|
||
// any PHP/PLP wrap — that way the stack-rel offsets on both ops are
|
||
// the unbumped values and offset-based slot matching is stable.
|
||
// Saves 2 inst per PHI-copy occurrence (the memory copy round-trip
|
||
// collapses when X and Y are renamed to the same slot). See
|
||
// W65816StackSlotMerge.cpp.
|
||
addPass(createW65816StackSlotMerge());
|
||
}
|
||
|
||
/// Creates the W65816-specific per-function info object for \p F, using
/// \p Allocator for storage and forwarding \p STI to its constructor.
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  using FuncInfo = W65816MachineFunctionInfo;
  return FuncInfo::create<FuncInfo>(Allocator, F, STI);
}
|
||
|
||
bool W65816PassConfig::addInstSelector() {
|
||
addPass(createW65816ISelDag(getW65816TargetMachine(), getOptLevel()));
|
||
return false;
|
||
}
|