210 lines
9.1 KiB
C++
210 lines
9.1 KiB
C++
//===-- W65816TargetMachine.cpp - Define TargetMachine for W65816 ---------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Top-level implementation for the W65816 target.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "W65816TargetMachine.h"
|
|
#include "W65816.h"
|
|
#include "W65816MachineFunctionInfo.h"
|
|
#include "TargetInfo/W65816TargetInfo.h"
|
|
#include "llvm/CodeGen/MachineCSE.h"
|
|
#include "llvm/CodeGen/Passes.h"
|
|
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/MC/TargetRegistry.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Compiler.h"
|
|
#include <optional>
|
|
|
|
using namespace llvm;
|
|
|
|
// Data layout for the 65816 lives in Triple::computeDataLayout via
|
|
// patches/0005-target-data-layout-w65816.patch. The string is:
|
|
// e - little endian
|
|
// m:e - ELF-style symbol mangling
|
|
// p:16:8 - 16-bit pointers, 8-bit ABI alignment
|
|
// i16:16 - 16-bit integers aligned to 16 bits
|
|
// i32:16 - 32-bit integers aligned to 16 bits
|
|
// n8:16 - native integer widths
|
|
// S16 - 16-bit natural stack alignment
|
|
|
|
/// Entry point LLVM calls to make the W65816 backend available.
///
/// Registers W65816TargetMachine for the W65816 target, initializes the
/// backend's machine passes in the global PassRegistry, and seeds a
/// target-friendly default for the "replexitval" command-line option.
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeW65816Target() {
  RegisterTargetMachine<W65816TargetMachine> X(getTheW65816Target());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeW65816AsmPrinterPass(PR);
  initializeW65816DAGToDAGISelLegacyPass(PR);
  initializeW65816StackSlotCleanupPass(PR);
  initializeW65816ABridgeViaXPass(PR);
  initializeW65816WidenAcc16Pass(PR);
  initializeW65816SpillToXPass(PR);
  initializeW65816NegYIndYPass(PR);
  initializeW65816PreSpillCrossCallPass(PR);

  // Default IndVarSimplify's exit-value rewriter to "never".  The
  // closed-form replacement frequently widens an i16 induction var
  // expression to i64 to avoid overflow proofs, then lowers the
  // multiply to __muldi3.  On a 16-bit target the libcall costs
  // dramatically more than the natural loop it replaces — sumOfSquares
  // shrinks from 335B (with __muldi3) to 128B (with __mulsi3 in the
  // loop) just by suppressing this rewrite, with no other benchmark
  // affected.  We do this by name through the cl::opt registry so
  // it doesn't require patching upstream llvm-mos.
  auto &Opts = cl::getRegisteredOptions();
  if (auto *Opt = Opts.lookup("replexitval")) {
    // Only inject the default while the option is still untouched.  If a
    // value has already been recorded (command line parsed before target
    // initialization, or this initializer running a second time), leave it
    // alone so an explicit choice is never overwritten and occurrences do
    // not pile up.
    if (Opt->getNumOccurrences() == 0)
      Opt->addOccurrence(0, "replexitval", "never");
  }
}
|
|
|
|
/// Picks the relocation model to use: the caller's explicit request when one
/// is present, otherwise Reloc::Static.
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  if (RM.has_value())
    return *RM;
  return Reloc::Static;
}
|
|
|
|
/// Constructs the W65816 target machine.
///
/// The data layout string is obtained from the triple, an unspecified
/// relocation model is resolved through getEffectiveRelocModel(), and an
/// unspecified code model resolves to CodeModel::Small.  The object-file
/// lowering is the ELF implementation, and a single Subtarget is built from
/// the CPU and feature strings.  The JIT flag is accepted but not consulted
/// in this constructor.
W65816TargetMachine::W65816TargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OL, bool JIT)
    : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
                               getEffectiveRelocModel(RM),
                               getEffectiveCodeModel(CM, CodeModel::Small), OL),
      TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
      Subtarget(TT, CPU.str(), FS.str(), *this) {
  initAsmInfo();
}
|
|
|
|
// Destructor is defaulted here, out of line, rather than in the header.
W65816TargetMachine::~W65816TargetMachine() = default;
|
|
|
|
namespace {
|
|
|
|
/// W65816 Code Generator Pass Configuration Options.
|
|
class W65816PassConfig : public TargetPassConfig {
|
|
public:
|
|
W65816PassConfig(W65816TargetMachine &TM, PassManagerBase &PM)
|
|
: TargetPassConfig(TM, PM) {}
|
|
|
|
W65816TargetMachine &getW65816TargetMachine() const {
|
|
return getTM<W65816TargetMachine>();
|
|
}
|
|
|
|
bool addInstSelector() override;
|
|
void addPreRegAlloc() override;
|
|
void addPostRegAlloc() override;
|
|
void addPreEmitPass() override;
|
|
void addMachineSSAOptimization() override;
|
|
|
|
// W65816's only 16-bit ALU register is A. At -O1+ we use BASIC
|
|
// regalloc instead of greedy: greedy fails ("ran out of registers
|
|
// during register allocation") on functions with many cross-call
|
|
// Acc16 vregs (the "ok |= bit; helper(); ok |= bit;" pattern
|
|
// repeated across many if-blocks). Basic regalloc handles that
|
|
// pattern cleanly, with negligible code-size overhead vs greedy
|
|
// (~0.7% on the bench suite).
|
|
//
|
|
// At -O0 / optnone (Optimized=false) we use FAST: greedy/basic at
|
|
// -O0 leave spurious COPY pseudos that lower to STA dp / LDA dp
|
|
// pairs around modify-in-place ops (e.g. INA), miscompiling a + 1.
|
|
//
|
|
// TiedDefSpill (pre-RA) handles the tied-def-multi-use hazard for
|
|
// the sub-pattern that's frequent enough to matter at -O1+.
|
|
FunctionPass *createTargetRegisterAllocator(bool Optimized) override {
|
|
return Optimized ? createBasicRegisterAllocator()
|
|
: createFastRegisterAllocator();
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
/// Allocates a W65816PassConfig bound to this target machine and to PM.
TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
  auto *Config = new W65816PassConfig(*this, PM);
  return Config;
}
|
|
|
|
/// Runs the stock machine-SSA optimization pipeline unchanged.
void W65816PassConfig::addMachineSSAOptimization() {
  // Historical note: MachineCSE was once switched off at this point.  It
  // removed CMP instructions it believed redundant because P looked
  // "available", whereas on this target every intervening LDA/STA/ADC
  // clobbers P.  The principled fix — modelling Uses=[P] on Bxx so that
  // MachineCSE sees the dependency — landed in W65816InstrInfo.td, so the
  // default pipeline is now safe to run as-is.
  TargetPassConfig::addMachineSSAOptimization();
}
|
|
|
|
/// Schedules the W65816 passes that must run just before register
/// allocation: ABridgeViaX, TiedDefSpill, then WidenAcc16.
void W65816PassConfig::addPreRegAlloc() {
  addPass(createW65816ABridgeViaX());
  addPass(createW65816TiedDefSpill());
  addPass(createW65816WidenAcc16());

  // PreSpillCrossCall is intended to pre-spill cross-call Acc16 vregs in
  // high-call functions to relieve greedy regalloc pressure.  It is
  // currently DISABLED: the first cut creates too many fresh stack slots
  // and pushes the frame past the stack-relative addressing range
  // (frame > 256 bytes) on moderately-sized functions such as the
  // soft-double routines.
  //
  // The pass is built and ready; enabling it is gated on future tuning of:
  //   - a lower call-count threshold (currently 4)
  //   - a smarter "should we spill THIS vreg" filter
  //   - stack slot reuse via a real liveness analysis
  //
  // In the meantime the high-pressure failure is worked around with
  // `__attribute__((noinline))` on the heaviest helper, or with
  // `-mllvm -regalloc=fast` for the affected TU.
  // addPass(createW65816PreSpillCrossCall());
}
|
|
|
|
/// Schedules the post-register-allocation spill cleanups:
/// SpillToX, StackSlotCleanup, SpillToX again.
void W65816PassConfig::addPostRegAlloc() {
  // First SpillToX: turn STA/LDA pairs into TAX/TXA bridges.
  addPass(createW65816SpillToX());

  // StackSlotCleanup: delete redundant spills that are still adjacent.
  addPass(createW65816StackSlotCleanup());

  // Second SpillToX: collapse any TAX/TXA pair that the cleanup left
  // adjacent (e.g. when an inner copy between bridge endpoints went away).
  addPass(createW65816SpillToX());
}
|
|
|
|
/// Schedules the final pre-emission passes: one more SpillToX, then
/// BranchExpand followed by SepRepCleanup.
void W65816PassConfig::addPreEmitPass() {
  // One further SpillToX run.  By this point postrapseudos has expanded
  // physreg-COPY pseudos into real TAX/TXA opcodes, so adjacent TXA;TAX
  // pairs — invisible to the earlier SpillToX runs while still in COPY
  // form — can now be collapsed.
  addPass(createW65816SpillToX());

  // NegYIndY (currently disabled, see below) rewrites negative-Y indirect-Y
  // stack-rel ops.  It must run BEFORE BranchExpand because the rewrite
  // expands one instruction into several and therefore shifts branch
  // distances.  Internally it checks X-liveness and saves/restores X via
  // DP $E0 when SpillToX has a value parked there; without that check the
  // rewrite's TAX would clobber spill-bridged values (caught by
  // `addOff(p,i) { p[i-1] += p[i]; }` returning p[i-1] + &p[i-1] instead
  // of +b).
  //
  // The pass was a workaround for the (sr,s),Y bank-wrap on negative-Y
  // indirect-stack-rel loads.  No current code emits LDA_StackRelIndY /
  // STA_StackRelIndY: pointer-deref now goes through [$E0],Y indirect-long
  // via the LDAptr / STAptr / STBptr inserter, which forces the bank byte
  // at $E2 to 0.  The pass stays in tree but disabled — re-enable it if a
  // new code path starts emitting (sr,s),Y again.
  // addPass(createW65816NegYIndY());

  // Branch expansion comes next so that the BRA introduced for long
  // conditional branches is seen by SepRepCleanup, which can coalesce
  // SEP/REP brackets across the new bridge MBBs.  Distance estimation uses
  // TII::getInstSizeInBytes and is therefore byte-accurate; the 110-byte
  // threshold leaves margin without expanding short branches that would
  // otherwise survive as Bxx.
  addPass(createW65816BranchExpand());
  addPass(createW65816SepRepCleanup());
}
|
|
|
|
/// Creates the W65816-specific per-function info object for F, using the
/// supplied allocator and subtarget.
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  using FuncInfo = W65816MachineFunctionInfo;
  return FuncInfo::create<FuncInfo>(Allocator, F, STI);
}
|
|
|
|
bool W65816PassConfig::addInstSelector() {
|
|
addPass(createW65816ISelDag(getW65816TargetMachine(), getOptLevel()));
|
|
return false;
|
|
}
|