//===-- W65816TargetMachine.cpp - Define TargetMachine for W65816 ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the W65816 target.
//
//===----------------------------------------------------------------------===//

#include "W65816TargetMachine.h"
#include "W65816.h"
#include "W65816MachineFunctionInfo.h"
#include "TargetInfo/W65816TargetInfo.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include <optional>

using namespace llvm;

// Data layout for the 65816 lives in Triple::computeDataLayout via
// patches/0005-target-data-layout-w65816.patch.  The string is:
//   e        - little endian
//   m:e      - ELF-style symbol mangling
//   p:32:16  - 32-bit pointers (lo16 + hi-bank), 16-bit alignment
//   i16:16   - 16-bit integers aligned to 16 bits
//   i32:16   - 32-bit integers aligned to 16 bits
//   a:8      - aggregate types have 8-bit (1-byte) ABI alignment
//   n8:16    - native integer widths
//   S8       - 1-byte natural stack alignment.  JSL's 3-byte ret-addr
//              push means SP is never reliably 2-aligned inside a
//              callee; the older S16 caused SDAG to fold &buf[1] to
//              buf | 1, which breaks for odd-aligned stack locals.

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeW65816Target() { RegisterTargetMachine X(getTheW65816Target()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeW65816AsmPrinterPass(PR); initializeW65816DAGToDAGISelLegacyPass(PR); initializeW65816StackSlotCleanupPass(PR); initializeW65816ABridgeViaXPass(PR); initializeW65816WidenAcc16Pass(PR); initializeW65816SpillToXPass(PR); initializeW65816NegYIndYPass(PR); initializeW65816PreSpillCrossCallPass(PR); initializeW65816SjLjFinalizePass(PR); initializeW65816LowerWide32Pass(PR); initializeW65816I32IncFoldPass(PR); initializeW65816ImgCalleeSavePass(PR); initializeW65816NarrowI32MulPass(PR); initializeW65816PromoteFiToImgPass(PR); initializeW65816StackSlotMergePass(PR); // Default IndVarSimplify's exit-value rewriter to "never". The // closed-form replacement frequently widens an i16 induction var // expression to i64 to avoid overflow proofs, then lowers the // multiply to __muldi3. On a 16-bit target the libcall costs // dramatically more than the natural loop it replaces — sumOfSquares // shrinks from 335B (with __muldi3) to 128B (with __mulsi3 in the // loop) just by suppressing this rewrite, with no other benchmark // affected. We do this by name through the cl::opt registry so // it doesn't require patching upstream llvm-mos. 
auto &Opts = cl::getRegisteredOptions(); if (auto *Opt = Opts.lookup("replexitval")) { Opt->addOccurrence(0, "replexitval", "never"); } } static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::Static); } W65816TargetMachine::W65816TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional RM, std::optional CM, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()), Subtarget(TT, std::string(CPU), std::string(FS), *this) { initAsmInfo(); } W65816TargetMachine::~W65816TargetMachine() = default; namespace { /// W65816 Code Generator Pass Configuration Options. class W65816PassConfig : public TargetPassConfig { public: W65816PassConfig(W65816TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} W65816TargetMachine &getW65816TargetMachine() const { return getTM(); } bool addInstSelector() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; void addMachineSSAOptimization() override; void addISelPrepare() override; // Greedy at -O1+; fast at -O0/optnone. Greedy used to abort with // "Def isn't really dead" inside LiveRangeEdit::eliminateDeadDef // when InlineSpiller converted a redundant STAfi (Defs = [A]) into // a KILL pseudo while only marking explicit defs dead — leaving the // implicit-def $a live, then later trying to delete it. Patched in // tools/llvm-mos/llvm/lib/CodeGen/InlineSpiller.cpp to mark all defs // (explicit + implicit) dead. Bench wins after the switch: // popcount −19.4%, strcpy −18.9%, memcmp −8.6%, bsearch −9.2%, // fib(10) −2.6%. // // At -O0 / optnone (Optimized=false) we use FAST: greedy at -O0 // left spurious COPY pseudos that lowered to STA dp / LDA dp pairs // around modify-in-place ops (e.g. INA), miscompiling a + 1. 
FunctionPass *createTargetRegisterAllocator(bool Optimized) override { return Optimized ? createGreedyRegisterAllocator() : createFastRegisterAllocator(); } }; } // namespace TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) { return new W65816PassConfig(*this, PM); } void W65816PassConfig::addISelPrepare() { // SjLjEHPrepare ran in addPassesToHandleExceptions just before this; // our finalize pass inserts an actual setjmp at function entry + // a switch-on-call_site dispatch block, and erases the eh.sjlj.* // intrinsics our backend doesn't natively lower. Must run BEFORE // the base ISelPrepare passes so isel sees the cleaned IR. addPass(createW65816SjLjFinalize()); // IR-level peephole: narrow `mul i32 X, Y` to a __umulhisi3 call // when IR-level computeKnownBits proves the top 16 bits of both // operands are zero. Catches the sumSquares-style `(u32)i * i` // pattern that SDAG-level analysis can't see across BB boundaries. addPass(createW65816NarrowI32Mul()); TargetPassConfig::addISelPrepare(); } void W65816PassConfig::addMachineSSAOptimization() { // MachineCSE used to be disabled here because it incorrectly // eliminated "redundant" CMP instructions: P was considered // "available" but on this target P is clobbered by every // intervening LDA/STA/ADC. The principled fix is to model // Uses=[P] on Bxx (so MachineCSE sees the dep) and let the // pass run normally — that landed in W65816InstrInfo.td. TargetPassConfig::addMachineSSAOptimization(); // MachineBlockPlacement is now re-enabled. Previously disabled // because W65816InstrInfo::analyzeBranch returned unanalyzable // unconditionally; we now decode the BRA / BRL / JMP_Abs uncond // direct-branch case (see W65816InstrInfo::analyzeBranch) which is // enough to satisfy MBP's fall-through assertion. 
Conditional // branches stay opaque on purpose: their condition is encoded in // the OPCODE and the P-flag input must stay adjacent to a preceding // CMP, which BranchFolder doesn't know to preserve. } void W65816PassConfig::addPreRegAlloc() { // Decompose Wide32 vregs (i32 register pairs) into pairs of i16 vregs // BEFORE the other Acc16-targeting pre-RA passes run. Each later // pass walks Acc16/Idx16/Img16 vregs; running this first means they // see the decomposed halves uniformly. addPass(createW65816LowerWide32()); addPass(createW65816ABridgeViaX()); addPass(createW65816TiedDefSpill()); addPass(createW65816WidenAcc16()); // Pre-spill cross-call Acc16 vregs in high-call functions to // relieve greedy regalloc pressure. Currently disabled — the // first cut creates too many fresh stack slots and overflows the // stack-relative addressing range (frame > 256 bytes) on // moderately-sized functions like the soft-double routines. // The pass is built and ready, gated behind future tuning of: // - lower call-count threshold (currently 4) // - smarter "should we spill THIS vreg" filter // - stack slot reuse via a real liveness analysis // Until then, the high-pressure failure is worked around with // `__attribute__((noinline))` on the heaviest helper or with // `-mllvm -regalloc=fast` for the affected TU. // addPass(createW65816PreSpillCrossCall()); } void W65816PassConfig::addPostRegAlloc() { // FI→IMG promotion runs FIRST. It scans for high-traffic i16 // FrameIndex slots (LDAfi/STAfi/ADCfi/etc.) and rewrites them to // STA_DP/LDA_DP/ADC_DP/... pointed at free IMG8..IMG15 DP slots. // The introduced IMG8..15 references are then picked up by // ImgCalleeSave to get prologue save + epilogue restore. See // W65816PromoteFiToImg.cpp. addPass(createW65816PromoteFiToImg()); // ImgCalleeSave detects IMG8..IMG15 usage post-regalloc and inserts // prologue save + epilogue restore so those slots act as callee- // saved at the asm level. 
Fixes picol's `expr 1+2 == 4` bug: // high-pressure recursive double fns use IMG8..IMG15 as scratch but, // without this pass, expected them preserved across calls — and // callees were happy to clobber them. See W65816ImgCalleeSave.cpp. addPass(createW65816ImgCalleeSave()); // SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup // then deletes still-adjacent redundant spills. A second SpillToX // invocation collapses any TAX/TXA pair left adjacent by cleanup // (e.g. when an inner copy between bridge endpoints went away). addPass(createW65816SpillToX()); addPass(createW65816StackSlotCleanup()); addPass(createW65816SpillToX()); // Disable MachineCopyPropagation: it eliminates `COPY $img = $a` // thinking the IMG dest is dead (no explicit physreg use of $img // remains after PEI expands STAfi-with-Img16-source into LDA_DP). // The COPY actually expands to STA_DP $D0 — a memory store to a // DP slot that libcalls (softDouble, softFloat) ALSO use as their // own arg-save scratch. When MCP drops the COPY, the subsequent // LDA_DP $D0 reads stale memory. Caught by `g = g/x` Newton loop: // iter-1's saved x_ml at $D0 was never actually written, so iter-2 // read garbage. The principled fix would mark IMG-targeted COPYs // as memory-side-effecting, but TII doesn't expose that hook; // disabling MCP loses some optimization but is safe. disablePass(&llvm::MachineCopyPropagationID); } void W65816PassConfig::addPreEmitPass() { // SpillToX one more time: now that postrapseudos has expanded // physreg-COPY pseudos into the real TAX/TXA opcodes, adjacent // TXA;TAX pairs (which the earlier SpillToX invocations couldn't // see in COPY form) become collapsable. addPass(createW65816SpillToX()); // Rewrite negative-Y indirect-Y stack-rel ops. Must run BEFORE // BranchExpand because the rewrite expands one instruction into // several and shifts branch distances. 
The pass internally checks // X-liveness and saves/restores X via DP $E0 when SpillToX has // a value parked there; without that check, the rewrite's TAX // would clobber spill-bridged values (caught by `addOff(p,i) { // p[i-1] += p[i]; }` returning p[i-1] + &p[i-1] instead of +b). // W65816NegYIndY was a workaround for the (sr,s),Y bank-wrap on // negative-Y indirect-stack-rel loads. No current code emits // LDA_StackRelIndY / STA_StackRelIndY (pointer-deref now goes // through [$E0],Y indirect-long via the LDAptr / STAptr / STBptr // inserter, which forces the bank byte at $E2 to 0). Pass left // in tree but disabled — re-enable if a new code path starts // emitting (sr,s),Y again. // addPass(createW65816NegYIndY()); // Branch expansion runs after that so the BRA introduced for long // conditional branches gets seen by SepRepCleanup (which can // coalesce SEP/REP brackets across the new bridge MBBs). // Distance estimation now uses TII::getInstSizeInBytes so it's // byte-accurate; the 110-byte threshold leaves margin without // expanding short branches that would otherwise survive as Bxx. // Detect i32 += 1 patterns (LDA/ADC #1/STA/LDA/ADCE #0/STA) and // rewrite to a tighter LDA/INA/STA + INC_HI_IF_CARRY form that // skips the hi half on the no-carry path. Must run BEFORE // BranchExpand so the inserted conditional skip's distances are // covered by the branch-distance estimator. Also before // SepRepCleanup (which has the existing ADC #±1 → INA peephole) // because we deliberately KEEP ADCi16imm 1 so this pass can match // it; the subsequent SepRepCleanup will see only the residual // (non-fold-eligible) ADCi16imm cases. addPass(createW65816I32IncFold()); addPass(createW65816BranchExpand()); addPass(createW65816SepRepCleanup()); // Merge value-equivalent stack slots last. 
Runs AFTER SepRepCleanup's // PHI-copy hoist so the LDA-X ; STA-Y pair has been pulled out of // any PHP/PLP wrap — that way the stack-rel offsets on both ops are // the unbumped values and offset-based slot matching is stable. // Saves 2 inst per PHI-copy occurrence (the memory copy round-trip // collapses when X and Y are renamed to the same slot). See // W65816StackSlotMerge.cpp. addPass(createW65816StackSlotMerge()); } MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo( BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const { return W65816MachineFunctionInfo::create(Allocator, F, STI); } bool W65816PassConfig::addInstSelector() { addPass(createW65816ISelDag(getW65816TargetMachine(), getOptLevel())); return false; }