65816-llvm-mos/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
Scott Duensing da095402ec Updated
2026-06-02 23:17:57 -05:00

3914 lines
183 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//===-- W65816ISelLowering.cpp - W65816 DAG Lowering Implementation -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// SelectionDAG lowering for the W65816 target: legalization actions for
// i8/i16/i32 operations, custom lowering of compares, branches, selects,
// loads/stores and varargs, plus the Wide32 (i32 register pair) helpers.
//
//===----------------------------------------------------------------------===//
#include "W65816ISelLowering.h"
#include "W65816InstrInfo.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816SelectionDAGInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-lower"
// -w65816-loader-bank-deref (hidden, default off).
//
// Loader-compat workaround. When enabled, the LDAptr/STAptr/STBptr
// inserters take the pointer's bank byte from DP $BE (initialized by crt0
// to PHK / current PBR) rather than forcing it to 0 via STZ $E2. Pointer
// derefs then land in the user's bank — the same place DBR-relative
// absolute stores go — so data that library functions such as gmtime
// write into static buffers through DBR-relative paths is visible to
// caller-side pointer-deref reads.
//
// Cost: 2 extra bytes / 4 cycles per pointer deref (LDA dp + STA dp vs
// STZ dp). It stays off by default so size-sensitive builds (toolbox)
// remain under the $C000 IO-window ceiling.
static cl::opt<bool> LoaderBankDeref(
    "w65816-loader-bank-deref", cl::Hidden, cl::init(false),
    cl::desc("LDAptr/STAptr inserters read bank from DP $BE "
             "(set by crt0 to PHK) instead of STZ $E2. "
             "Required for GS/OS Loader compatibility; "
             "default off for size-sensitive builds."));
// -w65816-dbr-safe-ptrs (hidden, default off).
//
// Layer 2 ptr32 opt: when set, ptr32 derefs assume the pointer's bank
// byte matches DBR. Uses `lda (d,s),Y` (opcode 0xB3, stack-relative
// indirect indexed-Y) instead of staging at $E0/$E2 and using
// `lda [dp],Y` (24-bit indirect-long). Saves ~4 instructions per
// deref. Correct only for code that touches memory inside DBR's bank
// — malloc'd Lua state + globals + BSS qualify; cross-bank pointers
// (rare) do not. Caller's responsibility. Tested by hand on lapi.c.
//
// NOTE: not static -- W65816Layer2Gate.cpp reads this to stamp the
// "w65816-layer2" function attribute on every function compiled with
// Layer 2 on, so the LTO-time gate can detect mismatched TUs. Phase
// 1.12 of GAP_CLOSURE_PLAN.md.
//
// The help text is kept pure ASCII and spells the size ratio as
// "~3.4x smaller"; the previous literal pieces concatenated to the
// garbled "~3.4×much smaller".
cl::opt<bool> DbrSafePtrs(
    "w65816-dbr-safe-ptrs",
    cl::desc("ptr32 derefs use 16-bit stack-rel-indirect-Y, assuming "
             "the pointer's bank byte matches DBR. Significantly "
             "shrinks struct-field-heavy code (Lua's lapi.c: ~3.4x "
             "smaller) at the cost of safety for cross-bank "
             "pointers (which become a miscompile)."),
    cl::init(false), cl::Hidden);
// Constructor: registers the i8/i16/i32 register classes and then declares,
// operation by operation, how SelectionDAG legalization should treat each
// node on this target (Legal / Expand / Promote / LibCall / Custom).
//
// ORDER MATTERS: a later setOperationAction / setLoadExtAction /
// setTruncStoreAction call for the same (opcode, type) pair overrides an
// earlier one. Several sections below rely on that (for example the
// ptr32Active blocks), so do not reorder statements without re-checking
// every (opcode, type) pair that is set more than once.
W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
const W65816Subtarget &STI)
: TargetLowering(TM, STI) {
// Register classes for the two scalar modes. The register allocator sees
// A, X and Y as both 8-bit and 16-bit; a later REP/SEP pass is responsible
// for ensuring the dynamic mode matches the selected class.
addRegisterClass(MVT::i8, &W65816::Acc8RegClass);
addRegisterClass(MVT::i16, &W65816::Acc16RegClass);
// i32 lives in the Wide32 class (a sub_lo / sub_hi pair of i16 halves; see
// buildWide32 and the extractWide32* helpers later in this file).
addRegisterClass(MVT::i32, &W65816::Wide32RegClass);
computeRegisterProperties(STI.getRegisterInfo());
setStackPointerRegisterToSaveRestore(W65816::SP);
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent);
// GlobalAddress and ExternalSymbol: lower to W65816ISD::Wrapper so a
// tablegen pattern can fold them into instruction operands.
setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
// FrameIndex i32 has its own DAG-to-DAG path in W65816ISelDAGToDAG.cpp.
// BR_CC is custom-lowered to a CMP + W65816ISD::BR_CC chain so we can
// emit the right BEQ/BNE/BCS/BCC mnemonic per condition.
setOperationAction(ISD::BR_CC, MVT::i16, Custom);
setOperationAction(ISD::BR_CC, MVT::i8, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// BRIND (computed-goto `goto *p`, indirectbr IR) has no direct
// 65816 instruction — JMP (abs) / JMP [abs] read the target pointer
// from MEMORY, not a register. Custom-lower to: store the pointer's
// 16-bit low half (offset within the program's PBR-pinned code bank)
// to $00B8 (the __indirTarget DP slot already reserved for indirect
// calls — see libgcc.s), then emit a `JMP ($00B8)` via the BRIND
// pseudo. Single-bank assumption on the target's code: same as
// every other JMP/BRA in our codegen.
//
// The ptr is i32 under p:32:16 (current default) — extract sub_lo.
// Under p:16 (legacy ptr16), it's already i16.
setOperationAction(ISD::BRIND, MVT::Other, Custom);
// SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC
// pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter
// expands into a Bxx + diamond CFG + PHI. SETCC funnels through the
// same path with TVal=1 / FVal=0. SELECT (no condition operand) is
// expanded to SELECT_CC by the legalizer using SETNE against zero.
setOperationAction(ISD::SETCC, MVT::i16, Custom);
setOperationAction(ISD::SETCC, MVT::i8, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
setOperationAction(ISD::SELECT, MVT::i16, Expand);
setOperationAction(ISD::SELECT, MVT::i8, Expand);
// 65816 has no inline sign-extend instruction; synthesize i8 -> i16
// via a bit-7 test and SELECT_CC (see LowerSignExtend).
// NOTE(review): the SEXTLOAD comment further down describes
// LowerSignExtend as a branchless AND/EOR/SEC/SBC sequence instead —
// one of the two descriptions is out of date; confirm against
// LowerSignExtend.
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);
// BSWAP: no native byte-swap instruction (XBA swaps the two halves
// of the 16-bit accumulator only when in 8-bit M mode, hard to
// exploit cleanly). Lower to shifts + ORs via the generic Expand
// path — SDAG turns `bswap(i32)` into four byte extracts ORed back
// together, which our existing patterns handle. Required for
// portable C that constructs a big-endian word from byte loads:
// `((u32)b[0] << 24) | ((u32)b[1] << 16) | ((u32)b[2] << 8) | b[3]`
// (SHA-256 message-schedule, JPEG/PNG headers, etc.).
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
// We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare
// LDA for the anyext case). No native sextload; mark it Expand so
// LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`,
// which then flows through LowerSignExtend's branchless
// sequence (AND #$00FF; EOR #$0080; SEC; SBC #$0080).
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
// GlobalOpt sometimes narrows a `short` global to `i1` when it sees
// every assignment is 0 or 1. Custom-lower so LowerLoad rewrites
// `zext/sext/anyext from i1` into a plain byte load + appropriate
// mask. Both i16 and i8 result widths can appear, depending on
// whether the consumer wants the value as `short` or `bool`.
for (MVT ResVT : {MVT::i8, MVT::i16}) {
setLoadExtAction(ISD::ZEXTLOAD, ResVT, MVT::i1, Custom);
setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i1, Custom);
setLoadExtAction(ISD::EXTLOAD, ResVT, MVT::i1, Custom);
}
// Only register i32 ext-load / trunc-store and Custom actions when
// i32 is actually a legal type (ptr32 mode active). Otherwise the
// Custom-action calls intercept i16/i8 ops, and LowerTruncate's
// SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same
// root cause as the earlier LOAD-Custom-breaks-LDAptr issue).
// NOTE(review): the i32 register class is added unconditionally at the
// top of this constructor, so this flag looks like it is always true
// here — confirm whether a ptr16 configuration is still reachable.
bool ptr32Active = isTypeLegal(MVT::i32);
if (ptr32Active) {
for (MVT MemVT : {MVT::i8, MVT::i16}) {
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand);
setTruncStoreAction(MVT::i32, MemVT, Expand);
}
// Truncating byte stores (`s->c = (char)v`) land as TRUNCSTORE
// i16->i8 in SDAG after combiner canonicalization. Custom-route
// through LowerStore so the ptr-offset peel fires for them too.
setTruncStoreAction(MVT::i16, MVT::i8, Custom);
}
// Vararg support: VASTART writes the address of the first vararg slot
// to the va_list pointer. VACOPY/VAEND use the default expansions.
// This makes <stdarg.h>-style functions (e.g. printf-likes) compile
// cleanly.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
// Custom VAARG so we DON'T align the va_list pointer. The default
// expansion rounds up to the type's preferred alignment (S16 = 2),
// but caller-pushed args land at PHA's resulting odd S+1 address.
// Aligning would skip the low byte and read garbage.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// C++ exceptions (SJLJ model) — clang lowers exception machinery into
// these intrinsics via SjLjEHPrepare. We don't have native handling
// for any of them on this target; setjmp / longjmp are marked Expand
// so LegalizeDAG falls back to its no-op stubs (setjmp returns 0,
// longjmp is a no-op). The actual EH semantics
// are provided at runtime by libcxxabi (__cxa_throw etc.) calling
// _Unwind_SjLj_RaiseException, which in turn longjmps via the
// function context the prologue prepared. See
// runtime/src/libcxxabiSjlj.c for the runtime side.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Expand);
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i16, Expand);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Expand);
// NOTE(review): unlike the two nodes above, SETUP_DISPATCH is Custom,
// not Expand. Presumably the custom handler implements the chain
// pass-through — confirm in LowerOperation.
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
// SJLJ exception lowering uses FRAMEADDR(0) to read the current frame
// pointer. We don't reserve a frame pointer in general; return the
// entry-SP-equivalent value (current SP read via TSC) — good enough
// for SJLJ's purpose of identifying the call frame.
setOperationAction(ISD::FRAMEADDR, MVT::i16, Custom);
setOperationAction(ISD::FRAMEADDR, MVT::i32, Custom);
// stacksave / stackrestore — used by SjLjEHPrepare to save/restore SP
// around invoke calls. The default Expand does CopyFromReg/$SP, which
// fails because SP has no register class. The jmp_buf already captures
// SP via TSC in our setjmp implementation, and setjmp/longjmp manage
// SP directly via TSC/TCS, so these values are not needed for
// restoration on our target. Custom-lower stacksave to a constant 0
// and stackrestore to a chain pass-through (no-op).
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
// FRAMEADDR is set Custom above for SJLJ; don't set it Expand here
// (the second setOperationAction would override the first).
setOperationAction(ISD::RETURNADDR, MVT::i16, Expand);
// W65816 pointers are i32; legalizer queries the action for the pointer
// type, so register Expand for i32 too. Without this,
// __builtin_return_address(0) ICEs in LowerOperation (no Custom handler
// for RETURNADDR).
setOperationAction(ISD::RETURNADDR, MVT::i32, Expand);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i16, Expand);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i16, Expand);
// ISD::TRAP — __builtin_trap(), -fsanitize-trap=undefined. Default
// expansion is a libcall to abort(); UBSan-min wants a BRK with a
// pickup sentinel instead so the trap site is identifiable from a
// memory dump without a working stdio path. Custom-lower to a
// W65816ISD::TRAP target node; the InstrInfo.td pattern routes it
// to BRK_pseudo, whose AsmPrinter expansion writes 0xBE to $70 and
// then issues BRK + a self-loop (headless MAME mis-vectors BRK, so
// the spin is what actually halts).
setOperationAction(ISD::TRAP, MVT::Other, Custom);
// DEBUGTRAP follows the same shape — same node, same expansion.
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
// The 65816 has no hardware multiplier or divider. Multiply by a
// power-of-two constant is auto-rewritten to shifts by the DAG
// combiner; arbitrary multiply / divide / mod go through libcalls
// (`__mulhi3` for i16 multiply etc.). The libcall expander emits a
// standard CALL node which flows through LowerCall, so multi-arg
// call lowering must be working first (it is, see task #26).
setOperationAction(ISD::MULHU, MVT::i16, Expand);
setOperationAction(ISD::MULHS, MVT::i16, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::MUL, MVT::i16, LibCall);
// i8 multiply / mulh / div / rem: SDAG narrows e.g. `x / 10` to
// `mulhu i8 x, -51` + shift when it proves operands fit in i8.
// The 65816 has no native 8-bit multiplier; route everything
// through the 16-bit libcalls by Promoting i8 ops to i16.
setOperationAction(ISD::MUL, MVT::i8, Promote);
setOperationAction(ISD::MULHU, MVT::i8, Promote);
setOperationAction(ISD::MULHS, MVT::i8, Promote);
setOperationAction(ISD::SDIV, MVT::i8, Promote);
setOperationAction(ISD::UDIV, MVT::i8, Promote);
setOperationAction(ISD::SREM, MVT::i8, Promote);
setOperationAction(ISD::UREM, MVT::i8, Promote);
setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
// CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the
// type legalizer rewrite into a sequence of basic ops. Without
// this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
// or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
// Select" at isel.
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::SREM, MVT::i16, LibCall);
setOperationAction(ISD::UREM, MVT::i16, LibCall);
setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
// Variable-amount and large-constant shifts. We have inline
// patterns for shift-by-1..4; everything else goes through
// __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom
// lets us return SDValue() for the fast cases and route everything
// else through the libcall lowering helper.
setOperationAction(ISD::SHL, MVT::i16, Custom);
setOperationAction(ISD::SRL, MVT::i16, Custom);
setOperationAction(ISD::SRA, MVT::i16, Custom);
// i8 shifts go through Custom too — LowerShift detects the i8 result
// and routes through trunc(i16-shift(zext_or_sext(lhs), amount)).
// Avoids needing a parallel set of qi3 libcalls.
setOperationAction(ISD::SHL, MVT::i8, Custom);
setOperationAction(ISD::SRL, MVT::i8, Custom);
setOperationAction(ISD::SRA, MVT::i8, Custom);
// LOAD / STORE Custom-lowering is intentionally NOT wired here for
// ptr16 mode. Setting LOAD Custom and returning
// SDValue() from LowerLoad short-circuits the i16-result LDAptr/
// STAptr selection paths (the Custom→empty→Legal fall-through doesn't
// re-enter pattern matching).
// NOTE(review): an earlier version of this comment called LowerLoad /
// LowerStore "currently dead code" and said ptr32 still needed a gating
// mechanism. The ptr32Active block further down now sets LOAD/STORE
// Custom for i8/i16/i32, so that statement appears stale — confirm.
// ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying
// the carry/borrow flag between the two halves of a multi-precision add or
// sub. Setting them Legal triggers the type legalizer's carry-chain split
// for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions)
// instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions).
// The matching tablegen pseudos add Defs/Uses on the P register, which
// tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically.
setOperationAction(ISD::ADDC, MVT::i16, Legal);
setOperationAction(ISD::ADDE, MVT::i16, Legal);
setOperationAction(ISD::SUBC, MVT::i16, Legal);
setOperationAction(ISD::SUBE, MVT::i16, Legal);
// i32 (long). Type legalization splits i32 into two i16 halves; with
// ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain.
// AND/OR/XOR split cleanly into per-half ops with no carry to track.
// Multiply/divide/shift go through libcall stubs whose
// implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS
// / SRA_PARTS are the SDNodes the type legalizer emits when splitting
// a variable-amount shift; without an action they get "Cannot select".
// LibCall on the parent node routes the whole shift through one
// __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and
// simpler than implementing a 32-bit shift in 65816 assembly inline.
// NOTE(review): this paragraph describes i32 as type-legalized (split),
// but i32 has a register class above and ADD/SUB/AND/OR/XOR/shifts are
// set Custom in the ptr32Active block below — confirm which description
// still applies.
for (MVT VT : {MVT::i32}) {
// MUL i32 is Custom-lowered: the typical fall-through libcall is
// __mulsi3 (32x32 -> 32), but when both operands are ZEXT from i16
// we can emit __umulhisi3 (16x16 -> 32) instead. Saves ~60 cyc per
// call on the `(unsigned long)i * i` pattern — see LowerMUL_I32.
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SDIV, VT, LibCall);
setOperationAction(ISD::UDIV, VT, LibCall);
setOperationAction(ISD::SREM, VT, LibCall);
setOperationAction(ISD::UREM, VT, LibCall);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
// i32 shifts route through a libcall via the
// preferredShiftLegalizationStrategy override (see header). No
// explicit SHL/SHL_PARTS action is set in this loop.
// NOTE(review): SHL/SRL/SRA i32 ARE set Custom in the ptr32Active
// block below, so the "no explicit action" claim only holds when that
// block is skipped.
}
// i64 shifts — route to libcall before the type legalizer tries
// to split via the next-legal-type (which becomes i32 in ptr32 mode
// and triggers a SDAG combine loop on `i64 >> K` patterns). By
// marking SHL/SRL/SRA i64 LibCall here, the operation legalizer
// picks up the libcall path even though i64 itself is illegal.
for (MVT VT : {MVT::i64}) {
setOperationAction(ISD::SHL, VT, LibCall);
setOperationAction(ISD::SRL, VT, LibCall);
setOperationAction(ISD::SRA, VT, LibCall);
}
// ptr32-only Custom actions. i32 arithmetic, logic, shifts, extends,
// truncates, loads/stores, compares, selects and constants all funnel
// into LowerOperation when i32 is legal.
if (ptr32Active) {
for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR})
setOperationAction(Op, MVT::i32, Custom);
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom);
// SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16:
// the combiner emits this for `(int32_t)((int8_t)x)` and for
// `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No
// tablegen pattern covers the i32 form; Custom-lower to per-half
// ops. IMPORTANT: LegalizeDAG looks up the action for
// SIGN_EXTEND_INREG using the INNER VT (the operand value type),
// not the result VT. See LegalizeDAG.cpp:
// Action = TLI.getOperationAction(Op, InnerType);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
// Also Custom for i16/i8 LOAD/STORE in ptr32 mode so LowerLoad/
// LowerStore can fold Wide32(Wrapper, WrapperBank) of the same
// global (or a raw GlobalAddress) to a plain abs-16 access
// (DBR-relative). Without this, every `g` access for a
// same-segment global goes through the 14-byte [dp],y
// indirect-long path even though the bank is implicit in DBR.
setOperationAction(ISD::STORE, MVT::i16, Custom);
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::LOAD, MVT::i16, Custom);
setOperationAction(ISD::LOAD, MVT::i8, Custom);
// ZEXTLOAD i16-from-i8 also Custom — the DAG combiner folds
// (zext (load i8 @g)) into one zextload SDNode, so we need to
// apply the same global-address fold there. SEXTLOAD i8 already
// has an Expand action from the earlier setLoadExtAction loop;
// leave it alone (Custom would require parallel
// tablegen patterns we don't have).
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, MVT::i8, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::Constant, MVT::i32, Custom);
}
// Disable jump tables. A long if-else chain compiles fine without
// them. Setting the threshold to UINT_MAX makes LLVM never form a
// jump table.
// NOTE(review): the original rationale here was that jump tables need
// BRIND "which we don't have", but BRIND is Custom-lowered earlier in
// this constructor. The reason for keeping jump tables off may need
// restating (BR_JT is still Expand).
setMinimumJumpTableEntries(UINT_MAX);
// Variable-length arrays / dynamic stack allocation. Lowered to
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
// allocated region. Limitation: this shifts SP, so any FrameIndex
// accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
// (we have no frame pointer). Suitable for the common pattern
// "alloca; initialise; pass; return"; complex VLA use mixed with
// local-variable access across the alloca will miscompile. A real
// FP (DP slot or X-as-FP) would lift this restriction.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
if (ptr32Active)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
// Historical, DISABLED combines (kept for reference only):
// - LOAD opt-in for the address-select reverse combine (see
// W65816TargetLowering::PerformDAGCombine), disabled while
// bisecting the "pickif" hang:
// setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
// - SHL combine, disabled while debugging the ptr32 i64-phi hang:
// setTargetDAGCombine(ISD::SHL);
//
// ACTIVE combines: STORE / LOAD with const-int i32 pointer are combined
// to a form that survives LowerI32Constant (which would otherwise split
// the ptr into a Wide32 reg pair and lose the const-addr fast path).
// See PerformDAGCombine.
// NOTE(review): this re-enables PerformDAGCombine for every LOAD node,
// including the address-select path disabled above unless
// PerformDAGCombine itself gates it — confirm.
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::LOAD);
}
// Translate an ISD condition code into the W65816 branch condition that
// can be tested directly after a CMP.
//
//   SETEQ  -> COND_EQ      SETNE  -> COND_NE
//   SETUGE -> COND_HS      SETULT -> COND_LO
//   SETGE  -> COND_PL      SETLT  -> COND_MI
//
// Every other code (SETGT, SETLE, SETUGT, SETULE, ...) has no single-branch
// form and yields COND_INVALID; normalizeCC rewrites or swaps operands to
// reach one of the codes above, or falls back to a multi-branch code.
//
// COND_MI / COND_PL look only at the sign of the compare result, so they
// are not overflow-safe for signed compares; normalizeCC converts i16
// signed compares to unsigned ones before calling this for that reason.
static W65816CC::CondCode mapCC(ISD::CondCode CC) {
  if (CC == ISD::SETEQ)
    return W65816CC::COND_EQ;
  if (CC == ISD::SETNE)
    return W65816CC::COND_NE;
  if (CC == ISD::SETUGE)
    return W65816CC::COND_HS;
  if (CC == ISD::SETULT)
    return W65816CC::COND_LO;
  if (CC == ISD::SETLT)
    return W65816CC::COND_MI;
  if (CC == ISD::SETGE)
    return W65816CC::COND_PL;
  return W65816CC::COND_INVALID;
}
// Widen an i8 compare to i16 so the i16 CMP path can handle it. Does
// nothing unless LHS is i8 (RHS is assumed to have the same type).
//
// The extension kind follows the condition code: signed codes
// (SETLT/SETLE/SETGT/SETGE) sign-extend, everything else (unsigned and
// eq/ne) zero-extends. Picking the wrong one would change the answer:
// -1i8 sign-extends to 0xFFFF, which is < 1 signed, but zero-extends to
// 0x00FF, which is > 1.
static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
                         SelectionDAG &DAG, const SDLoc &DL) {
  if (LHS.getValueType() != MVT::i8)
    return;

  const bool IsSignedCC = CC == ISD::SETLT || CC == ISD::SETLE ||
                          CC == ISD::SETGT || CC == ISD::SETGE;
  const unsigned ExtOpc = IsSignedCC ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  LHS = DAG.getNode(ExtOpc, DL, MVT::i16, LHS);
  RHS = DAG.getNode(ExtOpc, DL, MVT::i16, RHS);
}
// Normalize a (LHS, RHS, CC) triple so the result is something we can
// emit with one CMP + Bxx. LHS, RHS and CC are all updated in place.
//
// Returns the W65816 condition code to branch on. When no single-branch
// form exists the result is one of the multi-branch codes (COND_*_MB), or
// COND_INVALID if CC is not handled at all.
//
// Steps, in order:
//   1. i8 operands are widened to i16 (promoteI8Cmp).
//   2. A constant on the LHS is moved to the RHS (CC swapped to match).
//   3. i16 signed compares become unsigned compares on sign-flipped
//      operands.
//   4. `x <= C` / `x > C` against a constant become `x < C+1` /
//      `x >= C+1` when C+1 does not wrap.
//   5. If still unmappable, try swapping operands; failing that, pick a
//      multi-branch code.
static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
                                      ISD::CondCode &CC, SelectionDAG &DAG,
                                      const SDLoc &DL) {
  promoteI8Cmp(LHS, RHS, CC, DAG, DL);

  // CMP wants the comparand (constant or memory) on the right. If a DAG
  // pre-pass put the constant on the left, swap and flip the condition.
  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
    std::swap(LHS, RHS);
    CC = ISD::getSetCCSwappedOperands(CC);
  }

  // Signed compare via "EOR with sign bit then unsigned compare":
  //   a < b (signed)  iff  (a ^ 0x8000) < (b ^ 0x8000) (unsigned)
  // The XOR flips the sign bit, which converts signed-int ordering to
  // unsigned-int ordering on the same bits. This avoids the WDC's
  // missing "BLT signed" — BMI/BPL alone read the sign of (a-b)
  // without the V-flag overflow correction, giving wrong results
  // when the subtraction overflows (e.g., INT16_MIN < 1 produced
  // false because (-32768 - 1) = +32767 has N=0). After the EOR
  // transform we use BCC/BCS which depend on the carry from CMP and
  // don't suffer overflow corruption.
  //
  // Cost: 1 EOR per operand (3 bytes each in M=16) — comparable to
  // the V-aware multi-branch sequence (5+ bytes of branches), but
  // happens at SDAG time so subsequent SDAG combining can fold
  // EORs against constants or already-EOR'd values.
  bool SignedCmp = (CC == ISD::SETLT || CC == ISD::SETLE ||
                    CC == ISD::SETGT || CC == ISD::SETGE);
  if (SignedCmp && LHS.getValueType() == MVT::i16) {
    EVT VT = LHS.getValueType();
    SDValue Mask = DAG.getConstant(0x8000, DL, VT);
    LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, Mask);
    RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, Mask);
    switch (CC) {
    case ISD::SETLT: CC = ISD::SETULT; break;
    case ISD::SETLE: CC = ISD::SETULE; break;
    case ISD::SETGT: CC = ISD::SETUGT; break;
    case ISD::SETGE: CC = ISD::SETUGE; break;
    default: break;
    }
  }

  // Rewrite SETULE / SETUGT to SETULT / SETUGE against constant + 1.
  // (SETLE / SETGT have already been converted to their unsigned
  // counterparts above for i16; this handles original SETULE/SETUGT
  // and the post-transform SETULE/SETUGT.) Keeps the variable on the
  // LHS and lets us use BCS / BCC natively.
  //
  // The "+ 1" is done on the constant's own APInt, so the "would it
  // wrap?" guard and the new constant always match the operand's real bit
  // width. For i16 this is exactly the old `UV < 0xffff` / `V < 0x7fff`
  // test; for any other width the old hard-coded 16-bit mask would have
  // produced a wrong constant.
  if (auto *RhsConst = dyn_cast<ConstantSDNode>(RHS)) {
    const auto &RhsVal = RhsConst->getAPIntValue();
    EVT RhsVT = RHS.getValueType();
    const bool CanBumpUnsigned = !RhsVal.isMaxValue();
    const bool CanBumpSigned = !RhsVal.isMaxSignedValue();
    if (CC == ISD::SETULE && CanBumpUnsigned) {
      RHS = DAG.getConstant(RhsVal + 1, DL, RhsVT);
      CC = ISD::SETULT;
    } else if (CC == ISD::SETUGT && CanBumpUnsigned) {
      RHS = DAG.getConstant(RhsVal + 1, DL, RhsVT);
      CC = ISD::SETUGE;
    } else if (CC == ISD::SETLE && CanBumpSigned) {
      // Reachable only when the signed->unsigned transform above was
      // skipped, i.e. the operands are not i16.
      RHS = DAG.getConstant(RhsVal + 1, DL, RhsVT);
      CC = ISD::SETLT;
    } else if (CC == ISD::SETGT && CanBumpSigned) {
      RHS = DAG.getConstant(RhsVal + 1, DL, RhsVT);
      CC = ISD::SETGE;
    }
  }

  W65816CC::CondCode TCC = mapCC(CC);
  if (TCC == W65816CC::COND_INVALID) {
    // Try swapping operands first — preferable since it leaves us with
    // a single-Bxx form. But reject the swap if it would put a load on
    // the LHS (we can't pattern-match cmp(load,reg) without spilling A).
    bool RhsIsLoad = isa<LoadSDNode>(RHS.getNode());
    bool LhsIsLoad = isa<LoadSDNode>(LHS.getNode());
    bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad;
    if (!SwapWouldHurt) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
      TCC = mapCC(CC);
    }
  }

  // Final fallback: GT/LE/UGT/ULE without a useful swap target. Use a
  // multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it
  // into a 3-BB diamond. Only valid for SELECT_CC, not for BR_CC —
  // LowerBR_CC re-routes those through SETCC + BR_CC NE.
  if (TCC == W65816CC::COND_INVALID) {
    switch (CC) {
    case ISD::SETGT:  TCC = W65816CC::COND_GT_MB; break;
    case ISD::SETLE:  TCC = W65816CC::COND_LE_MB; break;
    case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break;
    case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break;
    default: break;
    }
  }
  return TCC;
}
// Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/
// I32Bin/BR_CC to construct or destructure i32 SDValues across the
// sub_lo / sub_hi halves of the Wide32 register class.
//
// buildWide32 assembles an i32 value from two halves by emitting
//   REG_SEQUENCE(Wide32RegClassID, Lo, sub_lo, Hi, sub_hi)
// and returning result 0 of that machine node.
static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL,
                           SDValue Lo, SDValue Hi) {
  // Operand layout expected by REG_SEQUENCE: the register class first,
  // then (value, sub-register index) pairs.
  SDValue SeqOps[] = {
      DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32),
      Lo,
      DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32),
      Hi,
      DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32)};
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32, SeqOps),
      0);
}
// Peek through a REG_SEQUENCE produced by buildWide32(Lo, Hi), i.e.
//   REG_SEQUENCE(RC, Lo, sub_lo, Hi, sub_hi).
// If X is such a machine node and one of its (value, sub-index) pairs has
// sub-index WantSub, return that value operand; otherwise return an empty
// SDValue.
//
// This lets callers avoid a TargetExtractSubreg that would re-enter the
// SDAG combiner and re-build the i32 constant / pair, looping forever
// (observed as OOM in the combiner on `*t = 0`).
static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) {
  const bool IsRegSeq = X.getNode() && X.isMachineOpcode() &&
                        X.getMachineOpcode() == TargetOpcode::REG_SEQUENCE;
  if (!IsRegSeq)
    return SDValue();

  // Operand 0 is the register class; pairs start at operand 1.
  const unsigned NumOps = X.getNumOperands();
  for (unsigned ValIdx = 1; ValIdx + 1 < NumOps; ValIdx += 2) {
    // Pairs whose index operand is not a constant are skipped.
    if (auto *SubIdxC = dyn_cast<ConstantSDNode>(X.getOperand(ValIdx + 1)))
      if (SubIdxC->getZExtValue() == WantSub)
        return X.getOperand(ValIdx);
  }
  return SDValue();
}
// Return the low 16-bit half (sub_lo) of the i32 value X as an i16
// SDValue. Special cases are tried in this order:
//   - Constant: the low 16 bits are materialised directly as an i16
//     constant. getTargetExtractSubreg on a Constant SDNode produces a
//     malformed MachineSDNode (constants don't carry sub-regs) and
//     triggers SDAG combine loops downstream.
//   - GlobalAddress / ExternalSymbol not yet lowered (these reach here
//     when store-lowering runs before LowerOperation has split them into
//     a Wide32 pair): emit a fresh W65816ISD::Wrapper around the i16
//     target address; an extract-subreg on a non-register node would be
//     malformed.
//   - REG_SEQUENCE built by buildWide32: return its sub_lo operand.
//   - Anything else: a real TargetExtractSubreg(sub_lo).
static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  if (auto *Imm = dyn_cast<ConstantSDNode>(X))
    return DAG.getConstant(Imm->getZExtValue() & 0xFFFFu, DL, MVT::i16);

  if (auto *Global = dyn_cast<GlobalAddressSDNode>(X)) {
    SDValue Target = DAG.getTargetGlobalAddress(Global->getGlobal(), DL,
                                                MVT::i16, Global->getOffset());
    return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Target);
  }

  if (auto *Sym = dyn_cast<ExternalSymbolSDNode>(X)) {
    SDValue Target = DAG.getTargetExternalSymbol(Sym->getSymbol(), MVT::i16);
    return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Target);
  }

  SDValue FromRegSeq = lookThroughRegSeq(X, llvm::sub_lo);
  if (!FromRegSeq)
    return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X);
  return FromRegSeq;
}
// Return the high 16 bits (sub_hi half) of the i32 value X as an i16 node.
static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  // Constants: shift the upper half down and emit it as an i16 immediate.
  if (auto *Imm = dyn_cast<ConstantSDNode>(X))
    return DAG.getConstant((Imm->getZExtValue() >> 16) & 0xFFFFu, DL,
                           MVT::i16);

  // Symbolic addresses: the high half is the WrapperBank of the same
  // target symbol node that the low half wraps with Wrapper.
  SDValue TargetSym;
  if (auto *Global = dyn_cast<GlobalAddressSDNode>(X))
    TargetSym = DAG.getTargetGlobalAddress(Global->getGlobal(), DL, MVT::i16,
                                           Global->getOffset());
  else if (auto *Extern = dyn_cast<ExternalSymbolSDNode>(X))
    TargetSym = DAG.getTargetExternalSymbol(Extern->getSymbol(), MVT::i16);
  if (TargetSym)
    return DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, TargetSym);

  // Read the half straight out of a REG_SEQUENCE when possible, otherwise
  // emit a sub-register extract.
  SDValue Direct = lookThroughRegSeq(X, llvm::sub_hi);
  return Direct ? Direct
                : DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X);
}
// Try to split an i32 pointer into (base pointer, constant 16-bit offset)
// so the offset can ride in the Y register of a `[dp],Y` access.
//
// Two shapes are recognised:
//   1. `ISD::ADD(Base, i32 const)` with the constant on either side. This
//      is the pre-LowerI32Bin form; per the original note, LowerLoad runs
//      before LowerI32Bin in legalization order, so the ADD is still
//      visible as an ISD::ADD when Ptr is inspected.
//   2. `REG_SEQUENCE(ADDC(BaseLo, KLo), sub_lo,
//                    ADDE(BaseHi, 0, carry), sub_hi)`, the form LowerI32Bin
//      produces for `(add Wide32, const)` when the constant's high half is
//      zero. May not occur in practice given shape 1, kept for robustness.
//
// On success returns true with OutOff = K (1..0xFFFF) and OutBase = the
// ADD's non-constant operand (shape 1) or buildWide32(BaseLo, BaseHi)
// (shape 2). On failure returns false and leaves both outputs untouched.
//
// The bank-byte carry-in is intentionally dropped: the `[dp],Y` deref
// adds Y to the 24-bit pointer without propagating beyond 16 bits.
// Caller's responsibility that the target object doesn't span a bank.
static bool peelPtr32Offset(SelectionDAG &DAG, SDLoc DL, SDValue Ptr,
                            SDValue &OutBase, uint16_t &OutOff) {
  // Reject an empty SDValue before anything dereferences the node; the
  // type and opcode queries below all go through Ptr.getNode().
  if (!Ptr.getNode())
    return false;
  if (Ptr.getValueType() != MVT::i32)
    return false;

  // Shape 1: plain ISD::ADD with a constant operand.
  if (Ptr.getOpcode() == ISD::ADD) {
    SDValue L = Ptr.getOperand(0);
    SDValue R = Ptr.getOperand(1);
    auto *KC = dyn_cast<ConstantSDNode>(R);
    if (!KC) {
      // Constant may sit on the left; the base is then the right operand.
      KC = dyn_cast<ConstantSDNode>(L);
      if (!KC)
        return false;
      L = R;
    }
    const uint64_t K = KC->getZExtValue();
    // Zero offsets gain nothing; anything above 0xFFFF does not fit in Y.
    if (K == 0 || K > 0xFFFFu)
      return false;
    OutOff = static_cast<uint16_t>(K);
    OutBase = L;
    return true;
  }

  // Shape 2: REG_SEQUENCE of an ADDC/ADDE carry chain.
  if (!Ptr.isMachineOpcode() ||
      Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
    return false;
  SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
  SDValue Hi = lookThroughRegSeq(Ptr, llvm::sub_hi);
  if (!Lo || !Hi)
    return false;
  if (Lo.getOpcode() != ISD::ADDC || Hi.getOpcode() != ISD::ADDE)
    return false;
  // The high add must consume the carry produced by this exact low add.
  if (Hi.getOperand(2) != Lo.getValue(1))
    return false;
  auto *KLo = dyn_cast<ConstantSDNode>(Lo.getOperand(1));
  auto *KHi = dyn_cast<ConstantSDNode>(Hi.getOperand(1));
  if (!KLo || !KHi)
    return false;
  if (KHi->getZExtValue() != 0)
    return false;
  const uint64_t K = KLo->getZExtValue() & 0xFFFFu;
  if (K == 0)
    return false;
  OutOff = static_cast<uint16_t>(K);
  OutBase = buildWide32(DAG, DL, Lo.getOperand(0), Hi.getOperand(0));
  return true;
}
// Custom lowering for ISD::BR_CC, whose operands are (chain, condcode, lhs,
// rhs, dest). i32 compares are decomposed into i16 half compares feeding a
// branch-on-nonzero; narrower compares map onto a W65816 condition code and
// a glued CMP, or are rerouted through SETCC for multi-branch conditions.
SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc DL(Op);
  const EVT CmpVT = LHS.getValueType();

  // Emit `br_cc setne Flag, 0, Dest`, i.e. branch when the boolean is set.
  auto BranchIfSet = [&](SDValue Flag) {
    SDValue Zero = DAG.getConstant(0, DL, Flag.getValueType());
    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
                       DAG.getCondCode(ISD::SETNE), Flag, Zero, Dest);
  };

  // i32 BR_CC: synthesize an i16 boolean from per-half compares, then
  // branch on (bool != 0). This avoids the legalizer's generic Expand,
  // which re-enters the SETCC/BR_CC custom paths in an infinite loop.
  if (CmpVT == MVT::i32) {
    SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
    SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
    SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
    SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);

    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
      // Fast path: i32 ==/!= 0 becomes (lo | hi) cmp 0, so the branch tests
      // the OR directly. Saves two i16 setcc materializations, an AND and
      // (for NE) an XOR. Hot in `while (x)` and i32-counter loop tests.
      const auto *RhsImm = dyn_cast<ConstantSDNode>(RHS);
      if (RhsImm && RhsImm->isZero()) {
        SDValue AnyBit = DAG.getNode(ISD::OR, DL, MVT::i16, LhsLo, LhsHi);
        SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
        return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
                           DAG.getCondCode(CC), AnyBit, Zero16, Dest);
      }

      // General equality: both halves equal; NE flips the 0/1 result.
      SDValue LoSame = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, ISD::SETEQ);
      SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
      SDValue Flag = DAG.getNode(ISD::AND, DL, MVT::i16, LoSame, HiSame);
      if (CC == ISD::SETNE) {
        SDValue One = DAG.getConstant(1, DL, MVT::i16);
        Flag = DAG.getNode(ISD::XOR, DL, MVT::i16, Flag, One);
      }
      return BranchIfSet(Flag);
    }

    // Ordered compare:
    //   (a CC b) = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b)
    // HiStrict is the strict form of CC (LE -> LT etc.) so that the
    // tie-breaker term handles equality of the high halves. LoCC is always
    // the unsigned form because only the high half carries the sign.
    ISD::CondCode HiStrict, LoUnsigned;
    auto Pick = [&](ISD::CondCode HiPred, ISD::CondCode LoPred) {
      HiStrict = HiPred;
      LoUnsigned = LoPred;
    };
    switch (CC) {
    case ISD::SETLT:  Pick(ISD::SETLT,  ISD::SETULT); break;
    case ISD::SETLE:  Pick(ISD::SETLT,  ISD::SETULE); break;
    case ISD::SETGT:  Pick(ISD::SETGT,  ISD::SETUGT); break;
    case ISD::SETGE:  Pick(ISD::SETGT,  ISD::SETUGE); break;
    case ISD::SETULT: Pick(ISD::SETULT, ISD::SETULT); break;
    case ISD::SETULE: Pick(ISD::SETULT, ISD::SETULE); break;
    case ISD::SETUGT: Pick(ISD::SETUGT, ISD::SETUGT); break;
    case ISD::SETUGE: Pick(ISD::SETUGT, ISD::SETUGE); break;
    default:
      report_fatal_error("W65816: unexpected i32 BR_CC condition");
    }
    SDValue HiWins = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, HiStrict);
    SDValue HiTied = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
    SDValue LoWins = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, LoUnsigned);
    SDValue TieBreak = DAG.getNode(ISD::AND, DL, MVT::i16, HiTied, LoWins);
    return BranchIfSet(DAG.getNode(ISD::OR, DL, MVT::i16, HiWins, TieBreak));
  }

  const W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: branch condition not yet implemented");

  // Multi-branch CCs only have inserter support via SELECT_CC16. For
  // BR_CC, reroute through SETCC: materialise the boolean to A, then
  // branch on NE-vs-zero. One extra LDA but always works.
  if (TCC >= W65816CC::COND_GT_MB) {
    SDValue Flag = DAG.getNode(ISD::SETCC, DL, CmpVT, LHS, RHS,
                               DAG.getCondCode(CC));
    return BranchIfSet(Flag);
  }

  // Single-branch condition: glued compare feeding the target branch node.
  SDValue Flags = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
  return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CondOp,
                     Flags);
}
// LowerBRIND: custom lowering for `brind (chain, target_ptr)`.
//
// Computed-goto / IR `indirectbr` lowers to BRIND with a pointer-typed
// target. Under p:32:16 (default datalayout) that pointer is i32, so the
// generic legalizer's "Cannot select brind" path fires unless we step in.
//
// Strategy (mirrors __jsl_indir's mechanism):
//   1. Reduce the target to i16. For an i32 (Wide32) target only sub_lo is
//      kept: JMP (abs) keeps the current PBR, so only the 16-bit in-bank
//      offset matters.
//   2. Store that i16 to constant address $00B8, the shared __indirTarget
//      DP slot. It is pinned at $00B8 so JMP (abs)'s bank-0 vector fetch
//      reads it regardless of DBR / segment placement (see libgcc.s for
//      the full rationale).
//   3. Emit W65816ISD::BR_IND chained on the store; the BRINDpseudo
//      tablegen pattern selects to JMP_AbsInd $00B8.
SDValue W65816TargetLowering::LowerBRIND(SDValue Op,
                                         SelectionDAG &DAG) const {
  SDValue InChain = Op.getOperand(0);
  SDValue Dest = Op.getOperand(1);
  SDLoc DL(Op);

  // Defensive: any other width shouldn't happen with the current
  // type-legalization; if it does, defer to the legalizer.
  const EVT DestVT = Dest.getValueType();
  if (DestVT != MVT::i32 && DestVT != MVT::i16)
    return SDValue();

  // The low half of an i32 pointer holds the in-bank offset that the
  // indirect JMP dispatches through; an i16 target is used unchanged.
  SDValue Off16 =
      DestVT == MVT::i32 ? extractWide32Lo(DAG, DL, Dest) : Dest;

  // Store the 16-bit target to $00B8. The (store Acc16, (iPTR timm))
  // tablegen pattern lowers this to STAabs ($00B8); the AsmPrinter routes
  // bank-0 const-int stores to STA_Abs (3 bytes, DBR-relative). Since DP=0
  // at runtime, `sta $00B8` lands at $00:00B8 == DP slot $B8, which is
  // exactly where __jsl_indir reads via `jmp ($00B8)`.
  //
  // CRITICAL: the address must be a TargetConstant, not a Constant. A plain
  // i32 Constant is Custom-lowered through LowerI32Constant, which splits
  // 0x00B8 into a REG_SEQUENCE(0xB8, 0). LowerStore then can't see a clean
  // ConstantSDNode at Ptr and mis-routes the i16 store to the generic
  // ST_PTR slow path ([E0],Y indirect-long with full Wide32 address
  // staging). That adds significant Wide32 register pressure: multi-cgoto
  // VM interpreters with several BRINDs in one function over-pressure the
  // regalloc and abort with "ran out of registers". With TargetConstant
  // the tablegen pattern at InstrInfo.td:433 fires directly: `sta $b8`,
  // one instruction, no Wide32 vreg, no DPF0/DPF1 staging.
  const EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Slot = DAG.getTargetConstant(0x00B8, DL, PtrVT);
  SDValue Spill =
      DAG.getStore(InChain, DL, Off16, Slot, MachinePointerInfo());

  // W65816ISD::BR_IND has chain-only semantics: the target is implicit
  // ($00B8), and the chain dependency sequences the store before the JMP.
  return DAG.getNode(W65816ISD::BR_IND, DL, MVT::Other, Spill);
}
// Custom lowering for ISD::SETCC (operands: lhs, rhs, condcode).
//
// Non-i32 compares become `select_cc lhs, rhs, 1, 0, cc`; that SELECT_CC
// re-enters LowerOperation and is lowered via the diamond-CFG path.
// setBooleanContents(ZeroOrOne) means callers see a clean 0/1 value.
// i32 compares are split into per-half i16 compares combined with AND/OR.
SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDLoc DL(Op);
  const EVT ResVT = Op.getValueType();

  if (LHS.getValueType() != MVT::i32) {
    SDValue TrueVal = DAG.getConstant(1, DL, ResVT);
    SDValue FalseVal = DAG.getConstant(0, DL, ResVT);
    return DAG.getNode(ISD::SELECT_CC, DL, ResVT, LHS, RHS, TrueVal, FalseVal,
                       DAG.getCondCode(CC));
  }

  // i32 SETCC. The result type is i16 (the legalizer keeps the boolean
  // result type narrow regardless of LHS width).
  SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
  SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
  SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
  SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);

  if (CC == ISD::SETEQ || CC == ISD::SETNE) {
    // Fast path: i32 ==/!= 0 becomes (lo | hi) cmp 0. One i16 OR plus one
    // i16 setcc instead of two setcc + AND (+ XOR for NE).
    const auto *RhsImm = dyn_cast<ConstantSDNode>(RHS);
    if (RhsImm && RhsImm->isZero()) {
      SDValue AnyBit = DAG.getNode(ISD::OR, DL, MVT::i16, LhsLo, LhsHi);
      SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
      return DAG.getSetCC(DL, ResVT, AnyBit, Zero16, CC);
    }

    // General equality: both halves equal; NE flips the 0/1 result.
    SDValue LoSame = DAG.getSetCC(DL, ResVT, LhsLo, RhsLo, ISD::SETEQ);
    SDValue HiSame = DAG.getSetCC(DL, ResVT, LhsHi, RhsHi, ISD::SETEQ);
    SDValue Same = DAG.getNode(ISD::AND, DL, ResVT, LoSame, HiSame);
    if (CC != ISD::SETNE)
      return Same;
    SDValue One = DAG.getConstant(1, DL, ResVT);
    return DAG.getNode(ISD::XOR, DL, ResVT, Same, One);
  }

  // Ordered compare: strict compare on the high halves, with an unsigned
  // compare of the low halves as the tie-breaker when the highs are equal.
  ISD::CondCode HiStrict, LoUnsigned;
  auto Pick = [&](ISD::CondCode HiPred, ISD::CondCode LoPred) {
    HiStrict = HiPred;
    LoUnsigned = LoPred;
  };
  switch (CC) {
  case ISD::SETLT:  Pick(ISD::SETLT,  ISD::SETULT); break;
  case ISD::SETLE:  Pick(ISD::SETLT,  ISD::SETULE); break;
  case ISD::SETGT:  Pick(ISD::SETGT,  ISD::SETUGT); break;
  case ISD::SETGE:  Pick(ISD::SETGT,  ISD::SETUGE); break;
  case ISD::SETULT: Pick(ISD::SETULT, ISD::SETULT); break;
  case ISD::SETULE: Pick(ISD::SETULT, ISD::SETULE); break;
  case ISD::SETUGT: Pick(ISD::SETUGT, ISD::SETUGT); break;
  case ISD::SETUGE: Pick(ISD::SETUGT, ISD::SETUGE); break;
  default:
    report_fatal_error("W65816: unexpected i32 SETCC condition");
  }
  SDValue HiWins = DAG.getSetCC(DL, ResVT, LhsHi, RhsHi, HiStrict);
  SDValue HiTied = DAG.getSetCC(DL, ResVT, LhsHi, RhsHi, ISD::SETEQ);
  SDValue LoWins = DAG.getSetCC(DL, ResVT, LhsLo, RhsLo, LoUnsigned);
  SDValue TieBreak = DAG.getNode(ISD::AND, DL, ResVT, HiTied, LoWins);
  return DAG.getNode(ISD::OR, DL, ResVT, HiWins, TieBreak);
}
// Custom lowering for ISD::SELECT_CC (operands: lhs, rhs, true value,
// false value, condcode). i32 compares and/or i32 results are reduced to
// i16 SELECT_CC nodes; the plain i16 case becomes a glued CMP feeding
// W65816ISD::SELECT_CC.
SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDLoc DL(Op);
  const bool WideResult = Op.getValueType() == MVT::i32;

  // Split TVal/FVal into halves and run one i16 SELECT_CC per half, both
  // driven by the same (CmpL Pred CmpR) condition; rejoin as a Wide32.
  auto SelectHalves = [&](SDValue CmpL, SDValue CmpR, ISD::CondCode Pred) {
    SDValue TrueLo = extractWide32Lo(DAG, DL, TVal);
    SDValue TrueHi = extractWide32Hi(DAG, DL, TVal);
    SDValue FalseLo = extractWide32Lo(DAG, DL, FVal);
    SDValue FalseHi = extractWide32Hi(DAG, DL, FVal);
    SDValue Lo = DAG.getSelectCC(DL, CmpL, CmpR, TrueLo, FalseLo, Pred);
    SDValue Hi = DAG.getSelectCC(DL, CmpL, CmpR, TrueHi, FalseHi, Pred);
    return buildWide32(DAG, DL, Lo, Hi);
  };

  // i32 compare: materialise an i16 boolean through the i32 SETCC path,
  // then select on (bool != 0). Avoids creating the i32 W65816::CMP that
  // has no pattern.
  if (LHS.getValueType() == MVT::i32) {
    SDValue Flag = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC);
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    if (WideResult)
      return SelectHalves(Flag, Zero16, ISD::SETNE);
    return DAG.getSelectCC(DL, Flag, Zero16, TVal, FVal, ISD::SETNE);
  }

  // i16 compare with an i32 result: per-half selects share the condition.
  if (WideResult)
    return SelectHalves(LHS, RHS, CC);

  const W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: select_cc condition not yet implemented");

  SDValue Flags = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
  // The SDTypeProfile declares exactly one result (the selected value).
  // Earlier code passed a 2-VT list (value + Glue), which was silently
  // wrong and trips an SDNode-validity assertion in assertions builds.
  SDValue SelOps[] = {TVal, FVal, CondOp, Flags};
  return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), SelOps);
}
// SIGN_EXTEND i8 -> i16 via the branchless identity
//   sext(x) = ((x & 0xFF) ^ 0x80) - 0x80
// Worked values:
//   x=0x00 -> 0x80 - 0x80 = 0x0000      x=0x7F -> 0xFF - 0x80 = 0x007F
//   x=0x80 -> 0x00 - 0x80 = 0xFF80      x=0xFF -> 0x7F - 0x80 = 0xFFFF
// Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080 (10 bytes total,
// no branches, no temp slots), much cheaper than the SELECT_CC diamond
// version that produced ~14 instructions plus stack spills.
// Any other type combination returns SDValue() and is left to the caller.
SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  const bool ByteToWord =
      Src.getValueType() == MVT::i8 && Op.getValueType() == MVT::i16;
  if (!ByteToWord)
    return SDValue();

  SDLoc DL(Op);
  // The zero-extend supplies the `& 0xFF` part of the identity.
  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Src);
  SDValue SignBit = DAG.getConstant(0x0080, DL, MVT::i16);
  SDValue Flipped = DAG.getNode(ISD::XOR, DL, MVT::i16, Widened, SignBit);
  return DAG.getNode(ISD::SUB, DL, MVT::i16, Flipped, SignBit);
}
// ptr32 foundation hook. In ptr16 mode (PointerWidth=16, current
// default) addresses are i16 and we return SDValue() so the legalizer
// keeps the load and the existing LDAptr / STAptr selection patterns
// match. In ptr32 mode addresses are i32 and we wrap the load in
// W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter
// can take the bank byte from sub_hi instead of forcing 0.
//
// Byte loads (zextload, anyext, true i8) keep going through the i16
// LDA + AND #$FF idiom — same trick the existing LDAptr uses; for
// ptr32 mode the load is still 16 bits, just bank-explicit.
//
// Summary of the paths below, in order:
//   1. constant address with an i8/i16 result -> SDValue() (untouched);
//   2. i32 result -> two i16 loads joined by buildWide32;
//   3. address foldable to a single Wrapper@symbol -> load re-issued
//      against that i16 address;
//   4. any remaining non-i32 address -> SDValue() (untouched);
//   5. i32 address -> W65816ISD::LD_PTR plus extension fix-ups.
// Paths that rewrite the load return either a replacement load node or a
// merged {value, chain} pair.
SDValue W65816TargetLowering::LowerLoad(SDValue Op,
SelectionDAG &DAG) const {
LoadSDNode *Ld = cast<LoadSDNode>(Op);
SDValue Chain = Ld->getChain();
SDValue Ptr = Ld->getBasePtr();
EVT VT = Op.getValueType();
SDLoc DL(Op);
// Const-int address: leave the SDAG alone so the tablegen pattern
// `(load (iPTR imm))` → LDA8long fires (bank-explicit). See the
// mirrored short-circuit at the top of LowerStore.
if (isa<ConstantSDNode>(Ptr) && (VT == MVT::i8 || VT == MVT::i16))
return SDValue();
// i32 LOAD: split into two i16 loads at offsets 0 and 2 then
// REG_SEQUENCE the halves into a Wide32. Address may be i16 (stack
// slot, global) or i32 (ptr32 deref); the recursive ADD handles
// address arithmetic correctly via LowerI32Bin.
// NOTE(review): this branch does not look at Ld->getExtensionType() or
// Ld->getMemoryVT(); it always reads 4 bytes. Presumably extending loads
// with an i32 result never reach this hook — confirm against the
// setLoadExtAction configuration.
if (VT == MVT::i32) {
EVT PtrVT = Ptr.getValueType();
SDValue Two = DAG.getConstant(2, DL, PtrVT);
SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
// Both halves hang off the incoming chain; their output chains are
// joined by the TokenFactor below.
SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr,
Ld->getPointerInfo(),
Ld->getAlign(),
Ld->getMemOperand()->getFlags());
SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2,
Ld->getPointerInfo().getWithOffset(2),
Ld->getAlign(),
Ld->getMemOperand()->getFlags());
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
SDValue Val = buildWide32(DAG, DL, Lo, Hi);
return DAG.getMergeValues({Val, NewChain}, DL);
}
// Same fold as LowerStore: a Wide32 ptr built from Wrapper +
// WrapperBank of the same global, OR a raw GlobalAddress, lets us
// emit an abs-16 (DBR-relative) load (LDA / LDA8abs) instead of
// the slower [dp],Y indirect-long. Our globals are in the load
// segment that crt0 pins to DBR.
SDValue FoldedLo;
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
GA->getOffset()));
} else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Ptr)) {
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16));
} else if (Ptr.getNode()->isMachineOpcode() &&
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
// Collect both halves of the REG_SEQUENCE in one pass: operand 0 is the
// register class, then (value, sub-index) pairs follow.
SDValue PLo, PHi;
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i);
else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i);
}
}
// Only fold when the two halves wrap the very same symbol (and, for
// globals, the same offset); otherwise the pair is not a plain address
// of one object.
if (PLo && PHi &&
PLo.getOpcode() == W65816ISD::Wrapper &&
PHi.getOpcode() == W65816ISD::WrapperBank) {
SDValue WLo = PLo.getOperand(0);
SDValue WHi = PHi.getOperand(0);
auto *GLo = dyn_cast<GlobalAddressSDNode>(WLo);
auto *GHi = dyn_cast<GlobalAddressSDNode>(WHi);
auto *ELo = dyn_cast<ExternalSymbolSDNode>(WLo);
auto *EHi = dyn_cast<ExternalSymbolSDNode>(WHi);
bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() &&
GLo->getOffset() == GHi->getOffset());
bool SameExtern = (ELo && EHi &&
StringRef(ELo->getSymbol()) == EHi->getSymbol());
if (SameGlobal || SameExtern)
FoldedLo = PLo;
}
}
// Address folded to a 16-bit Wrapper: re-issue the load against it,
// keeping the original extension kind and memory type.
if (FoldedLo) {
EVT MemVT = Ld->getMemoryVT();
ISD::LoadExtType ExtType = Ld->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD && MemVT == Op.getValueType()) {
return DAG.getLoad(Op.getValueType(), DL, Chain, FoldedLo,
Ld->getPointerInfo(),
Ld->getAlign(),
Ld->getMemOperand()->getFlags());
}
// i1 memory type comes from GlobalOpt narrowing `short` globals
// whose only assignments are 0/1. Treat as i8 load + appropriate
// mask — the underlying memory is still byte-sized.
if (MemVT == MVT::i1) {
SDValue ByteLd = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i16, Chain,
FoldedLo, MVT::i8,
Ld->getMemOperand());
SDValue Val = ByteLd;
if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) {
Val = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd,
DAG.getConstant(1, DL, MVT::i16));
} else if (ExtType == ISD::SEXTLOAD) {
// i1 sign-extend: bit 0 -> all bits. AND #1 then NEG.
SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd,
DAG.getConstant(1, DL, MVT::i16));
Val = DAG.getNode(ISD::SUB, DL, MVT::i16,
DAG.getConstant(0, DL, MVT::i16), Bit);
}
if (Op.getValueType() == MVT::i8)
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
return DAG.getMergeValues({Val, ByteLd.getValue(1)}, DL);
}
return DAG.getExtLoad(ExtType, DL, Op.getValueType(), Chain, FoldedLo,
MemVT, Ld->getMemOperand());
}
// ptr16 mode: address is i16, let the default selection handle it.
if (Ptr.getValueType() != MVT::i32)
return SDValue();
EVT MemVT = Ld->getMemoryVT();
// Widen i1 memVT to i8 (single-byte storage). getMemIntrinsicNode
// asserts memvt must be supported; i1 isn't.
if (MemVT == MVT::i1) MemVT = MVT::i8;
// LD_PTR always produces an i16 value plus a chain; narrower results
// are derived from it below.
SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other);
// Try to peel a constant offset from Ptr and route through
// LD_PTR_OFF — folds `(ptr + K)` into the Y-register of `[E0],Y`,
// saving the i32 ADD's CLC/ADC carry chain. ~3 instr per access.
// See feedback_ptr32_deref_fold_layer1_mi.md.
// LD_PTR_OFF: deferred — the peel fires correctly but the resulting
// SDAG breaks the JSON-tokenizer + snprintf smoke tests in ways
// bisection didn't isolate. Stick with LD_PTR (no peel) here; the
// LowerStore peel for ST_PTR_OFF / STB_PTR_OFF keeps the store-side
// optimization. Future: route loads through a SDAG combine that
// runs post-LegalizeOps so we see the final REG_SEQUENCE shape.
SDValue Ops[] = { Chain, Ptr };
SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops,
MemVT, Ld->getMemOperand());
SDValue Val = LdNode;
// Byte memory access: mask the high byte for zextload, leave anyext.
// i1 memVT was widened to i8 above; the mask path is the same.
if (MemVT == MVT::i8) {
EVT OrigMemVT = Ld->getMemoryVT();
// An original i1 memory type masks down to bit 0; a true byte keeps
// all 8 low bits.
SDValue MaskC = DAG.getConstant(OrigMemVT == MVT::i1 ? 1 : 0xFF,
DL, MVT::i16);
if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
(OrigMemVT == MVT::i1 && Ld->getExtensionType() == ISD::EXTLOAD))
Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val, MaskC);
else if (Ld->getExtensionType() == ISD::SEXTLOAD)
Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val,
DAG.getValueType(MVT::i8));
}
// Narrow back to i8 if the consumer wanted i8.
if (VT == MVT::i8)
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL);
}
// ZERO_EXTEND / SIGN_EXTEND / ANY_EXTEND from i8 or i16 to i32. The i16
// payload becomes the low half of a Wide32; the high half is 0, a
// sign-fill, or undef depending on the opcode. Non-i32 results return
// SDValue().
SDValue W65816TargetLowering::LowerExtend(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  const unsigned ExtOpc = Op.getOpcode();
  SDValue Lo = Op.getOperand(0);
  // An i8 input is first promoted to i16 using the same extension opcode.
  if (Lo.getValueType() == MVT::i8)
    Lo = DAG.getNode(ExtOpc, DL, MVT::i16, Lo);

  SDValue Hi;
  switch (ExtOpc) {
  case ISD::ZERO_EXTEND:
    Hi = DAG.getConstant(0, DL, MVT::i16);
    break;
  case ISD::SIGN_EXTEND:
    // Sign-fill via SRA #15: uses the SRA15A pattern (4 insns) and stays
    // i16-typed on both sides, dodging the combiner's shift-amount-promote
    // when ptr32 makes pointer-typed shift amounts i32.
    Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
                     DAG.getConstant(15, DL, MVT::i16));
    break;
  default:
    // ANY_EXTEND: the upper bits are unspecified.
    Hi = DAG.getUNDEF(MVT::i16);
    break;
  }
  return buildWide32(DAG, DL, Lo, Hi);
}
// SIGN_EXTEND_INREG with inner type i1/i8/i16: sign-extend the low N bits
// of the input to fill the whole result.
//
// i32 result: the legalizer leaves this op alone when i32 is legal, but no
// tablegen pattern matches the i32 form, so without this Custom hook isel
// aborts with "Cannot select: sign_extend_inreg ... ValueType:i1" on
// shapes like `-(crc & 1ul)` in CRC32 loops. Strategy: apply the inner-VT
// `sext_inreg` to the i16 low half, then sign-fill the high half via
// SRA #15 of that result.
//
// i16 result: handled here as well (see the comment in the body).
//
// Returns SDValue() for combinations this hook does not implement, so the
// legalizer's default expansion takes over instead of a wrong result being
// produced silently.
SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT ResVT = Op.getValueType();
  // i16 result: replicate the existing tablegen patterns. We MUST
  // handle this case rather than returning SDValue(), because
  // setOperationAction's Custom-returns-SDValue() falls through to
  // default Expand (= SRA/SHL chain), not to tablegen pattern match.
  // The two existing patterns are:
  //   (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1)
  //   (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80
  // Reproduce them at the SDAG level so the legalizer's Custom
  // dispatch returns a fully-lowered tree.
  if (ResVT == MVT::i16) {
    if (InnerVT == MVT::i1) {
      SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                DAG.getConstant(1, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16,
                         DAG.getConstant(0, DL, MVT::i16), Bit);
    }
    if (InnerVT == MVT::i8) {
      SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                   DAG.getConstant(0xFF, DL, MVT::i16));
      SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked,
                                  DAG.getConstant(0x80, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored,
                         DAG.getConstant(0x80, DL, MVT::i16));
    }
    // inner i16 = no-op.
    if (InnerVT == MVT::i16)
      return X;
    // Any other inner width (e.g. i2 / i4) is NOT a no-op: returning X
    // would silently drop the sign extension. Decline instead so the
    // default SHL/SRA expansion described above handles it.
    return SDValue();
  }
  if (ResVT != MVT::i32)
    return SDValue();
  // The scheme below only extends within the low half. An inner type wider
  // than 16 bits would need its sign bit taken from the high half and
  // would make the i16 sext_inreg below ill-formed, so decline it.
  if (InnerVT.getScalarSizeInBits() > 16)
    return SDValue();
  // i32 result: project the input's low half (X is i32 Wide32 here),
  // apply the inner-VT sext on the i16 low half, sign-fill the hi.
  SDValue Lo = extractWide32Lo(DAG, DL, X);
  if (InnerVT != MVT::i16) {
    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo,
                     DAG.getValueType(InnerVT));
  }
  // Sign-fill the hi half via SRA #15, the same idiom LowerExtend uses for
  // SIGN_EXTEND i16 -> i32.
  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
                           DAG.getConstant(15, DL, MVT::i16));
  return buildWide32(DAG, DL, Lo, Hi);
}
// TRUNCATE from i32: project sub_lo for an i16 result, and truncate that
// i16 once more for an i8 result. Every other source/result combination
// returns SDValue().
SDValue W65816TargetLowering::LowerTruncate(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  if (Src.getValueType() != MVT::i32)
    return SDValue();

  const EVT DstVT = Op.getValueType();
  if (DstVT != MVT::i16 && DstVT != MVT::i8)
    return SDValue();

  // Both supported results start from the low 16-bit half.
  SDValue Lo16 = extractWide32Lo(DAG, DL, Src);
  if (DstVT == MVT::i16)
    return Lo16;
  // i32 -> i16 -> i8. The i8 trunc pattern is COPY_TO_REGCLASS at MC
  // level; the i16 sub_lo extract is the work.
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16);
}
// i32 Constant: materialise the value as two i16 constants (bits 0..15 and
// bits 16..31) joined by a REG_SEQUENCE. Non-i32 constants return
// SDValue().
SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  const uint64_t Bits = cast<ConstantSDNode>(Op)->getZExtValue();
  const uint64_t LoBits = Bits & 0xFFFFu;
  const uint64_t HiBits = (Bits >> 16) & 0xFFFFu;
  SDValue Lo = DAG.getConstant(LoBits, DL, MVT::i16);
  SDValue Hi = DAG.getConstant(HiBits, DL, MVT::i16);
  return buildWide32(DAG, DL, Lo, Hi);
}
// i32 ADD / SUB / AND / OR / XOR lowered as a pair of i16 operations.
// AND/OR/XOR treat the halves independently. ADD uses an ADDC/ADDE chain
// and SUB a SUBC/SUBE chain, with the carry/borrow passed from the low op
// to the high op through the glue result. Other opcodes and non-i32
// results return SDValue().
SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDValue LhsLo = extractWide32Lo(DAG, DL, Op.getOperand(0));
  SDValue LhsHi = extractWide32Hi(DAG, DL, Op.getOperand(0));
  SDValue RhsLo = extractWide32Lo(DAG, DL, Op.getOperand(1));
  SDValue RhsHi = extractWide32Hi(DAG, DL, Op.getOperand(1));

  const unsigned Opc = Op.getOpcode();
  SDValue Lo, Hi;
  if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR) {
    // Bitwise ops: same opcode on each half, no cross-half dependency.
    Lo = DAG.getNode(Opc, DL, MVT::i16, LhsLo, RhsLo);
    Hi = DAG.getNode(Opc, DL, MVT::i16, LhsHi, RhsHi);
  } else if (Opc == ISD::ADD || Opc == ISD::SUB) {
    const bool IsAdd = Opc == ISD::ADD;
    const unsigned LowOpc = IsAdd ? ISD::ADDC : ISD::SUBC;
    const unsigned HighOpc = IsAdd ? ISD::ADDE : ISD::SUBE;
    // Result 0 is the i16 value, result 1 the carry/borrow glue.
    SDVTList ValueAndGlue = DAG.getVTList(MVT::i16, MVT::Glue);
    SDValue LowOp = DAG.getNode(LowOpc, DL, ValueAndGlue, LhsLo, RhsLo);
    Lo = LowOp.getValue(0);
    SDValue CarryLink = LowOp.getValue(1);
    Hi = DAG.getNode(HighOpc, DL, ValueAndGlue, LhsHi, RhsHi, CarryLink)
             .getValue(0);
  } else {
    return SDValue();
  }
  return buildWide32(DAG, DL, Lo, Hi);
}
// Store companion to LowerLoad. For i32 addresses, dispatch to the
// 16-bit ST_PTR or the byte-truncating STB_PTR target node based on
// MemoryVT. For i16 addresses (ptr16 mode), bail out and let the
// existing STAptr / STBptr patterns match.
SDValue W65816TargetLowering::LowerStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *St = cast<StoreSDNode>(Op);
SDValue Chain = St->getChain();
SDValue Val = St->getValue();
SDValue Ptr = St->getBasePtr();
EVT MemVT = St->getMemoryVT();
SDLoc DL(Op);
// Const-int address (`*(volatile uint8*)0xC035 = v`): leave the SDAG
// alone so the tablegen pattern `(store Acc8, (iPTR imm))` →
// STA8long fires. Without this short-circuit the i32-pointer code
// below promotes the constant address into a Wide32 register pair
// and routes through STBptr32 ([dp],Y), which is 16 B / 30 cyc and
// (worse) bank-tracks DBR.
if (isa<ConstantSDNode>(Ptr))
return SDValue();
// i32 STORE: split into two halves. Critical: the per-half stores
// MUST go through the target-specific W65816ISD::ST_PTR node and not
// through plain ISD::STORE, otherwise the SDAG combiner's
// MergeConsecutiveStores re-combines them into a single i32 store
// that re-enters LowerStore — infinite loop, OOM in the combiner.
// For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular
// store-merger doesn't trip there because address splitting via
// ISD::ADD on i16 doesn't itself fan out into ptr-pair operations.
if (Val.getValueType() == MVT::i32) {
SDValue Lo = extractWide32Lo(DAG, DL, Val);
SDValue Hi = extractWide32Hi(DAG, DL, Val);
EVT PtrVT = Ptr.getValueType();
// ptr32 const-i32-addr fast path: `*(uint32_t*)0x5000 = v` should
// lower to two STAabs (DBR-relative, 5 cyc each) instead of two
// [dp],Y stores via ST_PTR. Detect Wide32-zero-hi Constant ptr,
// emit two i16 stores at TargetConstant:i32 addrs. TargetConstant
// (not Constant) so LowerI32Constant doesn't re-fire and recreate
// the REG_SEQUENCE. The STAabs timm pattern matches.
if (PtrVT == MVT::i32 && Ptr.getNode()->isMachineOpcode() &&
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
SDValue PtrLo, PtrHi;
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
if (CIdx->getZExtValue() == llvm::sub_lo) PtrLo = Ptr.getOperand(i);
else if (CIdx->getZExtValue() == llvm::sub_hi) PtrHi = Ptr.getOperand(i);
}
}
auto *PtrHiC = dyn_cast_or_null<ConstantSDNode>(PtrHi);
auto *PtrLoC = dyn_cast_or_null<ConstantSDNode>(PtrLo);
if (PtrLoC && PtrHiC && PtrHiC->getZExtValue() == 0) {
uint64_t Base = PtrLoC->getZExtValue() & 0xFFFF;
SDValue PLo = DAG.getTargetConstant(Base, DL, MVT::i32);
SDValue PHi = DAG.getTargetConstant((Base + 2) & 0xFFFF, DL, MVT::i32);
SDValue StLo = DAG.getStore(Chain, DL, Lo, PLo,
St->getPointerInfo(),
St->getAlign(),
St->getMemOperand()->getFlags());
SDValue StHi = DAG.getStore(StLo, DL, Hi, PHi,
St->getPointerInfo().getWithOffset(2),
St->getAlign(),
St->getMemOperand()->getFlags());
return StHi;
}
}
SDValue Two = DAG.getConstant(2, DL, PtrVT);
SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
if (PtrVT == MVT::i32) {
// ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially
// chained. The combiner cannot merge target-opaque MemIntrinsic
// stores.
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue OpsLo[] = { Chain, Lo, Ptr };
SDValue StLo = DAG.getMemIntrinsicNode(
W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16,
St->getMemOperand());
SDValue OpsHi[] = { StLo, Hi, Ptr2 };
MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand(
St->getMemOperand(), 2, 2);
SDValue StHi = DAG.getMemIntrinsicNode(
W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi);
return StHi;
}
// ptr16 path — emit two regular i16 stores serially chained so the
// store-merger sees them as a 4-byte sequence (which it will likely
// leave alone since the resulting i32 store has no legal target
// pattern in ptr16 mode anyway).
SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr,
St->getPointerInfo(),
St->getAlign(),
St->getMemOperand()->getFlags());
SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2,
St->getPointerInfo().getWithOffset(2),
St->getAlign(),
St->getMemOperand()->getFlags());
return StHi;
}
// Optimization: if the store goes through a global address (raw
// GlobalAddress/ExternalSymbol, or a Wide32 built from Wrapper +
// WrapperBank of the same symbol), lower to a plain i16/i8 store
// through a single Wrapper@symbol so the tablegen pattern
// (store Acc8/Acc16, (W65816Wrapper tglobaladdr:$g))
// selects STA8abs / STAabs (DBR-relative). Our globals live in
// the load segment that crt0 pins to DBR, so abs-16 reaches them.
// This avoids the 14-byte [dp],y indirect-long path AND re-enables
// the STZ peephole that the indirect path defeats.
SDValue FoldedLo;
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
GA->getOffset()));
} else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Ptr)) {
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16));
} else if (Ptr.getNode()->isMachineOpcode() &&
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
SDValue PLo, PHi;
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i);
else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i);
}
}
if (PLo && PHi &&
PLo.getOpcode() == W65816ISD::Wrapper &&
PHi.getOpcode() == W65816ISD::WrapperBank) {
SDValue WLo = PLo.getOperand(0);
SDValue WHi = PHi.getOperand(0);
auto *GLo = dyn_cast<GlobalAddressSDNode>(WLo);
auto *GHi = dyn_cast<GlobalAddressSDNode>(WHi);
auto *ELo = dyn_cast<ExternalSymbolSDNode>(WLo);
auto *EHi = dyn_cast<ExternalSymbolSDNode>(WHi);
bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() &&
GLo->getOffset() == GHi->getOffset());
bool SameExtern = (ELo && EHi &&
StringRef(ELo->getSymbol()) == EHi->getSymbol());
if (SameGlobal || SameExtern)
FoldedLo = PLo;
}
}
if (FoldedLo) {
// Preserve memVT — original may be a truncating store (e.g.,
// i16 value into i8 memory). getStore picks memVT from Val's
// type, which can mismatch the original MachineMemOperand size.
if (MemVT == Val.getValueType()) {
return DAG.getStore(Chain, DL, Val, FoldedLo,
St->getPointerInfo(), St->getAlign(),
St->getMemOperand()->getFlags());
}
return DAG.getTruncStore(Chain, DL, Val, FoldedLo, MemVT,
St->getMemOperand());
}
// No i32 ptr → nothing for us to do; let the default ISD::STORE
// path handle it. (Also avoids accidentally wrapping an i16 ptr
// store into ST_PTR below, whose ptr operand must be i32.)
if (Ptr.getValueType() != MVT::i32)
return SDValue();
// The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap
// around STBptr32 narrows in memory. Promote i8 values to i16 with
// ANY_EXTEND — the inserter only writes one byte, so the high half
// is don't-care.
if (Val.getValueType() == MVT::i8)
Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Base; uint16_t Off = 0;
if (peelPtr32Offset(DAG, DL, Ptr, Base, Off)) {
unsigned OffOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR_OFF)
: unsigned(W65816ISD::ST_PTR_OFF);
SDValue OffN = DAG.getTargetConstant(Off, DL, MVT::i16);
SDValue OpsOff[] = { Chain, Val, Base, OffN };
return DAG.getMemIntrinsicNode(OffOpc, DL, VTs, OpsOff, MemVT,
St->getMemOperand());
}
unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR)
: unsigned(W65816ISD::ST_PTR);
SDValue Ops[] = { Chain, Val, Ptr };
return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT,
St->getMemOperand());
}
// Custom VAARG lowering: read the argument cursor out of the va_list,
// fetch one value of the requested type through it, then write the cursor
// back advanced by the value's size in bytes.
//
// Unlike the generic expansion, the cursor is never rounded up to the
// type's preferred alignment: per the original author's note, pushed
// varargs land at byte-granular addresses, so aligning the cursor could
// skip the low byte of the pushed value.
//
// Returns the merged pair (loaded value, output chain).
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue ListAddr = Op.getOperand(1);
  EVT ValueVT = Op.getValueType();
  // The cursor is handled at the width of the va_list pointer operand
  // (i16 under ptr16, i32 under ptr32) so no half of it is dropped when
  // it is read and written back.
  EVT CursorVT = ListAddr.getValueType();

  SDValue Cursor = DAG.getLoad(CursorVT, DL, Op.getOperand(0), ListAddr,
                               MachinePointerInfo());
  SDValue Chain = Cursor.getValue(1);

  // Data fetch. Any combination other than (i16 value, i16 cursor) uses an
  // ordinary load through the cursor; the i16/i16 case goes through the
  // dedicated VAARG_LOAD target node instead.
  SDValue Fetched;
  if (ValueVT != MVT::i16 || CursorVT != MVT::i16) {
    Fetched = DAG.getLoad(ValueVT, DL, Chain, Cursor, MachinePointerInfo());
  } else {
    SDVTList ResultTys = DAG.getVTList(MVT::i16, MVT::Other);
    Fetched =
        DAG.getNode(W65816ISD::VAARG_LOAD, DL, ResultTys, Chain, Cursor);
  }
  Chain = Fetched.getValue(1);

  // Advance the cursor by the value size, rounded up to whole bytes, and
  // store it back into the va_list.
  unsigned Size = (ValueVT.getSizeInBits() + 7) / 8;
  SDValue Step = DAG.getConstant(Size, DL, CursorVT);
  SDValue Advanced = DAG.getNode(ISD::ADD, DL, CursorVT, Cursor, Step);
  Chain = DAG.getStore(Chain, DL, Advanced, ListAddr, MachinePointerInfo());

  return DAG.getMergeValues({Fetched, Chain}, DL);
}
// Custom VASTART lowering: write the address of the first variadic stack
// slot into the va_list object.
//
// The slot is identified by the VarArgsFrameIndex held in
// W65816MachineFunctionInfo. The frame index is materialised at the
// target's pointer type, so the store covers the full pointer width
// (i16 under ptr16, i32 under ptr32).
//
// Operands of Op: 0 = chain, 1 = va_list address, 2 = SrcValue describing
// the va_list object. Returns the store node, i.e. the output chain.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
                            const W65816TargetLowering &TLI) {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const int FirstVarArgFI =
      MF.getInfo<W65816MachineFunctionInfo>()->getVarArgsFrameIndex();

  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  SDValue FirstVarArgAddr = DAG.getFrameIndex(FirstVarArgFI, PtrVT);

  // Tag the store with the IR value of the va_list it writes to.
  const Value *ListValue =
      cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FirstVarArgAddr,
                      Op.getOperand(1), MachinePointerInfo(ListValue));
}
// Entry point for custom-lowered nodes. Most opcodes are forwarded to a
// dedicated Lower* helper; a few small ones are handled inline. Any opcode
// not listed here is a programming error and hits llvm_unreachable (after
// dumping the node in asserts-enabled builds).
SDValue W65816TargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // Builds the constant 0 at the requested width: a plain i16 constant, or
  // a Wide32 pair whose halves are both the i16 zero.
  auto zeroOfType = [&](EVT VT) -> SDValue {
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    if (VT == MVT::i16)
      return Zero16;
    return buildWide32(DAG, DL, Zero16, Zero16);
  };

  switch (Op.getOpcode()) {
  // --- Addresses -------------------------------------------------------
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::ExternalSymbol:
    return LowerExternalSymbol(Op, DAG);

  // --- Control flow and comparisons -------------------------------------
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::BRIND:
    return LowerBRIND(Op, DAG);
  case ISD::SETCC:
    return LowerSETCC(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: {
    // Only the i32 form is handled here: it is split into one i16 SELECT
    // per half. Per the original note, leaving i32 SELECT to the default
    // legalisation rewrites it into an i32 SELECT_CC that re-enters custom
    // lowering and cycles. Other widths return an empty SDValue so the
    // default handling applies.
    if (Op.getValueType() != MVT::i32)
      return SDValue();
    SDValue Cond = Op.getOperand(0);
    SDValue IfTrue = Op.getOperand(1);
    SDValue IfFalse = Op.getOperand(2);
    SDValue TrueLo = extractWide32Lo(DAG, DL, IfTrue);
    SDValue TrueHi = extractWide32Hi(DAG, DL, IfTrue);
    SDValue FalseLo = extractWide32Lo(DAG, DL, IfFalse);
    SDValue FalseHi = extractWide32Hi(DAG, DL, IfFalse);
    SDValue PickLo =
        DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TrueLo, FalseLo);
    SDValue PickHi =
        DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TrueHi, FalseHi);
    return buildWide32(DAG, DL, PickLo, PickHi);
  }

  // --- Width changes ------------------------------------------------------
  case ISD::SIGN_EXTEND:
    // An i32 result shares the generic extend helper; narrower results
    // use the sign-extend-specific one.
    return Op.getValueType() == MVT::i32 ? LowerExtend(Op, DAG)
                                         : LowerSignExtend(Op, DAG);
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return LowerExtend(Op, DAG);
  case ISD::SIGN_EXTEND_INREG:
    return LowerSignExtendInReg(Op, DAG);
  case ISD::TRUNCATE:
    return LowerTruncate(Op, DAG);

  // --- Arithmetic ---------------------------------------------------------
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    return LowerShift(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return LowerI32Bin(Op, DAG);
  case ISD::MUL:
    return LowerMUL_I32(Op, DAG);
  case ISD::Constant:
    return LowerI32Constant(Op, DAG);

  // --- Memory -------------------------------------------------------------
  case ISD::LOAD:
    return LowerLoad(Op, DAG);
  case ISD::STORE:
    return LowerStore(Op, DAG);

  // --- Varargs ------------------------------------------------------------
  case ISD::VASTART:
    return LowerVASTART(Op, DAG, *this);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);

  // --- Stack / frame ------------------------------------------------------
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDynamicStackalloc(Op, DAG);
  case ISD::STACKSAVE:
    // Produces the constant 0 alongside the untouched incoming chain. Per
    // the original note the value is dead: SJLJ stores it into the
    // function context, but setjmp/longjmp manage SP themselves.
    return DAG.getMergeValues(
        {zeroOfType(Op.getValueType()), Op.getOperand(0)}, DL);
  case ISD::FRAMEADDR:
    // Always the constant 0. Per the original note, SJLJ treats the value
    // as an opaque per-frame identifier and tracks frames through jmp_buf
    // chaining, so a constant is sufficient for single-throw /
    // non-nested-catch programs; true multi-frame SJLJ would need a
    // unique value.
    return zeroOfType(Op.getValueType());

  // --- Pure chain pass-through ---------------------------------------------
  // EH_SJLJ_SETUP_DISPATCH emits no code here (per the original note the
  // dispatcher lives entirely in the SJLJ runtime), and STACKRESTORE is a
  // no-op; both simply hand back their incoming chain.
  case ISD::EH_SJLJ_SETUP_DISPATCH:
  case ISD::STACKRESTORE:
    return Op.getOperand(0);

  // --- Traps ----------------------------------------------------------------
  case ISD::TRAP:
  case ISD::DEBUGTRAP:
    // Wrap the incoming chain in a W65816ISD::TRAP node so the trap stays
    // ordered after earlier chained side effects.
    return DAG.getNode(W65816ISD::TRAP, DL, MVT::Other, Op.getOperand(0));

  default:
#ifndef NDEBUG
    Op.dump();
#endif
    llvm_unreachable("W65816: unexpected operation in LowerOperation");
  }
}
// Maps an inline-asm register constraint to a physical register and its
// register class.
//
// Both the short form ("a") and the braced long form ("{a}") are accepted.
// Recognised letters:
//   a, r -> accumulator A
//   x    -> index register X
//   y    -> index register Y
// An i8 operand selects the 8-bit register classes; every other value
// type (i16, pointers, ...) selects the 16-bit classes. Anything else is
// delegated to the TargetLowering base implementation.
std::pair<unsigned, const TargetRegisterClass *>
W65816TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  // Peel the surrounding braces off the long form.
  StringRef Name = Constraint;
  if (Name.size() >= 2 && Name.front() == '{' && Name.back() == '}')
    Name = Name.drop_front().drop_back();

  // Pick the width once, then dispatch on the register letter.
  const bool IsByte = (VT == MVT::i8);
  const TargetRegisterClass *AccRC =
      IsByte ? &W65816::Acc8RegClass : &W65816::Acc16RegClass;
  const TargetRegisterClass *IdxRC =
      IsByte ? &W65816::Idx8RegClass : &W65816::Idx16RegClass;

  if (Name == "a" || Name == "r")
    return {W65816::A, AccRC};
  if (Name == "x")
    return {W65816::X, IdxRC};
  if (Name == "y")
    return {W65816::Y, IdxRC};

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
// Lowers (DYNAMIC_STACKALLOC chain, size, align) to a W65816ISD::ALLOCA
// node and returns the merged pair (pointer, chain). The align operand is
// not consulted.
//
// Notes carried over from the original author (they describe code outside
// this function):
//   * The prologue stashes the entry SP in DP $F4 when the frame has
//     variable-sized objects, the ALLOCA expands to
//     `tsc; sec; sbc size; tcs; inc a`, and the epilogue restores SP from
//     $F4.
//   * Limitation: frame-index accesses made *after* the alloca use offsets
//     computed against the static-frame SP, not the post-alloca SP, so
//     they read the wrong slot. Until frame-pointer support exists the
//     only safe shape is "VLA at the end of the function, used
//     immediately, no further frame-index access".
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const bool WidePointer = (Op.getValueType() == MVT::i32);

  // ALLOCA itself always works on i16. With an i32 result an i32 size is
  // reduced to its low half first (the original note: a VLA larger than
  // 64KB would not fit on the stack anyway).
  SDValue Bytes = Op.getOperand(1);
  if (WidePointer && Bytes.getValueType() == MVT::i32)
    Bytes = extractWide32Lo(DAG, DL, Bytes);

  SDValue Alloca =
      DAG.getNode(W65816ISD::ALLOCA, DL, DAG.getVTList(MVT::i16, MVT::Other),
                  Op.getOperand(0), Bytes);
  SDValue Addr = Alloca.getValue(0);
  SDValue OutChain = Alloca.getValue(1);

  // An i32 result is the i16 address paired with a zero high half
  // (bank 0, where the stack lives per the original note).
  if (WidePointer) {
    SDValue BankZero = DAG.getConstant(0, DL, MVT::i16);
    Addr = buildWide32(DAG, DL, Addr, BankZero);
  }
  return DAG.getMergeValues({Addr, OutChain}, DL);
}
// Custom lowering for SHL / SRL / SRA. Strategy, tried in this order:
//   1. i8 value: promote to i16, shift there, truncate back to i8.
//   2. i16 value, selected constant amounts: return the node unchanged so
//      instruction selection can match it directly.
//   3. i32 value, constant amount 1..5: expand inline into i16 shifts on
//      the two halves.
//   4. Everything else: call the runtime shift routine for the width.
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
  // i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
  // (preserves sign for arithmetic right shift); SHL/SRL via ZEXT. This
  // routes i8 shifts through the same i16 fast paths and libcalls, so no
  // parallel i8 libcall set is needed. Per the original note, the DAG
  // combiner would otherwise narrow `(trunc (shl (zext X), K))` back to an
  // i8 shift and re-enter this hook forever; an
  // `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override elsewhere in
  // this file is what disables that combine.
  if (Op.getValueType() == MVT::i8) {
    SDLoc DL(Op);
    SDValue X = Op.getOperand(0);
    SDValue N = Op.getOperand(1);
    unsigned Ext = (Op.getOpcode() == ISD::SRA) ? ISD::SIGN_EXTEND
                                                : ISD::ZERO_EXTEND;
    SDValue X16 = DAG.getNode(Ext, DL, MVT::i16, X);
    // Bring the shift amount to i16 whatever width it arrives in.
    // getZExtOrTrunc zero-extends a narrower amount, returns an i16 amount
    // unchanged, and truncates a wider one. Building a bare ZERO_EXTEND
    // here would be invalid for an amount wider than i16.
    SDValue N16 = DAG.getZExtOrTrunc(N, DL, MVT::i16);
    // Special case: i8 SRA by 7 of a sign-extended value is the sign-fill
    // operation — every result bit is the input's bit 7. For sext(i8 x),
    // bit 15 == bit 7, so `(sra (sext x), 7)` truncates to the same i8 as
    // `(sra (sext x), 15)`. SRA by 15 is one of the amounts the i16 path
    // below passes straight through to instruction selection, which
    // avoids the __ashrhi3 libcall.
    if (Op.getOpcode() == ISD::SRA) {
      if (auto *C = dyn_cast<ConstantSDNode>(N)) {
        if (C->getZExtValue() == 7) {
          N16 = DAG.getConstant(15, DL, MVT::i16);
        }
      }
    }
    SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
  }

  // i16 fast path. For the constant amounts listed below the node is
  // returned unchanged (Op itself) so the legalizer leaves it alone and
  // the pattern matcher picks it up later:
  //   SHL / SRL by 1..15
  //   SRA by 1 or by 15
  // Returning SDValue() instead would fall through to the generic Expand
  // path, which (per the original note) produces a BUILD_VECTOR-based
  // rewrite this target cannot lower. `(srl x, 15)` in particular is the
  // node the type legalizer's i32-shift-by-1 expansion emits for the
  // high-half "bit-from-low" slot.
  // All other i16 shifts go to a libcall (__ashlhi3 / __lshrhi3 /
  // __ashrhi3).
  SDValue Amount = Op.getOperand(1);
  if (Op.getValueType() == MVT::i16) {
    if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
      uint64_t N = C->getZExtValue();
      if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
          N >= 1 && N <= 14)
        return Op;
      if (N == 15 &&
          (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
        return Op;
      if (N == 1 && Op.getOpcode() == ISD::SRA)
        return Op;
      if (N == 15 && Op.getOpcode() == ISD::SRA)
        return Op;
    }
  }

  bool IsI32 = Op.getValueType() == MVT::i32;
  // Inline i32 shift by a small constant (1..5), built from i16 shifts on
  // the two halves plus one OR that carries bits across the half
  // boundary. Larger amounts fall through to the libcall. Per the
  // original note the cutoff of 5 was chosen empirically (djb2's
  // `(h << 5) + h` is the common beneficiary). For SRA the high half
  // uses an arithmetic shift; the low half is filled from the high
  // half's departing bits exactly as for SRL.
  if (IsI32) {
    if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
      uint64_t N = C->getZExtValue();
      unsigned Op0 = Op.getOpcode();
      if (N >= 1 && N <= 5 &&
          (Op0 == ISD::SHL || Op0 == ISD::SRL || Op0 == ISD::SRA)) {
        SDLoc DL(Op);
        SDValue X = Op.getOperand(0);
        SDValue Lo = extractWide32Lo(DAG, DL, X);
        SDValue Hi = extractWide32Hi(DAG, DL, X);
        SDValue ShN = DAG.getConstant(N, DL, MVT::i16);
        // Complementary amount: moves the bits that cross the 16-bit
        // boundary into position in the other half.
        SDValue ShCo = DAG.getConstant(16 - N, DL, MVT::i16);
        if (Op0 == ISD::SHL) {
          // (Hi:Lo) << N == ((Hi << N) | (Lo >> (16-N))) : (Lo << N)
          SDValue NewLo = DAG.getNode(ISD::SHL, DL, MVT::i16, Lo, ShN);
          SDValue HiTop = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, ShCo);
          SDValue HiShl = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, ShN);
          SDValue NewHi = DAG.getNode(ISD::OR, DL, MVT::i16, HiShl, HiTop);
          return buildWide32(DAG, DL, NewLo, NewHi);
        } else {
          // SRL/SRA by N: NewHi = Hi >> N (logical or arithmetic);
          //               NewLo = (Lo >> N) | (Hi << (16-N)).
          SDValue NewHi = DAG.getNode(Op0, DL, MVT::i16, Hi, ShN);
          SDValue LoTop = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, ShCo);
          SDValue LoSrl = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, ShN);
          SDValue NewLo = DAG.getNode(ISD::OR, DL, MVT::i16, LoSrl, LoTop);
          return buildWide32(DAG, DL, NewLo, NewHi);
        }
      }
    }
  }

  // Libcall fallback: pick the runtime routine by opcode and width.
  RTLIB::Libcall LC;
  switch (Op.getOpcode()) {
  case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break;
  case ISD::SRL: LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16; break;
  case ISD::SRA: LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16; break;
  default: llvm_unreachable("not a shift");
  }
  SDValue Val = Op.getOperand(0);
  if (IsI32 && Op.getOpcode() == ISD::SHL) {
    // For a constant left shift by K >= 16, the input's high half
    // (bits 16..31) is shifted out entirely and cannot affect the result,
    // so it is replaced with a concrete zero. Per the original note,
    // legalisation may leave those bits `undef`, while the runtime
    // routine reads all 32 input bits and would propagate garbage into
    // the result; this was observed via a `(u64 e) << 52` split into
    // __ashlsi3(e_lo, 20). SRL/SRA would need the mirror-image treatment
    // of the low half but have not shown the problem.
    if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      unsigned K = (unsigned)C->getZExtValue();
      if (K >= 16) {
        SDValue Lo = extractWide32Lo(DAG, SDLoc(Op), Val);
        SDValue Zero = DAG.getConstant(0, SDLoc(Op), MVT::i16);
        Val = buildWide32(DAG, SDLoc(Op), Lo, Zero);
      }
    }
  }
  SmallVector<SDValue, 2> Args = {Val, Op.getOperand(1)};
  TargetLowering::MakeLibCallOptions Opts;
  // Only the arithmetic right shift treats its operand as signed.
  Opts.setIsSigned(Op.getOpcode() == ISD::SRA);
  return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first;
}
// Lowers a GlobalAddress node to target form.
//
// For any result type other than i32 (i16 in ptr16 mode) the result is a
// single Wrapper around the target global address at that type.
//
// For an i32 result (ptr32 mode) the pointer is assembled as a Wide32
// pair from one i16 target address:
//   low half  = Wrapper(target)      -- the 16-bit offset
//   high half = WrapperBank(target)  -- the bank
// Per the original note both halves receive their own fixup, so the
// runtime pointer reflects where the segment was actually placed rather
// than a hard-coded bank 0.
SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const auto *Node = cast<GlobalAddressSDNode>(Op);
  EVT ResultVT = Op.getValueType();

  if (ResultVT != MVT::i32) {
    SDValue Target = DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
                                                ResultVT, Node->getOffset());
    return DAG.getNode(W65816ISD::Wrapper, DL, ResultVT, Target);
  }

  // Both halves wrap the same i16 target node.
  SDValue Target16 = DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
                                                MVT::i16, Node->getOffset());
  SDValue OffsetHalf = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Target16);
  SDValue BankHalf =
      DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Target16);
  return buildWide32(DAG, DL, OffsetHalf, BankHalf);
}
// Lowers an ExternalSymbol node to target form, mirroring
// LowerGlobalAddress: a single Wrapper at the result type for non-i32
// results, or a Wide32 pair (Wrapper for the low half, WrapperBank for
// the high half) built from one i16 target symbol for an i32 result.
SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const char *SymbolName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  EVT ResultVT = Op.getValueType();

  if (ResultVT != MVT::i32) {
    SDValue Target = DAG.getTargetExternalSymbol(SymbolName, ResultVT);
    return DAG.getNode(W65816ISD::Wrapper, DL, ResultVT, Target);
  }

  // Both halves wrap the same i16 target node.
  SDValue Target16 = DAG.getTargetExternalSymbol(SymbolName, MVT::i16);
  SDValue OffsetHalf = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Target16);
  SDValue BankHalf =
      DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Target16);
  return buildWide32(DAG, DL, OffsetHalf, BankHalf);
}
// Materialises the incoming arguments of the current function, appending
// one SDValue per entry of Ins to InVals, and returns the (unchanged)
// incoming chain.
//
// ABI as implemented below:
//   * Ins[0] arrives in registers: an i8/i16 piece in A; a whole i32 with
//     its low half in A and its high half in X.
//   * If Ins[0] and Ins[1] are both i16 pieces of original argument 0
//     (the "split" shape), Ins[1] arrives in X.
//   * Every other entry is read from the caller's stack area. Offsets are
//     measured from S+1; the first slot is at offset 4 because the return
//     address occupies the 3 bytes below it. i8 and i16 entries take a
//     2-byte slot each (an i8 is loaded as i16 and truncated); an i32
//     takes 4 bytes, loaded as two i16 halves.
//   * For variadic functions the frame index of the slot just past the
//     last named stack argument is recorded for VASTART.
//
// Viewed from the callee after the call has pushed its return address:
//     (high addr)  arg N-1
//                  ...
//                  first stack arg   <- (4,S)
//                  ret-addr-bank
//                  ret-addr-hi
//                  ret-addr-lo
//     (low addr)   <next push>       <- (1,S)
//
// Only i8, i16 and i32 pieces are supported; anything else is a fatal
// error.
SDValue W65816TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // True when Ins begins with at least Count entries that all have type
  // PieceVT and all belong to original argument 0.
  auto startsWithPiecesOfArg0 = [&Ins](unsigned Count, MVT PieceVT) {
    if (Ins.size() < Count)
      return false;
    for (unsigned I = 0; I != Count; ++I)
      if (Ins[I].VT != PieceVT || Ins[I].OrigArgIndex != 0)
        return false;
    return true;
  };

  // Declares PhysReg live-in through a fresh virtual register of class RC
  // and returns the copy out of it at type VT.
  auto copyFromLiveIn = [&](auto PhysReg, const TargetRegisterClass *RC,
                            MVT VT) {
    Register VReg = RegInfo.createVirtualRegister(RC);
    RegInfo.addLiveIn(PhysReg, VReg);
    return DAG.getCopyFromReg(Chain, DL, VReg, VT);
  };

  // Legacy split-i32 first argument: two i16 pieces of argument 0.
  const bool FirstArgIsSplitI32 = startsWithPiecesOfArg0(2, MVT::i16);
  // i64 first argument, in either shape it can take: four i16 pieces or
  // two i32 pieces of argument 0. Used below to route the
  // register-passed pieces through Img16. Per the original notes this
  // dodges a register-allocation hazard in which a TXA at function entry
  // overwrote A before it had been saved (observed as
  // `__adddf3(1.5, 2.5) -> 1.5`).
  const bool FirstArgIsI64 = startsWithPiecesOfArg0(4, MVT::i16) ||
                             startsWithPiecesOfArg0(2, MVT::i32);

  // Next stack-argument offset, measured from S+1; skips the 3
  // return-address bytes so the first slot is at S+4.
  unsigned NextStackOffset = 4;

  for (unsigned Idx = 0, NumIns = Ins.size(); Idx != NumIns; ++Idx) {
    const MVT VT = Ins[Idx].VT;
    if (VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i32)
      report_fatal_error("W65816: argument type not yet supported");

    // --- Entry 0: always register-passed --------------------------------
    if (Idx == 0) {
      if (VT == MVT::i32) {
        // Whole i32: low half live-in via A, high half via X, reassembled
        // as a Wide32. The high half always uses Img16; the low half uses
        // Img16 only for an i64 first argument and Acc16 otherwise. Per
        // the original notes, Img16 forces a store to a DP slot at entry
        // so the value cannot be destroyed by later reuse of A.
        const TargetRegisterClass *LoRC = FirstArgIsI64
                                              ? &W65816::Img16RegClass
                                              : &W65816::Acc16RegClass;
        SDValue LoHalf = copyFromLiveIn(W65816::A, LoRC, MVT::i16);
        SDValue HiHalf =
            copyFromLiveIn(W65816::X, &W65816::Img16RegClass, MVT::i16);
        InVals.push_back(buildWide32(DAG, DL, LoHalf, HiHalf));
        continue;
      }
      // i8 / i16 piece in A. An i16 piece of an i64 first argument goes
      // through Img16 (same hazard as above); a plain i16 uses Acc16 and
      // an i8 uses Acc8.
      const TargetRegisterClass *RC = &W65816::Acc8RegClass;
      if (VT == MVT::i16)
        RC = FirstArgIsI64 ? &W65816::Img16RegClass : &W65816::Acc16RegClass;
      InVals.push_back(copyFromLiveIn(W65816::A, RC, VT));
      continue;
    }

    // --- Entry 1 of a split first argument: passed in X ------------------
    if (Idx == 1 && FirstArgIsSplitI32) {
      // Img16 for an i64 first argument; the plain split-i32 case stays
      // on Idx16 (per the original note, rerouting it as well regressed
      // simple functions and crashed a vprintf/__udivsi3 chain).
      const TargetRegisterClass *RC = FirstArgIsI64 ? &W65816::Img16RegClass
                                                    : &W65816::Idx16RegClass;
      InVals.push_back(copyFromLiveIn(W65816::X, RC, MVT::i16));
      continue;
    }

    // --- i32 on the stack: two adjacent immutable 2-byte slots ----------
    if (VT == MVT::i32) {
      int LoSlot = FrameInfo.CreateFixedObject(2, NextStackOffset,
                                               /*Immutable*/ true);
      int HiSlot = FrameInfo.CreateFixedObject(2, NextStackOffset + 2,
                                               /*Immutable*/ true);
      NextStackOffset += 4;
      SDValue LoAddr = DAG.getFrameIndex(LoSlot, MVT::i16);
      SDValue HiAddr = DAG.getFrameIndex(HiSlot, MVT::i16);
      SDValue LoHalf =
          DAG.getLoad(MVT::i16, DL, Chain, LoAddr,
                      MachinePointerInfo::getFixedStack(MF, LoSlot));
      SDValue HiHalf =
          DAG.getLoad(MVT::i16, DL, Chain, HiAddr,
                      MachinePointerInfo::getFixedStack(MF, HiSlot));
      InVals.push_back(buildWide32(DAG, DL, LoHalf, HiHalf));
      continue;
    }

    // --- i8 / i16 on the stack: one immutable 2-byte slot ----------------
    // The slot is always loaded as i16. Per the original note this matches
    // the calling convention's promotion of i8 to i16 and avoids an 8-bit
    // mode switch around the load; an i8 argument is then truncated back.
    const unsigned SlotBytes = 2;
    int Slot = FrameInfo.CreateFixedObject(SlotBytes, NextStackOffset,
                                           /*Immutable*/ true);
    NextStackOffset += SlotBytes;
    SDValue SlotAddr = DAG.getFrameIndex(Slot, MVT::i16);
    SDValue Loaded = DAG.getLoad(MVT::i16, DL, Chain, SlotAddr,
                                 MachinePointerInfo::getFixedStack(MF, Slot));
    if (VT == MVT::i8)
      Loaded = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Loaded);
    InVals.push_back(Loaded);
  }

  // Varargs: NextStackOffset is now past every named stack argument, so
  // it is where the first variadic argument sits. Record a fixed object
  // there for VASTART to use as the initial va_list value.
  if (IsVarArg) {
    int FirstVarArgSlot = FrameInfo.CreateFixedObject(2, NextStackOffset,
                                                      /*Immutable=*/true);
    MF.getInfo<W65816MachineFunctionInfo>()->setVarArgsFrameIndex(
        FirstVarArgSlot);
  }
  return Chain;
}
SDValue
W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                SmallVectorImpl<SDValue> &InVals) const {
  // Lower an outgoing call into CALLSEQ_START / pushes / register copies /
  // W65816ISD::CALL / CALLSEQ_END, then read the results back into InVals.
  // Returns the output chain.
  //
  // Argument ABI: arg 0 goes in A (an i32 arg 0 uses A for the low half and
  // X for the high half). The remaining args are pushed in REVERSE order via
  // PUSH16 (PHA) so the callee's `(4,S)` reads pick up the first stack arg,
  // `(6,S)` the next, etc. CALLSEQ_START records the byte count;
  // ADJCALLSTACKUP after the call emits `tsc;clc;adc #N;tcs` to release the
  // pushed bytes (eliminateCallFramePseudoInstr).
  //
  // Return ABI (mirrors LowerReturn): up to four i16 halves come back in
  // A, X, Y and the DPF0 pseudo-register, in that order.
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  auto &Outs = CLI.Outs;
  auto &OutVals = CLI.OutVals;
  auto &Ins = CLI.Ins;

  // Tail-call lowering is not done here; always emit an ordinary call.
  CLI.IsTailCall = false;

  // Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X;
  // i64 in A:X:Y plus DP $F0..$F1 for the highest half. See the
  // LowerReturn comment for the ABI.
  if (Ins.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Indirect calls (function pointers): redirect through the runtime
  // trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
  // we store the dynamic target to a fixed bank-0 slot ($00:00B8 — see
  // libgcc.s for why) and JSL the trampoline, which does
  // `JMP ($00B8)`. JMP (abs) reads its vector from bank 0 unconditionally,
  // so anchoring the slot in bank 0 makes the dispatch work under GS/OS
  // Loader / GNO non-bank-0 placement (where the program's BSS would
  // otherwise live in PBR — the JMP couldn't reach it). Single-bank
  // assumption remains on the *target's* code (JMP indirect keeps PBR).
  bool IsIndirect = !isa<GlobalAddressSDNode>(Callee) &&
                    !isa<ExternalSymbolSDNode>(Callee);
  if (IsIndirect) {
    // Emit a constant-address store: tblgen pattern (store Acc16,
    // (iPTR imm:$addr)) -> STA_Long $0000B8 (4-byte abs-long, bank
    // explicit, ignores DBR).
    SDValue ConstAddr =
        DAG.getConstant(0xB8, DL, getPointerTy(DAG.getDataLayout()));
    Chain = DAG.getStore(Chain, DL, Callee, ConstAddr, MachinePointerInfo());
    // Replace the callee with __jsl_indir for the actual JSL.
    Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16);
  }

  for (const ISD::OutputArg &O : Outs) {
    if (O.VT != MVT::i16 && O.VT != MVT::i8 && O.VT != MVT::i32)
      report_fatal_error("W65816: argument type not yet supported");
  }

  // i32 first-arg ABI. Two flavors:
  //  - Legal-i32: Outs[0].VT == i32. The value is split below and its
  //    halves are copied to A (lo) and X (hi) separately.
  //  - Split-i32 (legacy): Outs[0]/Outs[1] both i16 with OrigArgIndex==0.
  //    Pass low in A, high in X.
  // The two flavors are mutually exclusive (Outs[0].VT differs).
  bool I32WholeFirstArg = !Outs.empty() && Outs[0].VT == MVT::i32;
  bool I32SplitFirstArg =
      Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 &&
      Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0;
  // Only the split flavor consumes two Outs entries for the register arg.
  unsigned FirstStackArg = I32SplitFirstArg ? 2 : 1;

  // i8 stack args are promoted to i16 (2-byte slots) so the callee can
  // read them with a 16-bit M load — matches LowerFormalArguments and
  // CC_W65816's CCPromoteToType<i16>. i32 stack args occupy 4 bytes
  // (2 PUSH16s).
  unsigned StackBytes = 0;
  for (unsigned i = FirstStackArg; i < Outs.size(); ++i)
    StackBytes += (Outs[i].VT == MVT::i32) ? 4 : 2;
  Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);

  // Push stack-passed args in reverse so arg FirstStackArg ends up at
  // the lowest post-JSL stack-relative offset (4,S). Each push uses A
  // by default; if the value being pushed is already a `CopyFromReg X`
  // (e.g. forwarding the i32-first-arg-in-A:X hi half), push directly
  // from X via PHX — saves the TXA + A-spill round-trip that would
  // otherwise be required.
  SDValue Glue;

  // Helper: push a single i16-sized value, threading Chain and Glue so all
  // pushes and the later register copies stay in emission order.
  auto pushI16 = [&](SDValue V) {
    // Decide whether V is (a copy of) physical X: either a direct
    // CopyFromReg of X, or a CopyFromReg of an Idx16 vreg that is the
    // function live-in for X.
    bool ViaX = false;
    if (V.getOpcode() == ISD::CopyFromReg) {
      auto *RegN = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode());
      if (RegN) {
        Register R = RegN->getReg();
        if (R.isPhysical() && R == W65816::X) {
          ViaX = true;
        } else if (R.isVirtual()) {
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          if (MRI.getRegClass(R) == &W65816::Idx16RegClass) {
            for (auto &LI : MRI.liveins())
              if (LI.second == R && LI.first == W65816::X) {
                ViaX = true;
                break;
              }
          }
        }
      }
    }
    if (ViaX) {
      Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue);
      Glue = Chain.getValue(1);
      Chain = DAG.getNode(W65816ISD::PUSH_X, DL,
                          DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
    } else {
      Chain = DAG.getCopyToReg(Chain, DL, W65816::A, V, Glue);
      Glue = Chain.getValue(1);
      Chain = DAG.getNode(W65816ISD::PUSH, DL,
                          DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
    }
    Glue = Chain.getValue(1);
  };

  for (int i = static_cast<int>(Outs.size()) - 1;
       i >= static_cast<int>(FirstStackArg); --i) {
    SDValue V = OutVals[i];
    if (Outs[i].VT == MVT::i32) {
      // Push i32 stack arg: hi half first (lands at higher address),
      // lo half second (lands at lower address = the slot the callee
      // reads as the start of the i32).
      SDValue Lo = extractWide32Lo(DAG, DL, V);
      SDValue Hi = extractWide32Hi(DAG, DL, V);
      pushI16(Hi);
      pushI16(Lo);
      continue;
    }
    if (Outs[i].VT == MVT::i8)
      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
    pushI16(V);
  }

  // i32 first-arg. Whole (legal-i32): split into lo/hi and copy
  // to $a/$x separately — avoids AX32 in the MIR (see
  // W65816LowerWide32). Split-i32 (legacy 2-i16): hi in X first,
  // then lo in A below.
  if (I32WholeFirstArg) {
    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
    Glue = Chain.getValue(1);
    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
    Glue = Chain.getValue(1);
  } else if (I32SplitFirstArg) {
    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
    Glue = Chain.getValue(1);
  }
  // Arg 0 in A — only for non-whole-i32 first-arg. Whole-i32
  // already copied to A/X above.
  if (!I32WholeFirstArg && !OutVals.empty()) {
    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
    Glue = Chain.getValue(1);
  }

  // Callee target type must match iPTR (i16 in ptr16, i32 in ptr32).
  // The CALL SDNode's operand-type profile (SDT_W65816Call) is iPTR;
  // hardcoding MVT::i16 here mismatches under p:32:16.
  EVT CalleeVT = getPointerTy(DAG.getDataLayout());
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee))
    Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, CalleeVT);
  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), CalleeVT);

  // Call operands: chain, callee, the argument registers that were set
  // above, and finally the glue tying the call to those copies.
  SmallVector<SDValue, 4> CallOps = {Chain, Callee};
  if (I32WholeFirstArg) {
    CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
    CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
  } else if (!OutVals.empty()) {
    CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
    if (I32SplitFirstArg)
      CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
  }
  if (Glue.getNode())
    CallOps.push_back(Glue);
  Chain = DAG.getNode(W65816ISD::CALL, DL,
                      DAG.getVTList(MVT::Other, MVT::Glue), CallOps);
  Glue = Chain.getValue(1);
  Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
  Glue = Chain.getValue(1);

  // Read return value(s). Mirrors LowerReturn: build a flat list of the
  // i16-sized halves expected from the call (an i32 contributes two), copy
  // them from A, X, Y, DPF0 in order, then re-assemble i32 halves into a
  // Wide32 SDValue. A single whole-i32 return is simply the two-half case
  // (lo from $a, hi from $x); AX32 is never used in the SDAG / MIR — see
  // the W65816LowerWide32 pass.
  SmallVector<MVT, 4> ExpVT;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    MVT VT = Ins[i].VT;
    if (VT == MVT::i32) {
      ExpVT.push_back(MVT::i16);
      ExpVT.push_back(MVT::i16);
    } else if (VT == MVT::i16 || VT == MVT::i8) {
      ExpVT.push_back(VT);
    } else {
      report_fatal_error("W65816: return half must be i8/i16/i32");
    }
  }
  if (ExpVT.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};
  SmallVector<SDValue, 4> Halves;
  for (unsigned i = 0; i != ExpVT.size(); ++i) {
    SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], ExpVT[i], Glue);
    Chain = V.getValue(1);
    Glue = V.getValue(2);
    Halves.push_back(V);
  }

  // Re-pack halves into the original Ins shape (i32s rebuild via
  // buildWide32; i8/i16 pass through).
  unsigned HalfIdx = 0;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (Ins[i].VT == MVT::i32) {
      InVals.push_back(
          buildWide32(DAG, DL, Halves[HalfIdx], Halves[HalfIdx + 1]));
      HalfIdx += 2;
    } else {
      InVals.push_back(Halves[HalfIdx]);
      HalfIdx += 1;
    }
  }
  return Chain;
}
SDValue W65816TargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // Lower a function return into CopyToReg nodes glued to RET_GLUE.
  //
  // Return ABI:
  //   i8/i16: value in A.
  //   i32:    low half in A, high half in X.
  //   i64:    halves in A, X, Y, and a fixed direct-page slot at $F0..$F1
  //           (halves 0..2 -> A,X,Y; half 3 -> the DPF0 pseudo-register).
  //   wider:  not yet supported (fatal error).
  //
  // Type legalization splits an illegal i32 into 2 consecutive i16 Outs and
  // an i64 into 4. With i32 legal, an Outs entry may itself be MVT::i32; each
  // such entry is expanded here into its two i16 halves (extractWide32Lo /
  // extractWide32Hi) so the same A/X/Y/DPF0 half-based ABI covers every
  // case, including the single whole-i32 return (lo -> $a, hi -> $x; AX32
  // is never used in the SDAG — see W65816LowerWide32).
  SmallVector<MVT, 4> ExpVT;
  SmallVector<SDValue, 4> ExpVals;
  for (unsigned i = 0; i != Outs.size(); ++i) {
    MVT VT = Outs[i].VT;
    if (VT == MVT::i32) {
      ExpVT.push_back(MVT::i16);
      ExpVT.push_back(MVT::i16);
      ExpVals.push_back(extractWide32Lo(DAG, DL, OutVals[i]));
      ExpVals.push_back(extractWide32Hi(DAG, DL, OutVals[i]));
    } else if (VT == MVT::i16 || VT == MVT::i8) {
      ExpVT.push_back(VT);
      ExpVals.push_back(OutVals[i]);
    } else {
      report_fatal_error("W65816: return half must be i8/i16/i32");
    }
  }
  if (ExpVT.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Half i is returned in RetRegs[i].
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};

  // Emission order matters: copy the *highest* halves first so that the
  // regalloc can place each through A (the only ALU reg) without conflict.
  // The TAX/TAY in copyPhysReg preserves A, so subsequent low-half copies
  // to A don't clobber.
  //
  // Half 3 goes to DP $F0 via CopyToReg(DPF0). Using the DPF0 fake physreg
  // (lowered to `STA $F0` by copyPhysReg) is critical: a generic ISD::STORE
  // with addr=0xF0 lowered to `sta (d,s),y`, an indirect through the DBR,
  // which silently misbehaved when DBR != 0. STA dp uses D + dp directly
  // and is unaffected by DBR. It is done first so its computation can use A
  // freely before A holds the low result.
  SDValue Glue;
  for (unsigned i = ExpVals.size(); i-- > 0;) {
    Chain = DAG.getCopyToReg(Chain, DL, RetRegs[i], ExpVals[i], Glue);
    Glue = Chain.getValue(1);
  }

  // RET_GLUE operands: chain, one Register operand per returned half (so
  // DCE doesn't strip the copies), then the glue from the last copy.
  SmallVector<SDValue, 8> RetOps;
  RetOps.push_back(Chain);
  for (unsigned i = 0; i != ExpVals.size(); ++i)
    RetOps.push_back(DAG.getRegister(RetRegs[i], ExpVT[i]));
  if (Glue.getNode())
    RetOps.push_back(Glue);
  return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
SDValue
W65816TargetLowering::PerformDAGCombine(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  // Target-specific DAG combines. Returns the replacement value, or an
  // empty SDValue when no rewrite applies. Three node kinds are handled:
  //
  //   STORE / LOAD  Pick cheaper addressing for three pointer shapes:
  //                 (a) Wide32 pointer whose bank half is the constant 0,
  //                 (b) global + i16 index, (c) plain constant address.
  //   SHL           (shl X, K) with K in {1,2} on an ILLEGAL type of at
  //                 least 32 bits becomes a chain of K (add x, x).
  //
  // Details and rationale sit next to each rewrite below.
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Shape (a): Wide32-of-Wrapper-with-zero-hi -> i16 Wrapper. Under
  // p:32:16, LowerGlobalAddress builds GlobalAddress as a Wide32 reg pair
  // `(REG_SEQUENCE Wrapper(off_i16), 0_i16)`. Stores/loads against this
  // Wide32 ptr fall to the heavy [dp],Y path (16 B / 30 cyc) even when the
  // bank half is the constant 0 — we want the cheap DBR-relative `sta g` /
  // `lda g` (3 B / 5 cyc). Detect the shape and recombine the ptr so the
  // existing tablegen `(store v, (Wrapper tglob))` -> STAabs / `(load
  // (Wrapper tglob))` -> LDAabs patterns fire. This is correct ONLY when
  // bank=0 — under GS/OS Loader, DBR is set to our bank by crt0Gsos, so
  // DBR-relative addressing reaches the same global.
  //
  // Returns either the i16 Wrapper, or a TargetConstant:i32 when the low
  // half is a constant (so the timm pattern fires), or an empty SDValue
  // when Ptr does not have this shape.
  auto unwrapWide32WithZeroHi = [&](SDValue Ptr) -> SDValue {
    if (Ptr.getValueType() != MVT::i32) return SDValue();
    if (!Ptr.getNode()->isMachineOpcode()) return SDValue();
    if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
      return SDValue();
    // REG_SEQUENCE operands after the class id are (value, subreg-index)
    // pairs; pick out the sub_lo and sub_hi values.
    SDValue Lo, Hi;
    for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
      auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1));
      if (!CIdx) continue;
      if (CIdx->getZExtValue() == llvm::sub_lo) Lo = Ptr.getOperand(i);
      else if (CIdx->getZExtValue() == llvm::sub_hi) Hi = Ptr.getOperand(i);
    }
    if (!Lo || !Hi) return SDValue();
    auto *HiC = dyn_cast<ConstantSDNode>(Hi);
    if (!HiC || HiC->getZExtValue() != 0) return SDValue();
    if (Lo.getOpcode() == W65816ISD::Wrapper) return Lo;
    if (auto *LoC = dyn_cast<ConstantSDNode>(Lo)) {
      // Recombine into a TargetConstant:i32 so the `(store v, (iPTR
      // timm))` STAabs pattern fires. Returning an i16 Constant
      // would create a malformed STORE node (Ptr type mismatch) and
      // returning a regular Constant:i32 would re-trigger
      // LowerI32Constant (which only matches ISD::Constant) and loop.
      return DAG.getTargetConstant(LoC->getZExtValue(), SDLoc(Ptr),
                                   MVT::i32);
    }
    return SDValue();
  };

  // If V is EXTRACT_SUBREG(REG_SEQUENCE ...), return the REG_SEQUENCE's
  // sub_lo value as found by lookThroughRegSeq; otherwise return V itself.
  auto lookThroughExtractSubLo = [](SDValue V) -> SDValue {
    if (V.getNode() && V.isMachineOpcode() &&
        V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) {
      SDValue Src = V.getOperand(0);
      if (Src.isMachineOpcode() &&
          Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
        if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo))
          return X;
      }
    }
    return V;
  };

  // True for Wrapper(TargetGlobalAddress) / Wrapper(TargetExternalSymbol).
  auto isWrapperGlobal = [](SDValue V) {
    if (V.getOpcode() != W65816ISD::Wrapper) return false;
    unsigned Op = V.getOperand(0).getOpcode();
    return Op == ISD::TargetGlobalAddress ||
           Op == ISD::TargetExternalSymbol;
  };

  // Shape (b): global + i16 index. Ptr is a REG_SEQUENCE produced by
  // LowerI32Bin from `(add (Wrapper sym) (zext i16 idx))`:
  //   sub_lo = ADDC(Wrapper, idx) — operands are TargetExtractSubreg
  //            wrapping each side's Wide32
  //   sub_hi = ADDE(0, 0, carry)  — ignored (idx fits in 16 bits,
  //            so any carry stays in bank)
  // On a match, Sym receives the wrapped target symbol and Idx the i16
  // index. Sym and Idx must be empty on entry.
  // NOTE(review): the sub_hi half of Ptr is never inspected, so the
  // rewrite relies on the index really being a zero-extended i16 —
  // confirm no other producer creates this REG_SEQUENCE/ADDC shape.
  auto matchGlobalPlusIdx = [&](SDValue Ptr, SDValue &Sym,
                                SDValue &Idx) -> bool {
    if (!Ptr.getNode() || !Ptr.isMachineOpcode() ||
        Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
      return false;
    SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
    if (!Lo || Lo.getOpcode() != ISD::ADDC)
      return false;
    SDValue A = lookThroughExtractSubLo(Lo.getOperand(0));
    SDValue B = lookThroughExtractSubLo(Lo.getOperand(1));
    if (isWrapperGlobal(A)) { Sym = A.getOperand(0); Idx = B; }
    else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; }
    return Sym && Idx.getValueType() == MVT::i16;
  };

  if (N->getOpcode() == ISD::STORE) {
    auto *St = cast<StoreSDNode>(N);
    EVT MemVT = St->getMemoryVT();
    SDValue Ptr = St->getBasePtr();
    // Skip i32 stores — LowerStore's i32 path has its own Wide32-zero-hi
    // const-addr fast path that emits two i16 stores at separate
    // TargetConstant addrs. Unwrapping here would short-circuit that
    // and produce a malformed ADD(TargetConstant, Constant) when the
    // hi-half store needs Ptr+2.
    if (MemVT != MVT::i32) {
      if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr))
        return DAG.getTruncStore(St->getChain(), DL, St->getValue(), I16Ptr,
                                 MemVT, St->getMemOperand());

      // Global+i16-idx fast path for STORES (companion to the LOAD
      // branch below): rewrite to
      //   CopyToReg($x, idx) + CopyToReg($a, val) + STA_AbsX / STA8absX.
      SDValue Sym, Idx;
      if ((MemVT == MVT::i16 || MemVT == MVT::i8) &&
          matchGlobalPlusIdx(Ptr, Sym, Idx)) {
        SDValue Val = St->getValue();
        // STA8absX copies $a register at i16 width (M=0); the SEP
        // wraps narrow it. Promote i8 stored value to i16.
        if (Val.getValueType() == MVT::i8)
          Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
        SDValue Glue;
        SDValue C1 =
            DAG.getCopyToReg(St->getChain(), DL, W65816::X, Idx, Glue);
        Glue = C1.getValue(1);
        SDValue C2 = DAG.getCopyToReg(C1, DL, W65816::A, Val, Glue);
        Glue = C2.getValue(1);
        SDVTList StaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
        unsigned Opc = (MemVT == MVT::i8) ? W65816::STA8absX
                                          : W65816::STA_AbsX;
        SDNode *Sta = DAG.getMachineNode(Opc, DL, StaVTs, {Sym, C2, Glue});
        return SDValue(Sta, 0);
      }
    }

    // Shape (c): constant address (e.g. `*(volatile uint8*)0xC035 = v`).
    // i8 const-addr -> STA8long (timm pattern); i16 const-addr -> STAabs
    // (timm pattern, DBR-relative). Re-issue the store with the address as
    // a TargetConstant: that is opaque to LowerI32Constant, so it survives
    // intact to ISel where `(store Acc8, (iPTR imm)) -> STA8long` matches
    // (`imm` accepts both Constant and TargetConstant). 4 B / 6 cyc
    // bank-explicit `sta long` instead of 16 B / 30 cyc [dp],Y. i32 stores
    // split into 2 i16 stores via LowerStore so they come back through
    // this combine as MemVT==i16.
    if (MemVT != MVT::i8 && MemVT != MVT::i16) return SDValue();
    // Test the opcode rather than dyn_cast<ConstantSDNode>: the cast also
    // accepts ISD::TargetConstant, which would make this rewrite re-fire
    // on its own output and merely rebuild the identical node.
    if (Ptr.getOpcode() == ISD::Constant) {
      auto *C = cast<ConstantSDNode>(Ptr);
      SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
                                             Ptr.getValueType());
      return DAG.getTruncStore(St->getChain(), DL, St->getValue(), NewPtr,
                               MemVT, St->getMemOperand());
    }
    return SDValue();
  }

  if (N->getOpcode() == ISD::LOAD) {
    auto *Ld = cast<LoadSDNode>(N);
    EVT MemVT = Ld->getMemoryVT();
    EVT VT = Ld->getValueType(0);
    SDValue Ptr = Ld->getBasePtr();
    // Wide32-of-Wrapper-with-zero-hi -> i16 Wrapper (companion to the
    // STORE side just above). Lets `(load (Wrapper g))` -> LDAabs fire.
    // Skip i32 loads — LowerLoad's i32 path does its own Ptr+2 ADD
    // arithmetic and would choke on a TargetConstant unwrap result.
    if (MemVT != MVT::i32) {
      if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr))
        return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
                              Ld->getChain(), I16Ptr, MemVT,
                              Ld->getMemOperand());

      // Global+i16-idx fast path: rewrite the LOAD to a
      // CopyToReg($x, idx) + LDA_AbsX(sym) sequence. Saves ~45 bytes /
      // ~70 cyc vs the 24-bit [dp],Y deref. Correct under the data-bank
      // invariant (DBR = global's bank).
      SDValue Sym, Idx;
      if ((MemVT == MVT::i16 || MemVT == MVT::i8) &&
          matchGlobalPlusIdx(Ptr, Sym, Idx)) {
        SDValue Glue;
        SDValue NewChain =
            DAG.getCopyToReg(Ld->getChain(), DL, W65816::X, Idx, Glue);
        Glue = NewChain.getValue(1);
        SDVTList LdaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
        unsigned Opc = (MemVT == MVT::i8) ? W65816::LDA8absX
                                          : W65816::LDA_AbsX;
        SDNode *Lda = DAG.getMachineNode(Opc, DL, LdaVTs,
                                         {Sym, NewChain, Glue});
        SDValue LdaChain = SDValue(Lda, 0);
        SDValue LdaGlue = SDValue(Lda, 1);
        // Read $a back at the original LOAD's result VT. For an i8 memory
        // access widened to i16, an M=8 LDA only writes the low byte of
        // $a and leaves the high byte at whatever $a held before the
        // load, so the high byte must be fixed up explicitly: mask for
        // zext, sign-extend-in-register for sext.
        SDValue Val = DAG.getCopyFromReg(LdaChain, DL, W65816::A,
                                         VT, LdaGlue);
        SDValue Chain2 = Val.getValue(1);
        if (MemVT == MVT::i8 && VT == MVT::i16) {
          if (Ld->getExtensionType() == ISD::ZEXTLOAD) {
            Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val,
                              DAG.getConstant(0xFF, DL, MVT::i16));
          } else if (Ld->getExtensionType() == ISD::SEXTLOAD) {
            Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16,
                              Val, DAG.getValueType(MVT::i8));
          }
          // EXTLOAD: high byte don't-care, leave alone.
        }
        // Replace both LOAD results: the value and the output chain.
        return DAG.getMergeValues({Val, Chain2}, DL);
      }
    }

    // Constant address. Only the i8 const-addr path has dedicated tablegen
    // patterns (LDA8long); skip i16 const-addr loads (no LDAabs imm
    // pattern) and i32 (would re-fire on the same node with different
    // shape).
    if (MemVT != MVT::i8 || (VT != MVT::i8 && VT != MVT::i16))
      return SDValue();
    // As on the STORE side, only rewrite a genuine ISD::Constant so an
    // already-converted TargetConstant address is left alone.
    if (Ptr.getOpcode() == ISD::Constant) {
      auto *C = cast<ConstantSDNode>(Ptr);
      SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
                                             Ptr.getValueType());
      return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
                            Ld->getChain(), NewPtr, MemVT,
                            Ld->getMemOperand());
    }
    return SDValue();
  }

  // (shl X, K) -> chain of K (add x, x) for small K. After type
  // legalisation the i32 add splits via ADDC/ADDE pseudos which expand
  // to native ASL/ROL + carry-chain — much cheaper than the type-
  // legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick
  // to compute the bit crossing the half boundary. Each ADD expands to
  // ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for
  // K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2.
  // `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`)
  // benefits the most. i16 SHL by 1..15 has dedicated ASLA16 patterns
  // already, so the rewrite is restricted to types of 32 bits or more.
  //
  // Only done when the type is ILLEGAL (i.e., gets type-split into i16
  // halves). When i32 is a legal type (Wide32 reg class for ptr32 mode),
  // the rewrite cycles against LLVM's generic `(add x, x) -> (shl x, 1)`
  // combine in the i64 -> 2 i32 split path, hanging the legalizer.
  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 &&
      !isTypeLegal(N->getValueType(0))) {
    if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
      uint64_t K = C->getZExtValue();
      if (K >= 1 && K <= 2) {
        EVT VT = N->getValueType(0);
        SDValue R = N->getOperand(0);
        for (uint64_t i = 0; i < K; ++i)
          R = DAG.getNode(ISD::ADD, DL, VT, R, R);
        return R;
      }
    }
  }
  return SDValue();
}
// Custom-lowering for ISD::MUL i32. Tried in this order:
//   1. Multiply by a constant of the form 2^N +/- 1 (N in 1..5): inline
//      shift plus add/sub.
//   2. Both operands narrowable to i16 (zext/any-ext from i16, or high 16
//      bits provably zero): libcall to __umulhisi3 (16x16 -> 32) instead of
//      the heavier __mulsi3 (32x32 -> 32). Saves the 32-bit arg marshaling
//      AND the 32-bit accumulator math inside the libcall — roughly
//      equivalent to Calypsi 5.16's `_Mul16`.
//   3. Otherwise: the standard __mulsi3 libcall.
SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  assert(Op.getValueType() == MVT::i32 && "LowerMUL_I32 expects i32");
  SDValue Lhs = Op.getOperand(0);
  SDValue Rhs = Op.getOperand(1);

  // Return an i16 value whose zero-extension may stand in for V, or an
  // empty SDValue if V cannot be narrowed.
  auto narrowToI16 = [&](SDValue V) -> SDValue {
    // Explicit zext-from-i16 (the IR-level form, before SDAG flattening).
    // ANY_EXTEND-from-i16 is accepted too: its high 16 bits are
    // unspecified, so treating them as zero is a valid choice.
    if ((V.getOpcode() == ISD::ZERO_EXTEND ||
         V.getOpcode() == ISD::ANY_EXTEND) &&
        V.getOperand(0).getValueType() == MVT::i16)
      return V.getOperand(0);
    // High 16 bits provably zero?
    KnownBits K = DAG.computeKnownBits(V);
    if (K.countMinLeadingZeros() >= 16)
      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, V);
    return SDValue();
  };

  // Emit `i32 Name(ArgTy A, ArgTy B)` as a C-convention libcall chained to
  // the entry node and return its result value.
  auto emitBinaryLibcall = [&](const char *Name, Type *ArgTy, SDValue A,
                               SDValue B) -> SDValue {
    TargetLowering::ArgListTy Args;
    Args.push_back({A, ArgTy});
    Args.push_back({B, ArgTy});
    SDValue Callee =
        DAG.getExternalSymbol(Name, getPointerTy(DAG.getDataLayout()));
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(DL)
        .setChain(DAG.getEntryNode())
        .setLibCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
                      Callee, std::move(Args));
    return LowerCallTo(CLI).first;
  };

  // Mul-by-constant strength reduction: (X * K) where K-1 or K+1 is
  // a small power of 2 (shift count 1..5, matching our inlined i32
  // SHL range) expands to (X << N) +/- X — saves a __mulsi3 libcall
  // (~250 cyc) for ~70 cyc of inlined shift+add. Catches djb2Hash's
  // `h * 33` = (h << 5) + h.
  //
  // For each N the add form is tested before the sub form, so:
  //   K = 2^N + 1 in {3,5,9,17,33} -> (X << N) + X
  //   K = 2^N - 1 in {7,15,31}     -> (X << N) - X
  //   K = 1 also matches the N=1 sub form as (X << 1) - X; presumably the
  //   generic combiner folds x*1 before this point — TODO confirm.
  // Larger N hits the i32 SHL libcall path (no longer profitable).
  if (auto *CN = dyn_cast<ConstantSDNode>(Rhs)) {
    const int64_t K = CN->getSExtValue();
    for (unsigned N = 1; N <= 5; ++N) {
      const int64_t Pow = int64_t{1} << N;
      unsigned CombineOpc;
      if (K == Pow + 1)
        CombineOpc = ISD::ADD;
      else if (K == Pow - 1)
        CombineOpc = ISD::SUB;
      else
        continue;
      // Build the shift amount only for the N that actually matched.
      SDValue ShAmt = DAG.getConstant(N, DL, MVT::i16);
      SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs, ShAmt);
      return DAG.getNode(CombineOpc, DL, MVT::i32, Shl, Lhs);
    }
  }

  // 16x16 -> 32 when both operands narrow. Rhs is only examined once Lhs
  // has narrowed, so no TRUNCATE is built for Rhs when Lhs already failed.
  if (SDValue A = narrowToI16(Lhs))
    if (SDValue B = narrowToI16(Rhs))
      return emitBinaryLibcall("__umulhisi3",
                               Type::getInt16Ty(*DAG.getContext()), A, B);

  // Fall back to the standard __mulsi3 libcall.
  return emitBinaryLibcall("__mulsi3", Type::getInt32Ty(*DAG.getContext()),
                           Lhs, Rhs);
}
// Translate a single-branch W65816CC condition code into the conditional
// branch opcode that tests it. Any other code is a programming error.
static unsigned getBranchOpcodeForCC(unsigned CC) {
  struct CCBranch {
    unsigned Cond;
    unsigned BranchOpc;
  };
  static constexpr CCBranch Table[] = {
      {W65816CC::COND_EQ, W65816::BEQ}, {W65816CC::COND_NE, W65816::BNE},
      {W65816CC::COND_HS, W65816::BCS}, {W65816CC::COND_LO, W65816::BCC},
      {W65816CC::COND_MI, W65816::BMI}, {W65816CC::COND_PL, W65816::BPL},
      {W65816CC::COND_VS, W65816::BVS}, {W65816CC::COND_VC, W65816::BVC},
  };
  for (const CCBranch &Entry : Table)
    if (Entry.Cond == CC)
      return Entry.BranchOpc;
  llvm_unreachable("invalid W65816 condition code");
}
// For multi-branch CCs, return the (branchA, branchB, BothMeanTrue) tuple.
// branchA is tested first; if it takes, we go to TrueBB if BothMeanTrue
// (i.e. both branches are "take if true"), otherwise to FalseBB. branchB
// is tested next with the same semantic.
//
// GT : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB
// LE : (BMI || BEQ) → BEQ TrueBB; BMI TrueBB; fall-through FalseBB
// HI : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB
// LS : (BCC || BEQ) → BEQ TrueBB; BCC TrueBB; fall-through FalseBB
struct MultiBranch {
unsigned First, Second;
bool FirstToTrue, SecondToTrue;
};
static MultiBranch getMultiBranch(unsigned CC) {
switch (CC) {
case W65816CC::COND_GT_MB:
return {W65816::BEQ, W65816::BPL, false, true};
case W65816CC::COND_LE_MB:
return {W65816::BEQ, W65816::BMI, true, true};
case W65816CC::COND_HI_MB:
return {W65816::BEQ, W65816::BCS, false, true};
case W65816CC::COND_LS_MB:
return {W65816::BEQ, W65816::BCC, true, true};
}
llvm_unreachable("not a multi-branch CC");
}
// Expand a register-register pseudo into `StoreOp rhs -> slot` followed by
// `OpFI lhs, slot`. A new 2-byte spill slot is created on every call.
// The RIGHT-hand operand is always the one spilled, so non-commutative ops
// (sub, cmp) still compute lhs OP rhs: OPfi yields lhs OP load(slot).
// With HasOut the pseudo's operand 0 is the result register and the sources
// are operands 1 and 2; without it (CMP) the sources are operands 0 and 1
// and the emitted op defines no register. The pseudo is erased and BB is
// returned unchanged.
static MachineBasicBlock *
emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp,
         unsigned OpFI, bool HasOut) {
  MachineFunction &Fn = *BB->getParent();
  const W65816InstrInfo &TII =
      *Fn.getSubtarget<W65816Subtarget>().getInstrInfo();
  DebugLoc Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  int Slot = Fn.getFrameInfo().CreateStackObject(2, Align(2),
                                                 /*isSpillSlot=*/true);

  // Sources sit right after the optional result operand.
  const unsigned FirstSrcIdx = HasOut ? 1 : 0;
  Register Lhs = MI.getOperand(FirstSrcIdx).getReg();
  Register Rhs = MI.getOperand(FirstSrcIdx + 1).getReg();

  // Park the rhs in the slot.
  BuildMI(*BB, InsertPt, Loc, TII.get(StoreOp))
      .addReg(Rhs)
      .addFrameIndex(Slot)
      .addImm(0);

  // Emit the memory-operand form, with or without a result register.
  MachineInstrBuilder OpMI =
      HasOut ? BuildMI(*BB, InsertPt, Loc, TII.get(OpFI),
                       MI.getOperand(0).getReg())
             : BuildMI(*BB, InsertPt, Loc, TII.get(OpFI));
  OpMI.addReg(Lhs).addFrameIndex(Slot).addImm(0);

  MI.eraseFromParent();
  return BB;
}
MachineBasicBlock *
W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
// The only opcode we currently emit with usesCustomInserter=1 is
// SELECT_CC16. Expand it into a diamond CFG with a PHI. For
// single-branch CCs:
//
// thisMBB:
// ... CMP already emitted ...
// Bxx sinkMBB ; branch to "true" path
// ; fall through to copy0MBB
// copy0MBB:
// ; (no instructions; PHI picks fval here)
// sinkMBB:
// dst = PHI [tval, thisMBB], [fval, copy0MBB]
//
// For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a
// single Bxx isn't enough), insert two branches. Both target either
// sinkMBB or copy0MBB depending on the condition.
switch (MI.getOpcode()) {
default:
llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter");
case W65816::ADD_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true);
case W65816::SUB_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true);
// Carry-chain variants for the hi half of an i32 split. STAfi doesn't
// touch P, so the carry from the previous addc/adde survives the
// spill and is consumed by ADCEfi/SBCEfi below.
case W65816::ADDE_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true);
case W65816::SUBE_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true);
case W65816::AND_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true);
case W65816::ORA_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true);
case W65816::EOR_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true);
case W65816::CMP_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false);
case W65816::LDAptr32S:
case W65816::STAptr32S:
case W65816::STBptr32S: {
  // Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of
  // 1 Wide32 reg pair. Used by the W65816LowerWide32 pre-RA pass
  // to dodge pair-allocation pressure.
  //
  // Operand layout is the same for all three opcodes:
  //   0: loaded value (LDAptr32S) or value to store (STAptr32S/STBptr32S)
  //   1: pointer low half
  //   2: pointer high half
  //
  // Expansion: spill both halves to fresh stack slots, reload each one
  // through A into $E0 / $E2, then dereference via [$E0],Y with Y=0.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned Opc = MI.getOpcode();
  const bool IsLoad = Opc == W65816::LDAptr32S;
  const bool IsByteStore = Opc == W65816::STBptr32S;
  // The pointer halves sit at the same operand indices for loads and
  // stores, so no per-opcode selection is needed.
  Register PtrLo = MI.getOperand(1).getReg();
  Register PtrHi = MI.getOperand(2).getReg();

  // All new instructions are inserted immediately before MI.
  auto Emit = [&](unsigned NewOpc) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc));
  };
  auto EmitDef = [&](unsigned NewOpc, Register Dst) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc), Dst);
  };

  int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  Emit(W65816::STAfi).addReg(PtrLo).addFrameIndex(FILo).addImm(0);
  Emit(W65816::STAfi).addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

  // STA_DP's tablegen def has no implicit A Use, so without an
  // explicit marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP pairs
  // the fast regalloc collapses two A-loads into one (the first's
  // value is overwritten before STA_DP can store it). The implicit
  // Use of A on each STA_DP encodes the dependency and also helps
  // post-RA passes track A liveness.
  EmitDef(W65816::LDAfi, W65816::A).addFrameIndex(FILo).addImm(0);
  Emit(W65816::STA_DP).addImm(0xE0).addReg(W65816::A, RegState::Implicit);
  EmitDef(W65816::LDAfi, W65816::A).addFrameIndex(FIHi).addImm(0);
  Emit(W65816::STA_DP).addImm(0xE2).addReg(W65816::A, RegState::Implicit);

  if (IsLoad) {
    Register Dst = MI.getOperand(0).getReg();
    Emit(W65816::LDY_Imm16).addImm(0);
    Emit(W65816::LDA_DPIndLongY).addImm(0xE0);
    EmitDef(TargetOpcode::COPY, Dst).addReg(W65816::A);
  } else {
    Register Val = MI.getOperand(0).getReg();
    EmitDef(TargetOpcode::COPY, W65816::A).addReg(Val);
    Emit(W65816::LDY_Imm16).addImm(0);
    // Byte stores bracket the STA with SEP/REP #$20 so only one byte
    // is written.
    if (IsByteStore)
      Emit(W65816::SEP).addImm(0x20);
    Emit(W65816::STA_DPIndLongY).addImm(0xE0);
    if (IsByteStore)
      Emit(W65816::REP).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptr32:
case W65816::STAptr32:
case W65816::STBptr32: {
// Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
// pointer is a Wide32 register pair: sub_lo carries the low 16
// bits of the address, sub_hi carries the bank byte in its low
// half (high half is pad, ORCA convention). Stage at $E0..$E2,
// then [dp],Y addresses the right bank without forcing 0.
//
// MI-level peephole: if the Wide32 ptr is the sole user of a
// `REG_SEQUENCE(ADCi16imm BaseLo K, sub_lo, ADCEi16imm BaseHi 0,
// sub_hi)` chain (= `(add Wide32, K)` after ISel), peel the
// offset and pass K via the Y register on the `[dp],Y` deref.
// Saves ~3 instructions per access (the CLC/ADC/ADC carry chain).
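// The bank-wrap caveat from LDAptr32Off applies: the Y addition is
// assumed here NOT to propagate beyond 16 bits, so the target object
// must not span a bank boundary (true for malloc'd / globally-allocated
// ptr32 objects; struct sizeof is far below 64KB).
// NOTE(review): the LDAptrOff inserter's comment states the opposite
// (that [dp],Y adds Y to the full 24-bit pointer and can carry into the
// next bank). Both cannot be right; confirm against the W65C816S
// datasheet and reconcile the two comments.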
//
// Doing this here rather than in LowerLoad / a SDAG combine avoids
// the JSON-tokenizer + BST + sprintf smoke regressions those paths
// tripped — the rewrites perturbed SDAG scheduling in ways that
// bisection couldn't pin down. At MI level, the rewrite is
// structural: ADCi16imm/ADCEi16imm become dead and get DCE'd.
//
// Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
// on i32 address type).
MachineFunction *MF = BB->getParent();
const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
const W65816InstrInfo &TII = *STI.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
Register Ptr = MI.getOperand(IsLoad ? 1 : 1).getReg();
// Try the ADC-chain peel. We need:
// 1. Ptr has exactly one use (this MI) — else other users still
// need the full computed Wide32, no net win.
// 2. Ptr was defined by a REG_SEQUENCE.
// 3. Sub_lo source is ADCi16imm BaseLoReg KLo.
// 4. Sub_hi source is ADCEi16imm BaseHiReg 0.
// 5. KLo > 0 and KLo fits 16-bit unsigned.
Register PeelBaseLo, PeelBaseHi;
int64_t PeelOff = 0;
MachineInstr *DeadLoDef = nullptr;
MachineInstr *DeadHiDef = nullptr;
MachineInstr *DeadPtrDef = nullptr;
SmallVector<MachineInstr *, 4> ExtraChainDeads;
if (IsLoad && MRI.hasOneUse(Ptr)) {
  // ADC-chain peel (loads only). On success this fills in PeelBaseLo /
  // PeelBaseHi / PeelOff and records which now-dead instructions the
  // caller should erase (DeadPtrDef, DeadLoDef, DeadHiDef,
  // ExtraChainDeads). On any mismatch it leaves all of them untouched.
  MachineInstr *PtrDef = MRI.getUniqueVRegDef(Ptr);
  if (PtrDef && PtrDef->getOpcode() == TargetOpcode::REG_SEQUENCE) {
    // REG_SEQUENCE sources come in (reg, subreg-index) pairs after the
    // def operand.
    Register SubLoReg, SubHiReg;
    for (unsigned i = 1, e = PtrDef->getNumOperands(); i + 1 < e; i += 2) {
      unsigned SubIdx = PtrDef->getOperand(i + 1).getImm();
      Register R = PtrDef->getOperand(i).getReg();
      if (SubIdx == llvm::sub_lo) SubLoReg = R;
      else if (SubIdx == llvm::sub_hi) SubHiReg = R;
    }
    MachineInstr *LoDef =
        SubLoReg.isValid() ? MRI.getUniqueVRegDef(SubLoReg) : nullptr;
    MachineInstr *HiDef =
        SubHiReg.isValid() ? MRI.getUniqueVRegDef(SubHiReg) : nullptr;
    // We don't require SubLoReg/SubHiReg to be single-use: an
    // ADCi16imm result CSE'd across multiple users (e.g., `L+K` also
    // used as input to `(L+K)+M`) is fine; peeling THIS load doesn't
    // kill the original ADC chain. We only erase chain layers that are
    // single-use all the way from the outermost layer inward.
    // Only query use counts on registers that were actually found.
    const bool OuterSingleUse =
        SubLoReg.isValid() && SubHiReg.isValid() &&
        MRI.hasOneUse(SubLoReg) && MRI.hasOneUse(SubHiReg);

    // True iff Hi follows Lo in Lo's block and no instruction strictly
    // between them has a live def of $p. Otherwise the ADCE would read
    // a tampered carry and the substitution would change semantics.
    // Bounded by the block end so a Hi that precedes Lo is rejected
    // instead of walking off the instruction list.
    auto carryChainIntact = [](MachineInstr *Lo, MachineInstr *Hi) -> bool {
      MachineBasicBlock::instr_iterator End = Lo->getParent()->instr_end();
      MachineBasicBlock::instr_iterator Stop = Hi->getIterator();
      MachineBasicBlock::instr_iterator It = std::next(Lo->getIterator());
      for (; It != End && It != Stop; ++It) {
        for (const MachineOperand &MO : It->operands()) {
          if (MO.isReg() && MO.getReg() == W65816::P &&
              MO.isDef() && !MO.isDead())
            return false;
        }
      }
      return It != End;
    };

    if (LoDef && HiDef &&
        LoDef->getOpcode() == W65816::ADCi16imm &&
        HiDef->getOpcode() == W65816::ADCEi16imm &&
        // ADCi16imm and ADCEi16imm must be in the same MBB so we can
        // verify nothing clobbers $p between them.
        LoDef->getParent() == HiDef->getParent()) {
      const bool PChainOK = carryChainIntact(LoDef, HiDef);
      int64_t KLo = LoDef->getOperand(2).getImm();
      int64_t KHi = HiDef->getOperand(2).getImm();
      Register CandLo = LoDef->getOperand(1).getReg();
      Register CandHi = HiDef->getOperand(1).getReg();

      // Accept a vreg that's `COPY <phys-reg>` for any of the
      // arg/accumulator/index physregs. This catches both incoming
      // function args ($a/$x at entry) AND values that came from a
      // preceding load (where the result was COPYed off $a).
      auto isFromArgCopy = [&](Register R) -> bool {
        if (!R.isVirtual()) return false;
        MachineInstr *Def = MRI.getUniqueVRegDef(R);
        if (!Def || !Def->isCopy()) return false;
        const MachineOperand &Src = Def->getOperand(1);
        if (!Src.isReg() || !Src.getReg().isPhysical()) return false;
        Register Phys = Src.getReg();
        return Phys == W65816::A || Phys == W65816::X || Phys == W65816::Y;
      };
      // A vreg is "from a fixed (caller-pushed) stack arg" if its
      // unique def is LDAfi against a fixed FrameIndex. Caller-pushed
      // args live in immutable slots, so reading them later is
      // value-equivalent to reading them at function entry.
      auto isFromFixedArgSlot = [&](Register R) -> bool {
        if (!R.isVirtual()) return false;
        MachineInstr *Def = MRI.getUniqueVRegDef(R);
        if (!Def || Def->getOpcode() != W65816::LDAfi) return false;
        const MachineOperand &FIOp = Def->getOperand(1);
        if (!FIOp.isFI()) return false;
        return MF->getFrameInfo().isFixedObjectIndex(FIOp.getIndex());
      };
      // An arg base is either of the above, directly or through one
      // vreg-to-vreg COPY.
      auto isFromArg = [&](Register R) -> bool {
        if (isFromArgCopy(R) || isFromFixedArgSlot(R)) return true;
        if (!R.isVirtual()) return false;
        MachineInstr *Def = MRI.getUniqueVRegDef(R);
        if (!Def || !Def->isCopy()) return false;
        const MachineOperand &Src = Def->getOperand(1);
        if (!Src.isReg() || !Src.getReg().isVirtual()) return false;
        return isFromArgCopy(Src.getReg()) ||
               isFromFixedArgSlot(Src.getReg());
      };

      // Nested ADC chains arise from the i32-LOAD split (high half
      // loads at `Ptr+2`, where `Ptr` is itself `arg+K`). Walk back,
      // accumulating the offset, until we reach an arg base or exhaust
      // the chain.
      //
      // Narrow gate: walk back only until we reach an arg-base or
      // arg-slot base. A truly wide gate (peel any chain regardless of
      // base) makes Lua ~+0.85% LARGER because each peel adds 4B of
      // stack-slot staging that exceeds the carry-chain savings for
      // deep-chain cases. Tested 2026-05-25. Wider gates have also
      // historically tripped a FrameLowering-related smoke regression
      // in sprintf.
      int64_t Off = KLo;
      bool ChainOK = PChainOK && KHi == 0 && KLo > 0 && KLo <= 0xFFFF;
      // Cap on chain walks (avoid pathological deep chains).
      unsigned MaxChainDepth = 8;
      // A layer may be erased only if it AND every layer outside it is
      // single-use. Once one layer has other users it stays alive and
      // keeps reading the layers beneath it, so nothing deeper may be
      // erased either.
      bool ErasableSoFar = OuterSingleUse;
      SmallVector<MachineInstr *, 6> InnerDeads;
      while (ChainOK && MaxChainDepth-- > 0 &&
             (!isFromArg(CandLo) || !isFromArg(CandHi))) {
        if (!CandLo.isVirtual() || !CandHi.isVirtual()) {
          ChainOK = false; break;
        }
        MachineInstr *InnerLo = MRI.getUniqueVRegDef(CandLo);
        MachineInstr *InnerHi = MRI.getUniqueVRegDef(CandHi);
        if (!InnerLo || !InnerHi ||
            InnerLo->getOpcode() != W65816::ADCi16imm ||
            InnerHi->getOpcode() != W65816::ADCEi16imm ||
            InnerLo->getParent() != InnerHi->getParent()) {
          ChainOK = false; break;
        }
        const bool InnerSingleUse =
            MRI.hasOneUse(CandLo) && MRI.hasOneUse(CandHi);
        if (!carryChainIntact(InnerLo, InnerHi)) { ChainOK = false; break; }
        int64_t InnerKLo = InnerLo->getOperand(2).getImm();
        int64_t InnerKHi = InnerHi->getOperand(2).getImm();
        // Same constraints as the outer layer: hi immediate must be 0
        // and the lo immediate must not be negative. A negative lo
        // immediate paired with a 0 hi immediate is a large unsigned
        // add that carries into the hi half, not a small subtraction,
        // so it cannot be folded into the Y offset.
        if (InnerKHi != 0 || InnerKLo < 0) { ChainOK = false; break; }
        int64_t NewOff = Off + InnerKLo;
        if (NewOff > 0xFFFF) { ChainOK = false; break; }
        Off = NewOff;
        CandLo = InnerLo->getOperand(1).getReg();
        CandHi = InnerHi->getOperand(1).getReg();
        ErasableSoFar = ErasableSoFar && InnerSingleUse;
        if (ErasableSoFar) {
          InnerDeads.push_back(InnerLo);
          InnerDeads.push_back(InnerHi);
        }
        // Even if not erasable, keep walking back: the peel is still
        // correct, it just doesn't kill the inner chain.
      }
      if (ChainOK && Off > 0 && Off <= 0xFFFF &&
          isFromArg(CandLo) && isFromArg(CandHi)) {
        PeelBaseLo = CandLo;
        PeelBaseHi = CandHi;
        PeelOff = Off;
        DeadPtrDef = PtrDef;
        // Only erase ADC layers whose results have no other users.
        if (OuterSingleUse) {
          DeadLoDef = LoDef;
          DeadHiDef = HiDef;
          ExtraChainDeads.append(InnerDeads.begin(), InnerDeads.end());
        }
      }
    }
  }
}
// Layer 2 fast path: -w65816-dbr-safe-ptrs assumes the bank byte
// matches DBR, letting us skip $E0/$E2 staging entirely. Emit just
// a STAfi of sub_lo and an LDAfi_indY/STAfi_indY deref via the
// 16-bit stack-rel-indirect-Y opcode (0xB3 / 0x93). ~4 instr per
// deref saved vs the heavy [dp],Y indirect-long path.
if (DbrSafePtrs) {
Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
if (PeelOff) {
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
.addReg(PeelBaseLo);
} else {
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
.addReg(Ptr, (RegState)0, llvm::sub_lo);
}
int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
/*isSpillSlot=*/false);
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
.addReg(PtrLo).addFrameIndex(FILo).addImm(0);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
if (IsLoad) {
Register Dst = MI.getOperand(0).getReg();
// LDAfi_indY $dst, FILo — PEI resolves to LDA (FILo,S),Y.
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY),
W65816::A).addFrameIndex(FILo).addImm(0);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
} else {
Register Val = MI.getOperand(0).getReg();
BuildMI(*BB, MI.getIterator(), DL,
TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
if (IsByteStore)
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::SEP)).addImm(0x20);
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY))
.addReg(W65816::A).addFrameIndex(FILo).addImm(0);
if (IsByteStore)
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::REP)).addImm(0x20);
}
MI.eraseFromParent();
if (DeadPtrDef) DeadPtrDef->eraseFromParent();
if (DeadLoDef) DeadLoDef->eraseFromParent();
if (DeadHiDef) DeadHiDef->eraseFromParent();
for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent();
return BB;
}
// Extract the i16 sub-halves of the Wide32 ptr. At custom-inserter
// time Ptr is still a virtual register, so `TRI.getSubReg` won't
// work (it's physreg-only). Use COPY-with-subreg-index instead;
// the regalloc + virtreg-rewriter resolves this to the right
// physreg operand later.
Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
if (PeelOff) {
// Peeled path: pull base halves from the ADC chain's inputs.
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
.addReg(PeelBaseLo);
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
.addReg(PeelBaseHi);
} else {
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
.addReg(Ptr, (RegState)0, llvm::sub_lo);
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
.addReg(Ptr, (RegState)0, llvm::sub_hi);
}
// Spill each half to a fresh slot, reload via LDAfi. Same RA-
// pinning rationale as the i16 LDAptr inserter.
int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
/*isSpillSlot=*/false);
int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
/*isSpillSlot=*/false);
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
.addReg(PtrLo).addFrameIndex(FILo).addImm(0);
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
.addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
// Change 3: $E0/$E2 staging CSE. Look backward in this MBB for
// the previous ptr32-deref expansion. If its base halves match
// ours (same vreg source) and nothing between has clobbered
// $E0/$E2/$Y or the staged values, skip the LDAfi+STA_DP pairs
// and reuse the previously-staged $E0..$E2.
//
// Inserter pattern signature (from below, latest-emitted first):
// STA_DP $E2 (impl A)
// LDAfi <FIHi'> -> A
// STA_DP $E0 (impl A)
// LDAfi <FILo'> -> A
// STAfi <srcHi'>, FIHi', 0 <- prior PtrHi
// STAfi <srcLo'>, FILo', 0 <- prior PtrLo
bool ReuseStaging = false;
{
Register MySrcLo = PeelOff ? PeelBaseLo : Ptr;
Register MySrcHi = PeelOff ? PeelBaseHi : Register();
// For non-peel path, both halves come from `Ptr` via subreg; the
// CSE check uses the whole Ptr vreg (so two LDAptr32 with the
// same Ptr vreg can share staging).
auto It = MI.getIterator();
MachineInstr *PrevStaE2 = nullptr;
MachineInstr *PrevLdaHi = nullptr;
MachineInstr *PrevStaE0 = nullptr;
MachineInstr *PrevLdaLo = nullptr;
MachineInstr *PrevStaHi = nullptr;
MachineInstr *PrevStaLo = nullptr;
auto clobbersE0E2 = [&](MachineInstr &PrevMI) -> bool {
// Any call clobbers everything in DP — including $E0..$E3.
if (PrevMI.isCall()) return true;
switch (PrevMI.getOpcode()) {
// FrameLowering's long-indirect expansion of these uses $E2
// as A-stash scratch (see W65816RegisterInfo.cpp).
case W65816::ADCfi: case W65816::ADCEfi:
case W65816::ANDfi: case W65816::ORAfi: case W65816::EORfi:
case W65816::SBCfi: case W65816::SBCEfi:
case W65816::CMPfi:
return true;
case W65816::STA_DP:
case W65816::STZ_DP:
if (PrevMI.getOperand(0).isImm()) {
int64_t Imm = PrevMI.getOperand(0).getImm();
if (Imm == 0xE0 || Imm == 0xE1 ||
Imm == 0xE2 || Imm == 0xE3)
return true;
}
break;
}
return false;
};
// Scan back, fail-soft.
const unsigned MaxScan = 60;
unsigned Scanned = 0;
while (It != BB->begin() && Scanned++ < MaxScan) {
--It;
MachineInstr &P = *It;
if (!PrevStaE2) {
if (P.getOpcode() == W65816::STA_DP &&
P.getOperand(0).isImm() &&
P.getOperand(0).getImm() == 0xE2) {
PrevStaE2 = &P;
continue;
}
if (clobbersE0E2(P)) break;
continue;
}
// After PrevStaE2, expect LDAfi <FIHi'>.
if (!PrevLdaHi) {
if (P.getOpcode() == W65816::LDAfi) { PrevLdaHi = &P; continue; }
break;
}
if (!PrevStaE0) {
if (P.getOpcode() == W65816::STA_DP &&
P.getOperand(0).isImm() &&
P.getOperand(0).getImm() == 0xE0) {
PrevStaE0 = &P;
continue;
}
break;
}
if (!PrevLdaLo) {
if (P.getOpcode() == W65816::LDAfi) { PrevLdaLo = &P; continue; }
break;
}
// Now look for STAfi srcHi', FIHi' and STAfi srcLo', FILo'.
// They appear in either order; the inserter above emits Lo first
// then Hi, but scanning back, we hit Hi first.
if (!PrevStaHi) {
if (P.getOpcode() == W65816::STAfi &&
P.getOperand(1).isFI() &&
P.getOperand(1).getIndex() ==
PrevLdaHi->getOperand(1).getIndex()) {
PrevStaHi = &P;
continue;
}
break;
}
if (!PrevStaLo) {
if (P.getOpcode() == W65816::STAfi &&
P.getOperand(1).isFI() &&
P.getOperand(1).getIndex() ==
PrevLdaLo->getOperand(1).getIndex()) {
PrevStaLo = &P;
// Done with the structural match — fall through to operand
// comparison.
}
break;
}
}
if (PrevStaLo && PrevStaHi) {
Register PrevSrcLo = PrevStaLo->getOperand(0).getReg();
Register PrevSrcHi = PrevStaHi->getOperand(0).getReg();
// Match if the source vregs are identical to mine. For non-peel
// path, PtrLo/PtrHi were freshly created via COPY from Ptr.sub_*
// — match by tracing PrevSrcLo/Hi back through their COPY (if
// any) to the Ptr vreg.
auto traceToPtr = [&](Register R) -> Register {
if (!R.isVirtual()) return R;
MachineInstr *D = MRI.getUniqueVRegDef(R);
while (D && D->isCopy()) {
const MachineOperand &S = D->getOperand(1);
if (!S.isReg() || !S.getReg().isVirtual()) break;
R = S.getReg();
D = MRI.getUniqueVRegDef(R);
// For subreg copies, stop — we'd lose sub-half info.
if (D && D->getOpcode() == TargetOpcode::REG_SEQUENCE) break;
}
return R;
};
Register MyTraceLo = traceToPtr(PeelOff ? PeelBaseLo : PtrLo);
Register MyTraceHi = traceToPtr(PeelOff ? PeelBaseHi : PtrHi);
Register PrevTraceLo = traceToPtr(PrevSrcLo);
Register PrevTraceHi = traceToPtr(PrevSrcHi);
if (MyTraceLo == PrevTraceLo && MyTraceHi == PrevTraceHi &&
MyTraceLo.isValid() && MyTraceHi.isValid()) {
ReuseStaging = true;
}
}
(void)MySrcLo; (void)MySrcHi; // not used directly; trace covers
}
// Stage the 24-bit address at $E0..$E2 unless CSE allows reusing
// the previous staging.
// STA_DP's tablegen def has no implicit A Use, so without an
// explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
// pairs the fast regalloc collapses two A-loads into one (the
// first's value is overwritten before STA_DP can store it). Add
// implicit Use of A on the STA_DP to encode the dependency. This
// also helps post-RA passes track A liveness correctly.
if (!ReuseStaging) {
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
W65816::A).addFrameIndex(FILo).addImm(0);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::STA_DP)).addImm(0xE0)
.addReg(W65816::A, RegState::Implicit);
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
W65816::A).addFrameIndex(FIHi).addImm(0);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::STA_DP)).addImm(0xE2)
.addReg(W65816::A, RegState::Implicit);
}
if (IsLoad) {
Register Dst = MI.getOperand(0).getReg();
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
} else {
Register Val = MI.getOperand(0).getReg();
BuildMI(*BB, MI.getIterator(), DL,
TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
if (IsByteStore)
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::SEP)).addImm(0x20);
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
if (IsByteStore)
BuildMI(*BB, MI.getIterator(), DL,
TII.get(W65816::REP)).addImm(0x20);
}
MI.eraseFromParent();
if (DeadPtrDef) DeadPtrDef->eraseFromParent();
if (DeadLoDef) DeadLoDef->eraseFromParent();
if (DeadHiDef) DeadHiDef->eraseFromParent();
for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent();
return BB;
}
case W65816::LDAptr32Off:
case W65816::STAptr32Off:
case W65816::STBptr32Off: {
  // ptr32 deref with constant offset. The 65816's `[dp],Y` adds Y to
  // the 24-bit pointer at `dp..dp+2` to form the effective address, so
  // we can stage the RAW pointer at $E0..$E2 and put the offset in Y,
  // skipping the i32-add carry chain entirely.
  //
  // Saves ~3 instructions per access vs the previous approach (which
  // did `lo+off; hi+carry` to compute the pointer then derefed with
  // Y=0). Big win on heavy struct-field code like Lua's lapi.c.
  // See memory: ptr32-deref-fold-layer1-mi-opcodes.
  //
  // Bank-wrap caveat: this lowering assumes the object addressed by
  // pointer+Y does not straddle a bank boundary ($XX:FFFF). That holds
  // for struct fields with offsets < 64KB on malloc'd or
  // globally-allocated objects; the caller must not place objects
  // across a bank boundary.
  // NOTE(review): the original comment here said Y does not propagate
  // into the bank byte, while the LDAptrOff inserter's comment says
  // [dp],Y adds to the full 24-bit pointer. Confirm against the
  // W65C816S datasheet and reconcile.
  //
  // Operands: 0 = loaded value / value to store, 1 = Wide32 pointer,
  // 2 = immediate offset (loaded into Y as-is).
  //
  // Dead unless ptr32 mode is active.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned Opc = MI.getOpcode();
  const bool IsLoad = Opc == W65816::LDAptr32Off;
  const bool IsByteStore = Opc == W65816::STBptr32Off;
  Register Ptr = MI.getOperand(1).getReg();
  int64_t Off = MI.getOperand(2).getImm();

  // All new instructions are inserted immediately before MI.
  auto Emit = [&](unsigned NewOpc) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc));
  };
  auto EmitDef = [&](unsigned NewOpc, Register Dst) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc), Dst);
  };

  // Ptr is still a virtual register at custom-inserter time, so
  // TRI.getSubReg (physreg-only) cannot be used; extract the halves
  // with COPY-with-subreg-index instead.
  Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  EmitDef(TargetOpcode::COPY, PtrLo).addReg(Ptr, (RegState)0, llvm::sub_lo);
  EmitDef(TargetOpcode::COPY, PtrHi).addReg(Ptr, (RegState)0, llvm::sub_hi);

  int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  Emit(W65816::STAfi).addReg(PtrLo).addFrameIndex(FILo).addImm(0);
  Emit(W65816::STAfi).addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

  // STA_DP's tablegen def has no implicit A Use, so without an
  // explicit marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP pairs
  // the fast regalloc collapses two A-loads into one (the first's
  // value is overwritten before STA_DP can store it). Add an implicit
  // Use of A on each STA_DP to encode the dependency, matching the
  // LDAptr32 / LDAptr32S inserters.
  // ptr_lo -> $E0..$E1 (no offset add)
  EmitDef(W65816::LDAfi, W65816::A).addFrameIndex(FILo).addImm(0);
  Emit(W65816::STA_DP).addImm(0xE0).addReg(W65816::A, RegState::Implicit);
  // ptr_hi -> $E2..$E3 (no carry propagation needed)
  EmitDef(W65816::LDAfi, W65816::A).addFrameIndex(FIHi).addImm(0);
  Emit(W65816::STA_DP).addImm(0xE2).addReg(W65816::A, RegState::Implicit);

  if (IsLoad) {
    Register Dst = MI.getOperand(0).getReg();
    Emit(W65816::LDY_Imm16).addImm(Off);
    Emit(W65816::LDA_DPIndLongY).addImm(0xE0);
    EmitDef(TargetOpcode::COPY, Dst).addReg(W65816::A);
  } else {
    Register Val = MI.getOperand(0).getReg();
    EmitDef(TargetOpcode::COPY, W65816::A).addReg(Val);
    Emit(W65816::LDY_Imm16).addImm(Off);
    // Byte stores bracket the STA with SEP/REP #$20 so only one byte
    // is written.
    if (IsByteStore)
      Emit(W65816::SEP).addImm(0x20);
    Emit(W65816::STA_DPIndLongY).addImm(0xE0);
    if (IsByteStore)
      Emit(W65816::REP).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptrOff:
case W65816::STAptrOff:
case W65816::STBptrOff: {
  // 16-bit pointer deref with a constant displacement. The
  // displacement is added to the pointer in A (CLC; ADC #off) BEFORE
  // the result is staged at $E0, and the access then goes through
  // [$E0],Y with Y=0.
  //
  // The displacement is deliberately not placed in Y: [dp],Y on the
  // W65816 adds Y to the full 24-bit pointer, so a negative
  // displacement such as 0xFFFE (= -2 signed) would carry into bank 1.
  // Doing the add in A keeps it 16-bit wide and leaves the bank byte
  // alone.
  //
  // DBR-independent; see the LDAptr/STAptr/STBptr case.
  //
  // Operands: 0 = loaded value / value to store, 1 = pointer,
  // 2 = immediate displacement.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned Opc = MI.getOpcode();
  const bool IsLoad = Opc == W65816::LDAptrOff;
  const bool IsByteStore = Opc == W65816::STBptrOff;
  Register Base = MI.getOperand(1).getReg();
  int64_t Disp = MI.getOperand(2).getImm();

  // Every new instruction goes immediately in front of MI.
  auto Emit = [&](unsigned NewOpc) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc));
  };
  auto EmitDef = [&](unsigned NewOpc, Register Dst) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc), Dst);
  };

  // Round-trip the pointer through a fresh 2-byte stack slot so RA
  // must materialize it; the LDAptr/STAptr/STBptr case carries the
  // full rationale.
  int Slot = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  Emit(W65816::STAfi).addReg(Base).addFrameIndex(Slot).addImm(0);
  EmitDef(W65816::LDAfi, W65816::A).addFrameIndex(Slot).addImm(0);

  // A = ptr + off, then stage the sum as the low 16 address bits.
  Emit(W65816::CLC);
  Emit(W65816::ADC_Imm16).addImm(Disp);
  Emit(W65816::STA_DP).addImm(0xE0);

  // Bank byte at $E2: copied from $BE (crt0-initialised) on the Loader
  // compat path, otherwise zeroed.
  if (LoaderBankDeref) {
    Emit(W65816::LDA_DP).addImm(0xBE);
    Emit(W65816::STA_DP).addImm(0xE2);
  } else {
    Emit(W65816::STZ_DP).addImm(0xE2);
  }

  if (IsLoad) {
    Register Dst = MI.getOperand(0).getReg();
    Emit(W65816::LDY_Imm16).addImm(0);
    Emit(W65816::LDA_DPIndLongY).addImm(0xE0);
    EmitDef(TargetOpcode::COPY, Dst).addReg(W65816::A);
  } else {
    Register Val = MI.getOperand(0).getReg();
    EmitDef(TargetOpcode::COPY, W65816::A).addReg(Val);
    Emit(W65816::LDY_Imm16).addImm(0);
    // Byte stores run the STA between SEP/REP #$20.
    if (IsByteStore)
      Emit(W65816::SEP).addImm(0x20);
    Emit(W65816::STA_DPIndLongY).addImm(0xE0);
    if (IsByteStore)
      Emit(W65816::REP).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptr:
case W65816::LDAptrBank0:
case W65816::STAptr:
case W65816::STBptr: {
  // 16-bit pointer load/store via [dp],Y indirect-long (opcodes
  // 0xB7 / 0x97):
  //     STA $E0        ; pointer low/hi at $E0..$E1
  //     STZ $E2        ; bank byte at $E2 = 0
  //     LDY #0
  //     LDA [$E0], Y   ; or STA [$E0], Y
  //
  // Bank handling. By default the bank byte is an explicit ZERO, which
  // makes the access DBR-independent: the runInMame stack ($00:0FFF
  // down) and BSS / heap globals ($00:xxxx) all live in bank 0, so the
  // deref reaches the right memory even after the user switches DBR
  // (`pha;plb`) for a bank-2 store.
  //
  // Trade-off under the GS/OS Loader: user data lives in the user's
  // bank, not bank 0. Library code that writes globals with `sta abs`
  // (DBR-relative) and user code that reads them through this lowering
  // (bank 0) then disagree, which is a silent miscompile. gmtime hit
  // this with its __gmtimeBuf static. Workaround for affected library
  // code: launder the buffer pointer through inline asm (see gmtime in
  // runtime/src/timeExt.c) so clang doesn't IPSCCP-fold it; the writes
  // then also go via [dp],Y and match the reads. With LoaderBankDeref
  // set, the bank byte is instead taken from DP $BE.
  //
  // LDAptrBank0 always zeroes the bank byte regardless of
  // LoaderBankDeref. It is used by va_arg under the Loader, where the
  // pointer is a stack pointer (bank 0 always on W65816) but $BE holds
  // our code bank.
  //
  // Const-int pointers (`*(volatile uint16 *)0x5000 = v`) do NOT come
  // through this pseudo; TableGen patterns route them to STAlong /
  // STA8long / STAabs by type. See InstrInfo.td.
  //
  // $E0..$E2 is libcall-scratch DP. Using it is safe because the
  // expansion is a leaf (no calls between SEP and STA) and any later
  // libcall reinitialises its own scratch.
  //
  // Why [dp],Y rather than abs-long-X (`STA $0,X`)? abs-long-X is ~3
  // bytes shorter but needs the pointer in X. In high-pressure
  // functions such as the recursive expression parser X is often live,
  // and requiring it free for every deref triggered "ran out of
  // registers". [dp],Y needs only A and Y, leaving X for spill-bridge
  // use.
  //
  // STBptr (truncating i8 store) wraps the STA in SEP/REP so M=8
  // across the store and only one byte is written.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned Opc = MI.getOpcode();
  const bool ForceBank0 = Opc == W65816::LDAptrBank0;
  const bool IsLoad = Opc == W65816::LDAptr || ForceBank0;
  const bool IsByteStore = Opc == W65816::STBptr;
  Register Base = MI.getOperand(1).getReg();

  // Every new instruction goes immediately in front of MI.
  auto Emit = [&](unsigned NewOpc) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc));
  };
  auto EmitDef = [&](unsigned NewOpc, Register Dst) {
    return BuildMI(*BB, MI.getIterator(), DL, TII.get(NewOpc), Dst);
  };

  // Why the pointer is first spilled to a fresh stack slot: a direct
  // `COPY $a = ptr_vreg ; STA $E0` lets RA elide the COPY when
  // ptr_vreg is already in A. In a loop body where several Acc16 PHIs
  // (pointer + accumulator) compete for A, PHI elimination keeps one
  // of them in A at the bottom of the block and silently drops the
  // COPY that should refresh A with the OTHER value at the top of the
  // next iteration (sumTable read its own accumulator as the pointer
  // on iteration 2+). STAfi forces RA to materialize ptr_vreg into the
  // slot, and LDAfi reads it back as a real machine load.
  int Slot = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  Emit(W65816::STAfi).addReg(Base).addFrameIndex(Slot).addImm(0);
  EmitDef(W65816::LDAfi, W65816::A).addFrameIndex(Slot).addImm(0);
  Emit(W65816::STA_DP).addImm(0xE0);

  // Bank byte at $E2.
  const bool BankFromBE = LoaderBankDeref && !ForceBank0;
  if (BankFromBE) {
    // Bank byte from $BE (crt0-initialised): Loader compat path.
    Emit(W65816::LDA_DP).addImm(0xBE);
    Emit(W65816::STA_DP).addImm(0xE2);
  } else {
    Emit(W65816::STZ_DP).addImm(0xE2);
  }

  if (IsLoad) {
    Register Dst = MI.getOperand(0).getReg();
    Emit(W65816::LDY_Imm16).addImm(0);
    Emit(W65816::LDA_DPIndLongY).addImm(0xE0);
    EmitDef(TargetOpcode::COPY, Dst).addReg(W65816::A);
  } else {
    Register Val = MI.getOperand(0).getReg();
    EmitDef(TargetOpcode::COPY, W65816::A).addReg(Val);
    Emit(W65816::LDY_Imm16).addImm(0);
    if (IsByteStore)
      Emit(W65816::SEP).addImm(0x20);
    Emit(W65816::STA_DPIndLongY).addImm(0xE0);
    if (IsByteStore)
      Emit(W65816::REP).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::SELECT_CC8:
case W65816::SELECT_CC16: {
const W65816Subtarget &STI = BB->getParent()->getSubtarget<W65816Subtarget>();
const W65816InstrInfo &TII = *STI.getInstrInfo();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
MachineBasicBlock *thisMBB = BB;
MachineBasicBlock *copy0MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, copy0MBB);
MF->insert(It, sinkMBB);
// Move the rest of thisMBB after MI to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
unsigned CC = MI.getOperand(3).getImm();
// Helper: if `OpReg` is defined by a single-use, side-effect-free,
// constant-source LDA in thisMBB, MOVE that LDA into `DstMBB` (at
// its start). Returns true on success.
auto tryHoistConstInit = [&](Register OpReg,
MachineBasicBlock *DstMBB) -> bool {
if (!OpReg.isVirtual()) return false;
if (!MRI.hasOneNonDBGUse(OpReg)) return false;
MachineInstr *Def = MRI.getUniqueVRegDef(OpReg);
if (!Def || Def->getParent() != thisMBB) return false;
if (Def->getOpcode() != W65816::LDAi16imm &&
Def->getOpcode() != W65816::LDAi8imm)
return false;
if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm())
return false;
Def->removeFromParent();
DstMBB->insert(DstMBB->begin(), Def);
return true;
};
Register TValReg = MI.getOperand(1).getReg();
Register FValReg = MI.getOperand(2).getReg();
auto IsConstLda = [&](Register R) {
if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R)) return false;
MachineInstr *D = MRI.getUniqueVRegDef(R);
return D && D->getParent() == thisMBB &&
(D->getOpcode() == W65816::LDAi16imm ||
D->getOpcode() == W65816::LDAi8imm) &&
D->getNumOperands() >= 2 && D->getOperand(1).isImm();
};
bool BothConst = (CC < W65816CC::COND_GT_MB) &&
IsConstLda(TValReg) && IsConstLda(FValReg);
if (BothConst) {
// 4-block diamond: thisMBB has only the test (CMP) and Bxx; the
// tval and fval LDAs each live in their own destination block,
// which is reached only via the branch — so neither LDA's flag
// side-effect can corrupt the CMP→Bxx test window. This is the
// proper fix for the "LDA between CMP and Bxx" bug catalogued in
// project_known_issue_lda_flags.md (replacing the earlier 3-block
// workaround that only hoisted fval).
//
// thisMBB: ...; CMP; Bxx tvalMBB
// copy0MBB: LDA #fval; BRA sinkMBB (FALSE path)
// tvalMBB: LDA #tval (TRUE path; falls to sink)
// sinkMBB: PHI [tval from tvalMBB, fval from copy0MBB]
MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(sinkMBB->getIterator(), tvalMBB);
BB->addSuccessor(copy0MBB);
BB->addSuccessor(tvalMBB);
copy0MBB->addSuccessor(sinkMBB);
tvalMBB->addSuccessor(sinkMBB);
unsigned BrOp = getBranchOpcodeForCC(CC);
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB);
BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB);
tryHoistConstInit(TValReg, tvalMBB);
tryHoistConstInit(FValReg, copy0MBB);
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
MI.getOperand(0).getReg())
.addReg(TValReg).addMBB(tvalMBB)
.addReg(FValReg).addMBB(copy0MBB);
} else {
// 3-block diamond: keep the existing layout and (where possible)
// hoist fval into copy0MBB. Used when one or both operands are
// computed values (not constants), or when the multi-branch CC
// requires two Bxx in thisMBB.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
if (CC < W65816CC::COND_GT_MB) {
unsigned BrOp = getBranchOpcodeForCC(CC);
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB);
} else {
MultiBranch MB = getMultiBranch(CC);
MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB;
MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB;
BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1);
BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2);
}
copy0MBB->addSuccessor(sinkMBB);
tryHoistConstInit(FValReg, copy0MBB);
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
MI.getOperand(0).getReg())
.addReg(TValReg).addMBB(thisMBB)
.addReg(FValReg).addMBB(copy0MBB);
}
MI.eraseFromParent();
return sinkMBB;
}
}
}