65816-llvm-mos/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
Scott Duensing 0210b06a5e Checkpoint
2026-05-06 17:42:52 -05:00

2748 lines
127 KiB
C++

//===-- W65816ISelLowering.cpp - W65816 DAG Lowering Implementation -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// DAG lowering for the W65816 target: legal types and per-operation
// legalization actions, condition-code normalization, and custom lowering
// for compares, branches, selects and the split-i32 (Wide32) forms.
//
//===----------------------------------------------------------------------===//
#include "W65816ISelLowering.h"
#include "W65816InstrInfo.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816SelectionDAGInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-lower"
// Hidden command-line switch: -w65816-loader-bank-deref (default: off).
//
// Loader-compat workaround. When enabled, the LDAptr/STAptr/STBptr
// inserters take the bank byte for a pointer dereference from DP $BE
// (which crt0 initialises via PHK, i.e. the current PBR) rather than
// forcing the bank to 0 with STZ $E2. Pointer derefs then land in the
// user's bank, the same place DBR-relative absolute stores go, so data
// that library functions such as gmtime write into static buffers through
// DBR-relative paths is visible to caller-side pointer-deref reads.
//
// The price is 2 extra bytes / 4 cycles per pointer deref (LDA dp + STA dp
// instead of STZ dp), which is why it stays off by default: size-sensitive
// builds (toolbox) must remain under the $C000 IO-window ceiling.
static cl::opt<bool> LoaderBankDeref(
    "w65816-loader-bank-deref", cl::Hidden, cl::init(false),
    cl::desc("LDAptr/STAptr inserters read bank from DP $BE "
             "(set by crt0 to PHK) instead of STZ $E2. "
             "Required for GS/OS Loader compatibility; "
             "default off for size-sensitive builds."));
// Constructor. Registers the value types that live in registers (i8, i16,
// i32) and then declares, operation by operation, how the legalizer should
// treat each ISD node: Legal, Expand, LibCall or Custom. It finishes by
// disabling jump tables and opting into target DAG combines.
//
// Statement order matters: a later setOperationAction for the same
// (opcode, type) pair overrides an earlier one.
W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
const W65816Subtarget &STI)
: TargetLowering(TM, STI) {
// Register classes for the scalar types. The register allocator sees
// A, X and Y as both 8-bit and 16-bit; a later REP/SEP pass is responsible
// for ensuring the dynamic mode matches the selected class. i32 values use
// the Wide32 class, which the helpers in this file address through its
// sub_lo / sub_hi halves.
addRegisterClass(MVT::i8, &W65816::Acc8RegClass);
addRegisterClass(MVT::i16, &W65816::Acc16RegClass);
addRegisterClass(MVT::i32, &W65816::Wide32RegClass);
computeRegisterProperties(STI.getRegisterInfo());
setStackPointerRegisterToSaveRestore(W65816::SP);
// Comparison results are exactly 0 or 1. The i32 SETNE lowering in this
// file relies on that when it inverts a boolean with XOR 1.
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent);
// GlobalAddress and ExternalSymbol: lower to W65816ISD::Wrapper so a
// tablegen pattern can fold them into instruction operands.
setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
// FrameIndex i32 has its own DAG-to-DAG path in W65816ISelDAGToDAG.cpp.
// BR_CC is custom-lowered to a CMP + W65816ISD::BR_CC chain so we can
// emit the right BEQ/BNE/BCS/BCC mnemonic per condition.
setOperationAction(ISD::BR_CC, MVT::i16, Custom);
setOperationAction(ISD::BR_CC, MVT::i8, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC
// pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter
// expands into a Bxx + diamond CFG + PHI. SETCC funnels through the
// same path with TVal=1 / FVal=0. SELECT (no condition operand) is
// expanded to SELECT_CC by the legalizer using SETNE against zero.
setOperationAction(ISD::SETCC, MVT::i16, Custom);
setOperationAction(ISD::SETCC, MVT::i8, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
setOperationAction(ISD::SELECT, MVT::i16, Expand);
setOperationAction(ISD::SELECT, MVT::i8, Expand);
// 65816 has no inline sign-extend instruction; i8 -> i16 sign extension
// is custom-lowered in LowerSignExtend (documented there as a branchless
// mask / xor / subtract sequence).
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);
// We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare
// LDA for the anyext case). No native sextload; mark it Expand so
// LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`,
// which then flows through LowerSignExtend's branchless sequence
// (AND #$00FF; EOR #$0080; SEC; SBC #$0080).
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
// Only register i32 ext-load / trunc-store and Custom actions when
// i32 is actually a legal type (ptr32 mode active). Otherwise the
// Custom-action calls intercept i16/i8 ops, and LowerTruncate's
// SDValue()-on-non-i32 bail breaks the i16->i8 trunc pattern (same
// root cause as the earlier LOAD-Custom-breaks-LDAptr issue).
// NOTE(review): i32 is given a register class unconditionally above, so
// isTypeLegal(MVT::i32) looks like it is always true at this point --
// confirm whether the ptr16-only configuration is still meant to be
// reachable, or whether the addRegisterClass call should be gated.
bool ptr32Active = isTypeLegal(MVT::i32);
if (ptr32Active) {
for (MVT MemVT : {MVT::i8, MVT::i16}) {
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand);
setTruncStoreAction(MVT::i32, MemVT, Expand);
}
}
// Vararg support: VASTART writes the address of the first vararg slot
// to the va_list pointer. VACOPY/VAEND use the default expansions.
// This makes <stdarg.h>-style functions (e.g. printf-likes) compile
// cleanly.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
// Custom VAARG so we DON'T align the va_list pointer. The default
// expansion rounds up to the type's preferred alignment (S16 = 2),
// but caller-pushed args land at PHA's resulting odd S+1 address.
// Aligning would skip the low byte and read garbage.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// C++ exceptions (SJLJ model) — clang lowers exception machinery into
// these intrinsics via SjLjEHPrepare. We don't have native handling
// for them on this target. EH_SJLJ_SETJMP and EH_SJLJ_LONGJMP are marked
// Expand so LegalizeDAG falls back to its no-op stubs (setjmp returns 0,
// longjmp is a no-op). EH_SJLJ_SETUP_DISPATCH is marked Custom.
// NOTE(review): earlier wording described setup_dispatch as an Expand
// chain pass-through; confirm the custom lowering (outside this section)
// still behaves that way.
// The actual EH semantics are provided at runtime by libcxxabi
// (__cxa_throw etc.) calling _Unwind_SjLj_RaiseException, which in turn
// longjmps via the function context the prologue prepared. See
// runtime/src/libcxxabiSjlj.c for the runtime side.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Expand);
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i16, Expand);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Expand);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
// SJLJ exception lowering uses FRAMEADDR(0) to read the current frame
// pointer. We don't reserve a frame pointer in general; return the
// entry-SP-equivalent value (current SP read via TSC) — good enough
// for SJLJ's purpose of identifying the call frame.
setOperationAction(ISD::FRAMEADDR, MVT::i16, Custom);
setOperationAction(ISD::FRAMEADDR, MVT::i32, Custom);
// stacksave / stackrestore — used by SjLjEHPrepare to save/restore SP
// around invoke calls. The default Expand calls CopyFromReg/$SP, which
// fails because SP has no register class. Custom-lower instead:
// stacksave becomes a constant 0 (the value is stored into the function
// context but never used for restoration on our target) and
// stackrestore becomes a chain pass-through (no-op). Our SJLJ runtime
// doesn't use these values; setjmp/longjmp manage SP directly via
// TSC/TCS, and the jmp_buf already captures SP.
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
// FRAMEADDR is set Custom above for SJLJ; don't set it Expand here
// (the second setOperationAction would override the first).
setOperationAction(ISD::RETURNADDR, MVT::i16, Expand);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i16, Expand);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i16, Expand);
// The 65816 has no hardware multiplier or divider. Multiply by a
// power-of-two constant is auto-rewritten to shifts by the DAG
// combiner; arbitrary multiply / divide / mod go through libcalls
// (`__mulhi3` for i16 multiply etc.). The libcall expander emits a
// standard CALL node which flows through LowerCall, so multi-arg
// call lowering must be working first (it is, see task #26).
setOperationAction(ISD::MULHU, MVT::i16, Expand);
setOperationAction(ISD::MULHS, MVT::i16, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::MUL, MVT::i16, LibCall);
// CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the
// legalizer rewrite them into a sequence of basic ops. Without
// this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
// or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
// Select" at isel.
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
// i16 divide / remainder: libcalls; the combined DIVREM forms are
// expanded into the separate operations.
setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::SREM, MVT::i16, LibCall);
setOperationAction(ISD::UREM, MVT::i16, LibCall);
setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
// Variable-amount and large-constant shifts. We have inline
// patterns for shift-by-1..4; everything else goes through
// __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom
// lets us return SDValue() for the fast cases and route everything
// else through the libcall lowering helper.
setOperationAction(ISD::SHL, MVT::i16, Custom);
setOperationAction(ISD::SRL, MVT::i16, Custom);
setOperationAction(ISD::SRA, MVT::i16, Custom);
// i8 shifts go through Custom too — LowerShift detects the i8 result
// and routes through trunc(i16-shift(zext_or_sext(lhs), amount)).
// Avoids needing a parallel set of qi3 libcalls.
setOperationAction(ISD::SHL, MVT::i8, Custom);
setOperationAction(ISD::SRL, MVT::i8, Custom);
setOperationAction(ISD::SRA, MVT::i8, Custom);
// LOAD / STORE Custom-lowering is intentionally NOT wired here for the
// i16/i8 types. Setting LOAD Custom and returning SDValue() from
// LowerLoad short-circuits the i16-result LDAptr/STAptr selection paths
// (the Custom->empty->Legal fall-through doesn't re-enter pattern
// matching).
// NOTE(review): this comment used to call LowerLoad / LowerStore dead
// code, but the ptr32Active block below does register LOAD/STORE as
// Custom for MVT::i32 -- confirm which statement is current.
// ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying
// the carry/borrow flag between the two halves of a multi-precision add or
// sub. Setting them Legal triggers the type legalizer's carry-chain split
// for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions)
// instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions).
// The matching tablegen pseudos add Defs/Uses on the P register, which
// tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically.
setOperationAction(ISD::ADDC, MVT::i16, Legal);
setOperationAction(ISD::ADDE, MVT::i16, Legal);
setOperationAction(ISD::SUBC, MVT::i16, Legal);
setOperationAction(ISD::SUBE, MVT::i16, Legal);
// i32 (long). Type legalization splits i32 into two i16 halves; with
// ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain.
// AND/OR/XOR split cleanly into per-half ops with no carry to track.
// Multiply/divide/shift go through libcall stubs whose
// implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS
// / SRA_PARTS are the SDNodes the type legalizer emits when splitting
// a variable-amount shift; without an action they get "Cannot select".
// LibCall on the parent node routes the whole shift through one
// __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and
// simpler than implementing a 32-bit shift in 65816 assembly inline.
for (MVT VT : {MVT::i32}) {
setOperationAction(ISD::MUL, VT, LibCall);
setOperationAction(ISD::SDIV, VT, LibCall);
setOperationAction(ISD::UDIV, VT, LibCall);
setOperationAction(ISD::SREM, VT, LibCall);
setOperationAction(ISD::UREM, VT, LibCall);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
// i32 shifts route through a libcall via the
// preferredShiftLegalizationStrategy override (see header), so no
// SHL/SHL_PARTS action is set in this loop. Note that the ptr32Active
// block further down does mark SHL/SRL/SRA i32 as Custom.
}
// i64 shifts — route to libcall before the type legalizer tries
// to split via the next-legal-type (which becomes i32 in ptr32 mode
// and triggers a SDAG combine loop on `i64 >> K` patterns). By
// marking SHL/SRL/SRA i64 LibCall here, the operation legalizer
// picks up the libcall path even though i64 itself is illegal.
for (MVT VT : {MVT::i64}) {
setOperationAction(ISD::SHL, VT, LibCall);
setOperationAction(ISD::SRL, VT, LibCall);
setOperationAction(ISD::SRA, VT, LibCall);
}
// ptr32 mode: with i32 legal, the operations below are custom-lowered
// for i32 (arithmetic / logic / shifts, extensions, truncation, memory
// access, compares, branches, selects and constants).
if (ptr32Active) {
for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR})
setOperationAction(Op, MVT::i32, Custom);
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom);
// SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16:
// the combiner emits this for `(int32_t)((int8_t)x)` and for
// `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No
// tablegen pattern covers the i32 form; Custom-lower to per-half
// ops. IMPORTANT: LegalizeDAG looks up the action for
// SIGN_EXTEND_INREG using the INNER VT (the operand value type),
// not the result VT. See LegalizeDAG.cpp:
// Action = TLI.getOperationAction(Op, InnerType);
// That is why the three entries below are keyed on i1 / i8 / i16.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::Constant, MVT::i32, Custom);
}
// Disable jump tables. Generating them costs us BRIND (indirect
// branch via 16-bit pointer load), which we don't have. A long
// if-else chain compiles fine without them. Setting the threshold
// to UINT_MAX makes LLVM never form a jump table.
setMinimumJumpTableEntries(UINT_MAX);
// Variable-length arrays / dynamic stack allocation. Lowered to
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
// allocated region. Limitation: this shifts SP, so any FrameIndex
// accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
// (we have no frame pointer). Suitable for the common pattern
// "alloca; initialise; pass; return"; complex VLA use mixed with
// local-variable access across the alloca will miscompile. A real
// FP (DP slot or X-as-FP) would lift this restriction.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
// Target DAG combines (see W65816TargetLowering::PerformDAGCombine).
//
// History kept for reference: a LOAD opt-in for the address-select
// reverse combine was commented out while bisecting the "pickif" hang,
// and the SHL combine is disabled while debugging the ptr32 i64-phi hang:
// setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
// setTargetDAGCombine(ISD::SHL);
// NOTE(review): ISD::LOAD is nevertheless enabled by the active call
// below, so any LOAD-driven combine in PerformDAGCombine will run --
// confirm the address-select combine is guarded as intended.
//
// Combine STORE / LOAD with const-int i32 pointer to a form that
// survives LowerI32Constant (which would otherwise split the ptr
// into a Wide32 reg pair and lose the const-addr fast path).
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::LOAD);
}
// Translate an integer ISD condition code into the W65816 condition code
// for a single branch following a CMP. Only six codes have a direct
// encoding:
//
//   SETEQ  -> COND_EQ      SETNE  -> COND_NE
//   SETUGE -> COND_HS      SETULT -> COND_LO   (carry-based, BCS/BCC)
//   SETLT  -> COND_MI      SETGE  -> COND_PL   (sign-based, BMI/BPL)
//
// Any other code (SETGT, SETLE, SETUGT, SETULE, ...) yields COND_INVALID;
// this function never touches operands, so it is up to the caller to swap
// them, adjust a constant, or fall back to another strategy.
//
// The sign-based mappings read only the sign of the CMP result and are
// therefore correct for a signed compare only when the subtraction cannot
// overflow. normalizeCC converts i16 signed compares to unsigned ones
// before calling in here, which avoids that limitation.
static W65816CC::CondCode mapCC(ISD::CondCode CC) {
  if (CC == ISD::SETEQ)
    return W65816CC::COND_EQ;
  if (CC == ISD::SETNE)
    return W65816CC::COND_NE;
  if (CC == ISD::SETUGE)
    return W65816CC::COND_HS;
  if (CC == ISD::SETULT)
    return W65816CC::COND_LO;
  if (CC == ISD::SETLT)
    return W65816CC::COND_MI;
  if (CC == ISD::SETGE)
    return W65816CC::COND_PL;
  return W65816CC::COND_INVALID;
}
// Widen an i8 compare to i16 so the i16 CMP path can handle it. Does
// nothing unless LHS is i8 (RHS is assumed to have the same type).
//
// The extension kind follows the condition code: signed orderings
// (SETLT / SETLE / SETGT / SETGE) sign-extend, everything else (unsigned
// orderings and eq/ne) zero-extends. Picking the wrong one would invert
// the answer: -1i8 sign-extends to 0xFFFF, which is < 1 signed, whereas
// zero-extending it gives 0x00FF, which is > 1 unsigned.
//
// LHS and RHS are replaced in place; CC is only inspected.
static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
                         SelectionDAG &DAG, const SDLoc &DL) {
  if (LHS.getValueType() != MVT::i8)
    return;

  const bool SignedOrdering = CC == ISD::SETLT || CC == ISD::SETLE ||
                              CC == ISD::SETGT || CC == ISD::SETGE;
  const unsigned ExtOpcode =
      SignedOrdering ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  LHS = DAG.getNode(ExtOpcode, DL, MVT::i16, LHS);
  RHS = DAG.getNode(ExtOpcode, DL, MVT::i16, RHS);
}
// Bring a (LHS, RHS, CC) compare into a shape that one CMP plus one Bxx
// can implement. LHS, RHS and CC are all rewritten in place; the return
// value is the W65816 condition code to branch on, a multi-branch *_MB
// pseudo code when no single-branch form is available, or COND_INVALID
// when nothing applies.
//
// The steps, in order:
//   1. Widen i8 operands to i16 (promoteI8Cmp).
//   2. Move a lone constant from the left to the right.
//   3. Turn i16 signed orderings into unsigned ones by flipping the sign
//      bit of both operands.
//   4. For a constant RHS, turn <= / > into < / >= against constant + 1.
//   5. If the code still has no direct mapping, try swapping operands.
//   6. Otherwise fall back to a multi-branch pseudo condition code.
static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
                                      ISD::CondCode &CC, SelectionDAG &DAG,
                                      const SDLoc &DL) {
  promoteI8Cmp(LHS, RHS, CC, DAG, DL);

  // CMP wants the comparand (constant or memory) on the right. If an
  // earlier DAG pass left the constant on the left, exchange the operands
  // and mirror the condition.
  const bool ConstOnLeftOnly =
      isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS);
  if (ConstOnLeftOnly) {
    std::swap(LHS, RHS);
    CC = ISD::getSetCCSwappedOperands(CC);
  }

  // Signed compare via "EOR with the sign bit, then compare unsigned":
  //   a < b (signed)  iff  (a ^ 0x8000) < (b ^ 0x8000) (unsigned)
  // Flipping the sign bit maps signed ordering onto unsigned ordering of
  // the same bits. BMI/BPL alone would read the sign of (a - b) without
  // the V-flag overflow correction and give wrong results when the
  // subtraction overflows (e.g. INT16_MIN < 1 came out false because
  // -32768 - 1 = +32767 has N=0). BCC/BCS depend only on the carry out of
  // CMP and do not have that problem.
  //
  // Cost: one EOR per operand (3 bytes each in M=16), comparable to a
  // V-aware multi-branch sequence, and because it happens at SDAG time
  // later combines can fold the EOR into constants or already-EOR'd values.
  ISD::CondCode UnsignedTwin = CC;
  switch (CC) {
  case ISD::SETLT:
    UnsignedTwin = ISD::SETULT;
    break;
  case ISD::SETLE:
    UnsignedTwin = ISD::SETULE;
    break;
  case ISD::SETGT:
    UnsignedTwin = ISD::SETUGT;
    break;
  case ISD::SETGE:
    UnsignedTwin = ISD::SETUGE;
    break;
  default:
    break;
  }
  const bool IsSignedOrdering = UnsignedTwin != CC;
  if (IsSignedOrdering && LHS.getValueType() == MVT::i16) {
    const EVT OpVT = LHS.getValueType();
    SDValue SignBit = DAG.getConstant(0x8000, DL, OpVT);
    LHS = DAG.getNode(ISD::XOR, DL, OpVT, LHS, SignBit);
    RHS = DAG.getNode(ISD::XOR, DL, OpVT, RHS, SignBit);
    CC = UnsignedTwin;
  }

  // With a constant on the right, "<= C" becomes "< C+1" and "> C" becomes
  // ">= C+1", provided C+1 does not wrap. This keeps the variable on the
  // left and lets BCS / BCC be used directly. The unsigned cases cover
  // both original SETULE/SETUGT and the ones produced by the sign-bit
  // transform above; the signed cases are reachable only if that
  // transform was skipped (a non-i16 operand type).
  if (auto *Imm = dyn_cast<ConstantSDNode>(RHS)) {
    const int64_t SImm = Imm->getSExtValue();
    const uint64_t UImm = static_cast<uint64_t>(SImm) & 0xFFFF;
    const bool UnsignedRoom = UImm < 0xffff;
    const bool SignedRoom = SImm < 0x7fff;
    switch (CC) {
    case ISD::SETULE:
      if (UnsignedRoom) {
        RHS = DAG.getConstant(UImm + 1, DL, RHS.getValueType());
        CC = ISD::SETULT;
      }
      break;
    case ISD::SETUGT:
      if (UnsignedRoom) {
        RHS = DAG.getConstant(UImm + 1, DL, RHS.getValueType());
        CC = ISD::SETUGE;
      }
      break;
    case ISD::SETLE:
      if (SignedRoom) {
        RHS = DAG.getConstant(SImm + 1, DL, RHS.getValueType());
        CC = ISD::SETLT;
      }
      break;
    case ISD::SETGT:
      if (SignedRoom) {
        RHS = DAG.getConstant(SImm + 1, DL, RHS.getValueType());
        CC = ISD::SETGE;
      }
      break;
    default:
      break;
    }
  }

  W65816CC::CondCode Result = mapCC(CC);

  // No direct mapping: swapping the operands is preferred because it still
  // yields a single Bxx. The swap is refused only when it would move a
  // load from the right to a left side that is not itself a load (we
  // can't pattern-match cmp(load, reg) without spilling A).
  if (Result == W65816CC::COND_INVALID) {
    const bool LoadOnRight = isa<LoadSDNode>(RHS.getNode());
    const bool LoadOnLeft = isa<LoadSDNode>(LHS.getNode());
    if (!LoadOnRight || LoadOnLeft) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
      Result = mapCC(CC);
    }
  }
  if (Result != W65816CC::COND_INVALID)
    return Result;

  // Last resort for GT/LE/UGT/ULE with no usable swap: a multi-branch
  // pseudo condition that the SELECT_CC16 custom inserter expands into a
  // 3-BB diamond. Valid for SELECT_CC only; LowerBR_CC re-routes these
  // through SETCC + BR_CC NE.
  switch (CC) {
  case ISD::SETGT:
    return W65816CC::COND_GT_MB;
  case ISD::SETLE:
    return W65816CC::COND_LE_MB;
  case ISD::SETUGT:
    return W65816CC::COND_HI_MB;
  case ISD::SETULE:
    return W65816CC::COND_LS_MB;
  default:
    break;
  }
  return W65816CC::COND_INVALID;
}
// Wide32 build/extract helpers, used by the i32 lowering code
// (LowerLoad/Store/Extend/Truncate/I32Bin/BR_CC) to assemble or take apart
// i32 SDValues along the sub_lo / sub_hi halves of the Wide32 register
// class.
//
// buildWide32 glues two i16 halves into one i32 value by emitting
//   REG_SEQUENCE(Wide32RegClassID, Lo, sub_lo, Hi, sub_hi)
// and returning its first (only) result.
static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL,
                           SDValue Lo, SDValue Hi) {
  SDValue ClassId =
      DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32);
  SDValue LoIdx = DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32);
  SDValue HiIdx = DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32);

  SDValue Ops[] = {ClassId, Lo, LoIdx, Hi, HiIdx};
  SDNode *Seq =
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32, Ops);
  return SDValue(Seq, 0);
}
// Peek through a REG_SEQUENCE such as the one buildWide32 creates
// (operands: RC, Lo, sub_lo, Hi, sub_hi). If X is such a machine node and
// one of its (value, sub-register index) pairs carries index WantSub, the
// paired value operand is returned directly; otherwise an empty SDValue.
//
// Using the operand directly avoids emitting a TargetExtractSubreg that
// would re-enter the SDAG combiner and rebuild the i32 constant / pair,
// looping forever (observed as OOM in the combiner on `*t = 0`).
static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) {
  const bool IsRegSequence =
      X.getNode() && X.isMachineOpcode() &&
      X.getMachineOpcode() == TargetOpcode::REG_SEQUENCE;
  if (!IsRegSequence)
    return SDValue();

  // Operand 0 is the register class; (value, sub-index) pairs follow.
  const unsigned NumOps = X.getNumOperands();
  for (unsigned ValueIdx = 1; ValueIdx + 1 < NumOps; ValueIdx += 2) {
    auto *SubTag = dyn_cast<ConstantSDNode>(X.getOperand(ValueIdx + 1));
    if (SubTag && SubTag->getZExtValue() == WantSub)
      return X.getOperand(ValueIdx);
  }
  return SDValue();
}
// Return the low 16 bits of the i32 value X as an i16 SDValue.
//
// Three cases, tried in order:
//   * X is a constant: produce the i16 constant directly. Calling
//     getTargetExtractSubreg on a Constant SDNode yields a malformed
//     MachineSDNode (constants carry no sub-registers) and triggers SDAG
//     combine loops downstream.
//   * X is a REG_SEQUENCE with a sub_lo entry: reuse that operand.
//   * Otherwise: emit a sub_lo TargetExtractSubreg.
static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  if (auto *Imm = dyn_cast<ConstantSDNode>(X)) {
    const uint64_t LowBits = Imm->getZExtValue() & 0xFFFFu;
    return DAG.getConstant(LowBits, DL, MVT::i16);
  }

  SDValue FromRegSeq = lookThroughRegSeq(X, llvm::sub_lo);
  if (FromRegSeq)
    return FromRegSeq;

  return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X);
}
// Return the high 16 bits (bits 16..31) of the i32 value X as an i16
// SDValue. Constants are split numerically, a REG_SEQUENCE with a sub_hi
// entry is looked through, and anything else gets a sub_hi
// TargetExtractSubreg.
static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  if (auto *Imm = dyn_cast<ConstantSDNode>(X)) {
    const uint64_t HighBits = (Imm->getZExtValue() >> 16) & 0xFFFFu;
    return DAG.getConstant(HighBits, DL, MVT::i16);
  }

  SDValue FromRegSeq = lookThroughRegSeq(X, llvm::sub_hi);
  if (FromRegSeq)
    return FromRegSeq;

  return DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X);
}
// Custom lowering for ISD::BR_CC (operands: chain, condcode, lhs, rhs,
// destination block).
//
//   * i32 operands: compute an i16 0/1 flag from per-half compares and
//     emit a generic BR_CC that branches when the flag is non-zero.
//   * Otherwise: normalise the compare (normalizeCC) and emit
//     W65816ISD::CMP glued to W65816ISD::BR_CC. Conditions that only
//     exist as multi-branch pseudo codes are first materialised through
//     SETCC and then branched on as "flag != 0".
//
// Aborts with report_fatal_error for conditions that cannot be lowered.
SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  const EVT CmpVT = LHS.getValueType();

  // Emit "branch to Dest if Flag != 0" as a generic BR_CC, which is then
  // lowered again through the ordinary i8/i16 path.
  auto BranchIfNonZero = [&](SDValue Flag, EVT FlagVT) {
    SDValue Zero = DAG.getConstant(0, DL, FlagVT);
    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
                       DAG.getCondCode(ISD::SETNE), Flag, Zero, Dest);
  };

  // i32 BR_CC: synthesize an i16 boolean from per-half compares, then
  // branch on (bool != 0). This sidesteps the legalizer's generic Expand,
  // which re-enters our SETCC/BR_CC custom paths in an infinite loop.
  if (CmpVT == MVT::i32) {
    SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
    SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
    SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
    SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);

    const bool IsEquality = CC == ISD::SETEQ || CC == ISD::SETNE;
    if (IsEquality) {
      // Equal iff both halves are equal; SETNE inverts the 0/1 flag.
      SDValue LoSame = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, ISD::SETEQ);
      SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
      SDValue Flag = DAG.getNode(ISD::AND, DL, MVT::i16, LoSame, HiSame);
      if (CC == ISD::SETNE) {
        SDValue One = DAG.getConstant(1, DL, MVT::i16);
        Flag = DAG.getNode(ISD::XOR, DL, MVT::i16, Flag, One);
      }
      return BranchIfNonZero(Flag, MVT::i16);
    }

    // Ordered compare:
    //   a CC b = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b)
    // HiStrict is the strict form of CC (LE -> LT, GE -> GT) so that the
    // tie-breaker term handles equality of the high halves. LoCC is
    // always the unsigned form of CC: only the high half carries the sign.
    ISD::CondCode HiStrict;
    ISD::CondCode LoUnsigned;
    switch (CC) {
    case ISD::SETLT:
      HiStrict = ISD::SETLT;
      LoUnsigned = ISD::SETULT;
      break;
    case ISD::SETLE:
      HiStrict = ISD::SETLT;
      LoUnsigned = ISD::SETULE;
      break;
    case ISD::SETGT:
      HiStrict = ISD::SETGT;
      LoUnsigned = ISD::SETUGT;
      break;
    case ISD::SETGE:
      HiStrict = ISD::SETGT;
      LoUnsigned = ISD::SETUGE;
      break;
    case ISD::SETULT:
      HiStrict = ISD::SETULT;
      LoUnsigned = ISD::SETULT;
      break;
    case ISD::SETULE:
      HiStrict = ISD::SETULT;
      LoUnsigned = ISD::SETULE;
      break;
    case ISD::SETUGT:
      HiStrict = ISD::SETUGT;
      LoUnsigned = ISD::SETUGT;
      break;
    case ISD::SETUGE:
      HiStrict = ISD::SETUGT;
      LoUnsigned = ISD::SETUGE;
      break;
    default:
      report_fatal_error("W65816: unexpected i32 BR_CC condition");
    }

    SDValue HiDecides = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, HiStrict);
    SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
    SDValue LoDecides = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, LoUnsigned);
    SDValue TieBreak = DAG.getNode(ISD::AND, DL, MVT::i16, HiSame, LoDecides);
    SDValue Flag = DAG.getNode(ISD::OR, DL, MVT::i16, HiDecides, TieBreak);
    return BranchIfNonZero(Flag, MVT::i16);
  }

  const W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: branch condition not yet implemented");

  // Multi-branch condition codes are only supported by the SELECT_CC16
  // inserter. For a branch, materialise the boolean via SETCC and branch
  // on it being non-zero: one extra LDA, but it always works. CmpVT is
  // the operand type observed before normalizeCC ran.
  if (TCC >= W65816CC::COND_GT_MB) {
    SDValue Flag = DAG.getNode(ISD::SETCC, DL, CmpVT, LHS, RHS,
                               DAG.getCondCode(CC));
    return BranchIfNonZero(Flag, CmpVT);
  }

  // Single-branch form: CMP produces the flags (as glue) and the target
  // BR_CC consumes them together with the condition-code immediate.
  SDValue Flags = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondImm = DAG.getTargetConstant(TCC, DL, MVT::i8);
  return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CondImm,
                     Flags);
}
// Custom lowering for ISD::SETCC (operands: lhs, rhs, condcode).
//
//   * i8 / i16 operands: rewrite as select_cc lhs, rhs, 1, 0, cc. That
//     SELECT_CC re-enters LowerOperation and is lowered through the
//     diamond-CFG path. With ZeroOrOne boolean contents, users see a
//     clean 0/1 value.
//   * i32 operands: build the boolean from per-half i16 compares, in the
//     node's own (narrow) result type.
SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const EVT ResVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Narrow operands: hand the work to SELECT_CC with constant arms.
  if (LHS.getValueType() != MVT::i32) {
    SDValue One = DAG.getConstant(1, DL, ResVT);
    SDValue Zero = DAG.getConstant(0, DL, ResVT);
    return DAG.getNode(ISD::SELECT_CC, DL, ResVT, LHS, RHS, One, Zero,
                       DAG.getCondCode(CC));
  }

  // i32 operands: compare the halves separately.
  auto CompareHalf = [&](SDValue A, SDValue B, ISD::CondCode Cond) {
    return DAG.getSetCC(DL, ResVT, A, B, Cond);
  };

  SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
  SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
  SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
  SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);

  const bool IsEquality = CC == ISD::SETEQ || CC == ISD::SETNE;
  if (IsEquality) {
    // Equal iff both halves match; SETNE flips the 0/1 result.
    SDValue LoSame = CompareHalf(LhsLo, RhsLo, ISD::SETEQ);
    SDValue HiSame = CompareHalf(LhsHi, RhsHi, ISD::SETEQ);
    SDValue Result = DAG.getNode(ISD::AND, DL, ResVT, LoSame, HiSame);
    if (CC == ISD::SETNE) {
      SDValue One = DAG.getConstant(1, DL, ResVT);
      Result = DAG.getNode(ISD::XOR, DL, ResVT, Result, One);
    }
    return Result;
  }

  // Ordered compare: the high halves decide with the strict form of CC;
  // if they are equal, the low halves decide with the unsigned form of CC.
  ISD::CondCode HiStrict;
  ISD::CondCode LoUnsigned;
  switch (CC) {
  case ISD::SETLT:
    HiStrict = ISD::SETLT;
    LoUnsigned = ISD::SETULT;
    break;
  case ISD::SETLE:
    HiStrict = ISD::SETLT;
    LoUnsigned = ISD::SETULE;
    break;
  case ISD::SETGT:
    HiStrict = ISD::SETGT;
    LoUnsigned = ISD::SETUGT;
    break;
  case ISD::SETGE:
    HiStrict = ISD::SETGT;
    LoUnsigned = ISD::SETUGE;
    break;
  case ISD::SETULT:
    HiStrict = ISD::SETULT;
    LoUnsigned = ISD::SETULT;
    break;
  case ISD::SETULE:
    HiStrict = ISD::SETULT;
    LoUnsigned = ISD::SETULE;
    break;
  case ISD::SETUGT:
    HiStrict = ISD::SETUGT;
    LoUnsigned = ISD::SETUGT;
    break;
  case ISD::SETUGE:
    HiStrict = ISD::SETUGT;
    LoUnsigned = ISD::SETUGE;
    break;
  default:
    report_fatal_error("W65816: unexpected i32 SETCC condition");
  }

  SDValue HiDecides = CompareHalf(LhsHi, RhsHi, HiStrict);
  SDValue HiSame = CompareHalf(LhsHi, RhsHi, ISD::SETEQ);
  SDValue LoDecides = CompareHalf(LhsLo, RhsLo, LoUnsigned);
  SDValue TieBreak = DAG.getNode(ISD::AND, DL, ResVT, HiSame, LoDecides);
  return DAG.getNode(ISD::OR, DL, ResVT, HiDecides, TieBreak);
}
// Custom lowering for ISD::SELECT_CC (operands: lhs, rhs, true value,
// false value, condcode).
//
//   * i32 compare operands: first reduce the compare to an i16 boolean
//     (a SETCC, handled by LowerSETCC's i32 path), then select on
//     "bool != 0". This avoids creating an i32 W65816::CMP, for which no
//     pattern exists.
//   * i32 result with a narrow compare: select each 16-bit half
//     separately under the same condition and rejoin them.
//   * Otherwise: normalise the compare and emit W65816ISD::CMP glued to
//     W65816ISD::SELECT_CC.
//
// Aborts with report_fatal_error when the condition cannot be lowered.
SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  const bool ResultIsI32 = Op.getValueType() == MVT::i32;

  // Split both arms into 16-bit halves, select each half under the given
  // compare, and rebuild the i32 result.
  auto SelectByHalves = [&](SDValue CmpL, SDValue CmpR, ISD::CondCode Cond) {
    SDValue TrueLo = extractWide32Lo(DAG, DL, TVal);
    SDValue TrueHi = extractWide32Hi(DAG, DL, TVal);
    SDValue FalseLo = extractWide32Lo(DAG, DL, FVal);
    SDValue FalseHi = extractWide32Hi(DAG, DL, FVal);
    SDValue PickLo = DAG.getSelectCC(DL, CmpL, CmpR, TrueLo, FalseLo, Cond);
    SDValue PickHi = DAG.getSelectCC(DL, CmpL, CmpR, TrueHi, FalseHi, Cond);
    return buildWide32(DAG, DL, PickLo, PickHi);
  };

  if (LHS.getValueType() == MVT::i32) {
    // Materialise the i16 boolean, then select on it being non-zero.
    SDValue Flag = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC);
    SDValue Zero = DAG.getConstant(0, DL, MVT::i16);
    if (ResultIsI32)
      return SelectByHalves(Flag, Zero, ISD::SETNE);
    return DAG.getSelectCC(DL, Flag, Zero, TVal, FVal, ISD::SETNE);
  }

  // Narrow compare, i32 result: per-half selects sharing the condition.
  if (ResultIsI32)
    return SelectByHalves(LHS, RHS, CC);

  const W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: select_cc condition not yet implemented");

  SDValue Flags = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondImm = DAG.getTargetConstant(TCC, DL, MVT::i8);
  // The node's SDTypeProfile declares a single result (the selected
  // value). Earlier code passed a two-VT list (value + Glue), which was
  // silently wrong and trips an SDNode-validity assertion in assertions
  // builds.
  SDValue Ops[] = {TVal, FVal, CondImm, Flags};
  return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), Ops);
}
// SIGN_EXTEND i8 -> i16 without branches, using the identity
//   sext(x) = (zext(x) ^ 0x80) - 0x80
// Worked values:
//   x=0x00 -> 0x80 - 0x80 = 0x0000      x=0x7F -> 0xFF - 0x80 = 0x007F
//   x=0x80 -> 0x00 - 0x80 = 0xFF80      x=0xFF -> 0x7F - 0x80 = 0xFFFF
// Expected machine sequence: AND #$00FF; EOR #$0080; SEC; SBC #$0080
// (10 bytes, no branches, no temp slots) — much cheaper than the
// SELECT_CC diamond, which produced ~14 instructions plus stack spills.
// Any other type combination is declined by returning SDValue().
SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  const bool IsByteToWord =
      Src.getValueType() == MVT::i8 && Op.getValueType() == MVT::i16;
  if (!IsByteToWord)
    return SDValue();

  SDLoc DL(Op);
  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Src);
  SDValue SignBit = DAG.getConstant(0x0080, DL, MVT::i16);
  SDValue Flipped = DAG.getNode(ISD::XOR, DL, MVT::i16, Widened, SignBit);
  return DAG.getNode(ISD::SUB, DL, MVT::i16, Flipped, SignBit);
}
// Custom lowering for ISD::LOAD (the ptr32 foundation hook).
//
//  * Constant address with an i8/i16 result: returns SDValue() so the
//    tablegen `(load (iPTR imm))` -> LDA8long pattern (bank-explicit)
//    can fire. LowerStore has the matching short-circuit.
//  * i32 result: two i16 loads at offsets 0 and 2, joined into a Wide32.
//    The address may be i16 (stack slot, global) or i32 (ptr32 deref);
//    the ADD used for the second address is lowered by LowerI32Bin when
//    it is i32.
//  * i16 address (ptr16 mode, the current default): returns SDValue() so
//    the existing LDAptr selection patterns match.
//  * i32 address: the load is wrapped in W65816ISD::LD_PTR via
//    getMemIntrinsicNode so the [dp],Y inserter can take the bank byte
//    from sub_hi instead of forcing 0.
//
// Byte loads keep using the 16-bit load + mask idiom that LDAptr uses:
// the node is still a 16-bit load, just bank-explicit.
SDValue W65816TargetLowering::LowerLoad(SDValue Op,
                                        SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  SDValue InChain = Load->getChain();
  SDValue Addr = Load->getBasePtr();
  const EVT ResVT = Op.getValueType();
  const bool NarrowResult = ResVT == MVT::i8 || ResVT == MVT::i16;

  // Constant address: leave the node for the tablegen pattern.
  if (NarrowResult && isa<ConstantSDNode>(Addr))
    return SDValue();

  // i32 result: split into low/high i16 loads sharing the incoming chain.
  if (ResVT == MVT::i32) {
    const EVT AddrVT = Addr.getValueType();
    SDValue Step = DAG.getConstant(2, DL, AddrVT);
    SDValue AddrHi = DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Step);
    SDValue HalfLo =
        DAG.getLoad(MVT::i16, DL, InChain, Addr, Load->getPointerInfo(),
                    Load->getAlign(), Load->getMemOperand()->getFlags());
    SDValue HalfHi = DAG.getLoad(MVT::i16, DL, InChain, AddrHi,
                                 Load->getPointerInfo().getWithOffset(2),
                                 Load->getAlign(),
                                 Load->getMemOperand()->getFlags());
    // Both loads hang off the same input chain; join their output chains.
    SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                   HalfLo.getValue(1), HalfHi.getValue(1));
    SDValue Wide = buildWide32(DAG, DL, HalfLo, HalfHi);
    return DAG.getMergeValues({Wide, OutChain}, DL);
  }

  // ptr16 mode: default selection handles i16 addresses.
  if (Addr.getValueType() != MVT::i32)
    return SDValue();

  // ptr32 mode: bank-explicit 16-bit load through LD_PTR.
  const EVT MemVT = Load->getMemoryVT();
  SDVTList ResultTys = DAG.getVTList(MVT::i16, MVT::Other);
  SDValue LdOps[] = {InChain, Addr};
  SDValue PtrLoad = DAG.getMemIntrinsicNode(
      W65816ISD::LD_PTR, DL, ResultTys, LdOps, MVT::i16,
      Load->getMemOperand());
  SDValue Result = PtrLoad;

  // Byte-sized memory access: fix up the high byte according to the
  // extension kind. Any other kind (e.g. anyext) leaves it untouched.
  if (MemVT == MVT::i8) {
    switch (Load->getExtensionType()) {
    case ISD::ZEXTLOAD:
      Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
                           DAG.getConstant(0xFF, DL, MVT::i16));
      break;
    case ISD::SEXTLOAD:
      Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Result,
                           DAG.getValueType(MVT::i8));
      break;
    default:
      break;
    }
  }

  // The consumer asked for i8: narrow the 16-bit value back down.
  if (ResVT == MVT::i8)
    Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Result);
  return DAG.getMergeValues({Result, PtrLoad.getValue(1)}, DL);
}
// ZERO_EXTEND / SIGN_EXTEND / ANY_EXTEND from i8 or i16 to i32. The i16
// payload becomes the low half of a Wide32; the high half is 0, a
// sign-fill, or undef respectively. Non-i32 results are declined with
// SDValue().
SDValue W65816TargetLowering::LowerExtend(SDValue Op,
                                          SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDLoc DL(Op);
  const unsigned Opc = Op.getOpcode();
  SDValue LowHalf = Op.getOperand(0);
  // An i8 source is first widened to i16 with the very same opcode.
  if (LowHalf.getValueType() == MVT::i8)
    LowHalf = DAG.getNode(Opc, DL, MVT::i16, LowHalf);

  SDValue HighHalf;
  switch (Opc) {
  case ISD::ZERO_EXTEND:
    HighHalf = DAG.getConstant(0, DL, MVT::i16);
    break;
  case ISD::SIGN_EXTEND:
    // Sign-fill with SRA #15, which has a dedicated pattern (SRA15A,
    // 4 insns). Both operands stay i16, dodging the combiner's
    // shift-amount promotion when ptr32 makes pointer-typed shift
    // amounts i32.
    HighHalf = DAG.getNode(ISD::SRA, DL, MVT::i16, LowHalf,
                           DAG.getConstant(15, DL, MVT::i16));
    break;
  default:
    HighHalf = DAG.getUNDEF(MVT::i16);
    break;
  }
  return buildWide32(DAG, DL, LowHalf, HighHalf);
}
// SIGN_EXTEND_INREG with inner type i1/i8/i16: sign-extend the low N bits
// of the input so they fill the whole result.
//
// i32 result: the legalizer leaves this op alone when i32 is legal, but no
// tablegen pattern matches the i32 form, so without this Custom hook isel
// aborts with "Cannot select: sign_extend_inreg ... ValueType:i1" on
// shapes like `-(crc & 1ul)` in CRC32 loops. Strategy: apply the inner-VT
// sext_inreg to the i16 low half, then sign-fill the high half with
// SRA #15 of that low result.
//
// i16 result: reproduce the two existing tablegen patterns at the SDAG
// level, because a Custom hook that returns SDValue() falls through to
// the default Expand (SHL/SRA chain), not to tablegen pattern matching:
//   (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1)
//   (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80
//
// Returns SDValue() — i.e. requests the generic Expand — for any
// combination this hook has no dedicated lowering for, rather than
// handing back an unextended or ill-typed value.
SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT ResVT = Op.getValueType();

  if (ResVT == MVT::i16) {
    if (InnerVT == MVT::i1) {
      // 0 - (x & 1): yields 0x0000 or 0xFFFF.
      SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                DAG.getConstant(1, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16,
                         DAG.getConstant(0, DL, MVT::i16), Bit);
    }
    if (InnerVT == MVT::i8) {
      // ((x & 0xFF) ^ 0x80) - 0x80.
      SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                   DAG.getConstant(0xFF, DL, MVT::i16));
      SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked,
                                  DAG.getConstant(0x80, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored,
                         DAG.getConstant(0x80, DL, MVT::i16));
    }
    // Inner i16 on an i16 value is a no-op.
    if (InnerVT == MVT::i16)
      return X;
    // Any other inner width: returning X here would skip the extension
    // entirely. Let the generic SHL/SRA expansion handle it instead.
    return SDValue();
  }

  if (ResVT != MVT::i32)
    return SDValue();

  // An inner type wider than the low half cannot be expressed as an i16
  // sext_inreg; defer to the generic expansion.
  if (InnerVT.bitsGT(MVT::i16))
    return SDValue();

  // i32 result: X is a Wide32. Project its low half, apply the inner-VT
  // extension there (nothing to do when the inner type is already i16),
  // then sign-fill the high half.
  SDValue Lo = extractWide32Lo(DAG, DL, X);
  if (InnerVT != MVT::i16) {
    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo,
                     DAG.getValueType(InnerVT));
  }
  // Same SRA #15 idiom LowerExtend uses for SIGN_EXTEND i16 -> i32.
  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
                           DAG.getConstant(15, DL, MVT::i16));
  return buildWide32(DAG, DL, Lo, Hi);
}
// TRUNCATE from i32. An i16 result is just the low half of the Wide32;
// an i8 result takes that low half and truncates it once more (the
// i16 -> i8 trunc pattern is a COPY_TO_REGCLASS at MC level, so the
// low-half extract is the only real work). Other source or result types
// are declined with SDValue().
SDValue W65816TargetLowering::LowerTruncate(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDValue Wide = Op.getOperand(0);
  if (Wide.getValueType() != MVT::i32)
    return SDValue();

  const EVT DstVT = Op.getValueType();
  const bool ToWord = DstVT == MVT::i16;
  const bool ToByte = DstVT == MVT::i8;
  if (!ToWord && !ToByte)
    return SDValue();

  SDLoc DL(Op);
  SDValue Low16 = extractWide32Lo(DAG, DL, Wide);
  if (ToWord)
    return Low16;
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Low16);
}
// i32 Constant: emit the low and high 16 bits as separate i16 constants
// and join them into a Wide32. Non-i32 constants are declined.
SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
                                               SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDLoc DL(Op);
  const uint64_t Bits = cast<ConstantSDNode>(Op)->getZExtValue();
  const uint64_t LowBits = Bits & 0xFFFFu;
  const uint64_t HighBits = (Bits >> 16) & 0xFFFFu;
  SDValue LowHalf = DAG.getConstant(LowBits, DL, MVT::i16);
  SDValue HighHalf = DAG.getConstant(HighBits, DL, MVT::i16);
  return buildWide32(DAG, DL, LowHalf, HighHalf);
}
// i32 ADD / SUB / AND / OR / XOR expressed as two i16 operations.
// AND/OR/XOR act on the halves independently. ADD uses an ADDC/ADDE pair
// and SUB a SUBC/SUBE pair, with the carry/borrow passed from the low
// half to the high half as Glue. Non-i32 nodes and other opcodes are
// declined with SDValue().
SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
                                          SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDLoc DL(Op);
  SDValue Left = Op.getOperand(0);
  SDValue Right = Op.getOperand(1);
  SDValue LeftLo = extractWide32Lo(DAG, DL, Left);
  SDValue LeftHi = extractWide32Hi(DAG, DL, Left);
  SDValue RightLo = extractWide32Lo(DAG, DL, Right);
  SDValue RightHi = extractWide32Hi(DAG, DL, Right);

  const unsigned Opc = Op.getOpcode();
  SDValue ResLo, ResHi;
  switch (Opc) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    // Bitwise ops: same opcode on each half, no cross-half dependency.
    ResLo = DAG.getNode(Opc, DL, MVT::i16, LeftLo, RightLo);
    ResHi = DAG.getNode(Opc, DL, MVT::i16, LeftHi, RightHi);
    break;
  case ISD::ADD:
  case ISD::SUB: {
    const bool IsAdd = Opc == ISD::ADD;
    const unsigned LowOpc = IsAdd ? ISD::ADDC : ISD::SUBC;
    const unsigned HighOpc = IsAdd ? ISD::ADDE : ISD::SUBE;
    SDVTList ValueAndFlag = DAG.getVTList(MVT::i16, MVT::Glue);
    SDValue LowNode = DAG.getNode(LowOpc, DL, ValueAndFlag, LeftLo, RightLo);
    ResLo = LowNode.getValue(0);
    // Result #1 of the low op is the carry/borrow consumed by the high op.
    SDValue Flag = LowNode.getValue(1);
    ResHi = DAG.getNode(HighOpc, DL, ValueAndFlag, LeftHi, RightHi, Flag)
                .getValue(0);
    break;
  }
  default:
    return SDValue();
  }
  return buildWide32(DAG, DL, ResLo, ResHi);
}
// Custom lowering for ISD::STORE; the companion of LowerLoad.
//
//  * Constant address (`*(volatile uint8*)0xC035 = v`): returns SDValue()
//    so the tablegen `(store Acc8, (iPTR imm))` -> STA8long pattern fires.
//    Otherwise the i32-pointer code below would promote the constant into
//    a Wide32 register pair and route through STBptr32 ([dp],Y), which is
//    16 B / 30 cyc and (worse) bank-tracks DBR.
//  * i32 value: stored as two i16 halves (see the comments inside).
//  * i16 address (ptr16 mode): returns SDValue() so the existing
//    STAptr / STBptr patterns match.
//  * i32 address: emits W65816ISD::STB_PTR for a byte-sized MemoryVT and
//    W65816ISD::ST_PTR otherwise.
SDValue W65816TargetLowering::LowerStore(SDValue Op,
                                         SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue InChain = Store->getChain();
  SDValue Data = Store->getValue();
  SDValue Addr = Store->getBasePtr();
  const EVT MemVT = Store->getMemoryVT();

  if (isa<ConstantSDNode>(Addr))
    return SDValue();

  // i32 value: split into halves. On the ptr32 path the per-half stores
  // MUST be the target-specific W65816ISD::ST_PTR and not plain
  // ISD::STORE; otherwise the combiner's MergeConsecutiveStores fuses them
  // back into one i32 store that re-enters this hook — infinite loop, OOM
  // in the combiner. With i16 pointers (legacy ptr16) plain ISD::STORE is
  // used; the store-merger does not trip there because splitting the
  // address with an i16 ISD::ADD does not fan out into ptr-pair
  // operations.
  if (Data.getValueType() == MVT::i32) {
    SDValue DataLo = extractWide32Lo(DAG, DL, Data);
    SDValue DataHi = extractWide32Hi(DAG, DL, Data);
    const EVT AddrVT = Addr.getValueType();
    const bool WideAddr = AddrVT == MVT::i32;

    // Two serially chained plain i16 stores: low half first, then the
    // high half with the pointer info advanced by 2.
    auto EmitPlainPair = [&](SDValue AddrLo, SDValue AddrHi) {
      SDValue First = DAG.getStore(InChain, DL, DataLo, AddrLo,
                                   Store->getPointerInfo(),
                                   Store->getAlign(),
                                   Store->getMemOperand()->getFlags());
      return DAG.getStore(First, DL, DataHi, AddrHi,
                          Store->getPointerInfo().getWithOffset(2),
                          Store->getAlign(),
                          Store->getMemOperand()->getFlags());
    };

    // ptr32 constant-address fast path: `*(uint32_t*)0x5000 = v` should
    // become two STAabs (DBR-relative, 5 cyc each) rather than two
    // [dp],Y stores via ST_PTR. The address shows up here as a
    // REG_SEQUENCE whose sub_lo/sub_hi inputs are constants with a zero
    // high half. The per-half addresses are TargetConstant:i32 (not
    // Constant) so LowerI32Constant does not re-fire and recreate the
    // REG_SEQUENCE; the STAabs timm pattern matches them.
    if (WideAddr && Addr.getNode()->isMachineOpcode() &&
        Addr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
      SDValue HalfLo, HalfHi;
      // Operands after the first come in (value, subreg-index) pairs.
      for (unsigned Idx = 1; Idx + 1 < Addr.getNumOperands(); Idx += 2) {
        auto *SubIdx = dyn_cast<ConstantSDNode>(Addr.getOperand(Idx + 1));
        if (!SubIdx)
          continue;
        if (SubIdx->getZExtValue() == llvm::sub_lo)
          HalfLo = Addr.getOperand(Idx);
        else if (SubIdx->getZExtValue() == llvm::sub_hi)
          HalfHi = Addr.getOperand(Idx);
      }
      auto *HiConst = dyn_cast_or_null<ConstantSDNode>(HalfHi);
      auto *LoConst = dyn_cast_or_null<ConstantSDNode>(HalfLo);
      if (LoConst && HiConst && HiConst->getZExtValue() == 0) {
        const uint64_t Base = LoConst->getZExtValue() & 0xFFFF;
        SDValue AbsLo = DAG.getTargetConstant(Base, DL, MVT::i32);
        SDValue AbsHi =
            DAG.getTargetConstant((Base + 2) & 0xFFFF, DL, MVT::i32);
        return EmitPlainPair(AbsLo, AbsHi);
      }
    }

    SDValue Step = DAG.getConstant(2, DL, AddrVT);
    SDValue AddrPlus2 = DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Step);

    if (WideAddr) {
      // ptr32: two sequentially chained ST_PTR target nodes. The combiner
      // cannot merge target-opaque MemIntrinsic stores.
      SDVTList ChainOnly = DAG.getVTList(MVT::Other);
      SDValue LowOps[] = {InChain, DataLo, Addr};
      SDValue LowStore = DAG.getMemIntrinsicNode(
          W65816ISD::ST_PTR, DL, ChainOnly, LowOps, MVT::i16,
          Store->getMemOperand());
      SDValue HighOps[] = {LowStore, DataHi, AddrPlus2};
      // Memory operand for the high half: offset 2, size 2.
      MachineMemOperand *HighMMO =
          DAG.getMachineFunction().getMachineMemOperand(
              Store->getMemOperand(), 2, 2);
      return DAG.getMemIntrinsicNode(W65816ISD::ST_PTR, DL, ChainOnly,
                                     HighOps, MVT::i16, HighMMO);
    }

    // ptr16: two regular i16 stores, serially chained so the store-merger
    // sees them as a 4-byte sequence (which it will likely leave alone,
    // since the resulting i32 store has no legal target pattern in ptr16
    // mode anyway).
    return EmitPlainPair(Addr, AddrPlus2);
  }

  // ptr16 mode: existing patterns handle i16 addresses.
  if (Addr.getValueType() != MVT::i32)
    return SDValue();

  // The pseudos take an Acc16 (i16) value; the SEP/REP wrap around
  // STBptr32 narrows in memory. An i8 value is widened with ANY_EXTEND —
  // the inserter writes only one byte, so the high half is don't-care.
  if (Data.getValueType() == MVT::i8)
    Data = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Data);

  unsigned StoreOpc = unsigned(W65816ISD::ST_PTR);
  if (MemVT == MVT::i8)
    StoreOpc = unsigned(W65816ISD::STB_PTR);
  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
  SDValue StOps[] = {InChain, Data, Addr};
  return DAG.getMemIntrinsicNode(StoreOpc, DL, ChainOnly, StOps, MemVT,
                                 Store->getMemOperand());
}
// VAARG: read the current `ap` from the va_list, load the argument it
// points at, then write back `ap + sizeof(VT)`. Returns {value, chain}.
//
// Unlike the default expansion, `ap` is deliberately NOT aligned to the
// type's preferred alignment: caller-pushed varargs land at byte-granular
// addresses (PHA from an odd S leaves the low byte at S+1 which is even,
// but the prologue's TSC sequence can produce odd S, etc.), so aligning
// would skip the pushed value's low byte.
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue ListAddr = Op.getOperand(1);
  const EVT ArgVT = Op.getValueType();

  // `ap` is a `char *` on this target — i16 under ptr16, i32 under ptr32.
  // It is loaded and stored at the va_list pointer's own width so the
  // high half is never truncated away (doing it at i16 under ptr32 was a
  // silent miscompile of every variadic call).
  const EVT ApVT = ListAddr.getValueType();
  SDValue CurAp =
      DAG.getLoad(ApVT, DL, Chain, ListAddr, MachinePointerInfo());
  Chain = CurAp.getValue(1);

  // Data deref. Under ptr16 an i16 argument goes through VAARG_LOAD
  // (bank-0-explicit `[dp],Y`). Otherwise a regular load is used; under
  // ptr32 `ap` is already a Wide32 pointer with hi=0 (the va_list points
  // into the bank-0 call-frame stack-args region), which routes to
  // LDAptr32 / STBptr32.
  const bool UseVaargLoad = ArgVT == MVT::i16 && ApVT == MVT::i16;
  SDValue Arg;
  if (UseVaargLoad) {
    SDVTList Tys = DAG.getVTList(MVT::i16, MVT::Other);
    Arg = DAG.getNode(W65816ISD::VAARG_LOAD, DL, Tys, Chain, CurAp);
  } else {
    Arg = DAG.getLoad(ArgVT, DL, Chain, CurAp, MachinePointerInfo());
  }
  Chain = Arg.getValue(1);

  // Advance by the argument size, rounded up to whole bytes.
  unsigned Bytes = (ArgVT.getSizeInBits() + 7) / 8;
  SDValue Advance = DAG.getConstant(Bytes, DL, ApVT);
  SDValue NextAp = DAG.getNode(ISD::ADD, DL, ApVT, CurAp, Advance);
  Chain = DAG.getStore(Chain, DL, NextAp, ListAddr, MachinePointerInfo());
  return DAG.getMergeValues({Arg, Chain}, DL);
}
// VASTART: write the address of the first vararg slot into the va_list.
// The slot is the frame index recorded in W65816MachineFunctionInfo
// (VarArgsFrameIndex, set up by LowerFormalArguments). va_list is just a
// single `next` pointer here — minimum implementation.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
                            const W65816TargetLowering &TLI) {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue ListAddr = Op.getOperand(1);
  const Value *ListIRValue =
      cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // The frame index is created at pointer width (i16 under ptr16, i32
  // under ptr32) so the store writes the whole pointer. Under ptr32 the
  // i32 FI lowers via the i32 pointer-store path; the high half is
  // implicitly 0 (stack is bank 0) and is stored alongside the low half.
  const EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  auto *Info = DAG.getMachineFunction().getInfo<W65816MachineFunctionInfo>();
  SDValue FirstSlot = DAG.getFrameIndex(Info->getVarArgsFrameIndex(), PtrVT);

  return DAG.getStore(Chain, DL, FirstSlot, ListAddr,
                      MachinePointerInfo(ListIRValue));
}
// Central dispatcher for every operation marked Custom on this target.
// Each case either forwards to a dedicated Lower* helper or handles the
// node inline. Returning SDValue() tells the legalizer this hook declined
// the node. An opcode with no case here is a programming error and hits
// llvm_unreachable.
SDValue W65816TargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: {
    // Custom-lower SELECT for i32 result: split into per-half
    // selects. Without this, the legalizer's default (rewriting
    // SELECT to SELECT_CC against zero) produces SELECT_CC i32 of
    // a different shape that re-enters Custom and creates a cycle.
    if (Op.getValueType() != MVT::i32)
      return SDValue();
    SDValue Cond = Op.getOperand(0);
    SDValue TVal = Op.getOperand(1);
    SDValue FVal = Op.getOperand(2);
    SDLoc DL(Op);
    SDValue TLo = extractWide32Lo(DAG, DL, TVal);
    SDValue THi = extractWide32Hi(DAG, DL, TVal);
    SDValue FLo = extractWide32Lo(DAG, DL, FVal);
    SDValue FHi = extractWide32Hi(DAG, DL, FVal);
    SDValue Lo = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TLo, FLo);
    SDValue Hi = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, THi, FHi);
    return buildWide32(DAG, DL, Lo, Hi);
  }
  case ISD::SIGN_EXTEND:
    // i32 results build a Wide32; the i8 -> i16 case has its own helper.
    if (Op.getValueType() == MVT::i32) return LowerExtend(Op, DAG);
    return LowerSignExtend(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
  case ISD::VAARG: return LowerVAARG(Op, DAG);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: return LowerShift(Op, DAG);
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return LowerExtend(Op, DAG);
  case ISD::SIGN_EXTEND_INREG: return LowerSignExtendInReg(Op, DAG);
  case ISD::TRUNCATE: return LowerTruncate(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: return LowerI32Bin(Op, DAG);
  case ISD::LOAD: return LowerLoad(Op, DAG);
  case ISD::STORE: return LowerStore(Op, DAG);
  case ISD::Constant: return LowerI32Constant(Op, DAG);
  // SJLJ EH: setup_dispatch is a no-op on this target — the dispatcher
  // logic lives entirely in the SJLJ runtime (_Unwind_SjLj_Resume +
  // longjmp into the function context's jmp_buf). The isel layer
  // doesn't need to emit any code; just thread the chain through.
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return Op.getOperand(0);
  case ISD::DYNAMIC_STACKALLOC: return LowerDynamicStackalloc(Op, DAG);
  case ISD::STACKSAVE: {
    // Return Constant 0 — SJLJ stores this into the function context
    // but our setjmp/longjmp manage SP directly, so the value is dead.
    // The zero is built at the node's own width (i16, or a Wide32 of two
    // zero halves otherwise).
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue Chain = Op.getOperand(0);
    SDValue Result;
    if (VT == MVT::i16)
      Result = DAG.getConstant(0, DL, MVT::i16);
    else
      Result = buildWide32(DAG, DL,
                           DAG.getConstant(0, DL, MVT::i16),
                           DAG.getConstant(0, DL, MVT::i16));
    return DAG.getMergeValues({Result, Chain}, DL);
  }
  case ISD::STACKRESTORE:
    // No-op — pass the chain through.
    return Op.getOperand(0);
  case ISD::FRAMEADDR: {
    // FRAMEADDR(N): SJLJ uses N=0 (current frame). We don't reserve a
    // frame pointer and SP isn't trivially CopyFromReg-able (no
    // register class). Return Constant 0 — SJLJ uses it as an opaque
    // per-frame identifier; the SJLJ runtime tracks frames by jmp_buf
    // chaining (FnCtx::prev) rather than by FRAMEADDR value, so a
    // constant works for single-throw / non-nested-catch programs.
    // True multi-frame SJLJ would need a TSC-based unique value.
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    if (VT == MVT::i16)
      return DAG.getConstant(0, DL, MVT::i16);
    SDValue Lo = DAG.getConstant(0, DL, MVT::i16);
    SDValue Hi = DAG.getConstant(0, DL, MVT::i16);
    return buildWide32(DAG, DL, Lo, Hi);
  }
  default:
    // SDNode::dump() is only compiled into LLVM when assertions or
    // LLVM_ENABLE_DUMP are on; calling it unconditionally leaves an
    // undefined symbol in release builds.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Op.dump();
#endif
    llvm_unreachable("W65816: unexpected operation in LowerOperation");
  }
}
// Maps the inline-asm register constraints "a", "x", "y" and "r" (bare or
// in the braced long form, e.g. "{a}") to a physical register and
// register class. i8 operands get the 8-bit classes; every other type —
// i16 and pointer types included — gets the 16-bit classes. "r" resolves
// to the accumulator. Anything unrecognised is forwarded to the base
// TargetLowering implementation with the original constraint string.
std::pair<unsigned, const TargetRegisterClass *>
W65816TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  // Peel a surrounding "{...}" if present.
  StringRef Name = Constraint;
  const bool Braced =
      Name.size() >= 2 && Name.front() == '{' && Name.back() == '}';
  if (Braced)
    Name = Name.drop_front().drop_back();

  const bool ByteSized = VT == MVT::i8;
  const TargetRegisterClass *AccRC =
      ByteSized ? &W65816::Acc8RegClass : &W65816::Acc16RegClass;
  const TargetRegisterClass *IdxRC =
      ByteSized ? &W65816::Idx8RegClass : &W65816::Idx16RegClass;

  if (Name == "a" || Name == "r")
    return {W65816::A, AccRC};
  if (Name == "x")
    return {W65816::X, IdxRC};
  if (Name == "y")
    return {W65816::Y, IdxRC};

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
// (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain).
// Lowered to W65816ISD::ALLOCA: the entry SP is stashed in DP $F4 (handled
// by emitPrologue when MFI.hasVarSizedObjects), then
// `tsc; sec; sbc size; tcs; inc a`. The epilogue restores SP from $F4.
// The align operand is not consulted.
//
// W65816ISD::ALLOCA works on i16 values. When the node is i32 (ptr32
// mode), the size is narrowed to its low half and the returned pointer is
// widened to a Wide32 with a zero high half, so the replacement has the
// same result type as the node it replaces. The zero high half mirrors
// how STACKSAVE / FRAMEADDR build their i32 results.
//
// Limitation: any FrameIndex (local, spill slot, parameter) accessed
// *after* the alloca reads from a wrong stack-relative offset because
// PEI bakes FI offsets relative to the static-frame SP, not the
// post-alloca SP. A real frame pointer would lift this; for now we
// accept the limitation and document it. The simplest safe pattern
// is "VLA at end of function, used immediately, no further FI access";
// anything else is at-your-own-risk until FP support lands.
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);

  // ptr32: the byte count arrives as a Wide32; only the low 16 bits can
  // be subtracted from the 16-bit stack pointer.
  if (Size.getValueType() == MVT::i32)
    Size = extractWide32Lo(DAG, DL, Size);

  SDValue Alloc = DAG.getNode(W65816ISD::ALLOCA, DL,
                              DAG.getVTList(MVT::i16, MVT::Other),
                              Chain, Size);
  SDValue Ptr = Alloc.getValue(0);
  SDValue NewChain = Alloc.getValue(1);

  // ptr32: widen the i16 pointer to the node's i32 result type.
  if (Op.getValueType() == MVT::i32)
    Ptr = buildWide32(DAG, DL, Ptr, DAG.getConstant(0, DL, MVT::i16));

  return DAG.getMergeValues({Ptr, NewChain}, DL);
}
// Custom lowering for SHL / SRL / SRA.
//   * i8: promoted to i16, shifted, truncated back.
//   * i16 by a constant that has an inline tablegen pattern: the node is
//     returned unchanged so the pattern matcher picks it up.
//   * everything else (including all i32 shifts): runtime libcall.
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
  const unsigned Opc = Op.getOpcode();

  // i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
  // (preserves sign for arithmetic right shift); SHL/SRL via ZEXT
  // (logical / left shifts don't care about high bits). This routes
  // i8 shifts through the same i16 fast paths and libcalls — no
  // parallel qi3 libcall set needed. The DAG combiner would otherwise
  // narrow `(trunc (shl (zext X), K))` back to `(shl X, K)` of i8,
  // re-entering this hook in an infinite loop; the
  // `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override
  // disables that combine.
  if (Op.getValueType() == MVT::i8) {
    SDLoc DL(Op);
    SDValue X = Op.getOperand(0);
    SDValue N = Op.getOperand(1);
    unsigned Ext = (Opc == ISD::SRA) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue X16 = DAG.getNode(Ext, DL, MVT::i16, X);
    // Bring the shift amount to i16. It may be narrower (i8) or wider
    // (i32 when ptr32 makes shift amounts pointer-sized); ZERO_EXTEND is
    // only valid for the narrower case, so use getZExtOrTrunc.
    SDValue N16 = N.getValueType() == MVT::i16
                      ? N
                      : DAG.getZExtOrTrunc(N, DL, MVT::i16);
    // Special case: i8 SRA by 7 of a sign-extended value is the
    // sign-fill operation — every result bit is the input's bit 7.
    // For sext(i8 x), bit 15 == bit 7, so `(sra (sext x), 7)` yields
    // the same result as `(sra (sext x), 15)`, which we have a tight
    // 4-insn pattern for via SRA15A. Avoids the __ashrhi3 libcall
    // (~10 insns plus arg push/pop overhead) — abs8 dropped from 47
    // to 35 insns with this rewrite in place.
    if (Opc == ISD::SRA) {
      if (auto *C = dyn_cast<ConstantSDNode>(N)) {
        if (C->getZExtValue() == 7)
          N16 = DAG.getConstant(15, DL, MVT::i16);
      }
    }
    SDValue R16 = DAG.getNode(Opc, DL, MVT::i16, X16, N16);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
  }

  // i16 fast path: constant amounts that are left for inline tablegen
  // patterns — SHL/SRL by 1..15 and SRA by 1 or 15. Return Op (the
  // unchanged node) so the legalizer leaves it alone and the pattern
  // matcher catches it later. Returning SDValue() instead would fall
  // through to the generic Expand path, which generates a
  // BUILD_VECTOR-based magic-constant rewrite that we can't lower.
  // `(srl x, 15)` in particular is handled by SRL15A as
  // `ASL A; LDA #0; ROL A`, much shorter than the libcall; the
  // type-legalizer's i32-shift-by-1 expansion emits this exact node for
  // the high-half "bit-from-low" slot.
  // i16 only — i32 always routes to a libcall (no inline i32 patterns).
  if (Op.getValueType() == MVT::i16) {
    if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      const uint64_t N = C->getZExtValue();
      const bool IsLogical = Opc == ISD::SHL || Opc == ISD::SRL;
      if (IsLogical && N >= 1 && N <= 15)
        return Op;
      if (Opc == ISD::SRA && (N == 1 || N == 15))
        return Op;
    }
  }

  // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3
  // for i16, the I32 equivalents for i32).
  bool IsI32 = Op.getValueType() == MVT::i32;
  RTLIB::Libcall LC;
  switch (Opc) {
  case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break;
  case ISD::SRL: LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16; break;
  case ISD::SRA: LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16; break;
  default: llvm_unreachable("not a shift");
  }
  // makeLibCall takes care of the calling convention; arguments are
  // flagged signed only for the arithmetic right shift.
  SmallVector<SDValue, 2> Args = {Op.getOperand(0), Op.getOperand(1)};
  TargetLowering::MakeLibCallOptions Opts;
  Opts.setIsSigned(Opc == ISD::SRA);
  return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first;
}
// GlobalAddress -> W65816ISD::Wrapper(TargetGlobalAddress). The node's
// type is i16 in ptr16 mode and i32 in ptr32 mode. For i32 the wrapped
// i16 offset becomes the low half of a Wide32 and the bank (high) half is
// the constant 0.
SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto *Global = cast<GlobalAddressSDNode>(Op);
  const EVT AddrVT = Op.getValueType();

  if (AddrVT != MVT::i32) {
    SDValue Target = DAG.getTargetGlobalAddress(Global->getGlobal(), DL,
                                                AddrVT, Global->getOffset());
    return DAG.getNode(W65816ISD::Wrapper, DL, AddrVT, Target);
  }

  // i32: Wide32 of (i16 offset, i16 bank). The offset goes through
  // W65816ISD::Wrapper exactly as in ptr16 mode — IMM16 cRELOC rewrites
  // the offset under Loader. The bank half is set to 0 here, but
  // crt0Gsos's $BE-init or a future per-pointer bank relocation can be
  // threaded through. TODO: wire bank cRELOC.
  SDValue OffsetTarget = DAG.getTargetGlobalAddress(
      Global->getGlobal(), DL, MVT::i16, Global->getOffset());
  SDValue Offset = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, OffsetTarget);
  SDValue Bank = DAG.getConstant(0, DL, MVT::i16);
  return buildWide32(DAG, DL, Offset, Bank);
}
// ExternalSymbol -> W65816ISD::Wrapper(TargetExternalSymbol). For an i32
// node the wrapped i16 symbol becomes the low half of a Wide32 whose high
// half is the constant 0; otherwise the symbol is wrapped at the node's
// own type.
SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto *Sym = cast<ExternalSymbolSDNode>(Op);
  const EVT AddrVT = Op.getValueType();

  if (AddrVT != MVT::i32) {
    SDValue Target = DAG.getTargetExternalSymbol(Sym->getSymbol(), AddrVT);
    return DAG.getNode(W65816ISD::Wrapper, DL, AddrVT, Target);
  }

  SDValue OffsetTarget =
      DAG.getTargetExternalSymbol(Sym->getSymbol(), MVT::i16);
  SDValue Offset = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, OffsetTarget);
  SDValue Bank = DAG.getConstant(0, DL, MVT::i16);
  return buildWide32(DAG, DL, Offset, Bank);
}
// Lowers the formal (incoming) arguments of the current function. Exactly
// one SDValue is appended to InVals for every entry of Ins, in order, and
// the incoming Chain is returned unchanged. Only i8, i16 and i32 pieces are
// handled; any other type aborts via report_fatal_error. CallConv is not
// consulted. For vararg functions the frame index of the first slot past
// the named stack arguments is recorded in W65816MachineFunctionInfo.
SDValue W65816TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// ABI: the first argument piece arrives in A. A whole-i32 first argument,
// or the second i16 piece of a split first argument, additionally uses X.
// All remaining pieces are pushed by the caller right-to-left and read
// via stack-relative addressing. JSL pushes 3 bytes of return address
// and S points at the next free byte, so the layout viewed from the
// callee is:
// (high addr) arg N-1
// ...
// arg 1 <- (4,S)
// ret-addr-bank <- (3,S)
// ret-addr-hi <- (2,S)
// (low addr) ret-addr-lo <- (1,S)
//
// Each i8/i16 stack arg occupies 2 bytes and each i32 stack arg 4 bytes;
// the first stack-passed piece lives at (4,S).
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
// i32 first-arg ABI. Two flavors as in LowerCall:
// - Legal-i32 (Wide32 reg class registered): single i32 InputArg,
// handled by the `ArgIdx == 0 && VT == MVT::i32` branch below.
// - Split-i32 (legacy): two i16 InputArgs both with OrigArgIndex==0,
// detected here.
bool I32SplitFirstArg =
Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 &&
Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0;
// True iff the FIRST original arg is an i64. Used below to pick the
// Img16 register class for the register-passed pieces, which (per the
// original author's notes) dodges greedy's TXA-bridge-clobbers-A spill
// bug. A split-i32 first arg doesn't get the same treatment because
// the change pessimizes simple functions like
// `int add32(int a, int b) { return a+b; }` where greedy's regular A:X
// handling is fine.
// Shape 1 (detected here): Ins[0..3] are four i16 pieces that all carry
// OrigArgIndex==0. Note that this shape also satisfies I32SplitFirstArg,
// so pieces 0 and 1 take the A / X paths and pieces 2 and 3 fall through
// to the stack path.
bool I64FirstArg =
Ins.size() >= 4 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 &&
Ins[2].VT == MVT::i16 && Ins[3].VT == MVT::i16 &&
Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0 &&
Ins[2].OrigArgIndex == 0 && Ins[3].OrigArgIndex == 0;
// Shape 2: Ins[0..1] are two i32 pieces of arg0 (OrigArgIndex==0 on
// both). Per the original notes this happens with ptr32 active, when
// i64 is legalized via an i32 split rather than an i16 quad split.
if (!I64FirstArg && Ins.size() >= 2 && Ins[0].VT == MVT::i32 &&
Ins[1].VT == MVT::i32 && Ins[0].OrigArgIndex == 0 &&
Ins[1].OrigArgIndex == 0)
I64FirstArg = true;
// Position of the current piece within Ins (not the original arg index).
unsigned ArgIdx = 0;
// Running stack-relative offset of the next stack-passed piece; it grows
// upward as we walk through the stack-passed args.
unsigned StackOffset = 4; // Skip 3 ret-addr bytes; first slot at S+4.
for (const ISD::InputArg &Arg : Ins) {
MVT VT = Arg.VT;
if (VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i32)
report_fatal_error("W65816: argument type not yet supported");
if (ArgIdx == 0 && VT == MVT::i32) {
// Whole-i32 first arg: lo half live-in via $a, hi half via $x, then
// reassembled with buildWide32. The W65816LowerWide32 pre-RA pass
// walks the resulting REG_SEQUENCE and rewrites Wide32 uses into
// pairs of i16 operations — keeping AX32 out of the regalloc's
// pair-allocation path entirely.
//
// Register classes:
// - hi half: always Img16 (DP-backed). Per the original notes, the
// Idx16 (X-only) class collapses through W65816LowerWide32 to
// plain Acc16, after which regalloc treats both halves as
// competing for $a — a TXA at the top of any non-trivial function
// body destroys arg0_lo before it's spilled. Img16 forces an
// STX_DP at function entry, immune to A-reuse.
// - lo half: Img16 as well when the first original arg is an i64
// (I64FirstArg), otherwise Acc16. Without this the regalloc was
// observed to emit `TXA; STA spill_X; STA spill_A` at function
// entry — the TXA clobbers $a (arg0_0) before the A-spill saves
// it, so both spill slots end up holding arg0_1. That caused
// __adddf3(1.5,2.5) → 1.5.
const TargetRegisterClass *VRegLoRC =
I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass;
const TargetRegisterClass *VRegHiRC = &W65816::Img16RegClass;
Register VRegLo = MRI.createVirtualRegister(VRegLoRC);
Register VRegHi = MRI.createVirtualRegister(VRegHiRC);
MRI.addLiveIn(W65816::A, VRegLo);
MRI.addLiveIn(W65816::X, VRegHi);
SDValue Lo = DAG.getCopyFromReg(Chain, DL, VRegLo, MVT::i16);
SDValue Hi = DAG.getCopyFromReg(Chain, DL, VRegHi, MVT::i16);
InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
} else if (ArgIdx == 0) {
// First piece (i16 or i8) in A. An i16 piece uses Acc16, or Img16
// when it is the lowest quarter of an i64 first arg (I64FirstArg);
// an i8 piece always uses Acc8. The Img16 choice mirrors the
// ArgIdx==1 branch below: without it, the regalloc was observed to
// emit a TXA bridge for arg0_1's spill that clobbers $a (= arg0_0)
// BEFORE arg0_0 has been saved, so BOTH spill slots end up holding
// arg0_1. Observed as `__adddf3(1.5, 2.5) → 1.5`.
const TargetRegisterClass *RC =
(VT == MVT::i16)
? (I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass)
: &W65816::Acc8RegClass;
Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(W65816::A, VReg);
InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT));
} else if (ArgIdx == 1 && I32SplitFirstArg) {
// Second i16 piece of a split first arg (the hi half of an i32, or
// the second quarter of an i64): live-in via X.
// For i64-first-arg signatures use Img16 so greedy parks the value
// in an IMG slot via STX_DP, dodging the TXA-bridge-clobbers-A spill
// bug. A split-i32 first arg stays on the original Idx16 path
// because, per the original notes, rerouting it pessimizes simple
// cases (vprintf's writeULong/__udivsi3 chain crashed when it was
// also rerouted; caught by udivmod).
const TargetRegisterClass *RC = I64FirstArg ? &W65816::Img16RegClass
: &W65816::Idx16RegClass;
Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(W65816::X, VReg);
InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16));
} else if (VT == MVT::i32) {
// i32 stack arg: 4 bytes, loaded as 2 i16 halves (lo at the lower
// offset, hi 2 bytes above it) and assembled via buildWide32 into a
// Wide32 SDValue. Both loads hang off the entry Chain.
int FILo = MFI.CreateFixedObject(2, StackOffset, /*Immutable*/true);
int FIHi = MFI.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true);
StackOffset += 4;
SDValue FINLo = DAG.getFrameIndex(FILo, MVT::i16);
SDValue FINHi = DAG.getFrameIndex(FIHi, MVT::i16);
SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, FINLo,
MachinePointerInfo::getFixedStack(MF, FILo));
SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, FINHi,
MachinePointerInfo::getFixedStack(MF, FIHi));
InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
} else {
// Remaining i8/i16 pieces are loaded from the stack. i8 args are
// promoted to i16 slots (matching CC_W65816's CCPromoteToType)
// so the load can run in the function's default 16-bit M mode
// without needing a per-byte SEP/REP wrap; we then truncate the
// i16 back to i8 for the IR. i16 args are loaded directly.
unsigned ObjSize = 2;
int FI = MFI.CreateFixedObject(ObjSize, StackOffset, /*Immutable*/true);
StackOffset += ObjSize;
SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
SDValue Val = DAG.getLoad(
MVT::i16, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
if (VT == MVT::i8)
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
InVals.push_back(Val);
}
++ArgIdx;
}
// Vararg support: stash the FrameIndex of the next stack-arg slot
// (where the caller's first vararg lives) so VASTART can use it
// as the va_list start. StackOffset has been advanced past every
// named stack arg, so it is the offset of the first vararg.
if (IsVarArg) {
int FI = MFI.CreateFixedObject(2, StackOffset, /*Immutable=*/true);
auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
FuncInfo->setVarArgsFrameIndex(FI);
}
return Chain;
}
// Lowers an outgoing call: pushes the stack-passed argument pieces, copies
// the register-passed pieces to A (and X), emits the W65816ISD::CALL node
// bracketed by CALLSEQ_START/CALLSEQ_END, and reads the returned pieces
// back into InVals (one SDValue per CLI.Ins entry). Returns the chain after
// the last return-value copy. Only i8/i16/i32 argument and return pieces
// are handled; anything else, or more than four 16-bit return pieces,
// aborts via report_fatal_error.
SDValue
W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
// Multi-arg ABI: arg 0 in A, args 1..N-1 pushed in REVERSE order via
// PUSH16 (PHA) so the callee's `(4,S)` reads pick up arg 1, `(6,S)`
// gets arg 2, etc. CALLSEQ_START records the byte count;
// ADJCALLSTACKUP after the call emits `tsc;clc;adc #N;tcs` to
// release the pushed bytes (eliminateCallFramePseudoInstr).
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
auto &Outs = CLI.Outs;
auto &OutVals = CLI.OutVals;
auto &Ins = CLI.Ins;
// Tail calls are not implemented: clear the request so the call is
// lowered as an ordinary call.
if (CLI.IsTailCall)
CLI.IsTailCall = false;
// Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X;
// i64 in A:X:Y plus DP $F0..$F1 for the highest half. See
// LowerReturn comment for the ABI. (An i32 entry counts as two halves;
// that case is re-checked after expansion further down.)
if (Ins.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
// Indirect calls (function pointers): redirect through the runtime
// trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
// we store the dynamic target to a global (`__indirTarget`), then
// JSL the trampoline, which immediately does `JMP (__indirTarget)`.
// The target's RTL pops the original JSL's return frame and returns
// straight back to the caller — no double-RTL or extra frame.
// Caveat: single-bank only (JMP indirect is bank-local).
// Any callee that is neither a GlobalAddress nor an ExternalSymbol node
// is treated as indirect.
bool IsIndirect = !isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee);
if (IsIndirect) {
// Store the dynamic target to __indirTarget *before* any other
// setup, since pushing args clobbers A. STAabs takes an
// ExternalSymbol-wrapped address operand.
SDValue TargetSym = DAG.getTargetExternalSymbol("__indirTarget",
MVT::i16);
SDValue WrappedSym = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
TargetSym);
Chain = DAG.getStore(Chain, DL, Callee, WrappedSym,
MachinePointerInfo());
// Replace the callee with __jsl_indir for the actual JSL. It is
// re-created below as a TargetExternalSymbol of pointer type.
Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16);
}
// Reject unsupported argument piece types up front.
for (const ISD::OutputArg &O : Outs) {
if (O.VT != MVT::i16 && O.VT != MVT::i8 && O.VT != MVT::i32)
report_fatal_error("W65816: argument type not yet supported");
}
// i32 first-arg ABI. Two flavors:
// - Legal-i32: Outs[0].VT == i32 (whole pair). Lo goes to A, hi to X.
// - Split-i32 (legacy): Outs[0]/Outs[1] both i16 with OrigArgIndex==0.
// Pass low in A, high in X.
bool I32WholeFirstArg =
!Outs.empty() && Outs[0].VT == MVT::i32;
bool I32SplitFirstArg =
Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 &&
Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0;
// Index into Outs of the first piece that travels on the stack: the
// split-i32 flavor consumes two register pieces, every other shape one.
unsigned FirstStackArg = I32WholeFirstArg ? 1
: I32SplitFirstArg ? 2 : 1;
// i8 stack args are promoted to i16 (2-byte slots) so the callee can
// read them with a 16-bit M load — matches LowerFormalArguments and
// CC_W65816's CCPromoteToType<i16>. i32 stack args occupy 4 bytes
// (2 PUSH16s).
unsigned StackBytes = 0;
for (unsigned i = FirstStackArg; i < Outs.size(); ++i)
StackBytes += (Outs[i].VT == MVT::i32) ? 4 : 2;
Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);
// Push stack-passed args in reverse so arg FirstStackArg ends up at
// the lowest post-JSL stack-relative offset (4,S). Each push uses A
// by default; if the value being pushed is already a `CopyFromReg X`
// (e.g. forwarding the i32-first-arg-in-A:X hi half), push directly
// from X via PHX — saves the TXA + A-spill round-trip that would
// otherwise be required.
// Glue threads every push and register copy below into one sequence
// that ends at the CALL node.
SDValue Glue;
// Helper: push a single i16-sized value. Uses X + PUSH_X when V is a
// CopyFromReg of physical X, or of an Idx16 virtual register recorded
// as a live-in of X; otherwise uses A + PUSH. Updates the captured
// Chain and Glue.
auto pushI16 = [&](SDValue V) {
bool ViaX = false;
if (V.getOpcode() == ISD::CopyFromReg) {
auto *RegN = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode());
if (RegN) {
Register R = RegN->getReg();
if (R.isPhysical() && R == W65816::X) {
ViaX = true;
} else if (R.isVirtual()) {
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
if (MRI.getRegClass(R) == &W65816::Idx16RegClass) {
for (auto &LI : MRI.liveins())
if (LI.second == R && LI.first == W65816::X) {
ViaX = true;
break;
}
}
}
}
}
if (ViaX) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue);
Glue = Chain.getValue(1);
Chain = DAG.getNode(W65816ISD::PUSH_X, DL,
DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
} else {
Chain = DAG.getCopyToReg(Chain, DL, W65816::A, V, Glue);
Glue = Chain.getValue(1);
Chain = DAG.getNode(W65816ISD::PUSH, DL,
DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
}
Glue = Chain.getValue(1);
};
// Signed index so the loop terminates cleanly when there are no stack
// args (including when Outs is empty).
for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) {
SDValue V = OutVals[i];
if (Outs[i].VT == MVT::i32) {
// Push i32 stack arg: hi half first (lands at higher address),
// lo half second (lands at lower address = the slot the callee
// reads as the start of the i32).
SDValue Lo = extractWide32Lo(DAG, DL, V);
SDValue Hi = extractWide32Hi(DAG, DL, V);
pushI16(Hi);
pushI16(Lo);
continue;
}
// i8 pieces are widened to fill a 2-byte slot.
if (Outs[i].VT == MVT::i8)
V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
pushI16(V);
}
// i32 first-arg. Whole (legal-i32): split into lo/hi and copy
// to $a/$x separately — avoids AX32 in the MIR (see
// W65816LowerWide32). Split-i32 (legacy 2-i16): hi in X first,
// then lo in A below.
if (I32WholeFirstArg) {
SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
Glue = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
Glue = Chain.getValue(1);
} else if (I32SplitFirstArg) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
Glue = Chain.getValue(1);
}
// Arg 0 in A — only for non-whole-i32 first-arg. Whole-i32
// already copied to A/X above.
if (!I32WholeFirstArg && !OutVals.empty()) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
Glue = Chain.getValue(1);
}
// Callee target type must match iPTR (i16 in ptr16, i32 in ptr32).
// The CALL SDNode's operand-type profile (SDT_W65816Call) is iPTR;
// hardcoding MVT::i16 here mismatches under p:32:16.
EVT CalleeVT = getPointerTy(DAG.getDataLayout());
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, CalleeVT);
else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee))
Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), CalleeVT);
// CALL operands: chain, callee, the argument registers in use, and the
// glue from the last copy/push (if any was emitted).
SmallVector<SDValue, 4> CallOps = {Chain, Callee};
if (I32WholeFirstArg) {
CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
} else if (!OutVals.empty()) {
CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
if (I32SplitFirstArg)
CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
}
if (Glue.getNode())
CallOps.push_back(Glue);
Chain = DAG.getNode(W65816ISD::CALL, DL,
DAG.getVTList(MVT::Other, MVT::Glue), CallOps);
Glue = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
Glue = Chain.getValue(1);
// Read return value(s). Mirrors LowerReturn: the 16-bit pieces come
// back in A, X, Y, DPF0 in that order. An i8/i16 entry takes one
// piece; an i32 entry takes two consecutive pieces (lo first) and is
// rebuilt with buildWide32.
// Whole-i32 single return: read lo from $a, hi from $x. Avoids
// using AX32 in the SDAG / MIR — see W65816LowerWide32 pass.
if (Ins.size() == 1 && Ins[0].VT == MVT::i32) {
SDValue Lo = DAG.getCopyFromReg(Chain, DL, W65816::A, MVT::i16, Glue);
Chain = Lo.getValue(1);
Glue = Lo.getValue(2);
SDValue Hi = DAG.getCopyFromReg(Chain, DL, W65816::X, MVT::i16, Glue);
Chain = Hi.getValue(1);
Glue = Hi.getValue(2);
InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
return Chain;
}
// Build a flat list of the pieces expected from the call. Then
// walk it, copying from A, X, Y, DPF0 in order. Re-assemble i32
// halves into a Wide32 SDValue at the end.
SmallVector<MVT, 4> ExpVT;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
MVT VT = Ins[i].VT;
if (VT == MVT::i32) {
ExpVT.push_back(MVT::i16);
ExpVT.push_back(MVT::i16);
} else if (VT == MVT::i16 || VT == MVT::i8) {
ExpVT.push_back(VT);
} else {
report_fatal_error("W65816: return half must be i8/i16/i32");
}
}
// Only four return locations exist, so reject anything larger.
if (ExpVT.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
W65816::DPF0};
SmallVector<SDValue, 4> Halves;
for (unsigned i = 0; i != ExpVT.size(); ++i) {
SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], ExpVT[i], Glue);
Chain = V.getValue(1);
Glue = V.getValue(2);
Halves.push_back(V);
}
// Re-pack halves into the original Ins shape (i32s rebuild via
// buildWide32; i8/i16 pass through). `hi` is the cursor into Halves.
unsigned hi = 0;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (Ins[i].VT == MVT::i32) {
InVals.push_back(buildWide32(DAG, DL, Halves[hi], Halves[hi + 1]));
hi += 2;
} else {
InVals.push_back(Halves[hi]);
hi += 1;
}
}
return Chain;
}
// Lowers a function return: copies every returned piece into its ABI
// location and emits W65816ISD::RET_GLUE.
//
// Return ABI (pieces are 16-bit "halves", lowest first):
//   i8/i16: value in A.
//   i32:    low half in A, high half in X.
//   i64:    halves in A, X, Y, and a fixed direct-page slot at $F0..$F1
//           (modelled as the DPF0 fake physreg).
//   wider:  not supported (report_fatal_error).
//
// Type legalization hands us either i8/i16 Outs directly or, when i32 is
// a legal type, MVT::i32 Outs. Each i32 is expanded here into its two
// i16 halves via extractWide32Lo/extractWide32Hi, so a single i32, an
// i64 returned as 2 x i32, a struct of two longs, etc. all go through the
// same A/X/Y/DPF0 path. No AX32 pair register appears in the DAG (see the
// W65816LowerWide32 pass).
//
// CallConv and IsVarArg are not consulted.
SDValue W65816TargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // Flatten Outs into the list of pieces actually placed in registers.
  SmallVector<MVT, 4> ExpVT;
  SmallVector<SDValue, 4> ExpVals;
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    MVT VT = Outs[i].VT;
    if (VT == MVT::i32) {
      ExpVT.push_back(MVT::i16);
      ExpVT.push_back(MVT::i16);
      ExpVals.push_back(extractWide32Lo(DAG, DL, OutVals[i]));
      ExpVals.push_back(extractWide32Hi(DAG, DL, OutVals[i]));
    } else if (VT == MVT::i16 || VT == MVT::i8) {
      ExpVT.push_back(VT);
      ExpVals.push_back(OutVals[i]);
    } else {
      report_fatal_error("W65816: return half must be i8/i16/i32");
    }
  }
  if (ExpVT.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Return locations by piece index; same table as LowerCall's read-back.
  //
  // The fourth piece goes to DP $F0 via CopyToReg(DPF0). Using the DPF0
  // fake physreg (lowered to `STA $F0` by copyPhysReg) is critical: a
  // generic ISD::STORE with addr=0xF0 lowered to `sta (d,s),y`, an
  // indirect through the DBR, which silently misbehaved when DBR != 0.
  // STA dp uses D + dp directly and is unaffected by DBR.
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};

  // Emission order matters: copy the *highest* piece first and A last, so
  // the regalloc can route each piece through A (the only ALU reg)
  // without conflict. The TAX/TAY in copyPhysReg preserves A, so the
  // final low-piece copy to A doesn't clobber anything already placed.
  // All copies are glued together and to RET_GLUE.
  SDValue Glue;
  for (unsigned Idx = ExpVals.size(); Idx-- > 0;) {
    Chain = DAG.getCopyToReg(Chain, DL, RetRegs[Idx], ExpVals[Idx], Glue);
    Glue = Chain.getValue(1);
  }

  // RET_GLUE operands: chain, then one Register operand per live return
  // location (this is what keeps DCE from stripping the copies), then the
  // glue if any copy was emitted.
  SmallVector<SDValue, 8> RetOps;
  RetOps.push_back(Chain);
  for (unsigned Idx = 0, e = ExpVals.size(); Idx != e; ++Idx)
    RetOps.push_back(DAG.getRegister(RetRegs[Idx], ExpVT[Idx]));
  if (Glue.getNode())
    RetOps.push_back(Glue);
  return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
// Target-specific DAG combines. Three rewrites are implemented:
//
//  1. STORE/LOAD whose pointer is a Wide32 REG_SEQUENCE with a constant-0
//     high half -> the same access through the 16-bit low half.
//  2. STORE/LOAD whose pointer is a plain ISD::Constant -> the same access
//     with the pointer re-issued as an ISD::TargetConstant.
//  3. (shl X, K) with K in {1, 2} on an illegal type of >= 32 bits -> a
//     chain of K (add R, R) nodes.
//
// Returns the replacement value, or an empty SDValue when nothing applies.
// It never returns N itself: the DAG combiner interprets that as
// "CombineTo was already used".
SDValue
W65816TargetLowering::PerformDAGCombine(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  // Wide32-with-zero-hi pointer -> narrow pointer.
  //
  // Under p:32:16, LowerGlobalAddress builds a GlobalAddress as the Wide32
  // pair `(REG_SEQUENCE Wrapper(off_i16), 0_i16)`. Stores/loads against
  // such a pointer otherwise fall to the heavy [dp],Y path (16 B / 30 cyc
  // per the original notes) even though the bank half is the constant 0;
  // the cheap DBR-relative `sta g` / `lda g` (3 B / 5 cyc) is what we
  // want. This is correct ONLY when bank=0 — under the GS/OS Loader, DBR
  // is set to our bank by crt0Gsos, so DBR-relative addressing reaches
  // the same global.
  //
  // Returns:
  //   - the i16 Wrapper itself when the low half is a Wrapper, so the
  //     `(store v, (Wrapper tglob))` / `(load (Wrapper tglob))` patterns
  //     fire;
  //   - a TargetConstant:i32 when the low half is a constant. It must be a
  //     TargetConstant: an i16 Constant would make a malformed memory node
  //     (pointer type mismatch), and a regular Constant:i32 would
  //     re-trigger LowerI32Constant and rebuild the REG_SEQUENCE — an
  //     infinite combine loop;
  //   - an empty SDValue for any other shape.
  auto unwrapWide32WithZeroHi = [&](SDValue Ptr) -> SDValue {
    if (Ptr.getValueType() != MVT::i32)
      return SDValue();
    if (!Ptr.getNode()->isMachineOpcode() ||
        Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
      return SDValue();
    // REG_SEQUENCE operands: class id, then (value, subreg-index) pairs.
    SDValue Lo, Hi;
    for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
      auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1));
      if (!CIdx)
        continue;
      if (CIdx->getZExtValue() == llvm::sub_lo)
        Lo = Ptr.getOperand(i);
      else if (CIdx->getZExtValue() == llvm::sub_hi)
        Hi = Ptr.getOperand(i);
    }
    if (!Lo || !Hi)
      return SDValue();
    auto *HiC = dyn_cast<ConstantSDNode>(Hi);
    if (!HiC || HiC->getZExtValue() != 0)
      return SDValue();
    if (Lo.getOpcode() == W65816ISD::Wrapper)
      return Lo;
    if (auto *LoC = dyn_cast<ConstantSDNode>(Lo))
      return DAG.getTargetConstant(LoC->getZExtValue(), SDLoc(Ptr),
                                   MVT::i32);
    return SDValue();
  };

  if (N->getOpcode() == ISD::STORE) {
    auto *St = cast<StoreSDNode>(N);
    EVT MemVT = St->getMemoryVT();
    SDValue Ptr = St->getBasePtr();
    // Skip i32 stores — LowerStore's i32 path has its own Wide32-zero-hi
    // const-addr fast path that emits two i16 stores at separate
    // TargetConstant addrs. Unwrapping here would short-circuit that
    // and produce a malformed ADD(TargetConstant, Constant) when the
    // hi-half store needs Ptr+2.
    if (MemVT != MVT::i32) {
      if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) {
        SDLoc DL(N);
        return DAG.getTruncStore(St->getChain(), DL, St->getValue(), I16Ptr,
                                 MemVT, St->getMemOperand());
      }
    }
    // Constant-address store, e.g. `*(volatile uint8*)0xC035 = v`:
    // i8 -> STA8long, i16 -> STAabs (DBR-relative), both via tablegen
    // patterns on an immediate pointer. Re-issue the pointer as a
    // TargetConstant so LowerI32Constant (which only matches
    // ISD::Constant) leaves it alone until ISel. i32 stores are split
    // into 2 i16 stores by LowerStore and come back here as MemVT==i16.
    if (MemVT != MVT::i8 && MemVT != MVT::i16)
      return SDValue();
    // Match ISD::Constant only. ConstantSDNode also covers TargetConstant,
    // so a dyn_cast would re-match the pointer this combine has already
    // produced, rebuild the identical store and hand N back to the
    // combiner.
    if (Ptr.getOpcode() == ISD::Constant) {
      auto *C = cast<ConstantSDNode>(Ptr);
      SDLoc DL(N);
      SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
                                             Ptr.getValueType());
      return DAG.getTruncStore(St->getChain(), DL, St->getValue(), NewPtr,
                               MemVT, St->getMemOperand());
    }
  }

  if (N->getOpcode() == ISD::LOAD) {
    auto *Ld = cast<LoadSDNode>(N);
    EVT MemVT = Ld->getMemoryVT();
    EVT VT = Ld->getValueType(0);
    SDValue Ptr = Ld->getBasePtr();
    // Companion to the STORE-side unwrap; lets `(load (Wrapper g))` ->
    // LDAabs fire. Skip i32 loads — LowerLoad's i32 path does its own
    // Ptr+2 ADD arithmetic and would choke on a TargetConstant unwrap
    // result.
    if (MemVT != MVT::i32) {
      if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) {
        SDLoc DL(N);
        return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
                              Ld->getChain(), I16Ptr, MemVT,
                              Ld->getMemOperand());
      }
    }
    // Only the i8 const-addr path has dedicated tablegen patterns
    // (LDA8long); skip i16 const-addr loads (no LDAabs imm pattern)
    // and i32 (would re-fire on the same node with different shape).
    if (MemVT != MVT::i8 || (VT != MVT::i8 && VT != MVT::i16))
      return SDValue();
    // ISD::Constant only, for the same reason as on the store side.
    if (Ptr.getOpcode() == ISD::Constant) {
      auto *C = cast<ConstantSDNode>(Ptr);
      SDLoc DL(N);
      SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
                                             Ptr.getValueType());
      return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
                            Ld->getChain(), NewPtr, MemVT,
                            Ld->getMemOperand());
    }
  }

  // (shl X, K) -> chain of K (add x, x) for small K on wide, ILLEGAL types.
  //
  // After type legalisation the wide add splits via ADDC/ADDE pseudos
  // which expand to native ASL/ROL + carry-chain — much cheaper than the
  // type-legaliser's SHL_PARTS expansion, which uses the 3-insn SRL15A
  // trick to compute the bit crossing the half boundary. Per the original
  // measurements each ADD expands to ~10 insns while SHL_PARTS is ~26 for
  // K=1, ~33 for K=2, ~34 for K=3: the ADD chain wins at K<=2 and only
  // breaks even at K=3, hence the cap at 2. `x*N` (pow-of-2 muls are
  // canonicalised to `x<<K`) benefits the most. i16 SHL already has
  // dedicated ASLA16 patterns, so the rewrite is restricted to >= 32 bits.
  //
  // The rewrite is applied only while the type is illegal. When i32 is a
  // legal type (Wide32 reg class for ptr32 mode) it cycles against LLVM's
  // generic `(add x, x) -> (shl x, 1)` combine in the i64 -> 2 x i32 split
  // path and hangs the legalizer.
  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 &&
      !isTypeLegal(N->getValueType(0))) {
    if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
      uint64_t K = C->getZExtValue();
      if (K >= 1 && K <= 2) {
        SDValue X = N->getOperand(0);
        SDLoc DL(N);
        EVT VT = N->getValueType(0);
        SDValue R = X;
        for (uint64_t i = 0; i < K; ++i)
          R = DAG.getNode(ISD::ADD, DL, VT, R, R);
        return R;
      }
    }
  }
  return SDValue();
}
// Translate a single-branch W65816CC condition code into the conditional
// branch opcode for it. Any value other than the eight codes listed here
// is a programming error and hits llvm_unreachable.
static unsigned getBranchOpcodeForCC(unsigned CC) {
  unsigned BranchOpc = 0;
  switch (CC) {
  case W65816CC::COND_EQ:
    BranchOpc = W65816::BEQ;
    break;
  case W65816CC::COND_NE:
    BranchOpc = W65816::BNE;
    break;
  case W65816CC::COND_HS:
    BranchOpc = W65816::BCS;
    break;
  case W65816CC::COND_LO:
    BranchOpc = W65816::BCC;
    break;
  case W65816CC::COND_MI:
    BranchOpc = W65816::BMI;
    break;
  case W65816CC::COND_PL:
    BranchOpc = W65816::BPL;
    break;
  case W65816CC::COND_VS:
    BranchOpc = W65816::BVS;
    break;
  case W65816CC::COND_VC:
    BranchOpc = W65816::BVC;
    break;
  default:
    llvm_unreachable("invalid W65816 condition code");
  }
  return BranchOpc;
}
// Description of a condition that needs two consecutive conditional
// branches. `First` is emitted first, then `Second`. For each of them the
// matching flag says where the branch goes when taken: true means the
// "true" block, false means the "false" block. If neither branch is taken,
// control falls through to the "false" block.
//
//   CC   First               Second
//   GT   BEQ -> FalseBB      BPL -> TrueBB
//   LE   BEQ -> TrueBB       BMI -> TrueBB
//   HI   BEQ -> FalseBB      BCS -> TrueBB
//   LS   BEQ -> TrueBB       BCC -> TrueBB
struct MultiBranch {
  unsigned First, Second;
  bool FirstToTrue, SecondToTrue;
};

// Look up the two-branch sequence for one of the *_MB condition codes.
// Any other code is a programming error and hits llvm_unreachable.
static MultiBranch getMultiBranch(unsigned CC) {
  // Every sequence starts with BEQ and ends with a branch to the true
  // block; only the second opcode and the BEQ destination vary.
  MultiBranch Seq = {W65816::BEQ, 0, false, true};
  switch (CC) {
  case W65816CC::COND_GT_MB:
    Seq.Second = W65816::BPL;
    Seq.FirstToTrue = false;
    break;
  case W65816CC::COND_LE_MB:
    Seq.Second = W65816::BMI;
    Seq.FirstToTrue = true;
    break;
  case W65816CC::COND_HI_MB:
    Seq.Second = W65816::BCS;
    Seq.FirstToTrue = false;
    break;
  case W65816CC::COND_LS_MB:
    Seq.Second = W65816::BCC;
    Seq.FirstToTrue = true;
    break;
  default:
    llvm_unreachable("not a multi-branch CC");
  }
  return Seq;
}
// Expand a register-register pseudo into "store rhs to a stack slot, then
// apply the frame-index form of the operation to lhs".
//
//   MI       the pseudo being expanded; erased on return.
//   BB       block containing MI; returned unchanged.
//   StoreOp  opcode used to store the right-hand operand to the slot.
//   OpFI     opcode of the frame-index form of the operation.
//   HasOut   true when MI operand 0 is a result register (operands are
//            dst, lhs, rhs); false for compare-style pseudos whose
//            operands are just lhs, rhs.
//
// A fresh 2-byte spill slot is created on every call. It is always the
// RIGHT-hand operand that goes to memory, so non-commutative operations
// (sub, cmp) still compute lhs OP rhs.
static MachineBasicBlock *
emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp,
         unsigned OpFI, bool HasOut) {
  MachineFunction *MF = BB->getParent();
  const W65816InstrInfo &TII =
      *MF->getSubtarget<W65816Subtarget>().getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // Source operands follow the optional result operand.
  const unsigned FirstSrcIdx = HasOut ? 1 : 0;
  Register Lhs = MI.getOperand(FirstSrcIdx).getReg();
  Register Rhs = MI.getOperand(FirstSrcIdx + 1).getReg();

  int Slot = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/true);
  auto InsertPt = MI.getIterator();

  // rhs -> slot
  BuildMI(*BB, InsertPt, DL, TII.get(StoreOp))
      .addReg(Rhs)
      .addFrameIndex(Slot)
      .addImm(0);

  // [dst =] lhs OP load(slot)
  MachineInstrBuilder Op =
      HasOut ? BuildMI(*BB, InsertPt, DL, TII.get(OpFI),
                       MI.getOperand(0).getReg())
             : BuildMI(*BB, InsertPt, DL, TII.get(OpFI));
  Op.addReg(Lhs).addFrameIndex(Slot).addImm(0);

  MI.eraseFromParent();
  return BB;
}
MachineBasicBlock *
W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
// The only opcode we currently emit with usesCustomInserter=1 is
// SELECT_CC16. Expand it into a diamond CFG with a PHI. For
// single-branch CCs:
//
// thisMBB:
// ... CMP already emitted ...
// Bxx sinkMBB ; branch to "true" path
// ; fall through to copy0MBB
// copy0MBB:
// ; (no instructions; PHI picks fval here)
// sinkMBB:
// dst = PHI [tval, thisMBB], [fval, copy0MBB]
//
// For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a
// single Bxx isn't enough), insert two branches. Both target either
// sinkMBB or copy0MBB depending on the condition.
switch (MI.getOpcode()) {
default:
llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter");
case W65816::ADD_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true);
case W65816::SUB_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true);
// Carry-chain variants for the hi half of an i32 split. STAfi doesn't
// touch P, so the carry from the previous addc/adde survives the
// spill and is consumed by ADCEfi/SBCEfi below.
case W65816::ADDE_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true);
case W65816::SUBE_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true);
case W65816::AND_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true);
case W65816::ORA_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true);
case W65816::EOR_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true);
case W65816::CMP_RR:
return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false);
case W65816::LDAptr32S:
case W65816::STAptr32S:
case W65816::STBptr32S: {
  // Split-pair variant of the ptr32 load/store pseudos: the pointer
  // arrives as two i16 operands (lo + hi) instead of one Wide32 register
  // pair. Used by the W65816LowerWide32 pre-RA pass to dodge
  // pair-allocation pressure. Apart from how the two halves are
  // obtained, the expansion matches the LDAptr32 inserter.
  //
  // Operand layout (identical for the load and both stores):
  //   0 = loaded result (LDAptr32S) or value to store (STAptr32S/STBptr32S)
  //   1 = pointer low half
  //   2 = pointer high half
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptr32S;
  bool IsByteStore = MI.getOpcode() == W65816::STBptr32S;
  // The pointer halves live at the same operand indices for every
  // opcode handled here, so no load/store distinction is needed.
  Register PtrLo = MI.getOperand(1).getReg();
  Register PtrHi = MI.getOperand(2).getReg();
  // Park each half in its own fresh 2-byte stack object, then reload it
  // through LDAfi into A before staging it in direct page.
  int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
  // STA_DP's tablegen def has no implicit A Use, so without an
  // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
  // pairs the fast regalloc collapses two A-loads into one (the
  // first's value is overwritten before STA_DP can store it). Add
  // implicit Use of A on the STA_DP to encode the dependency. This
  // also helps post-RA passes track A liveness correctly.
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FIHi).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE2)
      .addReg(W65816::A, RegState::Implicit);
  if (IsLoad) {
    // Load: Y = 0, LDA [$E0],Y, then copy A into the result register.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // Store: value into A, Y = 0, STA [$E0],Y. The byte store brackets
    // the STA with SEP/REP #$20 so only one byte is written.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptr32:
case W65816::STAptr32:
case W65816::STBptr32: {
  // Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
  // pointer is a Wide32 register pair: sub_lo carries the low 16
  // bits of the address, sub_hi carries the bank byte in its low
  // half (high half is pad, ORCA convention). Stage at $E0..$E2,
  // then [dp],Y addresses the right bank without forcing 0.
  //
  // Operand layout (identical for the load and both stores):
  //   0 = loaded result (LDAptr32) or value to store (STAptr32/STBptr32)
  //   1 = Wide32 pointer
  //
  // Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
  // on i32 address type).
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
  bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
  // The pointer is operand 1 for loads and stores alike.
  Register Ptr = MI.getOperand(1).getReg();
  // Extract the i16 sub-halves of the Wide32 ptr. At custom-inserter
  // time Ptr is still a virtual register, so `TRI.getSubReg` won't
  // work (it's physreg-only). Use COPY-with-subreg-index instead;
  // the regalloc + virtreg-rewriter resolves this to the right
  // physreg operand later.
  Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
      .addReg(Ptr, static_cast<RegState>(0), llvm::sub_lo);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
      .addReg(Ptr, static_cast<RegState>(0), llvm::sub_hi);
  // Spill each half to a fresh slot, reload via LDAfi. Same RA-
  // pinning rationale as the i16 LDAptr inserter.
  int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
  // Stage the 24-bit address at $E0..$E2: sub_lo at $E0..$E1,
  // bank byte (low half of sub_hi) at $E2. We write 16 bits at $E2
  // — the high byte ($E3) gets sub_hi's pad byte (0 by ORCA) — but
  // only $E2 is consulted by [dp],Y so $E3 contamination is harmless
  // until something else uses $E3.
  // STA_DP's tablegen def has no implicit A Use, so without an
  // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
  // pairs the fast regalloc collapses two A-loads into one (the
  // first's value is overwritten before STA_DP can store it). Add
  // implicit Use of A on the STA_DP to encode the dependency. This
  // also helps post-RA passes track A liveness correctly.
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FIHi).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE2)
      .addReg(W65816::A, RegState::Implicit);
  if (IsLoad) {
    // Load: Y = 0, LDA [$E0],Y, then copy A into the result register.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // Store: value into A, Y = 0, STA [$E0],Y. The byte store brackets
    // the STA with SEP/REP #$20 so only one byte is written.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptr32Off:
case W65816::STAptr32Off:
case W65816::STBptr32Off: {
  // ptr32 deref with constant offset. Compute (sub_lo + off) into A
  // with CLC; ADC, store at $E0..$E1; then add the offset's high word
  // plus the carry to sub_hi and store at $E2. The bank adjustment is
  // conservatively always emitted — bank wrapping is rare but real
  // (bank-spanning struct or negative offset).
  //
  // Operand layout (identical for the load and both stores):
  //   0 = loaded result (load) or value to store (stores)
  //   1 = Wide32 pointer
  //   2 = constant offset
  //
  // Dead unless ptr32 mode is active.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off;
  bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off;
  Register Ptr = MI.getOperand(1).getReg();
  int64_t Off = MI.getOperand(2).getImm();
  // High word of the offset, added to the bank word together with the
  // carry out of the low-half add. A plain `ADC #0` is only right for
  // offsets in 0..0xFFFF: with a negative offset the low-half add
  // produces carry=1 exactly when there is NO borrow, so `ADC #0` would
  // bump the bank ($02:1000 + (-2) would land in $03:0FFE). Adding the
  // sign-extended high word (-1 for small negative offsets) cancels
  // that carry and yields $02:0FFE; a real borrow ($02:0000 + (-2))
  // gives carry=0 and correctly drops to $01:FFFE. For 0..0xFFFF the
  // high word is 0, so the emitted code is unchanged.
  // NOTE(review): relies on Off arriving sign-extended (or as a full
  // 32-bit value) from ISel — confirm against the pattern in InstrInfo.td.
  const int64_t OffHi = Off >> 16;
  // See LDAptr32 inserter: vreg sub-regs need COPY-with-subreg
  // (TRI.getSubReg is physreg-only at custom-inserter time).
  Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
      .addReg(Ptr, static_cast<RegState>(0), llvm::sub_lo);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
      .addReg(Ptr, static_cast<RegState>(0), llvm::sub_hi);
  // Park each half in its own fresh 2-byte stack object; both are
  // reloaded through LDAfi below.
  int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
  // (sub_lo + off) -> $E0..$E1
  // STA_DP's tablegen def has no implicit A Use; as in the other ptr32
  // inserters, attach an implicit Use of A to each STA_DP so the
  // dependency on the value just computed in A is explicit for the
  // register allocator and post-RA liveness tracking.
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC));
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::ADC_Imm16)).addImm(Off);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  // (sub_hi + high word of off + carry) -> $E2..$E3. The ADC picks up
  // the carry from the previous ADC; only $E2 is consulted by [dp],Y.
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FIHi).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::ADC_Imm16)).addImm(OffHi);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE2)
      .addReg(W65816::A, RegState::Implicit);
  if (IsLoad) {
    // Load: Y = 0, LDA [$E0],Y, then copy A into the result register.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // Store: value into A, Y = 0, STA [$E0],Y. The byte store brackets
    // the STA with SEP/REP #$20 so only one byte is written.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptrOff:
case W65816::STAptrOff:
case W65816::STBptrOff: {
  // Pointer access with a constant offset. Folds the offset into
  // the pointer (CLC; ADC #off in A) BEFORE staging at $E0..$E2,
  // then accesses via [$E0],Y with Y=0. We can't fold into Y
  // because [dp],Y on the W65816 adds Y to the full 24-bit pointer
  // — for a negative Y like 0xFFFE (= -2 signed), the addition
  // crosses into bank 1. Folding into the pointer keeps the add
  // at 16-bit (in A) so the bank byte stays 0.
  //
  // Operand layout (identical for the load and both stores):
  //   0 = loaded result (load) or value to store (stores)
  //   1 = 16-bit pointer
  //   2 = constant offset
  //
  // DBR-independent — see LDAptr/STAptr/STBptr.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptrOff;
  bool IsByteStore = MI.getOpcode() == W65816::STBptrOff;
  Register Ptr = MI.getOperand(1).getReg();
  int64_t Off = MI.getOperand(2).getImm();
  // Spill the pointer vreg to a fresh 2-byte stack slot, then
  // reload via LDAfi. Forces RA to materialize the source — see
  // the LDAptr/STAptr/STBptr case for the full rationale.
  int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(Ptr).addFrameIndex(FI).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FI).addImm(0);
  // Compute ptr + off in A. CLC + ADC for the add.
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC));
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::ADC_Imm16)).addImm(Off);
  // STA_DP's tablegen def has no implicit A Use. Attach an implicit Use
  // of A to every STA_DP emitted here (as the ptr32 inserters do) so the
  // store's dependency on the value just placed in A is explicit; the
  // LoaderBankDeref path below is a load/STA_DP/load/STA_DP sequence,
  // the shape that gets mis-handled without the marker.
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  if (LoaderBankDeref) {
    // Bank byte from $BE (crt0-initialised) — Loader compat path.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DP)).addImm(0xBE);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE2)
        .addReg(W65816::A, RegState::Implicit);
  } else {
    // Default: force the bank byte at $E2 to zero.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STZ_DP)).addImm(0xE2);
  }
  if (IsLoad) {
    // Load: Y = 0, LDA [$E0],Y, then copy A into the result register.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // Store: value into A, Y = 0, STA [$E0],Y. The byte store brackets
    // the STA with SEP/REP #$20 so only one byte is written.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::LDAptr:
case W65816::LDAptrBank0:
case W65816::STAptr:
case W65816::STBptr: {
  // Pointer load/store via [dp],Y indirect-long (opcodes 0xB7 / 0x97):
  //   STA $E0          ; pointer low/hi at $E0..$E1
  //   STZ $E2          ; bank byte at $E2 = 0
  //   LDY #0
  //   LDA [$E0], Y     ; bank 0:ptr + 0
  //   STA [$E0], Y
  // Bank-explicit ZERO — DBR-independent. Both the runInMame stack
  // ($00:0FFF down) and BSS / heap globals (placed at $00:xxxx) live
  // in bank 0, so pointer-derefs always reach the right memory even
  // when the user has switched DBR for a bank-2 store via `pha;plb`.
  //
  // Trade-off: under GS/OS Loader the user's data lives in their bank
  // (not bank 0), so library functions that write directly to globals
  // via `sta abs` (DBR-relative, lands in user bank) and user code that
  // reads via pointer-deref (lands in bank 0 by this lowering) get
  // INCONSISTENT results — silent miscompile. gmtime hit this with
  // its __gmtimeBuf static. Workaround for affected library code:
  // launder the buffer pointer through inline asm (see gmtime in
  // runtime/src/timeExt.c) so clang doesn't IPSCCP-fold it; the writes
  // then go via [dp],Y too and match the user reads.
  //
  // Const-int pointers (`*(volatile uint16 *)0x5000 = v`) are NOT
  // lowered through this pseudo — TableGen patterns route them to
  // STAlong / STA8long / STAabs by type. See InstrInfo.td.
  //
  // We use $E0..$E2 in libcall-scratch DP — safe because the
  // pseudo expansion is a leaf (no calls between SEP and STA),
  // and any subsequent libcall reinitialises its own scratch.
  //
  // Why [dp],Y not abs-long-X (`STA $0,X`)? abs-long-X is shorter
  // (~3 bytes less) but uses X to hold the pointer. In high-
  // pressure functions like the recursive expression parser, X
  // is often live with another value, and forcing X to be free
  // for every pointer-deref triggered "ran out of registers".
  // [dp],Y uses A and Y only — leaves X for spill-bridge use.
  //
  // STBptr (truncating i8 store) wraps the actual STA in SEP/REP
  // so M=8 across the store and only one byte is written.
  //
  // Operand layout (identical for the loads and both stores):
  //   0 = loaded result (loads) or value to store (stores)
  //   1 = 16-bit pointer
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptr ||
                MI.getOpcode() == W65816::LDAptrBank0;
  bool IsByteStore = MI.getOpcode() == W65816::STBptr;
  // LDAptrBank0 hardcodes bank=0 (STZ $E2) regardless of LoaderBankDeref.
  // Used by va_arg under Loader where the deref is a stack pointer
  // (= bank 0 always on W65816) but $BE points to our code bank.
  bool ForceBank0 = MI.getOpcode() == W65816::LDAptrBank0;
  Register Ptr = MI.getOperand(1).getReg();
  // Why we spill the pointer to a fresh stack slot first:
  // a direct `COPY $a = ptr_vreg ; STA $E0` lets RA elide the COPY
  // when ptr_vreg is already allocated to A. In a loop body where
  // multiple Acc16 PHIs (pointer + accumulator) compete for A, the
  // PHI elimination pass picks one to be in A at the bottom of the
  // block and silently drops the COPY needed to refresh A with the
  // OTHER value at the top of the next iteration — silent miscompile
  // (sumTable read its own accumulator as the pointer on iter 2+).
  // STAfi forces RA to materialize ptr_vreg's value so it gets stored
  // to the slot, then LDAfi reads it back as a real machine load.
  int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(Ptr).addFrameIndex(FI).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FI).addImm(0);
  // STA_DP's tablegen def has no implicit A Use. Attach an implicit Use
  // of A to every STA_DP emitted here (as the ptr32 inserters do) so the
  // store's dependency on the value just loaded into A is explicit; the
  // LoaderBankDeref path below is an LDAfi/STA_DP/LDA_DP/STA_DP
  // sequence, the shape that gets mis-handled without the marker.
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  if (LoaderBankDeref && !ForceBank0) {
    // Bank byte from $BE (crt0-initialised) — Loader compat path.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DP)).addImm(0xBE);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE2)
        .addReg(W65816::A, RegState::Implicit);
  } else {
    // Default (and always for LDAptrBank0): bank byte at $E2 = 0.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STZ_DP)).addImm(0xE2);
  }
  if (IsLoad) {
    // Load: Y = 0, LDA [$E0],Y, then copy A into the result register.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // Store: value into A, Y = 0, STA [$E0],Y (SEP/REP-wrapped for STBptr).
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
case W65816::SELECT_CC8:
case W65816::SELECT_CC16: {
  // Expand the select pseudo into explicit control flow that ends in a
  // PHI. Operands: 0 = result, 1 = true value, 2 = false value,
  // 3 = condition code.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  // Lay out the false-path block and the join block right after the
  // current block.
  MachineFunction::iterator InsertPos = ++BB->getIterator();
  MachineBasicBlock *HeadMBB = BB;
  MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *JoinMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(InsertPos, FalseMBB);
  MF->insert(InsertPos, JoinMBB);

  // Everything that followed the pseudo, plus the head block's
  // successors, now belongs to the join block.
  JoinMBB->splice(JoinMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  JoinMBB->transferSuccessorsAndUpdatePHIs(BB);

  const unsigned CondCode = MI.getOperand(3).getImm();
  const Register TrueReg = MI.getOperand(1).getReg();
  const Register FalseReg = MI.getOperand(2).getReg();
  const Register ResultReg = MI.getOperand(0).getReg();

  // True when `R` is a virtual register with exactly one non-debug use
  // whose unique def sits in the head block and is an LDAi16imm or
  // LDAi8imm carrying an immediate in operand 1.
  auto IsMovableConstLda = [&](Register R) -> bool {
    if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R))
      return false;
    MachineInstr *DefMI = MRI.getUniqueVRegDef(R);
    if (!DefMI || DefMI->getParent() != HeadMBB)
      return false;
    const unsigned DefOpc = DefMI->getOpcode();
    if (DefOpc != W65816::LDAi16imm && DefOpc != W65816::LDAi8imm)
      return false;
    return DefMI->getNumOperands() >= 2 && DefMI->getOperand(1).isImm();
  };

  // If `R` qualifies per IsMovableConstLda, MOVE its defining LDA to the
  // start of `DestMBB`. Returns true on success.
  auto HoistConstLda = [&](Register R, MachineBasicBlock *DestMBB) -> bool {
    if (!IsMovableConstLda(R))
      return false;
    MachineInstr *DefMI = MRI.getUniqueVRegDef(R);
    DefMI->removeFromParent();
    DestMBB->insert(DestMBB->begin(), DefMI);
    return true;
  };

  const bool UseFourBlockDiamond = (CondCode < W65816CC::COND_GT_MB) &&
                                   IsMovableConstLda(TrueReg) &&
                                   IsMovableConstLda(FalseReg);
  if (UseFourBlockDiamond) {
    // 4-block diamond: the head block keeps only the test (CMP) and the
    // Bxx; each constant LDA moves into its own destination block,
    // which is reached only via the branch — so neither LDA's flag
    // side-effect can corrupt the CMP→Bxx test window. This is the
    // proper fix for the "LDA between CMP and Bxx" bug catalogued in
    // project_known_issue_lda_flags.md (replacing the earlier 3-block
    // workaround that only hoisted fval).
    //
    //   head:   ...; CMP; Bxx true
    //   false:  LDA #fval; BRA join      (FALSE path)
    //   true:   LDA #tval                (TRUE path; falls to join)
    //   join:   PHI [tval from true, fval from false]
    MachineBasicBlock *TrueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    MF->insert(JoinMBB->getIterator(), TrueMBB);
    BB->addSuccessor(FalseMBB);
    BB->addSuccessor(TrueMBB);
    FalseMBB->addSuccessor(JoinMBB);
    TrueMBB->addSuccessor(JoinMBB);
    const unsigned BranchOpc = getBranchOpcodeForCC(CondCode);
    BuildMI(HeadMBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB);
    BuildMI(FalseMBB, DL, TII.get(W65816::BRA)).addMBB(JoinMBB);
    HoistConstLda(TrueReg, TrueMBB);
    HoistConstLda(FalseReg, FalseMBB);
    BuildMI(*JoinMBB, JoinMBB->begin(), DL, TII.get(W65816::PHI),
            ResultReg)
        .addReg(TrueReg).addMBB(TrueMBB)
        .addReg(FalseReg).addMBB(FalseMBB);
  } else {
    // 3-block diamond: keep the classic layout and (where possible)
    // hoist fval into the false block. Used when one or both operands
    // are computed values (not constants), or when the multi-branch CC
    // requires two Bxx in the head block.
    BB->addSuccessor(FalseMBB);
    BB->addSuccessor(JoinMBB);
    if (CondCode < W65816CC::COND_GT_MB) {
      // Single-branch condition: taken branch skips the false block.
      const unsigned BranchOpc = getBranchOpcodeForCC(CondCode);
      BuildMI(HeadMBB, DL, TII.get(BranchOpc)).addMBB(JoinMBB);
    } else {
      // Multi-branch condition: two branches, each aimed at the join
      // block or the false block as getMultiBranch dictates.
      MultiBranch MB = getMultiBranch(CondCode);
      MachineBasicBlock *FirstTarget = MB.FirstToTrue ? JoinMBB : FalseMBB;
      MachineBasicBlock *SecondTarget = MB.SecondToTrue ? JoinMBB : FalseMBB;
      BuildMI(HeadMBB, DL, TII.get(MB.First)).addMBB(FirstTarget);
      BuildMI(HeadMBB, DL, TII.get(MB.Second)).addMBB(SecondTarget);
    }
    FalseMBB->addSuccessor(JoinMBB);
    HoistConstLda(FalseReg, FalseMBB);
    BuildMI(*JoinMBB, JoinMBB->begin(), DL, TII.get(W65816::PHI),
            ResultReg)
        .addReg(TrueReg).addMBB(HeadMBB)
        .addReg(FalseReg).addMBB(FalseMBB);
  }
  MI.eraseFromParent();
  return JoinMBB;
}
}
}