65816-llvm-mos/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp

//===-- W65816ISelLowering.cpp - W65816 DAG Lowering Implementation -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Minimum DAG lowering sufficient for a no-argument function returning an
// i16 constant.  Argument passing and non-trivial calls still unimplemented.
//
//===----------------------------------------------------------------------===//

#include "W65816ISelLowering.h"
#include "W65816InstrInfo.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816SelectionDAGInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

#define DEBUG_TYPE "w65816-lower"

// Loader-compat workaround: when set, LDAptr/STAptr/STBptr inserters
// load the bank byte from DP $BE (initialized by crt0 to PHK / current
// PBR) instead of forcing it to 0 via STZ $E2.  This makes pointer
// derefs land in the user's bank — matching where DBR-relative
// absolute stores go — so library functions like gmtime that store
// into static buffers via DBR-relative paths are visible to caller-
// side pointer-deref reads.  Costs 2 extra bytes / 4 cycles per ptr-
// deref (LDA dp + STA dp vs STZ dp).  Default off to keep
// size-sensitive builds (toolbox) under the $C000 IO-window ceiling.
static cl::opt<bool>
    LoaderBankDeref("w65816-loader-bank-deref",
                    // Developer-facing knob: hidden, and off unless requested.
                    cl::Hidden, cl::init(false),
                    cl::desc("LDAptr/STAptr inserters read bank from DP $BE (set by "
                             "crt0 to PHK) instead of STZ $E2.  Required for GS/OS "
                             "Loader compatibility; default off for size-sensitive "
                             "builds."));

// Layer 2 ptr32 opt: when set, ptr32 derefs assume the pointer's bank
// byte matches DBR.  Uses `lda (d,s),Y` (opcode 0xB3, stack-relative
// indirect indexed-Y) instead of staging at $E0/$E2 and using
// `lda [dp],Y` (24-bit indirect-long).  Saves ~4 instructions per
// deref.  Correct only for code that touches memory inside DBR's bank
// — malloc'd Lua state + globals + BSS qualify; cross-bank pointers
// (rare) do not.  Caller's responsibility.  Tested by hand on lapi.c.
//
// NOTE: not static -- W65816Layer2Gate.cpp reads this to stamp the
// "w65816-layer2" function attribute on every function compiled with
// Layer 2 on, so the LTO-time gate can detect mismatched TUs.  Phase
// 1.12 of GAP_CLOSURE_PLAN.md.
cl::opt<bool>
    DbrSafePtrs("w65816-dbr-safe-ptrs",
                // Opt-in only: hidden, and off unless requested, because the
                // description below warns that cross-bank pointers miscompile.
                cl::Hidden, cl::init(false),
                cl::desc("ptr32 derefs use 16-bit stack-rel-indirect-Y, assuming "
                         "the pointer's bank byte matches DBR.  Significantly "
                         "shrinks struct-field-heavy code (Lua's lapi.c: ~3.4× → "
                         "much smaller) at the cost of safety for cross-bank "
                         "pointers (which become a miscompile)."));

W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
                                           const W65816Subtarget &STI)
    : TargetLowering(TM, STI) {
  // Constructor: registers the legal value types / register classes and
  // then declares, per (opcode, type), whether SelectionDAG should treat
  // the operation as Legal, Expand, Promote, LibCall or Custom.  Several
  // registrations are conditional on i32 being a legal type (`ptr32Active`).
  // Statement order matters: a later set*Action call for the same
  // (opcode, type) pair replaces an earlier one.

  // Register classes for the two scalar modes.  The register allocator sees
  // A, X and Y as both 8-bit and 16-bit; a later REP/SEP pass is responsible
  // for ensuring the dynamic mode matches the selected class.
  // i32 is additionally mapped onto the Wide32 register class.
  addRegisterClass(MVT::i8,  &W65816::Acc8RegClass);
  addRegisterClass(MVT::i16, &W65816::Acc16RegClass);
  addRegisterClass(MVT::i32, &W65816::Wide32RegClass);

  // Must follow the addRegisterClass calls: derives type-legality tables
  // from the classes registered above.
  computeRegisterProperties(STI.getRegisterInfo());

  setStackPointerRegisterToSaveRestore(W65816::SP);
  // Comparison results are materialised as exactly 0 or 1.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // GlobalAddress and ExternalSymbol: lower to W65816ISD::Wrapper so a
  // tablegen pattern can fold them into instruction operands.
  setOperationAction(ISD::GlobalAddress,  MVT::i16, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
  setOperationAction(ISD::GlobalAddress,  MVT::i32, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  // FrameIndex i32 has its own DAG-to-DAG path in W65816ISelDAGToDAG.cpp.

  // BR_CC is custom-lowered to a CMP + W65816ISD::BR_CC chain so we can
  // emit the right BEQ/BNE/BCS/BCC mnemonic per condition.
  setOperationAction(ISD::BR_CC,    MVT::i16, Custom);
  setOperationAction(ISD::BR_CC,    MVT::i8,  Custom);
  setOperationAction(ISD::BRCOND,   MVT::Other, Expand);
  setOperationAction(ISD::BR_JT,    MVT::Other, Expand);
  // BRIND (computed-goto `goto *p`, indirectbr IR) has no direct
  // 65816 instruction — JMP (abs) / JMP [abs] read the target pointer
  // from MEMORY, not a register.  Custom-lower to: store the pointer's
  // 16-bit low half (offset within the program's PBR-pinned code bank)
  // to $00B8 (the __indirTarget DP slot already reserved for indirect
  // calls — see libgcc.s), then emit a `JMP ($00B8)` via the BRIND
  // pseudo.  Single-bank assumption on the target's code: same as
  // every other JMP/BRA in our codegen.
  //
  // The ptr is i32 under p:32:16 (current default) — extract sub_lo.
  // Under p:16 (legacy ptr16), it's already i16.
  setOperationAction(ISD::BRIND, MVT::Other, Custom);

  // SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC
  // pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter
  // expands into a Bxx + diamond CFG + PHI.  SETCC funnels through the
  // same path with TVal=1 / FVal=0.  SELECT (no condition operand) is
  // expanded to SELECT_CC by the legalizer using SETNE against zero.
  setOperationAction(ISD::SETCC,     MVT::i16, Custom);
  setOperationAction(ISD::SETCC,     MVT::i8,  Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i8,  Custom);
  setOperationAction(ISD::SELECT,    MVT::i16, Expand);
  setOperationAction(ISD::SELECT,    MVT::i8,  Expand);
  // 65816 has no inline sign-extend instruction; i8 -> i16 sign
  // extension is synthesized in LowerSignExtend.
  // NOTE(review): this comment previously described a bit-7 test +
  // SELECT_CC, while the SEXTLOAD comment below describes a branchless
  // AND/EOR/SBC sequence — confirm which one LowerSignExtend emits.
  setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);

  // BSWAP: no native byte-swap instruction (XBA swaps the two halves
  // of the 16-bit accumulator only when in 8-bit M mode, hard to
  // exploit cleanly).  Lower to shifts + ORs via the generic Expand
  // path — SDAG turns `bswap(i32)` into four byte extracts ORed back
  // together, which our existing patterns handle.  Required for
  // portable C that constructs a big-endian word from byte loads:
  // `((u32)b[0] << 24) | ((u32)b[1] << 16) | ((u32)b[2] << 8) | b[3]`
  // (SHA-256 message-schedule, JPEG/PNG headers, etc.).
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare
  // LDA for the anyext case).  No native sextload; mark it Expand so
  // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`,
  // which then flows through LowerSignExtend's branchless 3-insn
  // sequence (AND #$00FF; EOR #$0080; SEC; SBC #$0080).
  // Applied for every integer result type, not just i16.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);

  // GlobalOpt sometimes narrows a `short` global to `i1` when it sees
  // every assignment is 0 or 1.  Custom-lower so LowerLoad rewrites
  // `zext/sext/anyext from i1` into a plain byte load + appropriate
  // mask.  Both i16 and i8 result widths can appear, depending on
  // whether the consumer wants the value as `short` or `bool`.
  for (MVT ResVT : {MVT::i8, MVT::i16}) {
    setLoadExtAction(ISD::ZEXTLOAD, ResVT, MVT::i1, Custom);
    setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i1, Custom);
    setLoadExtAction(ISD::EXTLOAD,  ResVT, MVT::i1, Custom);
  }

  // Only register i32 ext-load / trunc-store and Custom actions when
  // i32 is actually a legal type (ptr32 mode active).  Otherwise the
  // Custom-action calls intercept i16/i8 ops, and LowerTruncate's
  // SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same
  // root cause as the earlier LOAD-Custom-breaks-LDAptr issue).
  // NOTE(review): i32 is unconditionally given a register class at the
  // top of this constructor, so this flag looks like it is always true
  // as written — confirm whether the ptr16 configuration still exists.
  bool ptr32Active = isTypeLegal(MVT::i32);
  if (ptr32Active) {
    // Extending loads into i32 and truncating stores out of i32 are
    // split into a plain load/store plus an explicit extend/truncate.
    for (MVT MemVT : {MVT::i8, MVT::i16}) {
      setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand);
      setLoadExtAction(ISD::EXTLOAD,  MVT::i32, MemVT, Expand);
      setTruncStoreAction(MVT::i32, MemVT, Expand);
    }
    // Truncating byte stores (`s->c = (char)v`) land as TRUNCSTORE
    // i16->i8 in SDAG after combiner canonicalization.  Custom-route
    // through LowerStore so the ptr-offset peel fires for them too.
    setTruncStoreAction(MVT::i16, MVT::i8, Custom);
  }

  // Vararg support: VASTART writes the address of the first vararg slot
  // to the va_list pointer.  VACOPY/VAEND use the default expansions.
  // This makes <stdarg.h>-style functions (e.g. printf-likes) compile
  // cleanly.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  // Custom VAARG so we DON'T align the va_list pointer.  The default
  // expansion rounds up to the type's preferred alignment (S16 = 2),
  // but caller-pushed args land at PHA's resulting odd S+1 address.
  // Aligning would skip the low byte and read garbage.
  setOperationAction(ISD::VAARG,   MVT::Other, Custom);
  setOperationAction(ISD::VACOPY,  MVT::Other, Expand);
  setOperationAction(ISD::VAEND,   MVT::Other, Expand);

  // C++ exceptions (SJLJ model) — clang lowers exception machinery into
  // these intrinsics via SjLjEHPrepare.  We don't have native handling
  // for setjmp/longjmp on this target; mark them Expand so LegalizeDAG
  // falls back to its no-op stubs (setjmp returns 0, longjmp is a
  // no-op).  The actual EH semantics are provided at runtime by
  // libcxxabi (__cxa_throw etc.) calling _Unwind_SjLj_RaiseException,
  // which in turn longjmps via the function context the prologue
  // prepared.  See runtime/src/libcxxabiSjlj.c for the runtime side.
  // NOTE(review): EH_SJLJ_SETUP_DISPATCH is registered Custom below, not
  // Expand; the original comment described it as an Expand chain
  // pass-through — confirm the Custom handler implements that.
  setOperationAction(ISD::EH_SJLJ_SETJMP,         MVT::i32,   Expand);
  setOperationAction(ISD::EH_SJLJ_SETJMP,         MVT::i16,   Expand);
  setOperationAction(ISD::EH_SJLJ_LONGJMP,        MVT::Other, Expand);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  // SJLJ exception lowering uses FRAMEADDR(0) to read the current frame
  // pointer.  We don't reserve a frame pointer in general; return the
  // entry-SP-equivalent value (current SP read via TSC) — good enough
  // for SJLJ's purpose of identifying the call frame.
  setOperationAction(ISD::FRAMEADDR,              MVT::i16,   Custom);
  setOperationAction(ISD::FRAMEADDR,              MVT::i32,   Custom);
  // stacksave / stackrestore — used by SjLjEHPrepare to save/restore SP
  // around invoke calls.  The default Expand emits CopyFromReg/$SP,
  // which fails because SP has no register class.  The jmp_buf already
  // captures SP via TSC in our setjmp implementation (setjmp/longjmp
  // manage SP directly via TSC/TCS), so these are redundant here.
  // Custom-lower stacksave to a constant 0 (the value is stored into
  // the function context but never used for restoration on our target)
  // and stackrestore to a chain pass-through (no-op).
  setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  // FRAMEADDR is set Custom above for SJLJ; don't set it Expand here
  // (the second setOperationAction would override the first).
  setOperationAction(ISD::RETURNADDR,             MVT::i16,   Expand);
  // W65816 pointers are i32; legalizer queries the action for the pointer
  // type, so register Expand for i32 too. Without this,
  // __builtin_return_address(0) ICEs in LowerOperation (no Custom handler
  // for RETURNADDR).
  setOperationAction(ISD::RETURNADDR,             MVT::i32,   Expand);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET,   MVT::i16,   Expand);
  setOperationAction(ISD::EH_DWARF_CFA,           MVT::i16,   Expand);

  // ISD::TRAP — __builtin_trap(), -fsanitize-trap=undefined.  Default
  // expansion is a libcall to abort(); UBSan-min wants a BRK with a
  // pickup sentinel instead so the trap site is identifiable from a
  // memory dump without a working stdio path.  Custom-lower to a
  // W65816ISD::TRAP target node; the InstrInfo.td pattern routes it
  // to BRK_pseudo, whose AsmPrinter expansion writes 0xBE to $70 and
  // then issues BRK + a self-loop (headless MAME mis-vectors BRK, so
  // the spin is what actually halts).
  setOperationAction(ISD::TRAP,                   MVT::Other, Custom);
  // DEBUGTRAP follows the same shape — same node, same expansion.
  setOperationAction(ISD::DEBUGTRAP,              MVT::Other, Custom);

  // The 65816 has no hardware multiplier or divider.  Multiply by a
  // power-of-two constant is auto-rewritten to shifts by the DAG
  // combiner; arbitrary multiply / divide / mod go through libcalls
  // (`__mulhi3` for i16 multiply etc.).  The libcall expander emits a
  // standard CALL node which flows through LowerCall, so multi-arg
  // call lowering must be working first (it is, see task #26).
  setOperationAction(ISD::MULHU,  MVT::i16, Expand);
  setOperationAction(ISD::MULHS,  MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MUL,    MVT::i16, LibCall);

  // i8 multiply / mulh / div / rem: SDAG narrows e.g. `x / 10` to
  // `mulhu i8 x, -51` + shift when it proves operands fit in i8.
  // The 65816 has no native 8-bit multiplier; route everything
  // through the 16-bit libcalls by Promoting i8 ops to i16.
  setOperationAction(ISD::MUL,   MVT::i8, Promote);
  setOperationAction(ISD::MULHU, MVT::i8, Promote);
  setOperationAction(ISD::MULHS, MVT::i8, Promote);
  setOperationAction(ISD::SDIV,  MVT::i8, Promote);
  setOperationAction(ISD::UDIV,  MVT::i8, Promote);
  setOperationAction(ISD::SREM,  MVT::i8, Promote);
  setOperationAction(ISD::UREM,  MVT::i8, Promote);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
  // CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support.  Expand lets the
  // legalizer rewrite into a sequence of basic ops.  Without
  // this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
  // or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
  // Select" at isel.
  for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTLZ,  VT, Expand);
    setOperationAction(ISD::CTTZ,  VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  // i16 divide / remainder: libcalls; the combined DIVREM nodes are
  // expanded into the separate operations.
  setOperationAction(ISD::SDIV,   MVT::i16, LibCall);
  setOperationAction(ISD::UDIV,   MVT::i16, LibCall);
  setOperationAction(ISD::SREM,   MVT::i16, LibCall);
  setOperationAction(ISD::UREM,   MVT::i16, LibCall);
  setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16, Expand);

  // Variable-amount and large-constant shifts.  We have inline
  // patterns for shift-by-1..4; everything else goes through
  // __ashlhi3 / __lshrhi3 / __ashrhi3.  Setting the action to Custom
  // lets us return SDValue() for the fast cases and route everything
  // else through the libcall lowering helper.
  setOperationAction(ISD::SHL, MVT::i16, Custom);
  setOperationAction(ISD::SRL, MVT::i16, Custom);
  setOperationAction(ISD::SRA, MVT::i16, Custom);
  // i8 shifts go through Custom too — LowerShift detects the i8 result
  // and routes through trunc(i16-shift(zext_or_sext(lhs), amount)).
  // Avoids needing a parallel set of qi3 libcalls.
  setOperationAction(ISD::SHL, MVT::i8, Custom);
  setOperationAction(ISD::SRL, MVT::i8, Custom);
  setOperationAction(ISD::SRA, MVT::i8, Custom);

  // Historical note on LOAD / STORE Custom-lowering: it is intentionally
  // NOT wired unconditionally.  Setting LOAD Custom and returning
  // SDValue() from LowerLoad short-circuits the i16-result LDAptr/
  // STAptr selection paths (the Custom→empty→Legal fall-through doesn't
  // re-enter pattern matching).
  // NOTE(review): the original text here called LowerLoad / LowerStore
  // "currently dead code" and said ptr32 would need a different gating
  // mechanism, but the `ptr32Active` block further down does register
  // LOAD/STORE (i8/i16/i32) as Custom — that remark looks stale; confirm.

  // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying
  // the carry/borrow flag between the two halves of a multi-precision add or
  // sub.  Setting them Legal triggers the type legalizer's carry-chain split
  // for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions)
  // instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions).
  // The matching tablegen pseudos add Defs/Uses on the P register, which
  // tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically.
  setOperationAction(ISD::ADDC, MVT::i16, Legal);
  setOperationAction(ISD::ADDE, MVT::i16, Legal);
  setOperationAction(ISD::SUBC, MVT::i16, Legal);
  setOperationAction(ISD::SUBE, MVT::i16, Legal);

  // i32 (long).  Type legalization splits i32 into two i16 halves; with
  // ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain.
  // AND/OR/XOR split cleanly into per-half ops with no carry to track.
  // Multiply/divide/shift go through libcall stubs whose
  // implementations live in runtime/src/libgcc.s.  SHL_PARTS / SRL_PARTS
  // / SRA_PARTS are the SDNodes the type legalizer emits when splitting
  // a variable-amount shift; without an action they get "Cannot select".
  // LibCall on the parent node routes the whole shift through one
  // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and
  // simpler than implementing a 32-bit shift in 65816 assembly inline.
  // NOTE(review): when ptr32Active is true, the block further down
  // overrides ADD/SUB/AND/OR/XOR and the shifts on i32 to Custom, so the
  // "split into halves" description applies only when i32 is not legal.
  for (MVT VT : {MVT::i32}) {
    // MUL i32 is Custom-lowered: the typical fall-through libcall is
    // __mulsi3 (32x32 -> 32), but when both operands are ZEXT from i16
    // we can emit __umulhisi3 (16x16 -> 32) instead.  Saves ~60 cyc per
    // call on the `(unsigned long)i * i` pattern — see LowerMUL_I32.
    setOperationAction(ISD::MUL,  VT, Custom);
    setOperationAction(ISD::SDIV, VT, LibCall);
    setOperationAction(ISD::UDIV, VT, LibCall);
    setOperationAction(ISD::SREM, VT, LibCall);
    setOperationAction(ISD::UREM, VT, LibCall);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    // i32 shifts route through a libcall via the
    // preferredShiftLegalizationStrategy override (see header).  No
    // explicit SHL/SHL_PARTS action is set in this loop.
    // NOTE(review): SHL/SRL/SRA on i32 ARE set to Custom in the
    // ptr32Active block below — confirm how that interacts with the
    // preferredShiftLegalizationStrategy override.
  }
  // i64 shifts — route to libcall before the type legalizer tries
  // to split via the next-legal-type (which becomes i32 in ptr32 mode
  // and triggers a SDAG combine loop on `i64 >> K` patterns).  By
  // marking SHL/SRL/SRA i64 LibCall here, the operation legalizer
  // picks up the libcall path even though i64 itself is illegal.
  for (MVT VT : {MVT::i64}) {
    setOperationAction(ISD::SHL, VT, LibCall);
    setOperationAction(ISD::SRL, VT, LibCall);
    setOperationAction(ISD::SRA, VT, LibCall);
  }

  // Everything below applies only when i32 is a legal type: i32
  // arithmetic, logic, shifts, extends, truncates, loads/stores,
  // compares, selects and constants are all routed to Custom lowering.
  if (ptr32Active) {
    for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR})
      setOperationAction(Op, MVT::i32, Custom);
    setOperationAction(ISD::SHL, MVT::i32, Custom);
    setOperationAction(ISD::SRL, MVT::i32, Custom);
    setOperationAction(ISD::SRA, MVT::i32, Custom);
    // Note: SIGN_EXTEND i32 is keyed on the RESULT type, so it does not
    // collide with the SIGN_EXTEND i16 registration made earlier.
    setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::i32, Custom);
    // SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16:
    // the combiner emits this for `(int32_t)((int8_t)x)` and for
    // `-(crc & 1ul)` (the i1 case shows up in CRC32 loops).  No
    // tablegen pattern covers the i32 form; Custom-lower to per-half
    // ops.  IMPORTANT: LegalizeDAG looks up the action for
    // SIGN_EXTEND_INREG using the INNER VT (the operand value type),
    // not the result VT.  See LegalizeDAG.cpp:
    //   Action = TLI.getOperationAction(Op, InnerType);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,  Custom);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Custom);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
    // TRUNCATE is keyed on the result type (i16 / i8).
    setOperationAction(ISD::TRUNCATE,    MVT::i16, Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::i8,  Custom);
    setOperationAction(ISD::LOAD,  MVT::i32, Custom);
    setOperationAction(ISD::STORE, MVT::i32, Custom);
    // Also Custom for i16/i8 LOAD/STORE in ptr32 mode so LowerLoad/
    // LowerStore can fold Wide32(Wrapper, WrapperBank) of the same
    // global (or a raw GlobalAddress) to a plain abs-16 access
    // (DBR-relative).  Without this, every `g` access for a
    // same-segment global goes through the 14-byte [dp],y
    // indirect-long path even though the bank is implicit in DBR.
    setOperationAction(ISD::STORE, MVT::i16, Custom);
    setOperationAction(ISD::STORE, MVT::i8,  Custom);
    setOperationAction(ISD::LOAD,  MVT::i16, Custom);
    setOperationAction(ISD::LOAD,  MVT::i8,  Custom);
    // ZEXTLOAD i16-from-i8 also Custom — the DAG combiner folds
    // (zext (load i8 @g)) into one zextload SDNode, so we need to
    // apply the same global-address fold there.  SEXTLOAD from i8
    // already has an Expand action from the earlier setLoadExtAction
    // loop; leave it alone (Custom would require parallel tablegen
    // patterns we don't have).
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, MVT::i8, Custom);
    setOperationAction(ISD::SETCC,     MVT::i32, Custom);
    setOperationAction(ISD::BR_CC,     MVT::i32, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
    setOperationAction(ISD::SELECT,    MVT::i32, Custom);
    setOperationAction(ISD::Constant,  MVT::i32, Custom);
  }

  // Disable jump tables.  A long if-else chain compiles fine without
  // them.  Setting the threshold to UINT_MAX makes LLVM never form a
  // jump table.
  // NOTE(review): the original rationale was that jump tables need
  // BRIND (indirect branch via 16-bit pointer load) "which we don't
  // have"; BRIND is now Custom-lowered earlier in this constructor, so
  // that reason may be out of date — confirm before re-enabling.
  setMinimumJumpTableEntries(UINT_MAX);

  // Variable-length arrays / dynamic stack allocation.  Lowered to
  // `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
  // allocated region.  Limitation: this shifts SP, so any FrameIndex
  // accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
  // (we have no frame pointer).  Suitable for the common pattern
  // "alloca; initialise; pass; return"; complex VLA use mixed with
  // local-variable access across the alloca will miscompile.  A real
  // FP (DP slot or X-as-FP) would lift this restriction.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
  if (ptr32Active)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Target DAG combines (see W65816TargetLowering::PerformDAGCombine).
  //
  // Debugging history, kept for reference — these two registrations
  // were disabled while chasing hangs:
  // setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
  // SHL combine disabled while debugging the ptr32 i64-phi hang.
  // setTargetDAGCombine(ISD::SHL);
  // NOTE(review): the LOAD combine is registered again just below, so
  // only the SHL combine is actually off at this point.

  // Combine STORE / LOAD with const-int i32 pointer to a form that
  // survives LowerI32Constant (which would otherwise split the ptr
  // into a Wide32 reg pair and lose the const-addr fast path).
  // See PerformDAGCombine.
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::LOAD);
}

// Map an LLVM SETCC condition to a W65816 branch condition code.  Only
// conditions that a single Bxx after CMP can test are mapped; every
// other condition yields COND_INVALID, and the caller (normalizeCC) is
// responsible for swapping operands or rewriting the condition first —
// mapCC itself never touches LHS/RHS.
// Unsigned codes use BCS/BCC after CMP.  Signed SETLT/SETGE map to
// BMI/BPL — correct only when the comparison cannot overflow, because
// the 65816 has no native signed branch other than BMI/BPL on a value
// (not on a comparison result).  Values near INT16_MIN/MAX would give
// wrong results on that path; normalizeCC sidesteps this for i16 by
// flipping the sign bit of both operands and comparing unsigned.
// SETGT / SETLE (and SETUGT / SETULE) against a constant are rewritten
// to SETGE / SETLT (SETUGE / SETULT) with constant + 1 in normalizeCC.
static W65816CC::CondCode mapCC(ISD::CondCode CC) {
  // Assume there is no single-branch form, then overwrite the result for
  // the six conditions that translate one-to-one.
  W65816CC::CondCode Mapped = W65816CC::COND_INVALID;
  switch (CC) {
  case ISD::SETEQ:
    Mapped = W65816CC::COND_EQ;
    break;
  case ISD::SETNE:
    Mapped = W65816CC::COND_NE;
    break;
  case ISD::SETUGE:
    Mapped = W65816CC::COND_HS;
    break;
  case ISD::SETULT:
    Mapped = W65816CC::COND_LO;
    break;
  case ISD::SETLT:
    Mapped = W65816CC::COND_MI;
    break;
  case ISD::SETGE:
    Mapped = W65816CC::COND_PL;
    break;
  default:
    // Anything else is left as COND_INVALID for the caller to handle.
    break;
  }
  return Mapped;
}

// If both compare operands are i8, widen them to i16 so the existing
// i16 CMP path can handle them.  Use ZEXT for unsigned/eq/ne CCs and
// SEXT for signed CCs — picking the wrong extension would invert the
// answer (e.g. -1i8 sext to 0xFFFF compares < 1 signed; zext to 0x00FF
// compares > 1 unsigned, which would flip a signed less-than).
static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
                         SelectionDAG &DAG, const SDLoc &DL) {
  // Only the LHS type is inspected; operands that are not i8 are left
  // untouched.
  if (LHS.getValueType() == MVT::i8) {
    // The four signed orderings need sign extension; equality and the
    // unsigned orderings (and any other code) use zero extension.
    const bool IsSignedOrder = CC == ISD::SETLT || CC == ISD::SETLE ||
                               CC == ISD::SETGT || CC == ISD::SETGE;
    const unsigned ExtOpc = IsSignedOrder ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    // Widen both sides with the same opcode so the comparison stays
    // consistent.
    LHS = DAG.getNode(ExtOpc, DL, MVT::i16, LHS);
    RHS = DAG.getNode(ExtOpc, DL, MVT::i16, RHS);
  }
}

// Normalize a (LHS, RHS, CC) triple so the result is something we can
// emit with one CMP + Bxx.  Returns the W65816 condition code; updates
// LHS/RHS/CC in place.  Returns COND_INVALID on failure.
static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
                                      ISD::CondCode &CC, SelectionDAG &DAG,
                                      const SDLoc &DL) {
  // Contract:
  //   LHS, RHS, CC  in/out — rewritten in place (operands may be widened,
  //                 swapped, XOR'd with the sign bit, or have a constant
  //                 bumped by one; CC is updated to match).
  //   returns       the W65816 condition code to branch on, one of the
  //                 *_MB multi-branch codes when no single-branch form
  //                 exists, or COND_INVALID if CC is not handled at all.

  // Step 1: i8 operands are widened to i16 (sign- or zero-extended
  // according to CC) so only the i16 CMP path is needed.
  promoteI8Cmp(LHS, RHS, CC, DAG, DL);

  // Step 2: CMP wants the comparand (constant or memory) on the right.
  // If a DAG pre-pass put the constant on the left, swap and flip the
  // condition.
  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
    std::swap(LHS, RHS);
    CC = ISD::getSetCCSwappedOperands(CC);
  }

  // Step 3: signed compare via "EOR with sign bit then unsigned compare":
  //   a < b (signed)  iff  (a ^ 0x8000) < (b ^ 0x8000) (unsigned)
  // The XOR flips the sign bit, which converts signed-int ordering to
  // unsigned-int ordering on the same bits.  This avoids the WDC's
  // missing "BLT signed" — BMI/BPL alone read the sign of (a-b)
  // without the V-flag overflow correction, giving wrong results
  // when the subtraction overflows (e.g., INT16_MIN < 1 produced
  // false because (-32768 - 1) = +32767 has N=0).  After the EOR
  // transform we use BCC/BCS which depend on the carry from CMP and
  // don't suffer overflow corruption.
  //
  // Cost: 1 EOR per operand (3 bytes each in M=16) — comparable to
  // the V-aware multi-branch sequence (5+ bytes of branches), but
  // happens at SDAG time so subsequent SDAG combining can fold
  // EORs against constants or already-EOR'd values.
  //
  // Only applied to i16 operands; the 0x8000 mask is the i16 sign bit.
  bool SignedCmp = (CC == ISD::SETLT || CC == ISD::SETLE ||
                    CC == ISD::SETGT || CC == ISD::SETGE);
  if (SignedCmp && LHS.getValueType() == MVT::i16) {
    EVT VT = LHS.getValueType();
    SDValue Mask = DAG.getConstant(0x8000, DL, VT);
    LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, Mask);
    RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, Mask);
    switch (CC) {
    case ISD::SETLT: CC = ISD::SETULT; break;
    case ISD::SETLE: CC = ISD::SETULE; break;
    case ISD::SETGT: CC = ISD::SETUGT; break;
    case ISD::SETGE: CC = ISD::SETUGE; break;
    default: break;
    }
  }

  // Step 4: rewrite "<= C" / "> C" to "< C+1" / ">= C+1" when the RHS is
  // a constant, so the variable stays on the LHS and BCS / BCC (or
  // BMI / BPL) can be used natively.  (SETLE / SETGT have already been
  // converted to their unsigned counterparts above for i16; this handles
  // original SETULE/SETUGT and the post-transform SETULE/SETUGT.)
  //
  // The arithmetic is done on the constant's own APInt so the overflow
  // guard and the incremented value always use the operand's real bit
  // width.  The previous form masked the value to 16 bits, which would
  // silently truncate the constant of any wider compare (e.g. turn
  // `x >u 0x10000` into `x >=u 1`).  For i16 operands the result is
  // unchanged.
  if (auto *RhsConst = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt C = RhsConst->getAPIntValue();
    const EVT RhsVT = RHS.getValueType();
    // C + 1 must not wrap: skip the unsigned rewrite at the all-ones
    // value and the signed rewrite at the maximum signed value.
    if (CC == ISD::SETULE && !C.isMaxValue()) {
      RHS = DAG.getConstant(C + 1, DL, RhsVT);
      CC = ISD::SETULT;
    } else if (CC == ISD::SETUGT && !C.isMaxValue()) {
      RHS = DAG.getConstant(C + 1, DL, RhsVT);
      CC = ISD::SETUGE;
    } else if (CC == ISD::SETLE && !C.isMaxSignedValue()) {
      // Reachable only when the SignedCmp transform was skipped, i.e.
      // for operand types other than i16.
      RHS = DAG.getConstant(C + 1, DL, RhsVT);
      CC = ISD::SETLT;
    } else if (CC == ISD::SETGT && !C.isMaxSignedValue()) {
      RHS = DAG.getConstant(C + 1, DL, RhsVT);
      CC = ISD::SETGE;
    }
  }

  // Step 5: try the direct mapping.
  W65816CC::CondCode TCC = mapCC(CC);
  if (TCC == W65816CC::COND_INVALID) {
    // Try swapping operands first — preferable since it leaves us with
    // a single-Bxx form.  But reject the swap if it would put a load on
    // the LHS (we can't pattern-match cmp(load,reg) without spilling A).
    bool RhsIsLoad = isa<LoadSDNode>(RHS.getNode());
    bool LhsIsLoad = isa<LoadSDNode>(LHS.getNode());
    bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad;
    if (!SwapWouldHurt) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
      TCC = mapCC(CC);
    }
  }
  // Final fallback: GT/LE/UGT/ULE without a useful swap target.  Use a
  // multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it
  // into a 3-BB diamond.  Only valid for SELECT_CC, not for BR_CC —
  // LowerBR_CC re-routes those through SETCC + BR_CC NE.
  if (TCC == W65816CC::COND_INVALID) {
    switch (CC) {
    case ISD::SETGT:  TCC = W65816CC::COND_GT_MB; break;
    case ISD::SETLE:  TCC = W65816CC::COND_LE_MB; break;
    case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break;
    case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break;
    default: break;
    }
  }
  return TCC;
}

// Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/
// I32Bin/BR_CC to construct or destructure i32 SDValues across the
// sub_lo / sub_hi halves of the Wide32 register class.
static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL,
                           SDValue Lo, SDValue Hi) {
  // REG_SEQUENCE operand layout: the register-class id first, then one
  // (value, sub-register index) pair per half.  The braced list is
  // evaluated left to right, so the target constants are created in the
  // order: class id, sub_lo, sub_hi.
  SDValue SeqOps[] = {
      DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32),
      Lo,
      DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32),
      Hi,
      DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32),
  };
  SDNode *Seq =
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32, SeqOps);
  // Result 0 of the machine node is the assembled i32 value.
  return SDValue(Seq, 0);
}
// Look through a buildWide32(Lo, Hi) -> REG_SEQUENCE(RC, Lo, sub_lo,
// Hi, sub_hi) pair: if X is exactly that machine node, return the
// matching half operand directly.  Avoids a TargetExtractSubreg that
// would re-enter the SDAG combiner and re-build the i32 constant /
// pair, looping forever (observed as OOM in the combiner on `*t = 0`).
static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) {
  // Only a (non-null) REG_SEQUENCE machine node can be looked through.
  const bool IsRegSeq = X.getNode() && X.isMachineOpcode() &&
                        X.getMachineOpcode() == TargetOpcode::REG_SEQUENCE;
  if (!IsRegSeq)
    return SDValue();

  // Operand 0 is the register class; the remaining operands come in
  // (value, sub-register index) pairs.  Return the value paired with
  // the first index equal to WantSub.
  const unsigned NumOps = X.getNumOperands();
  for (unsigned ValIdx = 1; ValIdx + 1 < NumOps; ValIdx += 2) {
    const auto *Idx = dyn_cast<ConstantSDNode>(X.getOperand(ValIdx + 1));
    if (Idx && Idx->getZExtValue() == WantSub)
      return X.getOperand(ValIdx);
  }
  // No matching half: hand back an empty SDValue so callers can fall
  // back to another strategy.
  return SDValue();
}
// Produce the low 16-bit half of the i32 value X.
static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  // Wraps a target symbol node as the 16-bit (low) half of its address.
  auto WrapLo = [&](SDValue TargetSym) {
    return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, TargetSym);
  };

  // Integer constant: emit the low 16 bits as an i16 constant directly.
  // A sub-register extract of a Constant node yields a malformed
  // MachineSDNode (constants carry no sub-registers) and provokes
  // SDAG combine loops downstream.
  if (const auto *Imm = dyn_cast<ConstantSDNode>(X))
    return DAG.getConstant(Imm->getZExtValue() & 0xFFFFu, DL, MVT::i16);

  // GlobalAddress / ExternalSymbol nodes that have not been lowered yet
  // (store lowering can run before LowerOperation has split them into a
  // Wide32 pair): build a fresh Wrapper around the target symbol.  A
  // sub-register extract on such a node is malformed as well, since a
  // non-register has no sub-register information.
  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(X))
    return WrapLo(DAG.getTargetGlobalAddress(Global->getGlobal(), DL,
                                             MVT::i16, Global->getOffset()));
  if (const auto *Sym = dyn_cast<ExternalSymbolSDNode>(X))
    return WrapLo(DAG.getTargetExternalSymbol(Sym->getSymbol(), MVT::i16));

  // A REG_SEQUENCE already holds the half as an operand; otherwise fall
  // back to a real sub-register extract.
  SDValue FromSeq = lookThroughRegSeq(X, llvm::sub_lo);
  return FromSeq ? FromSeq
                 : DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X);
}
// Produce the high 16-bit half of the i32 value X.  Mirrors
// extractWide32Lo, using bits 16..31 for constants and WrapperBank for
// symbol addresses.
static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  // Wraps a target symbol node as the high (bank) half of its address.
  auto WrapBank = [&](SDValue TargetSym) {
    return DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, TargetSym);
  };

  // Integer constant: bits 16..31 become an i16 constant.
  if (const auto *Imm = dyn_cast<ConstantSDNode>(X))
    return DAG.getConstant((Imm->getZExtValue() >> 16) & 0xFFFFu, DL,
                           MVT::i16);

  // Not-yet-lowered symbol addresses get a fresh WrapperBank node.
  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(X))
    return WrapBank(DAG.getTargetGlobalAddress(Global->getGlobal(), DL,
                                               MVT::i16, Global->getOffset()));
  if (const auto *Sym = dyn_cast<ExternalSymbolSDNode>(X))
    return WrapBank(DAG.getTargetExternalSymbol(Sym->getSymbol(), MVT::i16));

  // Prefer the operand already present in a REG_SEQUENCE; otherwise
  // emit a sub-register extract.
  SDValue FromSeq = lookThroughRegSeq(X, llvm::sub_hi);
  return FromSeq ? FromSeq
                 : DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X);
}

// Try to split an i32 pointer of the form `Base + K` into a Wide32 base
// and an unsigned 16-bit offset K (1 <= K <= 0xFFFF) suitable for the
// Y register of a `[dp],Y` access.  Two shapes are recognised:
//
//   1. `ISD::ADD(Base, i32 const)` (constant on either side) — the
//      pre-LowerI32Bin form.  OutBase is the non-constant operand as-is.
//   2. `REG_SEQUENCE(ADDC(BaseLo, KLo), sub_lo,
//                    ADDE(BaseHi,   0,  carry), sub_hi)` — the form
//      produced by LowerI32Bin for `(add Wide32, const)` when the high
//      half of the constant is 0.  OutBase is
//      buildWide32(BaseLo, BaseHi).
//
// Returns true and fills OutBase / OutOff on a successful peel; returns
// false (leaving both outputs untouched) otherwise, including for a
// null Ptr, a non-i32 Ptr, or a zero offset.
//
// The bank-byte carry-in is intentionally dropped: the `[dp],Y` deref
// adds Y to the 24-bit pointer without propagating beyond 16 bits.
// Caller's responsibility that the target object doesn't span a bank.
static bool peelPtr32Offset(SelectionDAG &DAG, SDLoc DL, SDValue Ptr,
                            SDValue &OutBase, uint16_t &OutOff) {
  // Guard first: every accessor below dereferences the node, so a null
  // check any later than this would be dead code.
  if (!Ptr.getNode()) return false;
  if (Ptr.getValueType() != MVT::i32) return false;
  // Pre-LowerI32Bin shape: `ISD::ADD(BaseWide32, i32 const)`.  LowerLoad
  // runs before LowerI32Bin in legalization order, so the ADD is still
  // visible as an ISD::ADD when LowerLoad inspects Ptr.
  if (Ptr.getOpcode() == ISD::ADD) {
    SDValue L = Ptr.getOperand(0);
    SDValue R = Ptr.getOperand(1);
    auto *KC = dyn_cast<ConstantSDNode>(R);
    if (!KC) {
      // Constant may sit on the left; the base is then the right operand.
      KC = dyn_cast<ConstantSDNode>(L);
      if (!KC) return false;
      L = R;
    }
    uint64_t K = KC->getZExtValue();
    // Zero gains nothing; anything above 0xFFFF does not fit the
    // 16-bit offset.
    if (K == 0 || K > 0xFFFFu) return false;
    OutOff = (uint16_t)K;
    OutBase = L;
    return true;
  }
  // Post-LowerI32Bin shape (REG_SEQUENCE of ADDC/ADDE).  May not occur
  // in practice given the ADD path above, but kept for robustness.
  if (!Ptr.isMachineOpcode()) return false;
  if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return false;
  SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
  SDValue Hi = lookThroughRegSeq(Ptr, llvm::sub_hi);
  if (!Lo || !Hi) return false;
  if (Lo.getOpcode() != ISD::ADDC) return false;
  if (Hi.getOpcode() != ISD::ADDE) return false;
  // The ADDE must consume the carry produced by this very ADDC.
  if (Hi.getOperand(2) != Lo.getValue(1)) return false;
  auto *KLo = dyn_cast<ConstantSDNode>(Lo.getOperand(1));
  auto *KHi = dyn_cast<ConstantSDNode>(Hi.getOperand(1));
  if (!KLo || !KHi) return false;
  // High half of the constant must be 0 for the offset to fit 16 bits.
  if (KHi->getZExtValue() != 0) return false;
  uint64_t K = KLo->getZExtValue() & 0xFFFFu;
  if (K == 0) return false;
  OutOff = (uint16_t)K;
  OutBase = buildWide32(DAG, DL, Lo.getOperand(0), Hi.getOperand(0));
  return true;
}

// Custom lowering for ISD::BR_CC (chain, cc, lhs, rhs, dest).
//   * i32 operands: reduce the compare to an i16 boolean built from
//     per-half compares and branch on (bool != 0).
//   * Otherwise: map the condition through normalizeCC and emit
//     W65816ISD::CMP + W65816ISD::BR_CC, or reroute multi-branch
//     conditions through SETCC.
SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  const EVT CmpVT = LHS.getValueType();

  // Emits `br_cc setne Flag, 0, Dest` on the incoming chain.
  auto BranchIfNonZero = [&](SDValue Flag) {
    SDValue Zero = DAG.getConstant(0, DL, Flag.getValueType());
    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
                       DAG.getCondCode(ISD::SETNE), Flag, Zero, Dest);
  };

  // i32 BR_CC.  Handled here rather than by the legalizer's generic
  // Expand, which re-enters our SETCC/BR_CC custom paths in an infinite
  // loop.
  if (CmpVT == MVT::i32) {
    SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
    SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
    SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
    SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);
    const bool IsEquality = CC == ISD::SETEQ || CC == ISD::SETNE;

    // Fast path: i32 ==/!= 0 becomes (lo | hi) ==/!= 0, so the branch
    // tests the OR directly instead of materialising two i16 setccs,
    // an AND and (for NE) an XOR.  Hot in `while (x)` and i32-counter
    // loop tests.
    if (IsEquality && isNullConstant(RHS)) {
      SDValue AnyBits = DAG.getNode(ISD::OR, DL, MVT::i16, LhsLo, LhsHi);
      SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
      return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
                         DAG.getCondCode(CC), AnyBits, Zero16, Dest);
    }

    // General equality: both halves equal; NE flips the 0/1 result.
    if (IsEquality) {
      SDValue LoSame = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, ISD::SETEQ);
      SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
      SDValue Same = DAG.getNode(ISD::AND, DL, MVT::i16, LoSame, HiSame);
      if (CC == ISD::SETEQ)
        return BranchIfNonZero(Same);
      SDValue One = DAG.getConstant(1, DL, MVT::i16);
      return BranchIfNonZero(DAG.getNode(ISD::XOR, DL, MVT::i16, Same, One));
    }

    // Ordered compare:
    //   (a CC b) = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b)
    // HiStrict is the strict form of CC (LE -> LT etc.) so the tie-break
    // term covers the equal-high-half case.  The low-half compare is
    // always unsigned because the high half carries the sign.
    struct HalfCompare {
      ISD::CondCode Full;
      ISD::CondCode HiStrict;
      ISD::CondCode LoUnsigned;
    };
    static const HalfCompare Splits[] = {
        {ISD::SETLT, ISD::SETLT, ISD::SETULT},
        {ISD::SETLE, ISD::SETLT, ISD::SETULE},
        {ISD::SETGT, ISD::SETGT, ISD::SETUGT},
        {ISD::SETGE, ISD::SETGT, ISD::SETUGE},
        {ISD::SETULT, ISD::SETULT, ISD::SETULT},
        {ISD::SETULE, ISD::SETULT, ISD::SETULE},
        {ISD::SETUGT, ISD::SETUGT, ISD::SETUGT},
        {ISD::SETUGE, ISD::SETUGT, ISD::SETUGE},
    };
    const HalfCompare *Split = nullptr;
    for (const HalfCompare &Candidate : Splits) {
      if (Candidate.Full == CC) {
        Split = &Candidate;
        break;
      }
    }
    if (!Split)
      report_fatal_error("W65816: unexpected i32 BR_CC condition");

    SDValue HiWins =
        DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, Split->HiStrict);
    SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
    SDValue LoWins =
        DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, Split->LoUnsigned);
    SDValue TieBreak = DAG.getNode(ISD::AND, DL, MVT::i16, HiSame, LoWins);
    return BranchIfNonZero(
        DAG.getNode(ISD::OR, DL, MVT::i16, HiWins, TieBreak));
  }

  // Narrow compare: translate to a target condition code.
  W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: branch condition not yet implemented");

  // Multi-branch condition codes are only expanded by the SELECT_CC16
  // inserter.  For a branch, materialise the boolean through SETCC and
  // branch on it being non-zero; costs one extra LDA but always works.
  if (TCC >= W65816CC::COND_GT_MB) {
    SDValue Flag = DAG.getNode(ISD::SETCC, DL, CmpVT, LHS, RHS,
                               DAG.getCondCode(CC));
    return BranchIfNonZero(Flag);
  }

  // Single-branch condition: glue the compare to the target branch.
  SDValue CmpGlue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
  return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CondOp,
                     CmpGlue);
}

// LowerBRIND — `brind (chain, target_ptr)`.  Computed-goto / IR
// `indirectbr` lowers to BRIND with a pointer-typed target.  Under
// p:32:16 (default datalayout) that pointer is i32, so the generic
// legalizer's "Cannot select brind" path fires unless we step in.
//
// Lowering strategy (mirrors __jsl_indir's mechanism):
//   1. If target is i32 (Wide32), extract sub_lo — only the 16-bit
//      offset within PBR matters because JMP (abs) keeps current PBR.
//   2. Store that i16 to constant address $00B8 — the shared
//      __indirTarget DP slot.  Pinned at $00B8 so JMP (abs)'s bank-0
//      vector fetch reads it regardless of DBR / segment placement
//      (see libgcc.s for the full rationale).
//   3. Emit W65816ISD::BR_IND with the chained store — the BRINDpseudo
//      tablegen pattern selects to JMP_AbsInd $00B8.
SDValue W65816TargetLowering::LowerBRIND(SDValue Op,
                                         SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue InChain = Op.getOperand(0);
  SDValue JumpTarget = Op.getOperand(1);
  const EVT TargetVT = JumpTarget.getValueType();

  // Only i32 and i16 targets are handled.  Anything else shouldn't
  // happen with the current type legalization; if it does, defer to
  // the legalizer.
  if (TargetVT != MVT::i32 && TargetVT != MVT::i16)
    return SDValue();

  // Reduce the target to its 16-bit in-bank offset: for an i32 pointer
  // that is the low half, which is what JMP indirect dispatches
  // through.
  SDValue Offset16 = (TargetVT == MVT::i32)
                         ? extractWide32Lo(DAG, DL, JumpTarget)
                         : JumpTarget;

  // Store the offset to $00B8.  The (store Acc16, (iPTR timm)) tablegen
  // pattern lowers this to STAabs ($00B8) — the AsmPrinter routes
  // bank-0 const-int stores to STA_Abs (3 bytes, DBR-relative).  With
  // DP=0 at runtime, `sta $00B8` lands at $00:00B8 == DP slot $B8,
  // which is where __jsl_indir reads via `jmp ($00B8)`.
  //
  // CRITICAL: the address must be a TargetConstant, not a Constant.  A
  // plain i32 Constant would be Custom-lowered by LowerI32Constant into
  // REG_SEQUENCE(0xB8, 0); LowerStore would then no longer see a clean
  // ConstantSDNode at Ptr and would send the i16 store down the generic
  // ST_PTR slow path ([E0],Y indirect-long with full Wide32 address
  // staging).  That adds significant Wide32 register pressure —
  // multi-cgoto VM interpreters with several BRINDs in one function
  // then over-pressure the regalloc and abort with "ran out of
  // registers".  With a TargetConstant the tablegen pattern at
  // InstrInfo.td:433 fires directly: `sta $b8` — one instruction, no
  // Wide32 vreg, no DPF0/DPF1 staging.
  const EVT AddrVT = getPointerTy(DAG.getDataLayout());
  SDValue SlotAddr = DAG.getTargetConstant(0x00B8, DL, AddrVT);
  SDValue SlotStore =
      DAG.getStore(InChain, DL, Offset16, SlotAddr, MachinePointerInfo());

  // W65816ISD::BR_IND takes only a chain — the jump target is implicit
  // ($00B8).  Chaining it on the store orders the store before the JMP.
  return DAG.getNode(W65816ISD::BR_IND, DL, MVT::Other, SlotStore);
}

// Custom lowering for ISD::SETCC (lhs, rhs, cc).
SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const EVT BoolVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Narrow operands:  setcc lhs, rhs, cc  ->  select_cc lhs, rhs, 1, 0, cc.
  // The SELECT_CC re-enters LowerOperation and is lowered through the
  // diamond-CFG path; setBooleanContents(ZeroOrOne) means callers see
  // a clean 0/1 result.
  if (LHS.getValueType() != MVT::i32) {
    SDValue One = DAG.getConstant(1, DL, BoolVT);
    SDValue Zero = DAG.getConstant(0, DL, BoolVT);
    return DAG.getNode(ISD::SELECT_CC, DL, BoolVT, LHS, RHS, One, Zero,
                       DAG.getCondCode(CC));
  }

  // i32 operands: compare per half.  The result keeps the node's own
  // (narrow) boolean type regardless of the operand width.
  SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
  SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
  SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
  SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);
  const bool IsEquality = CC == ISD::SETEQ || CC == ISD::SETNE;

  // Fast path: i32 ==/!= 0 becomes (lo | hi) ==/!= 0 — one i16 OR plus
  // one i16 setcc instead of two setccs + AND (+ XOR for NE).
  if (IsEquality && isNullConstant(RHS)) {
    SDValue AnyBits = DAG.getNode(ISD::OR, DL, MVT::i16, LhsLo, LhsHi);
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    return DAG.getSetCC(DL, BoolVT, AnyBits, Zero16, CC);
  }

  // General equality: both halves equal; NE flips the 0/1 result.
  if (IsEquality) {
    SDValue LoSame = DAG.getSetCC(DL, BoolVT, LhsLo, RhsLo, ISD::SETEQ);
    SDValue HiSame = DAG.getSetCC(DL, BoolVT, LhsHi, RhsHi, ISD::SETEQ);
    SDValue Same = DAG.getNode(ISD::AND, DL, BoolVT, LoSame, HiSame);
    if (CC == ISD::SETEQ)
      return Same;
    return DAG.getNode(ISD::XOR, DL, BoolVT, Same,
                       DAG.getConstant(1, DL, BoolVT));
  }

  // Ordered compare:
  //   (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoUnsigned lo_b)
  // with HiStrict the strict form of CC and the low-half compare always
  // unsigned.
  struct HalfCompare {
    ISD::CondCode Full;
    ISD::CondCode HiStrict;
    ISD::CondCode LoUnsigned;
  };
  static const HalfCompare Splits[] = {
      {ISD::SETLT, ISD::SETLT, ISD::SETULT},
      {ISD::SETLE, ISD::SETLT, ISD::SETULE},
      {ISD::SETGT, ISD::SETGT, ISD::SETUGT},
      {ISD::SETGE, ISD::SETGT, ISD::SETUGE},
      {ISD::SETULT, ISD::SETULT, ISD::SETULT},
      {ISD::SETULE, ISD::SETULT, ISD::SETULE},
      {ISD::SETUGT, ISD::SETUGT, ISD::SETUGT},
      {ISD::SETUGE, ISD::SETUGT, ISD::SETUGE},
  };
  const HalfCompare *Split = nullptr;
  for (const HalfCompare &Candidate : Splits) {
    if (Candidate.Full == CC) {
      Split = &Candidate;
      break;
    }
  }
  if (!Split)
    report_fatal_error("W65816: unexpected i32 SETCC condition");

  SDValue HiWins = DAG.getSetCC(DL, BoolVT, LhsHi, RhsHi, Split->HiStrict);
  SDValue HiSame = DAG.getSetCC(DL, BoolVT, LhsHi, RhsHi, ISD::SETEQ);
  SDValue LoWins = DAG.getSetCC(DL, BoolVT, LhsLo, RhsLo, Split->LoUnsigned);
  SDValue TieBreak = DAG.getNode(ISD::AND, DL, BoolVT, HiSame, LoWins);
  return DAG.getNode(ISD::OR, DL, BoolVT, HiWins, TieBreak);
}

// Custom lowering for ISD::SELECT_CC (lhs, rhs, tval, fval, cc).
SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  const bool WideResult = Op.getValueType() == MVT::i32;

  // Splits the i32 TVal/FVal into halves and selects each half with its
  // own i16 SELECT_CC on the shared condition (CmpL Cond CmpR), then
  // reassembles the Wide32 result.
  auto SelectHalves = [&](SDValue CmpL, SDValue CmpR, ISD::CondCode Cond) {
    SDValue TrueLo = extractWide32Lo(DAG, DL, TVal);
    SDValue TrueHi = extractWide32Hi(DAG, DL, TVal);
    SDValue FalseLo = extractWide32Lo(DAG, DL, FVal);
    SDValue FalseHi = extractWide32Hi(DAG, DL, FVal);
    SDValue PickLo = DAG.getSelectCC(DL, CmpL, CmpR, TrueLo, FalseLo, Cond);
    SDValue PickHi = DAG.getSelectCC(DL, CmpL, CmpR, TrueHi, FalseHi, Cond);
    return buildWide32(DAG, DL, PickLo, PickHi);
  };

  // i32 compare operands: turn the compare into an i16 boolean (via the
  // i32 SETCC lowering) and select on (bool != 0).  This avoids an i32
  // W65816::CMP, for which there is no pattern.
  if (LHS.getValueType() == MVT::i32) {
    SDValue Flag = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC);
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    if (WideResult)
      return SelectHalves(Flag, Zero16, ISD::SETNE);
    return DAG.getSelectCC(DL, Flag, Zero16, TVal, FVal, ISD::SETNE);
  }

  // Narrow compare but i32 result: per-half selects on the original
  // condition.
  if (WideResult)
    return SelectHalves(LHS, RHS, CC);

  // Narrow compare, narrow result: glue a target compare to the target
  // select.
  W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: select_cc condition not yet implemented");

  SDValue CmpGlue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
  // The node has exactly one result (the selected value), per its
  // SDTypeProfile.  A two-VT list (value + Glue) here is wrong and
  // trips an SDNode-validity assertion in assertions builds.
  SDValue SelectOps[] = {TVal, FVal, CondOp, CmpGlue};
  return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), SelectOps);
}

// i8 -> i16 sign extend.  Branchless 3-instruction trick:
//   sext(x) = ((x & 0xFF) ^ 0x80) - 0x80
// Verify:  x=0x00 -> 0x80 - 0x80 = 0x0000.  x=0x7F -> 0xFF - 0x80 = 0x7F.
//          x=0x80 -> 0x00 - 0x80 = 0xFF80 (-128).  x=0xFF -> 0x7F - 0x80
//          = 0xFFFF (-1).
// Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080  (10 bytes total,
// no branches, no temp slots — much cheaper than the SELECT_CC diamond
// version that produced ~14 instructions plus stack spills).
SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  // Only the i8 -> i16 case is handled; everything else is left to the
  // caller by returning an empty SDValue.
  const bool IsI8ToI16 =
      Src.getValueType() == MVT::i8 && Op.getValueType() == MVT::i16;
  if (!IsI8ToI16)
    return SDValue();

  SDLoc DL(Op);
  // sext(x) = (zext(x) ^ 0x80) - 0x80: flipping bit 7 and subtracting it
  // back out propagates the sign into the upper byte.
  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Src);
  SDValue SignBit = DAG.getConstant(0x0080, DL, MVT::i16);
  SDValue Flipped = DAG.getNode(ISD::XOR, DL, MVT::i16, Widened, SignBit);
  return DAG.getNode(ISD::SUB, DL, MVT::i16, Flipped, SignBit);
}

// ptr32 foundation hook.  In ptr16 mode (PointerWidth=16, current
// default) addresses are i16 and we return SDValue() so the legalizer
// keeps the load and the existing LDAptr / STAptr selection patterns
// match.  In ptr32 mode addresses are i32 and we wrap the load in
// W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter
// can take the bank byte from sub_hi instead of forcing 0.
//
// Byte loads (zextload, anyext, true i8) keep going through the i16
// LDA + AND #$FF idiom — same trick the existing LDAptr uses; for
// ptr32 mode the load is still 16 bits, just bank-explicit.
SDValue W65816TargetLowering::LowerLoad(SDValue Op,
                                        SelectionDAG &DAG) const {
  LoadSDNode *Ld = cast<LoadSDNode>(Op);
  SDValue Chain = Ld->getChain();
  SDValue Ptr   = Ld->getBasePtr();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Const-int address: leave the SDAG alone so the tablegen pattern
  // `(load (iPTR imm))` → LDA8long fires (bank-explicit).  See the
  // mirrored short-circuit at the top of LowerStore.
  if (isa<ConstantSDNode>(Ptr) && (VT == MVT::i8 || VT == MVT::i16))
    return SDValue();

  // i32 LOAD: split into two i16 loads at offsets 0 and 2 then
  // REG_SEQUENCE the halves into a Wide32.  Address may be i16 (stack
  // slot, global) or i32 (ptr32 deref); the recursive ADD handles
  // address arithmetic correctly via LowerI32Bin.
  if (VT == MVT::i32) {
    EVT PtrVT = Ptr.getValueType();
    SDValue Two = DAG.getConstant(2, DL, PtrVT);
    SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
    SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr,
                             Ld->getPointerInfo(),
                             Ld->getAlign(),
                             Ld->getMemOperand()->getFlags());
    SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2,
                             Ld->getPointerInfo().getWithOffset(2),
                             Ld->getAlign(),
                             Ld->getMemOperand()->getFlags());
    // Both half-loads hang off the incoming chain; merge their output
    // chains so users of the original load's chain wait for both.
    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                   Lo.getValue(1), Hi.getValue(1));
    SDValue Val = buildWide32(DAG, DL, Lo, Hi);
    return DAG.getMergeValues({Val, NewChain}, DL);
  }

  // Same fold as LowerStore: a Wide32 ptr built from Wrapper +
  // WrapperBank of the same global, OR a raw GlobalAddress, lets us
  // emit an abs-16 (DBR-relative) load (LDA / LDA8abs) instead of
  // the slower [dp],Y indirect-long.  Our globals are in the load
  // segment that crt0 pins to DBR.
  SDValue FoldedLo;
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
    FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
        DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
                                    GA->getOffset()));
  } else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Ptr)) {
    FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
        DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16));
  } else {
    // lookThroughRegSeq yields an empty SDValue unless Ptr is a
    // REG_SEQUENCE that carries the requested sub-register, so this
    // branch is a no-op for every other pointer shape.
    SDValue PLo = lookThroughRegSeq(Ptr, llvm::sub_lo);
    SDValue PHi = lookThroughRegSeq(Ptr, llvm::sub_hi);
    if (PLo && PHi &&
        PLo.getOpcode() == W65816ISD::Wrapper &&
        PHi.getOpcode() == W65816ISD::WrapperBank) {
      SDValue WLo = PLo.getOperand(0);
      SDValue WHi = PHi.getOperand(0);
      auto *GLo = dyn_cast<GlobalAddressSDNode>(WLo);
      auto *GHi = dyn_cast<GlobalAddressSDNode>(WHi);
      auto *ELo = dyn_cast<ExternalSymbolSDNode>(WLo);
      auto *EHi = dyn_cast<ExternalSymbolSDNode>(WHi);
      // Fold only when both halves name the very same symbol (and, for
      // globals, the same offset).
      bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() &&
                         GLo->getOffset() == GHi->getOffset());
      bool SameExtern = (ELo && EHi &&
                         StringRef(ELo->getSymbol()) == EHi->getSymbol());
      if (SameGlobal || SameExtern)
        FoldedLo = PLo;
    }
  }
  if (FoldedLo) {
    EVT MemVT = Ld->getMemoryVT();
    ISD::LoadExtType ExtType = Ld->getExtensionType();
    if (ExtType == ISD::NON_EXTLOAD && MemVT == Op.getValueType()) {
      return DAG.getLoad(Op.getValueType(), DL, Chain, FoldedLo,
                         Ld->getPointerInfo(),
                         Ld->getAlign(),
                         Ld->getMemOperand()->getFlags());
    }
    // i1 memory type comes from GlobalOpt narrowing `short` globals
    // whose only assignments are 0/1.  Treat as i8 load + appropriate
    // mask — the underlying memory is still byte-sized.
    if (MemVT == MVT::i1) {
      SDValue ByteLd = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i16, Chain,
                                       FoldedLo, MVT::i8,
                                       Ld->getMemOperand());
      SDValue Val = ByteLd;
      if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) {
        Val = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd,
                          DAG.getConstant(1, DL, MVT::i16));
      } else if (ExtType == ISD::SEXTLOAD) {
        // i1 sign-extend: bit 0 -> all bits.  AND #1 then NEG.
        SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd,
                                  DAG.getConstant(1, DL, MVT::i16));
        Val = DAG.getNode(ISD::SUB, DL, MVT::i16,
                          DAG.getConstant(0, DL, MVT::i16), Bit);
      }
      if (Op.getValueType() == MVT::i8)
        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
      return DAG.getMergeValues({Val, ByteLd.getValue(1)}, DL);
    }
    return DAG.getExtLoad(ExtType, DL, Op.getValueType(), Chain, FoldedLo,
                          MemVT, Ld->getMemOperand());
  }

  // ptr16 mode: address is i16, let the default selection handle it.
  if (Ptr.getValueType() != MVT::i32)
    return SDValue();

  EVT MemVT = Ld->getMemoryVT();
  // Widen i1 memVT to i8 (single-byte storage).  getMemIntrinsicNode
  // asserts memvt must be supported; i1 isn't.
  if (MemVT == MVT::i1) MemVT = MVT::i8;
  SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other);
  // Try to peel a constant offset from Ptr and route through
  // LD_PTR_OFF — folds `(ptr + K)` into the Y-register of `[E0],Y`,
  // saving the i32 ADD's CLC/ADC carry chain.  ~3 instr per access.
  // See feedback_ptr32_deref_fold_layer1_mi.md.
  // LD_PTR_OFF: deferred — the peel fires correctly but the resulting
  // SDAG breaks the JSON-tokenizer + snprintf smoke tests in ways
  // bisection didn't isolate.  Stick with LD_PTR (no peel) here; the
  // LowerStore peel for ST_PTR_OFF / STB_PTR_OFF keeps the store-side
  // optimization.  Future: route loads through a SDAG combine that
  // runs post-LegalizeOps so we see the final REG_SEQUENCE shape.
  SDValue Ops[] = { Chain, Ptr };
  SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops,
                                           MemVT, Ld->getMemOperand());
  SDValue Val = LdNode;
  // Byte memory access: fix up the bits above the loaded payload
  // according to the extension kind; plain anyext of an i8 is left
  // untouched.  i1 memVT was widened to i8 above, so it lands here too.
  if (MemVT == MVT::i8) {
    const bool IsBit = Ld->getMemoryVT() == MVT::i1;
    ISD::LoadExtType ExtType = Ld->getExtensionType();
    if (ExtType == ISD::ZEXTLOAD || (IsBit && ExtType == ISD::EXTLOAD)) {
      // Keep only the payload: bit 0 for i1, the low byte for i8.
      SDValue MaskC = DAG.getConstant(IsBit ? 1 : 0xFF, DL, MVT::i16);
      Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val, MaskC);
    } else if (ExtType == ISD::SEXTLOAD) {
      // Sign-extend from the ORIGINAL memory width.  For an i1 the sign
      // bit is bit 0 (true must become -1), matching the AND #1 + NEG
      // sequence the folded-global path above uses; extending from bit
      // 7 would turn a stored 1 into +1.
      EVT SignVT = IsBit ? MVT::i1 : MVT::i8;
      Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val,
                        DAG.getValueType(SignVT));
    }
  }
  // Narrow back to i8 if the consumer wanted i8.
  if (VT == MVT::i8)
    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
  return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL);
}

// ZERO/SIGN/ANY_EXTEND i8/i16 -> i32: build a Wide32 from the i16
// payload and a 0 / sign-fill / undef high half.
SDValue W65816TargetLowering::LowerExtend(SDValue Op,
                                          SelectionDAG &DAG) const {
  // Only extensions that produce an i32 are handled here.
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDLoc DL(Op);
  const unsigned ExtOpc = Op.getOpcode();
  SDValue LoHalf = Op.getOperand(0);
  // An i8 source is first widened to i16 with the same kind of
  // extension; that i16 value becomes the low half.
  if (LoHalf.getValueType() == MVT::i8)
    LoHalf = DAG.getNode(ExtOpc, DL, MVT::i16, LoHalf);

  SDValue HiHalf;
  switch (ExtOpc) {
  case ISD::ZERO_EXTEND:
    // Zero-fill.
    HiHalf = DAG.getConstant(0, DL, MVT::i16);
    break;
  case ISD::SIGN_EXTEND:
    // Sign-fill via SRA #15 — uses the SRA15A pattern (4 insns) and
    // keeps both operands i16-typed, dodging the combiner's
    // shift-amount-promote when ptr32 makes pointer-typed shift
    // amounts i32.
    HiHalf = DAG.getNode(ISD::SRA, DL, MVT::i16, LoHalf,
                         DAG.getConstant(15, DL, MVT::i16));
    break;
  default:
    // Any other extension: the high half is left undefined.
    HiHalf = DAG.getUNDEF(MVT::i16);
    break;
  }
  return buildWide32(DAG, DL, LoHalf, HiHalf);
}

// SIGN_EXTEND_INREG i32 with inner type i1/i8/i16: sign-extend the low
// N bits of an i32 input to fill all 32 bits.  The legalizer leaves
// this op alone when i32 is legal — but no tablegen pattern matches
// the i32 form, so without this Custom hook isel aborts with
// "Cannot select: sign_extend_inreg ... ValueType:i1" on shapes like
// `-(crc & 1ul)` in CRC32 loops.
//
// Strategy: for inner VT V (= i1 / i8 / i16), the low half's
// `sext_inreg` (already pattern-matched at i16) produces the signed
// i16 value — then sign-fill the high half via SRA #15 of the lo
// result.
SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT ResVT = Op.getValueType();

  // i16 result: replicate the existing tablegen patterns.  We MUST
  // handle this case rather than returning SDValue(), because
  // setOperationAction's Custom-returns-SDValue() falls through to
  // default Expand (= SRA/SHL chain), not to tablegen pattern match.
  // The two existing patterns are:
  //   (sext_inreg Acc16:$src, i1)  ->  NEGA16 (AND $src, 1)
  //   (sext_inreg Acc16:$src, i8)  ->  ((src & 0xFF) ^ 0x80) - 0x80
  // Reproduce them at the SDAG level so the legalizer's Custom
  // dispatch returns a fully-lowered tree.
  if (ResVT == MVT::i16) {
    if (InnerVT == MVT::i1) {
      // Isolate bit 0, then negate: 0 -> 0, 1 -> 0xFFFF.
      SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                DAG.getConstant(1, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16,
                         DAG.getConstant(0, DL, MVT::i16), Bit);
    }
    if (InnerVT == MVT::i8) {
      SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                   DAG.getConstant(0xFF, DL, MVT::i16));
      SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked,
                                  DAG.getConstant(0x80, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored,
                         DAG.getConstant(0x80, DL, MVT::i16));
    }
    // inner i16 = no-op.
    if (InnerVT == MVT::i16)
      return X;
    // Any other inner width has no hand-written sequence here.
    // Returning X would silently drop the sign extension, so defer to
    // the default Expand (SRA/SHL chain) instead.
    return SDValue();
  }

  if (ResVT != MVT::i32)
    return SDValue();

  // The i32 sequence below derives the whole result from the LOW half
  // only, which is valid only while the sign bit being extended lives
  // in that half.  For a wider inner type, defer to the default Expand.
  if (InnerVT.bitsGT(MVT::i16))
    return SDValue();

  // i32 result: project the input's low half (X is i32 Wide32 here),
  // apply the inner-VT sext on the i16 low half, sign-fill the hi.
  SDValue Lo = extractWide32Lo(DAG, DL, X);
  if (InnerVT != MVT::i16) {
    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo,
                     DAG.getValueType(InnerVT));
  }
  // Sign-fill the hi half via SRA #15 — same idiom LowerExtend uses for
  // SIGN_EXTEND i16 -> i32.
  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
                           DAG.getConstant(15, DL, MVT::i16));
  return buildWide32(DAG, DL, Lo, Hi);
}


// TRUNCATE i32 -> i16: project sub_lo.
SDValue W65816TargetLowering::LowerTruncate(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Wide = Op.getOperand(0);
  const EVT NarrowVT = Op.getValueType();

  // Handled shapes: i32 -> i16 and i32 -> i8.  Everything else is left
  // to the caller.
  const bool Handled = Wide.getValueType() == MVT::i32 &&
                       (NarrowVT == MVT::i16 || NarrowVT == MVT::i8);
  if (!Handled)
    return SDValue();

  // Either way the real work is projecting the low 16-bit half.
  SDValue Lo16 = extractWide32Lo(DAG, DL, Wide);
  if (NarrowVT == MVT::i16)
    return Lo16;
  // i32 -> i16 -> i8.  The remaining i16 -> i8 truncate is a
  // COPY_TO_REGCLASS at MC level.
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16);
}

// i32 Constant: split into two i16 constants and REG_SEQUENCE.
SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
                                               SelectionDAG &DAG) const {
  // Only i32 constants are split; other widths are left alone.
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDLoc DL(Op);
  const uint64_t Bits = cast<ConstantSDNode>(Op)->getZExtValue();
  // Bits 0..15 form the low half, bits 16..31 the high half.
  SDValue LoHalf = DAG.getConstant(Bits & 0xFFFFu, DL, MVT::i16);
  SDValue HiHalf = DAG.getConstant((Bits >> 16) & 0xFFFFu, DL, MVT::i16);
  return buildWide32(DAG, DL, LoHalf, HiHalf);
}

// i32 ADD/SUB/AND/OR/XOR: split both operands into i16 halves, operate
// per half, and rebuild the Wide32 result.
//   AND/OR/XOR : the two halves are independent.
//   ADD        : ADDC on the low halves, ADDE on the high halves; the
//                carry travels between them as Glue.
//   SUB        : same shape with SUBC / SUBE.
// A non-i32 result type or any other opcode returns an empty SDValue.
SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
                                          SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  SDLoc DL(Op);
  SDValue Lhs = Op.getOperand(0);
  SDValue Rhs = Op.getOperand(1);
  SDValue LhsLo = extractWide32Lo(DAG, DL, Lhs);
  SDValue LhsHi = extractWide32Hi(DAG, DL, Lhs);
  SDValue RhsLo = extractWide32Lo(DAG, DL, Rhs);
  SDValue RhsHi = extractWide32Hi(DAG, DL, Rhs);

  const unsigned Opc = Op.getOpcode();
  SDValue ResLo, ResHi;
  if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR) {
    // Bitwise ops never cross the 16-bit boundary.
    ResLo = DAG.getNode(Opc, DL, MVT::i16, LhsLo, RhsLo);
    ResHi = DAG.getNode(Opc, DL, MVT::i16, LhsHi, RhsHi);
  } else if (Opc == ISD::ADD || Opc == ISD::SUB) {
    const bool IsAdd = Opc == ISD::ADD;
    const unsigned LoOpc = IsAdd ? ISD::ADDC : ISD::SUBC;
    const unsigned HiOpc = IsAdd ? ISD::ADDE : ISD::SUBE;
    SDVTList ValueAndFlag = DAG.getVTList(MVT::i16, MVT::Glue);
    // Low half produces the carry/borrow; high half consumes it.
    SDValue LoNode = DAG.getNode(LoOpc, DL, ValueAndFlag, LhsLo, RhsLo);
    ResLo = LoNode.getValue(0);
    SDValue Flag = LoNode.getValue(1);
    ResHi = DAG.getNode(HiOpc, DL, ValueAndFlag, LhsHi, RhsHi, Flag)
                .getValue(0);
  } else {
    return SDValue();
  }
  return buildWide32(DAG, DL, ResLo, ResHi);
}

// Store companion to LowerLoad.  For i32 addresses, dispatch to the
// 16-bit ST_PTR or the byte-truncating STB_PTR target node based on
// MemoryVT.  For i16 addresses (ptr16 mode), bail out and let the
// existing STAptr / STBptr patterns match.
SDValue W65816TargetLowering::LowerStore(SDValue Op,
                                         SelectionDAG &DAG) const {
  StoreSDNode *St = cast<StoreSDNode>(Op);
  SDValue Chain = St->getChain();
  SDValue Val   = St->getValue();
  SDValue Ptr   = St->getBasePtr();
  EVT MemVT = St->getMemoryVT();
  SDLoc DL(Op);

  // Const-int address (`*(volatile uint8*)0xC035 = v`): leave the SDAG
  // alone so the tablegen pattern `(store Acc8, (iPTR imm))` →
  // STA8long fires.  Without this short-circuit the i32-pointer code
  // below promotes the constant address into a Wide32 register pair
  // and routes through STBptr32 ([dp],Y), which is 16 B / 30 cyc and
  // (worse) bank-tracks DBR.
  if (isa<ConstantSDNode>(Ptr))
    return SDValue();

  // i32 STORE: split into two halves.  Critical: the per-half stores
  // MUST go through the target-specific W65816ISD::ST_PTR node and not
  // through plain ISD::STORE, otherwise the SDAG combiner's
  // MergeConsecutiveStores re-combines them into a single i32 store
  // that re-enters LowerStore — infinite loop, OOM in the combiner.
  // For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular
  // store-merger doesn't trip there because address splitting via
  // ISD::ADD on i16 doesn't itself fan out into ptr-pair operations.
  if (Val.getValueType() == MVT::i32) {
    SDValue Lo = extractWide32Lo(DAG, DL, Val);
    SDValue Hi = extractWide32Hi(DAG, DL, Val);
    EVT PtrVT = Ptr.getValueType();
    // ptr32 const-i32-addr fast path: `*(uint32_t*)0x5000 = v` should
    // lower to two STAabs (DBR-relative, 5 cyc each) instead of two
    // [dp],Y stores via ST_PTR.  Detect Wide32-zero-hi Constant ptr,
    // emit two i16 stores at TargetConstant:i32 addrs.  TargetConstant
    // (not Constant) so LowerI32Constant doesn't re-fire and recreate
    // the REG_SEQUENCE.  The STAabs timm pattern matches.
    if (PtrVT == MVT::i32 && Ptr.getNode()->isMachineOpcode() &&
        Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
      SDValue PtrLo, PtrHi;
      for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
        if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
          if (CIdx->getZExtValue() == llvm::sub_lo) PtrLo = Ptr.getOperand(i);
          else if (CIdx->getZExtValue() == llvm::sub_hi) PtrHi = Ptr.getOperand(i);
        }
      }
      auto *PtrHiC = dyn_cast_or_null<ConstantSDNode>(PtrHi);
      auto *PtrLoC = dyn_cast_or_null<ConstantSDNode>(PtrLo);
      if (PtrLoC && PtrHiC && PtrHiC->getZExtValue() == 0) {
        uint64_t Base = PtrLoC->getZExtValue() & 0xFFFF;
        SDValue PLo = DAG.getTargetConstant(Base, DL, MVT::i32);
        SDValue PHi = DAG.getTargetConstant((Base + 2) & 0xFFFF, DL, MVT::i32);
        SDValue StLo = DAG.getStore(Chain, DL, Lo, PLo,
                                    St->getPointerInfo(),
                                    St->getAlign(),
                                    St->getMemOperand()->getFlags());
        SDValue StHi = DAG.getStore(StLo, DL, Hi, PHi,
                                    St->getPointerInfo().getWithOffset(2),
                                    St->getAlign(),
                                    St->getMemOperand()->getFlags());
        return StHi;
      }
    }
    SDValue Two = DAG.getConstant(2, DL, PtrVT);
    SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
    if (PtrVT == MVT::i32) {
      // ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially
      // chained.  The combiner cannot merge target-opaque MemIntrinsic
      // stores.
      SDVTList VTs = DAG.getVTList(MVT::Other);
      SDValue OpsLo[] = { Chain, Lo, Ptr };
      SDValue StLo = DAG.getMemIntrinsicNode(
          W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16,
          St->getMemOperand());
      SDValue OpsHi[] = { StLo, Hi, Ptr2 };
      MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand(
          St->getMemOperand(), 2, 2);
      SDValue StHi = DAG.getMemIntrinsicNode(
          W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi);
      return StHi;
    }
    // ptr16 path — emit two regular i16 stores serially chained so the
    // store-merger sees them as a 4-byte sequence (which it will likely
    // leave alone since the resulting i32 store has no legal target
    // pattern in ptr16 mode anyway).
    SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr,
                                St->getPointerInfo(),
                                St->getAlign(),
                                St->getMemOperand()->getFlags());
    SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2,
                                St->getPointerInfo().getWithOffset(2),
                                St->getAlign(),
                                St->getMemOperand()->getFlags());
    return StHi;
  }

  // Optimization: if the store goes through a global address (raw
  // GlobalAddress/ExternalSymbol, or a Wide32 built from Wrapper +
  // WrapperBank of the same symbol), lower to a plain i16/i8 store
  // through a single Wrapper@symbol so the tablegen pattern
  //   (store Acc8/Acc16, (W65816Wrapper tglobaladdr:$g))
  // selects STA8abs / STAabs (DBR-relative).  Our globals live in
  // the load segment that crt0 pins to DBR, so abs-16 reaches them.
  // This avoids the 14-byte [dp],y indirect-long path AND re-enables
  // the STZ peephole that the indirect path defeats.
  SDValue FoldedLo;
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
    FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
        DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
                                    GA->getOffset()));
  } else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Ptr)) {
    FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
        DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16));
  } else if (Ptr.getNode()->isMachineOpcode() &&
             Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
    SDValue PLo, PHi;
    for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
      if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
        if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i);
        else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i);
      }
    }
    if (PLo && PHi &&
        PLo.getOpcode() == W65816ISD::Wrapper &&
        PHi.getOpcode() == W65816ISD::WrapperBank) {
      SDValue WLo = PLo.getOperand(0);
      SDValue WHi = PHi.getOperand(0);
      auto *GLo = dyn_cast<GlobalAddressSDNode>(WLo);
      auto *GHi = dyn_cast<GlobalAddressSDNode>(WHi);
      auto *ELo = dyn_cast<ExternalSymbolSDNode>(WLo);
      auto *EHi = dyn_cast<ExternalSymbolSDNode>(WHi);
      bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() &&
                         GLo->getOffset() == GHi->getOffset());
      bool SameExtern = (ELo && EHi &&
                         StringRef(ELo->getSymbol()) == EHi->getSymbol());
      if (SameGlobal || SameExtern)
        FoldedLo = PLo;
    }
  }
  if (FoldedLo) {
    // Preserve memVT — original may be a truncating store (e.g.,
    // i16 value into i8 memory).  getStore picks memVT from Val's
    // type, which can mismatch the original MachineMemOperand size.
    if (MemVT == Val.getValueType()) {
      return DAG.getStore(Chain, DL, Val, FoldedLo,
                          St->getPointerInfo(), St->getAlign(),
                          St->getMemOperand()->getFlags());
    }
    return DAG.getTruncStore(Chain, DL, Val, FoldedLo, MemVT,
                             St->getMemOperand());
  }

  // No i32 ptr → nothing for us to do; let the default ISD::STORE
  // path handle it.  (Also avoids accidentally wrapping an i16 ptr
  // store into ST_PTR below, whose ptr operand must be i32.)
  if (Ptr.getValueType() != MVT::i32)
    return SDValue();

  // The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap
  // around STBptr32 narrows in memory.  Promote i8 values to i16 with
  // ANY_EXTEND — the inserter only writes one byte, so the high half
  // is don't-care.
  if (Val.getValueType() == MVT::i8)
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);

  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Base; uint16_t Off = 0;
  if (peelPtr32Offset(DAG, DL, Ptr, Base, Off)) {
    unsigned OffOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR_OFF)
                                         : unsigned(W65816ISD::ST_PTR_OFF);
    SDValue OffN = DAG.getTargetConstant(Off, DL, MVT::i16);
    SDValue OpsOff[] = { Chain, Val, Base, OffN };
    return DAG.getMemIntrinsicNode(OffOpc, DL, VTs, OpsOff, MemVT,
                                   St->getMemOperand());
  }
  unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR)
                                        : unsigned(W65816ISD::ST_PTR);
  SDValue Ops[] = { Chain, Val, Ptr };
  return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT,
                                 St->getMemOperand());
}

// VAARG: read the current `ap` out of the va_list, load one value of
// the requested type through it, then write back ap + sizeof(VT).
//
// Unlike the default expansion, ap is NOT rounded up to the type's
// preferred alignment first: caller-pushed varargs land at
// byte-granular addresses, so aligning ap could step over part of the
// pushed value.
//
// Returns the merged pair (loaded value, output chain).
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue ListAddr = Op.getOperand(1);
  const EVT ArgVT = Op.getValueType();
  // ap is loaded and stored at the width of the va_list pointer itself
  // (i16 under ptr16, i32 under ptr32) so no half of it is dropped.
  const EVT ApVT = ListAddr.getValueType();

  SDValue CurAp = DAG.getLoad(ApVT, DL, Op.getOperand(0), ListAddr,
                              MachinePointerInfo());
  SDValue Ch = CurAp.getValue(1);

  // Data dereference.  An i16 value through an i16 ap goes through the
  // dedicated VAARG_LOAD node; every other combination is an ordinary
  // load through ap.
  SDValue Arg;
  if (ArgVT == MVT::i16 && ApVT == MVT::i16) {
    SDVTList ValueAndChain = DAG.getVTList(MVT::i16, MVT::Other);
    Arg = DAG.getNode(W65816ISD::VAARG_LOAD, DL, ValueAndChain, Ch, CurAp);
  } else {
    Arg = DAG.getLoad(ArgVT, DL, Ch, CurAp, MachinePointerInfo());
  }
  Ch = Arg.getValue(1);

  // Advance ap by the value's size, rounded up to whole bytes.
  const unsigned ByteSize = (ArgVT.getSizeInBits() + 7) / 8;
  SDValue Step = DAG.getConstant(ByteSize, DL, ApVT);
  SDValue NextAp = DAG.getNode(ISD::ADD, DL, ApVT, CurAp, Step);
  Ch = DAG.getStore(Ch, DL, NextAp, ListAddr, MachinePointerInfo());
  return DAG.getMergeValues({Arg, Ch}, DL);
}

// VASTART: store the address of the first vararg slot into the
// va_list.  The slot is the frame index recorded in
// W65816MachineFunctionInfo (getVarArgsFrameIndex).  The frame index is
// created at the target's pointer type so the store writes the full
// pointer width.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
                            const W65816TargetLowering &TLI) {
  SDLoc DL(Op);
  const EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  auto *FnInfo =
      DAG.getMachineFunction().getInfo<W65816MachineFunctionInfo>();
  SDValue FirstSlot = DAG.getFrameIndex(FnInfo->getVarArgsFrameIndex(), PtrVT);

  // Operand 2 carries the IR value of the va_list, used for the
  // store's pointer info.
  const Value *ListIRValue =
      cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FirstSlot, Op.getOperand(1),
                      MachinePointerInfo(ListIRValue));
}

// Custom-lowering dispatcher: routes each opcode to its Lower* helper,
// or handles it inline when the lowering is only a few nodes.  Reaching
// the default case means an opcode arrived here that has no lowering;
// that is a hard error (the node is dumped first in asserts builds).
SDValue W65816TargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // Zero of the requested width: a single i16 constant for i16, a
  // Wide32 pair of i16 zeros for anything else.
  auto ZeroOfType = [&](EVT ZeroVT) -> SDValue {
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    if (ZeroVT == MVT::i16)
      return Zero16;
    return buildWide32(DAG, DL, Zero16, Zero16);
  };

  switch (Op.getOpcode()) {
  // --- Addresses -------------------------------------------------------
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::ExternalSymbol:
    return LowerExternalSymbol(Op, DAG);

  // --- Control flow and comparisons ------------------------------------
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::BRIND:
    return LowerBRIND(Op, DAG);
  case ISD::SETCC:
    return LowerSETCC(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: {
    // Only the i32 result is custom: split into one i16 SELECT per
    // half, both on the same condition.  Without this, the legalizer's
    // default (rewriting SELECT to SELECT_CC against zero) produces an
    // i32 SELECT_CC of a different shape that re-enters Custom and
    // creates a cycle.
    if (Op.getValueType() != MVT::i32)
      return SDValue();
    SDValue Pred = Op.getOperand(0);
    SDValue IfTrue = Op.getOperand(1);
    SDValue IfFalse = Op.getOperand(2);
    SDValue TrueLo = extractWide32Lo(DAG, DL, IfTrue);
    SDValue TrueHi = extractWide32Hi(DAG, DL, IfTrue);
    SDValue FalseLo = extractWide32Lo(DAG, DL, IfFalse);
    SDValue FalseHi = extractWide32Hi(DAG, DL, IfFalse);
    SDValue PickLo =
        DAG.getNode(ISD::SELECT, DL, MVT::i16, Pred, TrueLo, FalseLo);
    SDValue PickHi =
        DAG.getNode(ISD::SELECT, DL, MVT::i16, Pred, TrueHi, FalseHi);
    return buildWide32(DAG, DL, PickLo, PickHi);
  }

  // --- Extension / truncation ------------------------------------------
  case ISD::SIGN_EXTEND:
    // i32 results share LowerExtend with the zero/any-extend cases.
    if (Op.getValueType() == MVT::i32)
      return LowerExtend(Op, DAG);
    return LowerSignExtend(Op, DAG);
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return LowerExtend(Op, DAG);
  case ISD::SIGN_EXTEND_INREG:
    return LowerSignExtendInReg(Op, DAG);
  case ISD::TRUNCATE:
    return LowerTruncate(Op, DAG);

  // --- Arithmetic ------------------------------------------------------
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    return LowerShift(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return LowerI32Bin(Op, DAG);
  case ISD::MUL:
    return LowerMUL_I32(Op, DAG);
  case ISD::Constant:
    return LowerI32Constant(Op, DAG);

  // --- Memory and varargs ----------------------------------------------
  case ISD::LOAD:
    return LowerLoad(Op, DAG);
  case ISD::STORE:
    return LowerStore(Op, DAG);
  case ISD::VASTART:
    return LowerVASTART(Op, DAG, *this);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);

  // --- Stack -----------------------------------------------------------
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDynamicStackalloc(Op, DAG);
  case ISD::STACKSAVE:
    // Result is a constant zero plus the incoming chain.  SJLJ stores
    // the value into the function context, but our setjmp/longjmp
    // manage SP directly, so the value is dead.
    return DAG.getMergeValues(
        {ZeroOfType(Op.getValueType()), Op.getOperand(0)}, DL);
  case ISD::STACKRESTORE:
    // No code emitted; just forward the chain.
    return Op.getOperand(0);
  case ISD::FRAMEADDR:
    // FRAMEADDR(N): SJLJ uses N=0 (current frame).  We don't reserve a
    // frame pointer and SP isn't trivially CopyFromReg-able (no
    // register class), so a constant zero is returned.  SJLJ uses it as
    // an opaque per-frame identifier; the SJLJ runtime tracks frames by
    // jmp_buf chaining (FnCtx::prev) rather than by FRAMEADDR value, so
    // a constant works for single-throw / non-nested-catch programs.
    // True multi-frame SJLJ would need a TSC-based unique value.
    return ZeroOfType(Op.getValueType());

  // --- Exceptions and traps --------------------------------------------
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    // SJLJ EH: setup_dispatch emits nothing on this target — the
    // dispatcher logic lives entirely in the SJLJ runtime
    // (_Unwind_SjLj_Resume + longjmp into the function context's
    // jmp_buf).  Only the chain is threaded through.
    return Op.getOperand(0);
  case ISD::TRAP:
  case ISD::DEBUGTRAP:
    // Wrap the incoming chain in a W65816ISD::TRAP node; the
    // InstrInfo.td pattern (W65816trap) selects BRK_pseudo, which the
    // AsmPrinter expands to sentinel-store + BRK + self-loop.  Keeping
    // the chain means the trap is ordered after any prior store.
    return DAG.getNode(W65816ISD::TRAP, DL, MVT::Other, Op.getOperand(0));

  default:
#ifndef NDEBUG
    Op.dump();
#endif
    llvm_unreachable("W65816: unexpected operation in LowerOperation");
  }
}

// Map an inline-asm register constraint to a (physical register,
// register class) pair.  Both the bare letter ("a") and the braced long
// form ("{a}") are accepted.
//   a, r -> A     x -> X     y -> Y
// An i8 operand gets the 8-bit classes; every other type, pointers
// included, gets the 16-bit ones.  Anything unrecognised is passed to
// the base-class implementation with the original constraint string.
std::pair<unsigned, const TargetRegisterClass *>
W65816TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  StringRef Letter = Constraint;
  const bool Braced = Letter.size() >= 2 && Letter.front() == '{' &&
                      Letter.back() == '}';
  if (Braced)
    Letter = Letter.drop_front().drop_back();

  // Pick the width-appropriate classes once, then match the letter.
  const bool Is8Bit = (VT == MVT::i8);
  const TargetRegisterClass *AccRC =
      Is8Bit ? &W65816::Acc8RegClass : &W65816::Acc16RegClass;
  const TargetRegisterClass *IdxRC =
      Is8Bit ? &W65816::Idx8RegClass : &W65816::Idx16RegClass;

  if (Letter == "a" || Letter == "r")
    return {W65816::A, AccRC};
  if (Letter == "x")
    return {W65816::X, IdxRC};
  if (Letter == "y")
    return {W65816::Y, IdxRC};

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

// (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain).
//
// Emits a W65816ISD::ALLOCA node taking (chain, i16 size) and producing
// (i16 ptr, chain).  Per the design notes for this lowering: the entry
// SP is stashed in DP $F4 by emitPrologue when MFI.hasVarSizedObjects,
// the node expands to `tsc; sec; sbc size; tcs; inc a`, and the
// epilogue restores SP from $F4.
//
// Limitation: any FrameIndex (local, spill slot, parameter) accessed
// *after* the alloca reads from a wrong stack-relative offset, because
// PEI bakes FI offsets relative to the static-frame SP rather than the
// post-alloca SP.  A real frame pointer would lift this.  Until then the
// only safe shape is "VLA at end of function, used immediately, no
// further FI access"; anything else is at-your-own-risk.
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Bytes = Op.getOperand(1);
  const bool WidePtr = Op.getValueType() == MVT::i32;

  // Under ptr32 the size arrives as a Wide32 value; only its low i16
  // half is used (a VLA larger than 64KB doesn't fit in our stack
  // anyway).
  if (WidePtr && Bytes.getValueType() == MVT::i32)
    Bytes = extractWide32Lo(DAG, DL, Bytes);

  SDValue Alloca = DAG.getNode(W65816ISD::ALLOCA, DL,
                               DAG.getVTList(MVT::i16, MVT::Other),
                               Op.getOperand(0), Bytes);
  SDValue StackPtr = Alloca.getValue(0);
  SDValue OutChain = Alloca.getValue(1);

  // ptr32 result: pair the i16 pointer with bank 0 (stack is always
  // bank 0).
  if (WidePtr)
    StackPtr = buildWide32(DAG, DL, StackPtr,
                           DAG.getConstant(0, DL, MVT::i16));
  return DAG.getMergeValues({StackPtr, OutChain}, DL);
}

// SHL / SRL / SRA lowering.  Four tiers, tried in order:
//   1. i8: promote to i16, shift there, truncate back.
//   2. i16 by a constant with an inline pattern: return the node as-is.
//   3. i32 by a constant in 1..5: expand into i16 shifts plus an OR.
//   4. Everything else: libcall (__ashl* / __lshr* / __ashr*).
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
  const unsigned ShiftOpc = Op.getOpcode();
  const EVT VT = Op.getValueType();
  const bool IsLogical = ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL;
  const bool IsArith = ShiftOpc == ISD::SRA;
  SDValue Amt = Op.getOperand(1);
  auto *AmtC = dyn_cast<ConstantSDNode>(Amt);

  // Tier 1 — i8.  SRA promotes via SEXT (keeps the sign for the
  // arithmetic fill); SHL/SRL promote via ZEXT.  This reuses the i16
  // fast paths and libcalls, so no separate qi3 libcall set is needed.
  // The DAG combiner would otherwise narrow
  // `(trunc (shl (zext X), K))` back to an i8 shift and re-enter this
  // hook forever; the `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false`
  // override elsewhere in this file disables that combine.
  if (VT == MVT::i8) {
    SDLoc DL(Op);
    SDValue Src16 =
        DAG.getNode(IsArith ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                    MVT::i16, Op.getOperand(0));
    SDValue Amt16 = Amt;
    if (Amt.getValueType() != MVT::i16)
      Amt16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Amt);
    // i8 SRA by 7 is a pure sign-fill.  For sext(i8 x), bit 15 equals
    // bit 7, so shifting the promoted value by 15 gives the same low
    // byte and hits the tight SRA15A pattern instead of the __ashrhi3
    // libcall.
    if (IsArith && AmtC && AmtC->getZExtValue() == 7)
      Amt16 = DAG.getConstant(15, DL, MVT::i16);
    SDValue Wide = DAG.getNode(ShiftOpc, DL, MVT::i16, Src16, Amt16);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Wide);
  }

  // Tier 2 — i16 by constant.  Let through unchanged:
  //   SHL / SRL by 1..15
  //   SRA       by 1 or 15
  // Returning Op (the unchanged node) makes the legalizer leave it
  // alone so the pattern matcher catches it later.  Returning SDValue()
  // instead would fall into the generic Expand path, which generates a
  // BUILD_VECTOR-based magic-constant rewrite that we can't lower.
  // `(srl x, 15)` matters in particular: the type-legalizer's
  // i32-shift-by-1 expansion emits exactly that node for the high-half
  // "bit-from-low" slot, and pattern SRL15A handles it.
  if (VT == MVT::i16 && AmtC) {
    const uint64_t N = AmtC->getZExtValue();
    const bool LogicalOK = IsLogical && N >= 1 && N <= 15;
    const bool ArithOK = IsArith && (N == 1 || N == 15);
    if (LogicalOK || ArithOK)
      return Op;
  }

  const bool IsI32 = VT == MVT::i32;

  // Tier 3 — i32 by a small constant, expanded inline.  The libcall
  // costs ~140 cyc; this form runs in ~30-90 cyc.  popcount,
  // djb2-style hashes (`(h << 5) + h`), BigInt-style code and CRC
  // routines all hit it.  The cutoff at 5 was chosen empirically.
  //   SHL:      Hi' = (Hi << N) | (Lo >> (16-N));  Lo' = Lo << N
  //   SRL/SRA:  Hi' = Hi >> N (logical or arithmetic);
  //             Lo' = (Lo >> N) | (Hi << (16-N))
  if (IsI32 && AmtC && (IsLogical || IsArith)) {
    const uint64_t N = AmtC->getZExtValue();
    if (N >= 1 && N <= 5) {
      SDLoc DL(Op);
      SDValue Src = Op.getOperand(0);
      SDValue SrcLo = extractWide32Lo(DAG, DL, Src);
      SDValue SrcHi = extractWide32Hi(DAG, DL, Src);
      SDValue ByN = DAG.getConstant(N, DL, MVT::i16);
      SDValue ByRest = DAG.getConstant(16 - N, DL, MVT::i16);
      SDValue OutLo, OutHi;
      if (ShiftOpc == ISD::SHL) {
        OutLo = DAG.getNode(ISD::SHL, DL, MVT::i16, SrcLo, ByN);
        // Bits leaving the top of Lo enter the bottom of Hi.
        SDValue Crossing = DAG.getNode(ISD::SRL, DL, MVT::i16, SrcLo, ByRest);
        SDValue Kept = DAG.getNode(ISD::SHL, DL, MVT::i16, SrcHi, ByN);
        OutHi = DAG.getNode(ISD::OR, DL, MVT::i16, Kept, Crossing);
      } else {
        OutHi = DAG.getNode(ShiftOpc, DL, MVT::i16, SrcHi, ByN);
        // Bits leaving the bottom of Hi enter the top of Lo.
        SDValue Crossing = DAG.getNode(ISD::SHL, DL, MVT::i16, SrcHi, ByRest);
        SDValue Kept = DAG.getNode(ISD::SRL, DL, MVT::i16, SrcLo, ByN);
        OutLo = DAG.getNode(ISD::OR, DL, MVT::i16, Kept, Crossing);
      }
      return buildWide32(DAG, DL, OutLo, OutHi);
    }
  }

  // Tier 4 — libcall, selected by opcode and width.
  RTLIB::Libcall LC;
  if (ShiftOpc == ISD::SHL)
    LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16;
  else if (ShiftOpc == ISD::SRL)
    LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16;
  else if (ShiftOpc == ISD::SRA)
    LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16;
  else
    llvm_unreachable("not a shift");

  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  // i32 SHL by a constant K >= 16: the whole high half of the input is
  // shifted out, so SDAG legalisation may have left it `undef`.  The
  // libcall is a true 32-bit shift loop that reads all 32 input bits,
  // so pass a concrete zero for the high half.  (Found via dadd: the
  // dpack-inline `(u64 e) << 52` split into __ashlsi3(e_lo, 20) with an
  // undef high half and produced a wrong mantissa.)  SRL/SRA would want
  // the mirror-image treatment of the low half; not done yet.
  if (IsI32 && ShiftOpc == ISD::SHL && AmtC &&
      static_cast<unsigned>(AmtC->getZExtValue()) >= 16) {
    SDValue SrcLo = extractWide32Lo(DAG, DL, Src);
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    Src = buildWide32(DAG, DL, SrcLo, Zero16);
  }

  SmallVector<SDValue, 2> CallArgs = {Src, Amt};
  TargetLowering::MakeLibCallOptions CallOpts;
  CallOpts.setIsSigned(IsArith);
  return makeLibCall(DAG, LC, VT, CallArgs, CallOpts, DL).first;
}

// GlobalAddress lowering.
//   ptr16 (result is not i32): Wrapper(TargetGlobalAddress) at the
//     pointer type.
//   ptr32 (result is i32): a Wide32 pair of i16 halves over one i16
//     TargetGlobalAddress:
//       Lo = Wrapper(target)      → fixup_16      (offset bytes)
//       Hi = WrapperBank(target)  → fixup_bank16  (bank byte + 0 pad)
//     The linker / OMF Loader patch both halves, so the runtime pointer
//     reflects the actual placed segment rather than the link-time
//     text-base.
SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  auto *GA = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(Op);
  const EVT PtrVT = Op.getValueType();
  const bool Wide = PtrVT == MVT::i32;
  // The offset half is always i16 when the pointer itself is i32.
  const EVT OffsetVT = Wide ? EVT(MVT::i16) : PtrVT;

  SDValue Target = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, OffsetVT,
                                              GA->getOffset());
  SDValue Offset = DAG.getNode(W65816ISD::Wrapper, DL, OffsetVT, Target);
  if (!Wide)
    return Offset;

  SDValue Bank = DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Target);
  return buildWide32(DAG, DL, Offset, Bank);
}

// ExternalSymbol lowering.  When the result is not i32 the symbol is
// wrapped in a single Wrapper at the pointer type.  When it is i32 the
// result is a Wide32 pair of i16 halves over one i16
// TargetExternalSymbol: Wrapper for the low half, WrapperBank for the
// high half.
SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op,
                                                  SelectionDAG &DAG) const {
  auto *Sym = cast<ExternalSymbolSDNode>(Op);
  SDLoc DL(Op);
  const EVT PtrVT = Op.getValueType();
  const bool Wide = PtrVT == MVT::i32;
  // The offset half is always i16 when the pointer itself is i32.
  const EVT OffsetVT = Wide ? EVT(MVT::i16) : PtrVT;

  SDValue Target = DAG.getTargetExternalSymbol(Sym->getSymbol(), OffsetVT);
  SDValue Offset = DAG.getNode(W65816ISD::Wrapper, DL, OffsetVT, Target);
  if (!Wide)
    return Offset;

  SDValue Bank = DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Target);
  return buildWide32(DAG, DL, Offset, Bank);
}

// Lower the incoming formal arguments: exactly one SDValue is appended to
// InVals for every entry of Ins, and the incoming Chain is returned
// unchanged.
//
// ABI implemented here:
//   * Ins[0] arrives in A (i8/i16), or in A (lo) and X (hi) when it is a
//     whole i32.
//   * Ins[1] arrives in X when Ins[0] and Ins[1] are both i16 pieces of
//     original argument 0 (legacy split-i32, or the first two quarters of
//     an i64).
//   * Every other entry is read from the caller's stack.  The caller
//     pushes right-to-left and JSL then pushes a 3-byte return address,
//     so the return address occupies (1,S)..(3,S) and the first stack
//     slot sits at (4,S).  i8/i16 entries take a 2-byte slot; i32 entries
//     take 4 bytes (lo half at the lower offset).
//   * Any other value type is a fatal error.
//
// For variadic functions the frame index of the slot just past the last
// named stack argument is recorded in W65816MachineFunctionInfo.
SDValue W65816TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // True iff the first Count entries of Ins all have type PieceVT and all
  // originate from IR argument 0.
  auto leadingPiecesOfArg0 = [&Ins](unsigned Count, MVT PieceVT) {
    if (Ins.size() < Count)
      return false;
    for (unsigned I = 0; I != Count; ++I)
      if (Ins[I].VT != PieceVT || Ins[I].OrigArgIndex != 0)
        return false;
    return true;
  };

  // Legacy split-i32 first argument: two i16 pieces of arg 0.  (This is
  // also true for the i16-quad i64 shape below; the second piece goes in
  // X either way.)
  const bool I32SplitFirstArg = leadingPiecesOfArg0(2, MVT::i16);

  // The first IR argument is an i64.  It shows up either as four i16
  // pieces or, when i64 is legalized through an i32 split under ptr32, as
  // two i32 pieces.  For such signatures the register-passed pieces are
  // routed through Img16 so they are parked in a DP-backed slot at entry;
  // this dodges the greedy allocator's TXA bridge clobbering $a before
  // arg0's low piece has been saved (observed as
  // `__adddf3(1.5, 2.5) -> 1.5`).
  const bool I64FirstArg =
      leadingPiecesOfArg0(4, MVT::i16) || leadingPiecesOfArg0(2, MVT::i32);

  // Offset of the next stack-passed argument; starts past the 3
  // return-address bytes and grows upward as stack args are consumed.
  unsigned StackOffset = 4;

  for (unsigned ArgIdx = 0, NumIns = Ins.size(); ArgIdx != NumIns; ++ArgIdx) {
    const MVT VT = Ins[ArgIdx].VT;
    const bool Supported = VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i32;
    if (!Supported)
      report_fatal_error("W65816: argument type not yet supported");

    const bool IsFirst = ArgIdx == 0;

    if (IsFirst && VT == MVT::i32) {
      // Whole-i32 first argument: lo half live-in via $a, hi half via $x.
      // The halves are kept as separate i16 values (re-joined only through
      // buildWide32) so the W65816LowerWide32 pre-RA pass can rewrite
      // Wide32 uses into pairs of i16 operations.
      //
      // The hi half always uses Img16: an Idx16 vreg would collapse to
      // Acc16 after that pass and then compete with the lo half for $a,
      // letting an entry TXA destroy arg0_lo before it is spilled.  The lo
      // half uses Img16 only for i64-first-arg signatures.
      const TargetRegisterClass *LoRC = &W65816::Acc16RegClass;
      if (I64FirstArg)
        LoRC = &W65816::Img16RegClass;
      Register LoReg = MRI.createVirtualRegister(LoRC);
      Register HiReg = MRI.createVirtualRegister(&W65816::Img16RegClass);
      MRI.addLiveIn(W65816::A, LoReg);
      MRI.addLiveIn(W65816::X, HiReg);
      SDValue LoHalf = DAG.getCopyFromReg(Chain, DL, LoReg, MVT::i16);
      SDValue HiHalf = DAG.getCopyFromReg(Chain, DL, HiReg, MVT::i16);
      InVals.push_back(buildWide32(DAG, DL, LoHalf, HiHalf));
      continue;
    }

    if (IsFirst) {
      // i8/i16 first argument in A.  An i16 piece of an i64 first argument
      // is routed through Img16 (see I64FirstArg above); i8 uses Acc8.
      const TargetRegisterClass *RC = &W65816::Acc8RegClass;
      if (VT == MVT::i16)
        RC = I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass;
      Register InReg = MRI.createVirtualRegister(RC);
      MRI.addLiveIn(W65816::A, InReg);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, InReg, VT));
      continue;
    }

    if (ArgIdx == 1 && I32SplitFirstArg) {
      // Second i16 piece of argument 0, in X.  i64-first signatures use
      // Img16 (parked via STX_DP); plain split-i32 stays on Idx16 because
      // rerouting it pessimizes simple functions and was observed to break
      // the vprintf writeULong/__udivsi3 chain.
      const TargetRegisterClass *RC = &W65816::Idx16RegClass;
      if (I64FirstArg)
        RC = &W65816::Img16RegClass;
      Register InReg = MRI.createVirtualRegister(RC);
      MRI.addLiveIn(W65816::X, InReg);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, InReg, MVT::i16));
      continue;
    }

    if (VT == MVT::i32) {
      // i32 stack argument: two adjacent immutable 2-byte fixed objects,
      // loaded as i16 halves and joined with buildWide32.
      const int LoFI =
          MFI.CreateFixedObject(2, StackOffset, /*Immutable*/true);
      const int HiFI =
          MFI.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true);
      StackOffset += 4;
      SDValue LoAddr = DAG.getFrameIndex(LoFI, MVT::i16);
      SDValue HiAddr = DAG.getFrameIndex(HiFI, MVT::i16);
      SDValue LoHalf =
          DAG.getLoad(MVT::i16, DL, Chain, LoAddr,
                      MachinePointerInfo::getFixedStack(MF, LoFI));
      SDValue HiHalf =
          DAG.getLoad(MVT::i16, DL, Chain, HiAddr,
                      MachinePointerInfo::getFixedStack(MF, HiFI));
      InVals.push_back(buildWide32(DAG, DL, LoHalf, HiHalf));
      continue;
    }

    // i8/i16 stack argument.  i8 values live in promoted 2-byte slots
    // (matching CC_W65816's CCPromoteToType), so the load is always i16 and
    // runs in the default 16-bit M mode; an i8 argument is truncated back
    // afterwards.
    constexpr unsigned SlotBytes = 2;
    const int SlotFI =
        MFI.CreateFixedObject(SlotBytes, StackOffset, /*Immutable*/true);
    StackOffset += SlotBytes;
    SDValue SlotAddr = DAG.getFrameIndex(SlotFI, MVT::i16);
    SDValue Loaded =
        DAG.getLoad(MVT::i16, DL, Chain, SlotAddr,
                    MachinePointerInfo::getFixedStack(MF, SlotFI));
    if (VT == MVT::i8)
      Loaded = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Loaded);
    InVals.push_back(Loaded);
  }

  // Variadic functions: StackOffset now points just past the last named
  // stack argument, i.e. at the caller's first vararg.  Record a fixed
  // object there so VASTART can use it as the va_list start.
  if (IsVarArg) {
    const int VarArgsFI =
        MFI.CreateFixedObject(2, StackOffset, /*Immutable=*/true);
    MF.getInfo<W65816MachineFunctionInfo>()->setVarArgsFrameIndex(VarArgsFI);
  }

  return Chain;
}

// Lower an outgoing call.
//
// Argument ABI (mirrors LowerFormalArguments):
//   * Outs[0] goes in A; a whole-i32 Outs[0] goes in A (lo) and X (hi); a
//     legacy split-i32 first argument (Outs[0]/Outs[1] both i16 pieces of
//     original arg 0) goes in A and X.
//   * All remaining arguments are pushed in REVERSE order so that the
//     callee finds the first stack argument at (4,S), the next one above
//     it, and so on.  i8 values are zero-extended into 2-byte slots; i32
//     values take two pushes (hi first, so lo lands at the lower address).
//   * CALLSEQ_START / CALLSEQ_END carry the pushed byte count.
//
// Return values are read back from A, X, Y and DPF0 (in that order), one
// i16/i8 half per register; i32 results are re-joined with buildWide32.
// Tail calls are never emitted.  Unsupported value types are fatal errors.
SDValue
W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  auto &Outs = CLI.Outs;
  auto &OutVals = CLI.OutVals;
  auto &Ins = CLI.Ins;

  // No tail-call support: always report that a normal call was emitted.
  CLI.IsTailCall = false;

  // At most 4 return halves: i8/i16 in A; i32 in A:X; i64 in A:X:Y plus
  // the DPF0 slot for the highest half (see LowerReturn).
  if (Ins.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Indirect calls (anything that is not a global or external symbol) go
  // through the runtime trampoline `__jsl_indir`: the dynamic target is
  // stored to the fixed slot at address 0xB8 and the trampoline jumps
  // through it.  The store uses a constant pointer so the constant-address
  // store pattern can select it.
  const bool CalleeIsSymbol = isa<GlobalAddressSDNode>(Callee) ||
                              isa<ExternalSymbolSDNode>(Callee);
  if (!CalleeIsSymbol) {
    SDValue VectorSlot =
        DAG.getConstant(0xB8, DL, getPointerTy(DAG.getDataLayout()));
    Chain = DAG.getStore(Chain, DL, Callee, VectorSlot, MachinePointerInfo());
    Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16);
  }

  // Reject argument types the ABI cannot pass.
  for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
    const MVT VT = Outs[I].VT;
    if (VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i32)
      continue;
    report_fatal_error("W65816: argument type not yet supported");
  }

  // Classify the first argument.
  //   Whole: Outs[0] is a legal i32, passed as A (lo) / X (hi).
  //   Split: Outs[0] and Outs[1] are i16 pieces of original argument 0.
  const bool I32WholeFirstArg = !Outs.empty() && Outs[0].VT == MVT::i32;
  const bool I32SplitFirstArg =
      Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 &&
      Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0;
  // Index of the first Outs entry that travels on the stack.
  const unsigned FirstStackArg =
      (!I32WholeFirstArg && I32SplitFirstArg) ? 2 : 1;

  // Bytes pushed for stack arguments: 4 per i32, 2 per i8/i16 (i8 is
  // promoted to a full 2-byte slot, matching LowerFormalArguments).
  unsigned StackBytes = 0;
  for (unsigned I = FirstStackArg, E = Outs.size(); I < E; ++I)
    StackBytes += Outs[I].VT == MVT::i32 ? 4 : 2;

  Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);

  // Every copy / push below is threaded through Chain and Glue so the
  // whole argument set-up stays in order up to the CALL.
  SDValue Glue;
  auto copyArgToReg = [&](Register PhysReg, SDValue V) {
    Chain = DAG.getCopyToReg(Chain, DL, PhysReg, V, Glue);
    Glue = Chain.getValue(1);
  };

  // True when V is a CopyFromReg of physical X, or of an Idx16 virtual
  // register that is registered as a live-in of X.  Such values can be
  // pushed straight from X, saving the TXA + A-spill round trip.
  auto arrivedInX = [&DAG](SDValue V) -> bool {
    if (V.getOpcode() != ISD::CopyFromReg)
      return false;
    auto *RegNode = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode());
    if (!RegNode)
      return false;
    Register R = RegNode->getReg();
    if (R.isPhysical())
      return R == W65816::X;
    if (!R.isVirtual())
      return false;
    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    if (MRI.getRegClass(R) != &W65816::Idx16RegClass)
      return false;
    for (auto &LI : MRI.liveins())
      if (LI.second == R && LI.first == W65816::X)
        return true;
    return false;
  };

  // Push one i16-sized value: through X (PUSH_X) when it already lives
  // there, otherwise through A (PUSH).
  auto pushI16 = [&](SDValue V) {
    const bool ViaX = arrivedInX(V);
    Register SrcReg = W65816::A;
    unsigned PushOpc = W65816ISD::PUSH;
    if (ViaX) {
      SrcReg = W65816::X;
      PushOpc = W65816ISD::PUSH_X;
    }
    copyArgToReg(SrcReg, V);
    Chain = DAG.getNode(PushOpc, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                        Chain, Glue);
    Glue = Chain.getValue(1);
  };

  // Push stack arguments last-to-first so Outs[FirstStackArg] ends up at
  // the lowest address.
  for (unsigned I = Outs.size(); I > FirstStackArg;) {
    --I;
    SDValue V = OutVals[I];
    const MVT VT = Outs[I].VT;
    if (VT == MVT::i32) {
      // Hi half first (higher address), then lo half (lower address, the
      // slot the callee treats as the start of the i32).
      SDValue Lo = extractWide32Lo(DAG, DL, V);
      SDValue Hi = extractWide32Hi(DAG, DL, V);
      pushI16(Hi);
      pushI16(Lo);
    } else {
      if (VT == MVT::i8)
        V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
      pushI16(V);
    }
  }

  // Register-passed first argument.  X is always written before A.  A
  // whole i32 is split into halves and copied separately so no AX32 pair
  // appears in the MIR (see W65816LowerWide32).
  if (I32WholeFirstArg) {
    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
    copyArgToReg(W65816::X, Hi);
    copyArgToReg(W65816::A, Lo);
  } else {
    if (I32SplitFirstArg)
      copyArgToReg(W65816::X, OutVals[1]);
    if (!OutVals.empty())
      copyArgToReg(W65816::A, OutVals[0]);
  }

  // The CALL node's callee operand is typed iPTR, so the target symbol
  // must use the pointer type of the active data layout rather than a
  // hard-coded i16.
  const EVT CalleeVT = getPointerTy(DAG.getDataLayout());
  if (auto *GlobalCallee = dyn_cast<GlobalAddressSDNode>(Callee))
    Callee =
        DAG.getTargetGlobalAddress(GlobalCallee->getGlobal(), DL, CalleeVT);
  else if (auto *SymbolCallee = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(SymbolCallee->getSymbol(), CalleeVT);

  // CALL operands: chain, callee, the argument registers in use, then the
  // glue (when anything was copied or pushed).
  SmallVector<SDValue, 4> CallOps;
  CallOps.push_back(Chain);
  CallOps.push_back(Callee);
  if (I32WholeFirstArg) {
    CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
    CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
  } else if (!OutVals.empty()) {
    CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
    if (I32SplitFirstArg)
      CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
  }
  if (Glue.getNode())
    CallOps.push_back(Glue);

  Chain = DAG.getNode(W65816ISD::CALL, DL,
                      DAG.getVTList(MVT::Other, MVT::Glue), CallOps);
  Glue = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
  Glue = Chain.getValue(1);

  // Read one return half out of a physical register, keeping the chain
  // and glue threaded.
  auto readRetHalf = [&](Register PhysReg, MVT VT) {
    SDValue V = DAG.getCopyFromReg(Chain, DL, PhysReg, VT, Glue);
    Chain = V.getValue(1);
    Glue = V.getValue(2);
    return V;
  };

  // Single whole-i32 result: lo from $a, hi from $x, re-joined without
  // ever naming AX32 (see W65816LowerWide32).
  if (Ins.size() == 1 && Ins[0].VT == MVT::i32) {
    SDValue Lo = readRetHalf(W65816::A, MVT::i16);
    SDValue Hi = readRetHalf(W65816::X, MVT::i16);
    InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
    return Chain;
  }

  // General case.  First flatten the expected results into a list of
  // halves: an i32 contributes two i16 halves, i8/i16 contribute
  // themselves.
  SmallVector<MVT, 4> HalfVTs;
  for (const ISD::InputArg &In : Ins) {
    const MVT VT = In.VT;
    if (VT == MVT::i32) {
      HalfVTs.push_back(MVT::i16);
      HalfVTs.push_back(MVT::i16);
      continue;
    }
    if (VT != MVT::i16 && VT != MVT::i8)
      report_fatal_error("W65816: return half must be i8/i16/i32");
    HalfVTs.push_back(VT);
  }
  if (HalfVTs.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Then read the halves from A, X, Y, DPF0 in that order.
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};
  SmallVector<SDValue, 4> Halves;
  for (unsigned I = 0, E = HalfVTs.size(); I != E; ++I)
    Halves.push_back(readRetHalf(RetRegs[I], HalfVTs[I]));

  // Finally re-pack the halves into the shape of Ins: i32 entries consume
  // two halves, everything else consumes one.
  unsigned NextHalf = 0;
  for (const ISD::InputArg &In : Ins) {
    if (In.VT == MVT::i32) {
      InVals.push_back(
          buildWide32(DAG, DL, Halves[NextHalf], Halves[NextHalf + 1]));
      NextHalf += 2;
    } else {
      InVals.push_back(Halves[NextHalf]);
      NextHalf += 1;
    }
  }
  return Chain;
}

// Lower a function return into CopyToReg nodes feeding a RET_GLUE node.
//
// Return ABI:
//   i8/i16:  value in A.
//   i32:     low half in A, high half in X.
//   i64:     halves in A, X, Y and the DPF0 fake physreg (the fixed
//            direct-page slot at $F0..$F1).
//   wider:   fatal error.
//
// Outs may contain i8/i16 entries (type legalization splits an i32 into 2
// consecutive i16 Outs and an i64 into 4) and, when i32 is legal, whole
// i32 entries.  Each i32 entry is expanded into its lo/hi i16 halves so
// the 4-half A/X/Y/DPF0 scheme covers every shape.
//
// Copies are emitted highest half first and A last, so each half can be
// routed through A without disturbing a low half that was already placed.
SDValue W65816TargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // Flatten Outs/OutVals into parallel lists of i8/i16 halves.
  SmallVector<MVT, 4> HalfVTs;
  SmallVector<SDValue, 4> HalfVals;
  for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
    const MVT VT = Outs[I].VT;
    if (VT == MVT::i32) {
      HalfVTs.push_back(MVT::i16);
      HalfVTs.push_back(MVT::i16);
      HalfVals.push_back(extractWide32Lo(DAG, DL, OutVals[I]));
      HalfVals.push_back(extractWide32Hi(DAG, DL, OutVals[I]));
      continue;
    }
    if (VT != MVT::i16 && VT != MVT::i8)
      report_fatal_error("W65816: return half must be i8/i16/i32");
    HalfVTs.push_back(VT);
    HalfVals.push_back(OutVals[I]);
  }
  if (HalfVTs.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  SDValue Glue;
  SmallVector<SDValue, 8> RetOps;
  RetOps.push_back(Chain); // Slot 0 is overwritten with the final chain.

  // Copy V into PhysReg, threading chain and glue.
  auto copyOut = [&](Register PhysReg, SDValue V) {
    Chain = DAG.getCopyToReg(Chain, DL, PhysReg, V, Glue);
    Glue = Chain.getValue(1);
  };
  // Patch in the final chain, append the glue if any, and build the
  // RET_GLUE node.
  auto emitRet = [&]() {
    RetOps[0] = Chain;
    if (Glue.getNode())
      RetOps.push_back(Glue);
    return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
  };

  // Single whole-i32 return: split into lo/hi and copy each half to
  // $x / $a separately, so no AX32 pair appears in the SDAG (see
  // W65816LowerWide32).
  if (Outs.size() == 1 && Outs[0].VT == MVT::i32) {
    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
    copyOut(W65816::X, Hi);
    copyOut(W65816::A, Lo);
    RetOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
    RetOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
    return emitRet();
  }

  // General case: half I is returned in RetRegs[I].
  //
  // The fourth half goes through the DPF0 fake physreg (lowered to
  // `STA $F0` by copyPhysReg) rather than a generic store to address
  // 0xF0: the generic store lowered to `sta (d,s),y`, an indirect through
  // the DBR that silently misbehaved when DBR != 0, whereas STA dp is
  // unaffected by DBR.  Listing the register in RetOps keeps DCE from
  // stripping the copy.
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};
  const unsigned NumHalves = HalfVals.size();

  // Highest half first, A last.
  for (unsigned I = NumHalves; I-- > 0;)
    copyOut(RetRegs[I], HalfVals[I]);

  // Register operands are listed in ascending order: A, X, Y, DPF0.
  for (unsigned I = 0; I != NumHalves; ++I)
    RetOps.push_back(DAG.getRegister(RetRegs[I], HalfVTs[I]));

  return emitRet();
}

SDValue
W65816TargetLowering::PerformDAGCombine(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  // (shl i32 X, K) -> chain of K (add x, x) for small K.  After type
  // legalisation the i32 add splits via ADDC/ADDE pseudos which expand
  // to native ASL/ROL + carry-chain — much cheaper than the type-
  // legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick
  // to compute the bit crossing the half boundary.  Each ADD expands to
  // ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for
  // K=3.  ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2.
  // `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`)
  // benefits the most.  i16 SHL by 1..15 has dedicated ASLA16 patterns
  // already, so we restrict the rewrite to i32+.
  // (shl i32 X, K) -> ADD chain for small K — but only when i32 is
  // ILLEGAL (i.e., gets type-split into i16 halves).  When i32 is a
  // legal type (Wide32 reg class for ptr32 mode), the rewrite cycles
  // against LLVM's generic `(add x, x) -> (shl x, 1)` combine in the
  // i64 → 2 i32 split path, hanging the legalizer.
  // STORE / LOAD with ConstantSDNode ptr (e.g. `*(volatile uint8*)0xC035 = v`):
  // wrap the immediate in a W65816ISD::WRAPPER (using a TargetGlobalAddress-
  // like marker would be cleaner but we lack the symbol table).  Re-issue
  // the store/load with the same ptr but the constant marked TargetConstant
  // — TargetConstant is opaque to LowerI32Constant, so it survives intact
  // to ISel, where the existing tablegen pattern
  //   `(store Acc8, (iPTR imm)) -> STA8long`
  // matches (`imm` accepts both Constant and TargetConstant).  4 B / 6 cyc
  // bank-explicit `sta long` instead of 16 B / 30 cyc [dp],Y.
  // Wide32-of-Wrapper-with-zero-hi → i16 Wrapper.  Under p:32:16,
  // LowerGlobalAddress builds GlobalAddress as a Wide32 reg pair
  // `(REG_SEQUENCE Wrapper(off_i16), 0_i16)`.  Stores/loads against
  // this Wide32 ptr fall to the heavy [dp],Y path (16 B / 30 cyc)
  // even when the bank half is the constant 0 — we want the cheap
  // DBR-relative `sta g` / `lda g` (3 B / 5 cyc).  Detect the shape
  // and recombine the ptr to its 16-bit form so the existing
  // tablegen `(store v, (Wrapper tglob))` → STAabs / `(load (Wrapper
  // tglob))` → LDAabs patterns fire.  Crucially, this is correct
  // ONLY when bank=0 — under GS/OS Loader, DBR is set to our bank
  // by crt0Gsos, so DBR-relative addressing reaches the same global.
  // Returns either an i16 Wrapper (drop into i16 STAabs/LDAabs pattern)
  // or a TargetConstant:i32 (for const-addr i16 stores so the timm
  // pattern fires and produces STAabs).  TargetConstant — not regular
  // Constant — because LowerI32Constant only matches ISD::Constant; if
  // we returned a fresh ConstantSDNode it would re-fire LowerI32Constant
  // and produce another Wide32 REG_SEQUENCE → infinite combine loop.
  auto unwrapWide32WithZeroHi = [&](SDValue Ptr) -> SDValue {
    if (Ptr.getValueType() != MVT::i32) return SDValue();
    if (!Ptr.getNode()->isMachineOpcode()) return SDValue();
    if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
      return SDValue();
    SDValue Lo, Hi;
    for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
      auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1));
      if (!CIdx) continue;
      if (CIdx->getZExtValue() == llvm::sub_lo) Lo = Ptr.getOperand(i);
      else if (CIdx->getZExtValue() == llvm::sub_hi) Hi = Ptr.getOperand(i);
    }
    if (!Lo || !Hi) return SDValue();
    auto *HiC = dyn_cast<ConstantSDNode>(Hi);
    if (!HiC || HiC->getZExtValue() != 0) return SDValue();
    if (Lo.getOpcode() == W65816ISD::Wrapper) return Lo;
    if (auto *LoC = dyn_cast<ConstantSDNode>(Lo)) {
      // Recombine into a TargetConstant:i32 so the `(store v, (iPTR
      // timm))` STAabs pattern fires.  Returning an i16 Constant
      // would create a malformed STORE node (Ptr type mismatch) and
      // returning a regular Constant:i32 would re-trigger
      // LowerI32Constant.
      return DCI.DAG.getTargetConstant(LoC->getZExtValue(), SDLoc(Ptr),
                                       MVT::i32);
    }
    return SDValue();
  };
  if (N->getOpcode() == ISD::STORE) {
    auto *St = cast<StoreSDNode>(N);
    EVT MemVT = St->getMemoryVT();
    SDValue Ptr = St->getBasePtr();
    // Skip i32 stores — LowerStore's i32 path has its own Wide32-zero-hi
    // const-addr fast path that emits two i16 stores at separate
    // TargetConstant addrs.  Unwrapping here would short-circuit that
    // and produce a malformed ADD(TargetConstant, Constant) when the
    // hi-half store needs Ptr+2.
    if (MemVT != MVT::i32) {
      if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) {
        SelectionDAG &DAG = DCI.DAG;
        SDLoc DL(N);
        return DAG.getTruncStore(St->getChain(), DL, St->getValue(), I16Ptr,
                                 MemVT, St->getMemOperand());
      }
      // Global+i16-idx fast path for STORES (companion to the LOAD
      // branch below).  Ptr = REG_SEQUENCE(ADDC(Wrapper, idx), ADDE(...)).
      // Rewrite to CopyToReg($a, val) + CopyToReg($x, idx) + STA_AbsX.
      if ((MemVT == MVT::i16 || MemVT == MVT::i8) &&
          Ptr.getNode() && Ptr.isMachineOpcode() &&
          Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
        SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
        if (Lo && Lo.getOpcode() == ISD::ADDC) {
          auto lookThroughExtractSubLo = [](SDValue V) -> SDValue {
            if (V.getNode() && V.isMachineOpcode() &&
                V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) {
              SDValue Src = V.getOperand(0);
              if (Src.isMachineOpcode() &&
                  Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
                if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo))
                  return X;
              }
            }
            return V;
          };
          SDValue A = lookThroughExtractSubLo(Lo.getOperand(0));
          SDValue B = lookThroughExtractSubLo(Lo.getOperand(1));
          auto isWrapperGlobal = [](SDValue V) {
            if (V.getOpcode() != W65816ISD::Wrapper) return false;
            unsigned Op = V.getOperand(0).getOpcode();
            return Op == ISD::TargetGlobalAddress ||
                   Op == ISD::TargetExternalSymbol;
          };
          SDValue Sym, Idx;
          if (isWrapperGlobal(A))      { Sym = A.getOperand(0); Idx = B; }
          else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; }
          if (Sym && Idx.getValueType() == MVT::i16) {
            SelectionDAG &DAG = DCI.DAG;
            SDLoc DL(N);
            SDValue Chain = St->getChain();
            SDValue Val = St->getValue();
            // STA8absX copies $a register at i16 width (M=0); the SEP
            // wraps narrow it.  Promote i8 stored value to i16.
            if (Val.getValueType() == MVT::i8)
              Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
            SDValue Glue;
            SDValue C1 = DAG.getCopyToReg(Chain, DL, W65816::X, Idx, Glue);
            Glue = C1.getValue(1);
            SDValue C2 = DAG.getCopyToReg(C1, DL, W65816::A, Val, Glue);
            Glue = C2.getValue(1);
            SDVTList StaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
            unsigned Opc = (MemVT == MVT::i8) ? W65816::STA8absX
                                              : W65816::STA_AbsX;
            SDNode *Sta = DAG.getMachineNode(Opc, DL, StaVTs,
                                             {Sym, C2, Glue});
            return SDValue(Sta, 0);
          }
        }
      }
    }
    // i8 const-addr → STA8long (timm pattern); i16 const-addr →
    // STAabs (timm pattern, DBR-relative).  Wrap as TargetConstant so
    // LowerI32Constant doesn't re-enter and break the const-pattern
    // match.  i32 stores split into 2 i16 stores via LowerStore so they
    // come back through this combine as MemVT==i16.
    if (MemVT != MVT::i8 && MemVT != MVT::i16) return SDValue();
    if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
      SelectionDAG &DAG = DCI.DAG;
      SDLoc DL(N);
      SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
                                             Ptr.getValueType());
      return DAG.getTruncStore(St->getChain(), DL, St->getValue(), NewPtr,
                               MemVT, St->getMemOperand());
    }
  }
  if (N->getOpcode() == ISD::LOAD) {
    auto *Ld = cast<LoadSDNode>(N);
    EVT MemVT = Ld->getMemoryVT();
    EVT VT = Ld->getValueType(0);
    SDValue Ptr = Ld->getBasePtr();
    // Wide32-of-Wrapper-with-zero-hi → i16 Wrapper (companion to the
    // STORE side just above).  Lets `(load (Wrapper g))` → LDAabs fire.
    // Skip i32 loads — LowerLoad's i32 path does its own Ptr+2 ADD
    // arithmetic and would choke on a TargetConstant unwrap result.
    if (MemVT != MVT::i32) {
      if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) {
        SelectionDAG &DAG = DCI.DAG;
        SDLoc DL(N);
        return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
                              Ld->getChain(), I16Ptr, MemVT,
                              Ld->getMemOperand());
      }
      // Global+i16-idx fast path: Ptr is REG_SEQUENCE produced by
      // LowerI32Bin from `(add (Wrapper sym) (zext i16 idx))`.
      //   sub_lo = ADDC(Wrapper, idx) — operands are TargetExtractSubreg
      //                                 wrapping each side's Wide32
      //   sub_hi = ADDE(0, 0, carry) — ignored (idx fits in 16 bits,
      //                                 so any carry stays in bank)
      // Rewrite the LOAD to a CopyToReg($x, idx) + LDA_AbsX(sym)
      // sequence.  Saves ~45 bytes / ~70 cyc vs the 24-bit [dp],Y deref.
      // Correct under the data-bank invariant (DBR = global's bank).
      if ((MemVT == MVT::i16 || MemVT == MVT::i8) &&
          Ptr.getNode() && Ptr.isMachineOpcode() &&
          Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
        SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
        if (Lo && Lo.getOpcode() == ISD::ADDC) {
          auto lookThroughExtractSubLo = [](SDValue V) -> SDValue {
            if (V.getNode() && V.isMachineOpcode() &&
                V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) {
              SDValue Src = V.getOperand(0);
              if (Src.isMachineOpcode() &&
                  Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
                if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo))
                  return X;
              }
            }
            return V;
          };
          SDValue A = lookThroughExtractSubLo(Lo.getOperand(0));
          SDValue B = lookThroughExtractSubLo(Lo.getOperand(1));
          auto isWrapperGlobal = [](SDValue V) {
            if (V.getOpcode() != W65816ISD::Wrapper) return false;
            unsigned Op = V.getOperand(0).getOpcode();
            return Op == ISD::TargetGlobalAddress ||
                   Op == ISD::TargetExternalSymbol;
          };
          SDValue Sym, Idx;
          if (isWrapperGlobal(A))      { Sym = A.getOperand(0); Idx = B; }
          else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; }
          if (Sym && Idx.getValueType() == MVT::i16) {
            SelectionDAG &DAG = DCI.DAG;
            SDLoc DL(N);
            SDValue Chain = Ld->getChain();
            SDValue Glue;
            SDValue NewChain = DAG.getCopyToReg(Chain, DL, W65816::X, Idx,
                                                Glue);
            Glue = NewChain.getValue(1);
            SDVTList LdaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
            unsigned Opc = (MemVT == MVT::i8) ? W65816::LDA8absX
                                              : W65816::LDA_AbsX;
            SDNode *Lda = DAG.getMachineNode(Opc, DL, LdaVTs,
                                             {Sym, NewChain, Glue});
            SDValue LdaChain = SDValue(Lda, 0);
            SDValue LdaGlue  = SDValue(Lda, 1);
            // Read A as the original LOAD's result VT directly.  For
            // i8 LOAD with i8 VT: read i8.  For i8 LOAD with i16 VT
            // (zext/sext): read i16 (high byte is whatever was in $a
            // before — wrong for zext, fine for sext, depends on the
            // M=8 LDA behavior).  M=8 LDA only writes the low byte of
            // $a, leaving the high byte intact.  Safe wrt liveness
            // because we're reading $a immediately after SEP/REP
            // around the load, but the high byte is now whatever
            // pre-LDA value $a held — for zext we must mask it.
            SDValue Val = DAG.getCopyFromReg(LdaChain, DL, W65816::A,
                                             VT, LdaGlue);
            SDValue Chain2 = Val.getValue(1);
            if (MemVT == MVT::i8 && VT == MVT::i16) {
              if (Ld->getExtensionType() == ISD::ZEXTLOAD) {
                Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val,
                                  DAG.getConstant(0xFF, DL, MVT::i16));
              } else if (Ld->getExtensionType() == ISD::SEXTLOAD) {
                Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16,
                                  Val, DAG.getValueType(MVT::i8));
              }
              // EXTLOAD: high byte don't-care, leave alone.
            }
            return DAG.getMergeValues({Val, Chain2}, DL);
          }
        }
      }
    }
    // Only the i8 const-addr path has dedicated tablegen patterns
    // (LDA8long); skip i16 const-addr loads (no LDAabs imm pattern)
    // and i32 (would re-fire on the same node with different shape).
    if (MemVT != MVT::i8 || (VT != MVT::i8 && VT != MVT::i16))
      return SDValue();
    if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
      SelectionDAG &DAG = DCI.DAG;
      SDLoc DL(N);
      SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
                                             Ptr.getValueType());
      return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
                            Ld->getChain(), NewPtr, MemVT,
                            Ld->getMemOperand());
    }
  }

  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 &&
      !isTypeLegal(N->getValueType(0))) {
    if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
      uint64_t K = C->getZExtValue();
      if (K >= 1 && K <= 2) {
        SelectionDAG &DAG = DCI.DAG;
        SDValue X = N->getOperand(0);
        SDLoc DL(N);
        EVT VT = N->getValueType(0);
        SDValue R = X;
        for (uint64_t i = 0; i < K; ++i)
          R = DAG.getNode(ISD::ADD, DL, VT, R, R);
        return R;
      }
    }
  }

  return SDValue();
}

// Custom lowering for ISD::MUL on i32.  Tried in this order:
//
//   1. Multiply by a constant K where K-1 or K+1 is 2^N with N in 1..5:
//      expand inline to (X << N) +/- X.
//   2. Both operands narrowable to i16 (zext/anyext from i16, or high 16
//      bits provably zero): libcall to __umulhisi3 (16x16 -> 32) instead
//      of the heavier __mulsi3 (32x32 -> 32).  Saves the 32-bit arg
//      marshaling AND the 32-bit accumulator math inside the libcall —
//      roughly equivalent to Calypsi 5.16's `_Mul16`.
//   3. Otherwise: the standard __mulsi3 libcall.
//
// Op must be an i32 MUL node.  Returns the i32 value that replaces it.
SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // Assert on the expression itself so NDEBUG builds don't carry a local
  // that is only read by the assert.
  assert(Op.getValueType() == MVT::i32 && "LowerMUL_I32 expects i32");
  SDValue Lhs = Op.getOperand(0);
  SDValue Rhs = Op.getOperand(1);

  // Returns an i16 value whose zero-extension may stand in for V in the
  // multiply, or an empty SDValue when V cannot be narrowed.
  auto narrowToI16 = [&](SDValue V) -> SDValue {
    // Explicit zext-from-i16 (the IR-level form, before SDAG flattening).
    // ANY_EXTEND-from-i16 is accepted too: its high 16 bits are undefined,
    // so treating them as zero is one of the permitted choices.  (Only the
    // low 16 bits of the product are independent of those bits.)
    const unsigned Opc = V.getOpcode();
    if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
        V.getOperand(0).getValueType() == MVT::i16)
      return V.getOperand(0);
    // High 16 bits provably zero?
    KnownBits K = DAG.computeKnownBits(V);
    if (K.countMinLeadingZeros() >= 16)
      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, V);
    return SDValue();
  };

  // Builds a C-calling-convention libcall `i32 Name(ArgTy, ArgTy)` chained
  // to the entry node and returns its i32 result; the call's output chain
  // is dropped.
  auto emitMulLibcall = [&](const char *Name, Type *ArgTy, SDValue L,
                            SDValue R) -> SDValue {
    TargetLowering::ArgListTy Args;
    Args.push_back({L, ArgTy});
    Args.push_back({R, ArgTy});
    SDValue Callee =
        DAG.getExternalSymbol(Name, getPointerTy(DAG.getDataLayout()));
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(DL)
        .setChain(DAG.getEntryNode())
        .setLibCallee(CallingConv::C,
                      Type::getInt32Ty(*DAG.getContext()),
                      Callee, std::move(Args));
    return LowerCallTo(CLI).first;
  };

  // Mul-by-constant strength reduction: (X * K) where K-1 or K+1 is
  // a small power of 2 (shift count 1..5, matching our inlined i32
  // SHL range) expands to (X << N) +/- X — saves a __mulsi3 libcall
  // (~250 cyc) for ~70 cyc of inlined shift+add.  Catches djb2Hash's
  // `h * 33` = (h << 5) + h.
  //
  // Patterns covered:
  //   K = 2^N + 1 in {3,5,9,17,33}    → (X << N) + X
  //   K = 2^N - 1 in {1,7,15,31}      → (X << N) - X
  // (K = 3 is also 2^2 - 1, but the N=1 "+ X" form matches first; K = 1
  // matches the degenerate N=1 "- X" form.)
  // Larger N hits the i32 SHL libcall path (no longer profitable).
  if (auto *CN = dyn_cast<ConstantSDNode>(Rhs)) {
    const int64_t K = CN->getSExtValue();
    for (unsigned N = 1; N <= 5; ++N) {
      const int64_t Pow = int64_t{1} << N;
      const bool IsPlusOne = (K == Pow + 1);
      if (!IsPlusOne && K != Pow - 1)
        continue;
      // Build the shift-amount node only once a pattern has matched.
      SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs,
                                DAG.getConstant(N, DL, MVT::i16));
      return DAG.getNode(IsPlusOne ? ISD::ADD : ISD::SUB, DL, MVT::i32,
                         Shl, Lhs);
    }
  }

  // 16x16 -> 32 libcall when both operands narrow.  Rhs is examined only
  // after Lhs has narrowed, so a failing Lhs costs no known-bits query on
  // Rhs.
  if (SDValue A = narrowToI16(Lhs))
    if (SDValue B = narrowToI16(Rhs))
      return emitMulLibcall("__umulhisi3",
                            Type::getInt16Ty(*DAG.getContext()), A, B);

  // Fall back to the standard __mulsi3 libcall.
  return emitMulLibcall("__mulsi3", Type::getInt32Ty(*DAG.getContext()),
                        Lhs, Rhs);
}

// Translate a single-branch W65816CC condition code into its Bxx machine
// opcode.  Any code outside the eight listed below is a programming error
// and ends in llvm_unreachable.
static unsigned getBranchOpcodeForCC(unsigned CC) {
  unsigned BranchOpc = 0;
  switch (CC) {
  default:
    llvm_unreachable("invalid W65816 condition code");
  case W65816CC::COND_EQ:
    BranchOpc = W65816::BEQ;
    break;
  case W65816CC::COND_NE:
    BranchOpc = W65816::BNE;
    break;
  case W65816CC::COND_HS:
    BranchOpc = W65816::BCS;
    break;
  case W65816CC::COND_LO:
    BranchOpc = W65816::BCC;
    break;
  case W65816CC::COND_MI:
    BranchOpc = W65816::BMI;
    break;
  case W65816CC::COND_PL:
    BranchOpc = W65816::BPL;
    break;
  case W65816CC::COND_VS:
    BranchOpc = W65816::BVS;
    break;
  case W65816CC::COND_VC:
    BranchOpc = W65816::BVC;
    break;
  }
  return BranchOpc;
}

// Two-branch recipe for condition codes that a single Bxx cannot express.
// `First` is tested before `Second`.  For each branch the matching
// *ToTrue flag gives the destination when that branch is taken: TrueBB
// when the flag is set, FalseBB when it is clear.  When neither branch
// is taken, control falls through to FalseBB.
//
//   GT  : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB
//   LE  : (BMI || BEQ) → BEQ TrueBB;  BMI TrueBB;  fall-through FalseBB
//   HI  : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB
//   LS  : (BCC || BEQ) → BEQ TrueBB;  BCC TrueBB;  fall-through FalseBB
struct MultiBranch {
  unsigned First;     // Opcode of the branch tested first.
  unsigned Second;    // Opcode of the branch tested second.
  bool FirstToTrue;   // A taken First goes to TrueBB (else FalseBB).
  bool SecondToTrue;  // A taken Second goes to TrueBB (else FalseBB).
};
// Look up the two-branch sequence for a multi-branch condition code.
// Every supported code tests BEQ first and sends a taken second branch to
// the true side; the codes differ only in the second opcode and in where
// a taken BEQ goes.  Any other code ends in llvm_unreachable.
static MultiBranch getMultiBranch(unsigned CC) {
  unsigned SecondOpc = 0;
  bool EqGoesToTrue = false;
  switch (CC) {
  default:
    llvm_unreachable("not a multi-branch CC");
  case W65816CC::COND_GT_MB:
    SecondOpc = W65816::BPL;
    EqGoesToTrue = false;
    break;
  case W65816CC::COND_LE_MB:
    SecondOpc = W65816::BMI;
    EqGoesToTrue = true;
    break;
  case W65816CC::COND_HI_MB:
    SecondOpc = W65816::BCS;
    EqGoesToTrue = false;
    break;
  case W65816CC::COND_LS_MB:
    SecondOpc = W65816::BCC;
    EqGoesToTrue = true;
    break;
  }
  return MultiBranch{W65816::BEQ, SecondOpc, EqGoesToTrue, true};
}

// Expand a register-register pseudo into a spill of the right-hand
// operand followed by a frame-index form of the operation:
//
//   StoreOp rhs, <slot>, 0
//   [dst =] OpFI lhs, <slot>, 0
//
// A fresh 2-byte, 2-aligned spill slot is created on every call.  It is
// always the SECOND source that goes through memory, so non-commutative
// ops (sub, cmp) still compute lhs OP rhs.
//
// MI      - pseudo being expanded; erased before returning.
// BB      - block containing MI; new instructions go immediately before MI.
// StoreOp - opcode used to store the right-hand operand to the slot.
// OpFI    - opcode of the frame-index form of the operation.
// HasOut  - true when operand 0 of MI is a result register (sources are
//           then operands 1 and 2); false for CMP-style pseudos whose
//           sources are operands 0 and 1.
// Returns BB unchanged (no control flow is introduced).
static MachineBasicBlock *
emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp,
         unsigned OpFI, bool HasOut) {
  MachineFunction &Fn = *BB->getParent();
  const W65816InstrInfo &TII =
      *Fn.getSubtarget<W65816Subtarget>().getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const auto InsertPt = MI.getIterator();

  const int Slot = Fn.getFrameInfo().CreateStackObject(2, Align(2),
                                                       /*isSpillSlot=*/true);

  // The sources sit right after the optional result operand.
  const unsigned FirstSrcIdx = HasOut ? 1 : 0;
  const Register LhsReg = MI.getOperand(FirstSrcIdx).getReg();
  const Register RhsReg = MI.getOperand(FirstSrcIdx + 1).getReg();

  // Park the right-hand operand in the slot.
  BuildMI(*BB, InsertPt, DL, TII.get(StoreOp))
      .addReg(RhsReg)
      .addFrameIndex(Slot)
      .addImm(0);

  // Emit the memory-operand op, with or without a result def; the source
  // operands are the same either way.
  MachineInstrBuilder OpMIB =
      HasOut ? BuildMI(*BB, InsertPt, DL, TII.get(OpFI),
                       MI.getOperand(0).getReg())
             : BuildMI(*BB, InsertPt, DL, TII.get(OpFI));
  OpMIB.addReg(LhsReg).addFrameIndex(Slot).addImm(0);

  MI.eraseFromParent();
  return BB;
}

MachineBasicBlock *
W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                  MachineBasicBlock *BB) const {
  // The only opcode we currently emit with usesCustomInserter=1 is
  // SELECT_CC16.  Expand it into a diamond CFG with a PHI.  For
  // single-branch CCs:
  //
  //   thisMBB:
  //     ... CMP already emitted ...
  //     Bxx sinkMBB        ; branch to "true" path
  //     ; fall through to copy0MBB
  //   copy0MBB:
  //     ; (no instructions; PHI picks fval here)
  //   sinkMBB:
  //     dst = PHI [tval, thisMBB], [fval, copy0MBB]
  //
  // For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a
  // single Bxx isn't enough), insert two branches.  Both target either
  // sinkMBB or copy0MBB depending on the condition.
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter");
  case W65816::ADD_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true);
  case W65816::SUB_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true);
  // Carry-chain variants for the hi half of an i32 split.  STAfi doesn't
  // touch P, so the carry from the previous addc/adde survives the
  // spill and is consumed by ADCEfi/SBCEfi below.
  case W65816::ADDE_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true);
  case W65816::SUBE_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true);
  case W65816::AND_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true);
  case W65816::ORA_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true);
  case W65816::EOR_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true);
  case W65816::CMP_RR:
    return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false);
  case W65816::LDAptr32S:
  case W65816::STAptr32S:
  case W65816::STBptr32S: {
    // Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of
    // 1 Wide32 reg pair.  Used by the W65816LowerWide32 pre-RA pass
    // to dodge pair-allocation pressure.  Otherwise identical to
    // the LDAptr32 inserter below.
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    DebugLoc DL = MI.getDebugLoc();
    bool IsLoad = MI.getOpcode() == W65816::LDAptr32S;
    bool IsByteStore = MI.getOpcode() == W65816::STBptr32S;
    Register PtrLo = MI.getOperand(IsLoad ? 1 : 1).getReg();
    Register PtrHi = MI.getOperand(IsLoad ? 2 : 2).getReg();

    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

    // STA_DP's tablegen def has no implicit A Use, so without an
    // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
    // pairs the fast regalloc collapses two A-loads into one (the
    // first's value is overwritten before STA_DP can store it).  Add
    // implicit Use of A on the STA_DP to encode the dependency.  This
    // also helps post-RA passes track A liveness correctly.
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
            W65816::A).addFrameIndex(FILo).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE0)
        .addReg(W65816::A, RegState::Implicit);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
            W65816::A).addFrameIndex(FIHi).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE2)
        .addReg(W65816::A, RegState::Implicit);

    if (IsLoad) {
      Register Dst = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
    } else {
      Register Val = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::SEP)).addImm(0x20);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::REP)).addImm(0x20);
    }
    MI.eraseFromParent();
    return BB;
  }
  case W65816::LDAptr32:
  case W65816::STAptr32:
  case W65816::STBptr32: {
    // Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
    // pointer is a Wide32 register pair: sub_lo carries the low 16
    // bits of the address, sub_hi carries the bank byte in its low
    // half (high half is pad, ORCA convention).  Stage at $E0..$E2,
    // then [dp],Y addresses the right bank without forcing 0.
    //
    // MI-level peephole: if the Wide32 ptr is the sole user of a
    // `REG_SEQUENCE(ADCi16imm BaseLo K, sub_lo, ADCEi16imm BaseHi 0,
    // sub_hi)` chain (= `(add Wide32, K)` after ISel), peel the
    // offset and pass K via the Y register on the `[dp],Y` deref.
    // Saves ~3 instructions per access (the CLC/ADC/ADC carry chain).
    // The bank-wrap caveat from LDAptr32Off applies: Y addition does
    // NOT propagate beyond 16 bits, so the target object must not
    // span a bank boundary (true for malloc'd / globally-allocated
    // ptr32 objects; struct sizeof is far below 64KB).
    //
    // Doing this here rather than in LowerLoad / a SDAG combine avoids
    // the JSON-tokenizer + BST + sprintf smoke regressions those paths
    // tripped — the rewrites perturbed SDAG scheduling in ways that
    // bisection couldn't pin down.  At MI level, the rewrite is
    // structural: ADCi16imm/ADCEi16imm become dead and get DCE'd.
    //
    // Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
    // on i32 address type).
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    DebugLoc DL = MI.getDebugLoc();
    bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
    bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
    Register Ptr = MI.getOperand(IsLoad ? 1 : 1).getReg();
    // Try the ADC-chain peel.  We need:
    //   1. Ptr has exactly one use (this MI) — else other users still
    //      need the full computed Wide32, no net win.
    //   2. Ptr was defined by a REG_SEQUENCE.
    //   3. Sub_lo source is ADCi16imm BaseLoReg KLo.
    //   4. Sub_hi source is ADCEi16imm BaseHiReg 0.
    //   5. KLo > 0 and KLo fits 16-bit unsigned.
    Register PeelBaseLo, PeelBaseHi;
    int64_t PeelOff = 0;
    MachineInstr *DeadLoDef = nullptr;
    MachineInstr *DeadHiDef = nullptr;
    MachineInstr *DeadPtrDef = nullptr;
    SmallVector<MachineInstr *, 4> ExtraChainDeads;
    if (IsLoad && MRI.hasOneUse(Ptr)) {
      MachineInstr *PtrDef = MRI.getUniqueVRegDef(Ptr);
      if (PtrDef && PtrDef->getOpcode() == TargetOpcode::REG_SEQUENCE) {
        Register SubLoReg, SubHiReg;
        for (unsigned i = 1, e = PtrDef->getNumOperands(); i + 1 < e; i += 2) {
          unsigned SubIdx = PtrDef->getOperand(i + 1).getImm();
          Register R = PtrDef->getOperand(i).getReg();
          if (SubIdx == llvm::sub_lo) SubLoReg = R;
          else if (SubIdx == llvm::sub_hi) SubHiReg = R;
        }
        MachineInstr *LoDef = SubLoReg ? MRI.getUniqueVRegDef(SubLoReg)
                                       : nullptr;
        MachineInstr *HiDef = SubHiReg ? MRI.getUniqueVRegDef(SubHiReg)
                                       : nullptr;
        // We don't require SubLoReg/SubHiReg to be single-use: an
        // ADCi16imm result CSE'd across multiple users (e.g., `L+K`
        // also used as input to `(L+K)+M`) is fine — peeling THIS load
        // doesn't kill the original ADC chain (other users still need
        // it).  We only erase the chain if it's all single-use end-to-end.
        bool OuterSingleUse =
            MRI.hasOneUse(SubLoReg) && MRI.hasOneUse(SubHiReg);
        if (LoDef && HiDef &&
            LoDef->getOpcode() == W65816::ADCi16imm &&
            HiDef->getOpcode() == W65816::ADCEi16imm &&
            // ADCi16imm and ADCEi16imm must be in the same MBB so we
            // can verify nothing clobbers $p between them.
            LoDef->getParent() == HiDef->getParent()) {
          // Walk forward from LoDef to HiDef.  If any instr between
          // them defines $p, the ADCE reads a tampered carry and our
          // simple substitution would change semantics.
          bool PChainOK = true;
          for (auto It = std::next(LoDef->getIterator());
               It != HiDef->getIterator() && PChainOK; ++It) {
            for (const MachineOperand &MO : It->operands()) {
              if (MO.isReg() && MO.getReg() == W65816::P &&
                  MO.isDef() && !MO.isDead()) {
                PChainOK = false;
                break;
              }
            }
          }
          int64_t KLo = LoDef->getOperand(2).getImm();
          int64_t KHi = HiDef->getOperand(2).getImm();
          Register CandLo = LoDef->getOperand(1).getReg();
          Register CandHi = HiDef->getOperand(1).getReg();
          // Accept a vreg that's `COPY <phys-reg>` for any of the
          // arg/accumulator/index physregs.  This catches both incoming
          // function args ($a/$x at entry) AND values that came from
          // a preceding load (where the result was COPYed off $a).
          auto isFromArgCopy = [&](Register R) -> bool {
            if (!R.isVirtual()) return false;
            MachineInstr *Def = MRI.getUniqueVRegDef(R);
            if (!Def || !Def->isCopy()) return false;
            const MachineOperand &Src = Def->getOperand(1);
            if (!Src.isReg() || !Src.getReg().isPhysical()) return false;
            unsigned P = Src.getReg();
            return P == W65816::A || P == W65816::X || P == W65816::Y;
          };
          // A vreg is "from a fixed (caller-pushed) stack arg" if its
          // unique def is LDAfi against a fixed FrameIndex (negative
          // index in MachineFrameInfo).  Caller-pushed args live in
          // immutable slots, so reading them later is value-equivalent
          // to reading them at function entry.
          auto isFromFixedArgSlot = [&](Register R) -> bool {
            if (!R.isVirtual()) return false;
            MachineInstr *Def = MRI.getUniqueVRegDef(R);
            if (!Def || Def->getOpcode() != W65816::LDAfi) return false;
            const MachineOperand &FIOp = Def->getOperand(1);
            if (!FIOp.isFI()) return false;
            int FI = FIOp.getIndex();
            const MachineFrameInfo &MFI = MF->getFrameInfo();
            return MFI.isFixedObjectIndex(FI);
          };
          auto isFromArg = [&](Register R) -> bool {
            if (isFromArgCopy(R)) return true;
            if (isFromFixedArgSlot(R)) return true;
            if (!R.isVirtual()) return false;
            MachineInstr *Def = MRI.getUniqueVRegDef(R);
            if (!Def || !Def->isCopy()) return false;
            const MachineOperand &Src = Def->getOperand(1);
            if (!Src.isReg() || !Src.getReg().isVirtual()) return false;
            return isFromArgCopy(Src.getReg()) ||
                   isFromFixedArgSlot(Src.getReg());
          };
          // Recursive walk: nested ADC chains arise from i32-LOAD split
          // (high half loads at `Ptr+2`, where `Ptr` is itself `arg+K`).
          // Walk back, accumulating offset, until we reach an arg-base
          // OR exhaust the chain.
          //
          // We allow inner ADC results to have multiple users — this
          // happens when the SDAG CSEs `L+K` and reuses it as input to
          // `(L+K)+M`.  In that case, peeling THIS load doesn't kill
          // the inner ADC chain (other users still need it), so we
          // don't erase those inner Ms.  Only the outer-most chain
          // (single-use) and PtrDef are erased.
          //
          // Bisecting: try peeling whenever the chain reaches a
          // "stable" base — args, fixed-arg-slot loads, OR any vreg
          // (widest).  Wider gates have historically tripped a
          // FrameLowering-related smoke regression in sprintf.
          int64_t Off = KLo;
          bool ChainOK = (PChainOK && KHi == 0 && KLo > 0 && KLo <= 0xFFFF);
          // Cap on chain walks (avoid pathological deep chains).
          unsigned MaxChainDepth = 8;
          // Track per-layer "all single-use" status — only erase layers
          // up to the first non-single-use one.
          unsigned SingleUseLayers = OuterSingleUse ? 1 : 0;
          SmallVector<MachineInstr *, 6> ChainDeads;
          if (OuterSingleUse) {
            ChainDeads.push_back(LoDef);
            ChainDeads.push_back(HiDef);
          }
          // Narrow gate: walk back only until we reach an arg-base or
          // arg-slot base.  A truly wide gate (peel any chain regardless
          // of base) makes Lua ~+0.85% LARGER because each peel adds 4B
          // of stack-slot staging that exceeds the carry-chain savings
          // for deep-chain cases.  Tested 2026-05-25.
          while (ChainOK && MaxChainDepth-- > 0 &&
                 (!isFromArg(CandLo) || !isFromArg(CandHi))) {
            if (!CandLo.isVirtual() || !CandHi.isVirtual()) {
              ChainOK = false; break;
            }
            MachineInstr *InnerLo = MRI.getUniqueVRegDef(CandLo);
            MachineInstr *InnerHi = MRI.getUniqueVRegDef(CandHi);
            if (!InnerLo || !InnerHi ||
                InnerLo->getOpcode() != W65816::ADCi16imm ||
                InnerHi->getOpcode() != W65816::ADCEi16imm ||
                InnerLo->getParent() != InnerHi->getParent()) {
              ChainOK = false; break;
            }
            bool InnerSingleUse = MRI.hasOneUse(CandLo) && MRI.hasOneUse(CandHi);
            bool InnerPOK = true;
            for (auto It = std::next(InnerLo->getIterator());
                 It != InnerHi->getIterator() && InnerPOK; ++It) {
              for (const MachineOperand &MO : It->operands()) {
                if (MO.isReg() && MO.getReg() == W65816::P &&
                    MO.isDef() && !MO.isDead()) {
                  InnerPOK = false; break;
                }
              }
            }
            if (!InnerPOK) { ChainOK = false; break; }
            int64_t InnerKLo = InnerLo->getOperand(2).getImm();
            int64_t InnerKHi = InnerHi->getOperand(2).getImm();
            if (InnerKHi != 0) { ChainOK = false; break; }
            int64_t NewOff = Off + InnerKLo;
            if (NewOff > 0xFFFF) { ChainOK = false; break; }
            Off = NewOff;
            CandLo = InnerLo->getOperand(1).getReg();
            CandHi = InnerHi->getOperand(1).getReg();
            // Track whether this inner layer is erasable (all-single-use
            // from outer through here).
            if (InnerSingleUse && SingleUseLayers ==
                ChainDeads.size() / 2) {
              SingleUseLayers++;
              ChainDeads.push_back(InnerLo);
              ChainDeads.push_back(InnerHi);
            }
            // Even if not single-use, we keep walking back — the peel
            // is still correct (just doesn't kill the inner chain).
          }
          if (ChainOK && Off > 0 && Off <= 0xFFFF &&
              isFromArg(CandLo) && isFromArg(CandHi)) {
            PeelBaseLo = CandLo;
            PeelBaseHi = CandHi;
            PeelOff = Off;
            DeadPtrDef = PtrDef;
            // Only erase the ADC chain if it's all-single-use end to
            // end.  Otherwise leave it alive — other users need it.
            if (OuterSingleUse) {
              DeadLoDef = LoDef;
              DeadHiDef = HiDef;
              for (unsigned i = 2; i < ChainDeads.size(); ++i)
                ExtraChainDeads.push_back(ChainDeads[i]);
            }
          }
        }
      }
    }
    // Layer 2 fast path: -w65816-dbr-safe-ptrs assumes the bank byte
    // matches DBR, letting us skip $E0/$E2 staging entirely.  Emit just
    // a STAfi of sub_lo and an LDAfi_indY/STAfi_indY deref via the
    // 16-bit stack-rel-indirect-Y opcode (0xB3 / 0x93).  ~4 instr per
    // deref saved vs the heavy [dp],Y indirect-long path.
    if (DbrSafePtrs) {
      Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
      if (PeelOff) {
        BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
            .addReg(PeelBaseLo);
      } else {
        BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
            .addReg(Ptr, (RegState)0, llvm::sub_lo);
      }
      int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                      /*isSpillSlot=*/false);
      BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
          .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
      if (IsLoad) {
        Register Dst = MI.getOperand(0).getReg();
        // LDAfi_indY $dst, FILo — PEI resolves to LDA (FILo,S),Y.
        BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY),
                W65816::A).addFrameIndex(FILo).addImm(0);
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
      } else {
        Register Val = MI.getOperand(0).getReg();
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
        if (IsByteStore)
          BuildMI(*BB, MI.getIterator(), DL,
                  TII.get(W65816::SEP)).addImm(0x20);
        BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY))
            .addReg(W65816::A).addFrameIndex(FILo).addImm(0);
        if (IsByteStore)
          BuildMI(*BB, MI.getIterator(), DL,
                  TII.get(W65816::REP)).addImm(0x20);
      }
      MI.eraseFromParent();
      if (DeadPtrDef) DeadPtrDef->eraseFromParent();
      if (DeadLoDef)  DeadLoDef->eraseFromParent();
      if (DeadHiDef)  DeadHiDef->eraseFromParent();
      for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent();
      return BB;
    }

    // Extract the i16 sub-halves of the Wide32 ptr.  At custom-inserter
    // time Ptr is still a virtual register, so `TRI.getSubReg` won't
    // work (it's physreg-only).  Use COPY-with-subreg-index instead;
    // the regalloc + virtreg-rewriter resolves this to the right
    // physreg operand later.
    Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    if (PeelOff) {
      // Peeled path: pull base halves from the ADC chain's inputs.
      BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
          .addReg(PeelBaseLo);
      BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
          .addReg(PeelBaseHi);
    } else {
      BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
          .addReg(Ptr, (RegState)0, llvm::sub_lo);
      BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
          .addReg(Ptr, (RegState)0, llvm::sub_hi);
    }

    // Spill each half to a fresh slot, reload via LDAfi.  Same RA-
    // pinning rationale as the i16 LDAptr inserter.
    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

    // Change 3: $E0/$E2 staging CSE.  Look backward in this MBB for
    // the previous ptr32-deref expansion.  If its base halves match
    // ours (same vreg source) and nothing between has clobbered
    // $E0/$E2/$Y or the staged values, skip the LDAfi+STA_DP pairs
    // and reuse the previously-staged $E0..$E2.
    //
    // Inserter pattern signature (from below, latest-emitted first):
    //   STA_DP $E2 (impl A)
    //   LDAfi <FIHi'> -> A
    //   STA_DP $E0 (impl A)
    //   LDAfi <FILo'> -> A
    //   STAfi <srcHi'>, FIHi', 0     <- prior PtrHi
    //   STAfi <srcLo'>, FILo', 0     <- prior PtrLo
    bool ReuseStaging = false;
    {
      Register MySrcLo = PeelOff ? PeelBaseLo : Ptr;
      Register MySrcHi = PeelOff ? PeelBaseHi : Register();
      // For non-peel path, both halves come from `Ptr` via subreg; the
      // CSE check uses the whole Ptr vreg (so two LDAptr32 with the
      // same Ptr vreg can share staging).
      auto It = MI.getIterator();
      MachineInstr *PrevStaE2 = nullptr;
      MachineInstr *PrevLdaHi = nullptr;
      MachineInstr *PrevStaE0 = nullptr;
      MachineInstr *PrevLdaLo = nullptr;
      MachineInstr *PrevStaHi = nullptr;
      MachineInstr *PrevStaLo = nullptr;
      auto clobbersE0E2 = [&](MachineInstr &PrevMI) -> bool {
        // Any call clobbers everything in DP — including $E0..$E3.
        if (PrevMI.isCall()) return true;
        switch (PrevMI.getOpcode()) {
        // FrameLowering's long-indirect expansion of these uses $E2
        // as A-stash scratch (see W65816RegisterInfo.cpp).
        case W65816::ADCfi: case W65816::ADCEfi:
        case W65816::ANDfi: case W65816::ORAfi: case W65816::EORfi:
        case W65816::SBCfi: case W65816::SBCEfi:
        case W65816::CMPfi:
          return true;
        case W65816::STA_DP:
        case W65816::STZ_DP:
          if (PrevMI.getOperand(0).isImm()) {
            int64_t Imm = PrevMI.getOperand(0).getImm();
            if (Imm == 0xE0 || Imm == 0xE1 ||
                Imm == 0xE2 || Imm == 0xE3)
              return true;
          }
          break;
        }
        return false;
      };
      // Scan back, fail-soft.
      const unsigned MaxScan = 60;
      unsigned Scanned = 0;
      while (It != BB->begin() && Scanned++ < MaxScan) {
        --It;
        MachineInstr &P = *It;
        if (!PrevStaE2) {
          if (P.getOpcode() == W65816::STA_DP &&
              P.getOperand(0).isImm() &&
              P.getOperand(0).getImm() == 0xE2) {
            PrevStaE2 = &P;
            continue;
          }
          if (clobbersE0E2(P)) break;
          continue;
        }
        // After PrevStaE2, expect LDAfi <FIHi'>.
        if (!PrevLdaHi) {
          if (P.getOpcode() == W65816::LDAfi) { PrevLdaHi = &P; continue; }
          break;
        }
        if (!PrevStaE0) {
          if (P.getOpcode() == W65816::STA_DP &&
              P.getOperand(0).isImm() &&
              P.getOperand(0).getImm() == 0xE0) {
            PrevStaE0 = &P;
            continue;
          }
          break;
        }
        if (!PrevLdaLo) {
          if (P.getOpcode() == W65816::LDAfi) { PrevLdaLo = &P; continue; }
          break;
        }
        // Now look for STAfi srcHi', FIHi' and STAfi srcLo', FILo'.
        // They appear in either order; the inserter above emits Lo first
        // then Hi, but scanning back, we hit Hi first.
        if (!PrevStaHi) {
          if (P.getOpcode() == W65816::STAfi &&
              P.getOperand(1).isFI() &&
              P.getOperand(1).getIndex() ==
                  PrevLdaHi->getOperand(1).getIndex()) {
            PrevStaHi = &P;
            continue;
          }
          break;
        }
        if (!PrevStaLo) {
          if (P.getOpcode() == W65816::STAfi &&
              P.getOperand(1).isFI() &&
              P.getOperand(1).getIndex() ==
                  PrevLdaLo->getOperand(1).getIndex()) {
            PrevStaLo = &P;
            // Done with the structural match — fall through to operand
            // comparison.
          }
          break;
        }
      }
      if (PrevStaLo && PrevStaHi) {
        Register PrevSrcLo = PrevStaLo->getOperand(0).getReg();
        Register PrevSrcHi = PrevStaHi->getOperand(0).getReg();
        // Match if the source vregs are identical to mine.  For non-peel
        // path, PtrLo/PtrHi were freshly created via COPY from Ptr.sub_*
        // — match by tracing PrevSrcLo/Hi back through their COPY (if
        // any) to the Ptr vreg.
        auto traceToPtr = [&](Register R) -> Register {
          if (!R.isVirtual()) return R;
          MachineInstr *D = MRI.getUniqueVRegDef(R);
          while (D && D->isCopy()) {
            const MachineOperand &S = D->getOperand(1);
            if (!S.isReg() || !S.getReg().isVirtual()) break;
            R = S.getReg();
            D = MRI.getUniqueVRegDef(R);
            // For subreg copies, stop — we'd lose sub-half info.
            if (D && D->getOpcode() == TargetOpcode::REG_SEQUENCE) break;
          }
          return R;
        };
        Register MyTraceLo = traceToPtr(PeelOff ? PeelBaseLo : PtrLo);
        Register MyTraceHi = traceToPtr(PeelOff ? PeelBaseHi : PtrHi);
        Register PrevTraceLo = traceToPtr(PrevSrcLo);
        Register PrevTraceHi = traceToPtr(PrevSrcHi);
        if (MyTraceLo == PrevTraceLo && MyTraceHi == PrevTraceHi &&
            MyTraceLo.isValid() && MyTraceHi.isValid()) {
          ReuseStaging = true;
        }
      }
      (void)MySrcLo; (void)MySrcHi;  // not used directly; trace covers
    }

    // Stage the 24-bit address at $E0..$E2 unless CSE allows reusing
    // the previous staging.
    // STA_DP's tablegen def has no implicit A Use, so without an
    // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
    // pairs the fast regalloc collapses two A-loads into one (the
    // first's value is overwritten before STA_DP can store it).  Add
    // implicit Use of A on the STA_DP to encode the dependency.  This
    // also helps post-RA passes track A liveness correctly.
    if (!ReuseStaging) {
      BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
              W65816::A).addFrameIndex(FILo).addImm(0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DP)).addImm(0xE0)
          .addReg(W65816::A, RegState::Implicit);
      BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
              W65816::A).addFrameIndex(FIHi).addImm(0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DP)).addImm(0xE2)
          .addReg(W65816::A, RegState::Implicit);
    }

    if (IsLoad) {
      Register Dst = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
    } else {
      Register Val = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::SEP)).addImm(0x20);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::REP)).addImm(0x20);
    }
    MI.eraseFromParent();
    if (DeadPtrDef) DeadPtrDef->eraseFromParent();
    if (DeadLoDef)  DeadLoDef->eraseFromParent();
    if (DeadHiDef)  DeadHiDef->eraseFromParent();
    for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent();
    return BB;
  }
  case W65816::LDAptr32Off:
  case W65816::STAptr32Off:
  case W65816::STBptr32Off: {
    // ptr32 deref with constant offset.  The 65816's `[dp],Y` adds Y
    // to the 24-bit pointer at `dp..dp+2` to form the effective
    // address — so we can stage the RAW pointer at $E0..$E2 and put
    // the offset in Y, skipping the i32-add carry chain entirely.
    //
    // Operands: 0 = loaded value (LDAptr32Off) or value to store
    // (STAptr32Off / STBptr32Off), 1 = Wide32 pointer vreg,
    // 2 = immediate offset.
    //
    // Saves ~3 instructions per access vs the previous approach
    // (which did `lo+off; hi+carry` to compute the pointer then
    // derefed with Y=0).  Big win on heavy struct-field code like
    // Lua's lapi.c.  See memory: ptr32-deref-fold-layer1-mi-opcodes.
    //
    // Addressing caveat: the bytes staged at $E0..$E2 are never
    // modified; the offset only enters the effective address through
    // Y.  `[dp],Y` adds the 16-bit *unsigned* Y to the full 24-bit
    // pointer (the LDAptrOff inserter's comment relies on the same
    // fact to explain why it cannot put a negative offset in Y), so
    // for non-negative offsets the result matches flat pointer
    // arithmetic.
    // NOTE(review): Off is handed to LDY_Imm16 as-is, so this
    // expansion assumes the selection pattern only produces offsets
    // in [0, 0xFFFF] — confirm against the LDAptr32Off patterns in
    // InstrInfo.td.  Objects are still expected not to straddle
    // $XX:FFFF.
    //
    // Dead unless ptr32 mode is active.
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    DebugLoc DL = MI.getDebugLoc();
    bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off;
    bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off;
    Register Ptr = MI.getOperand(1).getReg();
    int64_t Off = MI.getOperand(2).getImm();
    // See LDAptr32 inserter above: vreg sub-regs need COPY-with-subreg
    // (TRI.getSubReg is physreg-only at custom-inserter time).
    Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
        .addReg(Ptr, (RegState)0, llvm::sub_lo);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
        .addReg(Ptr, (RegState)0, llvm::sub_hi);

    // Spill each half to its own fresh 2-byte slot and reload it via
    // LDAfi, mirroring the LDAptr32 inserter.
    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

    // Both STA_DP below carry an implicit use of A.  Per the LDAptr32
    // inserter's note, STA_DP's tablegen def has no implicit A Use, so
    // without the operand nothing ties each LDAfi's A def to the store
    // that consumes it and the back-to-back LDAfi/STA_DP pairs can be
    // collapsed.  Keep this sequence in sync with the LDAptr32 one.

    // ptr_lo -> $E0..$E1 (no offset add)
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
            W65816::A).addFrameIndex(FILo).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE0)
        .addReg(W65816::A, RegState::Implicit);

    // ptr_hi -> $E2..$E3 (no carry propagation needed)
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
            W65816::A).addFrameIndex(FIHi).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE2)
        .addReg(W65816::A, RegState::Implicit);

    if (IsLoad) {
      // Y = offset; A = [$E0],Y; then hand A to the result vreg.
      Register Dst = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(Off);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
    } else {
      // A = value (copied only after the staging above has finished
      // using A), Y = offset, then store through [$E0],Y.
      Register Val = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(Off);
      // Byte stores bracket the STA with SEP/REP #$20 so only one
      // byte is written.
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::SEP)).addImm(0x20);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::REP)).addImm(0x20);
    }
    MI.eraseFromParent();
    return BB;
  }
  case W65816::LDAptrOff:
  case W65816::STAptrOff:
  case W65816::STBptrOff: {
    // Pointer access with a constant offset.  Folds the offset into
    // the pointer (CLC; ADC #off in A) BEFORE staging at $E0..$E2,
    // then accesses via [$E0],Y with Y=0.  We can't fold into Y
    // because [dp],Y on the W65816 adds Y to the full 24-bit pointer
    // — for a negative Y like 0xFFFE (= -2 signed), the addition
    // crosses into bank 1.  Folding into the pointer keeps the add
    // at 16-bit (in A), so no carry ever reaches the bank byte staged
    // at $E2 (0 by default, or the value of DP $BE when
    // -w65816-loader-bank-deref is set).
    //
    // Operands: 0 = loaded value (LDAptrOff) or value to store
    // (STAptrOff / STBptrOff), 1 = 16-bit pointer vreg, 2 = immediate
    // offset.
    //
    // DBR-independent — see LDAptr/STAptr/STBptr.
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    DebugLoc DL = MI.getDebugLoc();
    bool IsLoad = MI.getOpcode() == W65816::LDAptrOff;
    bool IsByteStore = MI.getOpcode() == W65816::STBptrOff;
    Register Ptr = MI.getOperand(1).getReg();
    int64_t Off = MI.getOperand(2).getImm();

    // Spill the pointer vreg to a fresh 2-byte stack slot, then
    // reload via LDAfi.  Forces RA to materialize the source — see
    // the LDAptr/STAptr/STBptr case below for the full rationale.
    int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(Ptr).addFrameIndex(FI).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
            W65816::A).addFrameIndex(FI).addImm(0);

    // Compute ptr + off in A.  CLC + ADC for the add.
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC));
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::ADC_Imm16)).addImm(Off);
    // STA_DP stores A, but (per the note in the LDAptr32 inserter) its
    // tablegen def has no implicit A Use.  Add the use explicitly so
    // the A value computed above is visibly consumed here, matching
    // what the LDAptr32 inserter emits.
    // NOTE(review): if STA_DP ever gains `Uses = [A]` in InstrInfo.td
    // these explicit operands become redundant.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE0)
        .addReg(W65816::A, RegState::Implicit);
    if (LoaderBankDeref) {
      // Bank byte from $BE (crt0-initialised) — Loader compat path.
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DP)).addImm(0xBE);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DP)).addImm(0xE2)
          .addReg(W65816::A, RegState::Implicit);
    } else {
      // Default: force the bank byte to zero.
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STZ_DP)).addImm(0xE2);
    }

    if (IsLoad) {
      // Y = 0; A = [$E0],Y; then hand A to the result vreg.
      Register Dst = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
    } else {
      // A = value (copied only after the address staging is done with
      // A), Y = 0, then store through [$E0],Y.
      Register Val = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(0);
      // Byte stores bracket the STA with SEP/REP #$20 so only one
      // byte is written.
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::SEP)).addImm(0x20);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::REP)).addImm(0x20);
    }
    MI.eraseFromParent();
    return BB;
  }
  case W65816::LDAptr:
  case W65816::LDAptrBank0:
  case W65816::STAptr:
  case W65816::STBptr: {
    // Pointer load/store via [dp],Y indirect-long (opcodes 0xB7 / 0x97):
    //   STA $E0           ; pointer low/hi at $E0..$E1
    //   STZ $E2           ; bank byte at $E2 = 0
    //   LDY #0
    //   LDA [$E0], Y      ; bank 0:ptr + 0
    //   STA [$E0], Y
    // Bank-explicit ZERO — DBR-independent.  Both the runInMame stack
    // ($00:0FFF down) and BSS / heap globals (placed at $00:xxxx) live
    // in bank 0, so pointer-derefs always reach the right memory even
    // when the user has switched DBR for a bank-2 store via `pha;plb`.
    //
    // Trade-off: under GS/OS Loader the user's data lives in their bank
    // (not bank 0), so library functions that write directly to globals
    // via `sta abs` (DBR-relative, lands in user bank) and user code that
    // reads via pointer-deref (lands in bank 0 by this lowering) get
    // INCONSISTENT results — silent miscompile.  gmtime hit this with
    // its __gmtimeBuf static.  Workaround for affected library code:
    // launder the buffer pointer through inline asm (see gmtime in
    // runtime/src/timeExt.c) so clang doesn't IPSCCP-fold it; the writes
    // then go via [dp],Y too and match the user reads.  Alternatively
    // build with -w65816-loader-bank-deref, which takes the bank byte
    // from DP $BE instead of forcing it to 0 (see below).
    //
    // Const-int pointers (`*(volatile uint16 *)0x5000 = v`) are NOT
    // lowered through this pseudo — TableGen patterns route them to
    // STAlong / STA8long / STAabs by type.  See InstrInfo.td.
    //
    // We use $E0..$E2 in libcall-scratch DP — safe because the
    // pseudo expansion is a leaf (no calls between SEP and STA),
    // and any subsequent libcall reinitialises its own scratch.
    //
    // Why [dp],Y not abs-long-X (`STA $0,X`)?  abs-long-X is shorter
    // (~3 bytes less) but uses X to hold the pointer.  In high-
    // pressure functions like the recursive expression parser, X
    // is often live with another value, and forcing X to be free
    // for every pointer-deref triggered "ran out of registers".
    // [dp],Y uses A and Y only — leaves X for spill-bridge use.
    //
    // STBptr (truncating i8 store) wraps the actual STA in SEP/REP
    // so M=8 across the store and only one byte is written.
    //
    // Operands: 0 = loaded value (LDAptr / LDAptrBank0) or value to
    // store (STAptr / STBptr), 1 = 16-bit pointer vreg.
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    DebugLoc DL = MI.getDebugLoc();
    bool IsLoad = MI.getOpcode() == W65816::LDAptr ||
                  MI.getOpcode() == W65816::LDAptrBank0;
    bool IsByteStore = MI.getOpcode() == W65816::STBptr;
    // LDAptrBank0 hardcodes bank=0 (STZ $E2) regardless of LoaderBankDeref.
    // Used by va_arg under Loader where the deref is a stack pointer
    // (= bank 0 always on W65816) but $BE points to our code bank.
    bool ForceBank0 = MI.getOpcode() == W65816::LDAptrBank0;

    Register Ptr = MI.getOperand(1).getReg();

    // Why we spill the pointer to a fresh stack slot first:
    // a direct `COPY $a = ptr_vreg ; STA $E0` lets RA elide the COPY
    // when ptr_vreg is already allocated to A.  In a loop body where
    // multiple Acc16 PHIs (pointer + accumulator) compete for A, the
    // PHI elimination pass picks one to be in A at the bottom of the
    // block and silently drops the COPY needed to refresh A with the
    // OTHER value at the top of the next iteration — silent miscompile
    // (sumTable read its own accumulator as the pointer on iter 2+).
    // STAfi forces RA to materialize ptr_vreg's value so it gets stored
    // to the slot, then LDAfi reads it back as a real machine load.
    int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
        .addReg(Ptr).addFrameIndex(FI).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
            W65816::A).addFrameIndex(FI).addImm(0);

    // STA_DP stores A, but (per the note in the LDAptr32 inserter) its
    // tablegen def has no implicit A Use.  Add the use explicitly so
    // the LDAfi's A def is visibly consumed by the store, matching
    // what the LDAptr32 inserter emits.
    // NOTE(review): if STA_DP ever gains `Uses = [A]` in InstrInfo.td
    // these explicit operands become redundant.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE0)
        .addReg(W65816::A, RegState::Implicit);
    if (LoaderBankDeref && !ForceBank0) {
      // Bank byte from $BE (crt0-initialised) — Loader compat path.
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DP)).addImm(0xBE);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DP)).addImm(0xE2)
          .addReg(W65816::A, RegState::Implicit);
    } else {
      // Default (and always for LDAptrBank0): bank byte = 0.
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STZ_DP)).addImm(0xE2);
    }

    if (IsLoad) {
      // Y = 0; A = [$E0],Y; then hand A to the result vreg.
      Register Dst = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
    } else {
      // A = value (copied only after the address staging is done with
      // A), Y = 0, then store through [$E0],Y.
      Register Val = MI.getOperand(0).getReg();
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::LDY_Imm16)).addImm(0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::SEP)).addImm(0x20);
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
      if (IsByteStore)
        BuildMI(*BB, MI.getIterator(), DL,
                TII.get(W65816::REP)).addImm(0x20);
    }
    MI.eraseFromParent();
    return BB;
  }
  case W65816::SELECT_CC8:
  case W65816::SELECT_CC16: {
    // Expand a select pseudo into explicit control flow plus a PHI.
    // Operands: 0 = result, 1 = value when the condition holds,
    // 2 = value when it does not, 3 = W65816CC condition code.
    // Returns the new join block (sinkMBB), which receives everything
    // that followed the pseudo in the original block.
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    DebugLoc DL = MI.getDebugLoc();
    const BasicBlock *IRBlock = BB->getBasicBlock();
    MachineFunction::iterator InsertPt = ++BB->getIterator();

    // New blocks are placed right after the current one, in layout
    // order: thisMBB, copy0MBB, sinkMBB.
    MachineBasicBlock *thisMBB  = BB;
    MachineBasicBlock *copy0MBB = MF->CreateMachineBasicBlock(IRBlock);
    MachineBasicBlock *sinkMBB  = MF->CreateMachineBasicBlock(IRBlock);
    MF->insert(InsertPt, copy0MBB);
    MF->insert(InsertPt, sinkMBB);

    // Everything after the pseudo, together with the block's
    // successors and their PHI references, moves to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    Register TValReg = MI.getOperand(1).getReg();
    Register FValReg = MI.getOperand(2).getReg();
    unsigned CC = MI.getOperand(3).getImm();

    // Returns the defining instruction of `R` when it is a constant
    // LDA (LDAi16imm / LDAi8imm with an immediate operand 1) that sits
    // in thisMBB and whose result has exactly one non-debug use;
    // otherwise nullptr.
    auto hoistableConstDef = [&](Register R) -> MachineInstr * {
      if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R))
        return nullptr;
      MachineInstr *Def = MRI.getUniqueVRegDef(R);
      if (!Def || Def->getParent() != thisMBB)
        return nullptr;
      unsigned DefOpc = Def->getOpcode();
      if (DefOpc != W65816::LDAi16imm && DefOpc != W65816::LDAi8imm)
        return nullptr;
      if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm())
        return nullptr;
      return Def;
    };

    // Moves the constant LDA defining `R` (if it qualifies, see above)
    // to the front of `Dst`; does nothing otherwise.
    auto hoistConstInit = [&](Register R, MachineBasicBlock *Dst) {
      if (MachineInstr *Def = hoistableConstDef(R)) {
        Def->removeFromParent();
        Dst->insert(Dst->begin(), Def);
      }
    };

    // Predecessor through which the PHI receives TValReg; the
    // 4-block form below redirects it to the dedicated tval block.
    MachineBasicBlock *TValPred = thisMBB;

    const bool SingleBranchCC = CC < W65816CC::COND_GT_MB;
    if (SingleBranchCC && hoistableConstDef(TValReg) &&
        hoistableConstDef(FValReg)) {
      // 4-block diamond: thisMBB has only the test (CMP) and Bxx; the
      // tval and fval LDAs each live in their own destination block,
      // which is reached only via the branch — so neither LDA's flag
      // side-effect can corrupt the CMP→Bxx test window.  This is the
      // proper fix for the "LDA between CMP and Bxx" bug catalogued in
      // project_known_issue_lda_flags.md (replacing the earlier 3-block
      // workaround that only hoisted fval).
      //
      //   thisMBB:  ...; CMP; Bxx tvalMBB
      //   copy0MBB: LDA #fval; BRA sinkMBB    (FALSE path)
      //   tvalMBB:  LDA #tval                (TRUE path; falls to sink)
      //   sinkMBB:  PHI [tval from tvalMBB, fval from copy0MBB]
      MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(IRBlock);
      MF->insert(sinkMBB->getIterator(), tvalMBB);
      BB->addSuccessor(copy0MBB);
      BB->addSuccessor(tvalMBB);
      copy0MBB->addSuccessor(sinkMBB);
      tvalMBB->addSuccessor(sinkMBB);
      unsigned BrOp = getBranchOpcodeForCC(CC);
      BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB);
      BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB);
      // The LDAs go to the FRONT of their blocks, so copy0MBB ends up
      // as `LDA #fval; BRA sinkMBB`.
      hoistConstInit(TValReg, tvalMBB);
      hoistConstInit(FValReg, copy0MBB);
      TValPred = tvalMBB;
    } else {
      // 3-block diamond: keep the existing layout and (where possible)
      // hoist fval into copy0MBB.  Used when one or both operands are
      // computed values (not constants), or when the multi-branch CC
      // requires two Bxx in thisMBB.
      BB->addSuccessor(copy0MBB);
      BB->addSuccessor(sinkMBB);
      if (SingleBranchCC) {
        // One Bxx straight to the join when the condition holds;
        // otherwise fall through into copy0MBB.
        unsigned BrOp = getBranchOpcodeForCC(CC);
        BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB);
      } else {
        // Two-branch condition: each Bxx targets the join (true) or
        // copy0MBB (false) as described by the MultiBranch record.
        MultiBranch MB = getMultiBranch(CC);
        MachineBasicBlock *FirstTarget =
            MB.FirstToTrue ? sinkMBB : copy0MBB;
        MachineBasicBlock *SecondTarget =
            MB.SecondToTrue ? sinkMBB : copy0MBB;
        BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(FirstTarget);
        BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(SecondTarget);
      }
      copy0MBB->addSuccessor(sinkMBB);
      hoistConstInit(FValReg, copy0MBB);
    }

    // Join: pick tval when arriving from the true-side predecessor,
    // fval when arriving from copy0MBB.
    BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
            MI.getOperand(0).getReg())
        .addReg(TValReg).addMBB(TValPred)
        .addReg(FValReg).addMBB(copy0MBB);

    MI.eraseFromParent();
    return sinkMBB;
  }
  }
}