//===-- W65816InstrInfo.td - W65816 Instruction defs -------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// W65816 instruction description.  This file defines the MC-layer instruction
// encodings for the core 65816 instruction set.  DAG-selection patterns will
// be added incrementally on top of these MC instructions.
//
//===----------------------------------------------------------------------===//

include "W65816InstrFormats.td"

//===----------------------------------------------------------------------===//
// Type Profiles
//===----------------------------------------------------------------------===//
// Call: no results, variable operand count; operand 0 is pointer-typed.
def SDT_W65816Call : SDTypeProfile<0, -1, [
  SDTCisVT<0, iPTR>
]>;

// Call-sequence begin / end markers: two i16 operands each.
def SDT_W65816CallSeqStart : SDCallSeqStart<[
  SDTCisVT<0, i16>, SDTCisVT<1, i16>
]>;
def SDT_W65816CallSeqEnd : SDCallSeqEnd<[
  SDTCisVT<0, i16>, SDTCisVT<1, i16>
]>;

// Address wrapper: one pointer-typed result whose type equals the operand's.
def SDT_W65816Wrapper : SDTypeProfile<1, 1, [
  SDTCisSameAs<0, 1>, SDTCisPtrTy<0>
]>;

// Compare: two integer operands of the same type.  The type is left open
// so that both i16 and i8 comparisons are accepted.
def SDT_W65816Cmp : SDTypeProfile<0, 2, [
  SDTCisSameAs<0, 1>, SDTCisInt<0>
]>;

// Conditional branch: operand 0 is OtherVT, operand 1 is an i8.
def SDT_W65816BrCC : SDTypeProfile<0, 2, [
  SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>
]>;

//===----------------------------------------------------------------------===//
// W65816-specific SDNodes
//===----------------------------------------------------------------------===//
// Return: chained, variadic, may consume an incoming glue.
def W65816retglue : SDNode<
    "W65816ISD::RET_GLUE", SDTNone,
    [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

// Call: chained, variadic, may consume glue and always produces glue.
def W65816call : SDNode<
    "W65816ISD::CALL", SDT_W65816Call,
    [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;

// Target-typed views of the generic call-sequence begin / end nodes.
def W65816callseq_start : SDNode<
    "ISD::CALLSEQ_START", SDT_W65816CallSeqStart,
    [SDNPHasChain, SDNPOutGlue]>;
def W65816callseq_end : SDNode<
    "ISD::CALLSEQ_END", SDT_W65816CallSeqEnd,
    [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

// Address wrapper node (no extra node properties).
def W65816Wrapper : SDNode<"W65816ISD::Wrapper", SDT_W65816Wrapper>;

// Comparison.  Its only output is a Glue value that carries the processor
// flags to the consumer.
def W65816cmp : SDNode<"W65816ISD::CMP", SDT_W65816Cmp, [SDNPOutGlue]>;

// Conditional branch.  Operands are (Chain, Dest, CC) plus the Glue
// produced by a W65816cmp.
def W65816brcc : SDNode<
    "W65816ISD::BR_CC", SDT_W65816BrCC,
    [SDNPHasChain, SDNPInGlue]>;

// Stack pushes used by LowerCall to pass extra arguments.  Neither node has
// explicit operands: the value to push arrives through the incoming glue
// (a CopyToReg of the source register), and each node yields Chain + Glue.
// Both are flagged as side-effecting stores because they change SP and
// write memory.  In 16-bit M mode a push writes 2 bytes and decrements SP
// by 2; the call's ADJCALLSTACKUP pseudo unwinds those bytes with
// tsc;clc;adc #N;tcs after the JSL returns.

// Push A: the value is glued in from CopyToReg(A).
def W65816push : SDNode<
    "W65816ISD::PUSH", SDTNone,
    [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPSideEffect, SDNPMayStore]>;

// Push X: same shape, but the value is glued in from CopyToReg(X).
def W65816pushx : SDNode<
    "W65816ISD::PUSH_X", SDTNone,
    [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPSideEffect, SDNPMayStore]>;


// SELECT_CC.  Operands are (TVal, FVal, CC); the flags to test arrive as
// glue from a preceding W65816cmp.  EmitInstrWithCustomInserter lowers it
// to the CMP (already present in the BB) + Bxx + a diamond CFG + PHI.
// The result and both value operands share one type; CC is an i8.
def SDT_W65816SelectCC : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i8>
]>;
def W65816selectcc : SDNode<
    "W65816ISD::SELECT_CC", SDT_W65816SelectCC, [SDNPInGlue]>;

// Dynamic stack allocation.  Consumes (chain, size:i16) and produces
// (ptr:i16, chain).  The AsmPrinter lowers it to
// TSC; SEC; SBC size; TCS; INC A.  See LowerDynamicStackalloc.
def SDT_W65816Alloca : SDTypeProfile<1, 1, [
  SDTCisVT<0, i16>, SDTCisVT<1, i16>
]>;
def W65816alloca : SDNode<
    "W65816ISD::ALLOCA", SDT_W65816Alloca,
    [SDNPHasChain, SDNPSideEffect]>;

// ptr32 load / store nodes.  These are target-specific memory nodes that
// take a 32-bit pointer (Wide32 = i32) and lower to [dp],Y indirect-long
// addressing, with the bank byte taken from the pointer's hi-half.  In
// ptr32 mode a generic (load i32-addr) needs explicit lowering; wrapping
// it in a target node keeps DAG combines from rewriting the access before
// isel.
//
// A load always materialises an i16 in A (16-bit LDA); byte zext / anyext
// patterns AND-mask the result afterwards, exactly as the existing LDAptr
// does.  Stores come in two flavours: ST_PTR is a full 16-bit STA, and
// STB_PTR is a SEP/REP-wrapped 8-bit STA used for truncating stores.
def SDT_W65816LdPtr : SDTypeProfile<1, 1, [
  SDTCisVT<0, i16>, SDTCisVT<1, i32>
]>;
def SDT_W65816StPtr : SDTypeProfile<0, 2, [
  SDTCisVT<0, i16>, SDTCisVT<1, i32>
]>;

def W65816ldPtr : SDNode<
    "W65816ISD::LD_PTR", SDT_W65816LdPtr,
    [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def W65816stPtr : SDNode<
    "W65816ISD::ST_PTR", SDT_W65816StPtr,
    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def W65816stbPtr : SDNode<
    "W65816ISD::STB_PTR", SDT_W65816StPtr,
    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

// va_arg stack-pointer dereference: a load that is explicitly bank 0.
// The 65816 stack is hardwired to bank 0 and va_arg's `ap` is always a
// stack pointer.  Under the Loader, $BE points to OUR bank, whereas va_arg
// needs bank 0, so LowerVAARG emits this opcode and its pattern routes to
// LDAptrBank0 (the bank-0-hardcoded variant of LDAptr).
def SDT_W65816VAArgLoad : SDTypeProfile<1, 1, [
  SDTCisVT<0, i16>, SDTCisVT<1, i16>
]>;
def W65816vaargLoad : SDNode<
    "W65816ISD::VAARG_LOAD", SDT_W65816VAArgLoad,
    [SDNPHasChain, SDNPMayLoad]>;

//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//

// Call-frame setup / teardown markers.  Each one is selected from the
// matching callseq node and is modelled as both reading and writing SP.
let Uses = [SP], Defs = [SP] in {
  def ADJCALLSTACKDOWN : W65816Pseudo<
      (outs), (ins i16imm:$amt1, i16imm:$amt2),
      "# ADJCALLSTACKDOWN $amt1 $amt2",
      [(W65816callseq_start timm:$amt1, timm:$amt2)]>;

  def ADJCALLSTACKUP : W65816Pseudo<
      (outs), (ins i16imm:$amt1, i16imm:$amt2),
      "# ADJCALLSTACKUP $amt1 $amt2",
      [(W65816callseq_end timm:$amt1, timm:$amt2)]>;
}

// ADDframe is the LEA equivalent: it places the address of a stack slot
// (SP + frame_offset + offset) in A.  It has no DAG pattern;
// W65816DAGToDAGISel::Select picks it for a bare ISD::FrameIndex SDValue,
// and eliminateFrameIndex later expands it to TSC + CLC + ADC #disp.
// The result class is Acc16 because the address ends up in A; PtrRegs
// (which only contains SP) would be the wrong class.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    isReMaterializable = 1 in {
  def ADDframe : W65816Pseudo<
      (outs Acc16:$dst), (ins i16imm:$base, i16imm:$offset),
      "# ADDframe PSEUDO", []>;
}

// ALLOCAfi implements VLAs / dynamic_stackalloc.  It receives a 16-bit
// byte count in A and returns the address of the allocated region in A.
// The AsmPrinter expands it to: TSC; SEC; SBC count; TCS; INC A.
// $size and $dst are tied (both live in A).  Because the expansion moves
// SP, the pseudo is marked side-effecting and carries explicit Defs/Uses
// of SP so the register allocator sees that effect.
let Constraints = "$size = $dst", hasSideEffects = 1,
    Uses = [SP], Defs = [SP] in {
  def ALLOCAfi : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$size),
      "# ALLOCAfi $dst, $size",
      [(set Acc16:$dst, (W65816alloca Acc16:$size))]>;
}

// The retglue node lowers directly to RTL (see Returns section below).
// No separate RET pseudo — the real MC instruction handles the pattern.

// PUSH16 pushes A.  The AsmPrinter expands it to the MC `PHA`.  LowerCall
// uses it to pass extra arguments; the matching `tsc;clc;adc #N;tcs` SP
// unwind is emitted by eliminateCallFramePseudoInstr for ADJCALLSTACKUP.
let mayStore = 1, hasSideEffects = 1, Uses = [A, SP], Defs = [SP] in
def PUSH16 : W65816Pseudo<
    (outs), (ins), "# PUSH16",
    [(W65816push)]>;

// PUSH16X pushes X and expands to PHX.  LowerCall uses it when an outgoing
// argument's SDValue is already in X (e.g. when forwarding the hi half of
// an i32 first argument passed in A:X), which saves a TXA + spill
// round-trip.
let mayStore = 1, hasSideEffects = 1, Uses = [X, SP], Defs = [SP] in
def PUSH16X : W65816Pseudo<
    (outs), (ins), "# PUSH16X",
    [(W65816pushx)]>;

// SELECT_CC pseudos.  Each implements
//   (set $dst, (W65816selectcc tval, fval, cc))
// for a CMP that has already been emitted; the CMP's glue is represented
// implicitly through the P register.  usesCustomInserter makes codegen
// call EmitInstrWithCustomInserter, which expands the pseudo into a
// Bxx + 2 BBs + PHI.  Uses = [P] is there so MachineSched keeps the CMP
// adjacent.
let Uses = [P], hasSideEffects = 1, usesCustomInserter = 1 in {
  // 16-bit form.
  def SELECT_CC16 : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$tval, Acc16:$fval, i8imm:$cc),
      "# SELECT_CC16 $dst, $tval, $fval, $cc",
      [(set Acc16:$dst,
            (W65816selectcc Acc16:$tval, Acc16:$fval, timm:$cc))]>;

  // 8-bit form.  Without it, `c ? a : b` with an i8 result (e.g.
  // `unsigned char to_lower(char c)`) fails isel with "Cannot Select".
  // EmitInstrWithCustomInserter treats the i8 and i16 forms identically;
  // only the operand register class differs.
  def SELECT_CC8 : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$tval, Acc8:$fval, i8imm:$cc),
      "# SELECT_CC8 $dst, $tval, $fval, $cc",
      [(set Acc8:$dst,
            (W65816selectcc Acc8:$tval, Acc8:$fval, timm:$cc))]>;
}

//===----------------------------------------------------------------------===//
// Codegen pseudos that expand to MC instructions in the AsmPrinter.
//
// These pseudos carry DAG patterns with explicit output operands so the
// generic code generator can allocate them; the MC-layer instructions they
// expand to have the opcode encoding but no virtual output (the result lives
// in the implicit A register).  W65816AsmPrinter::emitInstruction maps each
// pseudo here to its real MC counterpart.
//===----------------------------------------------------------------------===//

// Immediate materialisation into A (16- and 8-bit) and X (16-bit).
//
// NOTE: LDA / LDX physically update N and Z, but we deliberately do
// NOT model that with `Defs = [P]`.  Adding `Defs = [P]` lets the
// scheduler legally place an LDA between CMP and Bxx (P just gets
// re-defined; the latest def is what Bxx tests) — same flag-corruption
// bug, different mechanism.  Two complementary fixes carry the load:
// the 4-block SELECT_CC inserter for SETCC patterns, and the post-RA
// PHP/PLP wrap pass (W65816StackSlotCleanup Pass -2.5) for BR_CC
// patterns (`while`/`for`/`if-goto`).  Both landed.
//
// All three pseudos share exactly the same flags, so they sit under one
// `let`: cheap, rematerialisable, no memory access, no side effects.
let isAsCheapAsAMove = 1, isReMaterializable = 1,
    hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm),
                             "# LDAi16imm $dst, $imm",
                             [(set Acc16:$dst, (i16 imm:$imm))]>;
def LDXi16imm : W65816Pseudo<(outs Idx16:$dst), (ins i16imm:$imm),
                             "# LDXi16imm $dst, $imm",
                             [(set Idx16:$dst, (i16 imm:$imm))]>;
def LDAi8imm  : W65816Pseudo<(outs Acc8:$dst), (ins i8imm:$imm),
                             "# LDAi8imm $dst, $imm",
                             [(set Acc8:$dst, (i8 imm:$imm))]>;
}

// Load the 16-bit address of a global or external symbol into A.  This
// reuses the immediate-constant pseudo: it expands to LDA_Imm16 with the
// symbol as its operand, and the MC encoder turns that into a fixup_16.
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(i16 (W65816Wrapper SymLeaf:$sym)),
          (LDAi16imm SymLeaf:$sym)>;

// 8-bit accumulator-with-immediate ALU pseudos: add, sub, and, or, xor.
// Source and destination are tied; none of them touches memory or is
// marked side-effecting.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in {
  def ADCi8imm : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src, i8imm:$imm),
      "# ADCi8imm $dst, $src, $imm",
      [(set Acc8:$dst, (add Acc8:$src, imm:$imm))]>;

  def SBCi8imm : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src, i8imm:$imm),
      "# SBCi8imm $dst, $src, $imm",
      [(set Acc8:$dst, (sub Acc8:$src, imm:$imm))]>;

  def ANDi8imm : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src, i8imm:$imm),
      "# ANDi8imm $dst, $src, $imm",
      [(set Acc8:$dst, (and Acc8:$src, imm:$imm))]>;

  def ORAi8imm : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src, i8imm:$imm),
      "# ORAi8imm $dst, $src, $imm",
      [(set Acc8:$dst, (or Acc8:$src, imm:$imm))]>;

  def EORi8imm : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src, i8imm:$imm),
      "# EORi8imm $dst, $src, $imm",
      [(set Acc8:$dst, (xor Acc8:$src, imm:$imm))]>;
}

// 8-bit loads from an absolute address.  Patterns are attached separately.
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
  def LDA8abs : W65816Pseudo<
      (outs Acc8:$dst), (ins i32imm:$addr),
      "# LDA8abs $dst, $addr", []>;

  // Companion to STA8long: a bank-explicit i8 load through LDA_Long
  // (0xAF).  Used for reads such as `*(uint8*)0xC035`, where LDA_Abs
  // (0xAD) is DBR-relative and would land in the wrong bank under the
  // GS/OS Loader.  The pattern that routes const-int loads here lives in
  // the ANDi16imm section, because it must come after ANDi16imm is
  // defined.
  def LDA8long : W65816Pseudo<
      (outs Acc8:$dst), (ins i32imm:$addr),
      "# LDA8long $dst, $addr", []>;
}

// 8-bit stores to an absolute address.
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
  def STA8abs : W65816Pseudo<
      (outs), (ins Acc8:$src, i32imm:$addr),
      "# STA8abs $src, $addr", []>;

  // 8-bit absolute-long store.  Same operand shape as STA8abs, but the
  // AsmPrinter emits STA_Long (0x8F), a true 24-bit bank-explicit store,
  // rather than STA_Abs (0x8D, DBR-relative).  Used for MMIO through a
  // constant integer address; the i32imm carries the full 24-bit physical
  // address.  See the (store Acc8, (iPTR imm)) pattern.
  def STA8long : W65816Pseudo<
      (outs), (ins Acc8:$src, i32imm:$addr),
      "# STA8long $src, $addr", []>;
}
// i8 loads and stores whose address is a wrapped global / external symbol
// select the DBR-relative absolute forms.
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(i8 (load (W65816Wrapper SymLeaf:$sym))),
          (LDA8abs SymLeaf:$sym)>;
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(store Acc8:$src, (W65816Wrapper SymLeaf:$sym)),
          (STA8abs Acc8:$src, SymLeaf:$sym)>;

// Byte store to a constant-int address (MMIO style:
// `*(volatile uint8 *)0x70 = v`).  Without these patterns the i8 store
// falls through to STBptr ([dp],Y), which costs 16 B / 30 cyc.  The
// target is STA8long (sta abs-long, opcode 0x8F) rather than STA8abs: a
// const-int address is a physical 24-bit pointer and must NOT track DBR.
// Under the GS/OS Loader the data bank is non-zero, so a DBR-relative
// `sta abs` would land in the wrong bank.
//
// The `timm` variants match TargetConstantSDNode.  Under p:32:16 a
// pre-isel combine in W65816TargetLowering::PerformDAGCombine turns the
// ConstantSDNode pointer into a TargetConstantSDNode so that it survives
// LowerI32Constant intact.
def : Pat<(store Acc8:$src, (iPTR imm:$addr)),
          (STA8long Acc8:$src, (i32 imm:$addr))>;
def : Pat<(store Acc8:$src, (iPTR timm:$addr)),
          (STA8long Acc8:$src, (i32 timm:$addr))>;

// Truncating i16 -> i8 store to a constant-int address: move the value
// into the Acc8 class and use the same bank-explicit store.
def : Pat<(truncstorei8 Acc16:$src, (iPTR imm:$addr)),
          (STA8long (COPY_TO_REGCLASS Acc16:$src, Acc8),
                    (i32 imm:$addr))>;
def : Pat<(truncstorei8 Acc16:$src, (iPTR timm:$addr)),
          (STA8long (COPY_TO_REGCLASS Acc16:$src, Acc8),
                    (i32 timm:$addr))>;

// 16-bit load from an absolute address.  Matched for loads from a
// Wrapper(global / external symbol) and for loads from a constant-int
// address (see the patterns that follow the definition).
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
def LDAabs : W65816Pseudo<
    (outs Acc16:$dst), (ins i32imm:$addr),
    "# LDAabs $dst, $addr", []>;

// Symbol-addressed loads.
def : Pat<(i16 (load (W65816Wrapper tglobaladdr:$g))),
          (LDAabs tglobaladdr:$g)>;
def : Pat<(i16 (load (W65816Wrapper texternalsym:$s))),
          (LDAabs texternalsym:$s)>;

// Constant-int-addressed i16 loads, the counterpart of the STAabs
// (iPTR imm) / (iPTR timm) store patterns: `*(volatile uint16*)0x5000`
// becomes LDAabs (DBR-relative).  The `timm` form is needed because the
// combine in W65816TargetLowering returns a TargetConstant for the
// Wide32-zero-hi-Constant unwrap.
def : Pat<(i16 (load (iPTR imm:$addr))),
          (LDAabs (i32 imm:$addr))>;
def : Pat<(i16 (load (iPTR timm:$addr))),
          (LDAabs (i32 timm:$addr))>;

// 16-bit store to an absolute address.
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def STAabs : W65816Pseudo<
    (outs), (ins Acc16:$src, i32imm:$addr),
    "# STAabs $src, $addr", []>;

// Symbol-addressed stores.
def : Pat<(store Acc16:$src, (W65816Wrapper tglobaladdr:$g)),
          (STAabs Acc16:$src, tglobaladdr:$g)>;
def : Pat<(store Acc16:$src, (W65816Wrapper texternalsym:$s)),
          (STAabs Acc16:$src, texternalsym:$s)>;

// Constant-int-addressed store (`*(volatile uint16 *)0x5000 = v`).
// This selects STAabs (0x8D, DBR-relative), which is DELIBERATELY
// asymmetric with the i8 case (STA8long, bank-explicit).  Rationale:
//   * most 65816 MMIO is i8 (e.g. `*(uint8*)0xC035`), where users expect
//     bank 0 always;
//   * const-int i16 is mostly used as a DBR-relative idiom in test code
//     that switches DBR and checks that a write lands in the new bank;
//   * switching i16 to bank-explicit broke 10+ existing tests, with no
//     real-world i16 MMIO use case to justify it.
// Users who need a bank-explicit i16 store should declare a global or
// split the access into two i8 stores.
def : Pat<(store Acc16:$src, (iPTR imm:$addr)),
          (STAabs Acc16:$src, (i32 imm:$addr))>;

// Under ptr32 the i16/i32 const-address stores arrive with TargetConstant
// pointers: the PerformDAGCombine on STORE rewrites the ConstantSDNode
// into a TargetConstant to bypass LowerI32Constant's REG_SEQUENCE
// expansion.  Matching `timm` lets STAabs fire for those too.
def : Pat<(store Acc16:$src, (iPTR timm:$addr)),
          (STAabs Acc16:$src, (i32 timm:$addr))>;

// 16-bit add / subtract of an immediate.  The add expands to
// CLC + ADC_Imm16: the 65816 ADC includes the carry flag in its sum, so a
// clean add has to clear carry first.  Source and destination are tied to
// A.  With a single Acc16 register that is implicit, but spelling it out
// lets the register allocator coalesce without needing a COPY.
//
// Defs = [P] models the C-flag side effect.  It is required so tablegen
// can connect these instructions to the `addc` / `subc` SDNodes
// (SDNPOutGlue), which the type legalizer emits as the lo half of a
// multi-precision add/sub when ADDC/SUBC is Legal (see the
// W65816ISelLowering ctor).
let Defs = [P], hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in {
  def ADCi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# ADCi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (add Acc16:$src, imm:$imm))]>;

  def SBCi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# SBCi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (sub Acc16:$src, imm:$imm))]>;
}

// addc / subc with an immediate.  On this target they select the same
// instructions as plain add / sub (CLC then ADC, SEC then SBC).  The
// difference is on the DAG side: the node produces a Glue carrying the
// post-op carry into a following adde / sube.  Tablegen maps that Glue
// onto the P register because the selected instruction has Defs = [P].
def : Pat<(addc Acc16:$src, imm:$imm), (ADCi16imm Acc16:$src, imm:$imm)>;
def : Pat<(subc Acc16:$src, imm:$imm), (SBCi16imm Acc16:$src, imm:$imm)>;

// ADC / SBC with a memory operand at a 16-bit absolute address.  These
// fold a load on the right-hand side of an add/sub into the
// carry-arithmetic instruction.  Tied to A, reads memory, defines P.
let Defs = [P], hasSideEffects = 0, mayLoad = 1, mayStore = 0,
    Constraints = "$src = $dst" in {
  def ADCabs : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i32imm:$addr),
      "# ADCabs $dst, $src, $addr", []>;
  def SBCabs : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i32imm:$addr),
      "# SBCabs $dst, $src, $addr", []>;
}

// Plain add / sub with the right-hand side loaded from a wrapped global
// or external symbol.
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(add Acc16:$src, (i16 (load (W65816Wrapper SymLeaf:$sym)))),
          (ADCabs Acc16:$src, SymLeaf:$sym)>;
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(sub Acc16:$src, (i16 (load (W65816Wrapper SymLeaf:$sym)))),
          (SBCabs Acc16:$src, SymLeaf:$sym)>;

// Carry-producing addc / subc select the same instructions.
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(addc Acc16:$src, (i16 (load (W65816Wrapper SymLeaf:$sym)))),
          (ADCabs Acc16:$src, SymLeaf:$sym)>;
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(subc Acc16:$src, (i16 (load (W65816Wrapper SymLeaf:$sym)))),
          (SBCabs Acc16:$src, SymLeaf:$sym)>;

// adde / sube: the chained ADC / SBC used for the hi half of a
// multi-precision add/sub.  They read the C flag left by the previous
// addc/adde (Uses = [P]) and produce a fresh carry/borrow (Defs = [P]).
// The AsmPrinter expansion emits a bare ADC/SBC with no preceding
// CLC/SEC; likewise eliminateFrameIndex for ADCEfi/SBCEfi skips the
// carry-prefix step that the standalone ADCfi/SBCfi rely on.

// Immediate forms.
let Uses = [P], Defs = [P],
    hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in {
  def ADCEi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# ADCEi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (adde Acc16:$src, imm:$imm))]>;

  def SBCEi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# SBCEi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (sube Acc16:$src, imm:$imm))]>;
}

// Absolute-address memory forms.
let Uses = [P], Defs = [P],
    hasSideEffects = 0, mayLoad = 1, mayStore = 0,
    Constraints = "$src = $dst" in {
  def ADCEabs : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i32imm:$addr),
      "# ADCEabs $dst, $src, $addr", []>;
  def SBCEabs : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i32imm:$addr),
      "# SBCEabs $dst, $src, $addr", []>;
}

// Fold a load from a wrapped global / external symbol into the chained op.
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(adde Acc16:$src, (i16 (load (W65816Wrapper SymLeaf:$sym)))),
          (ADCEabs Acc16:$src, SymLeaf:$sym)>;
foreach SymLeaf = [tglobaladdr, texternalsym] in
def : Pat<(sube Acc16:$src, (i16 (load (W65816Wrapper SymLeaf:$sym)))),
          (SBCEabs Acc16:$src, SymLeaf:$sym)>;

// ASLA16: 16-bit shift left of A by one bit.  Its own pattern matches a
// value added to itself, (add x, x), which is equivalent to a 1-bit left
// shift.  The input is tied to the output so the result lands in A.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in
def ASLA16 : W65816Pseudo<
    (outs Acc16:$dst), (ins Acc16:$src),
    "# ASLA16 $dst, $src",
    [(set Acc16:$dst, (add Acc16:$src, Acc16:$src))]>;

// The explicit form, shl x, 1, selects the same pseudo.
def : Pat<(shl Acc16:$src, (i16 1)),
          (ASLA16 Acc16:$src)>;

// Remaining 1-bit accumulator shifts: 16-bit logical right, 8-bit left,
// 8-bit logical right, and 16-bit arithmetic right.  They are pseudos
// because the MC shift instructions (e.g. LSR_A) have no virtual output
// operand.  Every def in this group ties $src to $dst through the
// enclosing `let`, so no def repeats the constraint.
let Constraints = "$src = $dst",
    hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def LSRA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
                          "# LSRA16 $dst, $src",
                          [(set Acc16:$dst, (srl Acc16:$src, (i16 1)))]>;
def ASLA8  : W65816Pseudo<(outs Acc8:$dst), (ins Acc8:$src),
                          "# ASLA8 $dst, $src",
                          [(set Acc8:$dst, (shl Acc8:$src, (i8 1)))]>;
def LSRA8  : W65816Pseudo<(outs Acc8:$dst), (ins Acc8:$src),
                          "# LSRA8 $dst, $src",
                          [(set Acc8:$dst, (srl Acc8:$src, (i8 1)))]>;
// Signed shift right by 1: copy A's high bit into carry, then ROR
// to bring it back into A's high bit while halving the rest.  The
// AsmPrinter expands this to the 4-instruction PHA;ASL;PLA;ROR
// sequence.
def ASRA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
                          "# ASRA16 $dst, $src",
                          [(set Acc16:$dst, (sra Acc16:$src, (i16 1)))]>;
}

// Constant shifts of 2..7 bits, unrolled into chains of single-bit
// shifts.  Larger counts would be better served by a loop or an
// XBA-and-mask trick; that is left for a future peephole.

// Counts 2..4, left then logical right.
def : Pat<(shl Acc16:$src, (i16 2)),
          (ASLA16 (ASLA16 Acc16:$src))>;
def : Pat<(shl Acc16:$src, (i16 3)),
          (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))>;
def : Pat<(shl Acc16:$src, (i16 4)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src))))>;

def : Pat<(srl Acc16:$src, (i16 2)),
          (LSRA16 (LSRA16 Acc16:$src))>;
def : Pat<(srl Acc16:$src, (i16 3)),
          (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))>;
def : Pat<(srl Acc16:$src, (i16 4)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))>;

// Counts 5..7.  These used to be withheld: the DAG combiner narrowed
// `(trunc (shl (zext X), N))` back to `(shl X, N)` on i8 and re-entered
// LowerShift in a loop.  The `isTypeDesirableForOp(SHL/SRL/SRA, i8) ->
// false` override in W65816TargetLowering now blocks that combine, so the
// patterns are safe.  For these counts the chain is cheaper than calling
// __ashlhi3 / __lshrhi3.
def : Pat<(shl Acc16:$src, (i16 5)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16
            (ASLA16 Acc16:$src)))))>;
def : Pat<(shl Acc16:$src, (i16 6)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16
            (ASLA16 (ASLA16 Acc16:$src))))))>;
def : Pat<(shl Acc16:$src, (i16 7)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16
            (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))))>;

def : Pat<(srl Acc16:$src, (i16 5)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16
            (LSRA16 Acc16:$src)))))>;
def : Pat<(srl Acc16:$src, (i16 6)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16
            (LSRA16 (LSRA16 Acc16:$src))))))>;
def : Pat<(srl Acc16:$src, (i16 7)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16
            (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))))>;

// Increment / decrement of A by one, in 16- and 8-bit forms.  The
// patterns match `(add x, 1)` and `(add x, -1)`; the latter is how LLVM
// canonicalises a subtract of 1.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in {
  // 16-bit.
  def INA_PSEUDO : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src),
      "# INA_PSEUDO $dst, $src",
      [(set Acc16:$dst, (add Acc16:$src, (i16 1)))]>;
  def DEA_PSEUDO : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src),
      "# DEA_PSEUDO $dst, $src",
      [(set Acc16:$dst, (add Acc16:$src, (i16 -1)))]>;

  // 8-bit.
  def INA_PSEUDO8 : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src),
      "# INA_PSEUDO8 $dst, $src",
      [(set Acc8:$dst, (add Acc8:$src, (i8 1)))]>;
  def DEA_PSEUDO8 : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src),
      "# DEA_PSEUDO8 $dst, $src",
      [(set Acc8:$dst, (add Acc8:$src, (i8 -1)))]>;
}

// Two's-complement negation.  `0 - x` becomes `EOR #$FFFF; INC A`, i.e.
// bitwise-not followed by add 1.  The pattern catches (sub 0, x), which
// LLVM uses for `-x` and for the `abs` intrinsic.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in {
  def NEGA16 : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src),
      "# NEGA16 $dst, $src",
      [(set Acc16:$dst, (sub (i16 0), Acc16:$src))]>;

  // 8-bit counterpart.  Without it codegen takes the generic SBC path
  // (`LDA #0; SEC; SBC slot` plus the 8-bit M-mode prologue and PHA/PLA
  // bracketing), roughly 12 instructions for `-x`.  NEGA8 expands to
  // `EOR #$FF; INA`, 2 instructions in 8-bit M.
  def NEGA8 : W65816Pseudo<
      (outs Acc8:$dst), (ins Acc8:$src),
      "# NEGA8 $dst, $src",
      [(set Acc8:$dst, (sub (i8 0), Acc8:$src))]>;
}

// Multi-precision negation: the lo and hi halves of `-x` for an i32 x.
// LLVM splits `0 - x` into `(subc 0, x_lo)` and `(sube 0, x_hi)`.  Both
// halves are implemented through the ADD chain `~x + carry`, because INC
// does not touch C; the C produced by `~x + 1` matches what `subc 0, x`
// would set (C=1 iff x was 0, i.e. no borrow).
//   NEGC16  matches subc -> "EOR #$FFFF; CLC; ADC #1"
//   NEGE16  matches sube -> "EOR #$FFFF; ADC #0"   (consumes C-in)
// NOTE(review): the earlier comment quoted these sequences as 5 and 4
// bytes; with 16-bit immediates they look like 7 and 6 — confirm.

// Lo half: produces the carry (Defs = [P]).
let Defs = [P], hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in
def NEGC16 : W65816Pseudo<
    (outs Acc16:$dst), (ins Acc16:$src),
    "# NEGC16 $dst, $src",
    [(set Acc16:$dst, (subc (i16 0), Acc16:$src))]>;

// Hi half: consumes the incoming carry and produces a new one.
let Uses = [P], Defs = [P],
    hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in
def NEGE16 : W65816Pseudo<
    (outs Acc16:$dst), (ins Acc16:$src),
    "# NEGE16 $dst, $src",
    [(set Acc16:$dst, (sube (i16 0), Acc16:$src))]>;

// Bitwise NOT pattern moved below EORi16imm definition.

// 16-bit bitwise AND / OR / XOR of A with an immediate.  These have the
// same shape as ADCi16imm, but without the carry prefix, since the
// operations neither read nor write the carry flag.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Constraints = "$src = $dst" in {
  def ANDi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# ANDi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (and Acc16:$src, imm:$imm))]>;

  def ORAi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# ORAi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (or Acc16:$src, imm:$imm))]>;

  def EORi16imm : W65816Pseudo<
      (outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm),
      "# EORi16imm $dst, $src, $imm",
      [(set Acc16:$dst, (xor Acc16:$src, imm:$imm))]>;
}

// Bank-explicit i8 loads from a constant-int address (`*(uint8*)0xC035`).
// The default lowering goes through LDAptr ([dp],Y indirect-long) — 22 B /
// 35 cyc — because LDAptr's pattern `(load Wide16:$ptr)` matches once the
// matcher materialises the const into Wide16.  These patterns shortcut to
// LDA8long (lda long, 0xAF, 6 B / 10 cyc) and run BEFORE that
// materialisation because the explicit imm leaf has higher
// AddedComplexity.  Only the `(zextloadi8 imm)` form actually appears in
// real IR (i8 loads are always i16-extended at SDAG time on this 16-bit
// target); the raw `(load imm)` form is kept for symmetry with the store
// side.
//
// Each load flavour is emitted twice, once for `imm` and once for `timm`
// address leaves, in the same record order as a hand-written list.
let AddedComplexity = 50 in {
// Plain i8 load.
foreach addrLeaf = [imm, timm] in
def : Pat<(i8 (load (iPTR addrLeaf:$addr))),
          (LDA8long (i32 addrLeaf:$addr))>;
// Zero-extending load: widen to Acc16, then clear the high byte.
foreach addrLeaf = [imm, timm] in
def : Pat<(i16 (zextloadi8 (iPTR addrLeaf:$addr))),
          (ANDi16imm
              (COPY_TO_REGCLASS (LDA8long (i32 addrLeaf:$addr)), Acc16),
              0xFF)>;
// Any-extending load: high byte is don't-care, so no mask.
foreach addrLeaf = [imm, timm] in
def : Pat<(i16 (extloadi8 (iPTR addrLeaf:$addr))),
          (COPY_TO_REGCLASS (LDA8long (i32 addrLeaf:$addr)), Acc16)>;
}
// 16-bit bitwise ops with a memory operand at a symbolic address.  The
// pseudos carry no inline pattern; they are selected only through the
// explicit global-address load-folding patterns that follow.
class BitwiseAbsPseudo<string mnemonic>
    : W65816Pseudo<(outs Acc16:$dst),
                   (ins Acc16:$src, i32imm:$addr),
                   !strconcat("# ", mnemonic, " $dst, $src, $addr"), []> {
  let Constraints    = "$src = $dst";
  let hasSideEffects = 0;
  let mayLoad        = 1;
  let mayStore       = 0;
}
def ANDabs : BitwiseAbsPseudo<"ANDabs">;
def ORAabs : BitwiseAbsPseudo<"ORAabs">;
def EORabs : BitwiseAbsPseudo<"EORabs">;

// Fold an i16 load of a wrapped global into the bitwise op.
def : Pat<(and Acc16:$src,
               (i16 (load (W65816Wrapper tglobaladdr:$g)))),
          (ANDabs Acc16:$src, tglobaladdr:$g)>;
def : Pat<(or Acc16:$src,
              (i16 (load (W65816Wrapper tglobaladdr:$g)))),
          (ORAabs Acc16:$src, tglobaladdr:$g)>;
def : Pat<(xor Acc16:$src,
               (i16 (load (W65816Wrapper tglobaladdr:$g)))),
          (EORabs Acc16:$src, tglobaladdr:$g)>;

// Bitwise NOT: x ^ 0xFFFF.  LLVM lowers `~x` and i1 inversion through
// this; emit a single EOR #$FFFF via the bitwise pseudo above.
// The all-ones constant is matched as `(i16 -1)` and re-emitted as the
// literal 0xFFFF immediate operand of EORi16imm.
def : Pat<(xor Acc16:$src, (i16 -1)),
          (EORi16imm Acc16:$src, 0xFFFF)>;

// (srl x, 15): extract bit 15 to bit 0 (yields 0 or 1).  The
// type-legalizer's SHL_PARTS expansion of `i32 << 1` needs this for
// the high-half "carry from low" slot, and routing it through the
// __lshrhi3 libcall costs ~10 bytes per i32 shift-by-1.  Inline as
// `ASL A; LDA #0; ROL A` (3 instructions, 5 bytes with a 16-bit
// immediate): ASL puts bit 15 into C and trashes A; LDA #0 doesn't
// touch C; ROL A folds C into bit 0.
//
// (shl x, 15): move bit 0 to bit 15 (yields 0 or 0x8000).  Used by
// SRL_PARTS / SRA_PARTS expansion of `i32 >> 1` for the low-half
// "carry from hi" slot.  Mirror sequence: `LSR A; LDA #0; ROR A`.
//
// Both directions share one helper class; only the name and the shift
// SDNode differ.  Defs = [P] because the sequence routes a bit through C.
class Acc16Shift15Pseudo<string mnemonic, SDNode shiftOp>
    : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
                   !strconcat("# ", mnemonic, " $dst, $src"),
                   [(set Acc16:$dst, (shiftOp Acc16:$src, (i16 15)))]> {
  let Constraints    = "$src = $dst";
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let mayStore       = 0;
  let Defs           = [P];
}
def SRL15A : Acc16Shift15Pseudo<"SRL15A", srl>;
def SHL15A : Acc16Shift15Pseudo<"SHL15A", shl>;
// (srl x, 8): high byte to low byte, zero high byte.  XBA swaps the
// two bytes of A (in 16-bit M); AND #$00FF clears the new high byte.
// 4 bytes total — much shorter than the __lshrhi3 libcall path.  Used
// by i32 shift-by-8 SHL_PARTS expansion for the cross-half slot.
//
// (shl x, 8): low byte to high byte, zero low byte.  Mirror.
//
// One helper class covers both directions; unlike the shift-by-15
// pseudos these do not declare any implicit defs.
class Acc16Shift8Pseudo<string mnemonic, SDNode shiftOp>
    : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
                   !strconcat("# ", mnemonic, " $dst, $src"),
                   [(set Acc16:$dst, (shiftOp Acc16:$src, (i16 8)))]> {
  let Constraints    = "$src = $dst";
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let mayStore       = 0;
}
def SRL8A : Acc16Shift8Pseudo<"SRL8A", srl>;
def SHL8A : Acc16Shift8Pseudo<"SHL8A", shl>;

// Shift counts 9..14: SHL builds on SHL8A (XBA + low-byte mask) and chains
// 1..6 ASLs after it; SRL mirrors via SRL8A + LSRA chains.  The
// isTypeDesirableForOp override prevents the i8-shift combine loop that
// kept these out of tablegen earlier.
//
// Left shifts by 8+k: one SHL8A, then k nested ASLA16 steps (k = 1..6).
def : Pat<(shl Acc16:$src, (i16 9)),
          (ASLA16 (SHL8A Acc16:$src))>;
def : Pat<(shl Acc16:$src, (i16 10)),
          (ASLA16 (ASLA16 (SHL8A Acc16:$src)))>;
def : Pat<(shl Acc16:$src, (i16 11)),
          (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))>;
def : Pat<(shl Acc16:$src, (i16 12)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))>;
def : Pat<(shl Acc16:$src, (i16 13)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))))>;
def : Pat<(shl Acc16:$src, (i16 14)),
          (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))))>;
// Logical right shifts by 8+k: one SRL8A, then k nested LSRA16 steps.
def : Pat<(srl Acc16:$src, (i16 9)),
          (LSRA16 (SRL8A Acc16:$src))>;
def : Pat<(srl Acc16:$src, (i16 10)),
          (LSRA16 (LSRA16 (SRL8A Acc16:$src)))>;
def : Pat<(srl Acc16:$src, (i16 11)),
          (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))>;
def : Pat<(srl Acc16:$src, (i16 12)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))>;
def : Pat<(srl Acc16:$src, (i16 13)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))))>;
def : Pat<(srl Acc16:$src, (i16 14)),
          (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))))>;
// (sra x, 15): sign-fill — yields $0000 if x is non-negative, $FFFF
// if negative.  Used by i32 sext-from-i16 type-legalization for the
// hi half (avoids the __ashrhi3 libcall path).
//
// An early sketch of the sequence was `ASL A; LDA #0; SBC #0; EOR #-1`
// (after the ASL, `LDA #0; SBC #0` produces $FFFF if C=0 and $0000 if
// C=1, and the EOR #-1 flips that).  The sequence actually emitted is
// simpler because of how SBC treats carry: see the AsmPrinter expansion
// for the exact 5-byte sequence.
def SRA15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
                          "# SRA15A $dst, $src",
                          [(set Acc16:$dst, (sra Acc16:$src, (i16 15)))]> {
  let Constraints    = "$src = $dst";
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let mayStore       = 0;
  let Defs           = [P];  // the expansion goes through the flags
}

// sext_inreg from i1: broadcast bit 0 to all bits.  LLVM emits this
// for `(c & 1) ? -1 : 0` patterns (e.g. CRC inner loops).  The result
// is `-(x & 1)` — 0 if bit 0 was clear, 0xFFFF if set.  Mask to bit
// 0 then two's-complement-negate.  Two pseudos (ANDi16imm, then NEGA16;
// presumably three instructions once NEGA16 is expanded) = ~7 bytes.
def : Pat<(sext_inreg Acc16:$src, i1),
          (NEGA16 (ANDi16imm Acc16:$src, 1))>;

// sext_inreg from i8: branchless `((x & 0xFF) ^ 0x80) - 0x80` trick
// (same sequence LowerSignExtend uses for ISD::SIGN_EXTEND i8->i16).
// LLVM emits this when expanding a sextload-i16-from-i8 (we set
// SEXTLOAD i8 to Expand in the lowering ctor) and for explicit
// `(int)(signed char)` casts.
// Read inside-out: mask to the low byte, flip bit 7, subtract 0x80.
def : Pat<(sext_inreg Acc16:$src, i8),
          (SBCi16imm (EORi16imm
                         (ANDi16imm Acc16:$src, 0x00FF), 0x0080),
                     0x0080)>;

// Frame-index loads/stores: take a FrameIndex + offset (packed into a
// single MIOperandInfo) and expand (in eliminateFrameIndex) into an
// LDA / STA d,S with the offset baked in.  Used by LowerFormalArguments
// to read stack-passed arguments and by spill/reload via
// storeRegToStackSlot.
//
// memfi is the compound operand shared by every *fi pseudo below.
def memfi : Operand<i16> {
  // Two i32 immediate sub-operands — per the note above, the frame
  // index and its offset.
  let MIOperandInfo = (ops i32imm, i32imm);
  // Name of the printer hook used for this operand.
  let PrintMethod   = "printFrameMem";
}

// LDAfi: 16-bit load from a frame-index slot.  It is rematerializable
// when the FI is a fixed (immutable) arg slot — see
// W65816InstrInfo::isReMaterializableImpl.  Without this, greedy
// regalloc spills every arg load to a fresh local slot then reloads
// from there, ballooning every i32-arg function by 4-6 insns.
def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr),
                         "# LDAfi $dst, $addr", []> {
  let mayLoad            = 1;
  let hasSideEffects     = 0;
  let mayStore           = 0;
  let isReMaterializable = 1;
}

// STAfi: 16-bit store to a frame-index slot.  It accepts a Wide16 src
// so greedy can park the value in IMGn instead of A.  When src is in
// IMGn, eliminateFrameIndex prepends a LDA dp; hence Defs = [A] (the
// IMG case clobbers A).
def STAfi : W65816Pseudo<(outs),
                         (ins Wide16:$src, memfi:$addr),
                         "# STAfi $src, $addr", []> {
  let mayStore       = 1;
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let Defs           = [A];
}

// STA8fi: i8 truncating store to a frame-index slot.
// eliminateFrameIndex wraps it in SEP #$20 / STA d,S / REP #$20 so only
// one byte is written.  Without the wrap, a 16-bit STA writes the byte
// at slot+1 too, which corrupts the next stack slot (or the return
// address for the last slot of an alloca).  Defs P because SEP/REP
// modify the M bit.
def STA8fi : W65816Pseudo<(outs),
                          (ins Acc16:$src, memfi:$addr),
                          "# STA8fi $src, $addr", []> {
  let mayStore       = 1;
  let hasSideEffects = 1;
  let mayLoad        = 0;
  let Defs           = [P];
}

// ComplexPattern bridging FrameIndex SDValues to memfi.  See
// SelectFrameIndex in W65816ISelDAGToDAG.cpp.
// It yields 2 result operands (the same count as memfi's sub-operands)
// and is rooted at `frameindex` nodes.
def addr_fi : ComplexPattern<i16, 2, "SelectFrameIndex", [frameindex]>;

// Plain 16-bit load from / store to a frame-index slot.
def : Pat<(i16 (load addr_fi:$addr)),
          (LDAfi addr_fi:$addr)>;
def : Pat<(store Acc16:$src, addr_fi:$addr),
          (STAfi Acc16:$src, addr_fi:$addr)>;

// i8 access to a FrameIndex slot.  Loads read 2 bytes via 16-bit LDA
// — the high byte is harmless (extending loads mask or sign-extend it,
// narrowing loads narrow back to Acc8 / discard).  Stores must write
// only one byte: i8 alloca arrays pack adjacent slots one byte apart,
// and a 16-bit STA at the last slot of the array would corrupt the
// return address.  Truncating stores route through STA8fi which wraps
// the STA in SEP #$20 / REP #$20.
//
// Narrow load: 16-bit LDAfi, result re-classed to Acc8.
def : Pat<(i8 (load addr_fi:$addr)),
          (COPY_TO_REGCLASS (LDAfi addr_fi:$addr), Acc8)>;
// Zero-extending load: 16-bit LDAfi, then mask off the high byte.
def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
          (ANDi16imm (LDAfi addr_fi:$addr), 0xFF)>;
// Any-extending load: high byte is don't-care, so no mask.
def : Pat<(i16 (extloadi8 addr_fi:$addr)),
          (LDAfi addr_fi:$addr)>;
// True i8 store: re-class the Acc8 value to Acc16 for STA8fi's operand.
def : Pat<(store Acc8:$src, addr_fi:$addr),
          (STA8fi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
// Truncating store of an i16 value.
def : Pat<(truncstorei8 Acc16:$src, addr_fi:$addr),
          (STA8fi Acc16:$src, addr_fi:$addr)>;

// Frame-index folding into ADC / SBC / AND / ORA / EOR / CMP.  Same
// shape as the *abs variants but the second operand is a stack slot.
//
// Every two-address member of the family shares the operand layout and
// memory flags captured by this helper class; the flag-register
// behaviour is layered on per group with `let`.
class FrameFoldPseudo<string mnemonic>
    : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr),
                   !strconcat("# ", mnemonic, " $dst, $src, $addr"), []> {
  let Constraints    = "$src = $dst";
  let hasSideEffects = 0;
  let mayLoad        = 1;
  let mayStore       = 0;
}

// ADCfi/SBCfi mark P as Def so they can match `addc`/`subc` (the lo
// half of a multi-precision split — see the ADCi16imm comment).
let Defs = [P] in {
def ADCfi : FrameFoldPseudo<"ADCfi">;
def SBCfi : FrameFoldPseudo<"SBCfi">;
}

// Bitwise forms: no implicit flag operands declared.
def ANDfi : FrameFoldPseudo<"ANDfi">;
def ORAfi : FrameFoldPseudo<"ORAfi">;
def EORfi : FrameFoldPseudo<"EORfi">;

// ADCEfi / SBCEfi: chained ADC/SBC, hi half of a multi-precision split.
// Read carry from the previous addc/adde/subc/sube via Uses = [P].
let Uses = [P], Defs = [P] in {
def ADCEfi : FrameFoldPseudo<"ADCEfi">;
def SBCEfi : FrameFoldPseudo<"SBCEfi">;
}

// CMPfi: compare against a stack slot.  No result register and no tied
// operand, so it does not use the helper class; only P is defined.
def CMPfi : W65816Pseudo<(outs), (ins Acc16:$lhs, memfi:$addr),
                         "# CMPfi $lhs, $addr", []> {
  let hasSideEffects = 0;
  let mayLoad        = 1;
  let mayStore       = 0;
  let Defs           = [P];
}
// Selection patterns folding an i16 frame-index load into the second
// operand of an arithmetic / bitwise / compare node.
//
// Plain add/sub.
def : Pat<(add Acc16:$src, (i16 (load addr_fi:$addr))),
          (ADCfi Acc16:$src, addr_fi:$addr)>;
def : Pat<(sub Acc16:$src, (i16 (load addr_fi:$addr))),
          (SBCfi Acc16:$src, addr_fi:$addr)>;
// Carry-producing lo half: same pseudos as plain add/sub.
def : Pat<(addc Acc16:$src, (i16 (load addr_fi:$addr))),
          (ADCfi Acc16:$src, addr_fi:$addr)>;
def : Pat<(subc Acc16:$src, (i16 (load addr_fi:$addr))),
          (SBCfi Acc16:$src, addr_fi:$addr)>;
// Carry-consuming hi half: the chained (E) pseudos.
def : Pat<(adde Acc16:$src, (i16 (load addr_fi:$addr))),
          (ADCEfi Acc16:$src, addr_fi:$addr)>;
def : Pat<(sube Acc16:$src, (i16 (load addr_fi:$addr))),
          (SBCEfi Acc16:$src, addr_fi:$addr)>;
// Bitwise ops.
def : Pat<(and Acc16:$src, (i16 (load addr_fi:$addr))),
          (ANDfi Acc16:$src, addr_fi:$addr)>;
def : Pat<(or  Acc16:$src, (i16 (load addr_fi:$addr))),
          (ORAfi Acc16:$src, addr_fi:$addr)>;
def : Pat<(xor Acc16:$src, (i16 (load addr_fi:$addr))),
          (EORfi Acc16:$src, addr_fi:$addr)>;
// Compare.
def : Pat<(W65816cmp Acc16:$lhs, (i16 (load addr_fi:$addr))),
          (CMPfi Acc16:$lhs, addr_fi:$addr)>;

// Zero-extending byte load: 16-bit LDA reads two bytes (the byte we want
// plus the next byte), then mask the high byte with AND #$00FF.  Reads
// one byte past the source — fine for standalone bytes in the bank-0
// data area but caller must ensure addr+1 is safe to read.  A future
// optimisation could use SEP/REP transitions to do a true 8-bit load.
//
// One pattern per symbolic address kind: global address, external symbol.
def : Pat<(i16 (zextloadi8 (W65816Wrapper tglobaladdr:$g))),
          (ANDi16imm (LDAabs tglobaladdr:$g), 0xFF)>;
def : Pat<(i16 (zextloadi8 (W65816Wrapper texternalsym:$s))),
          (ANDi16imm (LDAabs texternalsym:$s), 0xFF)>;

// CMP / branches.  CMP sets the flags via the W65816cmp SDNode (glue
// out); the W65816brcc node consumes the glue and dispatches to the
// right Bxx instruction by condition code.
//
// All compare pseudos have no register result; their only output is P.

// Compare the 16-bit accumulator against an immediate.
def CMPi16imm : W65816Pseudo<(outs), (ins Acc16:$lhs, i16imm:$rhs),
                             "# CMPi16imm $lhs, $rhs",
                             [(W65816cmp Acc16:$lhs, (i16 imm:$rhs))]> {
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let mayStore       = 0;
  let Defs           = [P];
}

// Compare the 8-bit accumulator against an immediate.
def CMPi8imm  : W65816Pseudo<(outs), (ins Acc8:$lhs, i8imm:$rhs),
                             "# CMPi8imm $lhs, $rhs",
                             [(W65816cmp Acc8:$lhs, (i8 imm:$rhs))]> {
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let mayStore       = 0;
  let Defs           = [P];
}

// Compare the 16-bit accumulator against memory at a symbolic address.
// No inline pattern; selected through explicit Pat records.
def CMPabs : W65816Pseudo<(outs), (ins Acc16:$lhs, i32imm:$addr),
                          "# CMPabs $lhs, $addr", []> {
  let hasSideEffects = 0;
  let mayLoad        = 1;
  let mayStore       = 0;
  let Defs           = [P];
}
// Fold an i16 load of a wrapped symbolic address (global or external
// symbol) into the compare, selecting CMPabs.
def : Pat<(W65816cmp Acc16:$lhs,
              (i16 (load (W65816Wrapper tglobaladdr:$g)))),
          (CMPabs Acc16:$lhs, tglobaladdr:$g)>;
def : Pat<(W65816cmp Acc16:$lhs,
              (i16 (load (W65816Wrapper texternalsym:$s)))),
          (CMPabs Acc16:$lhs, texternalsym:$s)>;

// 16-bit byte swap: XBA exchanges A.high and A.low.  The inline pattern
// matches the (bswap Acc16) SDNode emitted by clang for byte-reverse
// loops.  Result is tied to the source.
def XBA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
                         "# XBA16 $dst, $src",
                         [(set Acc16:$dst, (bswap Acc16:$src))]> {
  let Constraints    = "$src = $dst";
  let hasSideEffects = 0;
  let mayLoad        = 0;
  let mayStore       = 0;
}

// Two-Acc16 binary ops.  We have only one A register, so when both
// operands are computed values (neither a foldable load/imm/global) we
// must spill one to a stack slot.  Each pseudo's custom inserter
// allocates a fresh slot and emits a STAfi+OPfi sequence; the
// register allocator handles the surrounding spills/reloads.
// hasSideEffects=1 tells the validator the pseudo may load/store
// without requiring a matching SDNode pattern (the stores are added
// by the inserter, not visible in the DAG pattern).
//
// The five pseudos differ only in name and SDNode, so they are
// instantiated from one helper class.
class TwoAcc16Pseudo<string mnemonic, SDNode op>
    : W65816Pseudo<(outs Acc16:$dst),
                   (ins Acc16:$src1, Acc16:$src2),
                   !strconcat("# ", mnemonic, " $dst, $src1, $src2"),
                   [(set Acc16:$dst, (op Acc16:$src1, Acc16:$src2))]> {
  let usesCustomInserter = 1;
  let hasSideEffects     = 1;
}

// Defs = [P] on ADD_RR/SUB_RR matches the C-flag side-effect of the
// underlying ADC/SBC, letting these pseudos serve `addc`/`subc` (the
// lo half of an i32 split) as well as plain `add`/`sub`.
let Defs = [P] in {
def ADD_RR : TwoAcc16Pseudo<"ADD_RR", add>;
def SUB_RR : TwoAcc16Pseudo<"SUB_RR", sub>;
}
def AND_RR : TwoAcc16Pseudo<"AND_RR", and>;
def ORA_RR : TwoAcc16Pseudo<"ORA_RR", or>;
def EOR_RR : TwoAcc16Pseudo<"EOR_RR", xor>;

// Carry-producing add/sub reuse the plain pseudos.
def : Pat<(addc Acc16:$src1, Acc16:$src2),
          (ADD_RR Acc16:$src1, Acc16:$src2)>;
def : Pat<(subc Acc16:$src1, Acc16:$src2),
          (SUB_RR Acc16:$src1, Acc16:$src2)>;

// Chained-carry two-Acc16 add/sub for the hi half of i32 splits.
// The inserter mirrors ADD_RR (STAfi spill + ADCEfi load-fold) but
// emits the carry-chain pseudo, so the previous addc/adde's C flag is
// consumed instead of being overwritten by a CLC.  Uses + Defs = [P]
// reflects the carry chain through the SDNode.
def ADDE_RR : W65816Pseudo<(outs Acc16:$dst),
                           (ins Acc16:$src1, Acc16:$src2),
                           "# ADDE_RR $dst, $src1, $src2",
                           [(set Acc16:$dst,
                                 (adde Acc16:$src1, Acc16:$src2))]> {
  let usesCustomInserter = 1;
  let hasSideEffects     = 1;
  let Uses               = [P];
  let Defs               = [P];
}
def SUBE_RR : W65816Pseudo<(outs Acc16:$dst),
                           (ins Acc16:$src1, Acc16:$src2),
                           "# SUBE_RR $dst, $src1, $src2",
                           [(set Acc16:$dst,
                                 (sube Acc16:$src1, Acc16:$src2))]> {
  let usesCustomInserter = 1;
  let hasSideEffects     = 1;
  let Uses               = [P];
  let Defs               = [P];
}
// Two-Acc16 compare: both operands are register values.  Expanded by a
// custom inserter; produces only flags (Defs = [P]), no register result.
def CMP_RR : W65816Pseudo<(outs), (ins Acc16:$lhs, Acc16:$rhs),
                          "# CMP_RR $lhs, $rhs",
                          [(W65816cmp Acc16:$lhs, Acc16:$rhs)]> {
  let usesCustomInserter = 1;
  let hasSideEffects     = 1;
  let Defs               = [P];
}

// Pointer dereference.  The 65816 can't deref a register pointer
// directly — the indirect addressing modes all read the pointer from
// memory (DP or stack).  These pseudos spill the Acc16 pointer to a
// fresh stack slot, set Y=0, and emit LDA/STA (slot,S),Y.  Y gets
// clobbered as a side effect.  hasSideEffects=1 covers the spill
// store the inserter adds, in addition to the deref.
// LDAptr / STAptr / STBptr lower to [dp],Y indirect-long via DP
// scratch $E0..$E2 (see W65816ISelLowering.cpp inserter).  The
// inserter uses A and Y plus the DP scratch — X is not touched.
// Defs: Y (LDY #0) and P (STA/LDA set N/Z).
// $ptr is Wide16 (A or IMGn) so when bb.3-style pressure forces the
// pointer to share A with another live vreg, RA can park ptr in an
// IMGn DP slot.  Acc16:$ptr was being silently coalesced with the
// loop-PHI accumulator: both wanted A at end of bb, and PHI-elim
// dropped the COPY needed to refresh A with the pointer at top of
// the loop.  With Wide16, the COPY $a = ptr lowers to a real LDA $dp.
// Attributes common to the pointer-deref pseudos are factored into the
// outer `let`; the load/store direction is set per group inside it.
let usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P] in {

let mayLoad = 1 in {
// 16-bit load through a Wide16 pointer; matches the generic `load`.
def LDAptr : W65816Pseudo<(outs Acc16:$dst), (ins Wide16:$ptr),
                          "# LDAptr $dst, $ptr",
                          [(set Acc16:$dst, (load Wide16:$ptr))]>;

// Variant that hardcodes bank=0 for the [dp],Y deref.  Used by
// LowerVAARG: va_arg derefs a stack pointer, and the 65816 stack is
// always in bank 0 — but under GS/OS Loader our default $E2 source
// ($BE = our bank when LoaderBankDeref is on) would point reads at
// the wrong bank.  This variant always emits `STZ $E2` so the deref
// is unambiguously bank-0.  Caught by snprintf("%d", N) under Loader
// returning constant garbage instead of N's decimal — see
// feedback_loader_substantial_test.md.
def LDAptrBank0 : W65816Pseudo<(outs Acc16:$dst), (ins Wide16:$ptr),
                               "# LDAptrBank0 $dst, $ptr",
                               [(set Acc16:$dst,
                                     (W65816vaargLoad Wide16:$ptr))]>;
} // mayLoad = 1

// 16-bit store through a Wide16 pointer; matches the generic `store`.
let mayStore = 1 in
def STAptr : W65816Pseudo<(outs), (ins Acc16:$val, Wide16:$ptr),
                          "# STAptr $val, $ptr",
                          [(store Acc16:$val, Wide16:$ptr)]>;

} // usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P]

// i8 zero-extending pointer load: do a 16-bit LDA (slot,s),y and mask
// the high byte.  Reads one byte past the source — fine for byte-array
// iteration where the buffer is at least 2 bytes long.  A future
// SEP/REP-aware mode pass could switch to a true 8-bit LDA.
// NOTE(review): the LDAptr definition's own comment describes a
// [dp],Y lowering; confirm which addressing mode is current.
def : Pat<(i16 (zextloadi8 Wide16:$ptr)),
          (ANDi16imm (LDAptr Wide16:$ptr), 0xFF)>;
// Anyext byte load via pointer: consumer doesn't care about the high
// byte, so just LDA (16-bit).  Same 1-byte-past-buffer caveat as
// zextloadi8.
def : Pat<(i16 (extloadi8 Wide16:$ptr)),
          (LDAptr Wide16:$ptr)>;
// And the equivalent for absolute addresses (byte loads via global ptr).
// (Already covered for Wrapper(global) above; this catches the case
// where the ptr is materialised as a value.)

// Intermediate pseudos used by the LDAptr/STAptr inserters.  Each takes
// a memfi describing the slot that holds the pointer;
// eliminateFrameIndex resolves it to LDA_StackRelIndY /
// STA_StackRelIndY with the right d-byte.  Y must hold 0 at the issue
// point (the inserter emits LDY #0 first), hence Uses = [Y] on both.
let hasSideEffects = 0, Uses = [Y] in {

// Load through the pointer stored in the slot.
let mayLoad = 1, mayStore = 0 in
def LDAfi_indY : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr),
                              "# LDAfi_indY $dst, $addr", []>;

// Store through the pointer stored in the slot.
let mayStore = 1, mayLoad = 0 in
def STAfi_indY : W65816Pseudo<(outs), (ins Acc16:$src, memfi:$addr),
                              "# STAfi_indY $src, $addr", []>;

} // hasSideEffects = 0, Uses = [Y]

// i8 truncating store via Acc16 pointer.  Same shape as STAptr, but the
// custom inserter wraps the actual STA in SEP/REP so the M-bit is 8
// across the store and only one byte is written.  Without the wrap the
// 16-bit STA would clobber the byte at ptr+1.  Two patterns select it:
// the natural truncstorei8 from an i16 value (common with arg
// promotion), given inline here, and a true i8 store (Acc8) that arises
// from i8-typed IR, given as a separate Pat record.
def STBptr : W65816Pseudo<(outs), (ins Acc16:$val, Wide16:$ptr),
                          "# STBptr $val, $ptr",
                          [(truncstorei8 Acc16:$val, Wide16:$ptr)]> {
  let usesCustomInserter = 1;
  let hasSideEffects     = 1;
  let mayStore           = 1;
  let Defs               = [Y, P];
}

// Pointer access with constant offset.  `(load (add ptr, $off))` and
// `(store val, (add ptr, $off))` come up for struct field access and
// array indexing with small constant offsets.  Without dedicated
// pseudos, the offset becomes an explicit ADC #imm that has to spill A
// and recompute the pointer per access.  With them, the inserter just
// loads Y with the offset (Y is 16-bit so any i16 constant fits).
// LDAptrOff / STAptrOff / STBptrOff: same [dp],Y lowering as the
// no-offset variants but with the offset folded into Y.
//
// None of these carries an inline pattern; selection is by explicit
// Pat records.
let usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P] in {

let mayLoad = 1 in
def LDAptrOff : W65816Pseudo<(outs Acc16:$dst),
                             (ins Wide16:$ptr, i16imm:$off),
                             "# LDAptrOff $dst, $ptr, $off", []>;

let mayStore = 1 in {
def STAptrOff : W65816Pseudo<(outs),
                             (ins Acc16:$val, Wide16:$ptr, i16imm:$off),
                             "# STAptrOff $val, $ptr, $off", []>;
def STBptrOff : W65816Pseudo<(outs),
                             (ins Acc16:$val, Wide16:$ptr, i16imm:$off),
                             "# STBptrOff $val, $ptr, $off", []>;
} // mayStore = 1

} // usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P]
// Selection patterns for pointer access with a constant offset.
// i16 load / store at ptr + constant.
def : Pat<(i16 (load (add Wide16:$ptr, (i16 imm:$off)))),
          (LDAptrOff Wide16:$ptr, imm:$off)>;
def : Pat<(store Acc16:$val, (add Wide16:$ptr, (i16 imm:$off))),
          (STAptrOff Acc16:$val, Wide16:$ptr, imm:$off)>;
// Byte stores at ptr + constant: truncating store of an i16 value, and
// a true i8 store whose Acc8 value is re-classed to Acc16 first.
def : Pat<(truncstorei8 Acc16:$val, (add Wide16:$ptr, (i16 imm:$off))),
          (STBptrOff Acc16:$val, Wide16:$ptr, imm:$off)>;
def : Pat<(store Acc8:$val, (add Wide16:$ptr, (i16 imm:$off))),
          (STBptrOff (COPY_TO_REGCLASS Acc8:$val, Acc16),
                     Wide16:$ptr, imm:$off)>;
// True i8 store through a plain pointer (no offset): the Acc8
// counterpart of STBptr's inline truncstorei8 pattern.
def : Pat<(store Acc8:$val, Wide16:$ptr),
          (STBptr (COPY_TO_REGCLASS Acc8:$val, Acc16), Wide16:$ptr)>;

// ---------------------------------------------------------------------
// ptr32 deref pseudos.  Same shape and inserter as LDAptr/STAptr/STBptr,
// but the pointer is a Wide32 (i32) value: sub_lo carries the low 16
// bits of the address, sub_hi carries the bank byte in its low half.
// Inserter stages the low 16 bits at $E0..$E1 and the bank byte at $E2,
// then emits LDA/STA [dp],Y just like the i16 path — but with a
// pointer-derived bank instead of a forced 0.
//
// Dead unless ptr32 mode is active (LowerLoad/LowerStore only emit
// W65816ldPtr/stPtr/stbPtr when the address is i32).
// ---------------------------------------------------------------------
// ptr32 deref pseudos.  The shared attributes sit on the outer `let`;
// each group adds its load/store direction.  Definition order is
// unchanged: base load, base stores, offset load, offset stores.
let usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P] in {

// Base forms, selected from the W65816ldPtr / stPtr / stbPtr nodes.
let mayLoad = 1 in
def LDAptr32 : W65816Pseudo<(outs Acc16:$dst), (ins AnyWide32:$ptr),
                            "# LDAptr32 $dst, $ptr",
                            [(set Acc16:$dst,
                                  (W65816ldPtr AnyWide32:$ptr))]>;

let mayStore = 1 in {
def STAptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr),
                            "# STAptr32 $val, $ptr",
                            [(W65816stPtr Acc16:$val, AnyWide32:$ptr)]>;
def STBptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr),
                            "# STBptr32 $val, $ptr",
                            [(W65816stbPtr Acc16:$val, AnyWide32:$ptr)]>;
} // mayStore = 1

// Constant-offset forms; no inline patterns.
let mayLoad = 1 in
def LDAptr32Off : W65816Pseudo<(outs Acc16:$dst),
                               (ins AnyWide32:$ptr, i16imm:$off),
                               "# LDAptr32Off $dst, $ptr, $off", []>;

let mayStore = 1 in {
def STAptr32Off : W65816Pseudo<(outs),
                               (ins Acc16:$val, AnyWide32:$ptr, i16imm:$off),
                               "# STAptr32Off $val, $ptr, $off", []>;
def STBptr32Off : W65816Pseudo<(outs),
                               (ins Acc16:$val, AnyWide32:$ptr, i16imm:$off),
                               "# STBptr32Off $val, $ptr, $off", []>;
} // mayStore = 1

} // usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P]

// Direct ptr32 load/store patterns over generic ISD::LOAD / ISD::STORE
// when the address is an i32 (AnyWide32) reg.  These are unreachable
// while i32 is not a legal type (ptr16 mode).  When ptr32 mode is
// activated they fire instead of the i16-pointer LDAptr / STAptr.
//
// i16 load / store and truncating byte store.
def : Pat<(i16 (load AnyWide32:$ptr)),
          (LDAptr32 AnyWide32:$ptr)>;
def : Pat<(store Acc16:$val, AnyWide32:$ptr),
          (STAptr32 Acc16:$val, AnyWide32:$ptr)>;
def : Pat<(truncstorei8 Acc16:$val, AnyWide32:$ptr),
          (STBptr32 Acc16:$val, AnyWide32:$ptr)>;
// Byte loads widened to i16: zext masks the high byte, anyext does not.
def : Pat<(i16 (zextloadi8 AnyWide32:$ptr)),
          (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF)>;
def : Pat<(i16 (extloadi8 AnyWide32:$ptr)),
          (LDAptr32 AnyWide32:$ptr)>;
// True i8 load / store: go through the 16-bit pseudos and re-class
// between Acc16 and Acc8.
def : Pat<(i8 (load AnyWide32:$ptr)),
          (COPY_TO_REGCLASS (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF), Acc8)>;
def : Pat<(store Acc8:$val, AnyWide32:$ptr),
          (STBptr32 (COPY_TO_REGCLASS Acc8:$val, Acc16), AnyWide32:$ptr)>;

// Off variants — folded constant-offset add patterns deferred until
// ptr32 mode is activated and we can profile real cases.  The base
// LDAptr32/STAptr32 pseudos handle the general (add ptr, off) case
// correctly via a separate i32 ADD; the Off pseudos are an optional
// optimization for small constant offsets.

// Split-pair variants: same semantics as LDAptr32/STAptr32/STBptr32 but
// the ptr is two separate i16 register operands (lo + hi) instead of
// one Wide32 register pair.  Used by the W65816LowerWide32 pre-RA pass
// to relieve register-pair allocation pressure: it walks REG_SEQUENCE
// + LDAptr32 chains, decomposes the Wide32 vregs into pairs of i16
// vregs, and rewrites the LDAptr32-family to take the two halves
// directly.
// Split-pair ptr32 pseudos: the pointer arrives as two Wide16 operands
// ($ptrLo, $ptrHi).  No inline patterns — none of these is selected
// directly from the DAG.
let usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P] in {

let mayLoad = 1 in
def LDAptr32S : W65816Pseudo<(outs Acc16:$dst),
                             (ins Wide16:$ptrLo, Wide16:$ptrHi),
                             "# LDAptr32S $dst, $ptrLo, $ptrHi", []>;

let mayStore = 1 in {
def STAptr32S : W65816Pseudo<(outs),
                             (ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi),
                             "# STAptr32S $val, $ptrLo, $ptrHi", []>;
def STBptr32S : W65816Pseudo<(outs),
                             (ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi),
                             "# STBptr32S $val, $ptrLo, $ptrHi", []>;
} // mayStore = 1

} // usesCustomInserter = 1, hasSideEffects = 1, Defs = [Y, P]

// i8 load via Acc16 pointer producing a true i8 (Acc8) result.  Reuses
// the existing zextloadi8 16-bit-LDA-and-mask path: load 2 bytes, mask
// the high byte, then narrow to Acc8.  COPY_TO_REGCLASS to Acc8 is a
// no-op at MC level (same physical A).  Reads one byte past the source;
// fine for char-array iteration where the buffer is at least 2 bytes.
// Note the pointer operand is matched as Wide16, as in LDAptr itself.
def : Pat<(i8 (load Wide16:$ptr)),
          (COPY_TO_REGCLASS (ANDi16imm (LDAptr Wide16:$ptr), 0xFF), Acc8)>;

// Acc8-to-Acc16 type conversions.  Both Acc8 and Acc16 alias physical A,
// so COPY_TO_REGCLASS is a no-op at MC level.  ZEXT additionally masks
// the high byte (which holds B from before any prior SEP).  ANYEXT
// leaves the high byte untouched since the consumer doesn't care.
//
// anyext: re-class only.
def : Pat<(i16 (anyext Acc8:$src)),
          (COPY_TO_REGCLASS Acc8:$src, Acc16)>;
// zext: re-class, then AND with 0xFF.
def : Pat<(i16 (zext Acc8:$src)),
          (ANDi16imm (COPY_TO_REGCLASS Acc8:$src, Acc16), 0xFF)>;
// trunc (Acc16 -> Acc8): re-class only, in the opposite direction.
def : Pat<(i8 (trunc Acc16:$src)),
          (COPY_TO_REGCLASS Acc16:$src, Acc8)>;

// Acc8 reg-reg arithmetic and bitwise ops, expanded through the Acc16
// _RR pseudos.  This is cheap because Acc8 and Acc16 alias the same
// physical A, so COPY_TO_REGCLASS is a no-op.  Only the low byte
// matters; the high byte picks up unrelated bits but is discarded by
// the final narrow-back to Acc8.  An i8 expression that legalization
// did not promote (e.g. an i8 XOR feeding only an i8 store) can thus
// reuse the spill-and-OPfi inserter without dedicated Acc8 pseudos.
//
//   node     - the i8 SDNode to match.
//   wideInst - the Acc16 _RR pseudo that implements it.
multiclass Acc8RR<SDNode node, Instruction wideInst> {
  def : Pat<(i8 (node Acc8:$a, Acc8:$b)),
            (COPY_TO_REGCLASS
                (wideInst (COPY_TO_REGCLASS Acc8:$a, Acc16),
                          (COPY_TO_REGCLASS Acc8:$b, Acc16)),
                Acc8)>;
}

// Arithmetic.
defm : Acc8RR<add, ADD_RR>;
defm : Acc8RR<sub, SUB_RR>;
// Bitwise.
defm : Acc8RR<and, AND_RR>;
defm : Acc8RR<or,  ORA_RR>;
defm : Acc8RR<xor, EOR_RR>;

// (memory inc/dec patterns moved below INC_Abs/DEC_Abs defs.)

// (Branch patterns moved below the Real Instructions section since
// they reference instruction defs.)

//===----------------------------------------------------------------------===//
// Real Instructions
//
// Opcodes taken from the WDC W65C816S data sheet.  Instructions whose size
// depends on the M or X bits exist in two variants (Imm8 / Imm16) and carry
// TSFlags bits indicating which processor mode they assume; the REP/SEP
// scheduling pass uses those to verify/insert mode transitions.
//
// Disassembler note: for every opcode that has both an _Imm8 and an _Imm16
// form (LDA/LDX/LDY/ADC/SBC/CMP/AND/ORA/EOR/BIT/CPX/CPY), the two forms share
// the same opcode byte but differ in operand width according to M/X mode.
// The scaffold disassembler only consults the default "W65816" decoder
// table, so we push the _Imm8 variants into namespaces "W65816MHigh" /
// "W65816XHigh".  That keeps only one variant per opcode in the default
// table (the 3-byte _Imm16 form, for M-dependent and X-dependent insns
// alike), so `llvm-objdump -d` always decodes these as 16-bit immediates
// until the mode-aware decoder lands.
//===----------------------------------------------------------------------===//

//---------------------------------------------------------------- CPU control
// NOP: declared free of loads, stores and side effects.
def NOP : InstImplied<0xEA, "nop"> {
  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// REP / SEP take an 8-bit immediate.  They never touch memory but are
// flagged hasSideEffects.
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
def REP : InstImm8<0xC2, "rep">;
def SEP : InstImm8<0xE2, "sep">;
}

// Implied-operand flag and control ops that never access memory.
let mayLoad = 0, mayStore = 0 in {
def CLC : InstImplied<0x18, "clc">;
def SEC : InstImplied<0x38, "sec">;
def CLI : InstImplied<0x58, "cli">;
def SEI : InstImplied<0x78, "sei">;
def CLD : InstImplied<0xD8, "cld">;
def SED : InstImplied<0xF8, "sed">;
def CLV : InstImplied<0xB8, "clv">;

def XCE : InstImplied<0xFB, "xce">;
def XBA : InstImplied<0xEB, "xba">;
}

// WAI / STP keep whatever mayLoad / mayStore / hasSideEffects defaults
// InstImplied provides; nothing is overridden here.
def WAI : InstImplied<0xCB, "wai">;
def STP : InstImplied<0xDB, "stp">;

// WDM (William D Mensch) — reserved 2-byte NOP-equivalent.  Useful as
// a debugger / emulator hook: MAME's apple2gs CPU traps on WDM and a
// Lua plugin can dispatch on the operand byte.  CPU-side, it acts as
// a 2-byte NOP.  Operand syntax mirrors MVN: `wdm $ab` (no `#`), which
// is why it is declared with the InstDP format rather than InstImm8.
def WDM : InstDP<0x42, "wdm">;

// TRB / TSB — Test and Reset/Set memory Bits.  Atomic bit clear/set
// on a byte (or 16-bit word, per the M flag) at the given DP or abs
// address.  Z flag set per (mem & A), where mem is the memory operand
// (not the M mode flag).  Useful for memory-mapped IO bit twiddling.
// No DP indexing form.
def TRB_DP  : InstDP<0x14, "trb">;
def TRB_Abs : InstAbs<0x1C, "trb">;
def TSB_DP  : InstDP<0x04, "tsb">;
def TSB_Abs : InstAbs<0x0C, "tsb">;

// PEI — Push Effective Indirect.  Reads a 16-bit value from DP and
// pushes it.  Useful for indirect parameter passing without going
// through A first.  Only the direct-page form is defined.
def PEI_DP : InstDP<0xD4, "pei">;

//---------------------------------------------------------------- LDA (load A)
// Mode-dependent immediates: every `_Imm8` load/arith/compare form is
// isCodeGenOnly, so the asm matcher never selects it.  The AsmParser
// cannot know the current M/X bits and therefore always picks the
// `_Imm16` form; codegen may still choose `_Imm8` explicitly once 8-bit
// patterns exist.
def LDA_Imm8 : InstImm8<0xA9, "lda"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
  let Defs = [A];
}
def LDA_Imm16 : InstImm16<0xA9, "lda"> {
  let MLow = 1;
  let Defs = [A];
}

// Memory-operand forms.
// NOTE(review): unlike the immediate and [dp] forms, these declare no
// Defs here — confirm whether the format classes or the emitting code
// model the write to A.
def LDA_DP     : InstDP<0xA5, "lda">;
def LDA_Abs    : InstAbs<0xAD, "lda">;
def LDA_Long   : InstAbsLong<0xAF, "lda">;
def LDA_DPX    : InstDPX<0xB5, "lda">;
def LDA_AbsX   : InstAbsX<0xBD, "lda">;
def LDA_AbsY   : InstAbsY<0xB9, "lda">;
def LDA_DPInd  : InstDPInd<0xB2, "lda">;
def LDA_DPIndY : InstDPIndY<0xB1, "lda">;
def LDA_DPIndX : InstDPIndX<0xA1, "lda">;

// Long-indirect forms define A explicitly.  LDA [dp],Y additionally
// reads Y to form the indexed address.  Without this metadata regalloc
// assumed A survived the load and dead-code-eliminated the COPYs meant
// to materialise the next pointer in A — a silent miscompile seen in
// mySwap-style helpers.
let Defs = [A] in {
def LDA_DPIndLong  : InstDPIndLong<0xA7, "lda">;
def LDA_DPIndLongY : InstDPIndLongY<0xB7, "lda"> { let Uses = [Y]; }
}
def LDA_LongX : InstAbsLongX<0xBF, "lda">;

//---------------------------------------------------------------- STA (store A)
// Plain addressing-mode forms; no implicit register uses are declared
// on these defs.
def STA_DP     : InstDP<0x85, "sta">;
def STA_Abs    : InstAbs<0x8D, "sta">;
def STA_Long   : InstAbsLong<0x8F, "sta">;
def STA_DPX    : InstDPX<0x95, "sta">;
def STA_AbsX   : InstAbsX<0x9D, "sta">;
def STA_AbsY   : InstAbsY<0x99, "sta">;
def STA_DPInd  : InstDPInd<0x92, "sta">;
def STA_DPIndY : InstDPIndY<0x91, "sta">;
def STA_DPIndX : InstDPIndX<0x81, "sta">;

// STA [dp]: reads A, the value being stored.
def STA_DPIndLong : InstDPIndLong<0x87, "sta"> {
  let Uses = [A];
}
// STA [dp],Y: reads A (the value) and Y (the index).  Listing both
// keeps A's value live across the instruction for regalloc.
def STA_DPIndLongY : InstDPIndLongY<0x97, "sta"> {
  let Uses = [A, Y];
}
def STA_LongX : InstAbsLongX<0x9F, "sta">;

//---------------------------------------------------------------- LDX (load X)
// Immediate forms are X-flag dependent: the 8-bit variant lives in the
// "W65816XHigh" decoder namespace and is codegen-only.  Both widths
// define X.
def LDX_Imm8 : InstImm8<0xA2, "ldx"> {
  let XHigh = 1;
  let DecoderNamespace = "W65816XHigh";
  let isCodeGenOnly = 1;
  let Defs = [X];
}
def LDX_Imm16 : InstImm16<0xA2, "ldx"> {
  let XLow = 1;
  let Defs = [X];
}
def LDX_DP   : InstDP<0xA6, "ldx">;
def LDX_Abs  : InstAbs<0xAE, "ldx">;
def LDX_DPY  : InstDPY<0xB6, "ldx">;
def LDX_AbsY : InstAbsY<0xBE, "ldx">;

//---------------------------------------------------------------- STX (store X)
def STX_DP  : InstDP<0x86, "stx">;
def STX_Abs : InstAbs<0x8E, "stx">;
def STX_DPY : InstDPY<0x96, "stx">;

//---------------------------------------------------------------- LDY (load Y)
// Same layout as LDX: X-flag-dependent immediates, both defining Y.
def LDY_Imm8 : InstImm8<0xA0, "ldy"> {
  let XHigh = 1;
  let DecoderNamespace = "W65816XHigh";
  let isCodeGenOnly = 1;
  let Defs = [Y];
}
def LDY_Imm16 : InstImm16<0xA0, "ldy"> {
  let XLow = 1;
  let Defs = [Y];
}
def LDY_DP   : InstDP<0xA4, "ldy">;
def LDY_Abs  : InstAbs<0xAC, "ldy">;
def LDY_DPX  : InstDPX<0xB4, "ldy">;
def LDY_AbsX : InstAbsX<0xBC, "ldy">;

//---------------------------------------------------------------- STY (store Y)
def STY_DP  : InstDP<0x84, "sty">;
def STY_Abs : InstAbs<0x8C, "sty">;
def STY_DPX : InstDPX<0x94, "sty">;

//---------------------------------------------------------------- STZ (store zero)
// Width follows M flag — same as STA.  Useful for zeroing DP scratch
// without burning A.  Compared with `LDA #0; STA dp`, it drops the
// whole `LDA #0` (3 bytes in its _Imm16 form) per zero.
def STZ_DP   : InstDP<0x64, "stz">;
def STZ_Abs  : InstAbs<0x9C, "stz">;
def STZ_DPX  : InstDPX<0x74, "stz">;
def STZ_AbsX : InstAbsX<0x9E, "stz">;

//------------------------------------------------------------------------- ADC
// ADC: M-flag-dependent immediates (8-bit form is codegen-only and
// decoded from the "W65816MHigh" namespace), then the memory forms.
def ADC_Imm8 : InstImm8<0x69, "adc"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def ADC_Imm16 : InstImm16<0x69, "adc"> {
  let MLow = 1;
}
def ADC_DP   : InstDP<0x65, "adc">;
def ADC_Abs  : InstAbs<0x6D, "adc">;
def ADC_DPX  : InstDPX<0x75, "adc">;
def ADC_AbsX : InstAbsX<0x7D, "adc">;
def ADC_AbsY : InstAbsY<0x79, "adc">;

//------------------------------------------------------------------------- SBC
// SBC mirrors ADC's set of forms.
def SBC_Imm8 : InstImm8<0xE9, "sbc"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def SBC_Imm16 : InstImm16<0xE9, "sbc"> {
  let MLow = 1;
}
def SBC_DP   : InstDP<0xE5, "sbc">;
def SBC_Abs  : InstAbs<0xED, "sbc">;
def SBC_DPX  : InstDPX<0xF5, "sbc">;
def SBC_AbsX : InstAbsX<0xFD, "sbc">;
def SBC_AbsY : InstAbsY<0xF9, "sbc">;

//------------------------------------------------------------------------- CMP
// Compares never store.  Immediate forms do not load either; memory
// forms only clear mayStore.
let mayLoad = 0, mayStore = 0 in {
def CMP_Imm8 : InstImm8<0xC9, "cmp"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def CMP_Imm16 : InstImm16<0xC9, "cmp"> { let MLow = 1; }
}
let mayStore = 0 in {
def CMP_DP   : InstDP<0xC5, "cmp">;
def CMP_Abs  : InstAbs<0xCD, "cmp">;
def CMP_DPX  : InstDPX<0xD5, "cmp">;
def CMP_AbsX : InstAbsX<0xDD, "cmp">;
def CMP_AbsY : InstAbsY<0xD9, "cmp">;
}

//---------------------------------------------------------------- CPX/CPY
// Index-register compares: immediates are X-flag dependent.
let mayLoad = 0, mayStore = 0 in {
def CPX_Imm8 : InstImm8<0xE0, "cpx"> {
  let XHigh = 1;
  let DecoderNamespace = "W65816XHigh";
  let isCodeGenOnly = 1;
}
def CPX_Imm16 : InstImm16<0xE0, "cpx"> { let XLow = 1; }
}
let mayStore = 0 in {
def CPX_DP  : InstDP<0xE4, "cpx">;
def CPX_Abs : InstAbs<0xEC, "cpx">;
}
let mayLoad = 0, mayStore = 0 in {
def CPY_Imm8 : InstImm8<0xC0, "cpy"> {
  let XHigh = 1;
  let DecoderNamespace = "W65816XHigh";
  let isCodeGenOnly = 1;
}
def CPY_Imm16 : InstImm16<0xC0, "cpy"> { let XLow = 1; }
}
let mayStore = 0 in {
def CPY_DP  : InstDP<0xC4, "cpy">;
def CPY_Abs : InstAbs<0xCC, "cpy">;
}

//---------------------------------------------------------------- AND/ORA/EOR
// Bitwise ops on A.  Immediate forms touch no memory; the DP / Abs
// forms only clear mayStore.  8-bit immediates are codegen-only and
// decoded from the "W65816MHigh" namespace.
let mayLoad = 0, mayStore = 0 in {
def AND_Imm8 : InstImm8<0x29, "and"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def AND_Imm16 : InstImm16<0x29, "and"> { let MLow = 1; }
}
let mayStore = 0 in {
def AND_DP  : InstDP<0x25, "and">;
def AND_Abs : InstAbs<0x2D, "and">;
}

let mayLoad = 0, mayStore = 0 in {
def ORA_Imm8 : InstImm8<0x09, "ora"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def ORA_Imm16 : InstImm16<0x09, "ora"> { let MLow = 1; }
}
let mayStore = 0 in {
def ORA_DP  : InstDP<0x05, "ora">;
def ORA_Abs : InstAbs<0x0D, "ora">;
}

let mayLoad = 0, mayStore = 0 in {
def EOR_Imm8 : InstImm8<0x49, "eor"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def EOR_Imm16 : InstImm16<0x49, "eor"> { let MLow = 1; }
}
let mayStore = 0 in {
def EOR_DP  : InstDP<0x45, "eor">;
def EOR_Abs : InstAbs<0x4D, "eor">;
}

// BIT: note the immediate opcode (0x89) does not follow the DP / Abs
// opcode pattern (0x24 / 0x2C).
let mayLoad = 0, mayStore = 0 in {
def BIT_Imm8 : InstImm8<0x89, "bit"> {
  let MHigh = 1;
  let DecoderNamespace = "W65816MHigh";
  let isCodeGenOnly = 1;
}
def BIT_Imm16 : InstImm16<0x89, "bit"> { let MLow = 1; }
}
let mayStore = 0 in {
def BIT_DP  : InstDP<0x24, "bit">;
def BIT_Abs : InstAbs<0x2C, "bit">;
}

//---------------------------------------------------------------- INC/DEC
// Register increments / decrements: implied operand, no memory access.
// The accumulator forms are spelled `inc a` / `dec a` in assembly.
let mayLoad = 0, mayStore = 0 in {
def INA : InstImplied<0x1A, "inc a">;
def DEA : InstImplied<0x3A, "dec a">;
def INX : InstImplied<0xE8, "inx">;
def DEX : InstImplied<0xCA, "dex">;
def INY : InstImplied<0xC8, "iny">;
def DEY : InstImplied<0x88, "dey">;
}

// Memory increments: the format-class memory flags are left as-is.
def INC_DP   : InstDP<0xE6, "inc">;
def INC_Abs  : InstAbs<0xEE, "inc">;
def INC_DPX  : InstDPX<0xF6, "inc">;
def INC_AbsX : InstAbsX<0xFE, "inc">;

// Memory decrements, same four addressing modes.
def DEC_DP   : InstDP<0xC6, "dec">;
def DEC_Abs  : InstAbs<0xCE, "dec">;
def DEC_DPX  : InstDPX<0xD6, "dec">;
def DEC_AbsX : InstAbsX<0xDE, "dec">;

//---------------------------------------------------------------- Shifts
// Accumulator shifts / rotates: implied operand, no memory access.
let mayLoad = 0, mayStore = 0 in {
def ASL_A : InstImplied<0x0A, "asl a">;
def LSR_A : InstImplied<0x4A, "lsr a">;
def ROL_A : InstImplied<0x2A, "rol a">;
def ROR_A : InstImplied<0x6A, "ror a">;
}

// Memory shifts / rotates (DP and absolute only); memory flags are the
// format-class defaults.
def ASL_DP  : InstDP<0x06, "asl">;
def ASL_Abs : InstAbs<0x0E, "asl">;
def LSR_DP  : InstDP<0x46, "lsr">;
def LSR_Abs : InstAbs<0x4E, "lsr">;
def ROL_DP  : InstDP<0x26, "rol">;
def ROL_Abs : InstAbs<0x2E, "rol">;
def ROR_DP  : InstDP<0x66, "ror">;
def ROR_Abs : InstAbs<0x6E, "ror">;

//---------------------------------------------------------------- Transfers
// Defs/Uses metadata is critical: without it, machine-cp doesn't see
// that TAX (etc.) reads the source register, and may delete a `$a =
// COPY $x` immediately preceding it as a "dead store" — corrupting
// the data flow.  See feedback_w65816_implied_ops.md for the canary.
//
// Every transfer below therefore names the register it reads (Uses)
// and the register it writes (Defs).  None of them accesses memory.
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [A]; }
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [A]; }
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [X]; }
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [Y]; }
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [X]; }
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [Y]; }
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; let Defs = [SP]; let Uses = [X]; }
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [SP]; }
// Accumulator <-> direct-page-register transfers.  Only the A side is
// modelled: no direct-page register def is referenced in this part of
// the file.  TCD reads A; TDC overwrites A.
def TCD : InstImplied<0x5B, "tcd"> { let mayLoad = 0; let mayStore = 0; let Uses = [A]; }
def TDC : InstImplied<0x7B, "tdc"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; }
// Accumulator <-> stack-pointer transfers, modelled like TXS / TSX.
// Without these, a COPY into A that feeds TCS looks dead, and a value
// held in A across TSC looks preserved.
def TCS : InstImplied<0x1B, "tcs"> { let mayLoad = 0; let mayStore = 0; let Defs = [SP]; let Uses = [A]; }
def TSC : InstImplied<0x3B, "tsc"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [SP]; }

//---------------------------------------------------------------- Stack push/pull
// Implied-operand push (PHx) / pull (PLx) pairs for A, X, Y, P, B and
// D, plus PHK which has no pull counterpart here.  All keep the
// InstImplied defaults for mayLoad / mayStore / hasSideEffects.
// NOTE(review): none of these declare Defs/Uses (e.g. PHA reading A,
// PLA writing A), unlike the transfer ops — confirm whether the
// emitting code adds the implicit operands.
def PHA : InstImplied<0x48, "pha">;
def PLA : InstImplied<0x68, "pla">;
def PHX : InstImplied<0xDA, "phx">;
def PLX : InstImplied<0xFA, "plx">;
def PHY : InstImplied<0x5A, "phy">;
def PLY : InstImplied<0x7A, "ply">;
def PHP : InstImplied<0x08, "php">;
def PLP : InstImplied<0x28, "plp">;
def PHB : InstImplied<0x8B, "phb">;
def PLB : InstImplied<0xAB, "plb">;
def PHD : InstImplied<0x0B, "phd">;
def PLD : InstImplied<0x2B, "pld">;
def PHK : InstImplied<0x4B, "phk">;
// PEA / PER carry an operand: PEA uses the absolute (16-bit) format,
// PER the 16-bit PC-relative format.
def PEA : InstAbs<0xF4, "pea">;
def PER : InstPCRel16<0x62, "per">;

//---------------------------------------------------------------- Branches
// Conditional branches consume the status register, so every Bxx
// carries Uses = [P].  Before this was modelled, MachineCSE saw no link
// between a CMP (which defines P) and the Bxx that reads it, and would
// reuse a "redundant" earlier CMP even though an intervening
// LDA/STA/ADC had clobbered the flags.  Expressing the dependency is
// the principled fix; the W65816TargetMachine workaround that disabled
// MachineCSE entirely can come back out once this is verified.
let Uses = [P], isBranch = 1, isTerminator = 1 in
let mayLoad = 0, mayStore = 0 in {
  def BEQ : InstPCRel8<0xF0, "beq">;
  def BNE : InstPCRel8<0xD0, "bne">;
  def BCS : InstPCRel8<0xB0, "bcs">;
  def BCC : InstPCRel8<0x90, "bcc">;
  def BMI : InstPCRel8<0x30, "bmi">;
  def BPL : InstPCRel8<0x10, "bpl">;
  def BVS : InstPCRel8<0x70, "bvs">;
  def BVC : InstPCRel8<0x50, "bvc">;
}

// Unconditional control transfers.  All are terminators and barriers
// (control never falls through) and are declared not to load or store.
let isBranch = 1, isTerminator = 1, isBarrier = 1, mayLoad = 0, mayStore = 0 in {
def BRA : InstPCRel8<0x80, "bra">;
def BRL : InstPCRel16<0x82, "brl">;
def JMP_Abs    : InstAbs<0x4C, "jmp">;
// JMP (abs) jumps through a pointer, so its operand is not the branch
// destination.  Flag it as an indirect branch so generic code does not
// treat it as a direct unconditional branch to its operand.
def JMP_AbsInd : InstAbsInd<0x6C, "jmp"> { let isIndirectBranch = 1; }
def JML_Long   : InstAbsLong<0x5C, "jml">;
}

//---------------------------------------------------------------- Calls
// Real call instructions: 16-bit absolute JSR and 24-bit long JSL.
// Both are flagged isCall and declared not to load or store.
def JSR_Abs : InstAbs<0x20, "jsr"> {
  let isCall = 1;
  let mayLoad = 0;
  let mayStore = 0;
}
def JSL_Long : InstAbsLong<0x22, "jsl"> {
  let isCall = 1;
  let mayLoad = 0;
  let mayStore = 0;
}

//---------------------------------------------------------------- Returns
// Return instructions: terminators and barriers, declared not to load
// or store.
let isReturn = 1, isTerminator = 1, isBarrier = 1 in
let mayLoad = 0, mayStore = 0 in {
  def RTS : InstImplied<0x60, "rts">;
  def RTI : InstImplied<0x40, "rti">;
  // RTL, the 65816 long return, is the one selected for the generic
  // retglue node.
  def RTL : InstImplied<0x6B, "rtl"> { let Pattern = [(W65816retglue)]; }
}

//---------------------------------------------------------------- Block move
// MVN/MVP are 3 bytes: opcode + destBank + srcBank.  This backend's
// assembly syntax lists the operands as "dst, src", i.e. in the same
// order as the bytes on the wire (dst, then src).
// NOTE(review): WDC's reference assembler syntax is usually given as
// "src, dst" — confirm which order the original wording intended.
// Block-move operands are bank bytes written without a '#' prefix
// (e.g. `mvn $01, $02`), so the parser produces AddrDP-kind operands,
// not immediates.  Use addrDP here to match that; the encoder path is
// identical since both are single-byte values.
class InstBlockMove<bits<8> op, string mnem>
    : W65816Inst<(outs), (ins addrDP:$dst, addrDP:$src),
                 !strconcat(mnem, "\t$dst, $src")> {
  let Size = 3;
  // Operand fields; each is one bank byte.
  bits<8>  dst;
  bits<8>  src;
  // 24-bit encoding: byte 0 = opcode, byte 1 = dst bank, byte 2 = src
  // bank.
  bits<24> Inst;
  let Inst{7-0}   = op;
  let Inst{15-8}  = dst;
  let Inst{23-16} = src;
}

def MVN : InstBlockMove<0x54, "mvn">;
def MVP : InstBlockMove<0x44, "mvp">;

//---------------------------------------------------------------- Stack-rel
// Stack-relative forms of the accumulator load/store, arithmetic,
// compare and bitwise ops.  All use the InstStackRel format with its
// default flags.
def LDA_StackRel : InstStackRel<0xA3, "lda">;
def STA_StackRel : InstStackRel<0x83, "sta">;
def ADC_StackRel : InstStackRel<0x63, "adc">;
def SBC_StackRel : InstStackRel<0xE3, "sbc">;
def CMP_StackRel : InstStackRel<0xC3, "cmp">;
def AND_StackRel : InstStackRel<0x23, "and">;
def ORA_StackRel : InstStackRel<0x03, "ora">;
def EOR_StackRel : InstStackRel<0x43, "eor">;

//---------------------------------------------------------------- Stack-ind-Y
// Stack-relative indirect indexed-Y: deref a pointer spilled at S+off.
// Only the load and store forms are defined.
def LDA_StackRelIndY : InstStackRelIndY<0xB3, "lda">;
def STA_StackRelIndY : InstStackRelIndY<0x93, "sta">;

//===----------------------------------------------------------------------===//
// Branch patterns (placed after the Bxx defs).
//
// W65816brcc takes (Dest, CondCode) plus a glue from W65816cmp.  The CC
// constant maps to one of the eight Bxx instructions.  Values mirror
// W65816CC::CondCode in W65816.h.
//
// Mapping used below: 0=BEQ 1=BNE 2=BCS 3=BCC 4=BMI 5=BPL 6=BVS 7=BVC.
//===----------------------------------------------------------------------===//

def : Pat<(W65816brcc bb:$dest, (i8 0)), (BEQ bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 1)), (BNE bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 2)), (BCS bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 3)), (BCC bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 4)), (BMI bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 5)), (BPL bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 6)), (BVS bb:$dest)>;
def : Pat<(W65816brcc bb:$dest, (i8 7)), (BVC bb:$dest)>;

// Unconditional branch from generic ISD::BR.  Always selects the 8-bit
// PC-relative BRA form.
def : Pat<(br bb:$dest), (BRA bb:$dest)>;

// Memory inc/dec: `*p = *p + 1` → `INC abs` and `*p = *p + (-1)` →
// `DEC abs`.  Single-instruction RMW instead of LDA → CLC → ADC #1 →
// STA.  Only i16 accesses to a wrapped global address are matched, and
// the same $g is named in both the load and the store.
// Increment: add of constant 1.
def : Pat<(store
           (i16 (add (i16 (load (W65816Wrapper tglobaladdr:$g))),
                     (i16 1))),
           (W65816Wrapper tglobaladdr:$g)),
          (INC_Abs tglobaladdr:$g)>;
// Decrement: add of constant -1.
def : Pat<(store
           (i16 (add (i16 (load (W65816Wrapper tglobaladdr:$g))),
                     (i16 -1))),
           (W65816Wrapper tglobaladdr:$g)),
          (DEC_Abs tglobaladdr:$g)>;

// Direct call to a global / external symbol.  We use JSL (24-bit
// long jump-and-link) and RTL pairing throughout — matches the
// IIgs convention where main is entered via JSL, and means a
// function doesn't have to know how it was called to choose its
// return instruction.  A pseudo bridges the i16 symbol operand
// to JSL_Long's 24-bit operand class.
// Defs lists ALL caller-clobbered regs.  The 65816 has no
// caller/callee-save split — every callee may freely modify
// A/X/Y/DPF0/P/etc.  Critically, i32/i64 returns place high
// halves in X (i32), Y and DPF0 (i64); without those in Defs,
// the InstrEmitter does not add implicit-defs for glued
// CopyFromReg(X/Y/DPF0) on the call MI, and the verifier sees
// the post-call `COPY $y` as reading an undefined register.
// DPF0 was historically the only "extra" def so getLoad(0xF0)
// wouldn't CSE across calls; the same anti-CSE rationale applies
// to A/X/Y, but more fundamentally those are call return slots.
// P is listed as well: the callee may leave any flags behind, and the
// conditional branches read P, so flags computed before a call must
// not be treated as still valid after it.
let isCall = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0,
    Defs = [A, X, Y, DPF0, P] in {
// 16-bit-pointer variant: the call target is an i16 symbol operand.
def JSLpseudo : W65816Pseudo<(outs), (ins i16imm:$dst),
                             "# JSLpseudo $dst", []>;
// ptr32 variant — same expansion in AsmPrinter; the operand class
// just exists so tablegen accepts an i32-typed tglobaladdr operand.
def JSLpseudo32 : W65816Pseudo<(outs), (ins i32imm:$dst),
                               "# JSLpseudo32 $dst", []>;
}

// 16-bit call targets (global address or external symbol) select
// JSLpseudo.
def : Pat<(W65816call (i16 tglobaladdr:$dst)),  (JSLpseudo tglobaladdr:$dst)>;
def : Pat<(W65816call (i16 texternalsym:$dst)), (JSLpseudo texternalsym:$dst)>;
// ptr32: under p:32:16, call targets are i32 (iPTR matches the pointer
// width).  Same JSL_Long instruction handles either width — the OMF
// cRELOC opcode rewrites the offset and bank at load time.
def : Pat<(W65816call (i32 tglobaladdr:$dst)),  (JSLpseudo32 tglobaladdr:$dst)>;
def : Pat<(W65816call (i32 texternalsym:$dst)), (JSLpseudo32 texternalsym:$dst)>;