//===-- W65816InstrInfo.cpp - W65816 Instruction Information --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Skeleton instruction-info implementation.  Real register copy and stack
// spill/reload lowering will be added once the instruction set is described.
//
//===----------------------------------------------------------------------===//

#include "W65816InstrInfo.h"
#include "W65816.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "W65816GenInstrInfo.inc"

// Intentionally empty out-of-line member.  Presumably the usual LLVM
// "anchor" idiom that pins the class's vtable to this translation unit —
// confirm against the declaration in W65816InstrInfo.h.
void W65816InstrInfo::anchor() {
}

// Constructs the instruction info for one subtarget.
//
// The generated base class receives the subtarget, a reference to our
// register-info member, and the two call-frame pseudo opcodes
// (ADJCALLSTACKDOWN / ADJCALLSTACKUP).  Note the base-class constructor
// runs before RI itself is constructed, so it may only bind the
// reference — NOTE(review): confirm the generated constructor does not
// read through it.
W65816InstrInfo::W65816InstrInfo(const W65816Subtarget &STI)
    : W65816GenInstrInfo(STI, RI,
                         W65816::ADJCALLSTACKDOWN, // call-sequence start
                         W65816::ADJCALLSTACKUP),  // call-sequence end
      RI() {
}

// Shared helpers exposed via W65816InstrInfo.h.  See the namespace
// comment there for usage notes.
namespace llvm {
namespace W65816Helpers {

// Maps an IMGn register to the direct-page address that backs it.
//
// IMG0..IMG7 occupy $D0..$DE and IMG8..IMG15 occupy $C0..$CE, two bytes
// apart in each bank of eight.  Returns -1 for any register that is not
// one of the sixteen IMG registers.
int imgDPAddr(unsigned Reg) {
  struct ImgSlot {
    unsigned Reg;
    int DPAddr;
  };
  static constexpr ImgSlot Slots[] = {
      {W65816::IMG0, 0xD0},  {W65816::IMG1, 0xD2},
      {W65816::IMG2, 0xD4},  {W65816::IMG3, 0xD6},
      {W65816::IMG4, 0xD8},  {W65816::IMG5, 0xDA},
      {W65816::IMG6, 0xDC},  {W65816::IMG7, 0xDE},
      {W65816::IMG8, 0xC0},  {W65816::IMG9, 0xC2},
      {W65816::IMG10, 0xC4}, {W65816::IMG11, 0xC6},
      {W65816::IMG12, 0xC8}, {W65816::IMG13, 0xCA},
      {W65816::IMG14, 0xCC}, {W65816::IMG15, 0xCE},
  };
  for (const ImgSlot &Slot : Slots)
    if (Slot.Reg == Reg)
      return Slot.DPAddr;
  return -1;
}


// Returns the conditional-branch opcode that tests the opposite
// condition (BEQ<->BNE, BCS<->BCC, BMI<->BPL, BVS<->BVC), or 0 when Opc
// is not one of those eight opcodes.
unsigned invertCondOpcode(unsigned Opc) {
  // Each row lists a branch and its logical opposite; the mapping is
  // symmetric, so a hit in either column yields the other one.
  static constexpr unsigned Opposites[][2] = {
      {W65816::BEQ, W65816::BNE},
      {W65816::BCS, W65816::BCC},
      {W65816::BMI, W65816::BPL},
      {W65816::BVS, W65816::BVC},
  };
  for (const auto &Pair : Opposites) {
    if (Opc == Pair[0])
      return Pair[1];
    if (Opc == Pair[1])
      return Pair[0];
  }
  return 0;
}


// Translates a stack-relative opcode (*_StackRel) into the matching
// direct-page opcode (*_DP).  Returns 0 when Opc has no entry in the
// table below.
unsigned getDpOpcodeForStackRel(unsigned Opc) {
  struct OpcodeMap {
    unsigned StackRel;
    unsigned DirectPage;
  };
  static constexpr OpcodeMap Table[] = {
      {W65816::LDA_StackRel, W65816::LDA_DP},
      {W65816::STA_StackRel, W65816::STA_DP},
      {W65816::ADC_StackRel, W65816::ADC_DP},
      {W65816::SBC_StackRel, W65816::SBC_DP},
      {W65816::CMP_StackRel, W65816::CMP_DP},
      {W65816::AND_StackRel, W65816::AND_DP},
      {W65816::ORA_StackRel, W65816::ORA_DP},
      {W65816::EOR_StackRel, W65816::EOR_DP},
  };
  for (const OpcodeMap &Entry : Table)
    if (Entry.StackRel == Opc)
      return Entry.DirectPage;
  return 0;
}


// Returns true when Opc is one of the accumulator arithmetic / logic
// pseudos enumerated below (frame-index, absolute and 16-bit-immediate
// forms).  Judging by the name these are the opcodes whose accumulator
// source is tied to the result — NOTE(review): confirm against the
// instruction definitions.
bool isTiedAcc16Consumer(unsigned Opc) {
  static constexpr unsigned Consumers[] = {
      // Frame-index forms.
      W65816::ADCfi, W65816::SBCfi, W65816::ANDfi, W65816::ORAfi,
      W65816::EORfi,
      // Absolute forms.
      W65816::ADCabs, W65816::SBCabs,
      // 16-bit immediate forms.
      W65816::ADCi16imm, W65816::SBCi16imm, W65816::ANDi16imm,
      W65816::ORAi16imm, W65816::EORi16imm,
  };
  for (unsigned Candidate : Consumers)
    if (Candidate == Opc)
      return true;
  return false;
}


bool hasTiedAcc16Src(const MachineInstr &MI) {
  if (!isTiedAcc16Consumer(MI.getOpcode())) return false;
  for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (!MO.isReg() || !MO.isUse()) continue;
    if (MI.isRegTiedToDefOperand(i)) return true;
  }
  return false;
}

} // namespace W65816Helpers
} // namespace llvm

// Emits, before iterator I in MBB, the instruction sequence that copies
// SrcReg into DestReg.
//
// Handled combinations, tested in this order:
//   * identical registers (nothing emitted);
//   * A <-> X, A <-> Y (TAX/TXA/TAY/TYA);
//   * A <-> IMGn (LDA dp / STA dp), IMGn -> IMGm (PHA-bracketed);
//   * A <-> SP (TSC / TCS);
//   * X <-> Y (bridged through A — clobbers A);
//   * X/Y <-> IMGn (LDX/STX/LDY/STY dp);
//   * A <-> DPF0 (LDA/STA $F0);
//   * Wide32 pairs, split into two 16-bit copies that recurse here;
//   * anything involving a virtual register (generic COPY pseudo).
// Any other combination is reported as a fatal error.
//
// KillSrc / RenamableDest / RenamableSrc are only forwarded to recursive
// calls and (KillSrc) to the generic COPY; the real transfer instructions
// built here carry no explicit register operands.
void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
                                  const DebugLoc &DL, Register DestReg,
                                  Register SrcReg, bool KillSrc,
                                  bool RenamableDest, bool RenamableSrc) const {
  if (DestReg == SrcReg)
    return;
  // A → X / X → A via TAX / TXA.  Used by i32 return ABI (lo in A, hi
  // in X) and by callers reading split-i32 results.  Both instructions
  // are 16-bit when M=0/X=0; that matches our default mode.
  if (DestReg == W65816::X && SrcReg == W65816::A) {
    BuildMI(MBB, I, DL, get(W65816::TAX));
    return;
  }
  if (DestReg == W65816::A && SrcReg == W65816::X) {
    BuildMI(MBB, I, DL, get(W65816::TXA));
    return;
  }
  // A → Y / Y → A via TAY / TYA.  Same M/X width caveat.
  if (DestReg == W65816::Y && SrcReg == W65816::A) {
    BuildMI(MBB, I, DL, get(W65816::TAY));
    return;
  }
  if (DestReg == W65816::A && SrcReg == W65816::Y) {
    BuildMI(MBB, I, DL, get(W65816::TYA));
    return;
  }
  // A → IMGn / IMGn → A: STA dp / LDA dp.  IMGn is DP-backed at fixed
  // addresses ($D0..$DE for IMG0..IMG7, $C0..$CE for IMG8..IMG15) — see
  // W65816Helpers::imgDPAddr.  imgDPAddr yields -1 for non-IMG registers.
  int srcImg = W65816Helpers::imgDPAddr(SrcReg);
  int dstImg = W65816Helpers::imgDPAddr(DestReg);
  if (DestReg == W65816::A && srcImg >= 0) {
    BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
    return;
  }
  if (dstImg >= 0 && SrcReg == W65816::A) {
    BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
    return;
  }
  // IMGn → IMGm: route through A, but PHA-bracket so A is preserved.
  // Without the bracket, regalloc could insert this COPY between a
  // def of A and the use of A (e.g. between `$a = COPY $img10` and
  // `STAfi $a, slot`, when both vregs are alive simultaneously and
  // the regalloc decides to shuffle img physregs in between).  The
  // unbracketed lda/sta clobbers A and the subsequent STAfi spills
  // garbage.  Observed under ptr32 + full IMG defs in the C++ try/
  // catch path: `*p = 42` after `__cxa_allocate_exception` stored
  // hi-half-of-ptr at lo-half-slot, breaking the indirect-long
  // address setup so 42 landed at the wrong place.
  //
  // PHA bracket cost: +PHA (3 cyc, 1 byte) + PLA (4 cyc, 1 byte); each
  // takes one extra cycle with a 16-bit accumulator, so roughly +7..9
  // cyc and +2 bytes per IMG-IMG copy.  These are rare (regalloc usually
  // can avoid them by picking the same physreg for COPY's src and
  // dst), so the cost is small.
  if (srcImg >= 0 && dstImg >= 0) {
    BuildMI(MBB, I, DL, get(W65816::PHA));
    BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
    BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
    BuildMI(MBB, I, DL, get(W65816::PLA));
    return;
  }
  // SP -> A via TSC.  Used by alloca / setjmp asm machinery.
  if (DestReg == W65816::A && SrcReg == W65816::SP) {
    BuildMI(MBB, I, DL, get(W65816::TSC));
    return;
  }
  // A -> SP via TCS.
  if (DestReg == W65816::SP && SrcReg == W65816::A) {
    BuildMI(MBB, I, DL, get(W65816::TCS));
    return;
  }
  // X <-> Y via A.  This CLOBBERS A: the caller is responsible for
  // ensuring A is dead at this program point (regalloc arranges this).
  // Used by greedy when an i16 vreg forced into one Idx16 reg gets
  // coalesced with a use in the other.
  // NOTE(review): the 65816 ISA does provide direct TXY / TYX transfers;
  // presumably they are just not described in this backend's .td files
  // yet.  Switching to them would remove the A clobber — confirm and
  // consider.
  if (DestReg == W65816::Y && SrcReg == W65816::X) {
    BuildMI(MBB, I, DL, get(W65816::TXA));
    BuildMI(MBB, I, DL, get(W65816::TAY));
    return;
  }
  if (DestReg == W65816::X && SrcReg == W65816::Y) {
    BuildMI(MBB, I, DL, get(W65816::TYA));
    BuildMI(MBB, I, DL, get(W65816::TAX));
    return;
  }
  // X → IMGn / IMGn → X: STX dp / LDX dp.  Used by the i64-first-arg
  // entry COPY (LowerFormalArguments routes arg0_ml through Img16 to
  // dodge the TXA-bridge-clobbers-A spill bug for udivmod-shaped
  // signatures).
  if (dstImg >= 0 && SrcReg == W65816::X) {
    BuildMI(MBB, I, DL, get(W65816::STX_DP)).addImm(dstImg);
    return;
  }
  if (DestReg == W65816::X && srcImg >= 0) {
    BuildMI(MBB, I, DL, get(W65816::LDX_DP)).addImm(srcImg);
    return;
  }
  // Y -> IMGn / IMGn -> Y: STY dp / LDY dp.  Symmetric with the X
  // case above.  NOTE(review): the original note said this serves the
  // i32-first-arg ABI and Wide32 pair copies with one half in Y, but the
  // pair table below never produces Y as a half — confirm the real users.
  if (dstImg >= 0 && SrcReg == W65816::Y) {
    BuildMI(MBB, I, DL, get(W65816::STY_DP)).addImm(dstImg);
    return;
  }
  if (DestReg == W65816::Y && srcImg >= 0) {
    BuildMI(MBB, I, DL, get(W65816::LDY_DP)).addImm(srcImg);
    return;
  }
  // DPF0 → A: emit `LDA $F0`.  DPF0 is the pseudo-physreg carrier
  // for an i64-returning call's high 16 bits; LowerCall builds a
  // CopyFromReg(DPF0) glued to the call so the SDAG combiner /
  // scheduler can't merge or reorder reads across calls.
  if (DestReg == W65816::A && SrcReg == W65816::DPF0) {
    BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(0xF0);
    return;
  }
  // A → DPF0: emit `STA $F0`.  Used by LowerReturn for the i64 high
  // half; using a true direct-page store is critical because plain
  // ISD::STORE with addr=0xF0 was lowering to `(d,s),y` indirect via
  // DBR — which silently broke under DBR != 0 (e.g. after a bank
  // switch).  STA dp uses D + dp directly, ignoring DBR.
  if (DestReg == W65816::DPF0 && SrcReg == W65816::A) {
    BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(0xF0);
    return;
  }
  // Wide32 (AX32 or IMG-pair) <-> Wide32 copy: split on sub_lo / sub_hi
  // and recurse.  Use a hand-written dispatch instead of getSubReg
  // because the MCRegisterInfo::getSubReg path crashes when called
  // from TargetInstrInfo::lowerCopy on regs that are not pair regs
  // (the table lookup walks past the end of the diff list).
  // Returns {lo, hi}; both are the invalid Register for non-pair inputs.
  auto wide32Halves = [](Register R)
      -> std::pair<Register, Register> {
    switch (R) {
    case W65816::AX32:    return {W65816::A,     W65816::X};
    case W65816::IMG01:   return {W65816::IMG0,  W65816::IMG1};
    case W65816::IMG23:   return {W65816::IMG2,  W65816::IMG3};
    case W65816::IMG45:   return {W65816::IMG4,  W65816::IMG5};
    case W65816::IMG67:   return {W65816::IMG6,  W65816::IMG7};
    case W65816::IMG89:   return {W65816::IMG8,  W65816::IMG9};
    case W65816::IMG1011: return {W65816::IMG10, W65816::IMG11};
    case W65816::IMG1213: return {W65816::IMG12, W65816::IMG13};
    case W65816::IMG1415: return {W65816::IMG14, W65816::IMG15};
    default:              return {Register(),    Register()};
    }
  };
  auto [srcLo, srcHi] = wide32Halves(SrcReg);
  auto [dstLo, dstHi] = wide32Halves(DestReg);
  if (srcLo && srcHi && dstLo && dstHi) {
    // Wide32 -> Wide32.  Lo-first order is correct in every direction:
    //   AX32 -> IMG_pair  : STA dstLo (A live), then STX dstHi
    //   IMG_pair -> AX32  : LDA srcLo, then LDX srcHi (independent halves)
    //   IMG_pair -> IMG_pair : LDA/STA chain twice (A is only per-half
    //                          scratch, and is PHA/PLA-preserved)
    copyPhysReg(MBB, I, DL, dstLo, srcLo, KillSrc,
                RenamableDest, RenamableSrc);
    copyPhysReg(MBB, I, DL, dstHi, srcHi, KillSrc,
                RenamableDest, RenamableSrc);
    return;
  }
  // Wide32 -> i16: take sub_lo of source.  Arises post-RA when an
  // EXTRACT_SUBREG was lowered as a parent-reg COPY (the SubRegIndex
  // is dropped by lowerCopy).
  if (srcLo && srcHi && !dstLo) {
    copyPhysReg(MBB, I, DL, DestReg, srcLo, KillSrc,
                RenamableDest, RenamableSrc);
    return;
  }
  // i16 -> Wide32: write sub_lo only (sub_hi left as caller had it,
  // matching INSERT_SUBREG semantics).  Arises post-RA when REG_SEQUENCE
  // is expanded into per-half COPY pseudos, then a parent-reg COPY of
  // a sub-reg-only def appears.
  if (!srcLo && dstLo && dstHi) {
    copyPhysReg(MBB, I, DL, dstLo, SrcReg, KillSrc,
                RenamableDest, RenamableSrc);
    return;
  }
  // Virtual-register caller: this happens when the inline spiller
  // (called from Basic regalloc) rewrites uses of a spilled vreg and
  // asks us to copy through A before its physreg has been assigned.
  // Emit a generic COPY pseudo and let the regalloc rewriter / a later
  // ExpandPostRA pass resolve it once both regs are physical.
  if (SrcReg.isVirtual() || DestReg.isVirtual()) {
    BuildMI(MBB, I, DL, get(TargetOpcode::COPY), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Unhandled physical-register pair.  This path is reachable for any
  // combination not listed above, so it must fail deterministically in
  // every build mode: llvm_unreachable would be undefined behaviour in
  // release builds.  Print the offending registers first, then abort.
  const TargetRegisterInfo *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
  llvm::errs() << "W65816 copyPhysReg unhandled: src="
               << (SrcReg.isPhysical() ? TRI->getRegAsmName(SrcReg) : "<vreg>")
               << " dst="
               << (DestReg.isPhysical() ? TRI->getRegAsmName(DestReg) : "<vreg>")
               << " srcImg=" << srcImg << " dstImg=" << dstImg << "\n";
  report_fatal_error("W65816: cross-class copyPhysReg not yet implemented");
}

// Spills SrcReg to stack slot FrameIdx, inserting the code before MI.
//
// All stores go through the STAfi pseudo, whose source is A;
// W65816RegisterInfo::eliminateFrameIndex later turns it into a real
// STA d,S.  Registers other than A are therefore bridged through A first
// (TXA / TYA / LDA dp via copyPhysReg).
//
//   RC      – register class of the spilled value; the Wide32 classes
//             select the two-halves path (offsets 0 and 2 of the slot).
//   isKill  – whether SrcReg is dead after the spill.  Only honoured on
//             the 16-bit path and for the AX32 restore below.
//   VReg, Flags – unused here.
//
// NOTE(review): bridging X, Y or an IMG register through A overwrites A
// without checking whether an unrelated value is live there; only the
// AX32 case (where A is part of the spilled value) is repaired below.
void W65816InstrInfo::storeRegToStackSlot(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
    bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg,
    MachineInstr::MIFlag Flags) const {
  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  // Wide32 spill: split into 2 i16 stores at offsets 0 and 2 of the
  // 4-byte spill slot.  Bridge each half through A using copyPhysReg.
  if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
      RC == &W65816::AnyWide32RegClass) {
    Register Lo, Hi;
    switch (SrcReg) {
    case W65816::AX32:    Lo = W65816::A;     Hi = W65816::X;     break;
    case W65816::IMG01:   Lo = W65816::IMG0;  Hi = W65816::IMG1;  break;
    case W65816::IMG23:   Lo = W65816::IMG2;  Hi = W65816::IMG3;  break;
    case W65816::IMG45:   Lo = W65816::IMG4;  Hi = W65816::IMG5;  break;
    case W65816::IMG67:   Lo = W65816::IMG6;  Hi = W65816::IMG7;  break;
    case W65816::IMG89:   Lo = W65816::IMG8;  Hi = W65816::IMG9;  break;
    case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
    case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
    case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
    default:
      // Regalloc occasionally hands us an UNPAIRED single i16 physreg
      // (Acc16 / Img16 / Idx16) for a Wide32-class spill — happens when
      // only one sub-reg is live at the spill point and the regalloc
      // decides to spill it through the Wide32 path anyway.  Treat as
      // a single i16 store of the lone half at offset 0; the matching
      // unpaired reload in loadRegFromStackSlot likewise reads only
      // offset 0, so bytes 2..3 of the slot are never written or read
      // on this path.
      // NOTE(review): a *virtual* SrcReg also lands here (copyPhysReg's
      // comments say the inline spiller can hand us vregs).  For a
      // genuine 32-bit vreg that would store only the low half — confirm
      // this cannot happen or add a sub-register split.
      if (SrcReg != W65816::A) {
        copyPhysReg(MBB, MI, DL, W65816::A, SrcReg, false);
      }
      BuildMI(MBB, MI, DL, get(W65816::STAfi))
          .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(0);
      return;
    }
    // Bridge lo through A, store at offset 0; bridge hi through A,
    // store at offset 2.  This is brittle in the face of regalloc
    // expectations — Wide32 spills are best avoided by keeping the
    // pair in registers if at all possible.
    if (Lo != W65816::A) {
      copyPhysReg(MBB, MI, DL, W65816::A, Lo, false);
    }
    BuildMI(MBB, MI, DL, get(W65816::STAfi))
        .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(0);
    copyPhysReg(MBB, MI, DL, W65816::A, Hi, false);
    BuildMI(MBB, MI, DL, get(W65816::STAfi))
        .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(2);
    // AX32 only: bridging the hi half (TXA) just overwrote A, which is
    // the lo half of the very register being spilled.  If the source
    // stays live after the spill, restore A from the word we stored at
    // offset 0 so AX32 still holds its full value.
    if (Lo == W65816::A && !isKill) {
      BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
          .addFrameIndex(FrameIdx).addImm(0);
    }
    return;
  }
  // 16-bit spill.  X / Y are transferred into A first because STAfi can
  // only store the accumulator.
  if (SrcReg == W65816::X || SrcReg == W65816::Y) {
    unsigned XferOp = (SrcReg == W65816::X) ? W65816::TXA : W65816::TYA;
    BuildMI(MBB, MI, DL, get(XferOp));
    SrcReg = W65816::A;
  }
  BuildMI(MBB, MI, DL, get(W65816::STAfi))
      .addReg(SrcReg, getKillRegState(isKill))
      .addFrameIndex(FrameIdx)
      .addImm(0);
}

// Reloads DestReg from stack slot FrameIdx, inserting the code before MI.
//
// LDAfi only knows how to put the value in A.  If regalloc asks for
// a reload into X or Y, we have to bridge through A: LDA d,S then
// TAX / TAY.  Without this, the MIR has `$x = LDAfi` but the asm
// printer emits just `LDA d,S` (which writes A, not X) — a silent
// miscompile that surfaced as i64 subtract chains using stale X
// values for the second word (caught by udivmod's `a - q*b` mod
// computation).
//
//   RC – register class of the reloaded value; the Wide32 classes select
//        the two-halves path (offsets 0 and 2 of the slot).
//   VReg, SubReg, Flags – unused here.
//
// Every bridged reload leaves A overwritten.
void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           Register DestReg, int FrameIdx,
                                           const TargetRegisterClass *RC,
                                           Register VReg, unsigned SubReg,
                                           MachineInstr::MIFlag Flags) const {
  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  // Wide32 reload: 2 i16 loads at offsets 0 and 2 of the 4-byte slot.
  if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
      RC == &W65816::AnyWide32RegClass) {
    Register Lo, Hi;
    switch (DestReg) {
    case W65816::AX32:    Lo = W65816::A;     Hi = W65816::X;     break;
    case W65816::IMG01:   Lo = W65816::IMG0;  Hi = W65816::IMG1;  break;
    case W65816::IMG23:   Lo = W65816::IMG2;  Hi = W65816::IMG3;  break;
    case W65816::IMG45:   Lo = W65816::IMG4;  Hi = W65816::IMG5;  break;
    case W65816::IMG67:   Lo = W65816::IMG6;  Hi = W65816::IMG7;  break;
    case W65816::IMG89:   Lo = W65816::IMG8;  Hi = W65816::IMG9;  break;
    case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
    case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
    case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
    default:
      // Mirror of the unpaired-spill case in storeRegToStackSlot:
      // regalloc handed us a single physreg for a Wide32 reload.
      // Just load the lo half from offset 0 into the dest.
      BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
          .addFrameIndex(FrameIdx).addImm(0);
      if (DestReg != W65816::A)
        copyPhysReg(MBB, MI, DL, DestReg, W65816::A, false);
      return;
    }
    // Both halves travel through A, so the HI half must be reloaded
    // first.  For AX32 the lo half lives in A itself: loading lo first
    // and then staging hi through A would overwrite it, leaving A == X ==
    // hi.  Hi-first yields `LDA 2,slot; TAX; LDA 0,slot`.  For IMG pairs
    // the order is irrelevant (A is pure scratch).
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
        .addFrameIndex(FrameIdx).addImm(2);
    if (Hi != W65816::A)
      copyPhysReg(MBB, MI, DL, Hi, W65816::A, false);
    // Lo half last: LDA from offset 0, transfer to Lo if it is not A.
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
        .addFrameIndex(FrameIdx).addImm(0);
    if (Lo != W65816::A)
      copyPhysReg(MBB, MI, DL, Lo, W65816::A, false);
    return;
  }
  if (DestReg == W65816::A) {
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), DestReg)
        .addFrameIndex(FrameIdx)
        .addImm(0);
    return;
  }
  if (DestReg == W65816::X || DestReg == W65816::Y) {
    // Load via A, then transfer.  A is implicitly clobbered.
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
        .addFrameIndex(FrameIdx)
        .addImm(0);
    unsigned XferOp = (DestReg == W65816::X) ? W65816::TAX : W65816::TAY;
    BuildMI(MBB, MI, DL, get(XferOp));
    return;
  }
  // Fallback: assume A path (covers Acc16 / Wide16 vregs by class).
  BuildMI(MBB, MI, DL, get(W65816::LDAfi), DestReg)
      .addFrameIndex(FrameIdx)
      .addImm(0);
}

// If MI is a plain reload of a whole stack slot, stores the slot's frame
// index in FrameIndex and returns the destination register; otherwise
// returns the invalid (zero) register and leaves FrameIndex untouched.
//
// Only `LDAfi dst, <fi>, 0` qualifies.  memfi packs (FrameIndex, offset);
// a non-zero offset addresses inside the slot (e.g. the high half of an
// i32 spill), which the generic peephole/CSE machinery doesn't model.
Register W65816InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                              int &FrameIndex) const {
  const bool IsWholeSlotLoad =
      MI.getOpcode() == W65816::LDAfi && MI.getNumOperands() >= 3 &&
      MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
      MI.getOperand(2).getImm() == 0;
  if (!IsWholeSlotLoad)
    return Register();

  FrameIndex = MI.getOperand(1).getIndex();
  return MI.getOperand(0).getReg();
}

// If MI is a plain spill to a whole stack slot, stores the slot's frame
// index in FrameIndex and returns the stored register; otherwise returns
// the invalid (zero) register and leaves FrameIndex untouched.
//
// STAfi operand layout: op0 = source register, op1 = frame index,
// op2 = byte offset inside the slot.  Only offset 0 qualifies.
Register W65816InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                             int &FrameIndex) const {
  if (MI.getOpcode() != W65816::STAfi || MI.getNumOperands() < 3)
    return Register();

  const MachineOperand &Slot = MI.getOperand(1);
  const MachineOperand &Offset = MI.getOperand(2);
  if (!Slot.isFI() || !Offset.isImm() || Offset.getImm() != 0)
    return Register();

  FrameIndex = Slot.getIndex();
  return MI.getOperand(0).getReg();
}

bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
  // Only LDAfi is gated on this hook.  We declare it
  // isReMaterializable=1 in tablegen so the framework will *consider*
  // re-emitting it instead of spilling, then call back here to confirm.
  // The instruction is safely rematerializable iff it loads from a
  // *fixed* (immutable) frame index — i.e. an arg slot.  Loads from a
  // regular spill slot read a computed value that may not be available
  // at the rematerialization point.
  if (MI.getOpcode() != W65816::LDAfi)
    return TargetInstrInfo::isReMaterializableImpl(MI);

  // Operand 1 is the FrameIndex (operand 0 is the def).
  const MachineOperand &FIOp = MI.getOperand(1);
  if (!FIOp.isFI())
    return false;
  const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo();
  return MFI.isFixedObjectIndex(FIOp.getIndex());
}

// Reports how many bytes MI contributes to the running SP adjustment
// inside a call sequence.
//
//   ADJCALLSTACKDOWN -> 0   (SP is not pre-shifted; each push moves it)
//   PUSH16 / PUSH16X -> +2  (one 16-bit push)
//   ADJCALLSTACKUP   -> -N  where N is the first immediate operand (the
//                           total pushed byte count), cancelling the
//                           accumulated +2s so the adjustment is back at
//                           0 when the call sequence ends; 0 if that
//                           operand is missing or not an immediate.
// Anything else is delegated to TargetInstrInfo::getSPAdjust.
int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case W65816::ADJCALLSTACKDOWN:
    return 0;
  case W65816::ADJCALLSTACKUP: {
    const bool HasByteCount =
        MI.getNumOperands() > 0 && MI.getOperand(0).isImm();
    if (!HasByteCount)
      return 0;
    return -static_cast<int>(MI.getOperand(0).getImm());
  }
  case W65816::PUSH16:
  case W65816::PUSH16X:
    return 2;
  default:
    return TargetInstrInfo::getSPAdjust(MI);
  }
}

// True when Opc is a conditional branch.  Defined in terms of the shared
// invertCondOpcode helper — an opcode is conditional exactly when it has
// an inverse — so the two opcode lists cannot drift apart.
static bool isCondBranch(unsigned Opc) {
  const unsigned Inverse = W65816Helpers::invertCondOpcode(Opc);
  return Inverse != 0;
}

// True when Opc is an unconditional branch with a direct target: BRA,
// BRL or JMP_Abs.  JMP_AbsInd (indirect) and JML_Long (different operand
// kind) are deliberately excluded.
static bool isUncondDirectBranch(unsigned Opc) {
  switch (Opc) {
  case W65816::BRA:
  case W65816::BRL:
  case W65816::JMP_Abs:
    return true;
  default:
    return false;
  }
}

// invertCondOpcode lives in namespace W65816Helpers above.

// Returns the basic block MI branches to, or nullptr when MI has no
// operands or its first operand is not a basic-block reference.  All of
// our direct branches encode the target MBB in operand 0.
MachineBasicBlock *
W65816InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  if (MI.getNumOperands() == 0)
    return nullptr;
  const MachineOperand &Target = MI.getOperand(0);
  return Target.isMBB() ? Target.getMBB() : nullptr;
}

// Describes the terminators of MBB to the generic branch machinery.
// Returns false when the block was understood (TBB/FBB/Cond filled in)
// and true when it must be treated as opaque.
//
// Conditional branches (BEQ/BNE/etc.) are deliberately kept opaque to
// BranchFolder.  Their condition lives in the OPCODE and the flag input
// is an implicit use of P set by a preceding CMP/etc.; BranchFolder does
// not know the CMP must stay adjacent, so re-inserting the Bxx in a
// tail-merged block makes the flag input whatever last clobbered P.
// Caught by the softDouble dadd smoke (1.5 + 2.5 != 4.0) when conditional
// branches were made analyzable.
//
// What IS analyzed:
//   * No terminators at all (pure fall-through): analyzable, no targets —
//     keeps MachineBlockPlacement's fall-through assertion satisfied.
//   * Exactly one unconditional direct branch (BRA / BRL / JMP_Abs):
//     analyzable with TBB set and Cond empty; safe to move because there
//     is no flag dependency.
// Everything else (Bxx anywhere, indirect jumps, several terminators, …)
// stays unanalyzable.  Cond is always left empty and FBB always null;
// AllowModify is not consulted.
bool W65816InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  TBB = nullptr;
  FBB = nullptr;
  Cond.clear();

  const MachineBasicBlock::iterator End = MBB.end();
  // Advances past any run of debug instructions starting at It.
  auto skipDebug = [End](MachineBasicBlock::iterator It) {
    while (It != End && It->isDebugInstr())
      ++It;
    return It;
  };

  const MachineBasicBlock::iterator Term = skipDebug(MBB.getFirstTerminator());
  if (Term == End)
    return false;  // Pure fall-through.

  if (!isUncondDirectBranch(Term->getOpcode()))
    return true;  // Conditional or unknown terminator: stay opaque.

  // Note: TBB stays set even if we bail out below.
  TBB = getBranchDestBlock(*Term);
  if (!TBB)
    return true;

  // Analyzable only if nothing but debug instructions follows the branch.
  return skipDebug(std::next(Term)) != End;
}

// Deletes the run of direct branches (conditional Bxx and BRA / BRL /
// JMP_Abs) at the end of MBB and returns how many were erased.  When
// BytesRemoved is non-null it receives the summed size estimate of the
// erased instructions.
//
// Debug instructions are skipped over and never deleted.  ANY number of
// them may trail or be interleaved with the branches; stopping at the
// second trailing debug instruction would leave real branches behind and
// make code generation depend on the presence of debug info.
unsigned W65816InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                       int *BytesRemoved) const {
  if (BytesRemoved)
    *BytesRemoved = 0;
  unsigned NumRemoved = 0;

  // Walk backwards from the end.  Stop at the first real instruction
  // that is not a direct branch (e.g. JMP_AbsInd or ordinary code).
  MachineBasicBlock::iterator I = MBB.end();
  while (I != MBB.begin()) {
    --I;
    if (I->isDebugInstr())
      continue;
    unsigned Opc = I->getOpcode();
    if (!isCondBranch(Opc) && !isUncondDirectBranch(Opc))
      break;
    if (BytesRemoved)
      *BytesRemoved += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++NumRemoved;
    // The erased iterator is dead; restart the scan from the new end.
    I = MBB.end();
  }
  return NumRemoved;
}

// Appends branch instructions to the end of MBB and returns how many
// were added.
//
//   Cond empty      -> `BRA TBB` (FBB is not consulted); returns 1.
//   Cond = {opcode} -> `<opcode> TBB`, followed by `BRA FBB` when FBB is
//                      non-null; returns 1 or 2.
//
// BRA is used for every unconditional branch: W65816AsmBackend
// auto-relaxes it to BRL when the displacement exceeds an 8-bit signed
// offset.  When BytesAdded is non-null it receives the summed size
// estimate of the inserted instructions.
unsigned W65816InstrInfo::insertBranch(MachineBasicBlock &MBB,
                                       MachineBasicBlock *TBB,
                                       MachineBasicBlock *FBB,
                                       ArrayRef<MachineOperand> Cond,
                                       const DebugLoc &DL,
                                       int *BytesAdded) const {
  assert(TBB && "insertBranch requires a true target");
  assert((Cond.empty() || Cond.size() == 1) &&
         "W65816 branch conditions are single-operand (opcode)");

  if (BytesAdded)
    *BytesAdded = 0;

  // Appends one `Opc Dest` branch and accounts for its size.
  auto emitBranch = [&](unsigned Opc, MachineBasicBlock *Dest) {
    MachineInstr *Br = BuildMI(&MBB, DL, get(Opc)).addMBB(Dest);
    if (BytesAdded)
      *BytesAdded += getInstSizeInBytes(*Br);
  };

  if (Cond.empty()) {
    emitBranch(W65816::BRA, TBB);
    return 1;
  }

  // The condition is carried as the branch opcode itself in Cond[0].
  const unsigned CondOpc = Cond[0].getImm();
  emitBranch(CondOpc, TBB);
  if (!FBB)
    return 1;

  // Two-way branch: fall back to the false target unconditionally.
  emitBranch(W65816::BRA, FBB);
  return 2;
}

// Flips the branch condition held in Cond in place.  Returns false on
// success and true when the condition cannot be reversed: Cond must hold
// exactly one operand whose immediate is a conditional-branch opcode
// known to W65816Helpers::invertCondOpcode.
bool W65816InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond.size() == 1) {
    MachineOperand &CondOp = Cond[0];
    if (unsigned Flipped = W65816Helpers::invertCondOpcode(CondOp.getImm())) {
      CondOp.setImm(Flipped);
      return false;
    }
  }
  return true;
}

// Estimates how many bytes MI occupies once emitted.
//
// Resolution order:
//   1. Meta-instructions (KILL / IMPLICIT_DEF / CFI_INSTRUCTION /
//      DBG_VALUE and friends) emit nothing -> 0.
//   2. Pseudos with a hand-maintained estimate in the switch below.  The
//      byte counts are meant to mirror each pseudo's expansion in
//      W65816AsmPrinter::emitInstruction; a missing or too-small entry
//      under-estimates and risks branch-range errors.
//   3. Real instructions -> the tablegen-defined Size.
//   4. Any other pseudo (Size == 0) -> 4, intended as a
//      pessimistic-but-safe bound for most W65816 instructions.
//
// NOTE(review): earlier commentary assumed PHI / COPY / BUNDLE are
// meta-instructions and that returning 0 for COPY is acceptable as a
// "lower bound".  Whether those opcodes are flagged meta depends on the
// generic opcode definitions — confirm; if they are not, they reach
// step 4.  Also confirm which direction of error the BranchExpand pass
// tolerates: the notes in this function argue both ways.
unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return 0;

  switch (MI.getOpcode()) {
  // Call-frame pseudos expand to PLA loops or a TSC/CLC/ADC/TCS bracket;
  // estimated at ~8 bytes worst case.
  case W65816::ADJCALLSTACKDOWN:
  case W65816::ADJCALLSTACKUP:
    return 8;

  // i8 immediate ops are wrapped in SEP/REP: SEP(2) + op(2) + REP(2).
  case W65816::LDAi8imm:
  case W65816::ANDi8imm:
  case W65816::ORAi8imm:
  case W65816::EORi8imm:
  case W65816::CMPi8imm:
    return 6;
  // The i8 add/subtract forms are budgeted one byte more (presumably the
  // CLC/SEC prefix, as for the 16-bit forms below).
  case W65816::ADCi8imm:
  case W65816::SBCi8imm:
    return 7;

  // i8 absolute load/store: SEP(2) + LDA_Abs/STA_Abs(3) + REP(2).
  case W65816::LDA8abs:
  case W65816::STA8abs:
    return 7;

  // STA8fi: SEP(2) + STA d,S(2) + REP(2) (PEI expansion).
  case W65816::STA8fi:
    return 6;

  // i16 ADC/SBC pseudos prepend CLC/SEC: 1 + 3.
  case W65816::ADCi16imm:
  case W65816::SBCi16imm:
  case W65816::ADCabs:
  case W65816::SBCabs:
    return 4;

  // ADDframe: TSC + CLC + ADC #imm = 1 + 1 + 3.
  case W65816::ADDframe:
    return 5;

  // ALLOCAfi: STA dp + TSC + SEC + SBC dp + TCS + INC A = 2+1+1+2+1+1.
  case W65816::ALLOCAfi:
    return 8;

  // PUSH16 / PUSH16X: a single PHA / PHX.
  case W65816::PUSH16:
  case W65816::PUSH16X:
    return 1;

  // JSLpseudo / JSLpseudo32: jsl is 4 bytes.
  case W65816::JSLpseudo:
  case W65816::JSLpseudo32:
    return 4;

  default:
    break;
  }

  // Real instructions carry their encoded size in the descriptor; a zero
  // there means an unlisted pseudo, which gets the 4-byte fallback.
  const unsigned DescSize = MI.getDesc().getSize();
  return DescSize != 0 ? DescSize : 4;
}