65816-llvm-mos/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp
Scott Duensing 3388f3c5a5 More updates
2026-06-03 20:46:31 -05:00

744 lines
29 KiB
C++

//===-- W65816InstrInfo.cpp - W65816 Instruction Information --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Instruction-info implementation for the W65816 target: physical-register
// copies, stack-slot spill/reload, branch analysis/insertion/removal and
// per-instruction size estimates.
//
//===----------------------------------------------------------------------===//
#include "W65816InstrInfo.h"
#include "W65816.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "W65816GenInstrInfo.inc"
// Intentionally empty out-of-line member definition.
// NOTE(review): presumably the usual LLVM "anchor" that pins the class's
// vtable to this translation unit -- confirm against W65816InstrInfo.h.
void W65816InstrInfo::anchor() {}
// Constructs the instruction info for a subtarget. The generated base class
// receives the subtarget, the RI member and the ADJCALLSTACKDOWN /
// ADJCALLSTACKUP pseudo opcodes; RI itself is default-constructed.
// NOTE(review): RI is named in the base-class initializer before its own
// initializer runs, so the base presumably only binds a reference to it --
// confirm against the generated constructor signature.
W65816InstrInfo::W65816InstrInfo(const W65816Subtarget &STI)
: W65816GenInstrInfo(STI, RI, W65816::ADJCALLSTACKDOWN,
W65816::ADJCALLSTACKUP),
RI() {}
// Shared helpers exposed via W65816InstrInfo.h. See the namespace
// comment there for usage notes.
namespace llvm {
namespace W65816Helpers {
// Returns the direct-page address assigned to an IMGn register, or -1 when
// Reg is not one of IMG0..IMG15. IMG0..IMG7 map to $D0..$DE and IMG8..IMG15
// map to $C0..$CE, in steps of two bytes.
int imgDPAddr(unsigned Reg) {
  struct ImgSlot {
    unsigned Id;
    int DPAddr;
  };
  static const ImgSlot Slots[] = {
      {W65816::IMG0, 0xD0},  {W65816::IMG1, 0xD2},  {W65816::IMG2, 0xD4},
      {W65816::IMG3, 0xD6},  {W65816::IMG4, 0xD8},  {W65816::IMG5, 0xDA},
      {W65816::IMG6, 0xDC},  {W65816::IMG7, 0xDE},  {W65816::IMG8, 0xC0},
      {W65816::IMG9, 0xC2},  {W65816::IMG10, 0xC4}, {W65816::IMG11, 0xC6},
      {W65816::IMG12, 0xC8}, {W65816::IMG13, 0xCA}, {W65816::IMG14, 0xCC},
      {W65816::IMG15, 0xCE},
  };
  for (const ImgSlot &Slot : Slots) {
    if (Slot.Id == Reg)
      return Slot.DPAddr;
  }
  return -1;
}
// Returns the conditional-branch opcode that tests the opposite condition
// of Opc (BEQ<->BNE, BCS<->BCC, BMI<->BPL, BVS<->BVC). Returns 0 when Opc is
// not one of those eight opcodes.
unsigned invertCondOpcode(unsigned Opc) {
  static const unsigned Opposites[][2] = {
      {W65816::BEQ, W65816::BNE},
      {W65816::BCS, W65816::BCC},
      {W65816::BMI, W65816::BPL},
      {W65816::BVS, W65816::BVC},
  };
  for (const auto &Pair : Opposites) {
    if (Opc == Pair[0])
      return Pair[1];
    if (Opc == Pair[1])
      return Pair[0];
  }
  return 0;
}
// Maps a *_StackRel opcode (LDA/STA/ADC/SBC/CMP/AND/ORA/EOR) to the *_DP
// opcode of the same mnemonic. Returns 0 for any other opcode.
unsigned getDpOpcodeForStackRel(unsigned Opc) {
  struct OpcodePair {
    unsigned StackRel;
    unsigned DirectPage;
  };
  static const OpcodePair Table[] = {
      {W65816::LDA_StackRel, W65816::LDA_DP},
      {W65816::STA_StackRel, W65816::STA_DP},
      {W65816::ADC_StackRel, W65816::ADC_DP},
      {W65816::SBC_StackRel, W65816::SBC_DP},
      {W65816::CMP_StackRel, W65816::CMP_DP},
      {W65816::AND_StackRel, W65816::AND_DP},
      {W65816::ORA_StackRel, W65816::ORA_DP},
      {W65816::EOR_StackRel, W65816::EOR_DP},
  };
  for (const OpcodePair &Entry : Table) {
    if (Entry.StackRel == Opc)
      return Entry.DirectPage;
  }
  return 0;
}
// Returns true when Opc is one of the ALU pseudos listed below (frame-index,
// absolute and 16-bit-immediate forms of ADC/SBC/AND/ORA/EOR; the absolute
// form exists only for ADC and SBC in this list).
bool isTiedAcc16Consumer(unsigned Opc) {
  static const unsigned Consumers[] = {
      W65816::ADCfi,     W65816::SBCfi,     W65816::ANDfi,
      W65816::ORAfi,     W65816::EORfi,     W65816::ADCabs,
      W65816::SBCabs,    W65816::ADCi16imm, W65816::SBCi16imm,
      W65816::ANDi16imm, W65816::ORAi16imm, W65816::EORi16imm,
  };
  for (unsigned Candidate : Consumers) {
    if (Candidate == Opc)
      return true;
  }
  return false;
}
bool hasTiedAcc16Src(const MachineInstr &MI) {
if (!isTiedAcc16Consumer(MI.getOpcode())) return false;
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse()) continue;
if (MI.isRegTiedToDefOperand(i)) return true;
}
return false;
}
} // namespace W65816Helpers
} // namespace llvm
// Emits, before iterator I in MBB, the instruction(s) that copy SrcReg into
// DestReg. The body is an ordered chain of special cases; the first match
// wins and returns:
//   * identical registers            -> nothing emitted
//   * A <-> X, A <-> Y               -> TAX / TXA / TAY / TYA
//   * A / X / Y <-> IMGn             -> LDA/STA/LDX/STX/LDY/STY direct page
//   * IMGn -> IMGm                   -> PHA, LDA dp, STA dp, PLA
//   * A <-> SP                       -> TSC / TCS
//   * X <-> Y                        -> two transfers through A (A clobbered)
//   * A <-> DPF0                     -> LDA / STA at direct-page $F0
//   * Wide32 pairs                   -> recursive per-half copies
//   * any virtual register operand   -> generic COPY pseudo
// Anything else prints a diagnostic and hits llvm_unreachable.
// KillSrc is only consulted by the generic COPY fallback; it and the two
// Renamable flags are otherwise just forwarded to the recursive calls.
void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DestReg,
Register SrcReg, bool KillSrc,
bool RenamableDest, bool RenamableSrc) const {
// Self-copy: nothing to emit.
if (DestReg == SrcReg)
return;
// A → X / X → A via TAX / TXA. Used by i32 return ABI (lo in A, hi
// in X) and by callers reading split-i32 results. Both instructions
// are 16-bit when M=0/X=0; that matches our default mode.
if (DestReg == W65816::X && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::TAX));
return;
}
if (DestReg == W65816::A && SrcReg == W65816::X) {
BuildMI(MBB, I, DL, get(W65816::TXA));
return;
}
// A → Y / Y → A via TAY / TYA. Same M/X width caveat.
if (DestReg == W65816::Y && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::TAY));
return;
}
if (DestReg == W65816::A && SrcReg == W65816::Y) {
BuildMI(MBB, I, DL, get(W65816::TYA));
return;
}
// A → IMGn / IMGn → A: STA dp / LDA dp. IMGn is DP-backed at fixed
// addresses $C0..$DE (IMG8..IMG15 at $C0..$CE, IMG0..IMG7 at $D0..$DE) —
// see W65816Helpers::imgDPAddr above. srcImg / dstImg are -1 when the
// corresponding register is not an IMGn register.
int srcImg = W65816Helpers::imgDPAddr(SrcReg);
int dstImg = W65816Helpers::imgDPAddr(DestReg);
if (DestReg == W65816::A && srcImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
return;
}
if (dstImg >= 0 && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
return;
}
// IMGn → IMGm: route through A, but PHA-bracket so A is preserved.
// Without the bracket, regalloc could insert this COPY between a
// def of A and the use of A (e.g. between `$a = COPY $img10` and
// `STAfi $a, slot`, when both vregs are alive simultaneously and
// the regalloc decides to shuffle img physregs in between). The
// unbracketed lda/sta clobbers A and the subsequent STAfi spills
// garbage. Observed under ptr32 + full IMG defs in the C++ try/
// catch path: `*p = 42` after `__cxa_allocate_exception` stored
// hi-half-of-ptr at lo-half-slot, breaking the indirect-long
// address setup so 42 landed at the wrong place.
//
// PHA bracket cost: +PHA (3 cyc, 1 byte) + PLA (4 cyc, 1 byte) = +7
// cyc, +2 bytes per IMG-IMG copy. These are rare (regalloc usually
// can avoid them by picking the same physreg for COPY's src and
// dst), so the cost is small.
if (srcImg >= 0 && dstImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::PHA));
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
BuildMI(MBB, I, DL, get(W65816::PLA));
return;
}
// SP -> A via TSC. Used by alloca / setjmp asm machinery.
if (DestReg == W65816::A && SrcReg == W65816::SP) {
BuildMI(MBB, I, DL, get(W65816::TSC));
return;
}
// A -> SP via TCS.
if (DestReg == W65816::SP && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::TCS));
return;
}
// X <-> Y via A: 65816 has no direct X<->Y transfer; bridge through
// A. Caller is responsible for ensuring A is dead at this program
// point (regalloc arranges this). Used by greedy when an i16 vreg
// forced into one Idx16 reg gets coalesced with a use in the other.
// Unlike the IMGn → IMGm case above, this path is NOT PHA-bracketed.
if (DestReg == W65816::Y && SrcReg == W65816::X) {
BuildMI(MBB, I, DL, get(W65816::TXA));
BuildMI(MBB, I, DL, get(W65816::TAY));
return;
}
if (DestReg == W65816::X && SrcReg == W65816::Y) {
BuildMI(MBB, I, DL, get(W65816::TYA));
BuildMI(MBB, I, DL, get(W65816::TAX));
return;
}
// X → IMGn / IMGn → X: STX dp / LDX dp. Used by the i64-first-arg
// entry COPY (LowerFormalArguments routes arg0_ml through Img16 to
// dodge the TXA-bridge-clobbers-A spill bug for udivmod-shaped
// signatures).
if (dstImg >= 0 && SrcReg == W65816::X) {
BuildMI(MBB, I, DL, get(W65816::STX_DP)).addImm(dstImg);
return;
}
if (DestReg == W65816::X && srcImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDX_DP)).addImm(srcImg);
return;
}
// Y -> IMGn / IMGn -> Y: STY dp / LDY dp. Symmetric with the X
// case above. Used by the i32-first-arg ABI's hi half (in X) and
// by Wide32 pair copies that have one half in Y after the per-half
// routing — see the wide32Halves dispatch below.
if (dstImg >= 0 && SrcReg == W65816::Y) {
BuildMI(MBB, I, DL, get(W65816::STY_DP)).addImm(dstImg);
return;
}
if (DestReg == W65816::Y && srcImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDY_DP)).addImm(srcImg);
return;
}
// DPF0 → A: emit `LDA $F0`. DPF0 is the pseudo-physreg carrier
// for an i64-returning call's high 16 bits; LowerCall builds a
// CopyFromReg(DPF0) glued to the call so the SDAG combiner /
// scheduler can't merge or reorder reads across calls.
if (DestReg == W65816::A && SrcReg == W65816::DPF0) {
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(0xF0);
return;
}
// A → DPF0: emit `STA $F0`. Used by LowerReturn for the i64 high
// half; using a true direct-page store is critical because plain
// ISD::STORE with addr=0xF0 was lowering to `(d,s),y` indirect via
// DBR — which silently broke under DBR != 0 (e.g. after a bank
// switch). STA dp uses D + dp directly, ignoring DBR.
if (DestReg == W65816::DPF0 && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(0xF0);
return;
}
// Wide32 (AX32 or IMG-pair) <-> Wide32 copy: split on sub_lo / sub_hi
// and recurse. Use a hand-written dispatch instead of getSubReg
// because the MCRegisterInfo::getSubReg path crashes when called
// from TargetInstrInfo::lowerCopy on regs that are not pair regs
// (the table lookup walks past the end of the diff list).
// wide32Halves returns {lo, hi} for a pair register and a pair of
// invalid (default-constructed) Registers for anything else.
auto wide32Halves = [](Register R)
-> std::pair<Register, Register> {
switch (R) {
case W65816::AX32: return {W65816::A, W65816::X};
case W65816::IMG01: return {W65816::IMG0, W65816::IMG1};
case W65816::IMG23: return {W65816::IMG2, W65816::IMG3};
case W65816::IMG45: return {W65816::IMG4, W65816::IMG5};
case W65816::IMG67: return {W65816::IMG6, W65816::IMG7};
case W65816::IMG89: return {W65816::IMG8, W65816::IMG9};
case W65816::IMG1011: return {W65816::IMG10, W65816::IMG11};
case W65816::IMG1213: return {W65816::IMG12, W65816::IMG13};
case W65816::IMG1415: return {W65816::IMG14, W65816::IMG15};
default: return {Register(), Register()};
}
};
auto [srcLo, srcHi] = wide32Halves(SrcReg);
auto [dstLo, dstHi] = wide32Halves(DestReg);
if (srcLo && srcHi && dstLo && dstHi) {
// Wide32 -> Wide32. Lo-first order is correct in every direction:
// AX32 -> IMG_pair : STA dstLo (A live), then STX dstHi
// IMG_pair -> AX32 : LDA srcLo, then LDX srcHi (independent halves)
// IMG_pair -> IMG_pair : LDA/STA chain twice (A is only per-half scratch)
copyPhysReg(MBB, I, DL, dstLo, srcLo, KillSrc,
RenamableDest, RenamableSrc);
copyPhysReg(MBB, I, DL, dstHi, srcHi, KillSrc,
RenamableDest, RenamableSrc);
return;
}
// Wide32 -> i16: take sub_lo of source. Arises post-RA when an
// EXTRACT_SUBREG was lowered as a parent-reg COPY (the SubRegIndex
// is dropped by lowerCopy).
if (srcLo && srcHi && !dstLo) {
copyPhysReg(MBB, I, DL, DestReg, srcLo, KillSrc,
RenamableDest, RenamableSrc);
return;
}
// i16 -> Wide32: write sub_lo only (sub_hi left as caller had it,
// matching INSERT_SUBREG semantics). Arises post-RA when REG_SEQUENCE
// is expanded into per-half COPY pseudos, then a parent-reg COPY of
// a sub-reg-only def appears.
if (!srcLo && dstLo && dstHi) {
copyPhysReg(MBB, I, DL, dstLo, SrcReg, KillSrc,
RenamableDest, RenamableSrc);
return;
}
// Virtual-register caller: this happens when the inline spiller
// (called from Basic regalloc) rewrites uses of a spilled vreg and
// asks us to copy through A before its physreg has been assigned.
// Emit a generic COPY pseudo and let the regalloc rewriter / a later
// ExpandPostRA pass resolve it once both regs are physical.
if (SrcReg.isVirtual() || DestReg.isVirtual()) {
BuildMI(MBB, I, DL, get(TargetOpcode::COPY), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
// No rule matched: report both registers (and their IMG lookups) on
// stderr, then abort via llvm_unreachable.
const TargetRegisterInfo *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
llvm::errs() << "W65816 copyPhysReg unhandled: src="
<< (SrcReg.isPhysical() ? TRI->getRegAsmName(SrcReg) : "<vreg>")
<< " dst="
<< (DestReg.isPhysical() ? TRI->getRegAsmName(DestReg) : "<vreg>")
<< " srcImg=" << srcImg << " dstImg=" << dstImg << "\n";
llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
}
// Spills SrcReg to stack slot FrameIdx, inserting before MI.
//
// Every store is an `STAfi $a, FrameIdx, offset` pseudo, so anything that is
// not already in A is first moved there (TXA / TYA for the plain X/Y case,
// copyPhysReg for the Wide32 paths).
//   * Wide32 / Acc32 / AnyWide32 class with a known pair register: the lo
//     half is stored at offset 0 and the hi half at offset 2.
//   * Same classes but SrcReg is not a known pair: only a single 16-bit
//     store at offset 0 is emitted (offset 2 is left unwritten); the reload
//     path mirrors this.
//   * Any other class: one store at offset 0 carrying the isKill flag.
// NOTE(review): the bridged paths overwrite A; for AX32, A holds the hi half
// once the spill sequence finishes.
// VReg and Flags are not used here.
void W65816InstrInfo::storeRegToStackSlot(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
    bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg,
    MachineInstr::MIFlag Flags) const {
  DebugLoc DL;
  if (MI != MBB.end())
    DL = MI->getDebugLoc();

  // Emits `STAfi $a, FrameIdx, Offset` with no kill flag on A.
  auto StoreAccAt = [&](int Offset) {
    BuildMI(MBB, MI, DL, get(W65816::STAfi))
        .addReg(W65816::A)
        .addFrameIndex(FrameIdx)
        .addImm(Offset);
  };
  // Moves Reg into A unless it already is A.
  auto BridgeIntoAcc = [&](Register Reg) {
    if (Reg != W65816::A)
      copyPhysReg(MBB, MI, DL, W65816::A, Reg, false);
  };

  const bool IsWide32Class = RC == &W65816::Wide32RegClass ||
                             RC == &W65816::Acc32RegClass ||
                             RC == &W65816::AnyWide32RegClass;
  if (IsWide32Class) {
    // Resolve the pair into its halves; both stay invalid for a non-pair.
    Register LoHalf, HiHalf;
    switch (SrcReg) {
    case W65816::AX32:
      LoHalf = W65816::A;
      HiHalf = W65816::X;
      break;
    case W65816::IMG01:
      LoHalf = W65816::IMG0;
      HiHalf = W65816::IMG1;
      break;
    case W65816::IMG23:
      LoHalf = W65816::IMG2;
      HiHalf = W65816::IMG3;
      break;
    case W65816::IMG45:
      LoHalf = W65816::IMG4;
      HiHalf = W65816::IMG5;
      break;
    case W65816::IMG67:
      LoHalf = W65816::IMG6;
      HiHalf = W65816::IMG7;
      break;
    case W65816::IMG89:
      LoHalf = W65816::IMG8;
      HiHalf = W65816::IMG9;
      break;
    case W65816::IMG1011:
      LoHalf = W65816::IMG10;
      HiHalf = W65816::IMG11;
      break;
    case W65816::IMG1213:
      LoHalf = W65816::IMG12;
      HiHalf = W65816::IMG13;
      break;
    case W65816::IMG1415:
      LoHalf = W65816::IMG14;
      HiHalf = W65816::IMG15;
      break;
    default:
      break;
    }

    if (!LoHalf) {
      // Not a pair register: a lone 16-bit value was routed through the
      // Wide32 path. Store just that value at offset 0.
      BridgeIntoAcc(SrcReg);
      StoreAccAt(0);
      return;
    }

    // Lo half at offset 0, then hi half at offset 2, each via A.
    BridgeIntoAcc(LoHalf);
    StoreAccAt(0);
    BridgeIntoAcc(HiHalf);
    StoreAccAt(2);
    return;
  }

  // 16-bit spill. X and Y are transferred into A first; the store then
  // names A as its source.
  Register StoreSrc = SrcReg;
  if (SrcReg == W65816::X) {
    BuildMI(MBB, MI, DL, get(W65816::TXA));
    StoreSrc = W65816::A;
  } else if (SrcReg == W65816::Y) {
    BuildMI(MBB, MI, DL, get(W65816::TYA));
    StoreSrc = W65816::A;
  }
  BuildMI(MBB, MI, DL, get(W65816::STAfi))
      .addReg(StoreSrc, getKillRegState(isKill))
      .addFrameIndex(FrameIdx)
      .addImm(0);
}
// Reloads DestReg from stack slot FrameIdx, inserting before MI.
//
// LDAfi can only load into A, so every other destination is loaded into A
// and then moved (TAX / TAY, or copyPhysReg for the Wide32 paths). Emitting
// `$x = LDAfi` directly would print as a plain `LDA d,S` and silently leave
// X stale.
//   * Wide32 / Acc32 / AnyWide32 class with a known pair register: the hi
//     half is reloaded from offset 2 FIRST, then the lo half from offset 0.
//     The order matters for AX32, whose lo half lives in A: loading lo first
//     would let the hi load overwrite it, leaving A == X == hi.
//   * Same classes but DestReg is not a known pair: mirror of the unpaired
//     spill -- only offset 0 is read back.
//   * X / Y: LDAfi into A, then TAX / TAY (A is clobbered).
//   * Anything else (A itself, or a virtual register): LDAfi straight into
//     DestReg.
// VReg, SubReg and Flags are not used here.
void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           Register DestReg, int FrameIdx,
                                           const TargetRegisterClass *RC,
                                           Register VReg, unsigned SubReg,
                                           MachineInstr::MIFlag Flags) const {
  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();

  // Wide32 reload: two 16-bit loads at offsets 0 and 2 of the 4-byte slot.
  if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
      RC == &W65816::AnyWide32RegClass) {
    Register Lo, Hi;
    switch (DestReg) {
    case W65816::AX32:    Lo = W65816::A;     Hi = W65816::X;     break;
    case W65816::IMG01:   Lo = W65816::IMG0;  Hi = W65816::IMG1;  break;
    case W65816::IMG23:   Lo = W65816::IMG2;  Hi = W65816::IMG3;  break;
    case W65816::IMG45:   Lo = W65816::IMG4;  Hi = W65816::IMG5;  break;
    case W65816::IMG67:   Lo = W65816::IMG6;  Hi = W65816::IMG7;  break;
    case W65816::IMG89:   Lo = W65816::IMG8;  Hi = W65816::IMG9;  break;
    case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
    case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
    case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
    default:
      // Not a pair register: load only the lo half from offset 0 and move
      // it into the destination.
      BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
          .addFrameIndex(FrameIdx)
          .addImm(0);
      if (DestReg != W65816::A)
        copyPhysReg(MBB, MI, DL, DestReg, W65816::A, false);
      return;
    }

    // Hi half first: LDA from offset 2, then move it out of A. Hi is never
    // A for any pair above, so A is free again afterwards.
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
        .addFrameIndex(FrameIdx)
        .addImm(2);
    if (Hi != W65816::A)
      copyPhysReg(MBB, MI, DL, Hi, W65816::A, false);

    // Lo half last: LDA from offset 0. For AX32 the value simply stays in
    // A; for IMG pairs it is stored to the lo IMG register.
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
        .addFrameIndex(FrameIdx)
        .addImm(0);
    if (Lo != W65816::A)
      copyPhysReg(MBB, MI, DL, Lo, W65816::A, false);
    return;
  }

  if (DestReg == W65816::X || DestReg == W65816::Y) {
    // Load via A, then transfer. A is implicitly clobbered.
    BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
        .addFrameIndex(FrameIdx)
        .addImm(0);
    unsigned XferOp = (DestReg == W65816::X) ? W65816::TAX : W65816::TAY;
    BuildMI(MBB, MI, DL, get(XferOp));
    return;
  }

  // A itself, or any other register handed to us by class (e.g. a virtual
  // register): the LDAfi defines DestReg directly.
  BuildMI(MBB, MI, DL, get(W65816::LDAfi), DestReg)
      .addFrameIndex(FrameIdx)
      .addImm(0);
}
// Recognises a whole-slot stack load. Returns the defined register and sets
// FrameIndex when MI is an LDAfi whose operand 1 is a frame index and whose
// operand 2 is the immediate 0; returns 0 (and leaves FrameIndex untouched)
// otherwise. A non-zero offset addresses a location inside the slot (e.g.
// the high half of an i32 spill) and is deliberately not reported.
Register W65816InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                              int &FrameIndex) const {
  const bool IsWholeSlotLoad =
      MI.getOpcode() == W65816::LDAfi && MI.getNumOperands() >= 3 &&
      MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
      MI.getOperand(2).getImm() == 0;
  if (!IsWholeSlotLoad)
    return 0;

  FrameIndex = MI.getOperand(1).getIndex();
  return MI.getOperand(0).getReg();
}
// Recognises a whole-slot stack store. STAfi's operands are: 0 = source
// register, 1 = frame index, 2 = offset. Returns the source register and
// sets FrameIndex when MI is an STAfi with a frame-index operand and an
// offset of exactly 0; returns 0 (and leaves FrameIndex untouched)
// otherwise.
Register W65816InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                             int &FrameIndex) const {
  const bool IsWholeSlotStore =
      MI.getOpcode() == W65816::STAfi && MI.getNumOperands() >= 3 &&
      MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
      MI.getOperand(2).getImm() == 0;
  if (!IsWholeSlotStore)
    return 0;

  FrameIndex = MI.getOperand(1).getIndex();
  return MI.getOperand(0).getReg();
}
bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
// Only LDAfi is gated on this hook. We declare it
// isReMaterializable=1 in tablegen so the framework will *consider*
// re-emitting it instead of spilling, then call back here to confirm.
// The instruction is safely rematerializable iff it loads from a
// *fixed* (immutable) frame index — i.e. an arg slot. Loads from a
// regular spill slot read a computed value that may not be available
// at the rematerialization point.
if (MI.getOpcode() != W65816::LDAfi)
return TargetInstrInfo::isReMaterializableImpl(MI);
// Operand 1 is the FrameIndex (operand 0 is the def).
const MachineOperand &FIOp = MI.getOperand(1);
if (!FIOp.isFI())
return false;
const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo();
return MFI.isFixedObjectIndex(FIOp.getIndex());
}
int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
// ADJCALLSTACKDOWN returns 0 (we don't pre-shift SP — PUSH16 does
// it incrementally). ADJCALLSTACKUP returns -N where N is the
// first immediate (= total pushed bytes); this counterbalances
// the +2 contributions accumulated from each PUSH16 so SPAdj
// returns to 0 at the end of the call sequence.
if (Opc == W65816::ADJCALLSTACKDOWN)
return 0;
if (Opc == W65816::ADJCALLSTACKUP) {
// The immediate is the byte count.
if (MI.getNumOperands() > 0 && MI.getOperand(0).isImm())
return -static_cast<int>(MI.getOperand(0).getImm());
return 0;
}
if (Opc == W65816::PUSH16 || Opc == W65816::PUSH16X)
return 2;
return TargetInstrInfo::getSPAdjust(MI);
}
// True when Opc is a conditional branch. An opcode counts as one exactly
// when W65816Helpers::invertCondOpcode knows an inverse for it, so the two
// lists cannot drift apart.
static bool isCondBranch(unsigned Opc) {
  const unsigned Inverse = W65816Helpers::invertCondOpcode(Opc);
  return Inverse != 0;
}
// True for the unconditional branches with a direct target: BRA, BRL and
// JMP_Abs. JMP_AbsInd (indirect) and JML_Long (different operand kind) are
// intentionally left out.
static bool isUncondDirectBranch(unsigned Opc) {
  switch (Opc) {
  case W65816::BRA:
  case W65816::BRL:
  case W65816::JMP_Abs:
    return true;
  default:
    return false;
  }
}
// invertCondOpcode lives in namespace W65816Helpers above.
// Returns the basic block named by operand 0 of MI, or nullptr when MI has
// no operands or operand 0 is not a basic-block operand. Direct branches on
// this target carry their destination in operand 0.
MachineBasicBlock *
W65816InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  if (MI.getNumOperands() >= 1) {
    const MachineOperand &Target = MI.getOperand(0);
    if (Target.isMBB())
      return Target.getMBB();
  }
  return nullptr;
}
// Describes the terminators of MBB. Returns false ("analyzable") only for:
//   * a block with no terminators (pure fall-through): TBB/FBB stay null,
//     Cond stays empty;
//   * a block whose sole terminator is BRA / BRL / JMP_Abs with a basic-block
//     operand: TBB is that block, Cond stays empty.
// Everything else returns true ("cannot analyze"): a conditional branch in
// any position, an indirect jump, an unknown terminator, or anything
// following the unconditional branch.
//
// Conditional branches are kept opaque on purpose. Their condition lives in
// the opcode and their flag input is an implicit use of P produced by an
// earlier CMP-like instruction; BranchFolder does not know that producer
// must stay adjacent, so moving a Bxx into a tail-merged block would make it
// test whatever last wrote P. (Seen as a wrong softDouble dadd result when
// conditional branches were made analyzable.)
//
// Debug instructions are skipped when looking for terminators. AllowModify
// is not used.
bool W65816InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  TBB = nullptr;
  FBB = nullptr;
  Cond.clear();

  // Advances past any run of debug instructions starting at It.
  auto SkipDebug = [&MBB](MachineBasicBlock::iterator It) {
    while (It != MBB.end() && It->isDebugInstr())
      ++It;
    return It;
  };

  MachineBasicBlock::iterator Term = SkipDebug(MBB.getFirstTerminator());
  if (Term == MBB.end())
    return false; // No terminators at all: fall-through block.

  if (!isUncondDirectBranch(Term->getOpcode()))
    return true; // Conditional or unrecognised terminator.

  TBB = getBranchDestBlock(*Term);
  if (!TBB)
    return true; // Branch without a basic-block operand.

  // Analyzable only when nothing but debug instructions follows the branch.
  return SkipDebug(std::next(Term)) != MBB.end();
}
// Erases the direct branches (conditional Bxx and BRA / BRL / JMP_Abs) that
// end MBB and returns how many were removed. When BytesRemoved is non-null
// it receives the summed getInstSizeInBytes of the erased instructions.
//
// The scan runs backwards from the end of the block. Debug instructions are
// stepped over -- any number of them, and never erased -- and the scan stops
// at the first instruction that is neither a debug instruction nor one of
// the direct branches above (e.g. JMP_AbsInd).
unsigned W65816InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                       int *BytesRemoved) const {
  if (BytesRemoved)
    *BytesRemoved = 0;

  unsigned NumRemoved = 0;
  MachineBasicBlock::iterator I = MBB.end();
  while (I != MBB.begin()) {
    --I;
    // Step over every trailing debug instruction, not just the last one;
    // otherwise a branch followed by two DBG_* entries would be left behind.
    if (I->isDebugInstr())
      continue;

    const unsigned Opc = I->getOpcode();
    if (!isCondBranch(Opc) && !isUncondDirectBranch(Opc))
      break;

    if (BytesRemoved)
      *BytesRemoved += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++NumRemoved;
    // The erased iterator is dead; restart the backward scan from the end.
    I = MBB.end();
  }
  return NumRemoved;
}
// Appends branch instructions to the end of MBB and returns how many were
// added. When BytesAdded is non-null it receives their summed
// getInstSizeInBytes.
//   * Cond empty: a single BRA to TBB (FBB is ignored). The original note
//     says the asm backend relaxes BRA to BRL when the displacement does not
//     fit an 8-bit signed offset.
//   * Cond has one operand: its immediate is the conditional branch opcode;
//     that branch targets TBB, followed by a BRA to FBB when FBB is set.
unsigned W65816InstrInfo::insertBranch(MachineBasicBlock &MBB,
                                       MachineBasicBlock *TBB,
                                       MachineBasicBlock *FBB,
                                       ArrayRef<MachineOperand> Cond,
                                       const DebugLoc &DL,
                                       int *BytesAdded) const {
  assert(TBB && "insertBranch requires a true target");
  assert((Cond.empty() || Cond.size() == 1) &&
         "W65816 branch conditions are single-operand (opcode)");

  if (BytesAdded)
    *BytesAdded = 0;

  // Appends one branch with the given opcode and destination, accounting
  // for its size when the caller asked for it.
  auto AppendBranch = [&](unsigned Opcode, MachineBasicBlock *Dest) {
    auto Branch = BuildMI(&MBB, DL, get(Opcode)).addMBB(Dest);
    if (BytesAdded)
      *BytesAdded += getInstSizeInBytes(*Branch);
  };

  if (Cond.empty()) {
    AppendBranch(W65816::BRA, TBB);
    return 1;
  }

  const unsigned CondOpcode = Cond[0].getImm();
  AppendBranch(CondOpcode, TBB);
  if (!FBB)
    return 1;

  // Two-way branch: fall back to an unconditional jump for the false edge.
  AppendBranch(W65816::BRA, FBB);
  return 2;
}
// Flips a branch condition in place. Cond must hold exactly one operand
// whose immediate is a conditional branch opcode; it is replaced with the
// opcode for the opposite condition. Returns false on success and true when
// the condition cannot be reversed (wrong operand count or an opcode with
// no known inverse), in which case Cond is left unchanged.
bool W65816InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond.size() == 1) {
    MachineOperand &CondOp = Cond[0];
    if (unsigned Opposite = W65816Helpers::invertCondOpcode(CondOp.getImm())) {
      CondOp.setImm(Opposite);
      return false;
    }
  }
  return true;
}
// Estimates how many bytes MI occupies once emitted.
//   1. Anything MachineInstr flags as a meta-instruction counts as 0.
//   2. Pseudos with a known multi-instruction expansion use the hand-written
//      estimates in the switch below (per the original notes, these mirror
//      the expansions in W65816AsmPrinter::emitInstruction; a missing entry
//      risks a branch-range error).
//   3. Otherwise the tablegen Size from the instruction descriptor is used
//      when it is non-zero.
//   4. Otherwise 4 bytes is assumed for a pseudo that was not enumerated.
unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return 0;

  switch (MI.getOpcode()) {
  // Call-frame pseudos expand to PLA loops or a TSC/CLC/ADC/TCS bracket;
  // ~8 bytes worst case.
  case W65816::ADJCALLSTACKDOWN:
  case W65816::ADJCALLSTACKUP:
    return 8;

  // i8 immediate ADC/SBC: SEP(2) + op(2) + REP(2) plus one extra byte
  // (presumably the CLC/SEC prefix -- confirm against the AsmPrinter).
  case W65816::ADCi8imm:
  case W65816::SBCi8imm:
    return 7;

  // Remaining i8 immediate ops: SEP(2) + op(2) + REP(2).
  case W65816::LDAi8imm:
  case W65816::ANDi8imm:
  case W65816::ORAi8imm:
  case W65816::EORi8imm:
  case W65816::CMPi8imm:
    return 6;

  // i8 absolute load/store: SEP(2) + LDA_Abs/STA_Abs(3) + REP(2).
  case W65816::LDA8abs:
  case W65816::STA8abs:
    return 7;

  // STA8fi: SEP(2) + STA d,S(2) + REP(2).
  case W65816::STA8fi:
    return 6;

  // i16 ADC/SBC pseudos prepend CLC/SEC: 1 + 3.
  case W65816::ADCi16imm:
  case W65816::SBCi16imm:
  case W65816::ADCabs:
  case W65816::SBCabs:
    return 4;

  // ADDframe: TSC(1) + CLC(1) + ADC #imm(3).
  case W65816::ADDframe:
    return 5;

  // ALLOCAfi: STA dp + TSC + SEC + SBC dp + TCS + INC A = 2+1+1+2+1+1.
  case W65816::ALLOCAfi:
    return 8;

  // PUSH16 / PUSH16X: PHA / PHX, one byte.
  case W65816::PUSH16:
  case W65816::PUSH16X:
    return 1;

  // JSL is four bytes.
  case W65816::JSLpseudo:
  case W65816::JSLpseudo32:
    return 4;

  default:
    break;
  }

  // Real instruction: trust the tablegen Size. A zero Size means a pseudo
  // that is not listed above, for which 4 bytes is assumed.
  const unsigned TableSize = MI.getDesc().getSize();
  return TableSize != 0 ? TableSize : 4;
}