//===-- W65816ISelLowering.cpp - W65816 DAG Lowering Implementation -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Minimum DAG lowering sufficient for a no-argument function returning an
// i16 constant. Argument passing and non-trivial calls still unimplemented.
//
//===----------------------------------------------------------------------===//
|
||
#include "W65816ISelLowering.h"
|
||
#include "W65816InstrInfo.h"
|
||
#include "W65816MachineFunctionInfo.h"
|
||
#include "W65816SelectionDAGInfo.h"
|
||
#include "W65816Subtarget.h"
|
||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||
#include "llvm/CodeGen/MachineFunction.h"
|
||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||
#include "llvm/CodeGen/SelectionDAG.h"
|
||
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
||
#include "llvm/Support/KnownBits.h"
|
||
#include "llvm/IR/Function.h"
|
||
#include "llvm/Support/CommandLine.h"
|
||
#include "llvm/Support/ErrorHandling.h"
|
||
|
||
using namespace llvm;
|
||
|
||
#define DEBUG_TYPE "w65816-lower"
|
||
|
||
// Loader-compat workaround: when set, LDAptr/STAptr/STBptr inserters
|
||
// load the bank byte from DP $BE (initialized by crt0 to PHK / current
|
||
// PBR) instead of forcing it to 0 via STZ $E2. This makes pointer
|
||
// derefs land in the user's bank — matching where DBR-relative
|
||
// absolute stores go — so library functions like gmtime that store
|
||
// into static buffers via DBR-relative paths are visible to caller-
|
||
// side pointer-deref reads. Costs 2 extra bytes / 4 cycles per ptr-
|
||
// deref (LDA dp + STA dp vs STZ dp). Default off to keep
|
||
// size-sensitive builds (toolbox) under the $C000 IO-window ceiling.
|
||
static cl::opt<bool> LoaderBankDeref(
|
||
"w65816-loader-bank-deref",
|
||
cl::desc("LDAptr/STAptr inserters read bank from DP $BE (set by "
|
||
"crt0 to PHK) instead of STZ $E2. Required for GS/OS "
|
||
"Loader compatibility; default off for size-sensitive "
|
||
"builds."),
|
||
cl::init(false), cl::Hidden);
|
||
|
||
// Layer 2 ptr32 opt: when set, ptr32 derefs assume the pointer's bank
|
||
// byte matches DBR. Uses `lda (d,s),Y` (opcode 0xB3, stack-relative
|
||
// indirect indexed-Y) instead of staging at $E0/$E2 and using
|
||
// `lda [dp],Y` (24-bit indirect-long). Saves ~4 instructions per
|
||
// deref. Correct only for code that touches memory inside DBR's bank
|
||
// — malloc'd Lua state + globals + BSS qualify; cross-bank pointers
|
||
// (rare) do not. Caller's responsibility. Tested by hand on lapi.c.
|
||
//
|
||
// NOTE: not static -- W65816Layer2Gate.cpp reads this to stamp the
|
||
// "w65816-layer2" function attribute on every function compiled with
|
||
// Layer 2 on, so the LTO-time gate can detect mismatched TUs. Phase
|
||
// 1.12 of GAP_CLOSURE_PLAN.md.
|
||
cl::opt<bool> DbrSafePtrs(
|
||
"w65816-dbr-safe-ptrs",
|
||
cl::desc("ptr32 derefs use 16-bit stack-rel-indirect-Y, assuming "
|
||
"the pointer's bank byte matches DBR. Significantly "
|
||
"shrinks struct-field-heavy code (Lua's lapi.c: ~3.4× → "
|
||
"much smaller) at the cost of safety for cross-bank "
|
||
"pointers (which become a miscompile)."),
|
||
cl::init(false), cl::Hidden);
|
||
|
||
/// Configure SelectionDAG legalization for the W65816 target.
///
/// Registers the scalar register classes, then declares -- per opcode and
/// value type -- whether each ISD node is Legal, Custom-lowered (handled in
/// LowerOperation), Expand-ed by the legalizer, Promote-d to a wider type,
/// or turned into a runtime LibCall. Later calls for the same (opcode, type)
/// pair override earlier ones, so the ORDER of the statements below matters.
///
/// \param TM  Target machine, forwarded to the TargetLowering base.
/// \param STI Subtarget; supplies the register info used to compute
///            register properties.
W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
                                           const W65816Subtarget &STI)
    : TargetLowering(TM, STI) {
  // Register classes for the scalar modes. The register allocator sees
  // A, X and Y as both 8-bit and 16-bit; a later REP/SEP pass is responsible
  // for ensuring the dynamic mode matches the selected class. i32 values
  // live in the Wide32 class (sub_lo / sub_hi halves).
  addRegisterClass(MVT::i8, &W65816::Acc8RegClass);
  addRegisterClass(MVT::i16, &W65816::Acc16RegClass);
  addRegisterClass(MVT::i32, &W65816::Wide32RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  setStackPointerRegisterToSaveRestore(W65816::SP);
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // GlobalAddress and ExternalSymbol: lower to W65816ISD::Wrapper so a
  // tablegen pattern can fold them into instruction operands.
  setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  // FrameIndex i32 has its own DAG-to-DAG path in W65816ISelDAGToDAG.cpp.

  // BR_CC is custom-lowered to a CMP + W65816ISD::BR_CC chain so we can
  // emit the right BEQ/BNE/BCS/BCC mnemonic per condition.
  setOperationAction(ISD::BR_CC, MVT::i16, Custom);
  setOperationAction(ISD::BR_CC, MVT::i8, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  // BRIND (computed-goto `goto *p`, indirectbr IR) has no direct
  // 65816 instruction — JMP (abs) / JMP [abs] read the target pointer
  // from MEMORY, not a register. Custom-lower to: store the pointer's
  // 16-bit low half (offset within the program's PBR-pinned code bank)
  // to $00B8 (the __indirTarget DP slot already reserved for indirect
  // calls — see libgcc.s), then emit a `JMP ($00B8)` via the BRIND
  // pseudo. Single-bank assumption on the target's code: same as
  // every other JMP/BRA in our codegen.
  //
  // The ptr is i32 under p:32:16 (current default) — extract sub_lo.
  // Under p:16 (legacy ptr16), it's already i16.
  setOperationAction(ISD::BRIND, MVT::Other, Custom);

  // SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC
  // pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter
  // expands into a Bxx + diamond CFG + PHI. SETCC funnels through the
  // same path with TVal=1 / FVal=0. SELECT (no condition operand) is
  // expanded to SELECT_CC by the legalizer using SETNE against zero.
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Expand);
  setOperationAction(ISD::SELECT, MVT::i8, Expand);
  // 65816 has no inline sign-extend instruction; i8 -> i16 is synthesized
  // in LowerSignExtend.
  // NOTE(review): comments in this constructor have described that
  // expansion both as "a bit-7 test and SELECT_CC" and as a branchless
  // AND/EOR/SEC/SBC sequence — confirm against LowerSignExtend.
  setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);

  // BSWAP: no native byte-swap instruction (XBA swaps the two halves
  // of the 16-bit accumulator only when in 8-bit M mode, hard to
  // exploit cleanly). Lower to shifts + ORs via the generic Expand
  // path — SDAG turns `bswap(i32)` into four byte extracts ORed back
  // together, which our existing patterns handle. Required for
  // portable C that constructs a big-endian word from byte loads:
  //   `((u32)b[0] << 24) | ((u32)b[1] << 16) | ((u32)b[2] << 8) | b[3]`
  // (SHA-256 message-schedule, JPEG/PNG headers, etc.).
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare
  // LDA for the anyext case). No native sextload; mark it Expand so
  // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`,
  // which then flows through LowerSignExtend (described elsewhere as the
  // branchless sequence AND #$00FF; EOR #$0080; SEC; SBC #$0080).
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);

  // GlobalOpt sometimes narrows a `short` global to `i1` when it sees
  // every assignment is 0 or 1. Custom-lower so LowerLoad rewrites
  // `zext/sext/anyext from i1` into a plain byte load + appropriate
  // mask. Both i16 and i8 result widths can appear, depending on
  // whether the consumer wants the value as `short` or `bool`.
  for (MVT ResVT : {MVT::i8, MVT::i16}) {
    setLoadExtAction(ISD::ZEXTLOAD, ResVT, MVT::i1, Custom);
    setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i1, Custom);
    setLoadExtAction(ISD::EXTLOAD, ResVT, MVT::i1, Custom);
  }

  // Only register i32 ext-load / trunc-store and Custom actions when
  // i32 is actually a legal type (ptr32 mode active). Otherwise the
  // Custom-action calls intercept i16/i8 ops, and LowerTruncate's
  // SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same
  // root cause as the earlier LOAD-Custom-breaks-LDAptr issue).
  // NOTE(review): the Wide32 class is registered for MVT::i32
  // unconditionally at the top of this constructor, so this flag is
  // expected to be true on every path through here — confirm whether a
  // ptr16 configuration is still meant to be reachable.
  bool ptr32Active = isTypeLegal(MVT::i32);
  if (ptr32Active) {
    for (MVT MemVT : {MVT::i8, MVT::i16}) {
      setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand);
      setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand);
      setTruncStoreAction(MVT::i32, MemVT, Expand);
    }
    // Truncating byte stores (`s->c = (char)v`) land as TRUNCSTORE
    // i16->i8 in SDAG after combiner canonicalization. Custom-route
    // through LowerStore so the ptr-offset peel fires for them too.
    setTruncStoreAction(MVT::i16, MVT::i8, Custom);
  }

  // Vararg support: VASTART writes the address of the first vararg slot
  // to the va_list pointer. VACOPY/VAEND use the default expansions.
  // This makes <stdarg.h>-style functions (e.g. printf-likes) compile
  // cleanly.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  // Custom VAARG so we DON'T align the va_list pointer. The default
  // expansion rounds up to the type's preferred alignment (S16 = 2),
  // but caller-pushed args land at PHA's resulting odd S+1 address.
  // Aligning would skip the low byte and read garbage.
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // C++ exceptions (SJLJ model) — clang lowers exception machinery into
  // these intrinsics via SjLjEHPrepare. We don't have native handling
  // for setjmp/longjmp on this target; mark them Expand so LegalizeDAG
  // falls back to its no-op stubs (setjmp returns 0, longjmp is a no-op).
  // EH_SJLJ_SETUP_DISPATCH is routed through the Custom hook instead
  // (originally described as a chain pass-through). The actual EH
  // semantics are provided at runtime by libcxxabi (__cxa_throw etc.)
  // calling _Unwind_SjLj_RaiseException, which in turn longjmps via the
  // function context the prologue prepared. See
  // runtime/src/libcxxabiSjlj.c for the runtime side.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Expand);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i16, Expand);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Expand);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  // SJLJ exception lowering uses FRAMEADDR(0) to read the current frame
  // pointer. We don't reserve a frame pointer in general; return the
  // entry-SP-equivalent value (current SP read via TSC) — good enough
  // for SJLJ's purpose of identifying the call frame.
  setOperationAction(ISD::FRAMEADDR, MVT::i16, Custom);
  setOperationAction(ISD::FRAMEADDR, MVT::i32, Custom);
  // stacksave / stackrestore — used by SjLjEHPrepare to save/restore SP
  // around invoke calls. The default Expand emits CopyFromReg/$SP, which
  // fails because SP has no register class. The jmp_buf already captures
  // SP via TSC in our setjmp implementation, so these are redundant here:
  // custom-lower stacksave to a constant 0 (the value is stored into the
  // function context but never used for restoration on our target) and
  // stackrestore to a chain pass-through (no-op). setjmp/longjmp manage
  // SP directly via TSC/TCS.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  // FRAMEADDR is set Custom above for SJLJ; don't set it Expand here
  // (the second setOperationAction would override the first).
  setOperationAction(ISD::RETURNADDR, MVT::i16, Expand);
  // W65816 pointers are i32; legalizer queries the action for the pointer
  // type, so register Expand for i32 too. Without this,
  // __builtin_return_address(0) ICEs in LowerOperation (no Custom handler
  // for RETURNADDR).
  setOperationAction(ISD::RETURNADDR, MVT::i32, Expand);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i16, Expand);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i16, Expand);

  // ISD::TRAP — __builtin_trap(), -fsanitize-trap=undefined. Default
  // expansion is a libcall to abort(); UBSan-min wants a BRK with a
  // pickup sentinel instead so the trap site is identifiable from a
  // memory dump without a working stdio path. Custom-lower to a
  // W65816ISD::TRAP target node; the InstrInfo.td pattern routes it
  // to BRK_pseudo, whose AsmPrinter expansion writes 0xBE to $70 and
  // then issues BRK + a self-loop (headless MAME mis-vectors BRK, so
  // the spin is what actually halts).
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  // DEBUGTRAP follows the same shape — same node, same expansion.
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  // The 65816 has no hardware multiplier or divider. Multiply by a
  // power-of-two constant is auto-rewritten to shifts by the DAG
  // combiner; arbitrary multiply / divide / mod go through libcalls
  // (`__mulhi3` for i16 multiply etc.). The libcall expander emits a
  // standard CALL node which flows through LowerCall, so multi-arg
  // call lowering must be working first (it is, see task #26).
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MUL, MVT::i16, LibCall);

  // i8 multiply / mulh / div / rem: SDAG narrows e.g. `x / 10` to
  // `mulhu i8 x, -51` + shift when it proves operands fit in i8.
  // The 65816 has no native 8-bit multiplier; route everything
  // through the 16-bit libcalls by Promoting i8 ops to i16.
  setOperationAction(ISD::MUL, MVT::i8, Promote);
  setOperationAction(ISD::MULHU, MVT::i8, Promote);
  setOperationAction(ISD::MULHS, MVT::i8, Promote);
  setOperationAction(ISD::SDIV, MVT::i8, Promote);
  setOperationAction(ISD::UDIV, MVT::i8, Promote);
  setOperationAction(ISD::SREM, MVT::i8, Promote);
  setOperationAction(ISD::UREM, MVT::i8, Promote);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
  // CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the
  // legalizer rewrite them into a sequence of basic ops. Without
  // this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
  // or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
  // Select" at isel.
  for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::SDIV, MVT::i16, LibCall);
  setOperationAction(ISD::UDIV, MVT::i16, LibCall);
  setOperationAction(ISD::SREM, MVT::i16, LibCall);
  setOperationAction(ISD::UREM, MVT::i16, LibCall);
  setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16, Expand);

  // Variable-amount and large-constant shifts. We have inline
  // patterns for shift-by-1..4; everything else goes through
  // __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom
  // lets us return SDValue() for the fast cases and route everything
  // else through the libcall lowering helper.
  setOperationAction(ISD::SHL, MVT::i16, Custom);
  setOperationAction(ISD::SRL, MVT::i16, Custom);
  setOperationAction(ISD::SRA, MVT::i16, Custom);
  // i8 shifts go through Custom too — LowerShift detects the i8 result
  // and routes through trunc(i16-shift(zext_or_sext(lhs), amount)).
  // Avoids needing a parallel set of qi3 libcalls.
  setOperationAction(ISD::SHL, MVT::i8, Custom);
  setOperationAction(ISD::SRL, MVT::i8, Custom);
  setOperationAction(ISD::SRA, MVT::i8, Custom);

  // Historical note on LOAD / STORE custom lowering: setting LOAD Custom
  // and returning SDValue() from LowerLoad short-circuits the i16-result
  // LDAptr/STAptr selection paths (the Custom→empty→Legal fall-through
  // doesn't re-enter pattern matching), so it must not be enabled in
  // ptr16 mode.
  // NOTE(review): an earlier version of this comment called LowerLoad /
  // LowerStore "currently dead code", but LOAD and STORE are registered
  // as Custom inside the ptr32Active block further down — confirm and
  // drop this note once verified.

  // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying
  // the carry/borrow flag between the two halves of a multi-precision add or
  // sub. Setting them Legal triggers the type legalizer's carry-chain split
  // for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions)
  // instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions).
  // The matching tablegen pseudos add Defs/Uses on the P register, which
  // tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically.
  setOperationAction(ISD::ADDC, MVT::i16, Legal);
  setOperationAction(ISD::ADDE, MVT::i16, Legal);
  setOperationAction(ISD::SUBC, MVT::i16, Legal);
  setOperationAction(ISD::SUBE, MVT::i16, Legal);

  // i32 (long). Type legalization splits i32 into two i16 halves; with
  // ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain.
  // AND/OR/XOR split cleanly into per-half ops with no carry to track.
  // Multiply/divide/shift go through libcall stubs whose
  // implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS
  // / SRA_PARTS are the SDNodes the type legalizer emits when splitting
  // a variable-amount shift; without an action they get "Cannot select".
  // LibCall on the parent node routes the whole shift through one
  // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and
  // simpler than implementing a 32-bit shift in 65816 assembly inline.
  // NOTE(review): i32 has a register class (Wide32) above, and the
  // ptr32Active block below re-registers ADD/SUB/AND/OR/XOR/shifts on i32
  // as Custom, so parts of this paragraph may describe the pre-ptr32
  // design — confirm.
  for (MVT VT : {MVT::i32}) {
    // MUL i32 is Custom-lowered: the typical fall-through libcall is
    // __mulsi3 (32x32 -> 32), but when both operands are ZEXT from i16
    // we can emit __umulhisi3 (16x16 -> 32) instead. Saves ~60 cyc per
    // call on the `(unsigned long)i * i` pattern — see LowerMUL_I32.
    setOperationAction(ISD::MUL, VT, Custom);
    setOperationAction(ISD::SDIV, VT, LibCall);
    setOperationAction(ISD::UDIV, VT, LibCall);
    setOperationAction(ISD::SREM, VT, LibCall);
    setOperationAction(ISD::UREM, VT, LibCall);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    // i32 shifts route through a libcall via the
    // preferredShiftLegalizationStrategy override (see header). No
    // explicit SHL/SHL_PARTS action needed — the override forces the
    // type-legalizer's libcall path before SHL_PARTS would be emitted.
  }
  // i64 shifts — route to libcall before the type legalizer tries
  // to split via the next-legal-type (which becomes i32 in ptr32 mode
  // and triggers a SDAG combine loop on `i64 >> K` patterns). By
  // marking SHL/SRL/SRA i64 LibCall here, the operation legalizer
  // picks up the libcall path even though i64 itself is illegal.
  for (MVT VT : {MVT::i64}) {
    setOperationAction(ISD::SHL, VT, LibCall);
    setOperationAction(ISD::SRL, VT, LibCall);
    setOperationAction(ISD::SRA, VT, LibCall);
  }

  if (ptr32Active) {
    for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR})
      setOperationAction(Op, MVT::i32, Custom);
    setOperationAction(ISD::SHL, MVT::i32, Custom);
    setOperationAction(ISD::SRL, MVT::i32, Custom);
    setOperationAction(ISD::SRA, MVT::i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom);
    // SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16:
    // the combiner emits this for `(int32_t)((int8_t)x)` and for
    // `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No
    // tablegen pattern covers the i32 form; Custom-lower to per-half
    // ops. IMPORTANT: LegalizeDAG looks up the action for
    // SIGN_EXTEND_INREG using the INNER VT (the operand value type),
    // not the result VT. See LegalizeDAG.cpp:
    //   Action = TLI.getOperationAction(Op, InnerType);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
    setOperationAction(ISD::LOAD, MVT::i32, Custom);
    setOperationAction(ISD::STORE, MVT::i32, Custom);
    // Also Custom for i16/i8 LOAD/STORE in ptr32 mode so LowerLoad/
    // LowerStore can fold Wide32(Wrapper, WrapperBank) of the same
    // global (or a raw GlobalAddress) to a plain abs-16 access
    // (DBR-relative). Without this, every `g` access for a
    // same-segment global goes through the 14-byte [dp],y
    // indirect-long path even though the bank is implicit in DBR.
    setOperationAction(ISD::STORE, MVT::i16, Custom);
    setOperationAction(ISD::STORE, MVT::i8, Custom);
    setOperationAction(ISD::LOAD, MVT::i16, Custom);
    setOperationAction(ISD::LOAD, MVT::i8, Custom);
    // ZEXTLOAD i16-from-i8 also Custom — the DAG combiner folds
    // (zext (load i8 @g)) into one zextload SDNode, so we need to
    // apply the same global-address fold there. SEXTLOAD/EXTLOAD
    // already have Expand actions from earlier setLoadExtAction
    // calls; leave those alone (Custom would require parallel
    // tablegen patterns we don't have).
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, MVT::i8, Custom);
    setOperationAction(ISD::SETCC, MVT::i32, Custom);
    setOperationAction(ISD::BR_CC, MVT::i32, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
    setOperationAction(ISD::SELECT, MVT::i32, Custom);
    setOperationAction(ISD::Constant, MVT::i32, Custom);
  }

  // Disable jump tables. A long if-else chain compiles fine without
  // them. Setting the threshold to UINT_MAX makes LLVM never form a
  // jump table.
  // NOTE(review): the original rationale was that jump tables need BRIND
  // (indirect branch via 16-bit pointer load), "which we don't have".
  // BRIND is custom-lowered earlier in this constructor, so that
  // rationale looks stale — confirm whether jump tables can be revisited.
  setMinimumJumpTableEntries(UINT_MAX);

  // Variable-length arrays / dynamic stack allocation. Lowered to
  // `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
  // allocated region. Limitation: this shifts SP, so any FrameIndex
  // accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
  // (we have no frame pointer). Suitable for the common pattern
  // "alloca; initialise; pass; return"; complex VLA use mixed with
  // local-variable access across the alloca will miscompile. A real
  // FP (DP slot or X-as-FP) would lift this restriction.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
  if (ptr32Active)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Earlier experiments, kept for reference. The LOAD combine is in fact
  // enabled by the live setTargetDAGCombine(ISD::LOAD) call below.
  //   setTargetDAGCombine(ISD::LOAD);  // bisecting pickif hang
  // SHL combine disabled while debugging the ptr32 i64-phi hang.
  //   setTargetDAGCombine(ISD::SHL);

  // Combine STORE / LOAD with const-int i32 pointer to a form that
  // survives LowerI32Constant (which would otherwise split the ptr
  // into a Wide32 reg pair and lose the const-addr fast path).
  // See PerformDAGCombine.
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::LOAD);
}
|
||
// Map an LLVM SETCC condition to a W65816 branch. Returns the condition
|
||
// code along with possibly-swapped LHS/RHS; some signed comparisons are
|
||
// rewritten to use unsigned ones with a tweaked operand because the
|
||
// 65816 has no native signed branch other than BMI/BPL on a value, not
|
||
// on a comparison result.
|
||
// Map an LLVM SETCC condition to a 65816 branch. Unsigned codes use
|
||
// BCS/BCC after CMP. Signed SETLT/SETGE map to BMI/BPL — correct only
|
||
// when the comparison cannot overflow. For values produced by typical
|
||
// C arithmetic on i16 this is usually fine; values near INT16_MIN/MAX
|
||
// could give wrong results until we emit the BVS handling sequence.
|
||
// SETGT / SETLE are rewritten to SETLT / SETGE with constant + 1 in
|
||
// LowerBR_CC, mirroring the SETULE / SETUGT path.
|
||
static W65816CC::CondCode mapCC(ISD::CondCode CC) {
|
||
switch (CC) {
|
||
case ISD::SETEQ: return W65816CC::COND_EQ;
|
||
case ISD::SETNE: return W65816CC::COND_NE;
|
||
case ISD::SETUGE: return W65816CC::COND_HS;
|
||
case ISD::SETULT: return W65816CC::COND_LO;
|
||
case ISD::SETLT: return W65816CC::COND_MI;
|
||
case ISD::SETGE: return W65816CC::COND_PL;
|
||
default:
|
||
return W65816CC::COND_INVALID;
|
||
}
|
||
}
|
||
|
||
// If both compare operands are i8, widen them to i16 so the existing
|
||
// i16 CMP path can handle them. Use ZEXT for unsigned/eq/ne CCs and
|
||
// SEXT for signed CCs — picking the wrong extension would invert the
|
||
// answer (e.g. -1i8 sext to 0xFFFF compares < 1 signed; zext to 0x00FF
|
||
// compares > 1 unsigned, which would flip a signed less-than).
|
||
static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
|
||
SelectionDAG &DAG, const SDLoc &DL) {
|
||
if (LHS.getValueType() != MVT::i8) return;
|
||
unsigned Ext;
|
||
switch (CC) {
|
||
case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE:
|
||
Ext = ISD::SIGN_EXTEND; break;
|
||
default:
|
||
Ext = ISD::ZERO_EXTEND; break; // unsigned + eq/ne
|
||
}
|
||
LHS = DAG.getNode(Ext, DL, MVT::i16, LHS);
|
||
RHS = DAG.getNode(Ext, DL, MVT::i16, RHS);
|
||
}
|
||
|
||
// Normalize a (LHS, RHS, CC) triple so the result is something we can
|
||
// emit with one CMP + Bxx. Returns the W65816 condition code; updates
|
||
// LHS/RHS/CC in place. Returns COND_INVALID on failure.
|
||
static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
|
||
ISD::CondCode &CC, SelectionDAG &DAG,
|
||
const SDLoc &DL) {
|
||
promoteI8Cmp(LHS, RHS, CC, DAG, DL);
|
||
// CMP wants the comparand (constant or memory) on the right. If a DAG
|
||
// pre-pass put the constant on the left, swap and flip the condition.
|
||
if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
|
||
std::swap(LHS, RHS);
|
||
CC = ISD::getSetCCSwappedOperands(CC);
|
||
}
|
||
|
||
// Signed compare via "EOR with sign bit then unsigned compare":
|
||
// a < b (signed) iff (a ^ 0x8000) < (b ^ 0x8000) (unsigned)
|
||
// The XOR flips the sign bit, which converts signed-int ordering to
|
||
// unsigned-int ordering on the same bits. This avoids the WDC's
|
||
// missing "BLT signed" — BMI/BPL alone read the sign of (a-b)
|
||
// without the V-flag overflow correction, giving wrong results
|
||
// when the subtraction overflows (e.g., INT16_MIN < 1 produced
|
||
// false because (-32768 - 1) = +32767 has N=0). After the EOR
|
||
// transform we use BCC/BCS which depend on the carry from CMP and
|
||
// don't suffer overflow corruption.
|
||
//
|
||
// Cost: 1 EOR per operand (3 bytes each in M=16) — comparable to
|
||
// the V-aware multi-branch sequence (5+ bytes of branches), but
|
||
// happens at SDAG time so subsequent SDAG combining can fold
|
||
// EORs against constants or already-EOR'd values.
|
||
bool SignedCmp = (CC == ISD::SETLT || CC == ISD::SETLE ||
|
||
CC == ISD::SETGT || CC == ISD::SETGE);
|
||
if (SignedCmp && LHS.getValueType() == MVT::i16) {
|
||
EVT VT = LHS.getValueType();
|
||
SDValue Mask = DAG.getConstant(0x8000, DL, VT);
|
||
LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, Mask);
|
||
RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, Mask);
|
||
switch (CC) {
|
||
case ISD::SETLT: CC = ISD::SETULT; break;
|
||
case ISD::SETLE: CC = ISD::SETULE; break;
|
||
case ISD::SETGT: CC = ISD::SETUGT; break;
|
||
case ISD::SETGE: CC = ISD::SETUGE; break;
|
||
default: break;
|
||
}
|
||
}
|
||
|
||
// Rewrite SETULE / SETUGT to SETULT / SETUGE with constant +/- 1.
|
||
// (SETLE / SETGT have already been converted to their unsigned
|
||
// counterparts above for i16; this handles original SETULE/SETUGT
|
||
// and the post-transform SETULE/SETUGT.) Keeps the variable on the
|
||
// LHS and lets us use BCS / BCC natively.
|
||
if (auto *RhsConst = dyn_cast<ConstantSDNode>(RHS)) {
|
||
int64_t V = RhsConst->getSExtValue();
|
||
uint64_t UV = (uint64_t)V & 0xFFFF;
|
||
if (CC == ISD::SETULE && UV < 0xffff) {
|
||
RHS = DAG.getConstant(UV + 1, DL, RHS.getValueType());
|
||
CC = ISD::SETULT;
|
||
} else if (CC == ISD::SETUGT && UV < 0xffff) {
|
||
RHS = DAG.getConstant(UV + 1, DL, RHS.getValueType());
|
||
CC = ISD::SETUGE;
|
||
} else if (CC == ISD::SETLE && V < 0x7fff) {
|
||
// Reachable only when SignedCmp transform was skipped (i8 case
|
||
// before promoteI8Cmp could get it, or non-i16 in the future).
|
||
RHS = DAG.getConstant(V + 1, DL, RHS.getValueType());
|
||
CC = ISD::SETLT;
|
||
} else if (CC == ISD::SETGT && V < 0x7fff) {
|
||
RHS = DAG.getConstant(V + 1, DL, RHS.getValueType());
|
||
CC = ISD::SETGE;
|
||
}
|
||
}
|
||
|
||
W65816CC::CondCode TCC = mapCC(CC);
|
||
if (TCC == W65816CC::COND_INVALID) {
|
||
// Try swapping operands first — preferable since it leaves us with
|
||
// a single-Bxx form. But reject the swap if it would put a load on
|
||
// the LHS (we can't pattern-match cmp(load,reg) without spilling A).
|
||
bool RhsIsLoad = isa<LoadSDNode>(RHS.getNode());
|
||
bool LhsIsLoad = isa<LoadSDNode>(LHS.getNode());
|
||
bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad;
|
||
if (!SwapWouldHurt) {
|
||
std::swap(LHS, RHS);
|
||
CC = ISD::getSetCCSwappedOperands(CC);
|
||
TCC = mapCC(CC);
|
||
}
|
||
}
|
||
// Final fallback: GT/LE/UGT/ULE without a useful swap target. Use a
|
||
// multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it
|
||
// into a 3-BB diamond. Only valid for SELECT_CC, not for BR_CC —
|
||
// LowerBR_CC re-routes those through SETCC + BR_CC NE.
|
||
if (TCC == W65816CC::COND_INVALID) {
|
||
switch (CC) {
|
||
case ISD::SETGT: TCC = W65816CC::COND_GT_MB; break;
|
||
case ISD::SETLE: TCC = W65816CC::COND_LE_MB; break;
|
||
case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break;
|
||
case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break;
|
||
default: break;
|
||
}
|
||
}
|
||
return TCC;
|
||
}
|
||
|
||
// Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/
|
||
// I32Bin/BR_CC to construct or destructure i32 SDValues across the
|
||
// sub_lo / sub_hi halves of the Wide32 register class.
|
||
static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL,
|
||
SDValue Lo, SDValue Hi) {
|
||
SDValue RC = DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32);
|
||
SDValue SubLo = DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32);
|
||
SDValue SubHi = DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32);
|
||
SDNode *RS = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32,
|
||
{RC, Lo, SubLo, Hi, SubHi});
|
||
return SDValue(RS, 0);
|
||
}
|
||
// Look through a buildWide32(Lo, Hi) -> REG_SEQUENCE(RC, Lo, sub_lo,
|
||
// Hi, sub_hi) pair: if X is exactly that machine node, return the
|
||
// matching half operand directly. Avoids a TargetExtractSubreg that
|
||
// would re-enter the SDAG combiner and re-build the i32 constant /
|
||
// pair, looping forever (observed as OOM in the combiner on `*t = 0`).
|
||
static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) {
|
||
if (!X.getNode() || !X.isMachineOpcode()) return SDValue();
|
||
if (X.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue();
|
||
// Layout: op0 = RC, then (Reg, SubIdx) pairs.
|
||
for (unsigned i = 1; i + 1 < X.getNumOperands(); i += 2) {
|
||
SDValue SubIdx = X.getOperand(i + 1);
|
||
auto *CIdx = dyn_cast<ConstantSDNode>(SubIdx);
|
||
if (!CIdx) continue;
|
||
if (CIdx->getZExtValue() == WantSub)
|
||
return X.getOperand(i);
|
||
}
|
||
return SDValue();
|
||
}
|
||
// Produce the i16 low half (sub_lo) of the i32 value X.
static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  // Constants: emit the low 16 bits as an i16 constant. A
  // getTargetExtractSubreg on a Constant SDNode yields a malformed
  // MachineSDNode (constants carry no sub-registers) and provokes SDAG
  // combine loops further down.
  if (const auto *Imm = dyn_cast<ConstantSDNode>(X))
    return DAG.getConstant(Imm->getZExtValue() & 0xFFFFu, DL, MVT::i16);

  // Not-yet-lowered GlobalAddress / ExternalSymbol nodes arrive here when
  // store lowering runs before LowerOperation has split them into a
  // Wide32 pair. Wrap a fresh i16 target symbol directly, because a
  // sub-register extract of a non-register node is malformed.
  SDValue Sym;
  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(X))
    Sym = DAG.getTargetGlobalAddress(Global->getGlobal(), DL, MVT::i16,
                                     Global->getOffset());
  else if (const auto *Extern = dyn_cast<ExternalSymbolSDNode>(X))
    Sym = DAG.getTargetExternalSymbol(Extern->getSymbol(), MVT::i16);
  if (Sym)
    return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Sym);

  // A REG_SEQUENCE built by buildWide32 already holds the half we want.
  SDValue FromSeq = lookThroughRegSeq(X, llvm::sub_lo);
  if (FromSeq)
    return FromSeq;

  // Generic case: a real sub-register extract.
  return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X);
}
|
||
// Produce the i16 high half (sub_hi) of the i32 value X. Mirrors
// extractWide32Lo: constants and symbol nodes are special-cased so no
// sub-register extract is ever created on a non-register node.
static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  // Constants: bits 16..31 become an i16 constant.
  if (const auto *Imm = dyn_cast<ConstantSDNode>(X))
    return DAG.getConstant((Imm->getZExtValue() >> 16) & 0xFFFFu, DL,
                           MVT::i16);

  // Symbols: the high half is expressed as a WrapperBank node around a
  // fresh i16 target symbol.
  SDValue Sym;
  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(X))
    Sym = DAG.getTargetGlobalAddress(Global->getGlobal(), DL, MVT::i16,
                                     Global->getOffset());
  else if (const auto *Extern = dyn_cast<ExternalSymbolSDNode>(X))
    Sym = DAG.getTargetExternalSymbol(Extern->getSymbol(), MVT::i16);
  if (Sym)
    return DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Sym);

  // A REG_SEQUENCE built by buildWide32 already holds the half we want.
  SDValue FromSeq = lookThroughRegSeq(X, llvm::sub_hi);
  if (FromSeq)
    return FromSeq;

  // Generic case: a real sub-register extract.
  return DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X);
}
|
||
|
||
// Match `Ptr = REG_SEQUENCE(ADDC(BaseLo, KLo), sub_lo,
|
||
// ADDE(BaseHi, 0, carry), sub_hi)` shape
|
||
// produced by LowerI32Bin for `(add Wide32, const)` where the constant
|
||
// fits an unsigned 16-bit Y (KHi must be 0). Returns true with OutBase
|
||
// = buildWide32(BaseLo, BaseHi) and OutOff = KLo on a successful peel.
|
||
// The bank-byte carry-in is intentionally dropped: the `[dp],Y` deref
|
||
// adds Y to the 24-bit pointer without propagating beyond 16 bits.
|
||
// Caller's responsibility that the target object doesn't span a bank.
|
||
static bool peelPtr32Offset(SelectionDAG &DAG, SDLoc DL, SDValue Ptr,
|
||
SDValue &OutBase, uint16_t &OutOff) {
|
||
if (Ptr.getValueType() != MVT::i32) return false;
|
||
// Pre-LowerI32Bin shape: `ISD::ADD(BaseWide32, i32 const)`. LowerLoad
|
||
// runs before LowerI32Bin in legalization order, so the ADD is still
|
||
// visible as an ISD::ADD when LowerLoad inspects Ptr.
|
||
if (Ptr.getOpcode() == ISD::ADD) {
|
||
SDValue L = Ptr.getOperand(0);
|
||
SDValue R = Ptr.getOperand(1);
|
||
auto *KC = dyn_cast<ConstantSDNode>(R);
|
||
if (!KC) {
|
||
KC = dyn_cast<ConstantSDNode>(L);
|
||
if (!KC) return false;
|
||
L = R;
|
||
}
|
||
uint64_t K = KC->getZExtValue();
|
||
if (K == 0 || K > 0xFFFFu) return false;
|
||
OutOff = (uint16_t)K;
|
||
OutBase = L;
|
||
return true;
|
||
}
|
||
// Post-LowerI32Bin shape (REG_SEQUENCE of ADDC/ADDE). May not occur
|
||
// in practice given the ADD path above, but kept for robustness.
|
||
if (!Ptr.getNode() || !Ptr.isMachineOpcode()) return false;
|
||
if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return false;
|
||
SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
|
||
SDValue Hi = lookThroughRegSeq(Ptr, llvm::sub_hi);
|
||
if (!Lo || !Hi) return false;
|
||
if (Lo.getOpcode() != ISD::ADDC) return false;
|
||
if (Hi.getOpcode() != ISD::ADDE) return false;
|
||
if (Hi.getOperand(2) != Lo.getValue(1)) return false;
|
||
auto *KLo = dyn_cast<ConstantSDNode>(Lo.getOperand(1));
|
||
auto *KHi = dyn_cast<ConstantSDNode>(Hi.getOperand(1));
|
||
if (!KLo || !KHi) return false;
|
||
if (KHi->getZExtValue() != 0) return false;
|
||
uint64_t K = KLo->getZExtValue() & 0xFFFFu;
|
||
if (K == 0) return false;
|
||
OutOff = (uint16_t)K;
|
||
OutBase = buildWide32(DAG, DL, Lo.getOperand(0), Hi.getOperand(0));
|
||
return true;
|
||
}
|
||
|
||
// Custom lowering for ISD::BR_CC (chain, cc, lhs, rhs, dest).
//
// i32 compares are reduced to an i16 boolean built from per-half
// compares and then branched on; i8/i16 compares become a glued
// W65816ISD::CMP + W65816ISD::BR_CC, except for the multi-branch
// condition codes, which are rerouted through SETCC.
SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc DL(Op);
  const EVT CmpVT = LHS.getValueType();

  // Emit a generic BR_CC to Dest on `A Cond B`, on the incoming chain.
  auto EmitBrCC = [&](ISD::CondCode Cond, SDValue A, SDValue B) {
    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
                       DAG.getCondCode(Cond), A, B, Dest);
  };

  // i32 BR_CC: synthesize an i16 boolean from per-half compares, then
  // branch on (bool != 0). This avoids the legalizer's generic Expand,
  // which re-enters our SETCC/BR_CC custom paths in an infinite loop.
  if (CmpVT == MVT::i32) {
    SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
    SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
    SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
    SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);
    const bool IsEquality = CC == ISD::SETEQ || CC == ISD::SETNE;

    // Fast path: i32 == 0 / != 0 becomes (lo | hi) cmp 0. Saves two i16
    // setcc materializations, an AND and (for NE) an XOR; the branch
    // tests the OR directly. Hot in `while (x)` and i32-counter loops.
    if (IsEquality) {
      const auto *RhsImm = dyn_cast<ConstantSDNode>(RHS);
      if (RhsImm && RhsImm->isZero()) {
        SDValue AnyBits = DAG.getNode(ISD::OR, DL, MVT::i16, LhsLo, LhsHi);
        SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
        return EmitBrCC(CC, AnyBits, Zero16);
      }
    }

    SDValue Flag;
    if (IsEquality) {
      // Equal iff both halves are equal; NE flips the 0/1 result.
      SDValue LoSame = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, ISD::SETEQ);
      SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
      Flag = DAG.getNode(ISD::AND, DL, MVT::i16, LoSame, HiSame);
      if (CC == ISD::SETNE)
        Flag = DAG.getNode(ISD::XOR, DL, MVT::i16, Flag,
                           DAG.getConstant(1, DL, MVT::i16));
    } else {
      // Ordered compare:
      //   (a CC b) = (hi_a HiCC hi_b) || (hi_a == hi_b && lo_a LoCCu lo_b)
      // HiCC is the strict form of CC (LE -> LT etc.) so the tie-breaker
      // alone decides the equal-high-half case. LoCCu is always the
      // unsigned form of CC: only the high half carries the sign.
      ISD::CondCode HiCC, LoCCu;
      switch (CC) {
      case ISD::SETLT:
      case ISD::SETULT:
        HiCC = CC;
        LoCCu = ISD::SETULT;
        break;
      case ISD::SETGT:
      case ISD::SETUGT:
        HiCC = CC;
        LoCCu = ISD::SETUGT;
        break;
      case ISD::SETLE:
        HiCC = ISD::SETLT;
        LoCCu = ISD::SETULE;
        break;
      case ISD::SETULE:
        HiCC = ISD::SETULT;
        LoCCu = ISD::SETULE;
        break;
      case ISD::SETGE:
        HiCC = ISD::SETGT;
        LoCCu = ISD::SETUGE;
        break;
      case ISD::SETUGE:
        HiCC = ISD::SETUGT;
        LoCCu = ISD::SETUGE;
        break;
      default:
        report_fatal_error("W65816: unexpected i32 BR_CC condition");
      }
      SDValue HiWins = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, HiCC);
      SDValue HiSame = DAG.getSetCC(DL, MVT::i16, LhsHi, RhsHi, ISD::SETEQ);
      SDValue LoWins = DAG.getSetCC(DL, MVT::i16, LhsLo, RhsLo, LoCCu);
      SDValue TieBreak = DAG.getNode(ISD::AND, DL, MVT::i16, HiSame, LoWins);
      Flag = DAG.getNode(ISD::OR, DL, MVT::i16, HiWins, TieBreak);
    }
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    return EmitBrCC(ISD::SETNE, Flag, Zero16);
  }

  // normalizeCC may rewrite LHS / RHS / CC in place.
  const W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: branch condition not yet implemented");

  // Multi-branch CCs only have inserter support via SELECT_CC16. For
  // BR_CC, go through SETCC instead: materialise the boolean, then
  // branch on NE-vs-zero. Costs one extra LDA but always works.
  if (TCC >= W65816CC::COND_GT_MB) {
    SDValue Flag = DAG.getNode(ISD::SETCC, DL, CmpVT, LHS, RHS,
                               DAG.getCondCode(CC));
    SDValue Zero = DAG.getConstant(0, DL, CmpVT);
    return EmitBrCC(ISD::SETNE, Flag, Zero);
  }

  // Single-branch condition: glued compare feeding the target branch.
  SDValue CmpGlue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
  return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CondOp,
                     CmpGlue);
}
|
||
|
||
// LowerBRIND — `brind (chain, target_ptr)`. Computed-goto / IR
|
||
// `indirectbr` lowers to BRIND with a pointer-typed target. Under
|
||
// p:32:16 (default datalayout) that pointer is i32, so the generic
|
||
// legalizer's "Cannot select brind" path fires unless we step in.
|
||
//
|
||
// Lowering strategy (mirrors __jsl_indir's mechanism):
|
||
// 1. If target is i32 (Wide32), extract sub_lo — only the 16-bit
|
||
// offset within PBR matters because JMP (abs) keeps current PBR.
|
||
// 2. Store that i16 to constant address $00B8 — the shared
|
||
// __indirTarget DP slot. Pinned at $00B8 so JMP (abs)'s bank-0
|
||
// vector fetch reads it regardless of DBR / segment placement
|
||
// (see libgcc.s for the full rationale).
|
||
// 3. Emit W65816ISD::BRIND with the chained store — the BRINDpseudo
|
||
// tablegen pattern selects to JMP_AbsInd $00B8.
|
||
SDValue W65816TargetLowering::LowerBRIND(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDValue Chain = Op.getOperand(0);
|
||
SDValue Target = Op.getOperand(1);
|
||
SDLoc DL(Op);
|
||
|
||
// Reduce the target to i16 — the low half of the (i32) pointer
|
||
// holds the in-bank offset that JMP indirect dispatches through.
|
||
SDValue Off16;
|
||
if (Target.getValueType() == MVT::i32) {
|
||
Off16 = extractWide32Lo(DAG, DL, Target);
|
||
} else if (Target.getValueType() == MVT::i16) {
|
||
Off16 = Target;
|
||
} else {
|
||
// Defensive: shouldn't happen with our current type-legalization,
|
||
// but if it does, defer to the legalizer.
|
||
return SDValue();
|
||
}
|
||
|
||
// Store the 16-bit target to $00B8. The (store Acc16, (iPTR timm))
|
||
// tablegen pattern lowers this to STAabs ($00B8) — the AsmPrinter
|
||
// routes bank-0 const-int stores to STA_Abs (3 bytes, DBR-relative).
|
||
// Since DP=0 at runtime, `sta $00B8` lands at $00:00B8 == DP slot
|
||
// $B8, which is exactly where __jsl_indir reads via `jmp ($00B8)`.
|
||
//
|
||
// CRITICAL: use TargetConstant (not Constant) so the i32 Constant is
|
||
// NOT Custom-lowered through LowerI32Constant — which would split
|
||
// 0x00B8 into a REG_SEQUENCE(0xB8, 0). LowerStore then can't see
|
||
// a clean ConstantSDNode at Ptr, mis-routes the i16 store to the
|
||
// generic ST_PTR slow path ([E0],Y indirect-long with full Wide32
|
||
// address staging), and creates significant Wide32 register pressure
|
||
// — multi-cgoto VM interpreters with several BRINDs in one function
|
||
// then over-pressure the regalloc and abort with "ran out of
|
||
// registers". With TargetConstant the tablegen pattern at
|
||
// InstrInfo.td:433 fires directly: `sta $b8` — one instruction, no
|
||
// Wide32 vreg, no DPF0/DPF1 staging.
|
||
EVT PtrVT = getPointerTy(DAG.getDataLayout());
|
||
SDValue Addr = DAG.getTargetConstant(0x00B8, DL, PtrVT);
|
||
SDValue Store = DAG.getStore(Chain, DL, Off16, Addr,
|
||
MachinePointerInfo());
|
||
|
||
// Emit the indirect JMP. W65816ISD::BR_IND has chain-only semantics
|
||
// (no operand beyond chain) — the target is implicit ($00B8). The
|
||
// store above sequences before the JMP via the chain dependency.
|
||
return DAG.getNode(W65816ISD::BR_IND, DL, MVT::Other, Store);
|
||
}
|
||
|
||
// Custom lowering for ISD::SETCC (lhs, rhs, cc).
//
// i8/i16 compares: setcc lhs, rhs, cc -> select_cc lhs, rhs, 1, 0, cc.
// The SELECT_CC re-enters LowerOperation and is lowered through the
// diamond-CFG path; setBooleanContents(ZeroOrOne) means callers see a
// clean 0/1 result.
//
// i32 compares: split into per-half compares combined with AND / OR.
SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  const EVT ResVT = Op.getValueType();

  if (LHS.getValueType() != MVT::i32) {
    SDValue One = DAG.getConstant(1, DL, ResVT);
    SDValue Zero = DAG.getConstant(0, DL, ResVT);
    return DAG.getNode(ISD::SELECT_CC, DL, ResVT, LHS, RHS, One, Zero,
                       DAG.getCondCode(CC));
  }

  // i32 SETCC. The result type is i16 (the legalizer keeps the boolean
  // result type narrow regardless of LHS width).
  auto Cmp = [&](SDValue A, SDValue B, ISD::CondCode Cond) {
    return DAG.getSetCC(DL, ResVT, A, B, Cond);
  };
  SDValue LhsLo = extractWide32Lo(DAG, DL, LHS);
  SDValue LhsHi = extractWide32Hi(DAG, DL, LHS);
  SDValue RhsLo = extractWide32Lo(DAG, DL, RHS);
  SDValue RhsHi = extractWide32Hi(DAG, DL, RHS);

  if (CC == ISD::SETEQ || CC == ISD::SETNE) {
    // Fast path: i32 == 0 / != 0 becomes (lo | hi) cmp 0 — one i16 OR and
    // one i16 setcc instead of two setcc + AND (+ XOR for NE).
    const auto *RhsImm = dyn_cast<ConstantSDNode>(RHS);
    if (RhsImm && RhsImm->isZero()) {
      SDValue AnyBits = DAG.getNode(ISD::OR, DL, MVT::i16, LhsLo, LhsHi);
      SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
      return Cmp(AnyBits, Zero16, CC);
    }

    // General equality: both halves equal; NE flips the 0/1 result.
    SDValue LoSame = Cmp(LhsLo, RhsLo, ISD::SETEQ);
    SDValue HiSame = Cmp(LhsHi, RhsHi, ISD::SETEQ);
    SDValue Same = DAG.getNode(ISD::AND, DL, ResVT, LoSame, HiSame);
    if (CC == ISD::SETEQ)
      return Same;
    return DAG.getNode(ISD::XOR, DL, ResVT, Same,
                       DAG.getConstant(1, DL, ResVT));
  }

  // Ordered compare:
  //   (a CC b) = (hi_a HiCC hi_b) || (hi_a == hi_b && lo_a LoCCu lo_b)
  // with HiCC the strict form of CC and LoCCu its unsigned form.
  ISD::CondCode HiCC, LoCCu;
  switch (CC) {
  case ISD::SETLT:
  case ISD::SETULT:
    HiCC = CC;
    LoCCu = ISD::SETULT;
    break;
  case ISD::SETGT:
  case ISD::SETUGT:
    HiCC = CC;
    LoCCu = ISD::SETUGT;
    break;
  case ISD::SETLE:
    HiCC = ISD::SETLT;
    LoCCu = ISD::SETULE;
    break;
  case ISD::SETULE:
    HiCC = ISD::SETULT;
    LoCCu = ISD::SETULE;
    break;
  case ISD::SETGE:
    HiCC = ISD::SETGT;
    LoCCu = ISD::SETUGE;
    break;
  case ISD::SETUGE:
    HiCC = ISD::SETUGT;
    LoCCu = ISD::SETUGE;
    break;
  default:
    report_fatal_error("W65816: unexpected i32 SETCC condition");
  }
  SDValue HiWins = Cmp(LhsHi, RhsHi, HiCC);
  SDValue HiSame = Cmp(LhsHi, RhsHi, ISD::SETEQ);
  SDValue LoWins = Cmp(LhsLo, RhsLo, LoCCu);
  SDValue TieBreak = DAG.getNode(ISD::AND, DL, ResVT, HiSame, LoWins);
  return DAG.getNode(ISD::OR, DL, ResVT, HiWins, TieBreak);
}
|
||
|
||
// Custom lowering for ISD::SELECT_CC (lhs, rhs, tval, fval, cc).
//
//  * i32 compare: materialise an i16 boolean via SETCC (LowerSETCC's i32
//    path) and select on `bool != 0`. This avoids creating an i32
//    W65816::CMP, for which there is no pattern.
//  * i32 result: select each i16 half separately under the same
//    condition and rebuild the Wide32.
//  * otherwise: glued W65816ISD::CMP feeding W65816ISD::SELECT_CC.
SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  const bool WideResult = Op.getValueType() == MVT::i32;

  // Select between the i32 TVal / FVal one i16 half at a time, both
  // halves driven by the compare `A Cond B`.
  auto SelectHalves = [&](SDValue A, SDValue B, ISD::CondCode Cond) {
    SDValue TrueLo = extractWide32Lo(DAG, DL, TVal);
    SDValue TrueHi = extractWide32Hi(DAG, DL, TVal);
    SDValue FalseLo = extractWide32Lo(DAG, DL, FVal);
    SDValue FalseHi = extractWide32Hi(DAG, DL, FVal);
    SDValue PickLo = DAG.getSelectCC(DL, A, B, TrueLo, FalseLo, Cond);
    SDValue PickHi = DAG.getSelectCC(DL, A, B, TrueHi, FalseHi, Cond);
    return buildWide32(DAG, DL, PickLo, PickHi);
  };

  if (LHS.getValueType() == MVT::i32) {
    // Reduce the i32 compare to an i16 0/1 flag first.
    SDValue Flag = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC);
    SDValue Zero16 = DAG.getConstant(0, DL, MVT::i16);
    if (WideResult)
      return SelectHalves(Flag, Zero16, ISD::SETNE);
    return DAG.getSelectCC(DL, Flag, Zero16, TVal, FVal, ISD::SETNE);
  }

  // Narrow compare but i32 result: per-half i16 SELECT_CC on the
  // original condition.
  if (WideResult)
    return SelectHalves(LHS, RHS, CC);

  // normalizeCC may rewrite LHS / RHS / CC in place.
  const W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
  if (TCC == W65816CC::COND_INVALID)
    report_fatal_error("W65816: select_cc condition not yet implemented");

  SDValue CmpGlue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
  SDValue CondOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
  // The SDTypeProfile declares a single result (the selected value). An
  // earlier version passed a two-VT list (value + Glue), which was
  // silently wrong and trips an SDNode-validity assertion in assertions
  // builds.
  SDValue SelOps[] = {TVal, FVal, CondOp, CmpGlue};
  return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), SelOps);
}
|
||
|
||
// i8 -> i16 sign extend. Branchless 3-instruction trick:
|
||
// sext(x) = ((x & 0xFF) ^ 0x80) - 0x80
|
||
// Verify: x=0x00 -> 0x80 - 0x80 = 0x0000. x=0x7F -> 0xFF - 0x80 = 0x7F.
|
||
// x=0x80 -> 0x00 - 0x80 = 0xFF80 (-128). x=0xFF -> 0x7F - 0x80
|
||
// = 0xFFFF (-1).
|
||
// Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080 (10 bytes total,
|
||
// no branches, no temp slots — much cheaper than the SELECT_CC diamond
|
||
// version that produced ~14 instructions plus stack spills).
|
||
SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDValue X = Op.getOperand(0);
|
||
if (X.getValueType() != MVT::i8 || Op.getValueType() != MVT::i16)
|
||
return SDValue();
|
||
SDLoc DL(Op);
|
||
SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, X);
|
||
SDValue Sign = DAG.getConstant(0x0080, DL, MVT::i16);
|
||
SDValue Xor = DAG.getNode(ISD::XOR, DL, MVT::i16, Z, Sign);
|
||
return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
|
||
}
|
||
|
||
// ptr32 foundation hook. In ptr16 mode (PointerWidth=16, current
|
||
// default) addresses are i16 and we return SDValue() so the legalizer
|
||
// keeps the load and the existing LDAptr / STAptr selection patterns
|
||
// match. In ptr32 mode addresses are i32 and we wrap the load in
|
||
// W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter
|
||
// can take the bank byte from sub_hi instead of forcing 0.
|
||
//
|
||
// Byte loads (zextload, anyext, true i8) keep going through the i16
|
||
// LDA + AND #$FF idiom — same trick the existing LDAptr uses; for
|
||
// ptr32 mode the load is still 16 bits, just bank-explicit.
|
||
SDValue W65816TargetLowering::LowerLoad(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
LoadSDNode *Ld = cast<LoadSDNode>(Op);
|
||
SDValue Chain = Ld->getChain();
|
||
SDValue Ptr = Ld->getBasePtr();
|
||
EVT VT = Op.getValueType();
|
||
SDLoc DL(Op);
|
||
|
||
// Const-int address: leave the SDAG alone so the tablegen pattern
|
||
// `(load (iPTR imm))` → LDA8long fires (bank-explicit). See the
|
||
// mirrored short-circuit at the top of LowerStore.
|
||
if (isa<ConstantSDNode>(Ptr) && (VT == MVT::i8 || VT == MVT::i16))
|
||
return SDValue();
|
||
|
||
// i32 LOAD: split into two i16 loads at offsets 0 and 2 then
|
||
// REG_SEQUENCE the halves into a Wide32. Address may be i16 (stack
|
||
// slot, global) or i32 (ptr32 deref); the recursive ADD handles
|
||
// address arithmetic correctly via LowerI32Bin.
|
||
if (VT == MVT::i32) {
|
||
EVT PtrVT = Ptr.getValueType();
|
||
SDValue Two = DAG.getConstant(2, DL, PtrVT);
|
||
SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
|
||
SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr,
|
||
Ld->getPointerInfo(),
|
||
Ld->getAlign(),
|
||
Ld->getMemOperand()->getFlags());
|
||
SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2,
|
||
Ld->getPointerInfo().getWithOffset(2),
|
||
Ld->getAlign(),
|
||
Ld->getMemOperand()->getFlags());
|
||
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
|
||
Lo.getValue(1), Hi.getValue(1));
|
||
SDValue Val = buildWide32(DAG, DL, Lo, Hi);
|
||
return DAG.getMergeValues({Val, NewChain}, DL);
|
||
}
|
||
|
||
// Same fold as LowerStore: a Wide32 ptr built from Wrapper +
|
||
// WrapperBank of the same global, OR a raw GlobalAddress, lets us
|
||
// emit an abs-16 (DBR-relative) load (LDA / LDA8abs) instead of
|
||
// the slower [dp],Y indirect-long. Our globals are in the load
|
||
// segment that crt0 pins to DBR.
|
||
SDValue FoldedLo;
|
||
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
|
||
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
|
||
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
|
||
GA->getOffset()));
|
||
} else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Ptr)) {
|
||
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
|
||
DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16));
|
||
} else if (Ptr.getNode()->isMachineOpcode() &&
|
||
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
SDValue PLo, PHi;
|
||
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
|
||
if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
|
||
if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i);
|
||
else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i);
|
||
}
|
||
}
|
||
if (PLo && PHi &&
|
||
PLo.getOpcode() == W65816ISD::Wrapper &&
|
||
PHi.getOpcode() == W65816ISD::WrapperBank) {
|
||
SDValue WLo = PLo.getOperand(0);
|
||
SDValue WHi = PHi.getOperand(0);
|
||
auto *GLo = dyn_cast<GlobalAddressSDNode>(WLo);
|
||
auto *GHi = dyn_cast<GlobalAddressSDNode>(WHi);
|
||
auto *ELo = dyn_cast<ExternalSymbolSDNode>(WLo);
|
||
auto *EHi = dyn_cast<ExternalSymbolSDNode>(WHi);
|
||
bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() &&
|
||
GLo->getOffset() == GHi->getOffset());
|
||
bool SameExtern = (ELo && EHi &&
|
||
StringRef(ELo->getSymbol()) == EHi->getSymbol());
|
||
if (SameGlobal || SameExtern)
|
||
FoldedLo = PLo;
|
||
}
|
||
}
|
||
if (FoldedLo) {
|
||
EVT MemVT = Ld->getMemoryVT();
|
||
ISD::LoadExtType ExtType = Ld->getExtensionType();
|
||
if (ExtType == ISD::NON_EXTLOAD && MemVT == Op.getValueType()) {
|
||
return DAG.getLoad(Op.getValueType(), DL, Chain, FoldedLo,
|
||
Ld->getPointerInfo(),
|
||
Ld->getAlign(),
|
||
Ld->getMemOperand()->getFlags());
|
||
}
|
||
// i1 memory type comes from GlobalOpt narrowing `short` globals
|
||
// whose only assignments are 0/1. Treat as i8 load + appropriate
|
||
// mask — the underlying memory is still byte-sized.
|
||
if (MemVT == MVT::i1) {
|
||
SDValue ByteLd = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i16, Chain,
|
||
FoldedLo, MVT::i8,
|
||
Ld->getMemOperand());
|
||
SDValue Val = ByteLd;
|
||
if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) {
|
||
Val = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd,
|
||
DAG.getConstant(1, DL, MVT::i16));
|
||
} else if (ExtType == ISD::SEXTLOAD) {
|
||
// i1 sign-extend: bit 0 -> all bits. AND #1 then NEG.
|
||
SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd,
|
||
DAG.getConstant(1, DL, MVT::i16));
|
||
Val = DAG.getNode(ISD::SUB, DL, MVT::i16,
|
||
DAG.getConstant(0, DL, MVT::i16), Bit);
|
||
}
|
||
if (Op.getValueType() == MVT::i8)
|
||
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
|
||
return DAG.getMergeValues({Val, ByteLd.getValue(1)}, DL);
|
||
}
|
||
return DAG.getExtLoad(ExtType, DL, Op.getValueType(), Chain, FoldedLo,
|
||
MemVT, Ld->getMemOperand());
|
||
}
|
||
|
||
// ptr16 mode: address is i16, let the default selection handle it.
|
||
if (Ptr.getValueType() != MVT::i32)
|
||
return SDValue();
|
||
|
||
EVT MemVT = Ld->getMemoryVT();
|
||
// Widen i1 memVT to i8 (single-byte storage). getMemIntrinsicNode
|
||
// asserts memvt must be supported; i1 isn't.
|
||
if (MemVT == MVT::i1) MemVT = MVT::i8;
|
||
SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other);
|
||
// Try to peel a constant offset from Ptr and route through
|
||
// LD_PTR_OFF — folds `(ptr + K)` into the Y-register of `[E0],Y`,
|
||
// saving the i32 ADD's CLC/ADC carry chain. ~3 instr per access.
|
||
// See feedback_ptr32_deref_fold_layer1_mi.md.
|
||
// LD_PTR_OFF: deferred — the peel fires correctly but the resulting
|
||
// SDAG breaks the JSON-tokenizer + snprintf smoke tests in ways
|
||
// bisection didn't isolate. Stick with LD_PTR (no peel) here; the
|
||
// LowerStore peel for ST_PTR_OFF / STB_PTR_OFF keeps the store-side
|
||
// optimization. Future: route loads through a SDAG combine that
|
||
// runs post-LegalizeOps so we see the final REG_SEQUENCE shape.
|
||
SDValue Ops[] = { Chain, Ptr };
|
||
SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops,
|
||
MemVT, Ld->getMemOperand());
|
||
SDValue Val = LdNode;
|
||
// Byte memory access: mask the high byte for zextload, leave anyext.
|
||
// i1 memVT was widened to i8 above; the mask path is the same.
|
||
if (MemVT == MVT::i8) {
|
||
EVT OrigMemVT = Ld->getMemoryVT();
|
||
SDValue MaskC = DAG.getConstant(OrigMemVT == MVT::i1 ? 1 : 0xFF,
|
||
DL, MVT::i16);
|
||
if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
|
||
(OrigMemVT == MVT::i1 && Ld->getExtensionType() == ISD::EXTLOAD))
|
||
Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val, MaskC);
|
||
else if (Ld->getExtensionType() == ISD::SEXTLOAD)
|
||
Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val,
|
||
DAG.getValueType(MVT::i8));
|
||
}
|
||
// Narrow back to i8 if the consumer wanted i8.
|
||
if (VT == MVT::i8)
|
||
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
|
||
return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL);
|
||
}
|
||
|
||
// ZERO/SIGN/ANY_EXTEND i8/i16 -> i32: build a Wide32 from the i16
|
||
// payload and a 0 / sign-fill / undef high half.
|
||
SDValue W65816TargetLowering::LowerExtend(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDLoc DL(Op);
|
||
if (Op.getValueType() != MVT::i32)
|
||
return SDValue();
|
||
SDValue X = Op.getOperand(0);
|
||
// Promote i8 inputs to i16 first via the same opcode.
|
||
if (X.getValueType() == MVT::i8)
|
||
X = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X);
|
||
SDValue Lo = X;
|
||
SDValue Hi;
|
||
if (Op.getOpcode() == ISD::ZERO_EXTEND) {
|
||
Hi = DAG.getConstant(0, DL, MVT::i16);
|
||
} else if (Op.getOpcode() == ISD::SIGN_EXTEND) {
|
||
// Sign-fill via SRA #15 — uses our SRA15A pattern (4 insns) and
|
||
// stays i16-typed in both LHS and RHS, dodging the combiner's
|
||
// shift-amount-promote when ptr32 makes pointer-typed shift
|
||
// amounts i32.
|
||
Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
|
||
DAG.getConstant(15, DL, MVT::i16));
|
||
} else {
|
||
Hi = DAG.getUNDEF(MVT::i16);
|
||
}
|
||
return buildWide32(DAG, DL, Lo, Hi);
|
||
}
|
||
|
||
// SIGN_EXTEND_INREG i32 with inner type i1/i8/i16: sign-extend the low
|
||
// N bits of an i32 input to fill all 32 bits. The legalizer leaves
|
||
// this op alone when i32 is legal — but no tablegen pattern matches
|
||
// the i32 form, so without this Custom hook isel aborts with
|
||
// "Cannot select: sign_extend_inreg ... ValueType:i1" on shapes like
|
||
// `-(crc & 1ul)` in CRC32 loops.
|
||
//
|
||
// Strategy: for inner VT V (= i1 / i8 / i16), the low half's
|
||
// `sext_inreg` (already pattern-matched at i16) produces the signed
|
||
// i16 value — then sign-fill the high half via SRA #15 of the lo
|
||
// result.
|
||
SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op,
                                                   SelectionDAG &DAG) const {
  // Custom lowering for SIGN_EXTEND_INREG.
  //   Op operand 0: the value whose low InnerVT bits are to be sign-extended.
  //   Op operand 1: a VTSDNode naming InnerVT.
  // Returns a fully lowered tree for i16 and i32 results, and SDValue() for
  // any other result type.
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT ResVT = Op.getValueType();

  // i16 result. This case MUST be handled here rather than by returning
  // SDValue(): a Custom hook that returns SDValue() falls through to the
  // default Expand (an SRA/SHL chain), not to tablegen pattern matching.
  // The trees built below mirror the two existing tablegen patterns:
  //   (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1)
  //   (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80
  if (ResVT == MVT::i16) {
    const unsigned InnerBits = InnerVT.getScalarSizeInBits();

    // Inner type already as wide as the result: nothing to extend.
    if (InnerBits >= 16)
      return X;

    if (InnerVT == MVT::i1) {
      // 0 - (x & 1): yields 0x0000 or 0xFFFF.
      SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                DAG.getConstant(1, DL, MVT::i16));
      return DAG.getNode(ISD::SUB, DL, MVT::i16,
                         DAG.getConstant(0, DL, MVT::i16), Bit);
    }

    // General case for any inner width W in [2, 15]:
    //   ((x & mask) ^ sign) - sign, mask = 2^W - 1, sign = 2^(W-1).
    // For W == 8 this is exactly the 0xFF / 0x80 form above. Previously
    // every width other than 1 and 8 returned X unchanged, i.e. without
    // performing the sign extension at all.
    const uint64_t SignBit = uint64_t(1) << (InnerBits - 1);
    const uint64_t Mask = (SignBit << 1) - 1;
    SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X,
                                 DAG.getConstant(Mask, DL, MVT::i16));
    SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked,
                                DAG.getConstant(SignBit, DL, MVT::i16));
    return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored,
                       DAG.getConstant(SignBit, DL, MVT::i16));
  }

  if (ResVT != MVT::i32)
    return SDValue();

  // i32 result: project the input's low half (X is an i32 Wide32 here),
  // apply the inner-VT sext to that i16 half, then sign-fill the high half.
  SDValue Lo = extractWide32Lo(DAG, DL, X);
  if (InnerVT != MVT::i16) {
    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo,
                     DAG.getValueType(InnerVT));
  }
  // Hi = Lo >>s 15 replicates the sign bit across all 16 bits — the same
  // idiom LowerExtend uses for SIGN_EXTEND i16 -> i32.
  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
                           DAG.getConstant(15, DL, MVT::i16));
  return buildWide32(DAG, DL, Lo, Hi);
}
|
||
|
||
|
||
// TRUNCATE i32 -> i16: project sub_lo.
|
||
SDValue W65816TargetLowering::LowerTruncate(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDLoc DL(Op);
|
||
if (Op.getOperand(0).getValueType() != MVT::i32)
|
||
return SDValue();
|
||
if (Op.getValueType() == MVT::i16)
|
||
return extractWide32Lo(DAG, DL, Op.getOperand(0));
|
||
if (Op.getValueType() == MVT::i8) {
|
||
// i32 -> i16 -> i8. The i8 trunc pattern is COPY_TO_REGCLASS at MC
|
||
// level; the i16 sub_lo extract is the work.
|
||
SDValue Lo16 = extractWide32Lo(DAG, DL, Op.getOperand(0));
|
||
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16);
|
||
}
|
||
return SDValue();
|
||
}
|
||
|
||
// i32 Constant: split into two i16 constants and REG_SEQUENCE.
|
||
SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDLoc DL(Op);
|
||
if (Op.getValueType() != MVT::i32) return SDValue();
|
||
uint64_t V = cast<ConstantSDNode>(Op)->getZExtValue();
|
||
SDValue Lo = DAG.getConstant(V & 0xFFFFu, DL, MVT::i16);
|
||
SDValue Hi = DAG.getConstant((V >> 16) & 0xFFFFu, DL, MVT::i16);
|
||
return buildWide32(DAG, DL, Lo, Hi);
|
||
}
|
||
|
||
// ADD/SUB/AND/OR/XOR i32 -> per-half i16 op. ADDC/ADDE chain for ADD,
|
||
// SUBC/SUBE for SUB. AND/OR/XOR are independent halves.
|
||
SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDLoc DL(Op);
|
||
if (Op.getValueType() != MVT::i32)
|
||
return SDValue();
|
||
SDValue L = Op.getOperand(0);
|
||
SDValue R = Op.getOperand(1);
|
||
SDValue LL = extractWide32Lo(DAG, DL, L);
|
||
SDValue LH = extractWide32Hi(DAG, DL, L);
|
||
SDValue RL = extractWide32Lo(DAG, DL, R);
|
||
SDValue RH = extractWide32Hi(DAG, DL, R);
|
||
SDValue Lo, Hi;
|
||
switch (Op.getOpcode()) {
|
||
case ISD::AND:
|
||
Lo = DAG.getNode(ISD::AND, DL, MVT::i16, LL, RL);
|
||
Hi = DAG.getNode(ISD::AND, DL, MVT::i16, LH, RH);
|
||
break;
|
||
case ISD::OR:
|
||
Lo = DAG.getNode(ISD::OR, DL, MVT::i16, LL, RL);
|
||
Hi = DAG.getNode(ISD::OR, DL, MVT::i16, LH, RH);
|
||
break;
|
||
case ISD::XOR:
|
||
Lo = DAG.getNode(ISD::XOR, DL, MVT::i16, LL, RL);
|
||
Hi = DAG.getNode(ISD::XOR, DL, MVT::i16, LH, RH);
|
||
break;
|
||
case ISD::ADD: {
|
||
SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
|
||
SDValue Lo2 = DAG.getNode(ISD::ADDC, DL, VTs, LL, RL);
|
||
Lo = Lo2.getValue(0);
|
||
SDValue Carry = Lo2.getValue(1);
|
||
Hi = DAG.getNode(ISD::ADDE, DL, VTs, LH, RH, Carry).getValue(0);
|
||
break;
|
||
}
|
||
case ISD::SUB: {
|
||
SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
|
||
SDValue Lo2 = DAG.getNode(ISD::SUBC, DL, VTs, LL, RL);
|
||
Lo = Lo2.getValue(0);
|
||
SDValue Borrow = Lo2.getValue(1);
|
||
Hi = DAG.getNode(ISD::SUBE, DL, VTs, LH, RH, Borrow).getValue(0);
|
||
break;
|
||
}
|
||
default:
|
||
return SDValue();
|
||
}
|
||
return buildWide32(DAG, DL, Lo, Hi);
|
||
}
|
||
|
||
// Store companion to LowerLoad. For i32 addresses, dispatch to the
|
||
// 16-bit ST_PTR or the byte-truncating STB_PTR target node based on
|
||
// MemoryVT. For i16 addresses (ptr16 mode), bail out and let the
|
||
// existing STAptr / STBptr patterns match.
|
||
SDValue W65816TargetLowering::LowerStore(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
StoreSDNode *St = cast<StoreSDNode>(Op);
|
||
SDValue Chain = St->getChain();
|
||
SDValue Val = St->getValue();
|
||
SDValue Ptr = St->getBasePtr();
|
||
EVT MemVT = St->getMemoryVT();
|
||
SDLoc DL(Op);
|
||
|
||
// Const-int address (`*(volatile uint8*)0xC035 = v`): leave the SDAG
|
||
// alone so the tablegen pattern `(store Acc8, (iPTR imm))` →
|
||
// STA8long fires. Without this short-circuit the i32-pointer code
|
||
// below promotes the constant address into a Wide32 register pair
|
||
// and routes through STBptr32 ([dp],Y), which is 16 B / 30 cyc and
|
||
// (worse) bank-tracks DBR.
|
||
if (isa<ConstantSDNode>(Ptr))
|
||
return SDValue();
|
||
|
||
// i32 STORE: split into two halves. Critical: the per-half stores
|
||
// MUST go through the target-specific W65816ISD::ST_PTR node and not
|
||
// through plain ISD::STORE, otherwise the SDAG combiner's
|
||
// MergeConsecutiveStores re-combines them into a single i32 store
|
||
// that re-enters LowerStore — infinite loop, OOM in the combiner.
|
||
// For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular
|
||
// store-merger doesn't trip there because address splitting via
|
||
// ISD::ADD on i16 doesn't itself fan out into ptr-pair operations.
|
||
if (Val.getValueType() == MVT::i32) {
|
||
SDValue Lo = extractWide32Lo(DAG, DL, Val);
|
||
SDValue Hi = extractWide32Hi(DAG, DL, Val);
|
||
EVT PtrVT = Ptr.getValueType();
|
||
// ptr32 const-i32-addr fast path: `*(uint32_t*)0x5000 = v` should
|
||
// lower to two STAabs (DBR-relative, 5 cyc each) instead of two
|
||
// [dp],Y stores via ST_PTR. Detect Wide32-zero-hi Constant ptr,
|
||
// emit two i16 stores at TargetConstant:i32 addrs. TargetConstant
|
||
// (not Constant) so LowerI32Constant doesn't re-fire and recreate
|
||
// the REG_SEQUENCE. The STAabs timm pattern matches.
|
||
if (PtrVT == MVT::i32 && Ptr.getNode()->isMachineOpcode() &&
|
||
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
SDValue PtrLo, PtrHi;
|
||
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
|
||
if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
|
||
if (CIdx->getZExtValue() == llvm::sub_lo) PtrLo = Ptr.getOperand(i);
|
||
else if (CIdx->getZExtValue() == llvm::sub_hi) PtrHi = Ptr.getOperand(i);
|
||
}
|
||
}
|
||
auto *PtrHiC = dyn_cast_or_null<ConstantSDNode>(PtrHi);
|
||
auto *PtrLoC = dyn_cast_or_null<ConstantSDNode>(PtrLo);
|
||
if (PtrLoC && PtrHiC && PtrHiC->getZExtValue() == 0) {
|
||
uint64_t Base = PtrLoC->getZExtValue() & 0xFFFF;
|
||
SDValue PLo = DAG.getTargetConstant(Base, DL, MVT::i32);
|
||
SDValue PHi = DAG.getTargetConstant((Base + 2) & 0xFFFF, DL, MVT::i32);
|
||
SDValue StLo = DAG.getStore(Chain, DL, Lo, PLo,
|
||
St->getPointerInfo(),
|
||
St->getAlign(),
|
||
St->getMemOperand()->getFlags());
|
||
SDValue StHi = DAG.getStore(StLo, DL, Hi, PHi,
|
||
St->getPointerInfo().getWithOffset(2),
|
||
St->getAlign(),
|
||
St->getMemOperand()->getFlags());
|
||
return StHi;
|
||
}
|
||
}
|
||
SDValue Two = DAG.getConstant(2, DL, PtrVT);
|
||
SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
|
||
if (PtrVT == MVT::i32) {
|
||
// ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially
|
||
// chained. The combiner cannot merge target-opaque MemIntrinsic
|
||
// stores.
|
||
SDVTList VTs = DAG.getVTList(MVT::Other);
|
||
SDValue OpsLo[] = { Chain, Lo, Ptr };
|
||
SDValue StLo = DAG.getMemIntrinsicNode(
|
||
W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16,
|
||
St->getMemOperand());
|
||
SDValue OpsHi[] = { StLo, Hi, Ptr2 };
|
||
MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand(
|
||
St->getMemOperand(), 2, 2);
|
||
SDValue StHi = DAG.getMemIntrinsicNode(
|
||
W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi);
|
||
return StHi;
|
||
}
|
||
// ptr16 path — emit two regular i16 stores serially chained so the
|
||
// store-merger sees them as a 4-byte sequence (which it will likely
|
||
// leave alone since the resulting i32 store has no legal target
|
||
// pattern in ptr16 mode anyway).
|
||
SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr,
|
||
St->getPointerInfo(),
|
||
St->getAlign(),
|
||
St->getMemOperand()->getFlags());
|
||
SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2,
|
||
St->getPointerInfo().getWithOffset(2),
|
||
St->getAlign(),
|
||
St->getMemOperand()->getFlags());
|
||
return StHi;
|
||
}
|
||
|
||
// Optimization: if the store goes through a global address (raw
|
||
// GlobalAddress/ExternalSymbol, or a Wide32 built from Wrapper +
|
||
// WrapperBank of the same symbol), lower to a plain i16/i8 store
|
||
// through a single Wrapper@symbol so the tablegen pattern
|
||
// (store Acc8/Acc16, (W65816Wrapper tglobaladdr:$g))
|
||
// selects STA8abs / STAabs (DBR-relative). Our globals live in
|
||
// the load segment that crt0 pins to DBR, so abs-16 reaches them.
|
||
// This avoids the 14-byte [dp],y indirect-long path AND re-enables
|
||
// the STZ peephole that the indirect path defeats.
|
||
SDValue FoldedLo;
|
||
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
|
||
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
|
||
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
|
||
GA->getOffset()));
|
||
} else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Ptr)) {
|
||
FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
|
||
DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16));
|
||
} else if (Ptr.getNode()->isMachineOpcode() &&
|
||
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
SDValue PLo, PHi;
|
||
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
|
||
if (auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1))) {
|
||
if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i);
|
||
else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i);
|
||
}
|
||
}
|
||
if (PLo && PHi &&
|
||
PLo.getOpcode() == W65816ISD::Wrapper &&
|
||
PHi.getOpcode() == W65816ISD::WrapperBank) {
|
||
SDValue WLo = PLo.getOperand(0);
|
||
SDValue WHi = PHi.getOperand(0);
|
||
auto *GLo = dyn_cast<GlobalAddressSDNode>(WLo);
|
||
auto *GHi = dyn_cast<GlobalAddressSDNode>(WHi);
|
||
auto *ELo = dyn_cast<ExternalSymbolSDNode>(WLo);
|
||
auto *EHi = dyn_cast<ExternalSymbolSDNode>(WHi);
|
||
bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() &&
|
||
GLo->getOffset() == GHi->getOffset());
|
||
bool SameExtern = (ELo && EHi &&
|
||
StringRef(ELo->getSymbol()) == EHi->getSymbol());
|
||
if (SameGlobal || SameExtern)
|
||
FoldedLo = PLo;
|
||
}
|
||
}
|
||
if (FoldedLo) {
|
||
// Preserve memVT — original may be a truncating store (e.g.,
|
||
// i16 value into i8 memory). getStore picks memVT from Val's
|
||
// type, which can mismatch the original MachineMemOperand size.
|
||
if (MemVT == Val.getValueType()) {
|
||
return DAG.getStore(Chain, DL, Val, FoldedLo,
|
||
St->getPointerInfo(), St->getAlign(),
|
||
St->getMemOperand()->getFlags());
|
||
}
|
||
return DAG.getTruncStore(Chain, DL, Val, FoldedLo, MemVT,
|
||
St->getMemOperand());
|
||
}
|
||
|
||
// No i32 ptr → nothing for us to do; let the default ISD::STORE
|
||
// path handle it. (Also avoids accidentally wrapping an i16 ptr
|
||
// store into ST_PTR below, whose ptr operand must be i32.)
|
||
if (Ptr.getValueType() != MVT::i32)
|
||
return SDValue();
|
||
|
||
// The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap
|
||
// around STBptr32 narrows in memory. Promote i8 values to i16 with
|
||
// ANY_EXTEND — the inserter only writes one byte, so the high half
|
||
// is don't-care.
|
||
if (Val.getValueType() == MVT::i8)
|
||
Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
|
||
|
||
SDVTList VTs = DAG.getVTList(MVT::Other);
|
||
SDValue Base; uint16_t Off = 0;
|
||
if (peelPtr32Offset(DAG, DL, Ptr, Base, Off)) {
|
||
unsigned OffOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR_OFF)
|
||
: unsigned(W65816ISD::ST_PTR_OFF);
|
||
SDValue OffN = DAG.getTargetConstant(Off, DL, MVT::i16);
|
||
SDValue OpsOff[] = { Chain, Val, Base, OffN };
|
||
return DAG.getMemIntrinsicNode(OffOpc, DL, VTs, OpsOff, MemVT,
|
||
St->getMemOperand());
|
||
}
|
||
unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR)
|
||
: unsigned(W65816ISD::ST_PTR);
|
||
SDValue Ops[] = { Chain, Val, Ptr };
|
||
return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT,
|
||
St->getMemOperand());
|
||
}
|
||
|
||
// VAARG: load *ap, advance ap by sizeof(VT). Unlike the default
|
||
// expansion, we do NOT align ap to the type's preferred alignment —
|
||
// caller-pushed varargs land at byte-granular addresses (PHA from an
|
||
// odd S leaves the low byte at S+1 which is even, but our prologue's
|
||
// TSC-sequence can produce odd S, etc.). Aligning ap would skip the
|
||
// pushed value's low byte.
|
||
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
|
||
SDLoc DL(Op);
|
||
SDValue Chain = Op.getOperand(0);
|
||
SDValue VAListPtr = Op.getOperand(1);
|
||
EVT VT = Op.getValueType();
|
||
// ap (va_list) is `char *` on this target — i16 under ptr16, i32
|
||
// under ptr32. Load and store it at PtrVT so we don't truncate and
|
||
// lose the high half (under ptr32, hi=0 so the truncation read garbage
|
||
// back, then the i16 store wrote i16 over the lo half but left an
|
||
// unrelated value in the hi — silent miscompile of every variadic
|
||
// call on ptr32).
|
||
EVT PtrVT = VAListPtr.getValueType();
|
||
SDValue Ap = DAG.getLoad(PtrVT, DL, Chain, VAListPtr,
|
||
MachinePointerInfo());
|
||
Chain = Ap.getValue(1);
|
||
// For the actual data deref: under ptr16 we route i16 through
|
||
// VAARG_LOAD (bank-0-explicit `[dp],Y`). Under ptr32, ap is already
|
||
// a Wide32 ptr with hi=0 (caller set up the va_list to point into the
|
||
// call-frame stack-args region, bank 0); a regular load through that
|
||
// pointer routes to LDAptr32 / STBptr32 which already deref bank-0.
|
||
SDValue Val;
|
||
if (VT == MVT::i16 && PtrVT == MVT::i16) {
|
||
SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other);
|
||
Val = DAG.getNode(W65816ISD::VAARG_LOAD, DL, VTs, Chain, Ap);
|
||
Chain = Val.getValue(1);
|
||
} else {
|
||
Val = DAG.getLoad(VT, DL, Chain, Ap, MachinePointerInfo());
|
||
Chain = Val.getValue(1);
|
||
}
|
||
// ap += sizeof(VT) (rounded up to whole bytes).
|
||
unsigned Size = (VT.getSizeInBits() + 7) / 8;
|
||
SDValue NewAp = DAG.getNode(ISD::ADD, DL, PtrVT, Ap,
|
||
DAG.getConstant(Size, DL, PtrVT));
|
||
Chain = DAG.getStore(Chain, DL, NewAp, VAListPtr, MachinePointerInfo());
|
||
return DAG.getMergeValues({Val, Chain}, DL);
|
||
}
|
||
|
||
// VASTART: store the address of the first vararg slot (recorded by
|
||
// LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer.
|
||
// va_list is just `i16 *next` here — minimum implementation.
|
||
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
|
||
const W65816TargetLowering &TLI) {
|
||
MachineFunction &MF = DAG.getMachineFunction();
|
||
auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
|
||
SDLoc DL(Op);
|
||
// FrameIndex must be at PtrVT (i16 under ptr16, i32 under ptr32) so
|
||
// the subsequent store writes the full pointer width. Under ptr32
|
||
// the i32 FI lowers via the i32 pointer-store path; the high half
|
||
// is implicitly 0 (stack is bank 0) and stored alongside the lo.
|
||
EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
|
||
SDValue VAFI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
|
||
PtrVT);
|
||
SDValue Chain = Op.getOperand(0);
|
||
SDValue VAListPtr = Op.getOperand(1);
|
||
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
|
||
return DAG.getStore(Chain, DL, VAFI, VAListPtr, MachinePointerInfo(SV));
|
||
}
|
||
|
||
SDValue W65816TargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Central dispatch for every opcode that reaches this Custom hook.
  // Most cases forward to a dedicated Lower* routine; a few small ones are
  // handled inline. An opcode with no case here is a programming error.

  // Builds an all-zero value of Op's result type: a plain i16 constant, or
  // a Wide32 pair of two zero halves for anything else.
  auto ZeroOfResultType = [&](const SDLoc &DL) -> SDValue {
    if (Op.getValueType() == MVT::i16)
      return DAG.getConstant(0, DL, MVT::i16);
    SDValue ZeroLo = DAG.getConstant(0, DL, MVT::i16);
    SDValue ZeroHi = DAG.getConstant(0, DL, MVT::i16);
    return buildWide32(DAG, DL, ZeroLo, ZeroHi);
  };

  switch (Op.getOpcode()) {
  // --- addresses, control flow, comparisons ---
  case ISD::GlobalAddress:  return LowerGlobalAddress(Op, DAG);
  case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
  case ISD::BR_CC:          return LowerBR_CC(Op, DAG);
  case ISD::BRIND:          return LowerBRIND(Op, DAG);
  case ISD::SETCC:          return LowerSETCC(Op, DAG);
  case ISD::SELECT_CC:      return LowerSELECT_CC(Op, DAG);

  case ISD::SELECT: {
    // An i32 SELECT is split into one i16 SELECT per half sharing the
    // condition. Per the original note, leaving it to the legalizer's
    // default rewrite (SELECT -> SELECT_CC against zero) yields an i32
    // SELECT_CC that re-enters Custom and cycles. Other types are left
    // alone.
    if (Op.getValueType() != MVT::i32)
      return SDValue();
    SDLoc DL(Op);
    SDValue Cond = Op.getOperand(0);
    SDValue IfTrue = Op.getOperand(1);
    SDValue IfFalse = Op.getOperand(2);
    SDValue TrueLo = extractWide32Lo(DAG, DL, IfTrue);
    SDValue TrueHi = extractWide32Hi(DAG, DL, IfTrue);
    SDValue FalseLo = extractWide32Lo(DAG, DL, IfFalse);
    SDValue FalseHi = extractWide32Hi(DAG, DL, IfFalse);
    SDValue SelLo =
        DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TrueLo, FalseLo);
    SDValue SelHi =
        DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TrueHi, FalseHi);
    return buildWide32(DAG, DL, SelLo, SelHi);
  }

  // --- extensions and truncation ---
  case ISD::SIGN_EXTEND:
    // i32 results share LowerExtend with ZERO/ANY_EXTEND; narrower
    // results go to LowerSignExtend.
    return Op.getValueType() == MVT::i32 ? LowerExtend(Op, DAG)
                                         : LowerSignExtend(Op, DAG);
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return LowerExtend(Op, DAG);
  case ISD::SIGN_EXTEND_INREG:
    return LowerSignExtendInReg(Op, DAG);
  case ISD::TRUNCATE:
    return LowerTruncate(Op, DAG);

  // --- varargs ---
  case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
  case ISD::VAARG:   return LowerVAARG(Op, DAG);

  // --- arithmetic ---
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    return LowerShift(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return LowerI32Bin(Op, DAG);
  case ISD::MUL:
    return LowerMUL_I32(Op, DAG);

  // --- memory and constants ---
  case ISD::LOAD:     return LowerLoad(Op, DAG);
  case ISD::STORE:    return LowerStore(Op, DAG);
  case ISD::Constant: return LowerI32Constant(Op, DAG);

  // --- chain pass-throughs ---
  // EH_SJLJ_SETUP_DISPATCH: per the original note the dispatcher lives
  // entirely in the SJLJ runtime (_Unwind_SjLj_Resume + longjmp into the
  // function context's jmp_buf), so nothing is emitted here.
  // STACKRESTORE: a no-op.
  // Both simply hand back their incoming chain.
  case ISD::EH_SJLJ_SETUP_DISPATCH:
  case ISD::STACKRESTORE:
    return Op.getOperand(0);

  case ISD::TRAP:
  case ISD::DEBUGTRAP:
    // Wrap the incoming chain in a W65816ISD::TRAP node. Per the original
    // note, the W65816trap pattern selects BRK_pseudo, which the
    // AsmPrinter expands to sentinel-store + BRK + self-loop. Threading
    // the chain keeps the trap ordered after any prior store.
    return DAG.getNode(W65816ISD::TRAP, SDLoc(Op), MVT::Other,
                       Op.getOperand(0));

  case ISD::DYNAMIC_STACKALLOC:
    return LowerDynamicStackalloc(Op, DAG);

  case ISD::STACKSAVE: {
    // Yields a constant 0 plus the incoming chain. SJLJ stores the value
    // in the function context, but setjmp/longjmp manage SP directly
    // here, so (per the original note) the value is dead.
    SDLoc DL(Op);
    SDValue InChain = Op.getOperand(0);
    SDValue Saved = ZeroOfResultType(DL);
    return DAG.getMergeValues({Saved, InChain}, DL);
  }

  case ISD::FRAMEADDR: {
    // FRAMEADDR(N): always a constant 0. No frame pointer is reserved and
    // SP has no register class to copy from. Per the original note, SJLJ
    // treats the value as an opaque per-frame identifier and tracks
    // frames by jmp_buf chaining (FnCtx::prev), so a constant suffices
    // for single-throw / non-nested-catch programs; true multi-frame SJLJ
    // would need a TSC-based unique value.
    SDLoc DL(Op);
    return ZeroOfResultType(DL);
  }

  default:
#ifndef NDEBUG
    Op.dump();
#endif
    llvm_unreachable("W65816: unexpected operation in LowerOperation");
  }
}
|
||
|
||
std::pair<unsigned, const TargetRegisterClass *>
W65816TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  // Maps the inline-asm constraints "a", "x", "y" and "r" (bare or in the
  // braced long form, e.g. "{a}") to a physical register plus register
  // class. "r" resolves to the accumulator, exactly like "a". Anything
  // else is deferred to the generic TargetLowering implementation, which
  // receives the original, unstripped constraint.
  StringRef Name = Constraint;
  const bool Braced =
      Name.size() >= 2 && Name.front() == '{' && Name.back() == '}';
  if (Braced)
    Name = Name.drop_front().drop_back();

  // i8 selects the 8-bit classes; every other type (i16, and pointer
  // types per the original note) uses the 16-bit ones.
  const bool IsByte = VT == MVT::i8;
  const TargetRegisterClass *AccRC =
      IsByte ? &W65816::Acc8RegClass : &W65816::Acc16RegClass;
  const TargetRegisterClass *IdxRC =
      IsByte ? &W65816::Idx8RegClass : &W65816::Idx16RegClass;

  if (Name == "a" || Name == "r")
    return {W65816::A, AccRC};
  if (Name == "x")
    return {W65816::X, IdxRC};
  if (Name == "y")
    return {W65816::Y, IdxRC};

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
|
||
|
||
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
                                                     SelectionDAG &DAG) const {
  // (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain), implemented
  // with a single W65816ISD::ALLOCA node that takes (chain, i16 size) and
  // yields (i16 ptr, chain). The align operand is not consulted.
  //
  // Per the original notes: emitPrologue stashes the entry SP in DP $F4
  // when MFI.hasVarSizedObjects, the allocation itself is
  // `tsc; sec; sbc size; tcs; inc a`, and the epilogue restores SP from
  // $F4.
  //
  // Known limitation (from the original): any FrameIndex (local, spill
  // slot, parameter) accessed *after* the alloca reads from the wrong
  // stack-relative offset, because PEI bakes FI offsets relative to the
  // static-frame SP rather than the post-alloca SP. A real frame pointer
  // would fix this. Until then the only safe pattern is "VLA at the end
  // of the function, used immediately, no further FI access".
  SDLoc DL(Op);
  SDValue InChain = Op.getOperand(0);
  SDValue Bytes = Op.getOperand(1);
  const bool WidePtr = Op.getValueType() == MVT::i32;

  // Under ptr32 the size may itself be a Wide32 value; only its low i16
  // half is used (a VLA larger than 64KB would not fit the stack anyway).
  if (WidePtr && Bytes.getValueType() == MVT::i32)
    Bytes = extractWide32Lo(DAG, DL, Bytes);

  SDValue Alloca = DAG.getNode(W65816ISD::ALLOCA, DL,
                               DAG.getVTList(MVT::i16, MVT::Other),
                               InChain, Bytes);
  SDValue Result = Alloca.getValue(0);
  SDValue OutChain = Alloca.getValue(1);

  if (WidePtr) {
    // Widen the i16 pointer to Wide32 with a zero bank half (the stack is
    // always in bank 0).
    SDValue BankZero = DAG.getConstant(0, DL, MVT::i16);
    Result = buildWide32(DAG, DL, Result, BankZero);
  }
  return DAG.getMergeValues({Result, OutChain}, DL);
}
|
||
|
||
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
  // Custom lowering for SHL / SRL / SRA. In order:
  //   1. i8  : widen to i16, shift there, truncate back.
  //   2. i16 : selected constant amounts return Op unchanged.
  //   3. i32 : constant amounts 1..5 are expanded inline on the halves.
  //   4. everything else becomes a runtime libcall.
  const unsigned Opc = Op.getOpcode();
  const bool IsArith = Opc == ISD::SRA;
  const bool IsLogical = Opc == ISD::SHL || Opc == ISD::SRL;
  SDLoc DL(Op);

  // --- 1. i8 ---
  // SRA widens with SIGN_EXTEND so the sign survives; SHL/SRL widen with
  // ZERO_EXTEND. This reuses the i16 fast paths and libcalls, so no
  // separate i8 libcall set is needed. Per the original note, the DAG
  // combiner would otherwise narrow `(trunc (shl (zext X), K))` back to
  // an i8 shift and re-enter this hook forever; the
  // `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override elsewhere in
  // this file disables that combine.
  if (Op.getValueType() == MVT::i8) {
    SDValue Count = Op.getOperand(1);
    SDValue Wide =
        DAG.getNode(IsArith ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                    MVT::i16, Op.getOperand(0));
    SDValue WideCount = Count;
    if (Count.getValueType() != MVT::i16)
      WideCount = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Count);

    // i8 SRA by 7 is a pure sign fill: every result bit equals input bit
    // 7. After sign extension bit 15 == bit 7, so shifting the widened
    // value by 15 gives the same answer. Per the original note, that form
    // has a tight SRA15A pattern and avoids the __ashrhi3 libcall.
    if (IsArith) {
      auto *CountC = dyn_cast<ConstantSDNode>(Count);
      if (CountC && CountC->getZExtValue() == 7)
        WideCount = DAG.getConstant(15, DL, MVT::i16);
    }

    SDValue WideRes = DAG.getNode(Opc, DL, MVT::i16, Wide, WideCount);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, WideRes);
  }

  SDValue Amount = Op.getOperand(1);
  auto *AmountC = dyn_cast<ConstantSDNode>(Amount);

  // --- 2. i16 with a constant amount ---
  // Returning Op itself tells the legalizer to leave the node alone so it
  // can be matched later; returning SDValue() would instead trigger the
  // generic Expand path, which (per the original note) produces a
  // BUILD_VECTOR-based rewrite this target cannot lower. Accepted here:
  // SHL/SRL by 1..15 and SRA by 1 or 15 — presumably each has an inline
  // selection pattern. Everything else continues to the libcall.
  if (Op.getValueType() == MVT::i16 && AmountC) {
    const uint64_t N = AmountC->getZExtValue();
    if (IsLogical && N >= 1 && N <= 15)
      return Op;
    if (IsArith && (N == 1 || N == 15))
      return Op;
  }

  const bool IsI32 = Op.getValueType() == MVT::i32;

  // --- 3. i32 with a small constant amount ---
  // Inline expansion for N in 1..5. Per the original note the libcall
  // costs ~140 cycles versus ~30-90 for the inline form, and the N=5
  // cutoff was chosen empirically (djb2's `(h << 5) + h`).
  if (IsI32 && AmountC) {
    const uint64_t N = AmountC->getZExtValue();
    if (N >= 1 && N <= 5 && (IsLogical || IsArith)) {
      SDValue Src = Op.getOperand(0);
      SDValue SrcLo = extractWide32Lo(DAG, DL, Src);
      SDValue SrcHi = extractWide32Hi(DAG, DL, Src);
      SDValue ByN = DAG.getConstant(N, DL, MVT::i16);
      SDValue ByRest = DAG.getConstant(16 - N, DL, MVT::i16);

      if (Opc == ISD::SHL) {
        // (Hi:Lo) << N  ==  ((Hi << N) | (Lo >> (16-N))) : (Lo << N)
        SDValue OutLo = DAG.getNode(ISD::SHL, DL, MVT::i16, SrcLo, ByN);
        SDValue Spill = DAG.getNode(ISD::SRL, DL, MVT::i16, SrcLo, ByRest);
        SDValue HiPart = DAG.getNode(ISD::SHL, DL, MVT::i16, SrcHi, ByN);
        SDValue OutHi = DAG.getNode(ISD::OR, DL, MVT::i16, HiPart, Spill);
        return buildWide32(DAG, DL, OutLo, OutHi);
      }

      // SRL / SRA by N:
      //   OutHi = Hi >> N (logical or arithmetic, matching the opcode)
      //   OutLo = (Lo >>u N) | (Hi << (16-N))
      SDValue OutHi = DAG.getNode(Opc, DL, MVT::i16, SrcHi, ByN);
      SDValue Spill = DAG.getNode(ISD::SHL, DL, MVT::i16, SrcHi, ByRest);
      SDValue LoPart = DAG.getNode(ISD::SRL, DL, MVT::i16, SrcLo, ByN);
      SDValue OutLo = DAG.getNode(ISD::OR, DL, MVT::i16, LoPart, Spill);
      return buildWide32(DAG, DL, OutLo, OutHi);
    }
  }

  // --- 4. libcall ---
  RTLIB::Libcall LC;
  if (Opc == ISD::SHL)
    LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16;
  else if (Opc == ISD::SRL)
    LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16;
  else if (Opc == ISD::SRA)
    LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16;
  else
    llvm_unreachable("not a shift");

  SDValue Val = Op.getOperand(0);
  if (IsI32 && Opc == ISD::SHL && AmountC) {
    // For an i32 SHL by a constant K >= 16 the input's high half cannot
    // affect the result, so SDAG legalisation may have left it undef. The
    // libcall (a true 32-bit loop in libgcc.s, per the original note)
    // reads all 32 input bits and would propagate that garbage, so the
    // high half is forced to a concrete zero. Originally caught via
    // `(u64 e) << 52` splitting into __ashlsi3(e_lo, 20). SRL/SRA would
    // need the mirror-image fix on the LOW half but are not handled here.
    const unsigned K = static_cast<unsigned>(AmountC->getZExtValue());
    if (K >= 16) {
      SDValue KeepLo = extractWide32Lo(DAG, DL, Val);
      SDValue ZeroHi = DAG.getConstant(0, DL, MVT::i16);
      Val = buildWide32(DAG, DL, KeepLo, ZeroHi);
    }
  }

  SDValue CallArgs[] = {Val, Amount};
  TargetLowering::MakeLibCallOptions Opts;
  Opts.setIsSigned(IsArith);
  return makeLibCall(DAG, LC, Op.getValueType(), CallArgs, Opts, DL).first;
}
|
||
|
||
SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  // Wraps a GlobalAddress in target nodes. The pointer type is i16 in
  // ptr16 mode and i32 in ptr32 mode.
  auto *Global = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(Op);
  EVT PtrVT = Op.getValueType();

  if (PtrVT != MVT::i32) {
    // Narrow pointer: a single Wrapper around the target address.
    SDValue Sym = DAG.getTargetGlobalAddress(Global->getGlobal(), DL, PtrVT,
                                             Global->getOffset());
    return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Sym);
  }

  // i32 pointer: build a Wide32 from two views of the same i16 target
  // address. Per the original note:
  //   Lo = Wrapper(target)     -> fixup_16     (offset bytes)
  //   Hi = WrapperBank(target) -> fixup_bank16 (bank byte + 0 pad)
  // The linker / OMF Loader patch both halves, so the runtime pointer
  // reflects the segment's actual placement rather than the link-time
  // text base. This fixed the "ldx #0 is hardcoded" bug that broke
  // toolbox-call pointer arguments.
  SDValue Sym = DAG.getTargetGlobalAddress(Global->getGlobal(), DL, MVT::i16,
                                           Global->getOffset());
  SDValue OffsetHalf = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Sym);
  SDValue BankHalf = DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Sym);
  return buildWide32(DAG, DL, OffsetHalf, BankHalf);
}
|
||
|
||
SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // ExternalSymbol counterpart of LowerGlobalAddress: a single Wrapper for
  // narrow pointers, or a Wide32 of Wrapper (low half) + WrapperBank
  // (high half) over the same i16 target symbol for i32 pointers.
  auto *Sym = cast<ExternalSymbolSDNode>(Op);
  SDLoc DL(Op);
  EVT PtrVT = Op.getValueType();

  if (PtrVT != MVT::i32) {
    SDValue Target = DAG.getTargetExternalSymbol(Sym->getSymbol(), PtrVT);
    return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Target);
  }

  SDValue Target = DAG.getTargetExternalSymbol(Sym->getSymbol(), MVT::i16);
  SDValue OffsetHalf = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Target);
  SDValue BankHalf =
      DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, Target);
  return buildWide32(DAG, DL, OffsetHalf, BankHalf);
}
|
||
|
||
/// Materialise the incoming arguments of the current function.
///
/// ABI (callee view):
///   - The first i8/i16 argument arrives in A. A first argument that is an
///     i32 arrives in A (low half) and X (high half), either as one legal
///     i32 InputArg or as two i16 InputArgs that share OrigArgIndex 0.
///   - Every remaining argument piece was pushed by the caller right-to-left
///     and is read from a fixed stack object. i8/i16 pieces take a 2-byte
///     slot; i32 pieces take 4 bytes (low half at the lower offset).
///
/// Stack layout after JSL has pushed its 3 return-address bytes:
///
///   (high addr)  arg N-1
///                ...
///                first stack arg   <- (4,S)
///                ret-addr-bank     <- (3,S)
///                ret-addr-hi       <- (2,S)
///   (low addr)   ret-addr-lo       <- (1,S)
///
/// NOTE(review): an earlier version of this diagram placed the return-address
/// bytes one slot higher (bank at (4,S)), which contradicted both the
/// "first stack arg at (4,S)" statement and the StackOffset = 4 start used
/// below. The layout above follows the code; confirm against frame lowering.
///
/// For variadic functions the frame index of the slot just past the last
/// named stack argument is recorded in W65816MachineFunctionInfo.
///
/// \returns the incoming Chain unchanged; one SDValue per entry of \p Ins is
/// appended to \p InVals.
SDValue W65816TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // True when Ins[Idx] is a piece of the first IR-level argument and has the
  // requested legalised type.
  auto isArg0Piece = [&Ins](unsigned Idx, MVT PieceVT) {
    return Ins[Idx].VT == PieceVT && Ins[Idx].OrigArgIndex == 0;
  };

  // Split-i32 (legacy) first argument: two i16 InputArgs, both belonging to
  // original argument 0. The legal-i32 flavour (a single i32 InputArg) is
  // recognised inside the loop by its value type.
  const bool I32SplitFirstArg = Ins.size() >= 2 &&
                                isArg0Piece(0, MVT::i16) &&
                                isArg0Piece(1, MVT::i16);

  // i64 first argument. It shows up either as four i16 pieces of arg 0, or
  // (ptr32 with i64 legalised via an i32 split) as two i32 pieces of arg 0.
  // The flag selects the DP-backed Img16 class for the register-passed
  // pieces, which sidesteps a greedy-regalloc hazard where a TXA bridge
  // clobbers A before the A-resident piece has been spilled (observed as
  // `__adddf3(1.5, 2.5) -> 1.5`). The split-i32 first argument deliberately
  // does not get the same treatment because it pessimises simple functions
  // such as `int add32(int a, int b) { return a+b; }`.
  const bool QuadI16FirstArg =
      Ins.size() >= 4 && isArg0Piece(0, MVT::i16) &&
      isArg0Piece(1, MVT::i16) && isArg0Piece(2, MVT::i16) &&
      isArg0Piece(3, MVT::i16);
  const bool PairI32FirstArg = Ins.size() >= 2 &&
                               isArg0Piece(0, MVT::i32) &&
                               isArg0Piece(1, MVT::i32);
  const bool I64FirstArg = QuadI16FirstArg || PairI32FirstArg;

  // Create a fresh virtual register of class RC, mark it live-in from
  // PhysReg and return a CopyFromReg of it (chained on the entry Chain).
  auto liveInCopy = [&](auto PhysReg, const TargetRegisterClass *RC,
                        EVT ValVT) -> SDValue {
    Register VReg = RegInfo.createVirtualRegister(RC);
    RegInfo.addLiveIn(PhysReg, VReg);
    return DAG.getCopyFromReg(Chain, DL, VReg, ValVT);
  };

  // Offset of the next stack-passed slot. Starts at 4 to skip the 3
  // return-address bytes, then grows upward as stack arguments are consumed.
  unsigned StackOffset = 4;

  for (unsigned ArgIdx = 0, NumIns = Ins.size(); ArgIdx != NumIns; ++ArgIdx) {
    const MVT VT = Ins[ArgIdx].VT;
    const bool Supported =
        VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i32;
    if (!Supported)
      report_fatal_error("W65816: argument type not yet supported");

    if (ArgIdx == 0) {
      if (VT == MVT::i32) {
        // Whole-i32 first argument: low half live-in via A, high half via X.
        // The two halves are joined into a Wide32 value; the
        // W65816LowerWide32 pre-RA pass later rewrites Wide32 uses into
        // pairs of i16 operations so AX32 never reaches pair allocation.
        //
        // The high half always goes through Img16: an X-only (Idx16) vreg
        // collapses to Acc16 after that pass, so both halves would compete
        // for A and an entry TXA could destroy the low half before it is
        // saved. For an i64 first argument the low half is routed through
        // Img16 as well, for the same reason.
        const TargetRegisterClass *LoRC = I64FirstArg
                                              ? &W65816::Img16RegClass
                                              : &W65816::Acc16RegClass;
        SDValue Lo = liveInCopy(W65816::A, LoRC, MVT::i16);
        SDValue Hi = liveInCopy(W65816::X, &W65816::Img16RegClass, MVT::i16);
        InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
        continue;
      }

      // Scalar first argument in A. i8 uses the 8-bit accumulator class;
      // i16 uses Acc16, or Img16 when it is the lowest piece of an i64
      // first argument (see I64FirstArg above).
      const TargetRegisterClass *RC = &W65816::Acc8RegClass;
      if (VT == MVT::i16)
        RC = I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass;
      InVals.push_back(liveInCopy(W65816::A, RC, VT));
      continue;
    }

    if (ArgIdx == 1 && I32SplitFirstArg) {
      // Second i16 piece of the first argument: arrives in X. Uses Img16
      // for an i64 first argument; a plain split-i32 first argument stays on
      // Idx16 (rerouting it was found to break the
      // vprintf writeULong/__udivsi3 chain).
      const TargetRegisterClass *RC = I64FirstArg ? &W65816::Img16RegClass
                                                  : &W65816::Idx16RegClass;
      InVals.push_back(liveInCopy(W65816::X, RC, MVT::i16));
      continue;
    }

    if (VT == MVT::i32) {
      // i32 stack argument: 4 bytes, read as two i16 halves (low half at the
      // lower offset) and joined into a Wide32 value.
      int LoSlot =
          FrameInfo.CreateFixedObject(2, StackOffset, /*Immutable*/true);
      int HiSlot =
          FrameInfo.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true);
      StackOffset += 4;
      SDValue LoAddr = DAG.getFrameIndex(LoSlot, MVT::i16);
      SDValue HiAddr = DAG.getFrameIndex(HiSlot, MVT::i16);
      SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, LoAddr,
                               MachinePointerInfo::getFixedStack(MF, LoSlot));
      SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, HiAddr,
                               MachinePointerInfo::getFixedStack(MF, HiSlot));
      InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
      continue;
    }

    // i8/i16 stack argument. Both occupy a 2-byte slot: i8 is promoted to an
    // i16 slot (matching CC_W65816's CCPromoteToType) so the load can run in
    // the default 16-bit M mode without a SEP/REP wrap, and is truncated
    // back to i8 afterwards.
    const unsigned SlotSize = 2;
    int Slot =
        FrameInfo.CreateFixedObject(SlotSize, StackOffset, /*Immutable*/true);
    StackOffset += SlotSize;
    SDValue SlotAddr = DAG.getFrameIndex(Slot, MVT::i16);
    SDValue Loaded = DAG.getLoad(MVT::i16, DL, Chain, SlotAddr,
                                 MachinePointerInfo::getFixedStack(MF, Slot));
    if (VT == MVT::i8)
      Loaded = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Loaded);
    InVals.push_back(Loaded);
  }

  // Variadic functions: StackOffset now points just past the last named
  // stack argument, i.e. at the caller's first variadic argument. Record a
  // frame index for that slot so VASTART can use it as the va_list start.
  if (IsVarArg) {
    int VarArgsSlot =
        FrameInfo.CreateFixedObject(2, StackOffset, /*Immutable=*/true);
    MF.getInfo<W65816MachineFunctionInfo>()->setVarArgsFrameIndex(VarArgsSlot);
  }

  return Chain;
}
|
||
|
||
/// Lower an outgoing call.
///
/// Argument ABI (caller view): argument 0 goes in A (an i32 first argument
/// uses A for the low half and X for the high half); all remaining argument
/// pieces are pushed in REVERSE order so the callee finds the first stack
/// argument at `(4,S)`, the next at `(6,S)`, and so on. CALLSEQ_START /
/// CALLSEQ_END carry the pushed byte count; the matching ADJCALLSTACKUP is
/// expanded to `tsc;clc;adc #N;tcs` by eliminateCallFramePseudoInstr.
///
/// Return ABI: up to four 16-bit halves, read back from A, X, Y and DPF0 in
/// that order (i8/i16 in A; i32 in A:X; i64 in A:X:Y plus the DP slot). See
/// LowerReturn for the producer side.
///
/// Tail calls are never emitted; CLI.IsTailCall is cleared.
///
/// \returns the output chain; one SDValue per entry of CLI.Ins is appended to
/// \p InVals.
SDValue
W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  auto &OutArgs = CLI.Outs;
  auto &OutArgVals = CLI.OutVals;
  auto &RetParts = CLI.Ins;

  // No tail-call support: always lower as a regular call.
  CLI.IsTailCall = false;

  if (RetParts.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Indirect calls (function pointers) go through the runtime trampoline
  // `__jsl_indir`, because the 65816 has no JSL-indirect. The dynamic target
  // is stored to a fixed bank-0 slot ($00:00B8, see libgcc.s) and the
  // trampoline performs `JMP ($00B8)`. JMP (abs) always reads its vector from
  // bank 0, so anchoring the slot there keeps dispatch working when the
  // GS/OS Loader / GNO place the program outside bank 0. The *target's* code
  // is still assumed to live in the current program bank (JMP indirect keeps
  // PBR).
  const bool IsDirect = isa<GlobalAddressSDNode>(Callee) ||
                        isa<ExternalSymbolSDNode>(Callee);
  if (!IsDirect) {
    // Constant-address store; intended to match the tblgen pattern
    // (store Acc16, (iPTR imm:$addr)) -> STA_Long $0000B8, which carries an
    // explicit bank and ignores DBR.
    SDValue SlotAddr =
        DAG.getConstant(0xB8, DL, getPointerTy(DAG.getDataLayout()));
    Chain = DAG.getStore(Chain, DL, Callee, SlotAddr, MachinePointerInfo());
    // The JSL itself targets the trampoline.
    Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16);
  }

  for (const ISD::OutputArg &Desc : OutArgs) {
    const bool Supported = Desc.VT == MVT::i16 || Desc.VT == MVT::i8 ||
                           Desc.VT == MVT::i32;
    if (!Supported)
      report_fatal_error("W65816: argument type not yet supported");
  }

  // First-argument flavours:
  //   - whole i32 (i32 is legal): OutArgs[0] is i32; lo -> A, hi -> X.
  //   - split i32 (legacy): OutArgs[0] and OutArgs[1] are i16 pieces of
  //     original argument 0; lo -> A, hi -> X.
  // Anything else puts only OutArgs[0] in A.
  const bool I32WholeFirstArg =
      !OutArgs.empty() && OutArgs[0].VT == MVT::i32;
  const bool I32SplitFirstArg =
      OutArgs.size() >= 2 && OutArgs[0].VT == MVT::i16 &&
      OutArgs[1].VT == MVT::i16 && OutArgs[0].OrigArgIndex == 0 &&
      OutArgs[1].OrigArgIndex == 0;
  const unsigned FirstStackArg =
      (!I32WholeFirstArg && I32SplitFirstArg) ? 2 : 1;

  // Stack footprint: i8 is promoted to a 2-byte slot (matching
  // LowerFormalArguments and CC_W65816's CCPromoteToType<i16>) and i32 takes
  // 4 bytes (two pushes).
  unsigned StackBytes = 0;
  for (unsigned I = FirstStackArg, N = OutArgs.size(); I < N; ++I)
    StackBytes += (OutArgs[I].VT == MVT::i32) ? 4 : 2;

  Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);

  // Everything from here to the CALL is threaded through Chain and Glue.
  SDValue Glue;

  // Copy Val into PhysReg, advancing Chain and Glue.
  auto copyToPhys = [&](auto PhysReg, SDValue Val) {
    Chain = DAG.getCopyToReg(Chain, DL, PhysReg, Val, Glue);
    Glue = Chain.getValue(1);
  };

  // True when V is a CopyFromReg of X itself, or of an Idx16 virtual
  // register that is a function live-in of X (e.g. the forwarded high half
  // of an i32 first argument).
  auto isIncomingX = [&](SDValue V) -> bool {
    if (V.getOpcode() != ISD::CopyFromReg)
      return false;
    auto *RegN = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode());
    if (!RegN)
      return false;
    Register R = RegN->getReg();
    if (R.isPhysical())
      return R == W65816::X;
    if (!R.isVirtual())
      return false;
    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    if (MRI.getRegClass(R) != &W65816::Idx16RegClass)
      return false;
    for (const auto &LiveIn : MRI.liveins())
      if (LiveIn.second == R && LiveIn.first == W65816::X)
        return true;
    return false;
  };

  // Push one 16-bit value. Normally the value goes through A (PUSH); a value
  // that already lives in X is pushed straight from X (PUSH_X), which saves
  // the TXA + A-spill round trip.
  auto pushI16 = [&](SDValue V) {
    unsigned PushOpc;
    if (isIncomingX(V)) {
      copyToPhys(W65816::X, V);
      PushOpc = W65816ISD::PUSH_X;
    } else {
      copyToPhys(W65816::A, V);
      PushOpc = W65816ISD::PUSH;
    }
    Chain = DAG.getNode(PushOpc, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                        Chain, Glue);
    Glue = Chain.getValue(1);
  };

  // Push stack-passed pieces last-to-first so piece FirstStackArg ends up at
  // the lowest post-JSL stack-relative offset.
  for (unsigned I = OutArgs.size(); I-- > FirstStackArg;) {
    SDValue V = OutArgVals[I];
    if (OutArgs[I].VT == MVT::i32) {
      // High half first (higher address), then low half (lower address,
      // which is where the callee starts reading the i32).
      SDValue Lo = extractWide32Lo(DAG, DL, V);
      SDValue Hi = extractWide32Hi(DAG, DL, V);
      pushI16(Hi);
      pushI16(Lo);
      continue;
    }
    if (OutArgs[I].VT == MVT::i8)
      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
    pushI16(V);
  }

  // Register-passed first argument. X is written before A in both i32
  // flavours. The whole-i32 value is split into halves and copied to A/X
  // separately so AX32 never appears in the MIR (see W65816LowerWide32).
  if (I32WholeFirstArg) {
    SDValue Lo = extractWide32Lo(DAG, DL, OutArgVals[0]);
    SDValue Hi = extractWide32Hi(DAG, DL, OutArgVals[0]);
    copyToPhys(W65816::X, Hi);
    copyToPhys(W65816::A, Lo);
  } else {
    if (I32SplitFirstArg)
      copyToPhys(W65816::X, OutArgVals[1]);
    if (!OutArgVals.empty())
      copyToPhys(W65816::A, OutArgVals[0]);
  }

  // The callee operand must have the pointer type (i16 in ptr16, i32 in
  // ptr32) because the CALL node's operand profile (SDT_W65816Call) is iPTR;
  // a hardcoded i16 would mismatch under p:32:16.
  EVT CalleeVT = getPointerTy(DAG.getDataLayout());
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee))
    Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, CalleeVT);
  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), CalleeVT);

  // CALL operands: chain, callee, the argument registers in use, then glue.
  SmallVector<SDValue, 4> CallOps = {Chain, Callee};
  if (I32WholeFirstArg) {
    CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
    CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
  } else if (!OutArgVals.empty()) {
    CallOps.push_back(DAG.getRegister(W65816::A, OutArgs[0].VT));
    if (I32SplitFirstArg)
      CallOps.push_back(DAG.getRegister(W65816::X, OutArgs[1].VT));
  }
  if (Glue.getNode())
    CallOps.push_back(Glue);

  Chain = DAG.getNode(W65816ISD::CALL, DL,
                      DAG.getVTList(MVT::Other, MVT::Glue), CallOps);
  Glue = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
  Glue = Chain.getValue(1);

  // Read the return value(s) back. First flatten the expected return parts
  // into a list of halves: each i32 contributes two i16 halves, i8/i16 pass
  // through as-is. A single whole-i32 return needs no special case: it
  // flattens to two i16 halves read from A (lo) and X (hi), so AX32 is never
  // used in the SDAG / MIR (see W65816LowerWide32).
  SmallVector<MVT, 4> HalfVTs;
  for (const ISD::InputArg &Part : RetParts) {
    const MVT VT = Part.VT;
    if (VT == MVT::i32) {
      HalfVTs.push_back(MVT::i16);
      HalfVTs.push_back(MVT::i16);
    } else if (VT == MVT::i16 || VT == MVT::i8) {
      HalfVTs.push_back(VT);
    } else {
      report_fatal_error("W65816: return half must be i8/i16/i32");
    }
  }
  if (HalfVTs.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Halves come back in A, X, Y, DPF0, in that order.
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};
  SmallVector<SDValue, 4> Halves;
  for (unsigned I = 0, N = HalfVTs.size(); I != N; ++I) {
    SDValue Half = DAG.getCopyFromReg(Chain, DL, RetRegs[I], HalfVTs[I], Glue);
    Chain = Half.getValue(1);
    Glue = Half.getValue(2);
    Halves.push_back(Half);
  }

  // Re-pack the halves into the shape the caller expects: an i32 part
  // consumes two consecutive halves (lo, hi); i8/i16 parts consume one.
  unsigned NextHalf = 0;
  for (const ISD::InputArg &Part : RetParts) {
    if (Part.VT == MVT::i32) {
      InVals.push_back(
          buildWide32(DAG, DL, Halves[NextHalf], Halves[NextHalf + 1]));
      NextHalf += 2;
    } else {
      InVals.push_back(Halves[NextHalf]);
      NextHalf += 1;
    }
  }
  return Chain;
}
|
||
|
||
/// Lower a function return.
///
/// Return ABI, expressed in 16-bit halves:
///   i8/i16: value in A.
///   i32:    low half in A, high half in X.
///   i64:    halves in A, X, Y and a fixed direct-page slot at $F0..$F1
///           (reached through the DPF0 pseudo physreg).
///   wider:  not supported (fatal error).
///
/// Type legalisation hands us an i32 as two consecutive i16 Outs and an i64
/// as four. When i32 is a legal type an Outs entry may itself be i32; each
/// such entry is expanded into its two i16 halves so the A/X/Y/DPF0 scheme
/// keeps working for multi-half returns (i64 as 2 x i32, a struct of two
/// longs, and so on).
///
/// \returns the RET_GLUE node that terminates the function.
SDValue W65816TargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // Flatten the returned parts into a list of halves plus their types.
  SmallVector<MVT, 4> HalfVTs;
  SmallVector<SDValue, 4> HalfVals;
  for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
    switch (Outs[I].VT.SimpleTy) {
    case MVT::i32: {
      HalfVTs.push_back(MVT::i16);
      HalfVTs.push_back(MVT::i16);
      HalfVals.push_back(extractWide32Lo(DAG, DL, OutVals[I]));
      HalfVals.push_back(extractWide32Hi(DAG, DL, OutVals[I]));
      break;
    }
    case MVT::i16:
    case MVT::i8:
      HalfVTs.push_back(Outs[I].VT);
      HalfVals.push_back(OutVals[I]);
      break;
    default:
      report_fatal_error("W65816: return half must be i8/i16/i32");
    }
  }
  if (HalfVTs.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  SDValue Glue;
  SmallVector<SDValue, 8> RetOps(1, Chain);

  // Copy Val into PhysReg, advancing Chain and Glue.
  auto copyToPhys = [&](auto PhysReg, SDValue Val) {
    Chain = DAG.getCopyToReg(Chain, DL, PhysReg, Val, Glue);
    Glue = Chain.getValue(1);
  };

  // Single whole-i32 return. The value is split into lo/hi and each half is
  // copied to its own register (X first, then A); AX32 is deliberately kept
  // out of the SDAG (see W65816LowerWide32).
  if (Outs.size() == 1 && Outs[0].VT == MVT::i32) {
    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
    copyToPhys(W65816::X, Hi);
    copyToPhys(W65816::A, Lo);
    RetOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
    RetOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
    RetOps[0] = Chain;
    if (Glue.getNode())
      RetOps.push_back(Glue);
    return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
  }

  // General case: half I is returned in RetRegs[I].
  //
  // Emission order matters. The copies are issued highest half first
  // (DPF0, then Y, X, and A last) so each value can be routed through A,
  // the only ALU register, without conflict; the TAX/TAY emitted by
  // copyPhysReg preserve A, so the later low-half copies are not clobbered.
  //
  // The fourth half goes through the DPF0 pseudo physreg (lowered to
  // `STA $F0` by copyPhysReg) rather than a generic store: an ISD::STORE to
  // address 0xF0 was lowered to `sta (d,s),y`, an indirect through DBR that
  // silently misbehaved when DBR != 0, whereas `STA dp` uses D + dp and is
  // unaffected by DBR. Listing the register in RetOps keeps the COPY from
  // being stripped by DCE.
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};
  const unsigned NumHalves = HalfVals.size();
  for (unsigned I = NumHalves; I-- > 0;)
    copyToPhys(RetRegs[I], HalfVals[I]);
  for (unsigned I = 0; I != NumHalves; ++I)
    RetOps.push_back(DAG.getRegister(RetRegs[I], HalfVTs[I]));

  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
|
||
|
||
SDValue
|
||
W65816TargetLowering::PerformDAGCombine(SDNode *N,
|
||
DAGCombinerInfo &DCI) const {
|
||
// (shl i32 X, K) -> chain of K (add x, x) for small K. After type
|
||
// legalisation the i32 add splits via ADDC/ADDE pseudos which expand
|
||
// to native ASL/ROL + carry-chain — much cheaper than the type-
|
||
// legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick
|
||
// to compute the bit crossing the half boundary. Each ADD expands to
|
||
// ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for
|
||
// K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2.
|
||
// `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`)
|
||
// benefits the most. i16 SHL by 1..15 has dedicated ASLA16 patterns
|
||
// already, so we restrict the rewrite to i32+.
|
||
// (shl i32 X, K) -> ADD chain for small K — but only when i32 is
|
||
// ILLEGAL (i.e., gets type-split into i16 halves). When i32 is a
|
||
// legal type (Wide32 reg class for ptr32 mode), the rewrite cycles
|
||
// against LLVM's generic `(add x, x) -> (shl x, 1)` combine in the
|
||
// i64 → 2 i32 split path, hanging the legalizer.
|
||
// STORE / LOAD with ConstantSDNode ptr (e.g. `*(volatile uint8*)0xC035 = v`):
|
||
// wrap the immediate in a W65816ISD::WRAPPER (using a TargetGlobalAddress-
|
||
// like marker would be cleaner but we lack the symbol table). Re-issue
|
||
// the store/load with the same ptr but the constant marked TargetConstant
|
||
// — TargetConstant is opaque to LowerI32Constant, so it survives intact
|
||
// to ISel, where the existing tablegen pattern
|
||
// `(store Acc8, (iPTR imm)) -> STA8long`
|
||
// matches (`imm` accepts both Constant and TargetConstant). 4 B / 6 cyc
|
||
// bank-explicit `sta long` instead of 16 B / 30 cyc [dp],Y.
|
||
// Wide32-of-Wrapper-with-zero-hi → i16 Wrapper. Under p:32:16,
|
||
// LowerGlobalAddress builds GlobalAddress as a Wide32 reg pair
|
||
// `(REG_SEQUENCE Wrapper(off_i16), 0_i16)`. Stores/loads against
|
||
// this Wide32 ptr fall to the heavy [dp],Y path (16 B / 30 cyc)
|
||
// even when the bank half is the constant 0 — we want the cheap
|
||
// DBR-relative `sta g` / `lda g` (3 B / 5 cyc). Detect the shape
|
||
// and recombine the ptr to its 16-bit form so the existing
|
||
// tablegen `(store v, (Wrapper tglob))` → STAabs / `(load (Wrapper
|
||
// tglob))` → LDAabs patterns fire. Crucially, this is correct
|
||
// ONLY when bank=0 — under GS/OS Loader, DBR is set to our bank
|
||
// by crt0Gsos, so DBR-relative addressing reaches the same global.
|
||
// Returns either an i16 Wrapper (drop into i16 STAabs/LDAabs pattern)
|
||
// or a TargetConstant:i32 (for const-addr i16 stores so the timm
|
||
// pattern fires and produces STAabs). TargetConstant — not regular
|
||
// Constant — because LowerI32Constant only matches ISD::Constant; if
|
||
// we returned a fresh ConstantSDNode it would re-fire LowerI32Constant
|
||
// and produce another Wide32 REG_SEQUENCE → infinite combine loop.
|
||
auto unwrapWide32WithZeroHi = [&](SDValue Ptr) -> SDValue {
|
||
if (Ptr.getValueType() != MVT::i32) return SDValue();
|
||
if (!Ptr.getNode()->isMachineOpcode()) return SDValue();
|
||
if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
|
||
return SDValue();
|
||
SDValue Lo, Hi;
|
||
for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) {
|
||
auto *CIdx = dyn_cast<ConstantSDNode>(Ptr.getOperand(i + 1));
|
||
if (!CIdx) continue;
|
||
if (CIdx->getZExtValue() == llvm::sub_lo) Lo = Ptr.getOperand(i);
|
||
else if (CIdx->getZExtValue() == llvm::sub_hi) Hi = Ptr.getOperand(i);
|
||
}
|
||
if (!Lo || !Hi) return SDValue();
|
||
auto *HiC = dyn_cast<ConstantSDNode>(Hi);
|
||
if (!HiC || HiC->getZExtValue() != 0) return SDValue();
|
||
if (Lo.getOpcode() == W65816ISD::Wrapper) return Lo;
|
||
if (auto *LoC = dyn_cast<ConstantSDNode>(Lo)) {
|
||
// Recombine into a TargetConstant:i32 so the `(store v, (iPTR
|
||
// timm))` STAabs pattern fires. Returning an i16 Constant
|
||
// would create a malformed STORE node (Ptr type mismatch) and
|
||
// returning a regular Constant:i32 would re-trigger
|
||
// LowerI32Constant.
|
||
return DCI.DAG.getTargetConstant(LoC->getZExtValue(), SDLoc(Ptr),
|
||
MVT::i32);
|
||
}
|
||
return SDValue();
|
||
};
|
||
if (N->getOpcode() == ISD::STORE) {
|
||
auto *St = cast<StoreSDNode>(N);
|
||
EVT MemVT = St->getMemoryVT();
|
||
SDValue Ptr = St->getBasePtr();
|
||
// Skip i32 stores — LowerStore's i32 path has its own Wide32-zero-hi
|
||
// const-addr fast path that emits two i16 stores at separate
|
||
// TargetConstant addrs. Unwrapping here would short-circuit that
|
||
// and produce a malformed ADD(TargetConstant, Constant) when the
|
||
// hi-half store needs Ptr+2.
|
||
if (MemVT != MVT::i32) {
|
||
if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDLoc DL(N);
|
||
return DAG.getTruncStore(St->getChain(), DL, St->getValue(), I16Ptr,
|
||
MemVT, St->getMemOperand());
|
||
}
|
||
// Global+i16-idx fast path for STORES (companion to the LOAD
|
||
// branch below). Ptr = REG_SEQUENCE(ADDC(Wrapper, idx), ADDE(...)).
|
||
// Rewrite to CopyToReg($a, val) + CopyToReg($x, idx) + STA_AbsX.
|
||
if ((MemVT == MVT::i16 || MemVT == MVT::i8) &&
|
||
Ptr.getNode() && Ptr.isMachineOpcode() &&
|
||
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
|
||
if (Lo && Lo.getOpcode() == ISD::ADDC) {
|
||
auto lookThroughExtractSubLo = [](SDValue V) -> SDValue {
|
||
if (V.getNode() && V.isMachineOpcode() &&
|
||
V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) {
|
||
SDValue Src = V.getOperand(0);
|
||
if (Src.isMachineOpcode() &&
|
||
Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo))
|
||
return X;
|
||
}
|
||
}
|
||
return V;
|
||
};
|
||
SDValue A = lookThroughExtractSubLo(Lo.getOperand(0));
|
||
SDValue B = lookThroughExtractSubLo(Lo.getOperand(1));
|
||
auto isWrapperGlobal = [](SDValue V) {
|
||
if (V.getOpcode() != W65816ISD::Wrapper) return false;
|
||
unsigned Op = V.getOperand(0).getOpcode();
|
||
return Op == ISD::TargetGlobalAddress ||
|
||
Op == ISD::TargetExternalSymbol;
|
||
};
|
||
SDValue Sym, Idx;
|
||
if (isWrapperGlobal(A)) { Sym = A.getOperand(0); Idx = B; }
|
||
else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; }
|
||
if (Sym && Idx.getValueType() == MVT::i16) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDLoc DL(N);
|
||
SDValue Chain = St->getChain();
|
||
SDValue Val = St->getValue();
|
||
// STA8absX copies $a register at i16 width (M=0); the SEP
|
||
// wraps narrow it. Promote i8 stored value to i16.
|
||
if (Val.getValueType() == MVT::i8)
|
||
Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
|
||
SDValue Glue;
|
||
SDValue C1 = DAG.getCopyToReg(Chain, DL, W65816::X, Idx, Glue);
|
||
Glue = C1.getValue(1);
|
||
SDValue C2 = DAG.getCopyToReg(C1, DL, W65816::A, Val, Glue);
|
||
Glue = C2.getValue(1);
|
||
SDVTList StaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
||
unsigned Opc = (MemVT == MVT::i8) ? W65816::STA8absX
|
||
: W65816::STA_AbsX;
|
||
SDNode *Sta = DAG.getMachineNode(Opc, DL, StaVTs,
|
||
{Sym, C2, Glue});
|
||
return SDValue(Sta, 0);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
// i8 const-addr → STA8long (timm pattern); i16 const-addr →
|
||
// STAabs (timm pattern, DBR-relative). Wrap as TargetConstant so
|
||
// LowerI32Constant doesn't re-enter and break the const-pattern
|
||
// match. i32 stores split into 2 i16 stores via LowerStore so they
|
||
// come back through this combine as MemVT==i16.
|
||
if (MemVT != MVT::i8 && MemVT != MVT::i16) return SDValue();
|
||
if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDLoc DL(N);
|
||
SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
|
||
Ptr.getValueType());
|
||
return DAG.getTruncStore(St->getChain(), DL, St->getValue(), NewPtr,
|
||
MemVT, St->getMemOperand());
|
||
}
|
||
}
|
||
if (N->getOpcode() == ISD::LOAD) {
|
||
auto *Ld = cast<LoadSDNode>(N);
|
||
EVT MemVT = Ld->getMemoryVT();
|
||
EVT VT = Ld->getValueType(0);
|
||
SDValue Ptr = Ld->getBasePtr();
|
||
// Wide32-of-Wrapper-with-zero-hi → i16 Wrapper (companion to the
|
||
// STORE side just above). Lets `(load (Wrapper g))` → LDAabs fire.
|
||
// Skip i32 loads — LowerLoad's i32 path does its own Ptr+2 ADD
|
||
// arithmetic and would choke on a TargetConstant unwrap result.
|
||
if (MemVT != MVT::i32) {
|
||
if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDLoc DL(N);
|
||
return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
|
||
Ld->getChain(), I16Ptr, MemVT,
|
||
Ld->getMemOperand());
|
||
}
|
||
// Global+i16-idx fast path: Ptr is REG_SEQUENCE produced by
|
||
// LowerI32Bin from `(add (Wrapper sym) (zext i16 idx))`.
|
||
// sub_lo = ADDC(Wrapper, idx) — operands are TargetExtractSubreg
|
||
// wrapping each side's Wide32
|
||
// sub_hi = ADDE(0, 0, carry) — ignored (idx fits in 16 bits,
|
||
// so any carry stays in bank)
|
||
// Rewrite the LOAD to a CopyToReg($x, idx) + LDA_AbsX(sym)
|
||
// sequence. Saves ~45 bytes / ~70 cyc vs the 24-bit [dp],Y deref.
|
||
// Correct under the data-bank invariant (DBR = global's bank).
|
||
if ((MemVT == MVT::i16 || MemVT == MVT::i8) &&
|
||
Ptr.getNode() && Ptr.isMachineOpcode() &&
|
||
Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo);
|
||
if (Lo && Lo.getOpcode() == ISD::ADDC) {
|
||
auto lookThroughExtractSubLo = [](SDValue V) -> SDValue {
|
||
if (V.getNode() && V.isMachineOpcode() &&
|
||
V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) {
|
||
SDValue Src = V.getOperand(0);
|
||
if (Src.isMachineOpcode() &&
|
||
Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo))
|
||
return X;
|
||
}
|
||
}
|
||
return V;
|
||
};
|
||
SDValue A = lookThroughExtractSubLo(Lo.getOperand(0));
|
||
SDValue B = lookThroughExtractSubLo(Lo.getOperand(1));
|
||
auto isWrapperGlobal = [](SDValue V) {
|
||
if (V.getOpcode() != W65816ISD::Wrapper) return false;
|
||
unsigned Op = V.getOperand(0).getOpcode();
|
||
return Op == ISD::TargetGlobalAddress ||
|
||
Op == ISD::TargetExternalSymbol;
|
||
};
|
||
SDValue Sym, Idx;
|
||
if (isWrapperGlobal(A)) { Sym = A.getOperand(0); Idx = B; }
|
||
else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; }
|
||
if (Sym && Idx.getValueType() == MVT::i16) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDLoc DL(N);
|
||
SDValue Chain = Ld->getChain();
|
||
SDValue Glue;
|
||
SDValue NewChain = DAG.getCopyToReg(Chain, DL, W65816::X, Idx,
|
||
Glue);
|
||
Glue = NewChain.getValue(1);
|
||
SDVTList LdaVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
||
unsigned Opc = (MemVT == MVT::i8) ? W65816::LDA8absX
|
||
: W65816::LDA_AbsX;
|
||
SDNode *Lda = DAG.getMachineNode(Opc, DL, LdaVTs,
|
||
{Sym, NewChain, Glue});
|
||
SDValue LdaChain = SDValue(Lda, 0);
|
||
SDValue LdaGlue = SDValue(Lda, 1);
|
||
// Read $a back at the original LOAD's result VT. An i8 load with an
// i8 result simply reads i8. An i8 load widened to i16 (zext/sext/
// anyext) reads all 16 bits, but an 8-bit-mode (M=8) LDA only writes
// the low byte of $a, so the high byte is whatever $a held before the
// load. Reading $a immediately after the SEP/REP-wrapped load is safe
// wrt liveness; the stale high byte is fixed up below: masked off for
// ZEXTLOAD, overwritten with sign bits for SEXTLOAD, and left as-is
// for EXTLOAD (don't-care).
|
||
SDValue Val = DAG.getCopyFromReg(LdaChain, DL, W65816::A,
|
||
VT, LdaGlue);
|
||
SDValue Chain2 = Val.getValue(1);
|
||
if (MemVT == MVT::i8 && VT == MVT::i16) {
|
||
if (Ld->getExtensionType() == ISD::ZEXTLOAD) {
|
||
Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val,
|
||
DAG.getConstant(0xFF, DL, MVT::i16));
|
||
} else if (Ld->getExtensionType() == ISD::SEXTLOAD) {
|
||
Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16,
|
||
Val, DAG.getValueType(MVT::i8));
|
||
}
|
||
// EXTLOAD: high byte don't-care, leave alone.
|
||
}
|
||
return DAG.getMergeValues({Val, Chain2}, DL);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
// Only the i8 const-addr path has dedicated tablegen patterns
|
||
// (LDA8long); skip i16 const-addr loads (no LDAabs imm pattern)
|
||
// and i32 (would re-fire on the same node with different shape).
|
||
if (MemVT != MVT::i8 || (VT != MVT::i8 && VT != MVT::i16))
|
||
return SDValue();
|
||
if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDLoc DL(N);
|
||
SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL,
|
||
Ptr.getValueType());
|
||
return DAG.getExtLoad(Ld->getExtensionType(), DL, VT,
|
||
Ld->getChain(), NewPtr, MemVT,
|
||
Ld->getMemOperand());
|
||
}
|
||
}
|
||
|
||
if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 &&
|
||
!isTypeLegal(N->getValueType(0))) {
|
||
if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
|
||
uint64_t K = C->getZExtValue();
|
||
if (K >= 1 && K <= 2) {
|
||
SelectionDAG &DAG = DCI.DAG;
|
||
SDValue X = N->getOperand(0);
|
||
SDLoc DL(N);
|
||
EVT VT = N->getValueType(0);
|
||
SDValue R = X;
|
||
for (uint64_t i = 0; i < K; ++i)
|
||
R = DAG.getNode(ISD::ADD, DL, VT, R, R);
|
||
return R;
|
||
}
|
||
}
|
||
}
|
||
|
||
return SDValue();
|
||
}
|
||
|
||
// Custom-lowering for ISD::MUL i32. When both operands are ZEXT from
|
||
// i16 (or provably have high 16 bits = 0), emit a libcall to
|
||
// __umulhisi3 (16x16 -> 32) instead of the heavier __mulsi3 (32x32 ->
|
||
// 32). Saves the 32-bit arg marshaling AND the 32-bit accumulator
|
||
// math inside the libcall — roughly equivalent to Calypsi 5.16's
|
||
// `_Mul16`. Falls through to the standard __mulsi3 libcall otherwise.
|
||
SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op,
|
||
SelectionDAG &DAG) const {
|
||
SDLoc DL(Op);
|
||
EVT VT = Op.getValueType();
|
||
assert(VT == MVT::i32 && "LowerMUL_I32 expects i32");
|
||
SDValue Lhs = Op.getOperand(0);
|
||
SDValue Rhs = Op.getOperand(1);
|
||
|
||
auto narrowToI16 = [&](SDValue V) -> SDValue {
|
||
// Explicit zext-from-i16 (the IR-level form, before SDAG flattening).
|
||
if (V.getOpcode() == ISD::ZERO_EXTEND &&
|
||
V.getOperand(0).getValueType() == MVT::i16)
|
||
return V.getOperand(0);
|
||
// ANY_EXTEND-from-i16 is also fine since multiplication of the low
|
||
// 16 bits gives the same 32-bit result whatever the high bits were.
|
||
if (V.getOpcode() == ISD::ANY_EXTEND &&
|
||
V.getOperand(0).getValueType() == MVT::i16)
|
||
return V.getOperand(0);
|
||
// High 16 bits provably zero?
|
||
KnownBits K = DAG.computeKnownBits(V);
|
||
if (K.countMinLeadingZeros() >= 16)
|
||
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, V);
|
||
return SDValue();
|
||
};
|
||
|
||
// Mul-by-constant strength reduction: (X * K) where K-1 or K+1 is
|
||
// a small power of 2 (shift count 1..5, matching our inlined i32
|
||
// SHL range) expands to (X << N) +/- X — saves a __mulsi3 libcall
|
||
// (~250 cyc) for ~70 cyc of inlined shift+add. Catches djb2Hash's
|
||
// `h * 33` = (h << 5) + h.
|
||
//
|
||
// Patterns covered:
|
||
// K = 2^N + 1 in {3,5,9,17,33} → (X << N) + X
|
||
// K = 2^N - 1 in {7,15,31} → (X << N) - X
|
||
// Larger N hits the i32 SHL libcall path (no longer profitable).
|
||
if (auto *CN = dyn_cast<ConstantSDNode>(Rhs)) {
|
||
int64_t K = CN->getSExtValue();
|
||
for (unsigned N = 1; N <= 5; N++) {
|
||
int64_t Pow = int64_t{1} << N;
|
||
SDValue ShAmt = DAG.getConstant(N, DL, MVT::i16);
|
||
if (K == Pow + 1) {
|
||
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs, ShAmt);
|
||
return DAG.getNode(ISD::ADD, DL, MVT::i32, Shl, Lhs);
|
||
}
|
||
if (K == Pow - 1) {
|
||
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs, ShAmt);
|
||
return DAG.getNode(ISD::SUB, DL, MVT::i32, Shl, Lhs);
|
||
}
|
||
}
|
||
}
|
||
|
||
SDValue A = narrowToI16(Lhs);
|
||
SDValue B = narrowToI16(Rhs);
|
||
if (A && B) {
|
||
TargetLowering::ArgListTy Args;
|
||
Args.push_back({A, Type::getInt16Ty(*DAG.getContext())});
|
||
Args.push_back({B, Type::getInt16Ty(*DAG.getContext())});
|
||
SDValue Callee = DAG.getExternalSymbol(
|
||
"__umulhisi3", getPointerTy(DAG.getDataLayout()));
|
||
TargetLowering::CallLoweringInfo CLI(DAG);
|
||
CLI.setDebugLoc(DL)
|
||
.setChain(DAG.getEntryNode())
|
||
.setLibCallee(CallingConv::C,
|
||
Type::getInt32Ty(*DAG.getContext()),
|
||
Callee, std::move(Args));
|
||
auto [Ret, Chain] = LowerCallTo(CLI);
|
||
return Ret;
|
||
}
|
||
|
||
// Fall back to the standard __mulsi3 libcall.
|
||
TargetLowering::ArgListTy Args;
|
||
Args.push_back({Lhs, Type::getInt32Ty(*DAG.getContext())});
|
||
Args.push_back({Rhs, Type::getInt32Ty(*DAG.getContext())});
|
||
SDValue Callee = DAG.getExternalSymbol(
|
||
"__mulsi3", getPointerTy(DAG.getDataLayout()));
|
||
TargetLowering::CallLoweringInfo CLI(DAG);
|
||
CLI.setDebugLoc(DL)
|
||
.setChain(DAG.getEntryNode())
|
||
.setLibCallee(CallingConv::C,
|
||
Type::getInt32Ty(*DAG.getContext()),
|
||
Callee, std::move(Args));
|
||
auto [Ret, Chain] = LowerCallTo(CLI);
|
||
return Ret;
|
||
}
|
||
|
||
// Map a W65816CC code to the matching Bxx opcode.
|
||
static unsigned getBranchOpcodeForCC(unsigned CC) {
|
||
switch (CC) {
|
||
case W65816CC::COND_EQ: return W65816::BEQ;
|
||
case W65816CC::COND_NE: return W65816::BNE;
|
||
case W65816CC::COND_HS: return W65816::BCS;
|
||
case W65816CC::COND_LO: return W65816::BCC;
|
||
case W65816CC::COND_MI: return W65816::BMI;
|
||
case W65816CC::COND_PL: return W65816::BPL;
|
||
case W65816CC::COND_VS: return W65816::BVS;
|
||
case W65816CC::COND_VC: return W65816::BVC;
|
||
}
|
||
llvm_unreachable("invalid W65816 condition code");
|
||
}
|
||
|
||
// Two-branch expansion of a condition code that a single Bxx cannot
// test. `First` is tested first, then `Second`. A branch that is taken
// goes to TrueBB when its *ToTrue flag is set and to FalseBB otherwise;
// if neither branch is taken, control falls through to FalseBB.
//
//   GT : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB
//   LE : (BMI || BEQ) → BEQ TrueBB;  BMI TrueBB; fall-through FalseBB
//   HI : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB
//   LS : (BCC || BEQ) → BEQ TrueBB;  BCC TrueBB; fall-through FalseBB
struct MultiBranch {
  unsigned First;    // Opcode of the branch tested first.
  unsigned Second;   // Opcode of the branch tested second.
  bool FirstToTrue;  // First targets TrueBB when taken (else FalseBB).
  bool SecondToTrue; // Second targets TrueBB when taken (else FalseBB).
};
|
||
// Return the two-branch sequence for a multi-branch (*_MB) condition
// code, in {First, Second, FirstToTrue, SecondToTrue} order. Every
// entry tests BEQ first; the second branch and the BEQ target differ
// per code. Any other CC reaches llvm_unreachable.
static MultiBranch getMultiBranch(unsigned CC) {
  switch (CC) {
  case W65816CC::COND_GT_MB: // BEQ FalseBB; BPL TrueBB
    return {W65816::BEQ, W65816::BPL, false, true};
  case W65816CC::COND_LE_MB: // BEQ TrueBB; BMI TrueBB
    return {W65816::BEQ, W65816::BMI, true, true};
  case W65816CC::COND_HI_MB: // BEQ FalseBB; BCS TrueBB
    return {W65816::BEQ, W65816::BCS, false, true};
  case W65816CC::COND_LS_MB: // BEQ TrueBB; BCC TrueBB
    return {W65816::BEQ, W65816::BCC, true, true};
  }
  llvm_unreachable("not a multi-branch CC");
}
|
||
|
||
// Emit a two-Acc16 binary op as STAfi src2; OPfi dst, src1. Allocates
|
||
// a fresh 2-byte stack slot per call. For CMP (HasOut=false) there's
|
||
// no destination register, just the two src operands. Always spill
|
||
// the SECOND operand so non-commutative ops (sub, cmp) compute
|
||
// src1 OP src2 correctly via OPfi (which gives src1 OP load(spill)).
|
||
static MachineBasicBlock *
|
||
emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp,
|
||
unsigned OpFI, bool HasOut) {
|
||
MachineFunction *MF = BB->getParent();
|
||
const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
|
||
const W65816InstrInfo &TII = *STI.getInstrInfo();
|
||
DebugLoc DL = MI.getDebugLoc();
|
||
|
||
int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
|
||
/*isSpillSlot=*/true);
|
||
|
||
unsigned LhsIdx = HasOut ? 1 : 0;
|
||
unsigned RhsIdx = HasOut ? 2 : 1;
|
||
Register Src1 = MI.getOperand(LhsIdx).getReg();
|
||
Register Src2 = MI.getOperand(RhsIdx).getReg();
|
||
|
||
// Spill src2 (the rhs). Then OPfi computes src1 OP load(spill).
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(StoreOp))
|
||
.addReg(Src2)
|
||
.addFrameIndex(FI)
|
||
.addImm(0);
|
||
|
||
if (HasOut) {
|
||
Register Dst = MI.getOperand(0).getReg();
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI), Dst)
|
||
.addReg(Src1)
|
||
.addFrameIndex(FI)
|
||
.addImm(0);
|
||
} else {
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI))
|
||
.addReg(Src1)
|
||
.addFrameIndex(FI)
|
||
.addImm(0);
|
||
}
|
||
|
||
MI.eraseFromParent();
|
||
return BB;
|
||
}
|
||
|
||
MachineBasicBlock *
|
||
W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||
MachineBasicBlock *BB) const {
|
||
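// Expands pseudo-instructions marked usesCustomInserter=1: SELECT_CC16
// plus the other opcodes handled in the switch below (including the
// *_RR two-register ALU pseudos and the ptr32 load/store pseudos).
// SELECT_CC16 is expanded into a diamond CFG with a PHI. For
// single-branch CCs: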
|
||
//
|
||
// thisMBB:
|
||
// ... CMP already emitted ...
|
||
// Bxx sinkMBB ; branch to "true" path
|
||
// ; fall through to copy0MBB
|
||
// copy0MBB:
|
||
// ; (no instructions; PHI picks fval here)
|
||
// sinkMBB:
|
||
// dst = PHI [tval, thisMBB], [fval, copy0MBB]
|
||
//
|
||
// For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a
|
||
// single Bxx isn't enough), insert two branches. Both target either
|
||
// sinkMBB or copy0MBB depending on the condition.
|
||
switch (MI.getOpcode()) {
|
||
default:
|
||
llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter");
|
||
case W65816::ADD_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true);
|
||
case W65816::SUB_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true);
|
||
// Carry-chain variants for the hi half of an i32 split. STAfi doesn't
|
||
// touch P, so the carry from the previous addc/adde survives the
|
||
// spill and is consumed by ADCEfi/SBCEfi below.
|
||
case W65816::ADDE_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true);
|
||
case W65816::SUBE_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true);
|
||
case W65816::AND_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true);
|
||
case W65816::ORA_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true);
|
||
case W65816::EOR_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true);
|
||
case W65816::CMP_RR:
|
||
return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false);
|
||
case W65816::LDAptr32S:
|
||
case W65816::STAptr32S:
|
||
case W65816::STBptr32S: {
|
||
// Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of
|
||
// 1 Wide32 reg pair. Used by the W65816LowerWide32 pre-RA pass
|
||
// to dodge pair-allocation pressure. Otherwise identical to
|
||
// the LDAptr32 inserter below.
|
||
MachineFunction *MF = BB->getParent();
|
||
const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
|
||
const W65816InstrInfo &TII = *STI.getInstrInfo();
|
||
DebugLoc DL = MI.getDebugLoc();
|
||
bool IsLoad = MI.getOpcode() == W65816::LDAptr32S;
|
||
bool IsByteStore = MI.getOpcode() == W65816::STBptr32S;
|
||
Register PtrLo = MI.getOperand(IsLoad ? 1 : 1).getReg();
|
||
Register PtrHi = MI.getOperand(IsLoad ? 2 : 2).getReg();
|
||
|
||
int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
|
||
/*isSpillSlot=*/false);
|
||
int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
|
||
/*isSpillSlot=*/false);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
|
||
.addReg(PtrLo).addFrameIndex(FILo).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
|
||
.addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
|
||
|
||
// STA_DP's tablegen def has no implicit A Use, so without an
|
||
// explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
|
||
// pairs the fast regalloc collapses two A-loads into one (the
|
||
// first's value is overwritten before STA_DP can store it). Add
|
||
// implicit Use of A on the STA_DP to encode the dependency. This
|
||
// also helps post-RA passes track A liveness correctly.
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
|
||
W65816::A).addFrameIndex(FILo).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::STA_DP)).addImm(0xE0)
|
||
.addReg(W65816::A, RegState::Implicit);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
|
||
W65816::A).addFrameIndex(FIHi).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::STA_DP)).addImm(0xE2)
|
||
.addReg(W65816::A, RegState::Implicit);
|
||
|
||
if (IsLoad) {
|
||
Register Dst = MI.getOperand(0).getReg();
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDY_Imm16)).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
|
||
} else {
|
||
Register Val = MI.getOperand(0).getReg();
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDY_Imm16)).addImm(0);
|
||
if (IsByteStore)
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::SEP)).addImm(0x20);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
|
||
if (IsByteStore)
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::REP)).addImm(0x20);
|
||
}
|
||
MI.eraseFromParent();
|
||
return BB;
|
||
}
|
||
case W65816::LDAptr32:
|
||
case W65816::STAptr32:
|
||
case W65816::STBptr32: {
|
||
// Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
|
||
// pointer is a Wide32 register pair: sub_lo carries the low 16
|
||
// bits of the address, sub_hi carries the bank byte in its low
|
||
// half (high half is pad, ORCA convention). Stage at $E0..$E2,
|
||
// then [dp],Y addresses the right bank without forcing 0.
|
||
//
|
||
// MI-level peephole: if the Wide32 ptr is the sole user of a
|
||
// `REG_SEQUENCE(ADCi16imm BaseLo K, sub_lo, ADCEi16imm BaseHi 0,
|
||
// sub_hi)` chain (= `(add Wide32, K)` after ISel), peel the
|
||
// offset and pass K via the Y register on the `[dp],Y` deref.
|
||
// Saves ~3 instructions per access (the CLC/ADC/ADC carry chain).
|
||
// The bank-wrap caveat from LDAptr32Off applies: Y addition does
|
||
// NOT propagate beyond 16 bits, so the target object must not
|
||
// span a bank boundary (true for malloc'd / globally-allocated
|
||
// ptr32 objects; struct sizeof is far below 64KB).
|
||
//
|
||
// Doing this here rather than in LowerLoad / a SDAG combine avoids
|
||
// the JSON-tokenizer + BST + sprintf smoke regressions those paths
|
||
// tripped — the rewrites perturbed SDAG scheduling in ways that
|
||
// bisection couldn't pin down. At MI level, the rewrite is
|
||
// structural: ADCi16imm/ADCEi16imm become dead and get DCE'd.
|
||
//
|
||
// Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
|
||
// on i32 address type).
|
||
MachineFunction *MF = BB->getParent();
|
||
const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
|
||
const W65816InstrInfo &TII = *STI.getInstrInfo();
|
||
MachineRegisterInfo &MRI = MF->getRegInfo();
|
||
DebugLoc DL = MI.getDebugLoc();
|
||
bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
|
||
bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
|
||
Register Ptr = MI.getOperand(IsLoad ? 1 : 1).getReg();
|
||
// Try the ADC-chain peel. We need:
|
||
// 1. Ptr has exactly one use (this MI) — else other users still
|
||
// need the full computed Wide32, no net win.
|
||
// 2. Ptr was defined by a REG_SEQUENCE.
|
||
// 3. Sub_lo source is ADCi16imm BaseLoReg KLo.
|
||
// 4. Sub_hi source is ADCEi16imm BaseHiReg 0.
|
||
// 5. KLo > 0 and KLo fits 16-bit unsigned.
|
||
Register PeelBaseLo, PeelBaseHi;
|
||
int64_t PeelOff = 0;
|
||
MachineInstr *DeadLoDef = nullptr;
|
||
MachineInstr *DeadHiDef = nullptr;
|
||
MachineInstr *DeadPtrDef = nullptr;
|
||
SmallVector<MachineInstr *, 4> ExtraChainDeads;
|
||
if (IsLoad && MRI.hasOneUse(Ptr)) {
|
||
MachineInstr *PtrDef = MRI.getUniqueVRegDef(Ptr);
|
||
if (PtrDef && PtrDef->getOpcode() == TargetOpcode::REG_SEQUENCE) {
|
||
Register SubLoReg, SubHiReg;
|
||
for (unsigned i = 1, e = PtrDef->getNumOperands(); i + 1 < e; i += 2) {
|
||
unsigned SubIdx = PtrDef->getOperand(i + 1).getImm();
|
||
Register R = PtrDef->getOperand(i).getReg();
|
||
if (SubIdx == llvm::sub_lo) SubLoReg = R;
|
||
else if (SubIdx == llvm::sub_hi) SubHiReg = R;
|
||
}
|
||
MachineInstr *LoDef = SubLoReg ? MRI.getUniqueVRegDef(SubLoReg)
|
||
: nullptr;
|
||
MachineInstr *HiDef = SubHiReg ? MRI.getUniqueVRegDef(SubHiReg)
|
||
: nullptr;
|
||
// We don't require SubLoReg/SubHiReg to be single-use: an
|
||
// ADCi16imm result CSE'd across multiple users (e.g., `L+K`
|
||
// also used as input to `(L+K)+M`) is fine — peeling THIS load
|
||
// doesn't kill the original ADC chain (other users still need
|
||
// it). We only erase the chain if it's all single-use end-to-end.
|
||
bool OuterSingleUse =
|
||
MRI.hasOneUse(SubLoReg) && MRI.hasOneUse(SubHiReg);
|
||
if (LoDef && HiDef &&
|
||
LoDef->getOpcode() == W65816::ADCi16imm &&
|
||
HiDef->getOpcode() == W65816::ADCEi16imm &&
|
||
// ADCi16imm and ADCEi16imm must be in the same MBB so we
|
||
// can verify nothing clobbers $p between them.
|
||
LoDef->getParent() == HiDef->getParent()) {
|
||
// Walk forward from LoDef to HiDef. If any instr between
|
||
// them defines $p, the ADCE reads a tampered carry and our
|
||
// simple substitution would change semantics.
|
||
bool PChainOK = true;
|
||
for (auto It = std::next(LoDef->getIterator());
|
||
It != HiDef->getIterator() && PChainOK; ++It) {
|
||
for (const MachineOperand &MO : It->operands()) {
|
||
if (MO.isReg() && MO.getReg() == W65816::P &&
|
||
MO.isDef() && !MO.isDead()) {
|
||
PChainOK = false;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
int64_t KLo = LoDef->getOperand(2).getImm();
|
||
int64_t KHi = HiDef->getOperand(2).getImm();
|
||
Register CandLo = LoDef->getOperand(1).getReg();
|
||
Register CandHi = HiDef->getOperand(1).getReg();
|
||
// Accept a vreg that's `COPY <phys-reg>` for any of the
|
||
// arg/accumulator/index physregs. This catches both incoming
|
||
// function args ($a/$x at entry) AND values that came from
|
||
// a preceding load (where the result was COPYed off $a).
|
||
auto isFromArgCopy = [&](Register R) -> bool {
|
||
if (!R.isVirtual()) return false;
|
||
MachineInstr *Def = MRI.getUniqueVRegDef(R);
|
||
if (!Def || !Def->isCopy()) return false;
|
||
const MachineOperand &Src = Def->getOperand(1);
|
||
if (!Src.isReg() || !Src.getReg().isPhysical()) return false;
|
||
unsigned P = Src.getReg();
|
||
return P == W65816::A || P == W65816::X || P == W65816::Y;
|
||
};
|
||
// A vreg is "from a fixed (caller-pushed) stack arg" if its
|
||
// unique def is LDAfi against a fixed FrameIndex (negative
|
||
// index in MachineFrameInfo). Caller-pushed args live in
|
||
// immutable slots, so reading them later is value-equivalent
|
||
// to reading them at function entry.
|
||
auto isFromFixedArgSlot = [&](Register R) -> bool {
|
||
if (!R.isVirtual()) return false;
|
||
MachineInstr *Def = MRI.getUniqueVRegDef(R);
|
||
if (!Def || Def->getOpcode() != W65816::LDAfi) return false;
|
||
const MachineOperand &FIOp = Def->getOperand(1);
|
||
if (!FIOp.isFI()) return false;
|
||
int FI = FIOp.getIndex();
|
||
const MachineFrameInfo &MFI = MF->getFrameInfo();
|
||
return MFI.isFixedObjectIndex(FI);
|
||
};
|
||
auto isFromArg = [&](Register R) -> bool {
|
||
if (isFromArgCopy(R)) return true;
|
||
if (isFromFixedArgSlot(R)) return true;
|
||
if (!R.isVirtual()) return false;
|
||
MachineInstr *Def = MRI.getUniqueVRegDef(R);
|
||
if (!Def || !Def->isCopy()) return false;
|
||
const MachineOperand &Src = Def->getOperand(1);
|
||
if (!Src.isReg() || !Src.getReg().isVirtual()) return false;
|
||
return isFromArgCopy(Src.getReg()) ||
|
||
isFromFixedArgSlot(Src.getReg());
|
||
};
|
||
// Recursive walk: nested ADC chains arise from i32-LOAD split
|
||
// (high half loads at `Ptr+2`, where `Ptr` is itself `arg+K`).
|
||
// Walk back, accumulating offset, until we reach an arg-base
|
||
// OR exhaust the chain.
|
||
//
|
||
// We allow inner ADC results to have multiple users — this
|
||
// happens when the SDAG CSEs `L+K` and reuses it as input to
|
||
// `(L+K)+M`. In that case, peeling THIS load doesn't kill
|
||
// the inner ADC chain (other users still need it), so we
|
||
// don't erase those inner Ms. Only the outer-most chain
|
||
// (single-use) and PtrDef are erased.
|
||
//
|
||
// Bisecting: try peeling whenever the chain reaches a
|
||
// "stable" base — args, fixed-arg-slot loads, OR any vreg
|
||
// (widest). Wider gates have historically tripped a
|
||
// FrameLowering-related smoke regression in sprintf.
|
||
int64_t Off = KLo;
|
||
bool ChainOK = (PChainOK && KHi == 0 && KLo > 0 && KLo <= 0xFFFF);
|
||
// Cap on chain walks (avoid pathological deep chains).
|
||
unsigned MaxChainDepth = 8;
|
||
// Track per-layer "all single-use" status — only erase layers
|
||
// up to the first non-single-use one.
|
||
unsigned SingleUseLayers = OuterSingleUse ? 1 : 0;
|
||
SmallVector<MachineInstr *, 6> ChainDeads;
|
||
if (OuterSingleUse) {
|
||
ChainDeads.push_back(LoDef);
|
||
ChainDeads.push_back(HiDef);
|
||
}
|
||
// Narrow gate: walk back only until we reach an arg-base or
|
||
// arg-slot base. A truly wide gate (peel any chain regardless
|
||
// of base) makes Lua ~+0.85% LARGER because each peel adds 4B
|
||
// of stack-slot staging that exceeds the carry-chain savings
|
||
// for deep-chain cases. Tested 2026-05-25.
|
||
while (ChainOK && MaxChainDepth-- > 0 &&
|
||
(!isFromArg(CandLo) || !isFromArg(CandHi))) {
|
||
if (!CandLo.isVirtual() || !CandHi.isVirtual()) {
|
||
ChainOK = false; break;
|
||
}
|
||
MachineInstr *InnerLo = MRI.getUniqueVRegDef(CandLo);
|
||
MachineInstr *InnerHi = MRI.getUniqueVRegDef(CandHi);
|
||
if (!InnerLo || !InnerHi ||
|
||
InnerLo->getOpcode() != W65816::ADCi16imm ||
|
||
InnerHi->getOpcode() != W65816::ADCEi16imm ||
|
||
InnerLo->getParent() != InnerHi->getParent()) {
|
||
ChainOK = false; break;
|
||
}
|
||
bool InnerSingleUse = MRI.hasOneUse(CandLo) && MRI.hasOneUse(CandHi);
|
||
bool InnerPOK = true;
|
||
for (auto It = std::next(InnerLo->getIterator());
|
||
It != InnerHi->getIterator() && InnerPOK; ++It) {
|
||
for (const MachineOperand &MO : It->operands()) {
|
||
if (MO.isReg() && MO.getReg() == W65816::P &&
|
||
MO.isDef() && !MO.isDead()) {
|
||
InnerPOK = false; break;
|
||
}
|
||
}
|
||
}
|
||
if (!InnerPOK) { ChainOK = false; break; }
|
||
int64_t InnerKLo = InnerLo->getOperand(2).getImm();
|
||
int64_t InnerKHi = InnerHi->getOperand(2).getImm();
|
||
if (InnerKHi != 0) { ChainOK = false; break; }
|
||
int64_t NewOff = Off + InnerKLo;
|
||
if (NewOff > 0xFFFF) { ChainOK = false; break; }
|
||
Off = NewOff;
|
||
CandLo = InnerLo->getOperand(1).getReg();
|
||
CandHi = InnerHi->getOperand(1).getReg();
|
||
// Track whether this inner layer is erasable (all-single-use
|
||
// from outer through here).
|
||
if (InnerSingleUse && SingleUseLayers ==
|
||
ChainDeads.size() / 2) {
|
||
SingleUseLayers++;
|
||
ChainDeads.push_back(InnerLo);
|
||
ChainDeads.push_back(InnerHi);
|
||
}
|
||
// Even if not single-use, we keep walking back — the peel
|
||
// is still correct (just doesn't kill the inner chain).
|
||
}
|
||
if (ChainOK && Off > 0 && Off <= 0xFFFF &&
|
||
isFromArg(CandLo) && isFromArg(CandHi)) {
|
||
PeelBaseLo = CandLo;
|
||
PeelBaseHi = CandHi;
|
||
PeelOff = Off;
|
||
DeadPtrDef = PtrDef;
|
||
// Only erase the ADC chain if it's all-single-use end to
|
||
// end. Otherwise leave it alive — other users need it.
|
||
if (OuterSingleUse) {
|
||
DeadLoDef = LoDef;
|
||
DeadHiDef = HiDef;
|
||
for (unsigned i = 2; i < ChainDeads.size(); ++i)
|
||
ExtraChainDeads.push_back(ChainDeads[i]);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
// Layer 2 fast path: -w65816-dbr-safe-ptrs assumes the bank byte
|
||
// matches DBR, letting us skip $E0/$E2 staging entirely. Emit just
|
||
// a STAfi of sub_lo and an LDAfi_indY/STAfi_indY deref via the
|
||
// 16-bit stack-rel-indirect-Y opcode (0xB3 / 0x93). ~4 instr per
|
||
// deref saved vs the heavy [dp],Y indirect-long path.
|
||
if (DbrSafePtrs) {
|
||
Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
|
||
if (PeelOff) {
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
|
||
.addReg(PeelBaseLo);
|
||
} else {
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
|
||
.addReg(Ptr, (RegState)0, llvm::sub_lo);
|
||
}
|
||
int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
|
||
/*isSpillSlot=*/false);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
|
||
.addReg(PtrLo).addFrameIndex(FILo).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
|
||
if (IsLoad) {
|
||
Register Dst = MI.getOperand(0).getReg();
|
||
// LDAfi_indY $dst, FILo — PEI resolves to LDA (FILo,S),Y.
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY),
|
||
W65816::A).addFrameIndex(FILo).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
|
||
} else {
|
||
Register Val = MI.getOperand(0).getReg();
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
|
||
if (IsByteStore)
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::SEP)).addImm(0x20);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY))
|
||
.addReg(W65816::A).addFrameIndex(FILo).addImm(0);
|
||
if (IsByteStore)
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::REP)).addImm(0x20);
|
||
}
|
||
MI.eraseFromParent();
|
||
if (DeadPtrDef) DeadPtrDef->eraseFromParent();
|
||
if (DeadLoDef) DeadLoDef->eraseFromParent();
|
||
if (DeadHiDef) DeadHiDef->eraseFromParent();
|
||
for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent();
|
||
return BB;
|
||
}
|
||
|
||
// Extract the i16 sub-halves of the Wide32 ptr. At custom-inserter
|
||
// time Ptr is still a virtual register, so `TRI.getSubReg` won't
|
||
// work (it's physreg-only). Use COPY-with-subreg-index instead;
|
||
// the regalloc + virtreg-rewriter resolves this to the right
|
||
// physreg operand later.
|
||
Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
|
||
Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
|
||
if (PeelOff) {
|
||
// Peeled path: pull base halves from the ADC chain's inputs.
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
|
||
.addReg(PeelBaseLo);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
|
||
.addReg(PeelBaseHi);
|
||
} else {
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
|
||
.addReg(Ptr, (RegState)0, llvm::sub_lo);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
|
||
.addReg(Ptr, (RegState)0, llvm::sub_hi);
|
||
}
|
||
|
||
// Spill each half to a fresh slot, reload via LDAfi. Same RA-
|
||
// pinning rationale as the i16 LDAptr inserter.
|
||
int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
|
||
/*isSpillSlot=*/false);
|
||
int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
|
||
/*isSpillSlot=*/false);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
|
||
.addReg(PtrLo).addFrameIndex(FILo).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
|
||
.addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
|
||
|
||
// Change 3: $E0/$E2 staging CSE. Look backward in this MBB for
|
||
// the previous ptr32-deref expansion. If its base halves match
|
||
// ours (same vreg source) and nothing between has clobbered
|
||
// $E0/$E2/$Y or the staged values, skip the LDAfi+STA_DP pairs
|
||
// and reuse the previously-staged $E0..$E2.
|
||
//
|
||
// Inserter pattern signature (from below, latest-emitted first):
|
||
// STA_DP $E2 (impl A)
|
||
// LDAfi <FIHi'> -> A
|
||
// STA_DP $E0 (impl A)
|
||
// LDAfi <FILo'> -> A
|
||
// STAfi <srcHi'>, FIHi', 0 <- prior PtrHi
|
||
// STAfi <srcLo'>, FILo', 0 <- prior PtrLo
|
||
bool ReuseStaging = false;
|
||
{
|
||
Register MySrcLo = PeelOff ? PeelBaseLo : Ptr;
|
||
Register MySrcHi = PeelOff ? PeelBaseHi : Register();
|
||
// For non-peel path, both halves come from `Ptr` via subreg; the
|
||
// CSE check uses the whole Ptr vreg (so two LDAptr32 with the
|
||
// same Ptr vreg can share staging).
|
||
auto It = MI.getIterator();
|
||
MachineInstr *PrevStaE2 = nullptr;
|
||
MachineInstr *PrevLdaHi = nullptr;
|
||
MachineInstr *PrevStaE0 = nullptr;
|
||
MachineInstr *PrevLdaLo = nullptr;
|
||
MachineInstr *PrevStaHi = nullptr;
|
||
MachineInstr *PrevStaLo = nullptr;
|
||
auto clobbersE0E2 = [&](MachineInstr &PrevMI) -> bool {
|
||
// Any call clobbers everything in DP — including $E0..$E3.
|
||
if (PrevMI.isCall()) return true;
|
||
switch (PrevMI.getOpcode()) {
|
||
// FrameLowering's long-indirect expansion of these uses $E2
|
||
// as A-stash scratch (see W65816RegisterInfo.cpp).
|
||
case W65816::ADCfi: case W65816::ADCEfi:
|
||
case W65816::ANDfi: case W65816::ORAfi: case W65816::EORfi:
|
||
case W65816::SBCfi: case W65816::SBCEfi:
|
||
case W65816::CMPfi:
|
||
return true;
|
||
case W65816::STA_DP:
|
||
case W65816::STZ_DP:
|
||
if (PrevMI.getOperand(0).isImm()) {
|
||
int64_t Imm = PrevMI.getOperand(0).getImm();
|
||
if (Imm == 0xE0 || Imm == 0xE1 ||
|
||
Imm == 0xE2 || Imm == 0xE3)
|
||
return true;
|
||
}
|
||
break;
|
||
}
|
||
return false;
|
||
};
|
||
// Scan back, fail-soft.
|
||
const unsigned MaxScan = 60;
|
||
unsigned Scanned = 0;
|
||
while (It != BB->begin() && Scanned++ < MaxScan) {
|
||
--It;
|
||
MachineInstr &P = *It;
|
||
if (!PrevStaE2) {
|
||
if (P.getOpcode() == W65816::STA_DP &&
|
||
P.getOperand(0).isImm() &&
|
||
P.getOperand(0).getImm() == 0xE2) {
|
||
PrevStaE2 = &P;
|
||
continue;
|
||
}
|
||
if (clobbersE0E2(P)) break;
|
||
continue;
|
||
}
|
||
// After PrevStaE2, expect LDAfi <FIHi'>.
|
||
if (!PrevLdaHi) {
|
||
if (P.getOpcode() == W65816::LDAfi) { PrevLdaHi = &P; continue; }
|
||
break;
|
||
}
|
||
if (!PrevStaE0) {
|
||
if (P.getOpcode() == W65816::STA_DP &&
|
||
P.getOperand(0).isImm() &&
|
||
P.getOperand(0).getImm() == 0xE0) {
|
||
PrevStaE0 = &P;
|
||
continue;
|
||
}
|
||
break;
|
||
}
|
||
if (!PrevLdaLo) {
|
||
if (P.getOpcode() == W65816::LDAfi) { PrevLdaLo = &P; continue; }
|
||
break;
|
||
}
|
||
// Now look for STAfi srcHi', FIHi' and STAfi srcLo', FILo'.
|
||
// They appear in either order; the inserter above emits Lo first
|
||
// then Hi, but scanning back, we hit Hi first.
|
||
if (!PrevStaHi) {
|
||
if (P.getOpcode() == W65816::STAfi &&
|
||
P.getOperand(1).isFI() &&
|
||
P.getOperand(1).getIndex() ==
|
||
PrevLdaHi->getOperand(1).getIndex()) {
|
||
PrevStaHi = &P;
|
||
continue;
|
||
}
|
||
break;
|
||
}
|
||
if (!PrevStaLo) {
|
||
if (P.getOpcode() == W65816::STAfi &&
|
||
P.getOperand(1).isFI() &&
|
||
P.getOperand(1).getIndex() ==
|
||
PrevLdaLo->getOperand(1).getIndex()) {
|
||
PrevStaLo = &P;
|
||
// Done with the structural match — fall through to operand
|
||
// comparison.
|
||
}
|
||
break;
|
||
}
|
||
}
|
||
if (PrevStaLo && PrevStaHi) {
|
||
Register PrevSrcLo = PrevStaLo->getOperand(0).getReg();
|
||
Register PrevSrcHi = PrevStaHi->getOperand(0).getReg();
|
||
// Match if the source vregs are identical to mine. For non-peel
|
||
// path, PtrLo/PtrHi were freshly created via COPY from Ptr.sub_*
|
||
// — match by tracing PrevSrcLo/Hi back through their COPY (if
|
||
// any) to the Ptr vreg.
|
||
auto traceToPtr = [&](Register R) -> Register {
|
||
if (!R.isVirtual()) return R;
|
||
MachineInstr *D = MRI.getUniqueVRegDef(R);
|
||
while (D && D->isCopy()) {
|
||
const MachineOperand &S = D->getOperand(1);
|
||
if (!S.isReg() || !S.getReg().isVirtual()) break;
|
||
R = S.getReg();
|
||
D = MRI.getUniqueVRegDef(R);
|
||
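// Stop once the chain reaches a REG_SEQUENCE def. NOTE(review): the
// subreg index on a COPY source is not inspected here, so sub-half
// info is already dropped when stepping through `COPY %x.sub_lo` —
// confirm that is intended.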
|
||
if (D && D->getOpcode() == TargetOpcode::REG_SEQUENCE) break;
|
||
}
|
||
return R;
|
||
};
|
||
Register MyTraceLo = traceToPtr(PeelOff ? PeelBaseLo : PtrLo);
|
||
Register MyTraceHi = traceToPtr(PeelOff ? PeelBaseHi : PtrHi);
|
||
Register PrevTraceLo = traceToPtr(PrevSrcLo);
|
||
Register PrevTraceHi = traceToPtr(PrevSrcHi);
|
||
if (MyTraceLo == PrevTraceLo && MyTraceHi == PrevTraceHi &&
|
||
MyTraceLo.isValid() && MyTraceHi.isValid()) {
|
||
ReuseStaging = true;
|
||
}
|
||
}
|
||
(void)MySrcLo; (void)MySrcHi; // not used directly; trace covers
|
||
}
|
||
|
||
// Stage the 24-bit address at $E0..$E2 unless CSE allows reusing
|
||
// the previous staging.
|
||
// STA_DP's tablegen def has no implicit A Use, so without an
|
||
// explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
|
||
// pairs the fast regalloc collapses two A-loads into one (the
|
||
// first's value is overwritten before STA_DP can store it). Add
|
||
// implicit Use of A on the STA_DP to encode the dependency. This
|
||
// also helps post-RA passes track A liveness correctly.
|
||
if (!ReuseStaging) {
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
|
||
W65816::A).addFrameIndex(FILo).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::STA_DP)).addImm(0xE0)
|
||
.addReg(W65816::A, RegState::Implicit);
|
||
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
|
||
W65816::A).addFrameIndex(FIHi).addImm(0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::STA_DP)).addImm(0xE2)
|
||
.addReg(W65816::A, RegState::Implicit);
|
||
}
|
||
|
||
if (IsLoad) {
|
||
Register Dst = MI.getOperand(0).getReg();
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
|
||
} else {
|
||
Register Val = MI.getOperand(0).getReg();
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::LDY_Imm16)).addImm(PeelOff);
|
||
if (IsByteStore)
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::SEP)).addImm(0x20);
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
|
||
if (IsByteStore)
|
||
BuildMI(*BB, MI.getIterator(), DL,
|
||
TII.get(W65816::REP)).addImm(0x20);
|
||
}
|
||
MI.eraseFromParent();
|
||
if (DeadPtrDef) DeadPtrDef->eraseFromParent();
|
||
if (DeadLoDef) DeadLoDef->eraseFromParent();
|
||
if (DeadHiDef) DeadHiDef->eraseFromParent();
|
||
for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent();
|
||
return BB;
|
||
}
|
||
case W65816::LDAptr32Off:
case W65816::STAptr32Off:
case W65816::STBptr32Off: {
  // ptr32 deref with constant offset.  The 65816's `[dp],Y` adds Y
  // to the 24-bit pointer at `dp..dp+2` to form the effective
  // address — so we can stage the RAW pointer at $E0..$E2 and put
  // the offset in Y, skipping the i32-add carry chain entirely.
  //
  // Saves ~3 instructions per access vs the previous approach
  // (which did `lo+off; hi+carry` to compute the pointer then
  // derefed with Y=0).  Big win on heavy struct-field code like
  // Lua's lapi.c.  See memory: ptr32-deref-fold-layer1-mi-opcodes.
  //
  // Bank-wrap caveat: `[dp],Y` doesn't propagate Y into the bank
  // byte at $E2 — if pointer+Y crosses a bank boundary, the result
  // wraps within the 24-bit address space (not into the next bank).
  // For struct fields with offsets < 64KB on malloc'd or globally-
  // allocated objects that don't straddle bank boundaries this is
  // safe; the caller must not place objects spanning $XX:FFFF.
  //
  // NOTE(review): the LDAptrOff inserter in this file describes
  // `[dp],Y` as adding Y to the FULL 24-bit pointer (carrying into
  // the bank), which contradicts the caveat above — reconcile the
  // two against the 65816 datasheet.  Either way Y is loaded with a
  // 16-bit immediate, so this expansion presumably relies on
  // instruction selection only producing offsets in 0..0xFFFF —
  // confirm against the pattern in InstrInfo.td.
  //
  // Dead unless ptr32 mode is active.
  //
  // Operands read here: 0 = loaded value (load) or value to store
  // (stores), 1 = 32-bit pointer vreg, 2 = constant offset.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off;
  bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off;
  Register Ptr = MI.getOperand(1).getReg();
  int64_t Off = MI.getOperand(2).getImm();

  // See LDAptr32 inserter above: vreg sub-regs need COPY-with-subreg
  // (TRI.getSubReg is physreg-only at custom-inserter time).
  Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo)
      .addReg(Ptr, (RegState)0, llvm::sub_lo);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi)
      .addReg(Ptr, (RegState)0, llvm::sub_hi);

  // Park each half in its own fresh 2-byte stack object and reload it
  // through LDAfi, so the value reaching A is a real machine load.
  int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

  // The LDAptr32 inserter documents that STA_DP's tablegen def has no
  // implicit A use, and that without one the fast regalloc collapses
  // the two adjacent A-loads of an LDAfi-STA_DP-LDAfi-STA_DP run (the
  // first value is overwritten before it is stored).  This expansion
  // emits exactly that run, so each STA_DP carries an explicit
  // implicit-use of A to encode the dependency.

  // ptr_lo -> $E0..$E1 (no offset add)
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FILo).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);

  // ptr_hi -> $E2..$E3 (no carry propagation needed)
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FIHi).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE2)
      .addReg(W65816::A, RegState::Implicit);

  if (IsLoad) {
    // Y = offset; A = [$E0],Y; then copy A into the result vreg.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(Off);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // A = value; Y = offset; store through [$E0],Y.  The byte-store
    // variant brackets the store with SEP/REP #$20.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(Off);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
|
||
case W65816::LDAptrOff:
case W65816::STAptrOff:
case W65816::STBptrOff: {
  // Pointer access with a constant offset.  Folds the offset into
  // the pointer (CLC; ADC #off in A) BEFORE staging at $E0..$E2,
  // then accesses via [$E0],Y with Y=0.  We can't fold into Y
  // because [dp],Y on the W65816 adds Y to the full 24-bit pointer
  // — for a negative Y like 0xFFFE (= -2 signed), the addition
  // crosses into bank 1.  Folding into the pointer keeps the add
  // at 16-bit (in A) so the bank byte stays 0.
  //
  // DBR-independent — see LDAptr/STAptr/STBptr.
  //
  // Operands read here: 0 = loaded value (load) or value to store
  // (stores), 1 = 16-bit pointer vreg, 2 = constant offset.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptrOff;
  bool IsByteStore = MI.getOpcode() == W65816::STBptrOff;
  Register Ptr = MI.getOperand(1).getReg();
  int64_t Off = MI.getOperand(2).getImm();

  // Spill the pointer vreg to a fresh 2-byte stack slot, then
  // reload via LDAfi.  Forces RA to materialize the source — see
  // the LDAptr/STAptr/STBptr case below for the full rationale.
  int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(Ptr).addFrameIndex(FI).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FI).addImm(0);

  // Compute ptr + off in A.  CLC + ADC for the add.
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC));
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::ADC_Imm16)).addImm(Off);
  // The LDAptr32 inserter documents that STA_DP's tablegen def has no
  // implicit A use; add one here as well so the value computed in A
  // is visibly consumed by the staging store.
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  if (LoaderBankDeref) {
    // Bank byte from $BE (crt0-initialised) — Loader compat path.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DP)).addImm(0xBE);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE2)
        .addReg(W65816::A, RegState::Implicit);
  } else {
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STZ_DP)).addImm(0xE2);
  }

  if (IsLoad) {
    // Y = 0; A = [$E0],Y; then copy A into the result vreg.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // A = value; Y = 0; store through [$E0],Y.  The byte-store
    // variant brackets the store with SEP/REP #$20.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
|
||
case W65816::LDAptr:
case W65816::LDAptrBank0:
case W65816::STAptr:
case W65816::STBptr: {
  // Pointer load/store via [dp],Y indirect-long (opcodes 0xB7 / 0x97):
  //   STA $E0          ; pointer low/hi at $E0..$E1
  //   STZ $E2          ; bank byte at $E2 = 0
  //   LDY #0
  //   LDA [$E0], Y     ; bank 0:ptr + 0
  //   STA [$E0], Y
  // Bank-explicit ZERO — DBR-independent.  Both the runInMame stack
  // ($00:0FFF down) and BSS / heap globals (placed at $00:xxxx) live
  // in bank 0, so pointer-derefs always reach the right memory even
  // when the user has switched DBR for a bank-2 store via `pha;plb`.
  //
  // Trade-off: under GS/OS Loader the user's data lives in their bank
  // (not bank 0), so library functions that write directly to globals
  // via `sta abs` (DBR-relative, lands in user bank) and user code that
  // reads via pointer-deref (lands in bank 0 by this lowering) get
  // INCONSISTENT results — silent miscompile.  gmtime hit this with
  // its __gmtimeBuf static.  Workaround for affected library code:
  // launder the buffer pointer through inline asm (see gmtime in
  // runtime/src/timeExt.c) so clang doesn't IPSCCP-fold it; the writes
  // then go via [dp],Y too and match the user reads.
  //
  // Const-int pointers (`*(volatile uint16 *)0x5000 = v`) are NOT
  // lowered through this pseudo — TableGen patterns route them to
  // STAlong / STA8long / STAabs by type.  See InstrInfo.td.
  //
  // We use $E0..$E2 in libcall-scratch DP — safe because the
  // pseudo expansion is a leaf (no calls between SEP and STA),
  // and any subsequent libcall reinitialises its own scratch.
  //
  // Why [dp],Y not abs-long-X (`STA $0,X`)?  abs-long-X is shorter
  // (~3 bytes less) but uses X to hold the pointer.  In high-
  // pressure functions like the recursive expression parser, X
  // is often live with another value, and forcing X to be free
  // for every pointer-deref triggered "ran out of registers".
  // [dp],Y uses A and Y only — leaves X for spill-bridge use.
  //
  // STBptr (truncating i8 store) wraps the actual STA in SEP/REP
  // so M=8 across the store and only one byte is written.
  //
  // Operands read here: 0 = loaded value (loads) or value to store
  // (stores), 1 = 16-bit pointer vreg.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  bool IsLoad = MI.getOpcode() == W65816::LDAptr ||
                MI.getOpcode() == W65816::LDAptrBank0;
  bool IsByteStore = MI.getOpcode() == W65816::STBptr;
  // LDAptrBank0 hardcodes bank=0 (STZ $E2) regardless of LoaderBankDeref.
  // Used by va_arg under Loader where the deref is a stack pointer
  // (= bank 0 always on W65816) but $BE points to our code bank.
  bool ForceBank0 = MI.getOpcode() == W65816::LDAptrBank0;

  Register Ptr = MI.getOperand(1).getReg();

  // Why we spill the pointer to a fresh stack slot first:
  // a direct `COPY $a = ptr_vreg ; STA $E0` lets RA elide the COPY
  // when ptr_vreg is already allocated to A.  In a loop body where
  // multiple Acc16 PHIs (pointer + accumulator) compete for A, the
  // PHI elimination pass picks one to be in A at the bottom of the
  // block and silently drops the COPY needed to refresh A with the
  // OTHER value at the top of the next iteration — silent miscompile
  // (sumTable read its own accumulator as the pointer on iter 2+).
  // STAfi forces RA to materialize ptr_vreg's value so it gets stored
  // to the slot, then LDAfi reads it back as a real machine load.
  int FI = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                /*isSpillSlot=*/false);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
      .addReg(Ptr).addFrameIndex(FI).addImm(0);
  BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
          W65816::A).addFrameIndex(FI).addImm(0);

  // The LDAptr32 inserter documents that STA_DP's tablegen def has no
  // implicit A use; add one here as well so the reloaded pointer in A
  // is visibly consumed by the staging store.
  BuildMI(*BB, MI.getIterator(), DL,
          TII.get(W65816::STA_DP)).addImm(0xE0)
      .addReg(W65816::A, RegState::Implicit);
  if (LoaderBankDeref && !ForceBank0) {
    // Bank byte from $BE (crt0-initialised) — Loader compat path.
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DP)).addImm(0xBE);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DP)).addImm(0xE2)
        .addReg(W65816::A, RegState::Implicit);
  } else {
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STZ_DP)).addImm(0xE2);
  }

  if (IsLoad) {
    // Y = 0; A = [$E0],Y; then copy A into the result vreg.
    Register Dst = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
  } else {
    // A = value; Y = 0; store through [$E0],Y.
    Register Val = MI.getOperand(0).getReg();
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::LDY_Imm16)).addImm(0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::SEP)).addImm(0x20);
    BuildMI(*BB, MI.getIterator(), DL,
            TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
    if (IsByteStore)
      BuildMI(*BB, MI.getIterator(), DL,
              TII.get(W65816::REP)).addImm(0x20);
  }
  MI.eraseFromParent();
  return BB;
}
|
||
case W65816::SELECT_CC8:
case W65816::SELECT_CC16: {
  // Expand the select pseudo into explicit control flow joined by a
  // PHI.  Operands read here: 0 = result, 1 = true value, 2 = false
  // value, 3 = condition code.  Returns the join block, which receives
  // everything that followed the pseudo.
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();
  const BasicBlock *IRBlock = BB->getBasicBlock();
  MachineFunction::iterator InsertPos = ++BB->getIterator();

  // HeadMBB keeps the test and the branch; FalseMBB is the false
  // path; JoinMBB holds the PHI and the remainder of the old block.
  MachineBasicBlock *HeadMBB = BB;
  MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *JoinMBB = MF->CreateMachineBasicBlock(IRBlock);
  MF->insert(InsertPos, FalseMBB);
  MF->insert(InsertPos, JoinMBB);

  // Everything after the pseudo moves to the join block, together
  // with the original successor edges.
  JoinMBB->splice(JoinMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  JoinMBB->transferSuccessorsAndUpdatePHIs(BB);

  unsigned CC = MI.getOperand(3).getImm();
  Register DstReg = MI.getOperand(0).getReg();
  Register TValReg = MI.getOperand(1).getReg();
  Register FValReg = MI.getOperand(2).getReg();

  // True when `R` is a virtual register with a single non-debug use
  // whose unique def is an LDAi16imm / LDAi8imm with an immediate
  // operand, located in the head block.
  auto IsHoistableConst = [&](Register R) -> bool {
    if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R))
      return false;
    MachineInstr *DefMI = MRI.getUniqueVRegDef(R);
    if (!DefMI || DefMI->getParent() != HeadMBB)
      return false;
    unsigned Opc = DefMI->getOpcode();
    if (Opc != W65816::LDAi16imm && Opc != W65816::LDAi8imm)
      return false;
    return DefMI->getNumOperands() >= 2 && DefMI->getOperand(1).isImm();
  };

  // Moves the constant-loading def of `R` to the front of `Dest` when
  // the predicate above holds.  Returns true if the def was moved.
  auto HoistConstInto = [&](Register R, MachineBasicBlock *Dest) -> bool {
    if (!IsHoistableConst(R))
      return false;
    MachineInstr *DefMI = MRI.getUniqueVRegDef(R);
    DefMI->removeFromParent();
    Dest->insert(Dest->begin(), DefMI);
    return true;
  };

  // Emits the joining PHI; `TruePred` is the block the true value
  // arrives from, the false value always arrives from FalseMBB.
  auto EmitJoinPHI = [&](MachineBasicBlock *TruePred) {
    BuildMI(*JoinMBB, JoinMBB->begin(), DL, TII.get(W65816::PHI), DstReg)
        .addReg(TValReg).addMBB(TruePred)
        .addReg(FValReg).addMBB(FalseMBB);
  };

  const bool SingleBranch = CC < W65816CC::COND_GT_MB;
  const bool BothConst = SingleBranch && IsHoistableConst(TValReg) &&
                         IsHoistableConst(FValReg);

  if (!BothConst) {
    // 3-block diamond: keep the existing layout and (where possible)
    // hoist fval into the false block.  Used when one or both operands
    // are computed values (not constants), or when the multi-branch CC
    // requires two Bxx in the head block.
    BB->addSuccessor(FalseMBB);
    BB->addSuccessor(JoinMBB);
    if (SingleBranch) {
      unsigned BranchOpc = getBranchOpcodeForCC(CC);
      BuildMI(HeadMBB, DL, TII.get(BranchOpc)).addMBB(JoinMBB);
    } else {
      MultiBranch Pair = getMultiBranch(CC);
      MachineBasicBlock *FirstTarget = Pair.FirstToTrue ? JoinMBB : FalseMBB;
      MachineBasicBlock *SecondTarget =
          Pair.SecondToTrue ? JoinMBB : FalseMBB;
      BuildMI(HeadMBB, DL, TII.get(Pair.First)).addMBB(FirstTarget);
      BuildMI(HeadMBB, DL, TII.get(Pair.Second)).addMBB(SecondTarget);
    }
    FalseMBB->addSuccessor(JoinMBB);
    HoistConstInto(FValReg, FalseMBB);
    EmitJoinPHI(HeadMBB);
  } else {
    // 4-block diamond: the head block has only the test (CMP) and
    // Bxx; the tval and fval LDAs each live in their own destination
    // block, which is reached only via the branch — so neither LDA's
    // flag side-effect can corrupt the CMP→Bxx test window.  This is
    // the proper fix for the "LDA between CMP and Bxx" bug catalogued
    // in project_known_issue_lda_flags.md (replacing the earlier
    // 3-block workaround that only hoisted fval).
    //
    //   head:   ...; CMP; Bxx true
    //   false:  LDA #fval; BRA join        (FALSE path)
    //   true:   LDA #tval                  (TRUE path; falls to join)
    //   join:   PHI [tval from true, fval from false]
    MachineBasicBlock *TrueMBB = MF->CreateMachineBasicBlock(IRBlock);
    MF->insert(JoinMBB->getIterator(), TrueMBB);
    BB->addSuccessor(FalseMBB);
    BB->addSuccessor(TrueMBB);
    FalseMBB->addSuccessor(JoinMBB);
    TrueMBB->addSuccessor(JoinMBB);
    unsigned BranchOpc = getBranchOpcodeForCC(CC);
    BuildMI(HeadMBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB);
    BuildMI(FalseMBB, DL, TII.get(W65816::BRA)).addMBB(JoinMBB);
    // Hoisted loads go to the front of their block, i.e. ahead of the
    // BRA just appended to the false block.
    HoistConstInto(TValReg, TrueMBB);
    HoistConstInto(FValReg, FalseMBB);
    EmitJoinPHI(TrueMBB);
  }

  MI.eraseFromParent();
  return JoinMBB;
}
|
||
}
|
||
}
|