//===-- W65816ISelLowering.cpp - W65816 DAG Lowering Implementation -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Minimum DAG lowering sufficient for a no-argument function returning an // i16 constant. Argument passing and non-trivial calls still unimplemented. // //===----------------------------------------------------------------------===// #include "W65816ISelLowering.h" #include "W65816InstrInfo.h" #include "W65816MachineFunctionInfo.h" #include "W65816SelectionDAGInfo.h" #include "W65816Subtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; #define DEBUG_TYPE "w65816-lower" // Loader-compat workaround: when set, LDAptr/STAptr/STBptr inserters // load the bank byte from DP $BE (initialized by crt0 to PHK / current // PBR) instead of forcing it to 0 via STZ $E2. This makes pointer // derefs land in the user's bank — matching where DBR-relative // absolute stores go — so library functions like gmtime that store // into static buffers via DBR-relative paths are visible to caller- // side pointer-deref reads. Costs 2 extra bytes / 4 cycles per ptr- // deref (LDA dp + STA dp vs STZ dp). Default off to keep // size-sensitive builds (toolbox) under the $C000 IO-window ceiling. static cl::opt LoaderBankDeref( "w65816-loader-bank-deref", cl::desc("LDAptr/STAptr inserters read bank from DP $BE (set by " "crt0 to PHK) instead of STZ $E2. 
Required for GS/OS " "Loader compatibility; default off for size-sensitive " "builds."), cl::init(false), cl::Hidden); W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, const W65816Subtarget &STI) : TargetLowering(TM, STI) { // Register classes for the two scalar modes. The register allocator sees // A, X and Y as both 8-bit and 16-bit; a later REP/SEP pass is responsible // for ensuring the dynamic mode matches the selected class. addRegisterClass(MVT::i8, &W65816::Acc8RegClass); addRegisterClass(MVT::i16, &W65816::Acc16RegClass); addRegisterClass(MVT::i32, &W65816::Wide32RegClass); computeRegisterProperties(STI.getRegisterInfo()); setStackPointerRegisterToSaveRestore(W65816::SP); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // GlobalAddress and ExternalSymbol: lower to W65816ISD::Wrapper so a // tablegen pattern can fold them into instruction operands. setOperationAction(ISD::GlobalAddress, MVT::i16, Custom); setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom); // FrameIndex i32 has its own DAG-to-DAG path in W65816ISelDAGToDAG.cpp. // BR_CC is custom-lowered to a CMP + W65816ISD::BR_CC chain so we can // emit the right BEQ/BNE/BCS/BCC mnemonic per condition. setOperationAction(ISD::BR_CC, MVT::i16, Custom); setOperationAction(ISD::BR_CC, MVT::i8, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); // SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC // pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter // expands into a Bxx + diamond CFG + PHI. SETCC funnels through the // same path with TVal=1 / FVal=0. SELECT (no condition operand) is // expanded to SELECT_CC by the legalizer using SETNE against zero. 
setOperationAction(ISD::SETCC, MVT::i16, Custom); setOperationAction(ISD::SETCC, MVT::i8, Custom); setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); setOperationAction(ISD::SELECT, MVT::i16, Expand); setOperationAction(ISD::SELECT, MVT::i8, Expand); // 65816 has no inline sign-extend instruction; synthesize i8 -> i16 // via a bit-7 test and SELECT_CC (see LowerSignExtend). setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom); // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare // LDA for the anyext case). No native sextload; mark it Expand so // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`, // which then flows through LowerSignExtend's branchless 3-insn // sequence (AND #$00FF; EOR #$0080; SEC; SBC #$0080). for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); // Only register i32 ext-load / trunc-store and Custom actions when // i32 is actually a legal type (ptr32 mode active). Otherwise the // Custom-action calls intercept i16/i8 ops, and LowerTruncate's // SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same // root cause as the earlier LOAD-Custom-breaks-LDAptr issue). bool ptr32Active = isTypeLegal(MVT::i32); if (ptr32Active) { for (MVT MemVT : {MVT::i8, MVT::i16}) { setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand); setTruncStoreAction(MVT::i32, MemVT, Expand); } } // Vararg support: VASTART writes the address of the first vararg slot // to the va_list pointer. VAARG/VACOPY/VAEND use the default // expansions that load through that pointer and bump it. This makes // -style functions (e.g. printf-likes) compile cleanly. setOperationAction(ISD::VASTART, MVT::Other, Custom); // Custom VAARG so we DON'T align the va_list pointer. 
The default // expansion rounds up to the type's preferred alignment (S16 = 2), // but caller-pushed args land at PHA's resulting odd S+1 address. // Aligning would skip the low byte and read garbage. setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); // C++ exceptions (SJLJ model) — clang lowers exception machinery into // these intrinsics via SjLjEHPrepare. We don't have native handling // for any of them on this target; mark Expand so LegalizeDAG falls // back to its no-op stubs (setjmp returns 0, longjmp is a no-op, // setup_dispatch is a chain pass-through). The actual EH semantics // are provided at runtime by libcxxabi (__cxa_throw etc.) calling // _Unwind_SjLj_RaiseException, which in turn longjmps via the // function context the prologue prepared. See // runtime/src/libcxxabiSjlj.c for the runtime side. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Expand); setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i16, Expand); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Expand); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); // SJLJ exception lowering uses FRAMEADDR(0) to read the current frame // pointer. We don't reserve a frame pointer in general; return the // entry-SP-equivalent value (current SP read via TSC) — good enough // for SJLJ's purpose of identifying the call frame. setOperationAction(ISD::FRAMEADDR, MVT::i16, Custom); setOperationAction(ISD::FRAMEADDR, MVT::i32, Custom); // stacksave / stackrestore — used by SjLjEHPrepare to save/restore SP // around invoke calls. The jmp_buf already captures SP via TSC in // our setjmp implementation, so these are redundant here. Lower // stacksave to a constant 0 (the value is stored into the function // context but never used for restoration on our target) and // stackrestore to a chain pass-through (no-op). // SJLJ EH uses STACKSAVE/STACKRESTORE. 
Default Expand calls // CopyFromReg/$SP which fails because SP has no register class. // Custom-lower to a Constant 0 (stacksave) and chain-passthrough // (stackrestore) — our SJLJ runtime doesn't actually use these // values; setjmp/longjmp manage SP directly via TSC/TCS. setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom); // FRAMEADDR is set Custom above for SJLJ; don't set it Expand here // (the second setOperationAction would override the first). setOperationAction(ISD::RETURNADDR, MVT::i16, Expand); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i16, Expand); setOperationAction(ISD::EH_DWARF_CFA, MVT::i16, Expand); // The 65816 has no hardware multiplier or divider. Multiply by a // power-of-two constant is auto-rewritten to shifts by the DAG // combiner; arbitrary multiply / divide / mod go through libcalls // (`__mulhi3` for i16 multiply etc.). The libcall expander emits a // standard CALL node which flows through LowerCall, so multi-arg // call lowering must be working first (it is, see task #26). setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::MUL, MVT::i16, LibCall); // CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the // type legalizer rewrite into a sequence of basic ops. Without // this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1) // or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot // Select" at isel. 
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } setOperationAction(ISD::SDIV, MVT::i16, LibCall); setOperationAction(ISD::UDIV, MVT::i16, LibCall); setOperationAction(ISD::SREM, MVT::i16, LibCall); setOperationAction(ISD::UREM, MVT::i16, LibCall); setOperationAction(ISD::SDIVREM, MVT::i16, Expand); setOperationAction(ISD::UDIVREM, MVT::i16, Expand); // Variable-amount and large-constant shifts. We have inline // patterns for shift-by-1..4; everything else goes through // __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom // lets us return SDValue() for the fast cases and route everything // else through the libcall lowering helper. setOperationAction(ISD::SHL, MVT::i16, Custom); setOperationAction(ISD::SRL, MVT::i16, Custom); setOperationAction(ISD::SRA, MVT::i16, Custom); // i8 shifts go through Custom too — LowerShift detects the i8 result // and routes through trunc(i16-shift(zext_or_sext(lhs), amount)). // Avoids needing a parallel set of qi3 libcalls. setOperationAction(ISD::SHL, MVT::i8, Custom); setOperationAction(ISD::SRL, MVT::i8, Custom); setOperationAction(ISD::SRA, MVT::i8, Custom); // LOAD / STORE Custom-lowering for ptr32 mode is intentionally NOT // wired here in ptr16 mode. Setting LOAD Custom and returning // SDValue() from LowerLoad short-circuits the i16-result LDAptr/ // STAptr selection paths (the Custom→empty→Legal fall-through doesn't // re-enter pattern matching). When ptr32 is activated, this hook // needs a different gating mechanism — likely an isel-time // replacement triggered by addrspacecast or a target DAG combine. // See LowerLoad / LowerStore — currently dead code. 
// ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying // the carry/borrow flag between the two halves of a multi-precision add or // sub. Setting them Legal triggers the type legalizer's carry-chain split // for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions) // instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions). // The matching tablegen pseudos add Defs/Uses on the P register, which // tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically. setOperationAction(ISD::ADDC, MVT::i16, Legal); setOperationAction(ISD::ADDE, MVT::i16, Legal); setOperationAction(ISD::SUBC, MVT::i16, Legal); setOperationAction(ISD::SUBE, MVT::i16, Legal); // i32 (long). Type legalization splits i32 into two i16 halves; with // ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain. // AND/OR/XOR split cleanly into per-half ops with no carry to track. // Multiply/divide/shift go through libcall stubs whose // implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS // / SRA_PARTS are the SDNodes the type legalizer emits when splitting // a variable-amount shift; without an action they get "Cannot select". // LibCall on the parent node routes the whole shift through one // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and // simpler than implementing a 32-bit shift in 65816 assembly inline. 
for (MVT VT : {MVT::i32}) { setOperationAction(ISD::MUL, VT, LibCall); setOperationAction(ISD::SDIV, VT, LibCall); setOperationAction(ISD::UDIV, VT, LibCall); setOperationAction(ISD::SREM, VT, LibCall); setOperationAction(ISD::UREM, VT, LibCall); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); // i32 shifts route through a libcall via the // preferredShiftLegalizationStrategy override (see header). No // explicit SHL/SHL_PARTS action needed — the override forces the // type-legalizer's libcall path before SHL_PARTS would be emitted. } // i64 shifts — route to libcall before the type legalizer tries // to split via the next-legal-type (which becomes i32 in ptr32 mode // and triggers a SDAG combine loop on `i64 >> K` patterns). By // marking SHL/SRL/SRA i64 LibCall here, the operation legalizer // picks up the libcall path even though i64 itself is illegal. for (MVT VT : {MVT::i64}) { setOperationAction(ISD::SHL, VT, LibCall); setOperationAction(ISD::SRL, VT, LibCall); setOperationAction(ISD::SRA, VT, LibCall); } if (ptr32Active) { for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR}) setOperationAction(Op, MVT::i32, Custom); setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom); // SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16: // the combiner emits this for `(int32_t)((int8_t)x)` and for // `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No // tablegen pattern covers the i32 form; Custom-lower to per-half // ops. 
IMPORTANT: LegalizeDAG looks up the action for // SIGN_EXTEND_INREG using the INNER VT (the operand value type), // not the result VT. See LegalizeDAG.cpp: // Action = TLI.getOperationAction(Op, InnerType); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::i8, Custom); setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::Constant, MVT::i32, Custom); } // Disable jump tables. Generating them costs us BRIND (indirect // branch via 16-bit pointer load), which we don't have. A long // if-else chain compiles fine without them. Setting the threshold // to UINT_MAX makes LLVM never form a jump table. setMinimumJumpTableEntries(UINT_MAX); // Variable-length arrays / dynamic stack allocation. Lowered to // `tsc; sec; sbc size; tcs; inc a` — A returns the address of the // allocated region. Limitation: this shifts SP, so any FrameIndex // accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset // (we have no frame pointer). Suitable for the common pattern // "alloca; initialise; pass; return"; complex VLA use mixed with // local-variable access across the alloca will miscompile. A real // FP (DP slot or X-as-FP) would lift this restriction. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom); // Opt into PerformDAGCombine on LOAD nodes — needed for the // address-select reverse combine (see W65816TargetLowering:: // PerformDAGCombine). // setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang // SHL combine disabled while debugging the ptr32 i64-phi hang. 
// setTargetDAGCombine(ISD::SHL); // Combine STORE / LOAD with const-int i32 pointer to a form that // survives LowerI32Constant (which would otherwise split the ptr // into a Wide32 reg pair and lose the const-addr fast path). // See PerformDAGCombine. setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::LOAD); } // Map an LLVM SETCC condition to a W65816 branch. Returns the condition // code along with possibly-swapped LHS/RHS; some signed comparisons are // rewritten to use unsigned ones with a tweaked operand because the // 65816 has no native signed branch other than BMI/BPL on a value, not // on a comparison result. // Map an LLVM SETCC condition to a 65816 branch. Unsigned codes use // BCS/BCC after CMP. Signed SETLT/SETGE map to BMI/BPL — correct only // when the comparison cannot overflow. For values produced by typical // C arithmetic on i16 this is usually fine; values near INT16_MIN/MAX // could give wrong results until we emit the BVS handling sequence. // SETGT / SETLE are rewritten to SETLT / SETGE with constant + 1 in // LowerBR_CC, mirroring the SETULE / SETUGT path. static W65816CC::CondCode mapCC(ISD::CondCode CC) { switch (CC) { case ISD::SETEQ: return W65816CC::COND_EQ; case ISD::SETNE: return W65816CC::COND_NE; case ISD::SETUGE: return W65816CC::COND_HS; case ISD::SETULT: return W65816CC::COND_LO; case ISD::SETLT: return W65816CC::COND_MI; case ISD::SETGE: return W65816CC::COND_PL; default: return W65816CC::COND_INVALID; } } // If both compare operands are i8, widen them to i16 so the existing // i16 CMP path can handle them. Use ZEXT for unsigned/eq/ne CCs and // SEXT for signed CCs — picking the wrong extension would invert the // answer (e.g. -1i8 sext to 0xFFFF compares < 1 signed; zext to 0x00FF // compares > 1 unsigned, which would flip a signed less-than). 
static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, SelectionDAG &DAG, const SDLoc &DL) { if (LHS.getValueType() != MVT::i8) return; unsigned Ext; switch (CC) { case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE: Ext = ISD::SIGN_EXTEND; break; default: Ext = ISD::ZERO_EXTEND; break; // unsigned + eq/ne } LHS = DAG.getNode(Ext, DL, MVT::i16, LHS); RHS = DAG.getNode(Ext, DL, MVT::i16, RHS); } // Normalize a (LHS, RHS, CC) triple so the result is something we can // emit with one CMP + Bxx. Returns the W65816 condition code; updates // LHS/RHS/CC in place. Returns COND_INVALID on failure. static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC, SelectionDAG &DAG, const SDLoc &DL) { promoteI8Cmp(LHS, RHS, CC, DAG, DL); // CMP wants the comparand (constant or memory) on the right. If a DAG // pre-pass put the constant on the left, swap and flip the condition. if (isa(LHS) && !isa(RHS)) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } // Signed compare via "EOR with sign bit then unsigned compare": // a < b (signed) iff (a ^ 0x8000) < (b ^ 0x8000) (unsigned) // The XOR flips the sign bit, which converts signed-int ordering to // unsigned-int ordering on the same bits. This avoids the WDC's // missing "BLT signed" — BMI/BPL alone read the sign of (a-b) // without the V-flag overflow correction, giving wrong results // when the subtraction overflows (e.g., INT16_MIN < 1 produced // false because (-32768 - 1) = +32767 has N=0). After the EOR // transform we use BCC/BCS which depend on the carry from CMP and // don't suffer overflow corruption. // // Cost: 1 EOR per operand (3 bytes each in M=16) — comparable to // the V-aware multi-branch sequence (5+ bytes of branches), but // happens at SDAG time so subsequent SDAG combining can fold // EORs against constants or already-EOR'd values. 
bool SignedCmp = (CC == ISD::SETLT || CC == ISD::SETLE || CC == ISD::SETGT || CC == ISD::SETGE); if (SignedCmp && LHS.getValueType() == MVT::i16) { EVT VT = LHS.getValueType(); SDValue Mask = DAG.getConstant(0x8000, DL, VT); LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, Mask); RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, Mask); switch (CC) { case ISD::SETLT: CC = ISD::SETULT; break; case ISD::SETLE: CC = ISD::SETULE; break; case ISD::SETGT: CC = ISD::SETUGT; break; case ISD::SETGE: CC = ISD::SETUGE; break; default: break; } } // Rewrite SETULE / SETUGT to SETULT / SETUGE with constant +/- 1. // (SETLE / SETGT have already been converted to their unsigned // counterparts above for i16; this handles original SETULE/SETUGT // and the post-transform SETULE/SETUGT.) Keeps the variable on the // LHS and lets us use BCS / BCC natively. if (auto *RhsConst = dyn_cast(RHS)) { int64_t V = RhsConst->getSExtValue(); uint64_t UV = (uint64_t)V & 0xFFFF; if (CC == ISD::SETULE && UV < 0xffff) { RHS = DAG.getConstant(UV + 1, DL, RHS.getValueType()); CC = ISD::SETULT; } else if (CC == ISD::SETUGT && UV < 0xffff) { RHS = DAG.getConstant(UV + 1, DL, RHS.getValueType()); CC = ISD::SETUGE; } else if (CC == ISD::SETLE && V < 0x7fff) { // Reachable only when SignedCmp transform was skipped (i8 case // before promoteI8Cmp could get it, or non-i16 in the future). RHS = DAG.getConstant(V + 1, DL, RHS.getValueType()); CC = ISD::SETLT; } else if (CC == ISD::SETGT && V < 0x7fff) { RHS = DAG.getConstant(V + 1, DL, RHS.getValueType()); CC = ISD::SETGE; } } W65816CC::CondCode TCC = mapCC(CC); if (TCC == W65816CC::COND_INVALID) { // Try swapping operands first — preferable since it leaves us with // a single-Bxx form. But reject the swap if it would put a load on // the LHS (we can't pattern-match cmp(load,reg) without spilling A). 
bool RhsIsLoad = isa(RHS.getNode()); bool LhsIsLoad = isa(LHS.getNode()); bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad; if (!SwapWouldHurt) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); TCC = mapCC(CC); } } // Final fallback: GT/LE/UGT/ULE without a useful swap target. Use a // multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it // into a 3-BB diamond. Only valid for SELECT_CC, not for BR_CC — // LowerBR_CC re-routes those through SETCC + BR_CC NE. if (TCC == W65816CC::COND_INVALID) { switch (CC) { case ISD::SETGT: TCC = W65816CC::COND_GT_MB; break; case ISD::SETLE: TCC = W65816CC::COND_LE_MB; break; case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break; case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break; default: break; } } return TCC; } // Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/ // I32Bin/BR_CC to construct or destructure i32 SDValues across the // sub_lo / sub_hi halves of the Wide32 register class. static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL, SDValue Lo, SDValue Hi) { SDValue RC = DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32); SDValue SubLo = DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32); SDValue SubHi = DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32); SDNode *RS = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32, {RC, Lo, SubLo, Hi, SubHi}); return SDValue(RS, 0); } // Look through a buildWide32(Lo, Hi) -> REG_SEQUENCE(RC, Lo, sub_lo, // Hi, sub_hi) pair: if X is exactly that machine node, return the // matching half operand directly. Avoids a TargetExtractSubreg that // would re-enter the SDAG combiner and re-build the i32 constant / // pair, looping forever (observed as OOM in the combiner on `*t = 0`). static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) { if (!X.getNode() || !X.isMachineOpcode()) return SDValue(); if (X.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue(); // Layout: op0 = RC, then (Reg, SubIdx) pairs. 
for (unsigned i = 1; i + 1 < X.getNumOperands(); i += 2) { SDValue SubIdx = X.getOperand(i + 1); auto *CIdx = dyn_cast(SubIdx); if (!CIdx) continue; if (CIdx->getZExtValue() == WantSub) return X.getOperand(i); } return SDValue(); } static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) { // For constants, materialise the lo half as an i16 constant directly // — getTargetExtractSubreg on a Constant SDNode produces a malformed // MachineSDNode (constants don't carry sub-regs) and triggers // SDAG combine loops downstream. if (auto *C = dyn_cast(X)) { return DAG.getConstant(C->getZExtValue() & 0xFFFFu, DL, MVT::i16); } if (SDValue Half = lookThroughRegSeq(X, llvm::sub_lo)) return Half; return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X); } static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) { if (auto *C = dyn_cast(X)) { return DAG.getConstant((C->getZExtValue() >> 16) & 0xFFFFu, DL, MVT::i16); } if (SDValue Half = lookThroughRegSeq(X, llvm::sub_hi)) return Half; return DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X); } SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc DL(Op); EVT VT = LHS.getValueType(); // i32 BR_CC: synthesize an i16 boolean from per-half compares, then // branch on (bool != 0). Avoids the legalizer's generic Expand that // re-enters our SETCC/BR_CC custom paths in an infinite loop. 
if (VT == MVT::i32) { SDValue LL = extractWide32Lo(DAG, DL, LHS); SDValue LH = extractWide32Hi(DAG, DL, LHS); SDValue RL = extractWide32Lo(DAG, DL, RHS); SDValue RH = extractWide32Hi(DAG, DL, RHS); SDValue Bool; if (CC == ISD::SETEQ || CC == ISD::SETNE) { SDValue EqLo = DAG.getSetCC(DL, MVT::i16, LL, RL, ISD::SETEQ); SDValue EqHi = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ); Bool = DAG.getNode(ISD::AND, DL, MVT::i16, EqLo, EqHi); if (CC == ISD::SETNE) Bool = DAG.getNode(ISD::XOR, DL, MVT::i16, Bool, DAG.getConstant(1, DL, MVT::i16)); } else { // (a CC b) where CC is ordered: // = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b) // HiStrict is the strict variant of CC (LE -> LT etc.) so the // tie-breaker (hi==hi && lo CC lo) handles the equality case // properly. LoCC is always the unsigned variant of CC because // the low half is unsigned (the high half carries the sign). ISD::CondCode HiCC, LoCCu; switch (CC) { case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break; case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break; case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break; case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break; case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break; case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break; case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break; case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break; default: report_fatal_error("W65816: unexpected i32 BR_CC condition"); } SDValue HiOk = DAG.getSetCC(DL, MVT::i16, LH, RH, HiCC); SDValue HiEq = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ); SDValue LoOk = DAG.getSetCC(DL, MVT::i16, LL, RL, LoCCu); SDValue Tie = DAG.getNode(ISD::AND, DL, MVT::i16, HiEq, LoOk); Bool = DAG.getNode(ISD::OR, DL, MVT::i16, HiOk, Tie); } SDValue Zero = DAG.getConstant(0, DL, MVT::i16); return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain, DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest); } 
W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL); if (TCC == W65816CC::COND_INVALID) report_fatal_error("W65816: branch condition not yet implemented"); // Multi-branch CCs only have inserter support via SELECT_CC16. For // BR_CC, reroute through SETCC: materialise the boolean to A, then // branch on NE-vs-zero. One extra LDA but always works. if (TCC >= W65816CC::COND_GT_MB) { SDValue Bool = DAG.getNode(ISD::SETCC, DL, VT, LHS, RHS, DAG.getCondCode(CC)); SDValue Zero = DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain, DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest); } SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS); SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8); return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CCOp, Glue); } SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // setcc lhs, rhs, cc -> select_cc lhs, rhs, 1, 0, cc. // The SELECT_CC then re-enters LowerOperation and we lower it via the // diamond-CFG path. setBooleanContents(ZeroOrOne) means callers see // the result as a clean 0/1 value. SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc DL(Op); EVT VT = Op.getValueType(); // i32 SETCC: split into per-half compares. Result type is i16 (the // legalizer keeps the boolean result type narrow regardless of LHS // width). 
if (LHS.getValueType() == MVT::i32) { SDValue LL = extractWide32Lo(DAG, DL, LHS); SDValue LH = extractWide32Hi(DAG, DL, LHS); SDValue RL = extractWide32Lo(DAG, DL, RHS); SDValue RH = extractWide32Hi(DAG, DL, RHS); if (CC == ISD::SETEQ || CC == ISD::SETNE) { SDValue EqLo = DAG.getSetCC(DL, VT, LL, RL, ISD::SETEQ); SDValue EqHi = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ); SDValue Eq = DAG.getNode(ISD::AND, DL, VT, EqLo, EqHi); if (CC == ISD::SETNE) Eq = DAG.getNode(ISD::XOR, DL, VT, Eq, DAG.getConstant(1, DL, VT)); return Eq; } ISD::CondCode HiCC, LoCCu; switch (CC) { case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break; case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break; case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break; case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break; case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break; case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break; case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break; case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break; default: report_fatal_error("W65816: unexpected i32 SETCC condition"); } SDValue HiOk = DAG.getSetCC(DL, VT, LH, RH, HiCC); SDValue HiEq = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ); SDValue LoOk = DAG.getSetCC(DL, VT, LL, RL, LoCCu); SDValue Tie = DAG.getNode(ISD::AND, DL, VT, HiEq, LoOk); return DAG.getNode(ISD::OR, DL, VT, HiOk, Tie); } SDValue One = DAG.getConstant(1, DL, VT); SDValue Zero = DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, One, Zero, DAG.getCondCode(CC)); } SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDLoc DL(Op); // i32 SELECT_CC: synthesize an i16 boolean from the i32 compare via // LowerSETCC's i32 path, then select between 
the i32 halves driven // by the boolean. Avoids creating the i32 W65816::CMP we have no // pattern for. if (LHS.getValueType() == MVT::i32) { // Materialise the i16 boolean. SDValue Bool = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC); SDValue Zero = DAG.getConstant(0, DL, MVT::i16); if (Op.getValueType() == MVT::i32) { SDValue TLo = extractWide32Lo(DAG, DL, TVal); SDValue THi = extractWide32Hi(DAG, DL, TVal); SDValue FLo = extractWide32Lo(DAG, DL, FVal); SDValue FHi = extractWide32Hi(DAG, DL, FVal); SDValue Lo = DAG.getSelectCC(DL, Bool, Zero, TLo, FLo, ISD::SETNE); SDValue Hi = DAG.getSelectCC(DL, Bool, Zero, THi, FHi, ISD::SETNE); return buildWide32(DAG, DL, Lo, Hi); } return DAG.getSelectCC(DL, Bool, Zero, TVal, FVal, ISD::SETNE); } // SELECT_CC with i32 result (i16 LHS): split TVal/FVal into halves // and run a per-half i16 SELECT_CC sharing the same condition. if (Op.getValueType() == MVT::i32) { SDValue TLo = extractWide32Lo(DAG, DL, TVal); SDValue THi = extractWide32Hi(DAG, DL, TVal); SDValue FLo = extractWide32Lo(DAG, DL, FVal); SDValue FHi = extractWide32Hi(DAG, DL, FVal); SDValue Lo = DAG.getSelectCC(DL, LHS, RHS, TLo, FLo, CC); SDValue Hi = DAG.getSelectCC(DL, LHS, RHS, THi, FHi, CC); return buildWide32(DAG, DL, Lo, Hi); } W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL); if (TCC == W65816CC::COND_INVALID) report_fatal_error("W65816: select_cc condition not yet implemented"); SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS); SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8); // SDTypeProfile declares 1 result (the selected value). Earlier // code passed a 2-VT list (value + Glue) which was silently wrong // and trips an SDNode-validity assertion in assertions builds. SDValue Ops[] = {TVal, FVal, CCOp, Glue}; return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), Ops); } // i8 -> i16 sign extend. Branchless 3-instruction trick: // sext(x) = ((x & 0xFF) ^ 0x80) - 0x80 // Verify: x=0x00 -> 0x80 - 0x80 = 0x0000. 
x=0x7F -> 0xFF - 0x80 = 0x7F. // x=0x80 -> 0x00 - 0x80 = 0xFF80 (-128). x=0xFF -> 0x7F - 0x80 // = 0xFFFF (-1). // Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080 (10 bytes total, // no branches, no temp slots — much cheaper than the SELECT_CC diamond // version that produced ~14 instructions plus stack spills). SDValue W65816TargetLowering::LowerSignExtend(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); if (X.getValueType() != MVT::i8 || Op.getValueType() != MVT::i16) return SDValue(); SDLoc DL(Op); SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, X); SDValue Sign = DAG.getConstant(0x0080, DL, MVT::i16); SDValue Xor = DAG.getNode(ISD::XOR, DL, MVT::i16, Z, Sign); return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign); } // ptr32 foundation hook. In ptr16 mode (PointerWidth=16, current // default) addresses are i16 and we return SDValue() so the legalizer // keeps the load and the existing LDAptr / STAptr selection patterns // match. In ptr32 mode addresses are i32 and we wrap the load in // W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter // can take the bank byte from sub_hi instead of forcing 0. // // Byte loads (zextload, anyext, true i8) keep going through the i16 // LDA + AND #$FF idiom — same trick the existing LDAptr uses; for // ptr32 mode the load is still 16 bits, just bank-explicit. SDValue W65816TargetLowering::LowerLoad(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Ld = cast(Op); SDValue Chain = Ld->getChain(); SDValue Ptr = Ld->getBasePtr(); EVT VT = Op.getValueType(); SDLoc DL(Op); // Const-int address: leave the SDAG alone so the tablegen pattern // `(load (iPTR imm))` → LDA8long fires (bank-explicit). See the // mirrored short-circuit at the top of LowerStore. if (isa(Ptr) && (VT == MVT::i8 || VT == MVT::i16)) return SDValue(); // i32 LOAD: split into two i16 loads at offsets 0 and 2 then // REG_SEQUENCE the halves into a Wide32. 
Address may be i16 (stack // slot, global) or i32 (ptr32 deref); the recursive ADD handles // address arithmetic correctly via LowerI32Bin. if (VT == MVT::i32) { EVT PtrVT = Ptr.getValueType(); SDValue Two = DAG.getConstant(2, DL, PtrVT); SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two); SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr, Ld->getPointerInfo(), Ld->getAlign(), Ld->getMemOperand()->getFlags()); SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2, Ld->getPointerInfo().getWithOffset(2), Ld->getAlign(), Ld->getMemOperand()->getFlags()); SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); SDValue Val = buildWide32(DAG, DL, Lo, Hi); return DAG.getMergeValues({Val, NewChain}, DL); } // ptr16 mode: address is i16, let the default selection handle it. if (Ptr.getValueType() != MVT::i32) return SDValue(); EVT MemVT = Ld->getMemoryVT(); SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other); SDValue Ops[] = { Chain, Ptr }; SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops, MVT::i16, Ld->getMemOperand()); SDValue Val = LdNode; // Byte memory access: mask the high byte for zextload, leave anyext. if (MemVT == MVT::i8) { if (Ld->getExtensionType() == ISD::ZEXTLOAD) Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val, DAG.getConstant(0xFF, DL, MVT::i16)); else if (Ld->getExtensionType() == ISD::SEXTLOAD) Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val, DAG.getValueType(MVT::i8)); } // Narrow back to i8 if the consumer wanted i8. if (VT == MVT::i8) Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val); return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL); } // ZERO/SIGN/ANY_EXTEND i8/i16 -> i32: build a Wide32 from the i16 // payload and a 0 / sign-fill / undef high half. 
SDValue W65816TargetLowering::LowerExtend(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (Op.getValueType() != MVT::i32) return SDValue(); SDValue X = Op.getOperand(0); // Promote i8 inputs to i16 first via the same opcode. if (X.getValueType() == MVT::i8) X = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X); SDValue Lo = X; SDValue Hi; if (Op.getOpcode() == ISD::ZERO_EXTEND) { Hi = DAG.getConstant(0, DL, MVT::i16); } else if (Op.getOpcode() == ISD::SIGN_EXTEND) { // Sign-fill via SRA #15 — uses our SRA15A pattern (4 insns) and // stays i16-typed in both LHS and RHS, dodging the combiner's // shift-amount-promote when ptr32 makes pointer-typed shift // amounts i32. Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo, DAG.getConstant(15, DL, MVT::i16)); } else { Hi = DAG.getUNDEF(MVT::i16); } return buildWide32(DAG, DL, Lo, Hi); } // SIGN_EXTEND_INREG i32 with inner type i1/i8/i16: sign-extend the low // N bits of an i32 input to fill all 32 bits. The legalizer leaves // this op alone when i32 is legal — but no tablegen pattern matches // the i32 form, so without this Custom hook isel aborts with // "Cannot select: sign_extend_inreg ... ValueType:i1" on shapes like // `-(crc & 1ul)` in CRC32 loops. // // Strategy: for inner VT V (= i1 / i8 / i16), the low half's // `sext_inreg` (already pattern-matched at i16) produces the signed // i16 value — then sign-fill the high half via SRA #15 of the lo // result. SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue X = Op.getOperand(0); EVT InnerVT = cast(Op.getOperand(1))->getVT(); EVT ResVT = Op.getValueType(); // i16 result: replicate the existing tablegen patterns. We MUST // handle this case rather than returning SDValue(), because // setOperationAction's Custom-returns-SDValue() falls through to // default Expand (= SRA/SHL chain), not to tablegen pattern match. 
// The two existing patterns are: // (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1) // (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80 // Reproduce them at the SDAG level so the legalizer's Custom // dispatch returns a fully-lowered tree. if (ResVT == MVT::i16) { if (InnerVT == MVT::i1) { SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X, DAG.getConstant(1, DL, MVT::i16)); return DAG.getNode(ISD::SUB, DL, MVT::i16, DAG.getConstant(0, DL, MVT::i16), Bit); } if (InnerVT == MVT::i8) { SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X, DAG.getConstant(0xFF, DL, MVT::i16)); SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked, DAG.getConstant(0x80, DL, MVT::i16)); return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored, DAG.getConstant(0x80, DL, MVT::i16)); } // inner i16 = no-op. return X; } if (ResVT != MVT::i32) return SDValue(); // i32 result: project the input's low half (X is i32 Wide32 here), // apply the inner-VT sext on the i16 low half, sign-fill the hi. SDValue Lo = extractWide32Lo(DAG, DL, X); if (InnerVT != MVT::i16) { Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo, DAG.getValueType(InnerVT)); } // Sign-fill the hi half via SRA #15 — same idiom LowerExtend uses for // SIGN_EXTEND i16 -> i32. SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo, DAG.getConstant(15, DL, MVT::i16)); return buildWide32(DAG, DL, Lo, Hi); } // TRUNCATE i32 -> i16: project sub_lo. SDValue W65816TargetLowering::LowerTruncate(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (Op.getOperand(0).getValueType() != MVT::i32) return SDValue(); if (Op.getValueType() == MVT::i16) return extractWide32Lo(DAG, DL, Op.getOperand(0)); if (Op.getValueType() == MVT::i8) { // i32 -> i16 -> i8. The i8 trunc pattern is COPY_TO_REGCLASS at MC // level; the i16 sub_lo extract is the work. 
SDValue Lo16 = extractWide32Lo(DAG, DL, Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16); } return SDValue(); } // i32 Constant: split into two i16 constants and REG_SEQUENCE. SDValue W65816TargetLowering::LowerI32Constant(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (Op.getValueType() != MVT::i32) return SDValue(); uint64_t V = cast(Op)->getZExtValue(); SDValue Lo = DAG.getConstant(V & 0xFFFFu, DL, MVT::i16); SDValue Hi = DAG.getConstant((V >> 16) & 0xFFFFu, DL, MVT::i16); return buildWide32(DAG, DL, Lo, Hi); } // ADD/SUB/AND/OR/XOR i32 -> per-half i16 op. ADDC/ADDE chain for ADD, // SUBC/SUBE for SUB. AND/OR/XOR are independent halves. SDValue W65816TargetLowering::LowerI32Bin(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (Op.getValueType() != MVT::i32) return SDValue(); SDValue L = Op.getOperand(0); SDValue R = Op.getOperand(1); SDValue LL = extractWide32Lo(DAG, DL, L); SDValue LH = extractWide32Hi(DAG, DL, L); SDValue RL = extractWide32Lo(DAG, DL, R); SDValue RH = extractWide32Hi(DAG, DL, R); SDValue Lo, Hi; switch (Op.getOpcode()) { case ISD::AND: Lo = DAG.getNode(ISD::AND, DL, MVT::i16, LL, RL); Hi = DAG.getNode(ISD::AND, DL, MVT::i16, LH, RH); break; case ISD::OR: Lo = DAG.getNode(ISD::OR, DL, MVT::i16, LL, RL); Hi = DAG.getNode(ISD::OR, DL, MVT::i16, LH, RH); break; case ISD::XOR: Lo = DAG.getNode(ISD::XOR, DL, MVT::i16, LL, RL); Hi = DAG.getNode(ISD::XOR, DL, MVT::i16, LH, RH); break; case ISD::ADD: { SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue); SDValue Lo2 = DAG.getNode(ISD::ADDC, DL, VTs, LL, RL); Lo = Lo2.getValue(0); SDValue Carry = Lo2.getValue(1); Hi = DAG.getNode(ISD::ADDE, DL, VTs, LH, RH, Carry).getValue(0); break; } case ISD::SUB: { SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue); SDValue Lo2 = DAG.getNode(ISD::SUBC, DL, VTs, LL, RL); Lo = Lo2.getValue(0); SDValue Borrow = Lo2.getValue(1); Hi = DAG.getNode(ISD::SUBE, DL, VTs, LH, RH, Borrow).getValue(0); break; } default: return SDValue(); } 
return buildWide32(DAG, DL, Lo, Hi); } // Store companion to LowerLoad. For i32 addresses, dispatch to the // 16-bit ST_PTR or the byte-truncating STB_PTR target node based on // MemoryVT. For i16 addresses (ptr16 mode), bail out and let the // existing STAptr / STBptr patterns match. SDValue W65816TargetLowering::LowerStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *St = cast(Op); SDValue Chain = St->getChain(); SDValue Val = St->getValue(); SDValue Ptr = St->getBasePtr(); EVT MemVT = St->getMemoryVT(); SDLoc DL(Op); // Const-int address (`*(volatile uint8*)0xC035 = v`): leave the SDAG // alone so the tablegen pattern `(store Acc8, (iPTR imm))` → // STA8long fires. Without this short-circuit the i32-pointer code // below promotes the constant address into a Wide32 register pair // and routes through STBptr32 ([dp],Y), which is 16 B / 30 cyc and // (worse) bank-tracks DBR. if (isa(Ptr)) return SDValue(); // i32 STORE: split into two halves. Critical: the per-half stores // MUST go through the target-specific W65816ISD::ST_PTR node and not // through plain ISD::STORE, otherwise the SDAG combiner's // MergeConsecutiveStores re-combines them into a single i32 store // that re-enters LowerStore — infinite loop, OOM in the combiner. // For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular // store-merger doesn't trip there because address splitting via // ISD::ADD on i16 doesn't itself fan out into ptr-pair operations. if (Val.getValueType() == MVT::i32) { SDValue Lo = extractWide32Lo(DAG, DL, Val); SDValue Hi = extractWide32Hi(DAG, DL, Val); EVT PtrVT = Ptr.getValueType(); // ptr32 const-i32-addr fast path: `*(uint32_t*)0x5000 = v` should // lower to two STAabs (DBR-relative, 5 cyc each) instead of two // [dp],Y stores via ST_PTR. Detect Wide32-zero-hi Constant ptr, // emit two i16 stores at TargetConstant:i32 addrs. TargetConstant // (not Constant) so LowerI32Constant doesn't re-fire and recreate // the REG_SEQUENCE. 
The STAabs timm pattern matches. if (PtrVT == MVT::i32 && Ptr.getNode()->isMachineOpcode() && Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { SDValue PtrLo, PtrHi; for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) { if (auto *CIdx = dyn_cast(Ptr.getOperand(i + 1))) { if (CIdx->getZExtValue() == llvm::sub_lo) PtrLo = Ptr.getOperand(i); else if (CIdx->getZExtValue() == llvm::sub_hi) PtrHi = Ptr.getOperand(i); } } auto *PtrHiC = dyn_cast_or_null(PtrHi); auto *PtrLoC = dyn_cast_or_null(PtrLo); if (PtrLoC && PtrHiC && PtrHiC->getZExtValue() == 0) { uint64_t Base = PtrLoC->getZExtValue() & 0xFFFF; SDValue PLo = DAG.getTargetConstant(Base, DL, MVT::i32); SDValue PHi = DAG.getTargetConstant((Base + 2) & 0xFFFF, DL, MVT::i32); SDValue StLo = DAG.getStore(Chain, DL, Lo, PLo, St->getPointerInfo(), St->getAlign(), St->getMemOperand()->getFlags()); SDValue StHi = DAG.getStore(StLo, DL, Hi, PHi, St->getPointerInfo().getWithOffset(2), St->getAlign(), St->getMemOperand()->getFlags()); return StHi; } } SDValue Two = DAG.getConstant(2, DL, PtrVT); SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two); if (PtrVT == MVT::i32) { // ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially // chained. The combiner cannot merge target-opaque MemIntrinsic // stores. SDVTList VTs = DAG.getVTList(MVT::Other); SDValue OpsLo[] = { Chain, Lo, Ptr }; SDValue StLo = DAG.getMemIntrinsicNode( W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16, St->getMemOperand()); SDValue OpsHi[] = { StLo, Hi, Ptr2 }; MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand( St->getMemOperand(), 2, 2); SDValue StHi = DAG.getMemIntrinsicNode( W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi); return StHi; } // ptr16 path — emit two regular i16 stores serially chained so the // store-merger sees them as a 4-byte sequence (which it will likely // leave alone since the resulting i32 store has no legal target // pattern in ptr16 mode anyway). 
SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr, St->getPointerInfo(), St->getAlign(), St->getMemOperand()->getFlags()); SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2, St->getPointerInfo().getWithOffset(2), St->getAlign(), St->getMemOperand()->getFlags()); return StHi; } if (Ptr.getValueType() != MVT::i32) return SDValue(); // The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap // around STBptr32 narrows in memory. Promote i8 values to i16 with // ANY_EXTEND — the inserter only writes one byte, so the high half // is don't-care. if (Val.getValueType() == MVT::i8) Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val); unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR) : unsigned(W65816ISD::ST_PTR); SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Val, Ptr }; return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT, St->getMemOperand()); } // VAARG: load *ap, advance ap by sizeof(VT). Unlike the default // expansion, we do NOT align ap to the type's preferred alignment — // caller-pushed varargs land at byte-granular addresses (PHA from an // odd S leaves the low byte at S+1 which is even, but our prologue's // TSC-sequence can produce odd S, etc.). Aligning ap would skip the // pushed value's low byte. static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAListPtr = Op.getOperand(1); EVT VT = Op.getValueType(); // ap (va_list) is `char *` on this target — i16 under ptr16, i32 // under ptr32. Load and store it at PtrVT so we don't truncate and // lose the high half (under ptr32, hi=0 so the truncation read garbage // back, then the i16 store wrote i16 over the lo half but left an // unrelated value in the hi — silent miscompile of every variadic // call on ptr32). 
EVT PtrVT = VAListPtr.getValueType(); SDValue Ap = DAG.getLoad(PtrVT, DL, Chain, VAListPtr, MachinePointerInfo()); Chain = Ap.getValue(1); // For the actual data deref: under ptr16 we route i16 through // VAARG_LOAD (bank-0-explicit `[dp],Y`). Under ptr32, ap is already // a Wide32 ptr with hi=0 (caller set up the va_list to point into the // call-frame stack-args region, bank 0); a regular load through that // pointer routes to LDAptr32 / STBptr32 which already deref bank-0. SDValue Val; if (VT == MVT::i16 && PtrVT == MVT::i16) { SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other); Val = DAG.getNode(W65816ISD::VAARG_LOAD, DL, VTs, Chain, Ap); Chain = Val.getValue(1); } else { Val = DAG.getLoad(VT, DL, Chain, Ap, MachinePointerInfo()); Chain = Val.getValue(1); } // ap += sizeof(VT) (rounded up to whole bytes). unsigned Size = (VT.getSizeInBits() + 7) / 8; SDValue NewAp = DAG.getNode(ISD::ADD, DL, PtrVT, Ap, DAG.getConstant(Size, DL, PtrVT)); Chain = DAG.getStore(Chain, DL, NewAp, VAListPtr, MachinePointerInfo()); return DAG.getMergeValues({Val, Chain}, DL); } // VASTART: store the address of the first vararg slot (recorded by // LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer. // va_list is just `i16 *next` here — minimum implementation. static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, const W65816TargetLowering &TLI) { MachineFunction &MF = DAG.getMachineFunction(); auto *FuncInfo = MF.getInfo(); SDLoc DL(Op); // FrameIndex must be at PtrVT (i16 under ptr16, i32 under ptr32) so // the subsequent store writes the full pointer width. Under ptr32 // the i32 FI lowers via the i32 pointer-store path; the high half // is implicitly 0 (stack is bank 0) and stored alongside the lo. 
EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue VAFI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); SDValue Chain = Op.getOperand(0); SDValue VAListPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Chain, DL, VAFI, VAListPtr, MachinePointerInfo(SV)); } SDValue W65816TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: { // Custom-lower SELECT for i32 result: split into per-half // selects. Without this, the legalizer's default (rewriting // SELECT to SELECT_CC against zero) produces SELECT_CC i32 of // a different shape that re-enters Custom and creates a cycle. if (Op.getValueType() != MVT::i32) return SDValue(); SDValue Cond = Op.getOperand(0); SDValue TVal = Op.getOperand(1); SDValue FVal = Op.getOperand(2); SDLoc DL(Op); SDValue TLo = extractWide32Lo(DAG, DL, TVal); SDValue THi = extractWide32Hi(DAG, DL, TVal); SDValue FLo = extractWide32Lo(DAG, DL, FVal); SDValue FHi = extractWide32Hi(DAG, DL, FVal); SDValue Lo = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TLo, FLo); SDValue Hi = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, THi, FHi); return buildWide32(DAG, DL, Lo, Hi); } case ISD::SIGN_EXTEND: if (Op.getValueType() == MVT::i32) return LowerExtend(Op, DAG); return LowerSignExtend(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG, *this); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op, DAG); case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return LowerExtend(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSignExtendInReg(Op, DAG); case ISD::TRUNCATE: return LowerTruncate(Op, DAG); case ISD::ADD: 
case ISD::SUB: case ISD::AND: case ISD::OR: case ISD::XOR: return LowerI32Bin(Op, DAG); case ISD::LOAD: return LowerLoad(Op, DAG); case ISD::STORE: return LowerStore(Op, DAG); case ISD::Constant: return LowerI32Constant(Op, DAG); // SJLJ EH: setup_dispatch is a no-op on this target — the dispatcher // logic lives entirely in the SJLJ runtime (_Unwind_SjLj_Resume + // longjmp into the function context's jmp_buf). The isel layer // doesn't need to emit any code; just thread the chain through. case ISD::EH_SJLJ_SETUP_DISPATCH: return Op.getOperand(0); case ISD::DYNAMIC_STACKALLOC: return LowerDynamicStackalloc(Op, DAG); case ISD::STACKSAVE: { // Return Constant 0 — SJLJ stores this into the function context // but our setjmp/longjmp manage SP directly, so the value is dead. SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Chain = Op.getOperand(0); SDValue Result; if (VT == MVT::i16) Result = DAG.getConstant(0, DL, MVT::i16); else Result = buildWide32(DAG, DL, DAG.getConstant(0, DL, MVT::i16), DAG.getConstant(0, DL, MVT::i16)); return DAG.getMergeValues({Result, Chain}, DL); } case ISD::STACKRESTORE: // No-op — pass the chain through. return Op.getOperand(0); case ISD::FRAMEADDR: { // FRAMEADDR(N): SJLJ uses N=0 (current frame). We don't reserve a // frame pointer and SP isn't trivially CopyFromReg-able (no // register class). Return Constant 0 — SJLJ uses it as an opaque // per-frame identifier; the SJLJ runtime tracks frames by jmp_buf // chaining (FnCtx::prev) rather than by FRAMEADDR value, so a // constant works for single-throw / non-nested-catch programs. // True multi-frame SJLJ would need a TSC-based unique value. 
SDLoc DL(Op); EVT VT = Op.getValueType(); if (VT == MVT::i16) return DAG.getConstant(0, DL, MVT::i16); SDValue Lo = DAG.getConstant(0, DL, MVT::i16); SDValue Hi = DAG.getConstant(0, DL, MVT::i16); return buildWide32(DAG, DL, Lo, Hi); } default: Op.dump(); llvm_unreachable("W65816: unexpected operation in LowerOperation"); } } std::pair W65816TargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // Strip leading '{' and trailing '}' for the long form. StringRef C = Constraint; if (C.size() >= 2 && C.front() == '{' && C.back() == '}') C = C.substr(1, C.size() - 2); if (VT == MVT::i8) { if (C == "a") return {W65816::A, &W65816::Acc8RegClass}; if (C == "x") return {W65816::X, &W65816::Idx8RegClass}; if (C == "y") return {W65816::Y, &W65816::Idx8RegClass}; if (C == "r") return {W65816::A, &W65816::Acc8RegClass}; } else { // i16 default; pointer types fold here too if (C == "a") return {W65816::A, &W65816::Acc16RegClass}; if (C == "x") return {W65816::X, &W65816::Idx16RegClass}; if (C == "y") return {W65816::Y, &W65816::Idx16RegClass}; if (C == "r") return {W65816::A, &W65816::Acc16RegClass}; } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const { // (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain). // Lowered as: stash entry SP -> DP $F4 (handled by emitPrologue when // MFI.hasVarSizedObjects), then `tsc; sec; sbc size; tcs; inc a`. // The epilogue restores SP from $F4. // // Limitation: any FrameIndex (local, spill slot, parameter) accessed // *after* the alloca reads from a wrong stack-relative offset because // PEI bakes FI offsets relative to the static-frame SP, not the // post-alloca SP. A real frame pointer would lift this; for now we // accept the limitation and document it. 
The simplest safe pattern // is "VLA at end of function, used immediately, no further FI access"; // anything else is at-your-own-risk until FP support lands. SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL, DAG.getVTList(MVT::i16, MVT::Other), Chain, Size); SDValue Ptr = ChainAndPtr.getValue(0); SDValue NewChain = ChainAndPtr.getValue(1); return DAG.getMergeValues({Ptr, NewChain}, DL); } SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT // (preserves sign for arithmetic right shift); SHL/SRL via ZEXT // (logical / left shifts don't care about high bits). This routes // i8 shifts through the same i16 fast paths and libcalls — no // parallel qi3 libcall set needed. The DAG combiner would otherwise // narrow `(trunc (shl (zext X), K))` back to `(shl X, K)` of i8, // re-entering this hook in an infinite loop; the // `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override above // disables that combine. if (Op.getValueType() == MVT::i8) { SDLoc DL(Op); SDValue X = Op.getOperand(0); SDValue N = Op.getOperand(1); unsigned Ext = (Op.getOpcode() == ISD::SRA) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue X16 = DAG.getNode(Ext, DL, MVT::i16, X); SDValue N16 = N.getValueType() == MVT::i16 ? N : DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N); // Special case: i8 SRA by 7 of a sign-extended value is the // sign-fill operation — every result bit is the input's bit 7. // For sext(i8 x), bit 15 == bit 7, so `(sra (sext x), 7)` yields // the same result as `(sra (sext x), 15)`, which we have a tight // 4-insn pattern for via SRA15A. Avoids the __ashrhi3 libcall // (~10 insns plus arg push/pop overhead) — abs8 dropped from 47 // to 35 insns with this rewrite in place. 
if (Op.getOpcode() == ISD::SRA) { if (auto *C = dyn_cast(N)) { if (C->getZExtValue() == 7) { N16 = DAG.getConstant(15, DL, MVT::i16); } } } SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16); } // Fast path: shift-by-{1,2,3,4} have inline tablegen patterns. Return // Op (the unchanged node) so the legalizer leaves it alone — the // pattern matcher catches it later. Returning SDValue() instead // would fall through to the generic Expand path, which generates a // BUILD_VECTOR-based magic-constant rewrite that we can't lower. // Also allow `(srl x, 15)` through — pattern SRL15A handles it as // `ASL A; LDA #0; ROL A` (3 bytes), much shorter than the libcall. // The type-legalizer's i32-shift-by-1 expansion emits this exact // node for the high-half "bit-from-low" slot. // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3). // i16 only — i32 always routes to libcall (no inline i32 patterns). SDValue Amount = Op.getOperand(1); if (Op.getValueType() == MVT::i16) { if (auto *C = dyn_cast(Amount)) { uint64_t N = C->getZExtValue(); if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) && N >= 1 && N <= 14) return Op; if (N == 15 && (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL)) return Op; if (N == 1 && Op.getOpcode() == ISD::SRA) return Op; if (N == 15 && Op.getOpcode() == ISD::SRA) return Op; } } bool IsI32 = Op.getValueType() == MVT::i32; RTLIB::Libcall LC; switch (Op.getOpcode()) { case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break; case ISD::SRL: LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16; break; case ISD::SRA: LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16; break; default: llvm_unreachable("not a shift"); } // makeLibCall wants the args as TargetLowering::ArgListEntry; the // simpler getNode form is to manually build the call. But the // makeLibCall helper handles the calling convention. 
SmallVector Args = {Op.getOperand(0), Op.getOperand(1)}; TargetLowering::MakeLibCallOptions Opts; Opts.setIsSigned(Op.getOpcode() == ISD::SRA); return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first; } SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { auto *GA = cast(Op); SDLoc DL(Op); EVT PtrVT = Op.getValueType(); // i16 in ptr16 mode, i32 in ptr32 mode if (PtrVT == MVT::i32) { // i32 GlobalAddress: build Wide32 from (i16 offset, i16 bank). // The i16 offset goes through W65816ISD::Wrapper as before — IMM16 // cRELOC rewrites the offset under Loader. The bank half is set to // 0 here, but crt0Gsos's $BE-init or a future per-pointer bank // relocation can be threaded through. TODO: wire bank cRELOC. SDValue OffTgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16, GA->getOffset()); SDValue Lo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, OffTgt); SDValue Hi = DAG.getConstant(0, DL, MVT::i16); return buildWide32(DAG, DL, Lo, Hi); } SDValue Tgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, PtrVT, GA->getOffset()); return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt); } SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { auto *ES = cast(Op); SDLoc DL(Op); EVT PtrVT = Op.getValueType(); if (PtrVT == MVT::i32) { SDValue OffTgt = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16); SDValue Lo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, OffTgt); SDValue Hi = DAG.getConstant(0, DL, MVT::i16); return buildWide32(DAG, DL, Lo, Hi); } SDValue Tgt = DAG.getTargetExternalSymbol(ES->getSymbol(), PtrVT); return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt); } SDValue W65816TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // ABI: first i16/i8 argument is passed in A; remaining arguments are // pushed by the caller 
right-to-left and read via stack-relative // addressing. After JSL pushes 3 bytes of return address, the layout // viewed from the callee is: // (high addr) arg N-1 // ... // arg 1 // ret-addr-bank <- (4,S) when M=0 // ret-addr-hi <- (3,S) // ret-addr-lo <- (2,S) // (low addr) <- (1,S) // // Each i16 stack arg occupies 2 bytes. arg 1 lives at (4,S). MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); // i32 first-arg ABI. Two flavors as in LowerCall: // - Legal-i32 (Wide32 reg class registered): single i32 InputArg. // - Split-i32 (legacy): two i16 InputArgs both with OrigArgIndex==0. bool I32SplitFirstArg = Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0; // True iff the FIRST original arg spans 4 i16s (i.e., is i64). Used // below to choose the Img16-via-STX_DP X-arg path for i64 callees, // which dodges greedy's TXA-bridge-clobbers-A spill bug. i32-first // doesn't get the same treatment because the change pessimizes // simple functions like `int add32(int a, int b) { return a+b; }` // where greedy's regular A:X handling is fine. // Two shapes for i64-first-arg under different ptr modes: // ptr16 (i32 illegal): Ins[0..3] = 4 i16 halves of arg0 // ptr32 (i32 legal): Ins[0..1] = 2 i32 halves of arg0 — but the // IR-level "single i64 first arg" still splits // to 4 i16 in Outs/Ins because i64 isn't legal. // So the i16-form detection still applies here. bool I64FirstArg = Ins.size() >= 4 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && Ins[2].VT == MVT::i16 && Ins[3].VT == MVT::i16 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0 && Ins[2].OrigArgIndex == 0 && Ins[3].OrigArgIndex == 0; // Also detect the i32-split shape: Ins[0..1] = 2 i32 halves of arg0 // (with OrigArgIndex==0 on both). This happens with ptr32 active and // i64 legalized via i32-split rather than i16-quad-split. 
if (!I64FirstArg && Ins.size() >= 2 && Ins[0].VT == MVT::i32 && Ins[1].VT == MVT::i32 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0) I64FirstArg = true; unsigned ArgIdx = 0; // Stack offset is measured from S+1 (the WDC convention) and grows // upward as we walk through the stack-passed args. unsigned StackOffset = 4; // Skip 3 ret-addr bytes; first slot at S+4. for (const ISD::InputArg &Arg : Ins) { MVT VT = Arg.VT; if (VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i32) report_fatal_error("W65816: argument type not yet supported"); if (ArgIdx == 0 && VT == MVT::i32) { // Whole-i32 first arg: lo half live-in via $a, hi via $x. // The W65816LowerWide32 pre-RA pass walks the resulting // REG_SEQUENCE and rewrites Wide32 uses into pairs of i16 // operations — keeping AX32 out of the regalloc's pair- // allocation path entirely. // For i64-first-arg signatures (the IR has a single i64 arg // that splits to 2 i32 in Ins[0..1] under ptr32), route BOTH // halves through Img16. Without this the regalloc emits // `TXA; STA spill_X; STA spill_A` at function entry — the TXA // clobbers $a (arg0_0) before the A-spill saves it, so both // spill slots end up holding arg0_1. Caused __adddf3(1.5,2.5) // → 1.5 because the cb-test path read TXA-corrupted A. // Route the hi half through Img16 (DP-backed) for whole-i32 first // args. The Idx16 (X-only) class collapses through the W65816LowerWide32 // pre-RA pass to plain Acc16, after which regalloc treats both halves // as competing for $a — a TXA at the top of any non-trivial function // body destroys arg0_lo before it's spilled (silent miscompile of // every i32-arg function with > a few uses). Img16 forces an // STX_DP at function entry, immune to A-reuse. i64-first already // did this; under ptr32 the same hazard hits any i32 arg. const TargetRegisterClass *VRegLoRC = I64FirstArg ? 
&W65816::Img16RegClass : &W65816::Acc16RegClass; const TargetRegisterClass *VRegHiRC = &W65816::Img16RegClass; Register VRegLo = MRI.createVirtualRegister(VRegLoRC); Register VRegHi = MRI.createVirtualRegister(VRegHiRC); MRI.addLiveIn(W65816::A, VRegLo); MRI.addLiveIn(W65816::X, VRegHi); SDValue Lo = DAG.getCopyFromReg(Chain, DL, VRegLo, MVT::i16); SDValue Hi = DAG.getCopyFromReg(Chain, DL, VRegHi, MVT::i16); InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); } else if (ArgIdx == 0) { // First arg in A. For i64-first-arg signatures (4 i16 halves of // arg0 with OrigArgIndex==0), route arg0_0 through Img16 the same // way ArgIdx==1 does — via an entry STA-to-DP-slot at function // entry. Without this, the regalloc emits a TXA bridge for // arg0_1's spill that clobbers $a (= arg0_0) BEFORE arg0_0 has // been saved, and BOTH arg0_0 and arg0_1's spill slots end up // holding arg0_1. Observed as `__adddf3(1.5, 2.5) → 1.5` because // the cb-test BEQ sees flags from a TXA-clobbered LDA cb path. const TargetRegisterClass *RC = (VT == MVT::i16) ? (I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass) : &W65816::Acc8RegClass; Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); } else if (ArgIdx == 1 && I32SplitFirstArg) { // First-arg hi half (or arg0_ml for i64-first-arg): in X. // For i64-first-arg signatures (4 i16s with OrigArgIndex 0), use // Img16 so greedy parks the value in an IMG slot via STX_DP, // dodging the TXA-bridge-clobbers-A spill bug. i32-first stays // on the original Idx16 path because the change pessimizes // simple cases (verified: vprintf's writeULong/__udivsi3 chain // crashes if i32-first is also rerouted). Caught by udivmod. const TargetRegisterClass *RC = I64FirstArg ? 
&W65816::Img16RegClass : &W65816::Idx16RegClass; Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::X, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); } else if (VT == MVT::i32) { // i32 stack arg: 4 bytes, loaded as 2 i16 halves and assembled // via REG_SEQUENCE into a Wide32 SDValue. int FILo = MFI.CreateFixedObject(2, StackOffset, /*Immutable*/true); int FIHi = MFI.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true); StackOffset += 4; SDValue FINLo = DAG.getFrameIndex(FILo, MVT::i16); SDValue FINHi = DAG.getFrameIndex(FIHi, MVT::i16); SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, FINLo, MachinePointerInfo::getFixedStack(MF, FILo)); SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, FINHi, MachinePointerInfo::getFixedStack(MF, FIHi)); InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); } else { // Subsequent args are loaded from the stack. i8 args are // promoted to i16 slots (matching CC_W65816's CCPromoteToType) // so the load can run in the function's default 16-bit M mode // without needing a per-byte SEP/REP wrap; we then truncate the // i16 back to i8 for the IR. i16 args are loaded directly. unsigned ObjSize = 2; int FI = MFI.CreateFixedObject(ObjSize, StackOffset, /*Immutable*/true); StackOffset += ObjSize; SDValue FIN = DAG.getFrameIndex(FI, MVT::i16); SDValue Val = DAG.getLoad( MVT::i16, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); if (VT == MVT::i8) Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val); InVals.push_back(Val); } ++ArgIdx; } // Vararg support: stash the FrameIndex of the next stack-arg slot // (where the caller's first vararg lives) so VASTART can use it // as the va_list start. StackOffset has been advanced past every // named stack arg; the first vararg sits at SP + StackOffset. 
if (IsVarArg) {
    int FI = MFI.CreateFixedObject(2, StackOffset, /*Immutable=*/true);
    // NOTE(review): the template argument was missing in this revision and
    // has been restored from the included header's name; confirm the class
    // declared in W65816MachineFunctionInfo.h.
    auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
    FuncInfo->setVarArgsFrameIndex(FI);
  }

  return Chain;
}

/// Lower an outgoing call.
///
/// Argument ABI implemented here:
///   - arg 0 travels in A (an i32 / split-i32 first arg uses A for the low
///     half and X for the high half);
///   - every remaining arg is pushed in REVERSE order via PUSH16 (PHA), so
///     the callee's `(4,S)` read picks up the first stack arg, `(6,S)` the
///     next one, and so on. i8 stack args are widened to 2-byte slots and
///     i32 stack args take 4 bytes.
/// CALLSEQ_START records the pushed byte count; ADJCALLSTACKUP after the
/// call emits `tsc;clc;adc #N;tcs` to release the pushed bytes
/// (eliminateCallFramePseudoInstr).
///
/// Return values are read back from A, X, Y and DPF0 (in that order) and
/// appended to \p InVals in the shape described by CLI.Ins.
///
/// \param CLI    call description; CLI.IsTailCall is cleared on return.
/// \param InVals receives one SDValue per entry of CLI.Ins.
/// \returns the chain after the call and all return-value copies.
SDValue
W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  auto &Outs = CLI.Outs;
  auto &OutVals = CLI.OutVals;
  auto &Ins = CLI.Ins;

  // This lowering always emits a regular call sequence; report that back to
  // the caller unconditionally.
  CLI.IsTailCall = false;

  // Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X;
  // i64 in A:X:Y plus DP $F0..$F1 for the highest half. See the
  // LowerReturn comment for the ABI.
  if (Ins.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // Indirect calls (function pointers): redirect through the runtime
  // trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
  // we store the dynamic target to a global (`__indirTarget`), then
  // JSL the trampoline, which immediately does `JMP (__indirTarget)`.
  // The target's RTL pops the original JSL's return frame and returns
  // straight back to the caller — no double-RTL or extra frame.
  // Caveat: single-bank only (JMP indirect is bank-local).
  const bool IsIndirect = !isa<GlobalAddressSDNode>(Callee) &&
                          !isa<ExternalSymbolSDNode>(Callee);
  if (IsIndirect) {
    // Store the dynamic target to __indirTarget *before* any other
    // setup, since pushing args clobbers A. STAabs takes an
    // ExternalSymbol-wrapped address operand.
    SDValue TargetSym = DAG.getTargetExternalSymbol("__indirTarget", MVT::i16);
    SDValue WrappedSym =
        DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, TargetSym);
    Chain = DAG.getStore(Chain, DL, Callee, WrappedSym, MachinePointerInfo());
    // Replace the callee with __jsl_indir for the actual JSL.
    Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16);
  }

  // Only i8 / i16 / i32 outgoing values are handled below.
  for (const ISD::OutputArg &O : Outs) {
    if (O.VT != MVT::i16 && O.VT != MVT::i8 && O.VT != MVT::i32)
      report_fatal_error("W65816: argument type not yet supported");
  }

  // i32 first-arg ABI. Two flavors:
  //  - Legal-i32: Outs[0].VT == i32 (whole pair). Low half in A, high in X.
  //  - Split-i32 (legacy): Outs[0]/Outs[1] both i16 with OrigArgIndex==0.
  //    Pass low in A, high in X.
  // The two shapes are mutually exclusive (Outs[0] is either i32 or i16).
  const bool I32WholeFirstArg = !Outs.empty() && Outs[0].VT == MVT::i32;
  const bool I32SplitFirstArg =
      Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 &&
      Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0;
  // Index of the first Outs entry that goes on the stack: the split form
  // consumes two entries for the register-passed arg, every other form one.
  const unsigned FirstStackArg = I32SplitFirstArg ? 2 : 1;

  // i8 stack args are promoted to i16 (2-byte slots) so the callee can
  // read them with a 16-bit M load — matches LowerFormalArguments and
  // CC_W65816's CCPromoteToType. i32 stack args occupy 4 bytes
  // (2 PUSH16s).
  unsigned StackBytes = 0;
  for (unsigned i = FirstStackArg; i < Outs.size(); ++i)
    StackBytes += (Outs[i].VT == MVT::i32) ? 4 : 2;

  Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);

  // Push stack-passed args in reverse so arg FirstStackArg ends up at
  // the lowest post-JSL stack-relative offset (4,S). Each push uses A
  // by default; if the value being pushed is already a `CopyFromReg X`
  // (e.g. forwarding the i32-first-arg-in-A:X hi half), push directly
  // from X via PHX — saves the TXA + A-spill round-trip that would
  // otherwise be required.
  SDValue Glue;

  // Helper: push a single i16-sized value, threading Chain and Glue.
  auto pushI16 = [&](SDValue V) {
    // Decide whether V is known to already live in X: either a copy from
    // the physical X register, or from an Idx16 vreg that is a live-in of X.
    bool ViaX = false;
    if (V.getOpcode() == ISD::CopyFromReg) {
      if (auto *RegN = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode())) {
        Register R = RegN->getReg();
        if (R.isPhysical() && R == W65816::X) {
          ViaX = true;
        } else if (R.isVirtual()) {
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          if (MRI.getRegClass(R) == &W65816::Idx16RegClass) {
            for (auto &LI : MRI.liveins()) {
              if (LI.second == R && LI.first == W65816::X) {
                ViaX = true;
                break;
              }
            }
          }
        }
      }
    }

    const SDVTList PushVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    if (ViaX) {
      Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue);
      Glue = Chain.getValue(1);
      Chain = DAG.getNode(W65816ISD::PUSH_X, DL, PushVTs, Chain, Glue);
    } else {
      Chain = DAG.getCopyToReg(Chain, DL, W65816::A, V, Glue);
      Glue = Chain.getValue(1);
      Chain = DAG.getNode(W65816ISD::PUSH, DL, PushVTs, Chain, Glue);
    }
    Glue = Chain.getValue(1);
  };

  // Walk the stack args from last to first.
  for (unsigned i = Outs.size(); i-- > FirstStackArg;) {
    SDValue V = OutVals[i];
    if (Outs[i].VT == MVT::i32) {
      // Push i32 stack arg: hi half first (lands at higher address),
      // lo half second (lands at lower address = the slot the callee
      // reads as the start of the i32).
      SDValue Lo = extractWide32Lo(DAG, DL, V);
      SDValue Hi = extractWide32Hi(DAG, DL, V);
      pushI16(Hi);
      pushI16(Lo);
      continue;
    }
    if (Outs[i].VT == MVT::i8)
      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
    pushI16(V);
  }

  // i32 first-arg. Whole (legal-i32): split into lo/hi and copy
  // to $a/$x separately — avoids AX32 in the MIR (see
  // W65816LowerWide32). Split-i32 (legacy 2-i16): hi in X first,
  // then lo in A below.
  if (I32WholeFirstArg) {
    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
    Glue = Chain.getValue(1);
    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
    Glue = Chain.getValue(1);
  } else if (I32SplitFirstArg) {
    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
    Glue = Chain.getValue(1);
  }

  // Arg 0 in A — only for non-whole-i32 first-arg. Whole-i32
  // already copied to A/X above.
  if (!I32WholeFirstArg && !OutVals.empty()) {
    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
    Glue = Chain.getValue(1);
  }

  // Callee target type must match iPTR (i16 in ptr16, i32 in ptr32).
  // The CALL SDNode's operand-type profile (SDT_W65816Call) is iPTR;
  // hardcoding MVT::i16 here mismatches under p:32:16.
  EVT CalleeVT = getPointerTy(DAG.getDataLayout());
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee))
    Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, CalleeVT);
  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), CalleeVT);

  // Operands of the CALL node: chain, callee, the argument registers that
  // were set up above, and finally the glue (if any copy/push was emitted).
  SmallVector<SDValue, 8> CallOps = {Chain, Callee};
  if (I32WholeFirstArg) {
    CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
    CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
  } else if (!OutVals.empty()) {
    CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
    if (I32SplitFirstArg)
      CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
  }
  if (Glue.getNode())
    CallOps.push_back(Glue);

  Chain = DAG.getNode(W65816ISD::CALL, DL,
                      DAG.getVTList(MVT::Other, MVT::Glue), CallOps);
  Glue = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
  Glue = Chain.getValue(1);

  // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 as two
  // i16 halves in A (lo) and X (hi), and 4-half results (i64 / 2x i32) in
  // A, X, Y, DPF0. An i32 entry of Ins is rebuilt from its half pair
  // (A:X for the first, Y:DPF0 for a second pair of halves). Reading the
  // halves separately keeps AX32 out of the SDAG / MIR — see the
  // W65816LowerWide32 pass. A single whole-i32 return needs no special
  // casing: it is simply the two-half instance of this loop.
  //
  // Build a flat list of the halves expected from the call ...
  SmallVector<MVT, 4> ExpVT;
  for (const ISD::InputArg &In : Ins) {
    const MVT VT = In.VT;
    if (VT == MVT::i32) {
      ExpVT.push_back(MVT::i16);
      ExpVT.push_back(MVT::i16);
    } else if (VT == MVT::i16 || VT == MVT::i8) {
      ExpVT.push_back(VT);
    } else {
      report_fatal_error("W65816: return half must be i8/i16/i32");
    }
  }
  if (ExpVT.size() > 4)
    report_fatal_error("W65816: return type wider than 64 bits not supported");

  // ... then copy them out of A, X, Y, DPF0 in order.
  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
                                          W65816::DPF0};
  SmallVector<SDValue, 4> Halves;
  for (unsigned i = 0; i != ExpVT.size(); ++i) {
    SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], ExpVT[i], Glue);
    Chain = V.getValue(1);
    Glue = V.getValue(2);
    Halves.push_back(V);
  }

  // Re-pack halves into the original Ins shape (i32s are rebuilt via
  // buildWide32; i8/i16 pass through).
  unsigned NextHalf = 0;
  for (const ISD::InputArg &In : Ins) {
    if (In.VT == MVT::i32) {
      InVals.push_back(
          buildWide32(DAG, DL, Halves[NextHalf], Halves[NextHalf + 1]));
      NextHalf += 2;
    } else {
      InVals.push_back(Halves[NextHalf]);
      NextHalf += 1;
    }
  }

  return Chain;
}

SDValue W65816TargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    SelectionDAG &DAG) const {
  // Return ABI:
  //   i8/i16: value in A.
  //   i32:    low half (Outs[0]) in A, high half (Outs[1]) in X.
// i64: halves in A, X, Y, and a fixed direct-page slot at $F0..$F1 // (Outs[0..2] -> A,X,Y; Outs[3] stored to the DP slot). // wider: not yet supported. // Type legalization splits an i32 into 2 consecutive i16 Outs and an // i64 into 4. Emission order matters: we copy the *highest* halves // first so that the regalloc can place each through A (the only // ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves // A, so subsequent low-half copies to A don't clobber. // With i32 legal, an Outs entry may be MVT::i32; we expand each i32 // into its two i16 halves (sub_lo/sub_hi via EXTRACT_SUBREG) so the // legacy A/X/Y/DPF0 4-half return ABI continues to work for the // multi-half return cases (i64 returned as 2 i32, struct of 2 long // returned as 2 i32, etc.). SmallVector ExpVT; SmallVector ExpVals; for (unsigned i = 0; i != Outs.size(); ++i) { MVT VT = Outs[i].VT; if (VT == MVT::i32) { ExpVT.push_back(MVT::i16); ExpVT.push_back(MVT::i16); ExpVals.push_back(extractWide32Lo(DAG, DL, OutVals[i])); ExpVals.push_back(extractWide32Hi(DAG, DL, OutVals[i])); } else if (VT == MVT::i16 || VT == MVT::i8) { ExpVT.push_back(VT); ExpVals.push_back(OutVals[i]); } else { report_fatal_error("W65816: return half must be i8/i16/i32"); } } if (ExpVT.size() > 4) report_fatal_error("W65816: return type wider than 64 bits not supported"); // Single whole-i32 return: copy directly to AX32 instead of two // halves to A and X. Saves the regalloc/coalescer some work. bool I32WholeReturn = (Outs.size() == 1 && Outs[0].VT == MVT::i32); SDValue Glue; SmallVector RetOps(1, Chain); if (I32WholeReturn) { // Split the i32 OutVal into lo/hi and copy each separately to // $a / $x (no AX32 in the SDAG — see W65816LowerWide32). 
SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]); SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]); Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue); Glue = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(W65816::A, MVT::i16)); RetOps.push_back(DAG.getRegister(W65816::X, MVT::i16)); RetOps[0] = Chain; if (Glue.getNode()) RetOps.push_back(Glue); return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps); } // Outs[3] -> DP $F0 via CopyToReg(DPF0). Using the DPF0 fake physreg // (lowered to `STA $F0` by copyPhysReg) is critical: a generic // ISD::STORE with addr=0xF0 lowered to `sta (d,s),y`, an indirect // through the DBR, which silently misbehaved when DBR != 0. STA dp // uses D + dp directly and is unaffected by DBR. Done first so its // computation can use A freely before A holds the low result. Glued // to RET_GLUE via the RetOps Register entry below so DCE doesn't // strip the COPY. // Use the expanded i16-half list (i32 outs split into 2 i16 halves). 
if (ExpVals.size() >= 4) { Chain = DAG.getCopyToReg(Chain, DL, W65816::DPF0, ExpVals[3], Glue); Glue = Chain.getValue(1); } if (ExpVals.size() >= 3) { Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, ExpVals[2], Glue); Glue = Chain.getValue(1); } if (ExpVals.size() >= 2) { Chain = DAG.getCopyToReg(Chain, DL, W65816::X, ExpVals[1], Glue); Glue = Chain.getValue(1); } if (!ExpVals.empty()) { Chain = DAG.getCopyToReg(Chain, DL, W65816::A, ExpVals[0], Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(W65816::A, ExpVT[0])); } if (ExpVals.size() >= 2) RetOps.push_back(DAG.getRegister(W65816::X, ExpVT[1])); if (ExpVals.size() >= 3) RetOps.push_back(DAG.getRegister(W65816::Y, ExpVT[2])); if (ExpVals.size() >= 4) RetOps.push_back(DAG.getRegister(W65816::DPF0, ExpVT[3])); RetOps[0] = Chain; if (Glue.getNode()) RetOps.push_back(Glue); return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps); } SDValue W65816TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { // (shl i32 X, K) -> chain of K (add x, x) for small K. After type // legalisation the i32 add splits via ADDC/ADDE pseudos which expand // to native ASL/ROL + carry-chain — much cheaper than the type- // legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick // to compute the bit crossing the half boundary. Each ADD expands to // ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for // K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2. // `x*N` (which the combiner canonicalises pow-of-2 muls to `x< ADD chain for small K — but only when i32 is // ILLEGAL (i.e., gets type-split into i16 halves). When i32 is a // legal type (Wide32 reg class for ptr32 mode), the rewrite cycles // against LLVM's generic `(add x, x) -> (shl x, 1)` combine in the // i64 → 2 i32 split path, hanging the legalizer. // STORE / LOAD with ConstantSDNode ptr (e.g. 
`*(volatile uint8*)0xC035 = v`): // wrap the immediate in a W65816ISD::WRAPPER (using a TargetGlobalAddress- // like marker would be cleaner but we lack the symbol table). Re-issue // the store/load with the same ptr but the constant marked TargetConstant // — TargetConstant is opaque to LowerI32Constant, so it survives intact // to ISel, where the existing tablegen pattern // `(store Acc8, (iPTR imm)) -> STA8long` // matches (`imm` accepts both Constant and TargetConstant). 4 B / 6 cyc // bank-explicit `sta long` instead of 16 B / 30 cyc [dp],Y. // Wide32-of-Wrapper-with-zero-hi → i16 Wrapper. Under p:32:16, // LowerGlobalAddress builds GlobalAddress as a Wide32 reg pair // `(REG_SEQUENCE Wrapper(off_i16), 0_i16)`. Stores/loads against // this Wide32 ptr fall to the heavy [dp],Y path (16 B / 30 cyc) // even when the bank half is the constant 0 — we want the cheap // DBR-relative `sta g` / `lda g` (3 B / 5 cyc). Detect the shape // and recombine the ptr to its 16-bit form so the existing // tablegen `(store v, (Wrapper tglob))` → STAabs / `(load (Wrapper // tglob))` → LDAabs patterns fire. Crucially, this is correct // ONLY when bank=0 — under GS/OS Loader, DBR is set to our bank // by crt0Gsos, so DBR-relative addressing reaches the same global. // Returns either an i16 Wrapper (drop into i16 STAabs/LDAabs pattern) // or a TargetConstant:i32 (for const-addr i16 stores so the timm // pattern fires and produces STAabs). TargetConstant — not regular // Constant — because LowerI32Constant only matches ISD::Constant; if // we returned a fresh ConstantSDNode it would re-fire LowerI32Constant // and produce another Wide32 REG_SEQUENCE → infinite combine loop. 
auto unwrapWide32WithZeroHi = [&](SDValue Ptr) -> SDValue { if (Ptr.getValueType() != MVT::i32) return SDValue(); if (!Ptr.getNode()->isMachineOpcode()) return SDValue(); if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue(); SDValue Lo, Hi; for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) { auto *CIdx = dyn_cast(Ptr.getOperand(i + 1)); if (!CIdx) continue; if (CIdx->getZExtValue() == llvm::sub_lo) Lo = Ptr.getOperand(i); else if (CIdx->getZExtValue() == llvm::sub_hi) Hi = Ptr.getOperand(i); } if (!Lo || !Hi) return SDValue(); auto *HiC = dyn_cast(Hi); if (!HiC || HiC->getZExtValue() != 0) return SDValue(); if (Lo.getOpcode() == W65816ISD::Wrapper) return Lo; if (auto *LoC = dyn_cast(Lo)) { // Recombine into a TargetConstant:i32 so the `(store v, (iPTR // timm))` STAabs pattern fires. Returning an i16 Constant // would create a malformed STORE node (Ptr type mismatch) and // returning a regular Constant:i32 would re-trigger // LowerI32Constant. return DCI.DAG.getTargetConstant(LoC->getZExtValue(), SDLoc(Ptr), MVT::i32); } return SDValue(); }; if (N->getOpcode() == ISD::STORE) { auto *St = cast(N); EVT MemVT = St->getMemoryVT(); SDValue Ptr = St->getBasePtr(); // Skip i32 stores — LowerStore's i32 path has its own Wide32-zero-hi // const-addr fast path that emits two i16 stores at separate // TargetConstant addrs. Unwrapping here would short-circuit that // and produce a malformed ADD(TargetConstant, Constant) when the // hi-half store needs Ptr+2. if (MemVT != MVT::i32) { if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); return DAG.getTruncStore(St->getChain(), DL, St->getValue(), I16Ptr, MemVT, St->getMemOperand()); } } // i8 const-addr → STA8long (timm pattern); i16 const-addr → // STAabs (timm pattern, DBR-relative). Wrap as TargetConstant so // LowerI32Constant doesn't re-enter and break the const-pattern // match. 
i32 stores split into 2 i16 stores via LowerStore so they // come back through this combine as MemVT==i16. if (MemVT != MVT::i8 && MemVT != MVT::i16) return SDValue(); if (auto *C = dyn_cast(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL, Ptr.getValueType()); return DAG.getTruncStore(St->getChain(), DL, St->getValue(), NewPtr, MemVT, St->getMemOperand()); } } if (N->getOpcode() == ISD::LOAD) { auto *Ld = cast(N); EVT MemVT = Ld->getMemoryVT(); EVT VT = Ld->getValueType(0); SDValue Ptr = Ld->getBasePtr(); // Wide32-of-Wrapper-with-zero-hi → i16 Wrapper (companion to the // STORE side just above). Lets `(load (Wrapper g))` → LDAabs fire. // Skip i32 loads — LowerLoad's i32 path does its own Ptr+2 ADD // arithmetic and would choke on a TargetConstant unwrap result. if (MemVT != MVT::i32) { if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); return DAG.getExtLoad(Ld->getExtensionType(), DL, VT, Ld->getChain(), I16Ptr, MemVT, Ld->getMemOperand()); } } // Only the i8 const-addr path has dedicated tablegen patterns // (LDA8long); skip i16 const-addr loads (no LDAabs imm pattern) // and i32 (would re-fire on the same node with different shape). 
if (MemVT != MVT::i8 || (VT != MVT::i8 && VT != MVT::i16)) return SDValue(); if (auto *C = dyn_cast(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL, Ptr.getValueType()); return DAG.getExtLoad(Ld->getExtensionType(), DL, VT, Ld->getChain(), NewPtr, MemVT, Ld->getMemOperand()); } } if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 && !isTypeLegal(N->getValueType(0))) { if (auto *C = dyn_cast(N->getOperand(1))) { uint64_t K = C->getZExtValue(); if (K >= 1 && K <= 2) { SelectionDAG &DAG = DCI.DAG; SDValue X = N->getOperand(0); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue R = X; for (uint64_t i = 0; i < K; ++i) R = DAG.getNode(ISD::ADD, DL, VT, R, R); return R; } } } return SDValue(); } // Map a W65816CC code to the matching Bxx opcode. static unsigned getBranchOpcodeForCC(unsigned CC) { switch (CC) { case W65816CC::COND_EQ: return W65816::BEQ; case W65816CC::COND_NE: return W65816::BNE; case W65816CC::COND_HS: return W65816::BCS; case W65816CC::COND_LO: return W65816::BCC; case W65816CC::COND_MI: return W65816::BMI; case W65816CC::COND_PL: return W65816::BPL; case W65816CC::COND_VS: return W65816::BVS; case W65816CC::COND_VC: return W65816::BVC; } llvm_unreachable("invalid W65816 condition code"); } // For multi-branch CCs, return the (branchA, branchB, BothMeanTrue) tuple. // branchA is tested first; if it takes, we go to TrueBB if BothMeanTrue // (i.e. both branches are "take if true"), otherwise to FalseBB. branchB // is tested next with the same semantic. 
// // GT : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB // LE : (BMI || BEQ) → BEQ TrueBB; BMI TrueBB; fall-through FalseBB // HI : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB // LS : (BCC || BEQ) → BEQ TrueBB; BCC TrueBB; fall-through FalseBB struct MultiBranch { unsigned First, Second; bool FirstToTrue, SecondToTrue; }; static MultiBranch getMultiBranch(unsigned CC) { switch (CC) { case W65816CC::COND_GT_MB: return {W65816::BEQ, W65816::BPL, false, true}; case W65816CC::COND_LE_MB: return {W65816::BEQ, W65816::BMI, true, true}; case W65816CC::COND_HI_MB: return {W65816::BEQ, W65816::BCS, false, true}; case W65816CC::COND_LS_MB: return {W65816::BEQ, W65816::BCC, true, true}; } llvm_unreachable("not a multi-branch CC"); } // Emit a two-Acc16 binary op as STAfi src2; OPfi dst, src1. Allocates // a fresh 2-byte stack slot per call. For CMP (HasOut=false) there's // no destination register, just the two src operands. Always spill // the SECOND operand so non-commutative ops (sub, cmp) compute // src1 OP src2 correctly via OPfi (which gives src1 OP load(spill)). static MachineBasicBlock * emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp, unsigned OpFI, bool HasOut) { MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/true); unsigned LhsIdx = HasOut ? 1 : 0; unsigned RhsIdx = HasOut ? 2 : 1; Register Src1 = MI.getOperand(LhsIdx).getReg(); Register Src2 = MI.getOperand(RhsIdx).getReg(); // Spill src2 (the rhs). Then OPfi computes src1 OP load(spill). 
BuildMI(*BB, MI.getIterator(), DL, TII.get(StoreOp)) .addReg(Src2) .addFrameIndex(FI) .addImm(0); if (HasOut) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI), Dst) .addReg(Src1) .addFrameIndex(FI) .addImm(0); } else { BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI)) .addReg(Src1) .addFrameIndex(FI) .addImm(0); } MI.eraseFromParent(); return BB; } MachineBasicBlock * W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { // The only opcode we currently emit with usesCustomInserter=1 is // SELECT_CC16. Expand it into a diamond CFG with a PHI. For // single-branch CCs: // // thisMBB: // ... CMP already emitted ... // Bxx sinkMBB ; branch to "true" path // ; fall through to copy0MBB // copy0MBB: // ; (no instructions; PHI picks fval here) // sinkMBB: // dst = PHI [tval, thisMBB], [fval, copy0MBB] // // For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a // single Bxx isn't enough), insert two branches. Both target either // sinkMBB or copy0MBB depending on the condition. switch (MI.getOpcode()) { default: llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter"); case W65816::ADD_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true); case W65816::SUB_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true); // Carry-chain variants for the hi half of an i32 split. STAfi doesn't // touch P, so the carry from the previous addc/adde survives the // spill and is consumed by ADCEfi/SBCEfi below. 
case W65816::ADDE_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true); case W65816::SUBE_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true); case W65816::AND_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true); case W65816::ORA_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true); case W65816::EOR_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true); case W65816::CMP_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false); case W65816::LDAptr32S: case W65816::STAptr32S: case W65816::STBptr32S: { // Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of // 1 Wide32 reg pair. Used by the W65816LowerWide32 pre-RA pass // to dodge pair-allocation pressure. Otherwise identical to // the LDAptr32 inserter below. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr32S; bool IsByteStore = MI.getOpcode() == W65816::STBptr32S; Register PtrLo = MI.getOperand(IsLoad ? 1 : 1).getReg(); Register PtrHi = MI.getOperand(IsLoad ? 2 : 2).getReg(); int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrLo).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrHi).addFrameIndex(FIHi).addImm(0); // STA_DP's tablegen def has no implicit A Use, so without an // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP // pairs the fast regalloc collapses two A-loads into one (the // first's value is overwritten before STA_DP can store it). Add // implicit Use of A on the STA_DP to encode the dependency. 
This // also helps post-RA passes track A liveness correctly. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0) .addReg(W65816::A, RegState::Implicit); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FIHi).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2) .addReg(W65816::A, RegState::Implicit); if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case W65816::LDAptr32: case W65816::STAptr32: case W65816::STBptr32: { // Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the // pointer is a Wide32 register pair: sub_lo carries the low 16 // bits of the address, sub_hi carries the bank byte in its low // half (high half is pad, ORCA convention). Stage at $E0..$E2, // then [dp],Y addresses the right bank without forcing 0. // // Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated // on i32 address type). 
MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr32; bool IsByteStore = MI.getOpcode() == W65816::STBptr32; Register Ptr = MI.getOperand(IsLoad ? 1 : 1).getReg(); // Extract the i16 sub-halves of the Wide32 ptr. At custom-inserter // time Ptr is still a virtual register, so `TRI.getSubReg` won't // work (it's physreg-only). Use COPY-with-subreg-index instead; // the regalloc + virtreg-rewriter resolves this to the right // physreg operand later. Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass); Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo) .addReg(Ptr, (RegState)0, llvm::sub_lo); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi) .addReg(Ptr, (RegState)0, llvm::sub_hi); // Spill each half to a fresh slot, reload via LDAfi. Same RA- // pinning rationale as the i16 LDAptr inserter. int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrLo).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrHi).addFrameIndex(FIHi).addImm(0); // Stage the 24-bit address at $E0..$E2: sub_lo at $E0..$E1, // bank byte (low half of sub_hi) at $E2. We write 16 bits at $E2 // — the high byte ($E3) gets sub_hi's pad byte (0 by ORCA) — but // only $E2 is consulted by [dp],Y so $E3 contamination is harmless // until something else uses $E3. 
// STA_DP's tablegen def has no implicit A Use, so without an // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP // pairs the fast regalloc collapses two A-loads into one (the // first's value is overwritten before STA_DP can store it). Add // implicit Use of A on the STA_DP to encode the dependency. This // also helps post-RA passes track A liveness correctly. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0) .addReg(W65816::A, RegState::Implicit); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FIHi).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2) .addReg(W65816::A, RegState::Implicit); if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case W65816::LDAptr32Off: case W65816::STAptr32Off: case W65816::STBptr32Off: { // ptr32 deref with constant offset. Compute (sub_lo + off) into A // with CLC; ADC, store at $E0..$E1; then propagate the carry into // the bank byte via ADC #0 on (sub_hi) and store at $E2. 
Carry // propagation is conservatively always emitted — bank wrapping is // rare but real (bank-spanning struct or negative offset). // // Dead unless ptr32 mode is active. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off; bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off; Register Ptr = MI.getOperand(1).getReg(); int64_t Off = MI.getOperand(2).getImm(); // See LDAptr32 inserter above: vreg sub-regs need COPY-with-subreg // (TRI.getSubReg is physreg-only at custom-inserter time). Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass); Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo) .addReg(Ptr, (RegState)0, llvm::sub_lo); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi) .addReg(Ptr, (RegState)0, llvm::sub_hi); int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrLo).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrHi).addFrameIndex(FIHi).addImm(0); // (sub_lo + off) -> $E0..$E1 BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC)); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::ADC_Imm16)).addImm(Off); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); // (sub_hi + 0 + carry) -> $E2..$E3. ADC #0 picks up the carry // from the previous ADC; if no carry, sub_hi is unchanged. 
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FIHi).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::ADC_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2); if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case W65816::LDAptrOff: case W65816::STAptrOff: case W65816::STBptrOff: { // Pointer access with a constant offset. Folds the offset into // the pointer (CLC; ADC #off in A) BEFORE staging at $E0..$E2, // then accesses via [$E0],Y with Y=0. We can't fold into Y // because [dp],Y on the W65816 adds Y to the full 24-bit pointer // — for a negative Y like 0xFFFE (= -2 signed), the addition // crosses into bank 1. Folding into the pointer keeps the add // at 16-bit (in A) so the bank byte stays 0. // // DBR-independent — see LDAptr/STAptr/STBptr. 
MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptrOff; bool IsByteStore = MI.getOpcode() == W65816::STBptrOff; Register Ptr = MI.getOperand(1).getReg(); int64_t Off = MI.getOperand(2).getImm(); // Spill the pointer vreg to a fresh 2-byte stack slot, then // reload via LDAfi. Forces RA to materialize the source — see // the LDAptr/STAptr/STBptr case below for the full rationale. int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(Ptr).addFrameIndex(FI).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FI).addImm(0); // Compute ptr + off in A. CLC + ADC for the add. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC)); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::ADC_Imm16)).addImm(Off); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); if (LoaderBankDeref) { // Bank byte from $BE (crt0-initialised) — Loader compat path. 
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DP)).addImm(0xBE); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2); } else { BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STZ_DP)).addImm(0xE2); } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case W65816::LDAptr: case W65816::LDAptrBank0: case W65816::STAptr: case W65816::STBptr: { // Pointer load/store via [dp],Y indirect-long (opcodes 0xB7 / 0x97): // STA $E0 ; pointer low/hi at $E0..$E1 // STZ $E2 ; bank byte at $E2 = 0 // LDY #0 // LDA [$E0], Y ; bank 0:ptr + 0 // STA [$E0], Y // Bank-explicit ZERO — DBR-independent. Both the runInMame stack // ($00:0FFF down) and BSS / heap globals (placed at $00:xxxx) live // in bank 0, so pointer-derefs always reach the right memory even // when the user has switched DBR for a bank-2 store via `pha;plb`. // // Trade-off: under GS/OS Loader the user's data lives in their bank // (not bank 0), so library functions that write directly to globals // via `sta abs` (DBR-relative, lands in user bank) and user code that // reads via pointer-deref (lands in bank 0 by this lowering) get // INCONSISTENT results — silent miscompile. gmtime hit this with // its __gmtimeBuf static. 
Workaround for affected library code: // launder the buffer pointer through inline asm (see gmtime in // runtime/src/timeExt.c) so clang doesn't IPSCCP-fold it; the writes // then go via [dp],Y too and match the user reads. // // Const-int pointers (`*(volatile uint16 *)0x5000 = v`) are NOT // lowered through this pseudo — TableGen patterns route them to // STAlong / STA8long / STAabs by type. See InstrInfo.td. // // We use $E0..$E2 in libcall-scratch DP — safe because the // pseudo expansion is a leaf (no calls between SEP and STA), // and any subsequent libcall reinitialises its own scratch. // // Why [dp],Y not abs-long-X (`STA $0,X`)? abs-long-X is shorter // (~3 bytes less) but uses X to hold the pointer. In high- // pressure functions like the recursive expression parser, X // is often live with another value, and forcing X to be free // for every pointer-deref triggered "ran out of registers". // [dp],Y uses A and Y only — leaves X for spill-bridge use. // // STBptr (truncating i8 store) wraps the actual STA in SEP/REP // so M=8 across the store and only one byte is written. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr || MI.getOpcode() == W65816::LDAptrBank0; bool IsByteStore = MI.getOpcode() == W65816::STBptr; // LDAptrBank0 hardcodes bank=0 (STZ $E2) regardless of LoaderBankDeref. // Used by va_arg under Loader where the deref is a stack pointer // (= bank 0 always on W65816) but $BE points to our code bank. bool ForceBank0 = MI.getOpcode() == W65816::LDAptrBank0; Register Ptr = MI.getOperand(1).getReg(); // Why we spill the pointer to a fresh stack slot first: // a direct `COPY $a = ptr_vreg ; STA $E0` lets RA elide the COPY // when ptr_vreg is already allocated to A. 
In a loop body where // multiple Acc16 PHIs (pointer + accumulator) compete for A, the // PHI elimination pass picks one to be in A at the bottom of the // block and silently drops the COPY needed to refresh A with the // OTHER value at the top of the next iteration — silent miscompile // (sumTable read its own accumulator as the pointer on iter 2+). // STAfi forces RA to materialize ptr_vreg's value so it gets stored // to the slot, then LDAfi reads it back as a real machine load. int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(Ptr).addFrameIndex(FI).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FI).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); if (LoaderBankDeref && !ForceBank0) { // Bank byte from $BE (crt0-initialised) — Loader compat path. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DP)).addImm(0xBE); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2); } else { BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STZ_DP)).addImm(0xE2); } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case 
W65816::SELECT_CC8: case W65816::SELECT_CC16: { const W65816Subtarget &STI = BB->getParent()->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); MachineBasicBlock *thisMBB = BB; MachineBasicBlock *copy0MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, copy0MBB); MF->insert(It, sinkMBB); // Move the rest of thisMBB after MI to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); unsigned CC = MI.getOperand(3).getImm(); // Helper: if `OpReg` is defined by a single-use, side-effect-free, // constant-source LDA in thisMBB, MOVE that LDA into `DstMBB` (at // its start). Returns true on success. 
auto tryHoistConstInit = [&](Register OpReg, MachineBasicBlock *DstMBB) -> bool { if (!OpReg.isVirtual()) return false; if (!MRI.hasOneNonDBGUse(OpReg)) return false; MachineInstr *Def = MRI.getUniqueVRegDef(OpReg); if (!Def || Def->getParent() != thisMBB) return false; if (Def->getOpcode() != W65816::LDAi16imm && Def->getOpcode() != W65816::LDAi8imm) return false; if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm()) return false; Def->removeFromParent(); DstMBB->insert(DstMBB->begin(), Def); return true; }; Register TValReg = MI.getOperand(1).getReg(); Register FValReg = MI.getOperand(2).getReg(); auto IsConstLda = [&](Register R) { if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R)) return false; MachineInstr *D = MRI.getUniqueVRegDef(R); return D && D->getParent() == thisMBB && (D->getOpcode() == W65816::LDAi16imm || D->getOpcode() == W65816::LDAi8imm) && D->getNumOperands() >= 2 && D->getOperand(1).isImm(); }; bool BothConst = (CC < W65816CC::COND_GT_MB) && IsConstLda(TValReg) && IsConstLda(FValReg); if (BothConst) { // 4-block diamond: thisMBB has only the test (CMP) and Bxx; the // tval and fval LDAs each live in their own destination block, // which is reached only via the branch — so neither LDA's flag // side-effect can corrupt the CMP→Bxx test window. This is the // proper fix for the "LDA between CMP and Bxx" bug catalogued in // project_known_issue_lda_flags.md (replacing the earlier 3-block // workaround that only hoisted fval). 
// // thisMBB: ...; CMP; Bxx tvalMBB // copy0MBB: LDA #fval; BRA sinkMBB (FALSE path) // tvalMBB: LDA #tval (TRUE path; falls to sink) // sinkMBB: PHI [tval from tvalMBB, fval from copy0MBB] MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(sinkMBB->getIterator(), tvalMBB); BB->addSuccessor(copy0MBB); BB->addSuccessor(tvalMBB); copy0MBB->addSuccessor(sinkMBB); tvalMBB->addSuccessor(sinkMBB); unsigned BrOp = getBranchOpcodeForCC(CC); BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB); BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB); tryHoistConstInit(TValReg, tvalMBB); tryHoistConstInit(FValReg, copy0MBB); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI), MI.getOperand(0).getReg()) .addReg(TValReg).addMBB(tvalMBB) .addReg(FValReg).addMBB(copy0MBB); } else { // 3-block diamond: keep the existing layout and (where possible) // hoist fval into copy0MBB. Used when one or both operands are // computed values (not constants), or when the multi-branch CC // requires two Bxx in thisMBB. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); if (CC < W65816CC::COND_GT_MB) { unsigned BrOp = getBranchOpcodeForCC(CC); BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB); } else { MultiBranch MB = getMultiBranch(CC); MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB; MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB; BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1); BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2); } copy0MBB->addSuccessor(sinkMBB); tryHoistConstInit(FValReg, copy0MBB); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI), MI.getOperand(0).getReg()) .addReg(TValReg).addMBB(thisMBB) .addReg(FValReg).addMBB(copy0MBB); } MI.eraseFromParent(); return sinkMBB; } } }