//===-- W65816ISelLowering.cpp - W65816 DAG Lowering Implementation -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Minimum DAG lowering sufficient for a no-argument function returning an // i16 constant. Argument passing and non-trivial calls still unimplemented. // //===----------------------------------------------------------------------===// #include "W65816ISelLowering.h" #include "W65816InstrInfo.h" #include "W65816MachineFunctionInfo.h" #include "W65816SelectionDAGInfo.h" #include "W65816Subtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Support/KnownBits.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; #define DEBUG_TYPE "w65816-lower" // Loader-compat workaround: when set, LDAptr/STAptr/STBptr inserters // load the bank byte from DP $BE (initialized by crt0 to PHK / current // PBR) instead of forcing it to 0 via STZ $E2. This makes pointer // derefs land in the user's bank — matching where DBR-relative // absolute stores go — so library functions like gmtime that store // into static buffers via DBR-relative paths are visible to caller- // side pointer-deref reads. Costs 2 extra bytes / 4 cycles per ptr- // deref (LDA dp + STA dp vs STZ dp). Default off to keep // size-sensitive builds (toolbox) under the $C000 IO-window ceiling. 
static cl::opt LoaderBankDeref( "w65816-loader-bank-deref", cl::desc("LDAptr/STAptr inserters read bank from DP $BE (set by " "crt0 to PHK) instead of STZ $E2. Required for GS/OS " "Loader compatibility; default off for size-sensitive " "builds."), cl::init(false), cl::Hidden); // Layer 2 ptr32 opt: when set, ptr32 derefs assume the pointer's bank // byte matches DBR. Uses `lda (d,s),Y` (opcode 0xB3, stack-relative // indirect indexed-Y) instead of staging at $E0/$E2 and using // `lda [dp],Y` (24-bit indirect-long). Saves ~4 instructions per // deref. Correct only for code that touches memory inside DBR's bank // — malloc'd Lua state + globals + BSS qualify; cross-bank pointers // (rare) do not. Caller's responsibility. Tested by hand on lapi.c. // // NOTE: not static -- W65816Layer2Gate.cpp reads this to stamp the // "w65816-layer2" function attribute on every function compiled with // Layer 2 on, so the LTO-time gate can detect mismatched TUs. Phase // 1.12 of GAP_CLOSURE_PLAN.md. cl::opt DbrSafePtrs( "w65816-dbr-safe-ptrs", cl::desc("ptr32 derefs use 16-bit stack-rel-indirect-Y, assuming " "the pointer's bank byte matches DBR. Significantly " "shrinks struct-field-heavy code (Lua's lapi.c: ~3.4× → " "much smaller) at the cost of safety for cross-bank " "pointers (which become a miscompile)."), cl::init(false), cl::Hidden); W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, const W65816Subtarget &STI) : TargetLowering(TM, STI) { // Register classes for the two scalar modes. The register allocator sees // A, X and Y as both 8-bit and 16-bit; a later REP/SEP pass is responsible // for ensuring the dynamic mode matches the selected class. 
addRegisterClass(MVT::i8, &W65816::Acc8RegClass); addRegisterClass(MVT::i16, &W65816::Acc16RegClass); addRegisterClass(MVT::i32, &W65816::Wide32RegClass); computeRegisterProperties(STI.getRegisterInfo()); setStackPointerRegisterToSaveRestore(W65816::SP); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // GlobalAddress and ExternalSymbol: lower to W65816ISD::Wrapper so a // tablegen pattern can fold them into instruction operands. setOperationAction(ISD::GlobalAddress, MVT::i16, Custom); setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom); // FrameIndex i32 has its own DAG-to-DAG path in W65816ISelDAGToDAG.cpp. // BR_CC is custom-lowered to a CMP + W65816ISD::BR_CC chain so we can // emit the right BEQ/BNE/BCS/BCC mnemonic per condition. setOperationAction(ISD::BR_CC, MVT::i16, Custom); setOperationAction(ISD::BR_CC, MVT::i8, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); // BRIND (computed-goto `goto *p`, indirectbr IR) has no direct // 65816 instruction — JMP (abs) / JMP [abs] read the target pointer // from MEMORY, not a register. Custom-lower to: store the pointer's // 16-bit low half (offset within the program's PBR-pinned code bank) // to $00B8 (the __indirTarget DP slot already reserved for indirect // calls — see libgcc.s), then emit a `JMP ($00B8)` via the BRIND // pseudo. Single-bank assumption on the target's code: same as // every other JMP/BRA in our codegen. // // The ptr is i32 under p:32:16 (current default) — extract sub_lo. // Under p:16 (legacy ptr16), it's already i16. setOperationAction(ISD::BRIND, MVT::Other, Custom); // SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC // pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter // expands into a Bxx + diamond CFG + PHI. 
SETCC funnels through the // same path with TVal=1 / FVal=0. SELECT (no condition operand) is // expanded to SELECT_CC by the legalizer using SETNE against zero. setOperationAction(ISD::SETCC, MVT::i16, Custom); setOperationAction(ISD::SETCC, MVT::i8, Custom); setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); setOperationAction(ISD::SELECT, MVT::i16, Expand); setOperationAction(ISD::SELECT, MVT::i8, Expand); // 65816 has no inline sign-extend instruction; synthesize i8 -> i16 // via a bit-7 test and SELECT_CC (see LowerSignExtend). setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom); // BSWAP: no native byte-swap instruction (XBA swaps the two halves // of the 16-bit accumulator only when in 8-bit M mode, hard to // exploit cleanly). Lower to shifts + ORs via the generic Expand // path — SDAG turns `bswap(i32)` into four byte extracts ORed back // together, which our existing patterns handle. Required for // portable C that constructs a big-endian word from byte loads: // `((u32)b[0] << 24) | ((u32)b[1] << 16) | ((u32)b[2] << 8) | b[3]` // (SHA-256 message-schedule, JPEG/PNG headers, etc.). setOperationAction(ISD::BSWAP, MVT::i16, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i64, Expand); // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare // LDA for the anyext case). No native sextload; mark it Expand so // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`, // which then flows through LowerSignExtend's branchless 3-insn // sequence (AND #$00FF; EOR #$0080; SEC; SBC #$0080). for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); // GlobalOpt sometimes narrows a `short` global to `i1` when it sees // every assignment is 0 or 1. Custom-lower so LowerLoad rewrites // `zext/sext/anyext from i1` into a plain byte load + appropriate // mask. 
Both i16 and i8 result widths can appear, depending on // whether the consumer wants the value as `short` or `bool`. for (MVT ResVT : {MVT::i8, MVT::i16}) { setLoadExtAction(ISD::ZEXTLOAD, ResVT, MVT::i1, Custom); setLoadExtAction(ISD::SEXTLOAD, ResVT, MVT::i1, Custom); setLoadExtAction(ISD::EXTLOAD, ResVT, MVT::i1, Custom); } // Only register i32 ext-load / trunc-store and Custom actions when // i32 is actually a legal type (ptr32 mode active). Otherwise the // Custom-action calls intercept i16/i8 ops, and LowerTruncate's // SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same // root cause as the earlier LOAD-Custom-breaks-LDAptr issue). bool ptr32Active = isTypeLegal(MVT::i32); if (ptr32Active) { for (MVT MemVT : {MVT::i8, MVT::i16}) { setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand); setTruncStoreAction(MVT::i32, MemVT, Expand); } // Truncating byte stores (`s->c = (char)v`) land as TRUNCSTORE // i16->i8 in SDAG after combiner canonicalization. Custom-route // through LowerStore so the ptr-offset peel fires for them too. setTruncStoreAction(MVT::i16, MVT::i8, Custom); } // Vararg support: VASTART writes the address of the first vararg slot // to the va_list pointer. VAARG/VACOPY/VAEND use the default // expansions that load through that pointer and bump it. This makes // -style functions (e.g. printf-likes) compile cleanly. setOperationAction(ISD::VASTART, MVT::Other, Custom); // Custom VAARG so we DON'T align the va_list pointer. The default // expansion rounds up to the type's preferred alignment (S16 = 2), // but caller-pushed args land at PHA's resulting odd S+1 address. // Aligning would skip the low byte and read garbage. 
setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); // C++ exceptions (SJLJ model) — clang lowers exception machinery into // these intrinsics via SjLjEHPrepare. We don't have native handling // for any of them on this target; mark Expand so LegalizeDAG falls // back to its no-op stubs (setjmp returns 0, longjmp is a no-op, // setup_dispatch is a chain pass-through). The actual EH semantics // are provided at runtime by libcxxabi (__cxa_throw etc.) calling // _Unwind_SjLj_RaiseException, which in turn longjmps via the // function context the prologue prepared. See // runtime/src/libcxxabiSjlj.c for the runtime side. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Expand); setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i16, Expand); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Expand); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); // SJLJ exception lowering uses FRAMEADDR(0) to read the current frame // pointer. We don't reserve a frame pointer in general; return the // entry-SP-equivalent value (current SP read via TSC) — good enough // for SJLJ's purpose of identifying the call frame. setOperationAction(ISD::FRAMEADDR, MVT::i16, Custom); setOperationAction(ISD::FRAMEADDR, MVT::i32, Custom); // stacksave / stackrestore — used by SjLjEHPrepare to save/restore SP // around invoke calls. The jmp_buf already captures SP via TSC in // our setjmp implementation, so these are redundant here. Lower // stacksave to a constant 0 (the value is stored into the function // context but never used for restoration on our target) and // stackrestore to a chain pass-through (no-op). // SJLJ EH uses STACKSAVE/STACKRESTORE. Default Expand calls // CopyFromReg/$SP which fails because SP has no register class. 
// Custom-lower to a Constant 0 (stacksave) and chain-passthrough // (stackrestore) — our SJLJ runtime doesn't actually use these // values; setjmp/longjmp manage SP directly via TSC/TCS. setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom); // FRAMEADDR is set Custom above for SJLJ; don't set it Expand here // (the second setOperationAction would override the first). setOperationAction(ISD::RETURNADDR, MVT::i16, Expand); // W65816 pointers are i32; legalizer queries the action for the pointer // type, so register Expand for i32 too. Without this, // __builtin_return_address(0) ICEs in LowerOperation (no Custom handler // for RETURNADDR). setOperationAction(ISD::RETURNADDR, MVT::i32, Expand); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i16, Expand); setOperationAction(ISD::EH_DWARF_CFA, MVT::i16, Expand); // ISD::TRAP — __builtin_trap(), -fsanitize-trap=undefined. Default // expansion is a libcall to abort(); UBSan-min wants a BRK with a // pickup sentinel instead so the trap site is identifiable from a // memory dump without a working stdio path. Custom-lower to a // W65816ISD::TRAP target node; the InstrInfo.td pattern routes it // to BRK_pseudo, whose AsmPrinter expansion writes 0xBE to $70 and // then issues BRK + a self-loop (headless MAME mis-vectors BRK, so // the spin is what actually halts). setOperationAction(ISD::TRAP, MVT::Other, Custom); // DEBUGTRAP follows the same shape — same node, same expansion. setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); // The 65816 has no hardware multiplier or divider. Multiply by a // power-of-two constant is auto-rewritten to shifts by the DAG // combiner; arbitrary multiply / divide / mod go through libcalls // (`__mulhi3` for i16 multiply etc.). The libcall expander emits a // standard CALL node which flows through LowerCall, so multi-arg // call lowering must be working first (it is, see task #26). 
setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::MUL, MVT::i16, LibCall); // i8 multiply / mulh / div / rem: SDAG narrows e.g. `x / 10` to // `mulhu i8 x, -51` + shift when it proves operands fit in i8. // The 65816 has no native 8-bit multiplier; route everything // through the 16-bit libcalls by Promoting i8 ops to i16. setOperationAction(ISD::MUL, MVT::i8, Promote); setOperationAction(ISD::MULHU, MVT::i8, Promote); setOperationAction(ISD::MULHS, MVT::i8, Promote); setOperationAction(ISD::SDIV, MVT::i8, Promote); setOperationAction(ISD::UDIV, MVT::i8, Promote); setOperationAction(ISD::SREM, MVT::i8, Promote); setOperationAction(ISD::UREM, MVT::i8, Promote); setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); // CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the // type legalizer rewrite into a sequence of basic ops. Without // this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1) // or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot // Select" at isel. for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } setOperationAction(ISD::SDIV, MVT::i16, LibCall); setOperationAction(ISD::UDIV, MVT::i16, LibCall); setOperationAction(ISD::SREM, MVT::i16, LibCall); setOperationAction(ISD::UREM, MVT::i16, LibCall); setOperationAction(ISD::SDIVREM, MVT::i16, Expand); setOperationAction(ISD::UDIVREM, MVT::i16, Expand); // Variable-amount and large-constant shifts. 
We have inline // patterns for shift-by-1..4; everything else goes through // __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom // lets us return SDValue() for the fast cases and route everything // else through the libcall lowering helper. setOperationAction(ISD::SHL, MVT::i16, Custom); setOperationAction(ISD::SRL, MVT::i16, Custom); setOperationAction(ISD::SRA, MVT::i16, Custom); // i8 shifts go through Custom too — LowerShift detects the i8 result // and routes through trunc(i16-shift(zext_or_sext(lhs), amount)). // Avoids needing a parallel set of qi3 libcalls. setOperationAction(ISD::SHL, MVT::i8, Custom); setOperationAction(ISD::SRL, MVT::i8, Custom); setOperationAction(ISD::SRA, MVT::i8, Custom); // LOAD / STORE Custom-lowering for ptr32 mode is intentionally NOT // wired here in ptr16 mode. Setting LOAD Custom and returning // SDValue() from LowerLoad short-circuits the i16-result LDAptr/ // STAptr selection paths (the Custom→empty→Legal fall-through doesn't // re-enter pattern matching). When ptr32 is activated, this hook // needs a different gating mechanism — likely an isel-time // replacement triggered by addrspacecast or a target DAG combine. // See LowerLoad / LowerStore — currently dead code. // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying // the carry/borrow flag between the two halves of a multi-precision add or // sub. Setting them Legal triggers the type legalizer's carry-chain split // for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions) // instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions). // The matching tablegen pseudos add Defs/Uses on the P register, which // tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically. setOperationAction(ISD::ADDC, MVT::i16, Legal); setOperationAction(ISD::ADDE, MVT::i16, Legal); setOperationAction(ISD::SUBC, MVT::i16, Legal); setOperationAction(ISD::SUBE, MVT::i16, Legal); // i32 (long). 
Type legalization splits i32 into two i16 halves; with // ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain. // AND/OR/XOR split cleanly into per-half ops with no carry to track. // Multiply/divide/shift go through libcall stubs whose // implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS // / SRA_PARTS are the SDNodes the type legalizer emits when splitting // a variable-amount shift; without an action they get "Cannot select". // LibCall on the parent node routes the whole shift through one // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and // simpler than implementing a 32-bit shift in 65816 assembly inline. for (MVT VT : {MVT::i32}) { // MUL i32 is Custom-lowered: the typical fall-through libcall is // __mulsi3 (32x32 -> 32), but when both operands are ZEXT from i16 // we can emit __umulhisi3 (16x16 -> 32) instead. Saves ~60 cyc per // call on the `(unsigned long)i * i` pattern — see LowerMUL_I32. setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SDIV, VT, LibCall); setOperationAction(ISD::UDIV, VT, LibCall); setOperationAction(ISD::SREM, VT, LibCall); setOperationAction(ISD::UREM, VT, LibCall); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); // i32 shifts route through a libcall via the // preferredShiftLegalizationStrategy override (see header). No // explicit SHL/SHL_PARTS action needed — the override forces the // type-legalizer's libcall path before SHL_PARTS would be emitted. } // i64 shifts — route to libcall before the type legalizer tries // to split via the next-legal-type (which becomes i32 in ptr32 mode // and triggers a SDAG combine loop on `i64 >> K` patterns). 
By // marking SHL/SRL/SRA i64 LibCall here, the operation legalizer // picks up the libcall path even though i64 itself is illegal. for (MVT VT : {MVT::i64}) { setOperationAction(ISD::SHL, VT, LibCall); setOperationAction(ISD::SRL, VT, LibCall); setOperationAction(ISD::SRA, VT, LibCall); } if (ptr32Active) { for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR}) setOperationAction(Op, MVT::i32, Custom); setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom); // SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16: // the combiner emits this for `(int32_t)((int8_t)x)` and for // `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No // tablegen pattern covers the i32 form; Custom-lower to per-half // ops. IMPORTANT: LegalizeDAG looks up the action for // SIGN_EXTEND_INREG using the INNER VT (the operand value type), // not the result VT. See LegalizeDAG.cpp: // Action = TLI.getOperationAction(Op, InnerType); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::i8, Custom); setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); // Also Custom for i16/i8 LOAD/STORE in ptr32 mode so LowerLoad/ // LowerStore can fold Wide32(Wrapper, WrapperBank) of the same // global (or a raw GlobalAddress) to a plain abs-16 access // (DBR-relative). Without this, every `g` access for a // same-segment global goes through the 14-byte [dp],y // indirect-long path even though the bank is implicit in DBR. 
setOperationAction(ISD::STORE, MVT::i16, Custom); setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::LOAD, MVT::i16, Custom); setOperationAction(ISD::LOAD, MVT::i8, Custom); // ZEXTLOAD i16-from-i8 also Custom — the DAG combiner folds // (zext (load i8 @g)) into one zextload SDNode, so we need to // apply the same global-address fold there. SEXTLOAD/EXTLOAD // already have Expand actions from earlier setLoadExtAction // calls; leave those alone (Custom would require parallel // tablegen patterns we don't have). setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, MVT::i8, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::Constant, MVT::i32, Custom); } // Disable jump tables. Generating them costs us BRIND (indirect // branch via 16-bit pointer load), which we don't have. A long // if-else chain compiles fine without them. Setting the threshold // to UINT_MAX makes LLVM never form a jump table. setMinimumJumpTableEntries(UINT_MAX); // Variable-length arrays / dynamic stack allocation. Lowered to // `tsc; sec; sbc size; tcs; inc a` — A returns the address of the // allocated region. Limitation: this shifts SP, so any FrameIndex // accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset // (we have no frame pointer). Suitable for the common pattern // "alloca; initialise; pass; return"; complex VLA use mixed with // local-variable access across the alloca will miscompile. A real // FP (DP slot or X-as-FP) would lift this restriction. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom); if (ptr32Active) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); // Opt into PerformDAGCombine on LOAD nodes — needed for the // address-select reverse combine (see W65816TargetLowering:: // PerformDAGCombine). 
// setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang // SHL combine disabled while debugging the ptr32 i64-phi hang. // setTargetDAGCombine(ISD::SHL); // Combine STORE / LOAD with const-int i32 pointer to a form that // survives LowerI32Constant (which would otherwise split the ptr // into a Wide32 reg pair and lose the const-addr fast path). // See PerformDAGCombine. setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::LOAD); } // Map an LLVM SETCC condition to a W65816 branch. Returns the condition // code along with possibly-swapped LHS/RHS; some signed comparisons are // rewritten to use unsigned ones with a tweaked operand because the // 65816 has no native signed branch other than BMI/BPL on a value, not // on a comparison result. // Map an LLVM SETCC condition to a 65816 branch. Unsigned codes use // BCS/BCC after CMP. Signed SETLT/SETGE map to BMI/BPL — correct only // when the comparison cannot overflow. For values produced by typical // C arithmetic on i16 this is usually fine; values near INT16_MIN/MAX // could give wrong results until we emit the BVS handling sequence. // SETGT / SETLE are rewritten to SETLT / SETGE with constant + 1 in // LowerBR_CC, mirroring the SETULE / SETUGT path. static W65816CC::CondCode mapCC(ISD::CondCode CC) { switch (CC) { case ISD::SETEQ: return W65816CC::COND_EQ; case ISD::SETNE: return W65816CC::COND_NE; case ISD::SETUGE: return W65816CC::COND_HS; case ISD::SETULT: return W65816CC::COND_LO; case ISD::SETLT: return W65816CC::COND_MI; case ISD::SETGE: return W65816CC::COND_PL; default: return W65816CC::COND_INVALID; } } // If both compare operands are i8, widen them to i16 so the existing // i16 CMP path can handle them. Use ZEXT for unsigned/eq/ne CCs and // SEXT for signed CCs — picking the wrong extension would invert the // answer (e.g. -1i8 sext to 0xFFFF compares < 1 signed; zext to 0x00FF // compares > 1 unsigned, which would flip a signed less-than). 
static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, SelectionDAG &DAG, const SDLoc &DL) { if (LHS.getValueType() != MVT::i8) return; unsigned Ext; switch (CC) { case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE: Ext = ISD::SIGN_EXTEND; break; default: Ext = ISD::ZERO_EXTEND; break; // unsigned + eq/ne } LHS = DAG.getNode(Ext, DL, MVT::i16, LHS); RHS = DAG.getNode(Ext, DL, MVT::i16, RHS); } // Normalize a (LHS, RHS, CC) triple so the result is something we can // emit with one CMP + Bxx. Returns the W65816 condition code; updates // LHS/RHS/CC in place. Returns COND_INVALID on failure. static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC, SelectionDAG &DAG, const SDLoc &DL) { promoteI8Cmp(LHS, RHS, CC, DAG, DL); // CMP wants the comparand (constant or memory) on the right. If a DAG // pre-pass put the constant on the left, swap and flip the condition. if (isa(LHS) && !isa(RHS)) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } // Signed compare via "EOR with sign bit then unsigned compare": // a < b (signed) iff (a ^ 0x8000) < (b ^ 0x8000) (unsigned) // The XOR flips the sign bit, which converts signed-int ordering to // unsigned-int ordering on the same bits. This avoids the WDC's // missing "BLT signed" — BMI/BPL alone read the sign of (a-b) // without the V-flag overflow correction, giving wrong results // when the subtraction overflows (e.g., INT16_MIN < 1 produced // false because (-32768 - 1) = +32767 has N=0). After the EOR // transform we use BCC/BCS which depend on the carry from CMP and // don't suffer overflow corruption. // // Cost: 1 EOR per operand (3 bytes each in M=16) — comparable to // the V-aware multi-branch sequence (5+ bytes of branches), but // happens at SDAG time so subsequent SDAG combining can fold // EORs against constants or already-EOR'd values. 
bool SignedCmp = (CC == ISD::SETLT || CC == ISD::SETLE || CC == ISD::SETGT || CC == ISD::SETGE); if (SignedCmp && LHS.getValueType() == MVT::i16) { EVT VT = LHS.getValueType(); SDValue Mask = DAG.getConstant(0x8000, DL, VT); LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, Mask); RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, Mask); switch (CC) { case ISD::SETLT: CC = ISD::SETULT; break; case ISD::SETLE: CC = ISD::SETULE; break; case ISD::SETGT: CC = ISD::SETUGT; break; case ISD::SETGE: CC = ISD::SETUGE; break; default: break; } } // Rewrite SETULE / SETUGT to SETULT / SETUGE with constant +/- 1. // (SETLE / SETGT have already been converted to their unsigned // counterparts above for i16; this handles original SETULE/SETUGT // and the post-transform SETULE/SETUGT.) Keeps the variable on the // LHS and lets us use BCS / BCC natively. if (auto *RhsConst = dyn_cast(RHS)) { int64_t V = RhsConst->getSExtValue(); uint64_t UV = (uint64_t)V & 0xFFFF; if (CC == ISD::SETULE && UV < 0xffff) { RHS = DAG.getConstant(UV + 1, DL, RHS.getValueType()); CC = ISD::SETULT; } else if (CC == ISD::SETUGT && UV < 0xffff) { RHS = DAG.getConstant(UV + 1, DL, RHS.getValueType()); CC = ISD::SETUGE; } else if (CC == ISD::SETLE && V < 0x7fff) { // Reachable only when SignedCmp transform was skipped (i8 case // before promoteI8Cmp could get it, or non-i16 in the future). RHS = DAG.getConstant(V + 1, DL, RHS.getValueType()); CC = ISD::SETLT; } else if (CC == ISD::SETGT && V < 0x7fff) { RHS = DAG.getConstant(V + 1, DL, RHS.getValueType()); CC = ISD::SETGE; } } W65816CC::CondCode TCC = mapCC(CC); if (TCC == W65816CC::COND_INVALID) { // Try swapping operands first — preferable since it leaves us with // a single-Bxx form. But reject the swap if it would put a load on // the LHS (we can't pattern-match cmp(load,reg) without spilling A). 
bool RhsIsLoad = isa(RHS.getNode()); bool LhsIsLoad = isa(LHS.getNode()); bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad; if (!SwapWouldHurt) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); TCC = mapCC(CC); } } // Final fallback: GT/LE/UGT/ULE without a useful swap target. Use a // multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it // into a 3-BB diamond. Only valid for SELECT_CC, not for BR_CC — // LowerBR_CC re-routes those through SETCC + BR_CC NE. if (TCC == W65816CC::COND_INVALID) { switch (CC) { case ISD::SETGT: TCC = W65816CC::COND_GT_MB; break; case ISD::SETLE: TCC = W65816CC::COND_LE_MB; break; case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break; case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break; default: break; } } return TCC; } // Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/ // I32Bin/BR_CC to construct or destructure i32 SDValues across the // sub_lo / sub_hi halves of the Wide32 register class. static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL, SDValue Lo, SDValue Hi) { SDValue RC = DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32); SDValue SubLo = DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32); SDValue SubHi = DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32); SDNode *RS = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32, {RC, Lo, SubLo, Hi, SubHi}); return SDValue(RS, 0); } // Look through a buildWide32(Lo, Hi) -> REG_SEQUENCE(RC, Lo, sub_lo, // Hi, sub_hi) pair: if X is exactly that machine node, return the // matching half operand directly. Avoids a TargetExtractSubreg that // would re-enter the SDAG combiner and re-build the i32 constant / // pair, looping forever (observed as OOM in the combiner on `*t = 0`). static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) { if (!X.getNode() || !X.isMachineOpcode()) return SDValue(); if (X.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue(); // Layout: op0 = RC, then (Reg, SubIdx) pairs. 
for (unsigned i = 1; i + 1 < X.getNumOperands(); i += 2) { SDValue SubIdx = X.getOperand(i + 1); auto *CIdx = dyn_cast(SubIdx); if (!CIdx) continue; if (CIdx->getZExtValue() == WantSub) return X.getOperand(i); } return SDValue(); } static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) { // For constants, materialise the lo half as an i16 constant directly // — getTargetExtractSubreg on a Constant SDNode produces a malformed // MachineSDNode (constants don't carry sub-regs) and triggers // SDAG combine loops downstream. if (auto *C = dyn_cast(X)) { return DAG.getConstant(C->getZExtValue() & 0xFFFFu, DL, MVT::i16); } // For un-lowered GlobalAddress / ExternalSymbol nodes (which reach // here when the store-lowering runs before LowerOperation has split // the constant into a Wide32 pair), emit a fresh Wrapper / WrapperBank // pair directly. getTargetExtractSubreg on a GlobalAddress node // produces a malformed result (no sub-reg info on a non-register). if (auto *GA = dyn_cast(X)) { SDValue T = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16, GA->getOffset()); return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, T); } if (auto *ES = dyn_cast(X)) { SDValue T = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16); return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, T); } if (SDValue Half = lookThroughRegSeq(X, llvm::sub_lo)) return Half; return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X); } static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) { if (auto *C = dyn_cast(X)) { return DAG.getConstant((C->getZExtValue() >> 16) & 0xFFFFu, DL, MVT::i16); } if (auto *GA = dyn_cast(X)) { SDValue T = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16, GA->getOffset()); return DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, T); } if (auto *ES = dyn_cast(X)) { SDValue T = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16); return DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, T); } if (SDValue 
Half = lookThroughRegSeq(X, llvm::sub_hi)) return Half; return DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X); } // Match `Ptr = REG_SEQUENCE(ADDC(BaseLo, KLo), sub_lo, // ADDE(BaseHi, 0, carry), sub_hi)` shape // produced by LowerI32Bin for `(add Wide32, const)` where the constant // fits an unsigned 16-bit Y (KHi must be 0). Returns true with OutBase // = buildWide32(BaseLo, BaseHi) and OutOff = KLo on a successful peel. // The bank-byte carry-in is intentionally dropped: the `[dp],Y` deref // adds Y to the 24-bit pointer without propagating beyond 16 bits. // Caller's responsibility that the target object doesn't span a bank. static bool peelPtr32Offset(SelectionDAG &DAG, SDLoc DL, SDValue Ptr, SDValue &OutBase, uint16_t &OutOff) { if (Ptr.getValueType() != MVT::i32) return false; // Pre-LowerI32Bin shape: `ISD::ADD(BaseWide32, i32 const)`. LowerLoad // runs before LowerI32Bin in legalization order, so the ADD is still // visible as an ISD::ADD when LowerLoad inspects Ptr. if (Ptr.getOpcode() == ISD::ADD) { SDValue L = Ptr.getOperand(0); SDValue R = Ptr.getOperand(1); auto *KC = dyn_cast(R); if (!KC) { KC = dyn_cast(L); if (!KC) return false; L = R; } uint64_t K = KC->getZExtValue(); if (K == 0 || K > 0xFFFFu) return false; OutOff = (uint16_t)K; OutBase = L; return true; } // Post-LowerI32Bin shape (REG_SEQUENCE of ADDC/ADDE). May not occur // in practice given the ADD path above, but kept for robustness. 
if (!Ptr.getNode() || !Ptr.isMachineOpcode()) return false; if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return false; SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo); SDValue Hi = lookThroughRegSeq(Ptr, llvm::sub_hi); if (!Lo || !Hi) return false; if (Lo.getOpcode() != ISD::ADDC) return false; if (Hi.getOpcode() != ISD::ADDE) return false; if (Hi.getOperand(2) != Lo.getValue(1)) return false; auto *KLo = dyn_cast(Lo.getOperand(1)); auto *KHi = dyn_cast(Hi.getOperand(1)); if (!KLo || !KHi) return false; if (KHi->getZExtValue() != 0) return false; uint64_t K = KLo->getZExtValue() & 0xFFFFu; if (K == 0) return false; OutOff = (uint16_t)K; OutBase = buildWide32(DAG, DL, Lo.getOperand(0), Hi.getOperand(0)); return true; } SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc DL(Op); EVT VT = LHS.getValueType(); // i32 BR_CC: synthesize an i16 boolean from per-half compares, then // branch on (bool != 0). Avoids the legalizer's generic Expand that // re-enters our SETCC/BR_CC custom paths in an infinite loop. if (VT == MVT::i32) { SDValue LL = extractWide32Lo(DAG, DL, LHS); SDValue LH = extractWide32Hi(DAG, DL, LHS); SDValue RL = extractWide32Lo(DAG, DL, RHS); SDValue RH = extractWide32Hi(DAG, DL, RHS); // Fast path: i32 == 0 / != 0 → (LL | LH) cmp 0. Drops two i16 // setcc materializations + an AND + (for NE) an XOR; the BR_CC // can branch directly on the OR-test. Hot in `while (x)` and // any i32-counter loop test. 
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS) && cast(RHS)->isZero()) { SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i16, LL, LH); SDValue Z16 = DAG.getConstant(0, DL, MVT::i16); return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain, DAG.getCondCode(CC), Or, Z16, Dest); } SDValue Bool; if (CC == ISD::SETEQ || CC == ISD::SETNE) { SDValue EqLo = DAG.getSetCC(DL, MVT::i16, LL, RL, ISD::SETEQ); SDValue EqHi = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ); Bool = DAG.getNode(ISD::AND, DL, MVT::i16, EqLo, EqHi); if (CC == ISD::SETNE) Bool = DAG.getNode(ISD::XOR, DL, MVT::i16, Bool, DAG.getConstant(1, DL, MVT::i16)); } else { // (a CC b) where CC is ordered: // = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b) // HiStrict is the strict variant of CC (LE -> LT etc.) so the // tie-breaker (hi==hi && lo CC lo) handles the equality case // properly. LoCC is always the unsigned variant of CC because // the low half is unsigned (the high half carries the sign). ISD::CondCode HiCC, LoCCu; switch (CC) { case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break; case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break; case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break; case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break; case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break; case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break; case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break; case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break; default: report_fatal_error("W65816: unexpected i32 BR_CC condition"); } SDValue HiOk = DAG.getSetCC(DL, MVT::i16, LH, RH, HiCC); SDValue HiEq = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ); SDValue LoOk = DAG.getSetCC(DL, MVT::i16, LL, RL, LoCCu); SDValue Tie = DAG.getNode(ISD::AND, DL, MVT::i16, HiEq, LoOk); Bool = DAG.getNode(ISD::OR, DL, MVT::i16, HiOk, Tie); } SDValue Zero = DAG.getConstant(0, DL, MVT::i16); return DAG.getNode(ISD::BR_CC, DL, 
MVT::Other, Chain, DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest); } W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL); if (TCC == W65816CC::COND_INVALID) report_fatal_error("W65816: branch condition not yet implemented"); // Multi-branch CCs only have inserter support via SELECT_CC16. For // BR_CC, reroute through SETCC: materialise the boolean to A, then // branch on NE-vs-zero. One extra LDA but always works. if (TCC >= W65816CC::COND_GT_MB) { SDValue Bool = DAG.getNode(ISD::SETCC, DL, VT, LHS, RHS, DAG.getCondCode(CC)); SDValue Zero = DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain, DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest); } SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS); SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8); return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CCOp, Glue); } // LowerBRIND — `brind (chain, target_ptr)`. Computed-goto / IR // `indirectbr` lowers to BRIND with a pointer-typed target. Under // p:32:16 (default datalayout) that pointer is i32, so the generic // legalizer's "Cannot select brind" path fires unless we step in. // // Lowering strategy (mirrors __jsl_indir's mechanism): // 1. If target is i32 (Wide32), extract sub_lo — only the 16-bit // offset within PBR matters because JMP (abs) keeps current PBR. // 2. Store that i16 to constant address $00B8 — the shared // __indirTarget DP slot. Pinned at $00B8 so JMP (abs)'s bank-0 // vector fetch reads it regardless of DBR / segment placement // (see libgcc.s for the full rationale). // 3. Emit W65816ISD::BRIND with the chained store — the BRINDpseudo // tablegen pattern selects to JMP_AbsInd $00B8. SDValue W65816TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Target = Op.getOperand(1); SDLoc DL(Op); // Reduce the target to i16 — the low half of the (i32) pointer // holds the in-bank offset that JMP indirect dispatches through. 
SDValue Off16; if (Target.getValueType() == MVT::i32) { Off16 = extractWide32Lo(DAG, DL, Target); } else if (Target.getValueType() == MVT::i16) { Off16 = Target; } else { // Defensive: shouldn't happen with our current type-legalization, // but if it does, defer to the legalizer. return SDValue(); } // Store the 16-bit target to $00B8. The (store Acc16, (iPTR timm)) // tablegen pattern lowers this to STAabs ($00B8) — the AsmPrinter // routes bank-0 const-int stores to STA_Abs (3 bytes, DBR-relative). // Since DP=0 at runtime, `sta $00B8` lands at $00:00B8 == DP slot // $B8, which is exactly where __jsl_indir reads via `jmp ($00B8)`. // // CRITICAL: use TargetConstant (not Constant) so the i32 Constant is // NOT Custom-lowered through LowerI32Constant — which would split // 0x00B8 into a REG_SEQUENCE(0xB8, 0). LowerStore then can't see // a clean ConstantSDNode at Ptr, mis-routes the i16 store to the // generic ST_PTR slow path ([E0],Y indirect-long with full Wide32 // address staging), and creates significant Wide32 register pressure // — multi-cgoto VM interpreters with several BRINDs in one function // then over-pressure the regalloc and abort with "ran out of // registers". With TargetConstant the tablegen pattern at // InstrInfo.td:433 fires directly: `sta $b8` — one instruction, no // Wide32 vreg, no DPF0/DPF1 staging. EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Addr = DAG.getTargetConstant(0x00B8, DL, PtrVT); SDValue Store = DAG.getStore(Chain, DL, Off16, Addr, MachinePointerInfo()); // Emit the indirect JMP. W65816ISD::BR_IND has chain-only semantics // (no operand beyond chain) — the target is implicit ($00B8). The // store above sequences before the JMP via the chain dependency. return DAG.getNode(W65816ISD::BR_IND, DL, MVT::Other, Store); } SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // setcc lhs, rhs, cc -> select_cc lhs, rhs, 1, 0, cc. 
// The SELECT_CC then re-enters LowerOperation and we lower it via the // diamond-CFG path. setBooleanContents(ZeroOrOne) means callers see // the result as a clean 0/1 value. SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc DL(Op); EVT VT = Op.getValueType(); // i32 SETCC: split into per-half compares. Result type is i16 (the // legalizer keeps the boolean result type narrow regardless of LHS // width). if (LHS.getValueType() == MVT::i32) { SDValue LL = extractWide32Lo(DAG, DL, LHS); SDValue LH = extractWide32Hi(DAG, DL, LHS); SDValue RL = extractWide32Lo(DAG, DL, RHS); SDValue RH = extractWide32Hi(DAG, DL, RHS); // Fast path: i32 == 0 / != 0 → (LL | LH) cmp 0. One i16 OR + one // i16 setcc instead of two setcc + AND (+ XOR for NE). if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS) && cast(RHS)->isZero()) { SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i16, LL, LH); SDValue Z16 = DAG.getConstant(0, DL, MVT::i16); return DAG.getSetCC(DL, VT, Or, Z16, CC); } if (CC == ISD::SETEQ || CC == ISD::SETNE) { SDValue EqLo = DAG.getSetCC(DL, VT, LL, RL, ISD::SETEQ); SDValue EqHi = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ); SDValue Eq = DAG.getNode(ISD::AND, DL, VT, EqLo, EqHi); if (CC == ISD::SETNE) Eq = DAG.getNode(ISD::XOR, DL, VT, Eq, DAG.getConstant(1, DL, VT)); return Eq; } ISD::CondCode HiCC, LoCCu; switch (CC) { case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break; case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break; case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break; case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break; case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break; case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break; case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break; case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break; default: report_fatal_error("W65816: unexpected i32 SETCC condition"); } 
SDValue HiOk = DAG.getSetCC(DL, VT, LH, RH, HiCC); SDValue HiEq = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ); SDValue LoOk = DAG.getSetCC(DL, VT, LL, RL, LoCCu); SDValue Tie = DAG.getNode(ISD::AND, DL, VT, HiEq, LoOk); return DAG.getNode(ISD::OR, DL, VT, HiOk, Tie); } SDValue One = DAG.getConstant(1, DL, VT); SDValue Zero = DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, One, Zero, DAG.getCondCode(CC)); } SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDLoc DL(Op); // i32 SELECT_CC: synthesize an i16 boolean from the i32 compare via // LowerSETCC's i32 path, then select between the i32 halves driven // by the boolean. Avoids creating the i32 W65816::CMP we have no // pattern for. if (LHS.getValueType() == MVT::i32) { // Materialise the i16 boolean. SDValue Bool = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC); SDValue Zero = DAG.getConstant(0, DL, MVT::i16); if (Op.getValueType() == MVT::i32) { SDValue TLo = extractWide32Lo(DAG, DL, TVal); SDValue THi = extractWide32Hi(DAG, DL, TVal); SDValue FLo = extractWide32Lo(DAG, DL, FVal); SDValue FHi = extractWide32Hi(DAG, DL, FVal); SDValue Lo = DAG.getSelectCC(DL, Bool, Zero, TLo, FLo, ISD::SETNE); SDValue Hi = DAG.getSelectCC(DL, Bool, Zero, THi, FHi, ISD::SETNE); return buildWide32(DAG, DL, Lo, Hi); } return DAG.getSelectCC(DL, Bool, Zero, TVal, FVal, ISD::SETNE); } // SELECT_CC with i32 result (i16 LHS): split TVal/FVal into halves // and run a per-half i16 SELECT_CC sharing the same condition. 
if (Op.getValueType() == MVT::i32) { SDValue TLo = extractWide32Lo(DAG, DL, TVal); SDValue THi = extractWide32Hi(DAG, DL, TVal); SDValue FLo = extractWide32Lo(DAG, DL, FVal); SDValue FHi = extractWide32Hi(DAG, DL, FVal); SDValue Lo = DAG.getSelectCC(DL, LHS, RHS, TLo, FLo, CC); SDValue Hi = DAG.getSelectCC(DL, LHS, RHS, THi, FHi, CC); return buildWide32(DAG, DL, Lo, Hi); } W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL); if (TCC == W65816CC::COND_INVALID) report_fatal_error("W65816: select_cc condition not yet implemented"); SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS); SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8); // SDTypeProfile declares 1 result (the selected value). Earlier // code passed a 2-VT list (value + Glue) which was silently wrong // and trips an SDNode-validity assertion in assertions builds. SDValue Ops[] = {TVal, FVal, CCOp, Glue}; return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), Ops); } // i8 -> i16 sign extend. Branchless 3-instruction trick: // sext(x) = ((x & 0xFF) ^ 0x80) - 0x80 // Verify: x=0x00 -> 0x80 - 0x80 = 0x0000. x=0x7F -> 0xFF - 0x80 = 0x7F. // x=0x80 -> 0x00 - 0x80 = 0xFF80 (-128). x=0xFF -> 0x7F - 0x80 // = 0xFFFF (-1). // Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080 (10 bytes total, // no branches, no temp slots — much cheaper than the SELECT_CC diamond // version that produced ~14 instructions plus stack spills). SDValue W65816TargetLowering::LowerSignExtend(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); if (X.getValueType() != MVT::i8 || Op.getValueType() != MVT::i16) return SDValue(); SDLoc DL(Op); SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, X); SDValue Sign = DAG.getConstant(0x0080, DL, MVT::i16); SDValue Xor = DAG.getNode(ISD::XOR, DL, MVT::i16, Z, Sign); return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign); } // ptr32 foundation hook. 
In ptr16 mode (PointerWidth=16, current // default) addresses are i16 and we return SDValue() so the legalizer // keeps the load and the existing LDAptr / STAptr selection patterns // match. In ptr32 mode addresses are i32 and we wrap the load in // W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter // can take the bank byte from sub_hi instead of forcing 0. // // Byte loads (zextload, anyext, true i8) keep going through the i16 // LDA + AND #$FF idiom — same trick the existing LDAptr uses; for // ptr32 mode the load is still 16 bits, just bank-explicit. SDValue W65816TargetLowering::LowerLoad(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Ld = cast(Op); SDValue Chain = Ld->getChain(); SDValue Ptr = Ld->getBasePtr(); EVT VT = Op.getValueType(); SDLoc DL(Op); // Const-int address: leave the SDAG alone so the tablegen pattern // `(load (iPTR imm))` → LDA8long fires (bank-explicit). See the // mirrored short-circuit at the top of LowerStore. if (isa(Ptr) && (VT == MVT::i8 || VT == MVT::i16)) return SDValue(); // i32 LOAD: split into two i16 loads at offsets 0 and 2 then // REG_SEQUENCE the halves into a Wide32. Address may be i16 (stack // slot, global) or i32 (ptr32 deref); the recursive ADD handles // address arithmetic correctly via LowerI32Bin. 
if (VT == MVT::i32) { EVT PtrVT = Ptr.getValueType(); SDValue Two = DAG.getConstant(2, DL, PtrVT); SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two); SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr, Ld->getPointerInfo(), Ld->getAlign(), Ld->getMemOperand()->getFlags()); SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2, Ld->getPointerInfo().getWithOffset(2), Ld->getAlign(), Ld->getMemOperand()->getFlags()); SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); SDValue Val = buildWide32(DAG, DL, Lo, Hi); return DAG.getMergeValues({Val, NewChain}, DL); } // Same fold as LowerStore: a Wide32 ptr built from Wrapper + // WrapperBank of the same global, OR a raw GlobalAddress, lets us // emit an abs-16 (DBR-relative) load (LDA / LDA8abs) instead of // the slower [dp],Y indirect-long. Our globals are in the load // segment that crt0 pins to DBR. SDValue FoldedLo; if (auto *GA = dyn_cast(Ptr)) { FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16, GA->getOffset())); } else if (auto *ES = dyn_cast(Ptr)) { FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16)); } else if (Ptr.getNode()->isMachineOpcode() && Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { SDValue PLo, PHi; for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) { if (auto *CIdx = dyn_cast(Ptr.getOperand(i + 1))) { if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i); else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i); } } if (PLo && PHi && PLo.getOpcode() == W65816ISD::Wrapper && PHi.getOpcode() == W65816ISD::WrapperBank) { SDValue WLo = PLo.getOperand(0); SDValue WHi = PHi.getOperand(0); auto *GLo = dyn_cast(WLo); auto *GHi = dyn_cast(WHi); auto *ELo = dyn_cast(WLo); auto *EHi = dyn_cast(WHi); bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() && GLo->getOffset() == 
GHi->getOffset()); bool SameExtern = (ELo && EHi && StringRef(ELo->getSymbol()) == EHi->getSymbol()); if (SameGlobal || SameExtern) FoldedLo = PLo; } } if (FoldedLo) { EVT MemVT = Ld->getMemoryVT(); ISD::LoadExtType ExtType = Ld->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD && MemVT == Op.getValueType()) { return DAG.getLoad(Op.getValueType(), DL, Chain, FoldedLo, Ld->getPointerInfo(), Ld->getAlign(), Ld->getMemOperand()->getFlags()); } // i1 memory type comes from GlobalOpt narrowing `short` globals // whose only assignments are 0/1. Treat as i8 load + appropriate // mask — the underlying memory is still byte-sized. if (MemVT == MVT::i1) { SDValue ByteLd = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i16, Chain, FoldedLo, MVT::i8, Ld->getMemOperand()); SDValue Val = ByteLd; if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) { Val = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd, DAG.getConstant(1, DL, MVT::i16)); } else if (ExtType == ISD::SEXTLOAD) { // i1 sign-extend: bit 0 -> all bits. AND #1 then NEG. SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, ByteLd, DAG.getConstant(1, DL, MVT::i16)); Val = DAG.getNode(ISD::SUB, DL, MVT::i16, DAG.getConstant(0, DL, MVT::i16), Bit); } if (Op.getValueType() == MVT::i8) Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val); return DAG.getMergeValues({Val, ByteLd.getValue(1)}, DL); } return DAG.getExtLoad(ExtType, DL, Op.getValueType(), Chain, FoldedLo, MemVT, Ld->getMemOperand()); } // ptr16 mode: address is i16, let the default selection handle it. if (Ptr.getValueType() != MVT::i32) return SDValue(); EVT MemVT = Ld->getMemoryVT(); // Widen i1 memVT to i8 (single-byte storage). getMemIntrinsicNode // asserts memvt must be supported; i1 isn't. if (MemVT == MVT::i1) MemVT = MVT::i8; SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other); // Try to peel a constant offset from Ptr and route through // LD_PTR_OFF — folds `(ptr + K)` into the Y-register of `[E0],Y`, // saving the i32 ADD's CLC/ADC carry chain. ~3 instr per access. 
// See feedback_ptr32_deref_fold_layer1_mi.md. // LD_PTR_OFF: deferred — the peel fires correctly but the resulting // SDAG breaks the JSON-tokenizer + snprintf smoke tests in ways // bisection didn't isolate. Stick with LD_PTR (no peel) here; the // LowerStore peel for ST_PTR_OFF / STB_PTR_OFF keeps the store-side // optimization. Future: route loads through a SDAG combine that // runs post-LegalizeOps so we see the final REG_SEQUENCE shape. SDValue Ops[] = { Chain, Ptr }; SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops, MemVT, Ld->getMemOperand()); SDValue Val = LdNode; // Byte memory access: mask the high byte for zextload, leave anyext. // i1 memVT was widened to i8 above; the mask path is the same. if (MemVT == MVT::i8) { EVT OrigMemVT = Ld->getMemoryVT(); SDValue MaskC = DAG.getConstant(OrigMemVT == MVT::i1 ? 1 : 0xFF, DL, MVT::i16); if (Ld->getExtensionType() == ISD::ZEXTLOAD || (OrigMemVT == MVT::i1 && Ld->getExtensionType() == ISD::EXTLOAD)) Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val, MaskC); else if (Ld->getExtensionType() == ISD::SEXTLOAD) Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val, DAG.getValueType(MVT::i8)); } // Narrow back to i8 if the consumer wanted i8. if (VT == MVT::i8) Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val); return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL); } // ZERO/SIGN/ANY_EXTEND i8/i16 -> i32: build a Wide32 from the i16 // payload and a 0 / sign-fill / undef high half. SDValue W65816TargetLowering::LowerExtend(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (Op.getValueType() != MVT::i32) return SDValue(); SDValue X = Op.getOperand(0); // Promote i8 inputs to i16 first via the same opcode. 
if (X.getValueType() == MVT::i8) X = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X); SDValue Lo = X; SDValue Hi; if (Op.getOpcode() == ISD::ZERO_EXTEND) { Hi = DAG.getConstant(0, DL, MVT::i16); } else if (Op.getOpcode() == ISD::SIGN_EXTEND) { // Sign-fill via SRA #15 — uses our SRA15A pattern (4 insns) and // stays i16-typed in both LHS and RHS, dodging the combiner's // shift-amount-promote when ptr32 makes pointer-typed shift // amounts i32. Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo, DAG.getConstant(15, DL, MVT::i16)); } else { Hi = DAG.getUNDEF(MVT::i16); } return buildWide32(DAG, DL, Lo, Hi); } // SIGN_EXTEND_INREG i32 with inner type i1/i8/i16: sign-extend the low // N bits of an i32 input to fill all 32 bits. The legalizer leaves // this op alone when i32 is legal — but no tablegen pattern matches // the i32 form, so without this Custom hook isel aborts with // "Cannot select: sign_extend_inreg ... ValueType:i1" on shapes like // `-(crc & 1ul)` in CRC32 loops. // // Strategy: for inner VT V (= i1 / i8 / i16), the low half's // `sext_inreg` (already pattern-matched at i16) produces the signed // i16 value — then sign-fill the high half via SRA #15 of the lo // result. SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue X = Op.getOperand(0); EVT InnerVT = cast(Op.getOperand(1))->getVT(); EVT ResVT = Op.getValueType(); // i16 result: replicate the existing tablegen patterns. We MUST // handle this case rather than returning SDValue(), because // setOperationAction's Custom-returns-SDValue() falls through to // default Expand (= SRA/SHL chain), not to tablegen pattern match. // The two existing patterns are: // (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1) // (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80 // Reproduce them at the SDAG level so the legalizer's Custom // dispatch returns a fully-lowered tree. 
if (ResVT == MVT::i16) { if (InnerVT == MVT::i1) { SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X, DAG.getConstant(1, DL, MVT::i16)); return DAG.getNode(ISD::SUB, DL, MVT::i16, DAG.getConstant(0, DL, MVT::i16), Bit); } if (InnerVT == MVT::i8) { SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X, DAG.getConstant(0xFF, DL, MVT::i16)); SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked, DAG.getConstant(0x80, DL, MVT::i16)); return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored, DAG.getConstant(0x80, DL, MVT::i16)); } // inner i16 = no-op. return X; } if (ResVT != MVT::i32) return SDValue(); // i32 result: project the input's low half (X is i32 Wide32 here), // apply the inner-VT sext on the i16 low half, sign-fill the hi. SDValue Lo = extractWide32Lo(DAG, DL, X); if (InnerVT != MVT::i16) { Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo, DAG.getValueType(InnerVT)); } // Sign-fill the hi half via SRA #15 — same idiom LowerExtend uses for // SIGN_EXTEND i16 -> i32. SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo, DAG.getConstant(15, DL, MVT::i16)); return buildWide32(DAG, DL, Lo, Hi); } // TRUNCATE i32 -> i16: project sub_lo. SDValue W65816TargetLowering::LowerTruncate(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (Op.getOperand(0).getValueType() != MVT::i32) return SDValue(); if (Op.getValueType() == MVT::i16) return extractWide32Lo(DAG, DL, Op.getOperand(0)); if (Op.getValueType() == MVT::i8) { // i32 -> i16 -> i8. The i8 trunc pattern is COPY_TO_REGCLASS at MC // level; the i16 sub_lo extract is the work. SDValue Lo16 = extractWide32Lo(DAG, DL, Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16); } return SDValue(); } // i32 Constant: split into two i16 constants and REG_SEQUENCE. 
// Returns SDValue() for any non-i32 constant so the legalizer keeps it.
SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::i32)
    return SDValue();
  uint64_t V = cast<ConstantSDNode>(Op)->getZExtValue();
  SDValue Lo = DAG.getConstant(V & 0xFFFFu, DL, MVT::i16);
  SDValue Hi = DAG.getConstant((V >> 16) & 0xFFFFu, DL, MVT::i16);
  return buildWide32(DAG, DL, Lo, Hi);
}

// ADD/SUB/AND/OR/XOR i32 -> per-half i16 op. ADDC/ADDE chain for ADD,
// SUBC/SUBE for SUB. AND/OR/XOR are independent halves. Returns
// SDValue() for non-i32 results and for any other opcode.
SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::i32)
    return SDValue();

  // Classify the opcode before touching the DAG so an unsupported node
  // bails out without leaving dead half-extract nodes behind.
  const unsigned Opc = Op.getOpcode();
  bool Carried = false;           // true when the halves are carry-linked
  unsigned LoOpc = Opc, HiOpc = Opc;
  switch (Opc) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    break;
  case ISD::ADD:
    Carried = true;
    LoOpc = ISD::ADDC;
    HiOpc = ISD::ADDE;
    break;
  case ISD::SUB:
    Carried = true;
    LoOpc = ISD::SUBC;
    HiOpc = ISD::SUBE;
    break;
  default:
    return SDValue();
  }

  SDValue L = Op.getOperand(0);
  SDValue R = Op.getOperand(1);
  SDValue LL = extractWide32Lo(DAG, DL, L);
  SDValue LH = extractWide32Hi(DAG, DL, L);
  SDValue RL = extractWide32Lo(DAG, DL, R);
  SDValue RH = extractWide32Hi(DAG, DL, R);

  SDValue Lo, Hi;
  if (Carried) {
    // The low op produces the carry/borrow as glue; the high op
    // consumes it as its third operand.
    SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
    SDValue LoNode = DAG.getNode(LoOpc, DL, VTs, LL, RL);
    Lo = LoNode.getValue(0);
    SDValue Flag = LoNode.getValue(1);
    Hi = DAG.getNode(HiOpc, DL, VTs, LH, RH, Flag).getValue(0);
  } else {
    Lo = DAG.getNode(LoOpc, DL, MVT::i16, LL, RL);
    Hi = DAG.getNode(HiOpc, DL, MVT::i16, LH, RH);
  }
  return buildWide32(DAG, DL, Lo, Hi);
}

// Store companion to LowerLoad. For i32 addresses, dispatch to the
// 16-bit ST_PTR or the byte-truncating STB_PTR target node based on
// MemoryVT.
For i16 addresses (ptr16 mode), bail out and let the // existing STAptr / STBptr patterns match. SDValue W65816TargetLowering::LowerStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *St = cast(Op); SDValue Chain = St->getChain(); SDValue Val = St->getValue(); SDValue Ptr = St->getBasePtr(); EVT MemVT = St->getMemoryVT(); SDLoc DL(Op); // Const-int address (`*(volatile uint8*)0xC035 = v`): leave the SDAG // alone so the tablegen pattern `(store Acc8, (iPTR imm))` → // STA8long fires. Without this short-circuit the i32-pointer code // below promotes the constant address into a Wide32 register pair // and routes through STBptr32 ([dp],Y), which is 16 B / 30 cyc and // (worse) bank-tracks DBR. if (isa(Ptr)) return SDValue(); // i32 STORE: split into two halves. Critical: the per-half stores // MUST go through the target-specific W65816ISD::ST_PTR node and not // through plain ISD::STORE, otherwise the SDAG combiner's // MergeConsecutiveStores re-combines them into a single i32 store // that re-enters LowerStore — infinite loop, OOM in the combiner. // For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular // store-merger doesn't trip there because address splitting via // ISD::ADD on i16 doesn't itself fan out into ptr-pair operations. if (Val.getValueType() == MVT::i32) { SDValue Lo = extractWide32Lo(DAG, DL, Val); SDValue Hi = extractWide32Hi(DAG, DL, Val); EVT PtrVT = Ptr.getValueType(); // ptr32 const-i32-addr fast path: `*(uint32_t*)0x5000 = v` should // lower to two STAabs (DBR-relative, 5 cyc each) instead of two // [dp],Y stores via ST_PTR. Detect Wide32-zero-hi Constant ptr, // emit two i16 stores at TargetConstant:i32 addrs. TargetConstant // (not Constant) so LowerI32Constant doesn't re-fire and recreate // the REG_SEQUENCE. The STAabs timm pattern matches. 
if (PtrVT == MVT::i32 && Ptr.getNode()->isMachineOpcode() && Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { SDValue PtrLo, PtrHi; for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) { if (auto *CIdx = dyn_cast(Ptr.getOperand(i + 1))) { if (CIdx->getZExtValue() == llvm::sub_lo) PtrLo = Ptr.getOperand(i); else if (CIdx->getZExtValue() == llvm::sub_hi) PtrHi = Ptr.getOperand(i); } } auto *PtrHiC = dyn_cast_or_null(PtrHi); auto *PtrLoC = dyn_cast_or_null(PtrLo); if (PtrLoC && PtrHiC && PtrHiC->getZExtValue() == 0) { uint64_t Base = PtrLoC->getZExtValue() & 0xFFFF; SDValue PLo = DAG.getTargetConstant(Base, DL, MVT::i32); SDValue PHi = DAG.getTargetConstant((Base + 2) & 0xFFFF, DL, MVT::i32); SDValue StLo = DAG.getStore(Chain, DL, Lo, PLo, St->getPointerInfo(), St->getAlign(), St->getMemOperand()->getFlags()); SDValue StHi = DAG.getStore(StLo, DL, Hi, PHi, St->getPointerInfo().getWithOffset(2), St->getAlign(), St->getMemOperand()->getFlags()); return StHi; } } SDValue Two = DAG.getConstant(2, DL, PtrVT); SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two); if (PtrVT == MVT::i32) { // ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially // chained. The combiner cannot merge target-opaque MemIntrinsic // stores. SDVTList VTs = DAG.getVTList(MVT::Other); SDValue OpsLo[] = { Chain, Lo, Ptr }; SDValue StLo = DAG.getMemIntrinsicNode( W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16, St->getMemOperand()); SDValue OpsHi[] = { StLo, Hi, Ptr2 }; MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand( St->getMemOperand(), 2, 2); SDValue StHi = DAG.getMemIntrinsicNode( W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi); return StHi; } // ptr16 path — emit two regular i16 stores serially chained so the // store-merger sees them as a 4-byte sequence (which it will likely // leave alone since the resulting i32 store has no legal target // pattern in ptr16 mode anyway). 
SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr, St->getPointerInfo(), St->getAlign(), St->getMemOperand()->getFlags()); SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2, St->getPointerInfo().getWithOffset(2), St->getAlign(), St->getMemOperand()->getFlags()); return StHi; } // Optimization: if the store goes through a global address (raw // GlobalAddress/ExternalSymbol, or a Wide32 built from Wrapper + // WrapperBank of the same symbol), lower to a plain i16/i8 store // through a single Wrapper@symbol so the tablegen pattern // (store Acc8/Acc16, (W65816Wrapper tglobaladdr:$g)) // selects STA8abs / STAabs (DBR-relative). Our globals live in // the load segment that crt0 pins to DBR, so abs-16 reaches them. // This avoids the 14-byte [dp],y indirect-long path AND re-enables // the STZ peephole that the indirect path defeats. SDValue FoldedLo; if (auto *GA = dyn_cast(Ptr)) { FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16, GA->getOffset())); } else if (auto *ES = dyn_cast(Ptr)) { FoldedLo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16)); } else if (Ptr.getNode()->isMachineOpcode() && Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { SDValue PLo, PHi; for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) { if (auto *CIdx = dyn_cast(Ptr.getOperand(i + 1))) { if (CIdx->getZExtValue() == llvm::sub_lo) PLo = Ptr.getOperand(i); else if (CIdx->getZExtValue() == llvm::sub_hi) PHi = Ptr.getOperand(i); } } if (PLo && PHi && PLo.getOpcode() == W65816ISD::Wrapper && PHi.getOpcode() == W65816ISD::WrapperBank) { SDValue WLo = PLo.getOperand(0); SDValue WHi = PHi.getOperand(0); auto *GLo = dyn_cast(WLo); auto *GHi = dyn_cast(WHi); auto *ELo = dyn_cast(WLo); auto *EHi = dyn_cast(WHi); bool SameGlobal = (GLo && GHi && GLo->getGlobal() == GHi->getGlobal() && GLo->getOffset() == GHi->getOffset()); bool SameExtern = (ELo && EHi && StringRef(ELo->getSymbol()) 
== EHi->getSymbol()); if (SameGlobal || SameExtern) FoldedLo = PLo; } } if (FoldedLo) { // Preserve memVT — original may be a truncating store (e.g., // i16 value into i8 memory). getStore picks memVT from Val's // type, which can mismatch the original MachineMemOperand size. if (MemVT == Val.getValueType()) { return DAG.getStore(Chain, DL, Val, FoldedLo, St->getPointerInfo(), St->getAlign(), St->getMemOperand()->getFlags()); } return DAG.getTruncStore(Chain, DL, Val, FoldedLo, MemVT, St->getMemOperand()); } // No i32 ptr → nothing for us to do; let the default ISD::STORE // path handle it. (Also avoids accidentally wrapping an i16 ptr // store into ST_PTR below, whose ptr operand must be i32.) if (Ptr.getValueType() != MVT::i32) return SDValue(); // The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap // around STBptr32 narrows in memory. Promote i8 values to i16 with // ANY_EXTEND — the inserter only writes one byte, so the high half // is don't-care. if (Val.getValueType() == MVT::i8) Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val); SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Base; uint16_t Off = 0; if (peelPtr32Offset(DAG, DL, Ptr, Base, Off)) { unsigned OffOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR_OFF) : unsigned(W65816ISD::ST_PTR_OFF); SDValue OffN = DAG.getTargetConstant(Off, DL, MVT::i16); SDValue OpsOff[] = { Chain, Val, Base, OffN }; return DAG.getMemIntrinsicNode(OffOpc, DL, VTs, OpsOff, MemVT, St->getMemOperand()); } unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR) : unsigned(W65816ISD::ST_PTR); SDValue Ops[] = { Chain, Val, Ptr }; return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT, St->getMemOperand()); } // VAARG: load *ap, advance ap by sizeof(VT). 
Unlike the default // expansion, we do NOT align ap to the type's preferred alignment — // caller-pushed varargs land at byte-granular addresses (PHA from an // odd S leaves the low byte at S+1 which is even, but our prologue's // TSC-sequence can produce odd S, etc.). Aligning ap would skip the // pushed value's low byte. static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAListPtr = Op.getOperand(1); EVT VT = Op.getValueType(); // ap (va_list) is `char *` on this target — i16 under ptr16, i32 // under ptr32. Load and store it at PtrVT so we don't truncate and // lose the high half (under ptr32, hi=0 so the truncation read garbage // back, then the i16 store wrote i16 over the lo half but left an // unrelated value in the hi — silent miscompile of every variadic // call on ptr32). EVT PtrVT = VAListPtr.getValueType(); SDValue Ap = DAG.getLoad(PtrVT, DL, Chain, VAListPtr, MachinePointerInfo()); Chain = Ap.getValue(1); // For the actual data deref: under ptr16 we route i16 through // VAARG_LOAD (bank-0-explicit `[dp],Y`). Under ptr32, ap is already // a Wide32 ptr with hi=0 (caller set up the va_list to point into the // call-frame stack-args region, bank 0); a regular load through that // pointer routes to LDAptr32 / STBptr32 which already deref bank-0. SDValue Val; if (VT == MVT::i16 && PtrVT == MVT::i16) { SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other); Val = DAG.getNode(W65816ISD::VAARG_LOAD, DL, VTs, Chain, Ap); Chain = Val.getValue(1); } else { Val = DAG.getLoad(VT, DL, Chain, Ap, MachinePointerInfo()); Chain = Val.getValue(1); } // ap += sizeof(VT) (rounded up to whole bytes). 
unsigned Size = (VT.getSizeInBits() + 7) / 8; SDValue NewAp = DAG.getNode(ISD::ADD, DL, PtrVT, Ap, DAG.getConstant(Size, DL, PtrVT)); Chain = DAG.getStore(Chain, DL, NewAp, VAListPtr, MachinePointerInfo()); return DAG.getMergeValues({Val, Chain}, DL); } // VASTART: store the address of the first vararg slot (recorded by // LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer. // va_list is just `i16 *next` here — minimum implementation. static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, const W65816TargetLowering &TLI) { MachineFunction &MF = DAG.getMachineFunction(); auto *FuncInfo = MF.getInfo(); SDLoc DL(Op); // FrameIndex must be at PtrVT (i16 under ptr16, i32 under ptr32) so // the subsequent store writes the full pointer width. Under ptr32 // the i32 FI lowers via the i32 pointer-store path; the high half // is implicitly 0 (stack is bank 0) and stored alongside the lo. EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue VAFI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); SDValue Chain = Op.getOperand(0); SDValue VAListPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Chain, DL, VAFI, VAListPtr, MachinePointerInfo(SV)); } SDValue W65816TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BRIND: return LowerBRIND(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: { // Custom-lower SELECT for i32 result: split into per-half // selects. Without this, the legalizer's default (rewriting // SELECT to SELECT_CC against zero) produces SELECT_CC i32 of // a different shape that re-enters Custom and creates a cycle. 
if (Op.getValueType() != MVT::i32) return SDValue(); SDValue Cond = Op.getOperand(0); SDValue TVal = Op.getOperand(1); SDValue FVal = Op.getOperand(2); SDLoc DL(Op); SDValue TLo = extractWide32Lo(DAG, DL, TVal); SDValue THi = extractWide32Hi(DAG, DL, TVal); SDValue FLo = extractWide32Lo(DAG, DL, FVal); SDValue FHi = extractWide32Hi(DAG, DL, FVal); SDValue Lo = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TLo, FLo); SDValue Hi = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, THi, FHi); return buildWide32(DAG, DL, Lo, Hi); } case ISD::SIGN_EXTEND: if (Op.getValueType() == MVT::i32) return LowerExtend(Op, DAG); return LowerSignExtend(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG, *this); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op, DAG); case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return LowerExtend(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSignExtendInReg(Op, DAG); case ISD::TRUNCATE: return LowerTruncate(Op, DAG); case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR: case ISD::XOR: return LowerI32Bin(Op, DAG); case ISD::MUL: return LowerMUL_I32(Op, DAG); case ISD::LOAD: return LowerLoad(Op, DAG); case ISD::STORE: return LowerStore(Op, DAG); case ISD::Constant: return LowerI32Constant(Op, DAG); // SJLJ EH: setup_dispatch is a no-op on this target — the dispatcher // logic lives entirely in the SJLJ runtime (_Unwind_SjLj_Resume + // longjmp into the function context's jmp_buf). The isel layer // doesn't need to emit any code; just thread the chain through. case ISD::EH_SJLJ_SETUP_DISPATCH: return Op.getOperand(0); case ISD::TRAP: case ISD::DEBUGTRAP: { // Wrap the incoming chain in a W65816ISD::TRAP node; the InstrInfo.td // pattern (W65816trap) selects BRK_pseudo, which the AsmPrinter // expands to sentinel-store + BRK + self-loop. Threading the chain // through keeps memory-ordering side effects honest (the trap is // observed after any prior store). 
SDLoc DL(Op); SDValue Chain = Op.getOperand(0); return DAG.getNode(W65816ISD::TRAP, DL, MVT::Other, Chain); } case ISD::DYNAMIC_STACKALLOC: return LowerDynamicStackalloc(Op, DAG); case ISD::STACKSAVE: { // Return Constant 0 — SJLJ stores this into the function context // but our setjmp/longjmp manage SP directly, so the value is dead. SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Chain = Op.getOperand(0); SDValue Result; if (VT == MVT::i16) Result = DAG.getConstant(0, DL, MVT::i16); else Result = buildWide32(DAG, DL, DAG.getConstant(0, DL, MVT::i16), DAG.getConstant(0, DL, MVT::i16)); return DAG.getMergeValues({Result, Chain}, DL); } case ISD::STACKRESTORE: // No-op — pass the chain through. return Op.getOperand(0); case ISD::FRAMEADDR: { // FRAMEADDR(N): SJLJ uses N=0 (current frame). We don't reserve a // frame pointer and SP isn't trivially CopyFromReg-able (no // register class). Return Constant 0 — SJLJ uses it as an opaque // per-frame identifier; the SJLJ runtime tracks frames by jmp_buf // chaining (FnCtx::prev) rather than by FRAMEADDR value, so a // constant works for single-throw / non-nested-catch programs. // True multi-frame SJLJ would need a TSC-based unique value. SDLoc DL(Op); EVT VT = Op.getValueType(); if (VT == MVT::i16) return DAG.getConstant(0, DL, MVT::i16); SDValue Lo = DAG.getConstant(0, DL, MVT::i16); SDValue Hi = DAG.getConstant(0, DL, MVT::i16); return buildWide32(DAG, DL, Lo, Hi); } default: #ifndef NDEBUG Op.dump(); #endif llvm_unreachable("W65816: unexpected operation in LowerOperation"); } } std::pair W65816TargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // Strip leading '{' and trailing '}' for the long form. 
StringRef C = Constraint; if (C.size() >= 2 && C.front() == '{' && C.back() == '}') C = C.substr(1, C.size() - 2); if (VT == MVT::i8) { if (C == "a") return {W65816::A, &W65816::Acc8RegClass}; if (C == "x") return {W65816::X, &W65816::Idx8RegClass}; if (C == "y") return {W65816::Y, &W65816::Idx8RegClass}; if (C == "r") return {W65816::A, &W65816::Acc8RegClass}; } else { // i16 default; pointer types fold here too if (C == "a") return {W65816::A, &W65816::Acc16RegClass}; if (C == "x") return {W65816::X, &W65816::Idx16RegClass}; if (C == "y") return {W65816::Y, &W65816::Idx16RegClass}; if (C == "r") return {W65816::A, &W65816::Acc16RegClass}; } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const { // (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain). // Lowered as: stash entry SP -> DP $F4 (handled by emitPrologue when // MFI.hasVarSizedObjects), then `tsc; sec; sbc size; tcs; inc a`. // The epilogue restores SP from $F4. // // Limitation: any FrameIndex (local, spill slot, parameter) accessed // *after* the alloca reads from a wrong stack-relative offset because // PEI bakes FI offsets relative to the static-frame SP, not the // post-alloca SP. A real frame pointer would lift this; for now we // accept the limitation and document it. The simplest safe pattern // is "VLA at end of function, used immediately, no further FI access"; // anything else is at-your-own-risk until FP support lands. SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); EVT ResultVT = Op.getValueType(); // Under ptr32, both the result pointer and the size are Wide32 i32 // values. Extract the i16 lo half of size (a VLA larger than 64KB // doesn't fit in our stack anyway), do the i16 ALLOCA, then build // the Wide32 result with bank=0 (stack is always bank 0). if (ResultVT == MVT::i32) { SDValue Size16 = (Size.getValueType() == MVT::i32) ? 
extractWide32Lo(DAG, DL, Size) : Size; SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL, DAG.getVTList(MVT::i16, MVT::Other), Chain, Size16); SDValue Ptr16 = ChainAndPtr.getValue(0); SDValue NewChain = ChainAndPtr.getValue(1); SDValue Bank = DAG.getConstant(0, DL, MVT::i16); SDValue Ptr32 = buildWide32(DAG, DL, Ptr16, Bank); return DAG.getMergeValues({Ptr32, NewChain}, DL); } SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL, DAG.getVTList(MVT::i16, MVT::Other), Chain, Size); SDValue Ptr = ChainAndPtr.getValue(0); SDValue NewChain = ChainAndPtr.getValue(1); return DAG.getMergeValues({Ptr, NewChain}, DL); } SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT // (preserves sign for arithmetic right shift); SHL/SRL via ZEXT // (logical / left shifts don't care about high bits). This routes // i8 shifts through the same i16 fast paths and libcalls — no // parallel qi3 libcall set needed. The DAG combiner would otherwise // narrow `(trunc (shl (zext X), K))` back to `(shl X, K)` of i8, // re-entering this hook in an infinite loop; the // `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override above // disables that combine. if (Op.getValueType() == MVT::i8) { SDLoc DL(Op); SDValue X = Op.getOperand(0); SDValue N = Op.getOperand(1); unsigned Ext = (Op.getOpcode() == ISD::SRA) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue X16 = DAG.getNode(Ext, DL, MVT::i16, X); SDValue N16 = N.getValueType() == MVT::i16 ? N : DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N); // Special case: i8 SRA by 7 of a sign-extended value is the // sign-fill operation — every result bit is the input's bit 7. // For sext(i8 x), bit 15 == bit 7, so `(sra (sext x), 7)` yields // the same result as `(sra (sext x), 15)`, which we have a tight // 4-insn pattern for via SRA15A. 
Avoids the __ashrhi3 libcall // (~10 insns plus arg push/pop overhead) — abs8 dropped from 47 // to 35 insns with this rewrite in place. if (Op.getOpcode() == ISD::SRA) { if (auto *C = dyn_cast(N)) { if (C->getZExtValue() == 7) { N16 = DAG.getConstant(15, DL, MVT::i16); } } } SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16); } // Fast path: shift-by-{1,2,3,4} have inline tablegen patterns. Return // Op (the unchanged node) so the legalizer leaves it alone — the // pattern matcher catches it later. Returning SDValue() instead // would fall through to the generic Expand path, which generates a // BUILD_VECTOR-based magic-constant rewrite that we can't lower. // Also allow `(srl x, 15)` through — pattern SRL15A handles it as // `ASL A; LDA #0; ROL A` (3 bytes), much shorter than the libcall. // The type-legalizer's i32-shift-by-1 expansion emits this exact // node for the high-half "bit-from-low" slot. // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3). // i16 only — i32 always routes to libcall (no inline i32 patterns). SDValue Amount = Op.getOperand(1); if (Op.getValueType() == MVT::i16) { if (auto *C = dyn_cast(Amount)) { uint64_t N = C->getZExtValue(); if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) && N >= 1 && N <= 14) return Op; if (N == 15 && (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL)) return Op; if (N == 1 && Op.getOpcode() == ISD::SRA) return Op; if (N == 15 && Op.getOpcode() == ISD::SRA) return Op; } } bool IsI32 = Op.getValueType() == MVT::i32; // Inline i32 shift-by-small-constant. The libcall path is ~140 cyc // (post-tightening); unrolling N i16 ops plus carry propagation runs // in ~30-90 cyc. popcount, djb2-style hashes, BigInt-style code, and // CRC routines all hit this. Larger N falls through to the libcall — // the unrolled cost grows linearly while the libcall is constant. 
// Cutoff at N=5 chosen empirically: djb2's `(h << 5) + h` is the // common one that benefits. SRA needs an arithmetic-fill shift on // the high half (i16 SRA by 1 is tablegen-supported); the low half is // filled from the high's departing bit just like SRL. if (IsI32) { if (auto *C = dyn_cast(Amount)) { uint64_t N = C->getZExtValue(); unsigned Op0 = Op.getOpcode(); if (N >= 1 && N <= 5 && (Op0 == ISD::SHL || Op0 == ISD::SRL || Op0 == ISD::SRA)) { SDLoc DL(Op); SDValue X = Op.getOperand(0); SDValue Lo = extractWide32Lo(DAG, DL, X); SDValue Hi = extractWide32Hi(DAG, DL, X); SDValue ShN = DAG.getConstant(N, DL, MVT::i16); SDValue ShCo = DAG.getConstant(16 - N, DL, MVT::i16); if (Op0 == ISD::SHL) { // (Hi:Lo) << N == ((Hi << N) | (Lo >> (16-N))) : (Lo << N) // 4 SDAG ops instead of N iterations of 4 ops. Lets the // combiner / isel produce ASLA16-cascade + SRL8A+LSRA16- // cascade + single OR, avoiding the bit-by-bit OR cascade // that the unrolled form produced. SDValue NewLo = DAG.getNode(ISD::SHL, DL, MVT::i16, Lo, ShN); SDValue HiTop = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, ShCo); SDValue HiShl = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, ShN); SDValue NewHi = DAG.getNode(ISD::OR, DL, MVT::i16, HiShl, HiTop); return buildWide32(DAG, DL, NewLo, NewHi); } else { // SRL/SRA by N: NewHi = Hi >> N (logical or arithmetic); // NewLo = (Lo >> N) | (Hi << (16-N)). SDValue NewHi = DAG.getNode(Op0, DL, MVT::i16, Hi, ShN); SDValue LoTop = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, ShCo); SDValue LoSrl = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, ShN); SDValue NewLo = DAG.getNode(ISD::OR, DL, MVT::i16, LoSrl, LoTop); return buildWide32(DAG, DL, NewLo, NewHi); } } } } RTLIB::Libcall LC; switch (Op.getOpcode()) { case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break; case ISD::SRL: LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16; break; case ISD::SRA: LC = IsI32 ? 
RTLIB::SRA_I32 : RTLIB::SRA_I16; break; default: llvm_unreachable("not a shift"); } SDValue Val = Op.getOperand(0); if (IsI32 && Op.getOpcode() == ISD::SHL) { // Force the high half of the input to be concretely zero when the // shift count K is >= 16, so bits K..31 of the input are // mathematically irrelevant. SDAG legalisation can mark those bits // as `undef` to give the regalloc freedom, but our libcall (a true // 32-bit shift-and-rotate loop in libgcc.s) reads ALL 32 input // bits and propagates garbage into the result's low half. Caught // by dadd via the dpack-inline `(u64 e) << 52` path which split // into __ashlsi3(e_lo, 20) with X = undef → wrong mantissa. // For SRL/SRA we'd zero/sign-extend the LOW half similarly when // K >= 16, but those paths aren't exercising the bug yet. if (auto *C = dyn_cast(Op.getOperand(1))) { unsigned K = (unsigned)C->getZExtValue(); if (K >= 16) { SDValue Lo = extractWide32Lo(DAG, SDLoc(Op), Val); SDValue Zero = DAG.getConstant(0, SDLoc(Op), MVT::i16); Val = buildWide32(DAG, SDLoc(Op), Lo, Zero); } } } SmallVector Args = {Val, Op.getOperand(1)}; TargetLowering::MakeLibCallOptions Opts; Opts.setIsSigned(Op.getOpcode() == ISD::SRA); return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first; } SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { auto *GA = cast(Op); SDLoc DL(Op); EVT PtrVT = Op.getValueType(); // i16 in ptr16 mode, i32 in ptr32 mode if (PtrVT == MVT::i32) { // i32 GlobalAddress: build Wide32 from (i16 offset, i16 bank). // Lo = Wrapper(target) → fixup_16 (offset bytes) // Hi = WrapperBank(target) → fixup_bank16 (bank byte + 0 pad) // The linker / OMF Loader patch both halves so the runtime // pointer reflects the actual placed segment, not the link-time // text-base. Resolves the long-standing "ldx #0 is hardcoded" // bug that broke toolbox-call pointer args. 
SDValue OffTgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16, GA->getOffset()); SDValue Lo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, OffTgt); SDValue Hi = DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, OffTgt); return buildWide32(DAG, DL, Lo, Hi); } SDValue Tgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, PtrVT, GA->getOffset()); return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt); } SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { auto *ES = cast(Op); SDLoc DL(Op); EVT PtrVT = Op.getValueType(); if (PtrVT == MVT::i32) { SDValue OffTgt = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16); SDValue Lo = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, OffTgt); SDValue Hi = DAG.getNode(W65816ISD::WrapperBank, DL, MVT::i16, OffTgt); return buildWide32(DAG, DL, Lo, Hi); } SDValue Tgt = DAG.getTargetExternalSymbol(ES->getSymbol(), PtrVT); return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt); } SDValue W65816TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // ABI: first i16/i8 argument is passed in A; remaining arguments are // pushed by the caller right-to-left and read via stack-relative // addressing. After JSL pushes 3 bytes of return address, the layout // viewed from the callee is: // (high addr) arg N-1 // ... // arg 1 // ret-addr-bank <- (4,S) when M=0 // ret-addr-hi <- (3,S) // ret-addr-lo <- (2,S) // (low addr) <- (1,S) // // Each i16 stack arg occupies 2 bytes. arg 1 lives at (4,S). MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); // i32 first-arg ABI. Two flavors as in LowerCall: // - Legal-i32 (Wide32 reg class registered): single i32 InputArg. // - Split-i32 (legacy): two i16 InputArgs both with OrigArgIndex==0. 
bool I32SplitFirstArg = Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0; // True iff the FIRST original arg spans 4 i16s (i.e., is i64). Used // below to choose the Img16-via-STX_DP X-arg path for i64 callees, // which dodges greedy's TXA-bridge-clobbers-A spill bug. i32-first // doesn't get the same treatment because the change pessimizes // simple functions like `int add32(int a, int b) { return a+b; }` // where greedy's regular A:X handling is fine. // Two shapes for i64-first-arg under different ptr modes: // ptr16 (i32 illegal): Ins[0..3] = 4 i16 halves of arg0 // ptr32 (i32 legal): Ins[0..1] = 2 i32 halves of arg0 — but the // IR-level "single i64 first arg" still splits // to 4 i16 in Outs/Ins because i64 isn't legal. // So the i16-form detection still applies here. bool I64FirstArg = Ins.size() >= 4 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && Ins[2].VT == MVT::i16 && Ins[3].VT == MVT::i16 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0 && Ins[2].OrigArgIndex == 0 && Ins[3].OrigArgIndex == 0; // Also detect the i32-split shape: Ins[0..1] = 2 i32 halves of arg0 // (with OrigArgIndex==0 on both). This happens with ptr32 active and // i64 legalized via i32-split rather than i16-quad-split. if (!I64FirstArg && Ins.size() >= 2 && Ins[0].VT == MVT::i32 && Ins[1].VT == MVT::i32 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0) I64FirstArg = true; unsigned ArgIdx = 0; // Stack offset is measured from S+1 (the WDC convention) and grows // upward as we walk through the stack-passed args. unsigned StackOffset = 4; // Skip 3 ret-addr bytes; first slot at S+4. for (const ISD::InputArg &Arg : Ins) { MVT VT = Arg.VT; if (VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i32) report_fatal_error("W65816: argument type not yet supported"); if (ArgIdx == 0 && VT == MVT::i32) { // Whole-i32 first arg: lo half live-in via $a, hi via $x. 
// The W65816LowerWide32 pre-RA pass walks the resulting // REG_SEQUENCE and rewrites Wide32 uses into pairs of i16 // operations — keeping AX32 out of the regalloc's pair- // allocation path entirely. // For i64-first-arg signatures (the IR has a single i64 arg // that splits to 2 i32 in Ins[0..1] under ptr32), route BOTH // halves through Img16. Without this the regalloc emits // `TXA; STA spill_X; STA spill_A` at function entry — the TXA // clobbers $a (arg0_0) before the A-spill saves it, so both // spill slots end up holding arg0_1. Caused __adddf3(1.5,2.5) // → 1.5 because the cb-test path read TXA-corrupted A. // Route the hi half through Img16 (DP-backed) for whole-i32 first // args. The Idx16 (X-only) class collapses through the W65816LowerWide32 // pre-RA pass to plain Acc16, after which regalloc treats both halves // as competing for $a — a TXA at the top of any non-trivial function // body destroys arg0_lo before it's spilled (silent miscompile of // every i32-arg function with > a few uses). Img16 forces an // STX_DP at function entry, immune to A-reuse. i64-first already // did this; under ptr32 the same hazard hits any i32 arg. const TargetRegisterClass *VRegLoRC = I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass; const TargetRegisterClass *VRegHiRC = &W65816::Img16RegClass; Register VRegLo = MRI.createVirtualRegister(VRegLoRC); Register VRegHi = MRI.createVirtualRegister(VRegHiRC); MRI.addLiveIn(W65816::A, VRegLo); MRI.addLiveIn(W65816::X, VRegHi); SDValue Lo = DAG.getCopyFromReg(Chain, DL, VRegLo, MVT::i16); SDValue Hi = DAG.getCopyFromReg(Chain, DL, VRegHi, MVT::i16); InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); } else if (ArgIdx == 0) { // First arg in A. For i64-first-arg signatures (4 i16 halves of // arg0 with OrigArgIndex==0), route arg0_0 through Img16 the same // way ArgIdx==1 does — via an entry STA-to-DP-slot at function // entry. 
Without this, the regalloc emits a TXA bridge for // arg0_1's spill that clobbers $a (= arg0_0) BEFORE arg0_0 has // been saved, and BOTH arg0_0 and arg0_1's spill slots end up // holding arg0_1. Observed as `__adddf3(1.5, 2.5) → 1.5` because // the cb-test BEQ sees flags from a TXA-clobbered LDA cb path. const TargetRegisterClass *RC = (VT == MVT::i16) ? (I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass) : &W65816::Acc8RegClass; Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); } else if (ArgIdx == 1 && I32SplitFirstArg) { // First-arg hi half (or arg0_ml for i64-first-arg): in X. // For i64-first-arg signatures (4 i16s with OrigArgIndex 0), use // Img16 so greedy parks the value in an IMG slot via STX_DP, // dodging the TXA-bridge-clobbers-A spill bug. i32-first stays // on the original Idx16 path because the change pessimizes // simple cases (verified: vprintf's writeULong/__udivsi3 chain // crashes if i32-first is also rerouted). Caught by udivmod. const TargetRegisterClass *RC = I64FirstArg ? &W65816::Img16RegClass : &W65816::Idx16RegClass; Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::X, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); } else if (VT == MVT::i32) { // i32 stack arg: 4 bytes, loaded as 2 i16 halves and assembled // via REG_SEQUENCE into a Wide32 SDValue. 
int FILo = MFI.CreateFixedObject(2, StackOffset, /*Immutable*/true); int FIHi = MFI.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true); StackOffset += 4; SDValue FINLo = DAG.getFrameIndex(FILo, MVT::i16); SDValue FINHi = DAG.getFrameIndex(FIHi, MVT::i16); SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, FINLo, MachinePointerInfo::getFixedStack(MF, FILo)); SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, FINHi, MachinePointerInfo::getFixedStack(MF, FIHi)); InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); } else { // Subsequent args are loaded from the stack. i8 args are // promoted to i16 slots (matching CC_W65816's CCPromoteToType) // so the load can run in the function's default 16-bit M mode // without needing a per-byte SEP/REP wrap; we then truncate the // i16 back to i8 for the IR. i16 args are loaded directly. unsigned ObjSize = 2; int FI = MFI.CreateFixedObject(ObjSize, StackOffset, /*Immutable*/true); StackOffset += ObjSize; SDValue FIN = DAG.getFrameIndex(FI, MVT::i16); SDValue Val = DAG.getLoad( MVT::i16, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); if (VT == MVT::i8) Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val); InVals.push_back(Val); } ++ArgIdx; } // Vararg support: stash the FrameIndex of the next stack-arg slot // (where the caller's first vararg lives) so VASTART can use it // as the va_list start. StackOffset has been advanced past every // named stack arg; the first vararg sits at SP + StackOffset. if (IsVarArg) { int FI = MFI.CreateFixedObject(2, StackOffset, /*Immutable=*/true); auto *FuncInfo = MF.getInfo(); FuncInfo->setVarArgsFrameIndex(FI); } return Chain; } SDValue W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { // Multi-arg ABI: arg 0 in A, args 1..N-1 pushed in REVERSE order via // PUSH16 (PHA) so the callee's `(4,S)` reads pick up arg 1, `(6,S)` // gets arg 2, etc. 
CALLSEQ_START records the byte count; // ADJCALLSTACKUP after the call emits `tsc;clc;adc #N;tcs` to // release the pushed bytes (eliminateCallFramePseudoInstr). SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; auto &Outs = CLI.Outs; auto &OutVals = CLI.OutVals; auto &Ins = CLI.Ins; if (CLI.IsTailCall) CLI.IsTailCall = false; // Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X; // i64 in A:X:Y plus DP $F0..$F1 for the highest half. See // LowerReturn comment for the ABI. if (Ins.size() > 4) report_fatal_error("W65816: return type wider than 64 bits not supported"); // Indirect calls (function pointers): redirect through the runtime // trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead, // we store the dynamic target to a fixed bank-0 slot ($00:00B8 — see // libgcc.s for why) and JSL the trampoline, which does // `JMP ($00B8)`. JMP (abs) reads its vector from bank 0 unconditionally, // so anchoring the slot in bank 0 makes the dispatch work under GS/OS // Loader / GNO non-bank-0 placement (where the program's BSS would // otherwise live in PBR — the JMP couldn't reach it). Single-bank // assumption remains on the *target's* code (JMP indirect keeps PBR). bool IsIndirect = !isa(Callee) && !isa(Callee); if (IsIndirect) { // Emit a constant-address store: tblgen pattern (store Acc16, // (iPTR imm:$addr)) -> STA_Long $0000B8 (4-byte abs-long, bank // explicit, ignores DBR). SDValue ConstAddr = DAG.getConstant(0xB8, DL, getPointerTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Callee, ConstAddr, MachinePointerInfo()); // Replace the callee with __jsl_indir for the actual JSL. Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16); } for (const ISD::OutputArg &O : Outs) { if (O.VT != MVT::i16 && O.VT != MVT::i8 && O.VT != MVT::i32) report_fatal_error("W65816: argument type not yet supported"); } // i32 first-arg ABI. Two flavors: // - Legal-i32: Outs[0].VT == i32 (whole pair). 
Pass in AX32. // - Split-i32 (legacy): Outs[0]/Outs[1] both i16 with OrigArgIndex==0. // Pass low in A, high in X. bool I32WholeFirstArg = !Outs.empty() && Outs[0].VT == MVT::i32; bool I32SplitFirstArg = Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 && Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0; unsigned FirstStackArg = I32WholeFirstArg ? 1 : I32SplitFirstArg ? 2 : 1; // i8 stack args are promoted to i16 (2-byte slots) so the callee can // read them with a 16-bit M load — matches LowerFormalArguments and // CC_W65816's CCPromoteToType. i32 stack args occupy 4 bytes // (2 PUSH16s). unsigned StackBytes = 0; for (unsigned i = FirstStackArg; i < Outs.size(); ++i) StackBytes += (Outs[i].VT == MVT::i32) ? 4 : 2; Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL); // Push stack-passed args in reverse so arg FirstStackArg ends up at // the lowest post-JSL stack-relative offset (4,S). Each push uses A // by default; if the value being pushed is already a `CopyFromReg X` // (e.g. forwarding the i32-first-arg-in-A:X hi half), push directly // from X via PHX — saves the TXA + A-spill round-trip that would // otherwise be required. SDValue Glue; // Helper: push a single i16-sized value via PHA. 
auto pushI16 = [&](SDValue V) { bool ViaX = false; if (V.getOpcode() == ISD::CopyFromReg) { auto *RegN = dyn_cast(V.getOperand(1).getNode()); if (RegN) { Register R = RegN->getReg(); if (R.isPhysical() && R == W65816::X) { ViaX = true; } else if (R.isVirtual()) { MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); if (MRI.getRegClass(R) == &W65816::Idx16RegClass) { for (auto &LI : MRI.liveins()) if (LI.second == R && LI.first == W65816::X) { ViaX = true; break; } } } } } if (ViaX) { Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue); Glue = Chain.getValue(1); Chain = DAG.getNode(W65816ISD::PUSH_X, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue); } else { Chain = DAG.getCopyToReg(Chain, DL, W65816::A, V, Glue); Glue = Chain.getValue(1); Chain = DAG.getNode(W65816ISD::PUSH, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue); } Glue = Chain.getValue(1); }; for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) { SDValue V = OutVals[i]; if (Outs[i].VT == MVT::i32) { // Push i32 stack arg: hi half first (lands at higher address), // lo half second (lands at lower address = the slot the callee // reads as the start of the i32). SDValue Lo = extractWide32Lo(DAG, DL, V); SDValue Hi = extractWide32Hi(DAG, DL, V); pushI16(Hi); pushI16(Lo); continue; } if (Outs[i].VT == MVT::i8) V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V); pushI16(V); } // i32 first-arg. Whole (legal-i32): split into lo/hi and copy // to $a/$x separately — avoids AX32 in the MIR (see // W65816LowerWide32). Split-i32 (legacy 2-i16): hi in X first, // then lo in A below. 
if (I32WholeFirstArg) { SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]); SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]); Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue); Glue = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue); Glue = Chain.getValue(1); } else if (I32SplitFirstArg) { Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue); Glue = Chain.getValue(1); } // Arg 0 in A — only for non-whole-i32 first-arg. Whole-i32 // already copied to A/X above. if (!I32WholeFirstArg && !OutVals.empty()) { Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue); Glue = Chain.getValue(1); } // Callee target type must match iPTR (i16 in ptr16, i32 in ptr32). // The CALL SDNode's operand-type profile (SDT_W65816Call) is iPTR; // hardcoding MVT::i16 here mismatches under p:32:16. EVT CalleeVT = getPointerTy(DAG.getDataLayout()); if (auto *GA = dyn_cast(Callee)) Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, CalleeVT); else if (auto *ES = dyn_cast(Callee)) Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), CalleeVT); SmallVector CallOps = {Chain, Callee}; if (I32WholeFirstArg) { CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16)); CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16)); } else if (!OutVals.empty()) { CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT)); if (I32SplitFirstArg) CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT)); } if (Glue.getNode()) CallOps.push_back(Glue); Chain = DAG.getNode(W65816ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), CallOps); Glue = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL); Glue = Chain.getValue(1); // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in // AX32 (whole) or split A/X (legacy), and 4-half (i64 / 2x i32) in // A, X, Y, DPF0. i32 Ins are read as a single i32 from the half // pair (A:X for the first, Y:DPF0 for a second-pair-of-halves). 
// Whole-i32 single return: read lo from $a, hi from $x. Avoids // using AX32 in the SDAG / MIR — see W65816LowerWide32 pass. if (Ins.size() == 1 && Ins[0].VT == MVT::i32) { SDValue Lo = DAG.getCopyFromReg(Chain, DL, W65816::A, MVT::i16, Glue); Chain = Lo.getValue(1); Glue = Lo.getValue(2); SDValue Hi = DAG.getCopyFromReg(Chain, DL, W65816::X, MVT::i16, Glue); Chain = Hi.getValue(1); Glue = Hi.getValue(2); InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); return Chain; } // Build a flat list of i16 halves expected from the call. Then // walk it, copying from A, X, Y, DPF0 in order. Re-assemble i32 // halves into a Wide32 SDValue at the end. SmallVector ExpVT; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { MVT VT = Ins[i].VT; if (VT == MVT::i32) { ExpVT.push_back(MVT::i16); ExpVT.push_back(MVT::i16); } else if (VT == MVT::i16 || VT == MVT::i8) { ExpVT.push_back(VT); } else { report_fatal_error("W65816: return half must be i8/i16/i32"); } } if (ExpVT.size() > 4) report_fatal_error("W65816: return type wider than 64 bits not supported"); static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y, W65816::DPF0}; SmallVector Halves; for (unsigned i = 0; i != ExpVT.size(); ++i) { SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], ExpVT[i], Glue); Chain = V.getValue(1); Glue = V.getValue(2); Halves.push_back(V); } // Re-pack halves into the original Ins shape (i32s rebuild via // REG_SEQUENCE; i8/i16 pass through). unsigned hi = 0; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (Ins[i].VT == MVT::i32) { InVals.push_back(buildWide32(DAG, DL, Halves[hi], Halves[hi + 1])); hi += 2; } else { InVals.push_back(Halves[hi]); hi += 1; } } return Chain; } SDValue W65816TargetLowering::LowerReturn( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { // Return ABI: // i8/i16: value in A. // i32: low half (Outs[0]) in A, high half (Outs[1]) in X. 
// i64: halves in A, X, Y, and a fixed direct-page slot at $F0..$F1 // (Outs[0..2] -> A,X,Y; Outs[3] stored to the DP slot). // wider: not yet supported. // Type legalization splits an i32 into 2 consecutive i16 Outs and an // i64 into 4. Emission order matters: we copy the *highest* halves // first so that the regalloc can place each through A (the only // ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves // A, so subsequent low-half copies to A don't clobber. // With i32 legal, an Outs entry may be MVT::i32; we expand each i32 // into its two i16 halves (sub_lo/sub_hi via EXTRACT_SUBREG) so the // legacy A/X/Y/DPF0 4-half return ABI continues to work for the // multi-half return cases (i64 returned as 2 i32, struct of 2 long // returned as 2 i32, etc.). SmallVector ExpVT; SmallVector ExpVals; for (unsigned i = 0; i != Outs.size(); ++i) { MVT VT = Outs[i].VT; if (VT == MVT::i32) { ExpVT.push_back(MVT::i16); ExpVT.push_back(MVT::i16); ExpVals.push_back(extractWide32Lo(DAG, DL, OutVals[i])); ExpVals.push_back(extractWide32Hi(DAG, DL, OutVals[i])); } else if (VT == MVT::i16 || VT == MVT::i8) { ExpVT.push_back(VT); ExpVals.push_back(OutVals[i]); } else { report_fatal_error("W65816: return half must be i8/i16/i32"); } } if (ExpVT.size() > 4) report_fatal_error("W65816: return type wider than 64 bits not supported"); // Single whole-i32 return: copy directly to AX32 instead of two // halves to A and X. Saves the regalloc/coalescer some work. bool I32WholeReturn = (Outs.size() == 1 && Outs[0].VT == MVT::i32); SDValue Glue; SmallVector RetOps(1, Chain); if (I32WholeReturn) { // Split the i32 OutVal into lo/hi and copy each separately to // $a / $x (no AX32 in the SDAG — see W65816LowerWide32). 
SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]); SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]); Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue); Glue = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(W65816::A, MVT::i16)); RetOps.push_back(DAG.getRegister(W65816::X, MVT::i16)); RetOps[0] = Chain; if (Glue.getNode()) RetOps.push_back(Glue); return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps); } // Outs[3] -> DP $F0 via CopyToReg(DPF0). Using the DPF0 fake physreg // (lowered to `STA $F0` by copyPhysReg) is critical: a generic // ISD::STORE with addr=0xF0 lowered to `sta (d,s),y`, an indirect // through the DBR, which silently misbehaved when DBR != 0. STA dp // uses D + dp directly and is unaffected by DBR. Done first so its // computation can use A freely before A holds the low result. Glued // to RET_GLUE via the RetOps Register entry below so DCE doesn't // strip the COPY. // Use the expanded i16-half list (i32 outs split into 2 i16 halves). 
if (ExpVals.size() >= 4) { Chain = DAG.getCopyToReg(Chain, DL, W65816::DPF0, ExpVals[3], Glue); Glue = Chain.getValue(1); } if (ExpVals.size() >= 3) { Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, ExpVals[2], Glue); Glue = Chain.getValue(1); } if (ExpVals.size() >= 2) { Chain = DAG.getCopyToReg(Chain, DL, W65816::X, ExpVals[1], Glue); Glue = Chain.getValue(1); } if (!ExpVals.empty()) { Chain = DAG.getCopyToReg(Chain, DL, W65816::A, ExpVals[0], Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(W65816::A, ExpVT[0])); } if (ExpVals.size() >= 2) RetOps.push_back(DAG.getRegister(W65816::X, ExpVT[1])); if (ExpVals.size() >= 3) RetOps.push_back(DAG.getRegister(W65816::Y, ExpVT[2])); if (ExpVals.size() >= 4) RetOps.push_back(DAG.getRegister(W65816::DPF0, ExpVT[3])); RetOps[0] = Chain; if (Glue.getNode()) RetOps.push_back(Glue); return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps); } SDValue W65816TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { // (shl i32 X, K) -> chain of K (add x, x) for small K. After type // legalisation the i32 add splits via ADDC/ADDE pseudos which expand // to native ASL/ROL + carry-chain — much cheaper than the type- // legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick // to compute the bit crossing the half boundary. Each ADD expands to // ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for // K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2. // `x*N` (which the combiner canonicalises pow-of-2 muls to `x< ADD chain for small K — but only when i32 is // ILLEGAL (i.e., gets type-split into i16 halves). When i32 is a // legal type (Wide32 reg class for ptr32 mode), the rewrite cycles // against LLVM's generic `(add x, x) -> (shl x, 1)` combine in the // i64 → 2 i32 split path, hanging the legalizer. // STORE / LOAD with ConstantSDNode ptr (e.g. 
`*(volatile uint8*)0xC035 = v`): // wrap the immediate in a W65816ISD::WRAPPER (using a TargetGlobalAddress- // like marker would be cleaner but we lack the symbol table). Re-issue // the store/load with the same ptr but the constant marked TargetConstant // — TargetConstant is opaque to LowerI32Constant, so it survives intact // to ISel, where the existing tablegen pattern // `(store Acc8, (iPTR imm)) -> STA8long` // matches (`imm` accepts both Constant and TargetConstant). 4 B / 6 cyc // bank-explicit `sta long` instead of 16 B / 30 cyc [dp],Y. // Wide32-of-Wrapper-with-zero-hi → i16 Wrapper. Under p:32:16, // LowerGlobalAddress builds GlobalAddress as a Wide32 reg pair // `(REG_SEQUENCE Wrapper(off_i16), 0_i16)`. Stores/loads against // this Wide32 ptr fall to the heavy [dp],Y path (16 B / 30 cyc) // even when the bank half is the constant 0 — we want the cheap // DBR-relative `sta g` / `lda g` (3 B / 5 cyc). Detect the shape // and recombine the ptr to its 16-bit form so the existing // tablegen `(store v, (Wrapper tglob))` → STAabs / `(load (Wrapper // tglob))` → LDAabs patterns fire. Crucially, this is correct // ONLY when bank=0 — under GS/OS Loader, DBR is set to our bank // by crt0Gsos, so DBR-relative addressing reaches the same global. // Returns either an i16 Wrapper (drop into i16 STAabs/LDAabs pattern) // or a TargetConstant:i32 (for const-addr i16 stores so the timm // pattern fires and produces STAabs). TargetConstant — not regular // Constant — because LowerI32Constant only matches ISD::Constant; if // we returned a fresh ConstantSDNode it would re-fire LowerI32Constant // and produce another Wide32 REG_SEQUENCE → infinite combine loop. 
auto unwrapWide32WithZeroHi = [&](SDValue Ptr) -> SDValue { if (Ptr.getValueType() != MVT::i32) return SDValue(); if (!Ptr.getNode()->isMachineOpcode()) return SDValue(); if (Ptr.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue(); SDValue Lo, Hi; for (unsigned i = 1; i + 1 < Ptr.getNumOperands(); i += 2) { auto *CIdx = dyn_cast(Ptr.getOperand(i + 1)); if (!CIdx) continue; if (CIdx->getZExtValue() == llvm::sub_lo) Lo = Ptr.getOperand(i); else if (CIdx->getZExtValue() == llvm::sub_hi) Hi = Ptr.getOperand(i); } if (!Lo || !Hi) return SDValue(); auto *HiC = dyn_cast(Hi); if (!HiC || HiC->getZExtValue() != 0) return SDValue(); if (Lo.getOpcode() == W65816ISD::Wrapper) return Lo; if (auto *LoC = dyn_cast(Lo)) { // Recombine into a TargetConstant:i32 so the `(store v, (iPTR // timm))` STAabs pattern fires. Returning an i16 Constant // would create a malformed STORE node (Ptr type mismatch) and // returning a regular Constant:i32 would re-trigger // LowerI32Constant. return DCI.DAG.getTargetConstant(LoC->getZExtValue(), SDLoc(Ptr), MVT::i32); } return SDValue(); }; if (N->getOpcode() == ISD::STORE) { auto *St = cast(N); EVT MemVT = St->getMemoryVT(); SDValue Ptr = St->getBasePtr(); // Skip i32 stores — LowerStore's i32 path has its own Wide32-zero-hi // const-addr fast path that emits two i16 stores at separate // TargetConstant addrs. Unwrapping here would short-circuit that // and produce a malformed ADD(TargetConstant, Constant) when the // hi-half store needs Ptr+2. if (MemVT != MVT::i32) { if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); return DAG.getTruncStore(St->getChain(), DL, St->getValue(), I16Ptr, MemVT, St->getMemOperand()); } // Global+i16-idx fast path for STORES (companion to the LOAD // branch below). Ptr = REG_SEQUENCE(ADDC(Wrapper, idx), ADDE(...)). // Rewrite to CopyToReg($a, val) + CopyToReg($x, idx) + STA_AbsX. 
if ((MemVT == MVT::i16 || MemVT == MVT::i8) && Ptr.getNode() && Ptr.isMachineOpcode() && Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo); if (Lo && Lo.getOpcode() == ISD::ADDC) { auto lookThroughExtractSubLo = [](SDValue V) -> SDValue { if (V.getNode() && V.isMachineOpcode() && V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) { SDValue Src = V.getOperand(0); if (Src.isMachineOpcode() && Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo)) return X; } } return V; }; SDValue A = lookThroughExtractSubLo(Lo.getOperand(0)); SDValue B = lookThroughExtractSubLo(Lo.getOperand(1)); auto isWrapperGlobal = [](SDValue V) { if (V.getOpcode() != W65816ISD::Wrapper) return false; unsigned Op = V.getOperand(0).getOpcode(); return Op == ISD::TargetGlobalAddress || Op == ISD::TargetExternalSymbol; }; SDValue Sym, Idx; if (isWrapperGlobal(A)) { Sym = A.getOperand(0); Idx = B; } else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; } if (Sym && Idx.getValueType() == MVT::i16) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue Chain = St->getChain(); SDValue Val = St->getValue(); // STA8absX copies $a register at i16 width (M=0); the SEP // wraps narrow it. Promote i8 stored value to i16. if (Val.getValueType() == MVT::i8) Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val); SDValue Glue; SDValue C1 = DAG.getCopyToReg(Chain, DL, W65816::X, Idx, Glue); Glue = C1.getValue(1); SDValue C2 = DAG.getCopyToReg(C1, DL, W65816::A, Val, Glue); Glue = C2.getValue(1); SDVTList StaVTs = DAG.getVTList(MVT::Other, MVT::Glue); unsigned Opc = (MemVT == MVT::i8) ? W65816::STA8absX : W65816::STA_AbsX; SDNode *Sta = DAG.getMachineNode(Opc, DL, StaVTs, {Sym, C2, Glue}); return SDValue(Sta, 0); } } } } // i8 const-addr → STA8long (timm pattern); i16 const-addr → // STAabs (timm pattern, DBR-relative). 
Wrap as TargetConstant so // LowerI32Constant doesn't re-enter and break the const-pattern // match. i32 stores split into 2 i16 stores via LowerStore so they // come back through this combine as MemVT==i16. if (MemVT != MVT::i8 && MemVT != MVT::i16) return SDValue(); if (auto *C = dyn_cast(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL, Ptr.getValueType()); return DAG.getTruncStore(St->getChain(), DL, St->getValue(), NewPtr, MemVT, St->getMemOperand()); } } if (N->getOpcode() == ISD::LOAD) { auto *Ld = cast(N); EVT MemVT = Ld->getMemoryVT(); EVT VT = Ld->getValueType(0); SDValue Ptr = Ld->getBasePtr(); // Wide32-of-Wrapper-with-zero-hi → i16 Wrapper (companion to the // STORE side just above). Lets `(load (Wrapper g))` → LDAabs fire. // Skip i32 loads — LowerLoad's i32 path does its own Ptr+2 ADD // arithmetic and would choke on a TargetConstant unwrap result. if (MemVT != MVT::i32) { if (SDValue I16Ptr = unwrapWide32WithZeroHi(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); return DAG.getExtLoad(Ld->getExtensionType(), DL, VT, Ld->getChain(), I16Ptr, MemVT, Ld->getMemOperand()); } // Global+i16-idx fast path: Ptr is REG_SEQUENCE produced by // LowerI32Bin from `(add (Wrapper sym) (zext i16 idx))`. // sub_lo = ADDC(Wrapper, idx) — operands are TargetExtractSubreg // wrapping each side's Wide32 // sub_hi = ADDE(0, 0, carry) — ignored (idx fits in 16 bits, // so any carry stays in bank) // Rewrite the LOAD to a CopyToReg($x, idx) + LDA_AbsX(sym) // sequence. Saves ~45 bytes / ~70 cyc vs the 24-bit [dp],Y deref. // Correct under the data-bank invariant (DBR = global's bank). 
if ((MemVT == MVT::i16 || MemVT == MVT::i8) && Ptr.getNode() && Ptr.isMachineOpcode() && Ptr.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { SDValue Lo = lookThroughRegSeq(Ptr, llvm::sub_lo); if (Lo && Lo.getOpcode() == ISD::ADDC) { auto lookThroughExtractSubLo = [](SDValue V) -> SDValue { if (V.getNode() && V.isMachineOpcode() && V.getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG) { SDValue Src = V.getOperand(0); if (Src.isMachineOpcode() && Src.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) { if (SDValue X = lookThroughRegSeq(Src, llvm::sub_lo)) return X; } } return V; }; SDValue A = lookThroughExtractSubLo(Lo.getOperand(0)); SDValue B = lookThroughExtractSubLo(Lo.getOperand(1)); auto isWrapperGlobal = [](SDValue V) { if (V.getOpcode() != W65816ISD::Wrapper) return false; unsigned Op = V.getOperand(0).getOpcode(); return Op == ISD::TargetGlobalAddress || Op == ISD::TargetExternalSymbol; }; SDValue Sym, Idx; if (isWrapperGlobal(A)) { Sym = A.getOperand(0); Idx = B; } else if (isWrapperGlobal(B)) { Sym = B.getOperand(0); Idx = A; } if (Sym && Idx.getValueType() == MVT::i16) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue Chain = Ld->getChain(); SDValue Glue; SDValue NewChain = DAG.getCopyToReg(Chain, DL, W65816::X, Idx, Glue); Glue = NewChain.getValue(1); SDVTList LdaVTs = DAG.getVTList(MVT::Other, MVT::Glue); unsigned Opc = (MemVT == MVT::i8) ? W65816::LDA8absX : W65816::LDA_AbsX; SDNode *Lda = DAG.getMachineNode(Opc, DL, LdaVTs, {Sym, NewChain, Glue}); SDValue LdaChain = SDValue(Lda, 0); SDValue LdaGlue = SDValue(Lda, 1); // Read A as the original LOAD's result VT directly. For // i8 LOAD with i8 VT: read i8. For i8 LOAD with i16 VT // (zext/sext): read i16 (high byte is whatever was in $a // before — wrong for zext, fine for sext, depends on the // M=8 LDA behavior). M=8 LDA only writes the low byte of // $a, leaving the high byte intact. 
Safe wrt liveness // because we're reading $a immediately after SEP/REP // around the load, but the high byte is now whatever // pre-LDA value $a held — for zext we must mask it. SDValue Val = DAG.getCopyFromReg(LdaChain, DL, W65816::A, VT, LdaGlue); SDValue Chain2 = Val.getValue(1); if (MemVT == MVT::i8 && VT == MVT::i16) { if (Ld->getExtensionType() == ISD::ZEXTLOAD) { Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val, DAG.getConstant(0xFF, DL, MVT::i16)); } else if (Ld->getExtensionType() == ISD::SEXTLOAD) { Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val, DAG.getValueType(MVT::i8)); } // EXTLOAD: high byte don't-care, leave alone. } return DAG.getMergeValues({Val, Chain2}, DL); } } } } // Only the i8 const-addr path has dedicated tablegen patterns // (LDA8long); skip i16 const-addr loads (no LDAabs imm pattern) // and i32 (would re-fire on the same node with different shape). if (MemVT != MVT::i8 || (VT != MVT::i8 && VT != MVT::i16)) return SDValue(); if (auto *C = dyn_cast(Ptr)) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue NewPtr = DAG.getTargetConstant(C->getZExtValue(), DL, Ptr.getValueType()); return DAG.getExtLoad(Ld->getExtensionType(), DL, VT, Ld->getChain(), NewPtr, MemVT, Ld->getMemOperand()); } } if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 && !isTypeLegal(N->getValueType(0))) { if (auto *C = dyn_cast(N->getOperand(1))) { uint64_t K = C->getZExtValue(); if (K >= 1 && K <= 2) { SelectionDAG &DAG = DCI.DAG; SDValue X = N->getOperand(0); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue R = X; for (uint64_t i = 0; i < K; ++i) R = DAG.getNode(ISD::ADD, DL, VT, R, R); return R; } } } return SDValue(); } // Custom-lowering for ISD::MUL i32. When both operands are ZEXT from // i16 (or provably have high 16 bits = 0), emit a libcall to // __umulhisi3 (16x16 -> 32) instead of the heavier __mulsi3 (32x32 -> // 32). 
// Saves the 32-bit arg marshaling AND the 32-bit accumulator
// math inside the libcall — roughly equivalent to Calypsi 5.16's
// `_Mul16`. Falls through to the standard __mulsi3 libcall otherwise.
//
// Lowering order:
//   1. constant RHS of the form 2^N +/- 1 (N in 1..5) -> shift + add/sub;
//   2. both operands narrowable to i16               -> __umulhisi3;
//   3. anything else                                 -> __mulsi3.
SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  assert(Op.getValueType() == MVT::i32 && "LowerMUL_I32 expects i32");
  SDValue Lhs = Op.getOperand(0);
  SDValue Rhs = Op.getOperand(1);

  // Returns an i16 value whose zero-extension may stand in for V, or a
  // null SDValue if V cannot be narrowed.
  auto narrowToI16 = [&](SDValue V) -> SDValue {
    // Explicit zext-from-i16 (the IR-level form, before SDAG flattening),
    // or any-extend-from-i16. ANY_EXTEND leaves the high 16 bits
    // unspecified, so choosing zero for them (i.e. treating it like a
    // zext) is a legal refinement.
    const unsigned Opc = V.getOpcode();
    if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
        V.getOperand(0).getValueType() == MVT::i16)
      return V.getOperand(0);
    // High 16 bits provably zero?
    KnownBits K = DAG.computeKnownBits(V);
    if (K.countMinLeadingZeros() >= 16)
      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, V);
    return SDValue();
  };

  // Emits `Name(L, R)` as a C-convention libcall taking two ArgTy
  // arguments and returning i32; yields the call's result value.
  auto emitMulLibcall = [&](const char *Name, Type *ArgTy, SDValue L,
                            SDValue R) -> SDValue {
    TargetLowering::ArgListTy Args;
    Args.push_back({L, ArgTy});
    Args.push_back({R, ArgTy});
    SDValue Callee =
        DAG.getExternalSymbol(Name, getPointerTy(DAG.getDataLayout()));
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(DL)
        .setChain(DAG.getEntryNode())
        .setLibCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
                      Callee, std::move(Args));
    return LowerCallTo(CLI).first;
  };

  // Mul-by-constant strength reduction: (X * K) where K-1 or K+1 is
  // a small power of 2 (shift count 1..5, matching our inlined i32
  // SHL range) expands to (X << N) +/- X — saves a __mulsi3 libcall
  // (~250 cyc) for ~70 cyc of inlined shift+add. Catches djb2Hash's
  // `h * 33` = (h << 5) + h.
  //
  // Patterns covered:
  //   K = 2^N + 1 in {3,5,9,17,33}  → (X << N) + X
  //   K = 2^N - 1 in {7,15,31}      → (X << N) - X
  // (The loop would also accept K == 1 as (X << 1) - X; K == 3 is taken
  // by the 2^1 + 1 form before 2^2 - 1 is tried.)
  // Larger N hits the i32 SHL libcall path (no longer profitable).
  if (auto *CN = dyn_cast<ConstantSDNode>(Rhs)) {
    const int64_t K = CN->getSExtValue();
    for (unsigned N = 1; N <= 5; ++N) {
      const int64_t Pow = int64_t{1} << N;
      const bool IsPlusOne = (K == Pow + 1);
      if (!IsPlusOne && K != Pow - 1)
        continue;
      // Build the shift amount only for the N that actually matches.
      SDValue ShAmt = DAG.getConstant(N, DL, MVT::i16);
      SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Lhs, ShAmt);
      return DAG.getNode(IsPlusOne ? ISD::ADD : ISD::SUB, DL, MVT::i32, Shl,
                         Lhs);
    }
  }

  SDValue A = narrowToI16(Lhs);
  SDValue B = narrowToI16(Rhs);
  if (A && B)
    return emitMulLibcall("__umulhisi3", Type::getInt16Ty(*DAG.getContext()),
                          A, B);

  // Fall back to the standard __mulsi3 libcall.
  return emitMulLibcall("__mulsi3", Type::getInt32Ty(*DAG.getContext()), Lhs,
                        Rhs);
}

// Map a W65816CC code to the matching Bxx opcode.
static unsigned getBranchOpcodeForCC(unsigned CC) { switch (CC) { case W65816CC::COND_EQ: return W65816::BEQ; case W65816CC::COND_NE: return W65816::BNE; case W65816CC::COND_HS: return W65816::BCS; case W65816CC::COND_LO: return W65816::BCC; case W65816CC::COND_MI: return W65816::BMI; case W65816CC::COND_PL: return W65816::BPL; case W65816CC::COND_VS: return W65816::BVS; case W65816CC::COND_VC: return W65816::BVC; } llvm_unreachable("invalid W65816 condition code"); } // For multi-branch CCs, return the (branchA, branchB, BothMeanTrue) tuple. // branchA is tested first; if it takes, we go to TrueBB if BothMeanTrue // (i.e. both branches are "take if true"), otherwise to FalseBB. branchB // is tested next with the same semantic. // // GT : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB // LE : (BMI || BEQ) → BEQ TrueBB; BMI TrueBB; fall-through FalseBB // HI : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB // LS : (BCC || BEQ) → BEQ TrueBB; BCC TrueBB; fall-through FalseBB struct MultiBranch { unsigned First, Second; bool FirstToTrue, SecondToTrue; }; static MultiBranch getMultiBranch(unsigned CC) { switch (CC) { case W65816CC::COND_GT_MB: return {W65816::BEQ, W65816::BPL, false, true}; case W65816CC::COND_LE_MB: return {W65816::BEQ, W65816::BMI, true, true}; case W65816CC::COND_HI_MB: return {W65816::BEQ, W65816::BCS, false, true}; case W65816CC::COND_LS_MB: return {W65816::BEQ, W65816::BCC, true, true}; } llvm_unreachable("not a multi-branch CC"); } // Emit a two-Acc16 binary op as STAfi src2; OPfi dst, src1. Allocates // a fresh 2-byte stack slot per call. For CMP (HasOut=false) there's // no destination register, just the two src operands. Always spill // the SECOND operand so non-commutative ops (sub, cmp) compute // src1 OP src2 correctly via OPfi (which gives src1 OP load(spill)). 
static MachineBasicBlock *
emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp,
         unsigned OpFI, bool HasOut) {
  // Parameters:
  //   MI      - the register-register pseudo being expanded; erased on exit.
  //   BB      - block containing MI; returned unchanged (no CFG edits).
  //   StoreOp - opcode used to spill the right-hand operand to the slot.
  //   OpFI    - frame-index form of the operation, reading the slot.
  //   HasOut  - true when MI operand 0 is a destination register, so the
  //             sources are operands 1 and 2; false when the sources are
  //             operands 0 and 1 (compare-style, no result register).
  MachineFunction *MF = BB->getParent();
  const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  const unsigned FirstSrcIdx = HasOut ? 1 : 0;
  Register Lhs = MI.getOperand(FirstSrcIdx).getReg();
  Register Rhs = MI.getOperand(FirstSrcIdx + 1).getReg();

  // Fresh 2-byte scratch slot for the right-hand operand.
  int Slot = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                  /*isSpillSlot=*/true);

  // Spill rhs; the frame-index op then computes lhs OP load(slot).
  BuildMI(*BB, InsertPt, DL, TII.get(StoreOp))
      .addReg(Rhs)
      .addFrameIndex(Slot)
      .addImm(0);

  // The two shapes differ only in whether a destination register is
  // defined; the source operands are identical.
  MachineInstrBuilder OpMI =
      HasOut ? BuildMI(*BB, InsertPt, DL, TII.get(OpFI),
                       MI.getOperand(0).getReg())
             : BuildMI(*BB, InsertPt, DL, TII.get(OpFI));
  OpMI.addReg(Lhs).addFrameIndex(Slot).addImm(0);

  MI.eraseFromParent();
  return BB;
}

MachineBasicBlock *
W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                  MachineBasicBlock *BB) const {
  // The only opcode we currently emit with usesCustomInserter=1 is
  // SELECT_CC16. Expand it into a diamond CFG with a PHI. For
  // single-branch CCs:
  //
  //   thisMBB:
  //     ... CMP already emitted ...
  //     Bxx sinkMBB              ; branch to "true" path
  //     ; fall through to copy0MBB
  //   copy0MBB:
  //     ; (no instructions; PHI picks fval here)
  //   sinkMBB:
  //     dst = PHI [tval, thisMBB], [fval, copy0MBB]
  //
  // For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a
  // single Bxx isn't enough), insert two branches. Both target either
  // sinkMBB or copy0MBB depending on the condition.
switch (MI.getOpcode()) { default: llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter"); case W65816::ADD_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true); case W65816::SUB_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true); // Carry-chain variants for the hi half of an i32 split. STAfi doesn't // touch P, so the carry from the previous addc/adde survives the // spill and is consumed by ADCEfi/SBCEfi below. case W65816::ADDE_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true); case W65816::SUBE_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true); case W65816::AND_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true); case W65816::ORA_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true); case W65816::EOR_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true); case W65816::CMP_RR: return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false); case W65816::LDAptr32S: case W65816::STAptr32S: case W65816::STBptr32S: { // Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of // 1 Wide32 reg pair. Used by the W65816LowerWide32 pre-RA pass // to dodge pair-allocation pressure. Otherwise identical to // the LDAptr32 inserter below. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr32S; bool IsByteStore = MI.getOpcode() == W65816::STBptr32S; Register PtrLo = MI.getOperand(IsLoad ? 1 : 1).getReg(); Register PtrHi = MI.getOperand(IsLoad ? 
2 : 2).getReg(); int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrLo).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrHi).addFrameIndex(FIHi).addImm(0); // STA_DP's tablegen def has no implicit A Use, so without an // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP // pairs the fast regalloc collapses two A-loads into one (the // first's value is overwritten before STA_DP can store it). Add // implicit Use of A on the STA_DP to encode the dependency. This // also helps post-RA passes track A liveness correctly. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0) .addReg(W65816::A, RegState::Implicit); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FIHi).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2) .addReg(W65816::A, RegState::Implicit); if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); 
return BB;
  }
  case W65816::LDAptr32:
  case W65816::STAptr32:
  case W65816::STBptr32: {
    // Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
    // pointer is a Wide32 register pair: sub_lo carries the low 16
    // bits of the address, sub_hi carries the bank byte in its low
    // half (high half is pad, ORCA convention). Stage at $E0..$E2,
    // then [dp],Y addresses the right bank without forcing 0.
    //
    // MI-level peephole: if the Wide32 ptr is the sole user of a
    // `REG_SEQUENCE(ADCi16imm BaseLo K, sub_lo, ADCEi16imm BaseHi 0,
    // sub_hi)` chain (= `(add Wide32, K)` after ISel), peel the
    // offset and pass K via the Y register on the `[dp],Y` deref.
    // Saves ~3 instructions per access (the CLC/ADC/ADC carry chain).
    // The bank-wrap caveat from LDAptr32Off applies: Y addition does
    // NOT propagate beyond 16 bits, so the target object must not
    // span a bank boundary (true for malloc'd / globally-allocated
    // ptr32 objects; struct sizeof is far below 64KB).
    // NOTE(review): the LDAptrOff inserter's comment says the opposite
    // (that [dp],Y adds Y to the full 24-bit pointer). One of the two
    // comments is wrong -- confirm against the WDC datasheet.
    //
    // Doing this here rather than in LowerLoad / a SDAG combine avoids
    // the JSON-tokenizer + BST + sprintf smoke regressions those paths
    // tripped — the rewrites perturbed SDAG scheduling in ways that
    // bisection couldn't pin down. At MI level, the rewrite is
    // structural: ADCi16imm/ADCEi16imm become dead and get DCE'd.
    //
    // Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
    // on i32 address type).
    MachineFunction *MF = BB->getParent();
    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
    const W65816InstrInfo &TII = *STI.getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    DebugLoc DL = MI.getDebugLoc();
    bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
    bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
    // The pointer is operand 1 for loads and stores alike (operand 0 is
    // the loaded value / the value to store).
    Register Ptr = MI.getOperand(1).getReg();

    // Emit helpers: every instruction of the expansion is inserted
    // immediately before MI, with MI's debug location.
    auto emit = [&](unsigned Opc) {
      return BuildMI(*BB, MI.getIterator(), DL, TII.get(Opc));
    };
    auto emitDef = [&](unsigned Opc, Register Dst) {
      return BuildMI(*BB, MI.getIterator(), DL, TII.get(Opc), Dst);
    };

    // True if no instruction strictly between Lo (the ADC) and Hi (the
    // ADCE) has a live def of $p, i.e. the ADCE still reads the carry the
    // ADC produced. Both must be in the same block. Returns false (rather
    // than walking off the end of the block) if Hi does not follow Lo.
    auto carryChainIntact = [](MachineInstr *Lo, MachineInstr *Hi) -> bool {
      auto End = Lo->getParent()->instr_end();
      for (auto It = std::next(Lo->getIterator()); It != Hi->getIterator();
           ++It) {
        if (It == End)
          return false;
        for (const MachineOperand &MO : It->operands())
          if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef() &&
              !MO.isDead())
            return false;
      }
      return true;
    };

    // Try the ADC-chain peel. We need:
    //   0. MI is a load (stores are never peeled).
    //   1. Ptr has exactly one use (this MI) — else other users still
    //      need the full computed Wide32, no net win.
    //   2. Ptr was defined by a REG_SEQUENCE.
    //   3. Sub_lo source is ADCi16imm BaseLoReg KLo.
    //   4. Sub_hi source is ADCEi16imm BaseHiReg 0.
    //   5. KLo > 0 and KLo fits 16-bit unsigned.
    // NOTE(review): hasOneUse() also counts debug uses, unlike the
    // hasOneNonDBGUse() used by the SELECT_CC inserter, so the peel may
    // fire differently with and without debug info -- confirm intended.
    Register PeelBaseLo, PeelBaseHi;
    int64_t PeelOff = 0;
    MachineInstr *DeadLoDef = nullptr;
    MachineInstr *DeadHiDef = nullptr;
    MachineInstr *DeadPtrDef = nullptr;
    SmallVector<MachineInstr *> ExtraChainDeads;
    if (IsLoad && MRI.hasOneUse(Ptr)) {
      MachineInstr *PtrDef = MRI.getUniqueVRegDef(Ptr);
      if (PtrDef && PtrDef->getOpcode() == TargetOpcode::REG_SEQUENCE) {
        // REG_SEQUENCE operands come in (reg, subidx) pairs after the def.
        Register SubLoReg, SubHiReg;
        for (unsigned i = 1, e = PtrDef->getNumOperands(); i + 1 < e; i += 2) {
          unsigned SubIdx = PtrDef->getOperand(i + 1).getImm();
          Register R = PtrDef->getOperand(i).getReg();
          if (SubIdx == llvm::sub_lo)
            SubLoReg = R;
          else if (SubIdx == llvm::sub_hi)
            SubHiReg = R;
        }
        MachineInstr *LoDef =
            SubLoReg ? MRI.getUniqueVRegDef(SubLoReg) : nullptr;
        MachineInstr *HiDef =
            SubHiReg ? MRI.getUniqueVRegDef(SubHiReg) : nullptr;
        // We don't require SubLoReg/SubHiReg to be single-use: an
        // ADCi16imm result CSE'd across multiple users (e.g., `L+K`
        // also used as input to `(L+K)+M`) is fine — peeling THIS load
        // doesn't kill the original ADC chain (other users still need
        // it). We only erase the chain if it's all single-use end-to-end.
        bool OuterSingleUse =
            MRI.hasOneUse(SubLoReg) && MRI.hasOneUse(SubHiReg);
        if (LoDef && HiDef && LoDef->getOpcode() == W65816::ADCi16imm &&
            HiDef->getOpcode() == W65816::ADCEi16imm &&
            // ADCi16imm and ADCEi16imm must be in the same MBB so we
            // can verify nothing clobbers $p between them.
            LoDef->getParent() == HiDef->getParent()) {
          // If any instr between LoDef and HiDef defines $p, the ADCE
          // reads a tampered carry and our simple substitution would
          // change semantics.
          bool PChainOK = carryChainIntact(LoDef, HiDef);
          int64_t KLo = LoDef->getOperand(2).getImm();
          int64_t KHi = HiDef->getOperand(2).getImm();
          Register CandLo = LoDef->getOperand(1).getReg();
          Register CandHi = HiDef->getOperand(1).getReg();
          // Accept a vreg that's `COPY <physreg>` for any of the
          // arg/accumulator/index physregs. This catches both incoming
          // function args ($a/$x at entry) AND values that came from
          // a preceding load (where the result was COPYed off $a).
          auto isFromArgCopy = [&](Register R) -> bool {
            if (!R.isVirtual())
              return false;
            MachineInstr *Def = MRI.getUniqueVRegDef(R);
            if (!Def || !Def->isCopy())
              return false;
            const MachineOperand &Src = Def->getOperand(1);
            if (!Src.isReg() || !Src.getReg().isPhysical())
              return false;
            unsigned P = Src.getReg();
            return P == W65816::A || P == W65816::X || P == W65816::Y;
          };
          // A vreg is "from a fixed (caller-pushed) stack arg" if its
          // unique def is LDAfi against a fixed FrameIndex (negative
          // index in MachineFrameInfo). Caller-pushed args live in
          // immutable slots, so reading them later is value-equivalent
          // to reading them at function entry.
          auto isFromFixedArgSlot = [&](Register R) -> bool {
            if (!R.isVirtual())
              return false;
            MachineInstr *Def = MRI.getUniqueVRegDef(R);
            if (!Def || Def->getOpcode() != W65816::LDAfi)
              return false;
            const MachineOperand &FIOp = Def->getOperand(1);
            if (!FIOp.isFI())
              return false;
            int FI = FIOp.getIndex();
            const MachineFrameInfo &MFI = MF->getFrameInfo();
            return MFI.isFixedObjectIndex(FI);
          };
          // An arg base is either of the above, directly or through one
          // intervening vreg-to-vreg COPY.
          auto isFromArg = [&](Register R) -> bool {
            if (isFromArgCopy(R))
              return true;
            if (isFromFixedArgSlot(R))
              return true;
            if (!R.isVirtual())
              return false;
            MachineInstr *Def = MRI.getUniqueVRegDef(R);
            if (!Def || !Def->isCopy())
              return false;
            const MachineOperand &Src = Def->getOperand(1);
            if (!Src.isReg() || !Src.getReg().isVirtual())
              return false;
            return isFromArgCopy(Src.getReg()) ||
                   isFromFixedArgSlot(Src.getReg());
          };
          // Recursive walk: nested ADC chains arise from i32-LOAD split
          // (high half loads at `Ptr+2`, where `Ptr` is itself `arg+K`).
          // Walk back, accumulating offset, until we reach an arg-base
          // OR exhaust the chain.
          //
          // We allow inner ADC results to have multiple users — this
          // happens when the SDAG CSEs `L+K` and reuses it as input to
          // `(L+K)+M`. In that case, peeling THIS load doesn't kill
          // the inner ADC chain (other users still need it), so we
          // don't erase those inner MIs. Only the outer-most chain
          // (single-use) and PtrDef are erased.
          //
          // Bisecting: try peeling whenever the chain reaches a
          // "stable" base — args, fixed-arg-slot loads, OR any vreg
          // (widest). Wider gates have historically tripped a
          // FrameLowering-related smoke regression in sprintf.
          int64_t Off = KLo;
          bool ChainOK = (PChainOK && KHi == 0 && KLo > 0 && KLo <= 0xFFFF);
          // Cap on chain walks (avoid pathological deep chains).
          unsigned MaxChainDepth = 8;
          // Track per-layer "all single-use" status — only erase layers
          // up to the first non-single-use one.
          unsigned SingleUseLayers = OuterSingleUse ? 1 : 0;
          SmallVector<MachineInstr *> ChainDeads;
          if (OuterSingleUse) {
            ChainDeads.push_back(LoDef);
            ChainDeads.push_back(HiDef);
          }
          // Narrow gate: walk back only until we reach an arg-base or
          // arg-slot base. A truly wide gate (peel any chain regardless
          // of base) makes Lua ~+0.85% LARGER because each peel adds 4B
          // of stack-slot staging that exceeds the carry-chain savings
          // for deep-chain cases. Tested 2026-05-25.
          while (ChainOK && MaxChainDepth-- > 0 &&
                 (!isFromArg(CandLo) || !isFromArg(CandHi))) {
            if (!CandLo.isVirtual() || !CandHi.isVirtual()) {
              ChainOK = false;
              break;
            }
            MachineInstr *InnerLo = MRI.getUniqueVRegDef(CandLo);
            MachineInstr *InnerHi = MRI.getUniqueVRegDef(CandHi);
            if (!InnerLo || !InnerHi ||
                InnerLo->getOpcode() != W65816::ADCi16imm ||
                InnerHi->getOpcode() != W65816::ADCEi16imm ||
                InnerLo->getParent() != InnerHi->getParent()) {
              ChainOK = false;
              break;
            }
            bool InnerSingleUse =
                MRI.hasOneUse(CandLo) && MRI.hasOneUse(CandHi);
            if (!carryChainIntact(InnerLo, InnerHi)) {
              ChainOK = false;
              break;
            }
            int64_t InnerKLo = InnerLo->getOperand(2).getImm();
            int64_t InnerKHi = InnerHi->getOperand(2).getImm();
            // Reject a negative inner immediate for the same reason the
            // outer layer requires KLo > 0: summing it as a signed value
            // would not match what the 16-bit ADC + carry computed, and
            // the final `Off > 0` check alone cannot catch that once it
            // is folded into a positive total.
            if (InnerKHi != 0 || InnerKLo < 0) {
              ChainOK = false;
              break;
            }
            int64_t NewOff = Off + InnerKLo;
            if (NewOff > 0xFFFF) {
              ChainOK = false;
              break;
            }
            Off = NewOff;
            CandLo = InnerLo->getOperand(1).getReg();
            CandHi = InnerHi->getOperand(1).getReg();
            // Track whether this inner layer is erasable (all-single-use
            // from outer through here).
            if (InnerSingleUse && SingleUseLayers == ChainDeads.size() / 2) {
              SingleUseLayers++;
              ChainDeads.push_back(InnerLo);
              ChainDeads.push_back(InnerHi);
            }
            // Even if not single-use, we keep walking back — the peel
            // is still correct (just doesn't kill the inner chain).
          }
          if (ChainOK && Off > 0 && Off <= 0xFFFF && isFromArg(CandLo) &&
              isFromArg(CandHi)) {
            PeelBaseLo = CandLo;
            PeelBaseHi = CandHi;
            PeelOff = Off;
            DeadPtrDef = PtrDef;
            // Only erase the ADC chain if it's all-single-use end to
            // end. Otherwise leave it alive — other users need it.
            if (OuterSingleUse) {
              DeadLoDef = LoDef;
              DeadHiDef = HiDef;
              // Entries 0/1 are LoDef/HiDef, handled above.
              for (unsigned i = 2; i < ChainDeads.size(); ++i)
                ExtraChainDeads.push_back(ChainDeads[i]);
            }
          }
        }
      }
    }

    // Layer 2 fast path: -w65816-dbr-safe-ptrs assumes the bank byte
    // matches DBR, letting us skip $E0/$E2 staging entirely. Emit just
    // a STAfi of sub_lo and an LDAfi_indY/STAfi_indY deref via the
    // 16-bit stack-rel-indirect-Y opcode (0xB3 / 0x93). ~4 instr per
    // deref saved vs the heavy [dp],Y indirect-long path.
    if (DbrSafePtrs) {
      Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
      if (PeelOff) {
        emitDef(TargetOpcode::COPY, PtrLo).addReg(PeelBaseLo);
      } else {
        emitDef(TargetOpcode::COPY, PtrLo)
            .addReg(Ptr, (RegState)0, llvm::sub_lo);
      }
      int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                      /*isSpillSlot=*/false);
      emit(W65816::STAfi).addReg(PtrLo).addFrameIndex(FILo).addImm(0);
      // Y carries the peeled offset (0 when nothing was peeled).
      emit(W65816::LDY_Imm16).addImm(PeelOff);
      if (IsLoad) {
        Register Dst = MI.getOperand(0).getReg();
        // LDAfi_indY $dst, FILo — PEI resolves to LDA (FILo,S),Y.
        emitDef(W65816::LDAfi_indY, W65816::A).addFrameIndex(FILo).addImm(0);
        emitDef(TargetOpcode::COPY, Dst).addReg(W65816::A);
      } else {
        Register Val = MI.getOperand(0).getReg();
        emitDef(TargetOpcode::COPY, W65816::A).addReg(Val);
        // Byte stores bracket the STA with SEP/REP #$20.
        if (IsByteStore)
          emit(W65816::SEP).addImm(0x20);
        emit(W65816::STAfi_indY)
            .addReg(W65816::A)
            .addFrameIndex(FILo)
            .addImm(0);
        if (IsByteStore)
          emit(W65816::REP).addImm(0x20);
      }
      MI.eraseFromParent();
      if (DeadPtrDef)
        DeadPtrDef->eraseFromParent();
      if (DeadLoDef)
        DeadLoDef->eraseFromParent();
      if (DeadHiDef)
        DeadHiDef->eraseFromParent();
      for (MachineInstr *D : ExtraChainDeads)
        D->eraseFromParent();
      return BB;
    }

    // Extract the i16 sub-halves of the Wide32 ptr. At custom-inserter
    // time Ptr is still a virtual register, so `TRI.getSubReg` won't
    // work (it's physreg-only). Use COPY-with-subreg-index instead;
    // the regalloc + virtreg-rewriter resolves this to the right
    // physreg operand later.
    Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    if (PeelOff) {
      // Peeled path: pull base halves from the ADC chain's inputs.
      emitDef(TargetOpcode::COPY, PtrLo).addReg(PeelBaseLo);
      emitDef(TargetOpcode::COPY, PtrHi).addReg(PeelBaseHi);
    } else {
      emitDef(TargetOpcode::COPY, PtrLo).addReg(Ptr, (RegState)0, llvm::sub_lo);
      emitDef(TargetOpcode::COPY, PtrHi).addReg(Ptr, (RegState)0, llvm::sub_hi);
    }
    // Spill each half to a fresh slot, reload via LDAfi. Same RA-
    // pinning rationale as the i16 LDAptr inserter.
    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
                                                    /*isSpillSlot=*/false);
    emit(W65816::STAfi).addReg(PtrLo).addFrameIndex(FILo).addImm(0);
    emit(W65816::STAfi).addReg(PtrHi).addFrameIndex(FIHi).addImm(0);

    // Change 3: $E0/$E2 staging CSE. Look backward in this MBB for
    // the previous ptr32-deref expansion. If its base halves match
    // ours (same vreg source) and nothing between has clobbered
    // $E0/$E2/$Y or the staged values, skip the LDAfi+STA_DP pairs
    // and reuse the previously-staged $E0..$E2.
    //
    // Inserter pattern signature (from below, latest-emitted first):
    //   STA_DP $E2 (impl A)
    //   LDAfi <FIHi'> -> A
    //   STA_DP $E0 (impl A)
    //   LDAfi <FILo'> -> A
    //   STAfi <srcHi'>, FIHi', 0   <- prior PtrHi
    //   STAfi <srcLo'>, FILo', 0   <- prior PtrLo
    bool ReuseStaging = false;
    {
      auto It = MI.getIterator();
      MachineInstr *PrevStaE2 = nullptr;
      MachineInstr *PrevLdaHi = nullptr;
      MachineInstr *PrevStaE0 = nullptr;
      MachineInstr *PrevLdaLo = nullptr;
      MachineInstr *PrevStaHi = nullptr;
      MachineInstr *PrevStaLo = nullptr;
      auto clobbersE0E2 = [&](MachineInstr &PrevMI) -> bool {
        // Any call clobbers everything in DP — including $E0..$E3.
        if (PrevMI.isCall())
          return true;
        switch (PrevMI.getOpcode()) {
        // FrameLowering's long-indirect expansion of these uses $E2
        // as A-stash scratch (see W65816RegisterInfo.cpp).
        case W65816::ADCfi:
        case W65816::ADCEfi:
        case W65816::ANDfi:
        case W65816::ORAfi:
        case W65816::EORfi:
        case W65816::SBCfi:
        case W65816::SBCEfi:
        case W65816::CMPfi:
          return true;
        case W65816::STA_DP:
        case W65816::STZ_DP:
          if (PrevMI.getOperand(0).isImm()) {
            int64_t Imm = PrevMI.getOperand(0).getImm();
            if (Imm == 0xE0 || Imm == 0xE1 || Imm == 0xE2 || Imm == 0xE3)
              return true;
          }
          break;
        }
        return false;
      };
      // Scan back, fail-soft. The scan starts at MI, so it first steps
      // over the COPY/STAfi instructions emitted just above.
      const unsigned MaxScan = 60;
      unsigned Scanned = 0;
      while (It != BB->begin() && Scanned++ < MaxScan) {
        --It;
        MachineInstr &P = *It;
        if (!PrevStaE2) {
          if (P.getOpcode() == W65816::STA_DP && P.getOperand(0).isImm() &&
              P.getOperand(0).getImm() == 0xE2) {
            PrevStaE2 = &P;
            continue;
          }
          if (clobbersE0E2(P))
            break;
          continue;
        }
        // After PrevStaE2, expect LDAfi <FIHi'>.
        if (!PrevLdaHi) {
          if (P.getOpcode() == W65816::LDAfi) {
            PrevLdaHi = &P;
            continue;
          }
          break;
        }
        if (!PrevStaE0) {
          if (P.getOpcode() == W65816::STA_DP && P.getOperand(0).isImm() &&
              P.getOperand(0).getImm() == 0xE0) {
            PrevStaE0 = &P;
            continue;
          }
          break;
        }
        if (!PrevLdaLo) {
          if (P.getOpcode() == W65816::LDAfi) {
            PrevLdaLo = &P;
            continue;
          }
          break;
        }
        // Now look for STAfi srcHi', FIHi' and STAfi srcLo', FILo'.
        // The inserter above emits Lo first then Hi, so scanning back
        // we hit Hi first.
        if (!PrevStaHi) {
          if (P.getOpcode() == W65816::STAfi && P.getOperand(1).isFI() &&
              P.getOperand(1).getIndex() ==
                  PrevLdaHi->getOperand(1).getIndex()) {
            PrevStaHi = &P;
            continue;
          }
          break;
        }
        if (!PrevStaLo) {
          if (P.getOpcode() == W65816::STAfi && P.getOperand(1).isFI() &&
              P.getOperand(1).getIndex() ==
                  PrevLdaLo->getOperand(1).getIndex()) {
            PrevStaLo = &P;
            // Done with the structural match — fall through to operand
            // comparison.
          }
          break;
        }
      }
      if (PrevStaLo && PrevStaHi) {
        Register PrevSrcLo = PrevStaLo->getOperand(0).getReg();
        Register PrevSrcHi = PrevStaHi->getOperand(0).getReg();
        // Match if the source vregs are identical to mine. For non-peel
        // path, PtrLo/PtrHi were freshly created via COPY from Ptr.sub_*
        // — match by tracing PrevSrcLo/Hi back through their COPY (if
        // any) to the Ptr vreg.
        auto traceToPtr = [&](Register R) -> Register {
          if (!R.isVirtual())
            return R;
          MachineInstr *D = MRI.getUniqueVRegDef(R);
          while (D && D->isCopy()) {
            const MachineOperand &S = D->getOperand(1);
            if (!S.isReg() || !S.getReg().isVirtual())
              break;
            R = S.getReg();
            D = MRI.getUniqueVRegDef(R);
            // For subreg copies, stop — we'd lose sub-half info.
            if (D && D->getOpcode() == TargetOpcode::REG_SEQUENCE)
              break;
          }
          return R;
        };
        Register MyTraceLo = traceToPtr(PeelOff ? PeelBaseLo : PtrLo);
        Register MyTraceHi = traceToPtr(PeelOff ? PeelBaseHi : PtrHi);
        Register PrevTraceLo = traceToPtr(PrevSrcLo);
        Register PrevTraceHi = traceToPtr(PrevSrcHi);
        if (MyTraceLo == PrevTraceLo && MyTraceHi == PrevTraceHi &&
            MyTraceLo.isValid() && MyTraceHi.isValid()) {
          ReuseStaging = true;
        }
      }
    }

    // Stage the 24-bit address at $E0..$E2 unless CSE allows reusing
    // the previous staging.
    // STA_DP's tablegen def has no implicit A Use, so without an
    // explicit kill marker between adjacent LDAfi-STA_DP-LDAfi-STA_DP
    // pairs the fast regalloc collapses two A-loads into one (the
    // first's value is overwritten before STA_DP can store it). Add
    // implicit Use of A on the STA_DP to encode the dependency. This
    // also helps post-RA passes track A liveness correctly.
if (!ReuseStaging) { BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0) .addReg(W65816::A, RegState::Implicit); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FIHi).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2) .addReg(W65816::A, RegState::Implicit); } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(PeelOff); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(PeelOff); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); if (DeadPtrDef) DeadPtrDef->eraseFromParent(); if (DeadLoDef) DeadLoDef->eraseFromParent(); if (DeadHiDef) DeadHiDef->eraseFromParent(); for (MachineInstr *D : ExtraChainDeads) D->eraseFromParent(); return BB; } case W65816::LDAptr32Off: case W65816::STAptr32Off: case W65816::STBptr32Off: { // ptr32 deref with constant offset. The 65816's `[dp],Y` adds Y // to the 24-bit pointer at `dp..dp+2` to form the effective // address — so we can stage the RAW pointer at $E0..$E2 and put // the offset in Y, skipping the i32-add carry chain entirely. // // Saves ~3 instructions per access vs the previous approach // (which did `lo+off; hi+carry` to compute the pointer then // derefed with Y=0). 
Big win on heavy struct-field code like // Lua's lapi.c. See memory: ptr32-deref-fold-layer1-mi-opcodes. // // Bank-wrap caveat: `[dp],Y` doesn't propagate Y into the bank // byte at $E2 — if pointer+Y crosses a bank boundary, the result // wraps within the 24-bit address space (not into the next bank). // For struct fields with offsets < 64KB on malloc'd or globally- // allocated objects that don't straddle bank boundaries this is // safe; the caller must not place objects spanning $XX:FFFF. // // Dead unless ptr32 mode is active. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off; bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off; Register Ptr = MI.getOperand(1).getReg(); int64_t Off = MI.getOperand(2).getImm(); // See LDAptr32 inserter above: vreg sub-regs need COPY-with-subreg // (TRI.getSubReg is physreg-only at custom-inserter time). 
Register PtrLo = MRI.createVirtualRegister(&W65816::Wide16RegClass); Register PtrHi = MRI.createVirtualRegister(&W65816::Wide16RegClass); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrLo) .addReg(Ptr, (RegState)0, llvm::sub_lo); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), PtrHi) .addReg(Ptr, (RegState)0, llvm::sub_hi); int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrLo).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(PtrHi).addFrameIndex(FIHi).addImm(0); // ptr_lo -> $E0..$E1 (no offset add) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FILo).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); // ptr_hi -> $E2..$E3 (no carry propagation needed) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FIHi).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2); if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(Off); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(Off); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case 
W65816::LDAptrOff: case W65816::STAptrOff: case W65816::STBptrOff: { // Pointer access with a constant offset. Folds the offset into // the pointer (CLC; ADC #off in A) BEFORE staging at $E0..$E2, // then accesses via [$E0],Y with Y=0. We can't fold into Y // because [dp],Y on the W65816 adds Y to the full 24-bit pointer // — for a negative Y like 0xFFFE (= -2 signed), the addition // crosses into bank 1. Folding into the pointer keeps the add // at 16-bit (in A) so the bank byte stays 0. // // DBR-independent — see LDAptr/STAptr/STBptr. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptrOff; bool IsByteStore = MI.getOpcode() == W65816::STBptrOff; Register Ptr = MI.getOperand(1).getReg(); int64_t Off = MI.getOperand(2).getImm(); // Spill the pointer vreg to a fresh 2-byte stack slot, then // reload via LDAfi. Forces RA to materialize the source — see // the LDAptr/STAptr/STBptr case below for the full rationale. int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(Ptr).addFrameIndex(FI).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FI).addImm(0); // Compute ptr + off in A. CLC + ADC for the add. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC)); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::ADC_Imm16)).addImm(Off); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); if (LoaderBankDeref) { // Bank byte from $BE (crt0-initialised) — Loader compat path. 
BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DP)).addImm(0xBE); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2); } else { BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STZ_DP)).addImm(0xE2); } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case W65816::LDAptr: case W65816::LDAptrBank0: case W65816::STAptr: case W65816::STBptr: { // Pointer load/store via [dp],Y indirect-long (opcodes 0xB7 / 0x97): // STA $E0 ; pointer low/hi at $E0..$E1 // STZ $E2 ; bank byte at $E2 = 0 // LDY #0 // LDA [$E0], Y ; bank 0:ptr + 0 // STA [$E0], Y // Bank-explicit ZERO — DBR-independent. Both the runInMame stack // ($00:0FFF down) and BSS / heap globals (placed at $00:xxxx) live // in bank 0, so pointer-derefs always reach the right memory even // when the user has switched DBR for a bank-2 store via `pha;plb`. // // Trade-off: under GS/OS Loader the user's data lives in their bank // (not bank 0), so library functions that write directly to globals // via `sta abs` (DBR-relative, lands in user bank) and user code that // reads via pointer-deref (lands in bank 0 by this lowering) get // INCONSISTENT results — silent miscompile. gmtime hit this with // its __gmtimeBuf static. 
Workaround for affected library code: // launder the buffer pointer through inline asm (see gmtime in // runtime/src/timeExt.c) so clang doesn't IPSCCP-fold it; the writes // then go via [dp],Y too and match the user reads. // // Const-int pointers (`*(volatile uint16 *)0x5000 = v`) are NOT // lowered through this pseudo — TableGen patterns route them to // STAlong / STA8long / STAabs by type. See InstrInfo.td. // // We use $E0..$E2 in libcall-scratch DP — safe because the // pseudo expansion is a leaf (no calls between SEP and STA), // and any subsequent libcall reinitialises its own scratch. // // Why [dp],Y not abs-long-X (`STA $0,X`)? abs-long-X is shorter // (~3 bytes less) but uses X to hold the pointer. In high- // pressure functions like the recursive expression parser, X // is often live with another value, and forcing X to be free // for every pointer-deref triggered "ran out of registers". // [dp],Y uses A and Y only — leaves X for spill-bridge use. // // STBptr (truncating i8 store) wraps the actual STA in SEP/REP // so M=8 across the store and only one byte is written. MachineFunction *MF = BB->getParent(); const W65816Subtarget &STI = MF->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); bool IsLoad = MI.getOpcode() == W65816::LDAptr || MI.getOpcode() == W65816::LDAptrBank0; bool IsByteStore = MI.getOpcode() == W65816::STBptr; // LDAptrBank0 hardcodes bank=0 (STZ $E2) regardless of LoaderBankDeref. // Used by va_arg under Loader where the deref is a stack pointer // (= bank 0 always on W65816) but $BE points to our code bank. bool ForceBank0 = MI.getOpcode() == W65816::LDAptrBank0; Register Ptr = MI.getOperand(1).getReg(); // Why we spill the pointer to a fresh stack slot first: // a direct `COPY $a = ptr_vreg ; STA $E0` lets RA elide the COPY // when ptr_vreg is already allocated to A. 
In a loop body where // multiple Acc16 PHIs (pointer + accumulator) compete for A, the // PHI elimination pass picks one to be in A at the bottom of the // block and silently drops the COPY needed to refresh A with the // OTHER value at the top of the next iteration — silent miscompile // (sumTable read its own accumulator as the pointer on iter 2+). // STAfi forces RA to materialize ptr_vreg's value so it gets stored // to the slot, then LDAfi reads it back as a real machine load. int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), /*isSpillSlot=*/false); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) .addReg(Ptr).addFrameIndex(FI).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), W65816::A).addFrameIndex(FI).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); if (LoaderBankDeref && !ForceBank0) { // Bank byte from $BE (crt0-initialised) — Loader compat path. BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DP)).addImm(0xBE); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE2); } else { BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STZ_DP)).addImm(0xE2); } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); } else { Register Val = MI.getOperand(0).getReg(); BuildMI(*BB, MI.getIterator(), DL, TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)).addImm(0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); if (IsByteStore) BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); } MI.eraseFromParent(); return BB; } case 
W65816::SELECT_CC8: case W65816::SELECT_CC16: { const W65816Subtarget &STI = BB->getParent()->getSubtarget(); const W65816InstrInfo &TII = *STI.getInstrInfo(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); MachineBasicBlock *thisMBB = BB; MachineBasicBlock *copy0MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, copy0MBB); MF->insert(It, sinkMBB); // Move the rest of thisMBB after MI to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); unsigned CC = MI.getOperand(3).getImm(); // Helper: if `OpReg` is defined by a single-use, side-effect-free, // constant-source LDA in thisMBB, MOVE that LDA into `DstMBB` (at // its start). Returns true on success. 
auto tryHoistConstInit = [&](Register OpReg, MachineBasicBlock *DstMBB) -> bool { if (!OpReg.isVirtual()) return false; if (!MRI.hasOneNonDBGUse(OpReg)) return false; MachineInstr *Def = MRI.getUniqueVRegDef(OpReg); if (!Def || Def->getParent() != thisMBB) return false; if (Def->getOpcode() != W65816::LDAi16imm && Def->getOpcode() != W65816::LDAi8imm) return false; if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm()) return false; Def->removeFromParent(); DstMBB->insert(DstMBB->begin(), Def); return true; }; Register TValReg = MI.getOperand(1).getReg(); Register FValReg = MI.getOperand(2).getReg(); auto IsConstLda = [&](Register R) { if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R)) return false; MachineInstr *D = MRI.getUniqueVRegDef(R); return D && D->getParent() == thisMBB && (D->getOpcode() == W65816::LDAi16imm || D->getOpcode() == W65816::LDAi8imm) && D->getNumOperands() >= 2 && D->getOperand(1).isImm(); }; bool BothConst = (CC < W65816CC::COND_GT_MB) && IsConstLda(TValReg) && IsConstLda(FValReg); if (BothConst) { // 4-block diamond: thisMBB has only the test (CMP) and Bxx; the // tval and fval LDAs each live in their own destination block, // which is reached only via the branch — so neither LDA's flag // side-effect can corrupt the CMP→Bxx test window. This is the // proper fix for the "LDA between CMP and Bxx" bug catalogued in // project_known_issue_lda_flags.md (replacing the earlier 3-block // workaround that only hoisted fval). 
// // thisMBB: ...; CMP; Bxx tvalMBB // copy0MBB: LDA #fval; BRA sinkMBB (FALSE path) // tvalMBB: LDA #tval (TRUE path; falls to sink) // sinkMBB: PHI [tval from tvalMBB, fval from copy0MBB] MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(sinkMBB->getIterator(), tvalMBB); BB->addSuccessor(copy0MBB); BB->addSuccessor(tvalMBB); copy0MBB->addSuccessor(sinkMBB); tvalMBB->addSuccessor(sinkMBB); unsigned BrOp = getBranchOpcodeForCC(CC); BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB); BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB); tryHoistConstInit(TValReg, tvalMBB); tryHoistConstInit(FValReg, copy0MBB); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI), MI.getOperand(0).getReg()) .addReg(TValReg).addMBB(tvalMBB) .addReg(FValReg).addMBB(copy0MBB); } else { // 3-block diamond: keep the existing layout and (where possible) // hoist fval into copy0MBB. Used when one or both operands are // computed values (not constants), or when the multi-branch CC // requires two Bxx in thisMBB. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); if (CC < W65816CC::COND_GT_MB) { unsigned BrOp = getBranchOpcodeForCC(CC); BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB); } else { MultiBranch MB = getMultiBranch(CC); MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB; MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB; BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1); BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2); } copy0MBB->addSuccessor(sinkMBB); tryHoistConstInit(FValReg, copy0MBB); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI), MI.getOperand(0).getReg()) .addReg(TValReg).addMBB(thisMBB) .addReg(FValReg).addMBB(copy0MBB); } MI.eraseFromParent(); return sinkMBB; } } }