diff --git a/STATUS.md b/STATUS.md index 3d65a76..668aaa9 100644 --- a/STATUS.md +++ b/STATUS.md @@ -65,30 +65,28 @@ which runs correctly under MAME (apple2gs). ## In flight (build-system level) -- **DWARF sidecar emission in link816** (#51): The link should produce - a separate sidecar file with line-number / variable-location info - that an IDE or post-mortem dumper can consume. Skeleton not yet - written; deferred until other correctness work is done. +- **DWARF sidecar — minimal version landed** (#51): `link816 --debug-out + FILE` collects every `.debug_*` section from the input objects and + writes them to a sidecar with section headers. Addresses are still + object-file-local (no relocation processing). A consumer that wants + source-mapped final-image addresses must re-run reloc against the + text/rodata bases, or use offsets within their object scope. Future + work: apply text/rodata relocations to `.debug_info` / `.debug_line` + so addresses match the final image, and emit a TOC the consumer + can index by source file or function. ## Known issues / workarounds -- **Greedy register allocator mis-orders spills** in two patterns - (#69, #70): - 1. Functions where both `$a` and `$x` are live-in (i64-first-arg - with a stack-output pointer, e.g. `udivmod(i64, i64, ptr)`). - The TAX bridging `$x` to A clobbers `$a`'s value before the - second STA can save it. - 2. Iterative quicksort with `if/else` recursion choice: complex - live-ranges across two `swap()` calls produce wrong arg values. - - Both reproduce only at `-O1`/`-O2` with greedy. Workaround: - `-mllvm -regalloc=fast` for the affected translation unit. - `softDouble.c` already requires this flag for `__muldf3` (build.sh - applies it automatically). - - Real fix is a pre-RA pass that pre-spills critical pointer - arguments to memory, or a targeted fix in greedy's spill-ordering - heuristic. Material work; deferred. 
+- **Greedy register allocator mis-orders spills** in iterative + quicksort with `if/else` recursion choice (#70). Complex live + ranges across two `swap()` calls produce wrong pointer args. + Reproduces only at `-O1`/`-O2` with greedy. Workaround: + `-mllvm -regalloc=fast` for the affected translation unit, or + rewrite the qsort with explicit recursion guards instead of the + iterative tail-elim form. `softDouble.c` already uses this + flag for `__muldf3` (build.sh applies it automatically). Real + fix is either a pre-RA pass that explicitly spills loop-carried + pointer args or a targeted greedy heuristic patch. - **(d,s),y / (sr,s),y addressing wraps the bank** when Y is negative as 16-bit unsigned. Worked around by `W65816NegYIndY` @@ -115,7 +113,10 @@ which runs correctly under MAME (apple2gs). several test cases. Acceptable today (Newton iterations still converge); revisit when an exact-match test suite lands. -- **DWARF sidecar** (#51) for source-level debugging. +- **DWARF sidecar with relocations applied** — current (#51) version + is raw section pass-through; addresses are object-file-local. A + real source-level debugger needs the linker to apply text/rodata + relocations to `.debug_info` / `.debug_line` first. 
- **More of the C standard library**: `` transcendental functions (sin, cos, exp, log, pow), `` beyond what's diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index b862f9f..75e6265 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -2018,6 +2018,49 @@ EOF fi rm -f "$cAofFile" "$oAofFile" "$binAofFile" + log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)" + cUdmFile="$(mktemp --suffix=.c)" + oUdmFile="$(mktemp --suffix=.o)" + binUdmFile="$(mktemp --suffix=.bin)" + cat > "$cUdmFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +typedef unsigned long long u64; +__attribute__((noinline)) u64 udivmod(u64 a, u64 b, u64 *out_mod) { + *out_mod = a % b; + return a / b; +} +int main(void) { + u64 m; + u64 q = udivmod(0x123456789ABCDEFULL, 0x10000ULL, &m); + union { u64 u; unsigned short w[4]; } qu, mu; + qu.u = q; mu.u = m; + switchToBank2(); + *(volatile unsigned short *)0x5000 = qu.w[0]; + *(volatile unsigned short *)0x5002 = qu.w[1]; + *(volatile unsigned short *)0x5004 = qu.w[2]; + *(volatile unsigned short *)0x5006 = qu.w[3]; + *(volatile unsigned short *)0x5008 = mu.w[0]; + *(volatile unsigned short *)0x500a = mu.w[1]; + *(volatile unsigned short *)0x500c = mu.w[2]; + *(volatile unsigned short *)0x500e = mu.w[3]; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cUdmFile" -o "$oUdmFile" + "$PROJECT_ROOT/tools/link816" -o "$binUdmFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" "$oLibgccFile" "$oUdmFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binUdmFile" --check \ + 0x025000=89ab 0x025002=4567 0x025004=0123 0x025006=0000 \ + 0x025008=cdef 0x02500a=0000 0x02500c=0000 0x02500e=0000 \ + >/dev/null 2>&1; then + die "MAME: udivmod(...) 
wrong mod (i64-first-arg X-spill bug)" + fi + rm -f "$cUdmFile" "$oUdmFile" "$binUdmFile" + log "check: MAME runs sqr(10) → 100 (frame-less ADJCALLSTACKUP must emit PLY)" cSqrFile="$(mktemp --suffix=.c)" oSqrFile="$(mktemp --suffix=.o)" @@ -2285,6 +2328,29 @@ EOF # Linker exports the synthetic __bss_start / __bss_end / etc. # symbols so crt0 can do BSS init and runtime malloc finds the # heap top. + log "check: link816 --debug-out emits a DWARF sidecar (#51)" + cDbgFile="$(mktemp --suffix=.c)" + oDbgFile="$(mktemp --suffix=.o)" + binDbgFile="$(mktemp --suffix=.bin)" + dbgOutFile="$(mktemp --suffix=.dbg)" + cat > "$cDbgFile" <<'EOF' +int add(int a, int b) { return a + b; } +int main(void) { return add(3, 4); } +EOF + "$CLANG" --target=w65816 -O2 -g -ffunction-sections -c "$cDbgFile" -o "$oDbgFile" + "$PROJECT_ROOT/tools/link816" -o "$binDbgFile" --debug-out "$dbgOutFile" \ + --text-base 0x1000 "$oDbgFile" "$oLibgccFile" 2>/dev/null + if ! head -1 "$dbgOutFile" | grep -q "DWARF sidecar"; then + die "link816 --debug-out: sidecar missing header" + fi + if ! grep -q "SEC \.debug_info" "$dbgOutFile"; then + die "link816 --debug-out: sidecar missing .debug_info section" + fi + if ! 
grep -q "SEC \.debug_line" "$dbgOutFile"; then + die "link816 --debug-out: sidecar missing .debug_line section" + fi + rm -f "$cDbgFile" "$oDbgFile" "$binDbgFile" "$dbgOutFile" + log "check: link816 emits __bss_start, __bss_end, __heap_start" cBssFile="$(mktemp --suffix=.c)" oBssFile="$(mktemp --suffix=.o)" diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index 307b329..9a4f5ad 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -708,16 +708,61 @@ static uint32_t parseInt(const std::string &s) { static void usage(const char *argv0) { std::fprintf(stderr, "usage: %s -o [--text-base ADDR] [--rodata-base ADDR]\n" - " [--bss-base ADDR] [--map FILE] ...\n", + " [--bss-base ADDR] [--map FILE] [--debug-out FILE]\n" + " ...\n", argv0); std::exit(2); } +// ---------------------------------------------------------------- DWARF +// Sidecar emission. Walks each input object and concatenates every +// section whose name starts with `.debug_`. Each section is prefixed +// by a small ASCII-readable header line: +// +// ; OBJ <path> SEC <name> SIZE <bytes> +// +// followed by the raw section bytes and a single '\n'. Address-bearing sections +// (.debug_info, .debug_line, .debug_aranges, .debug_loc, etc.) are +// written WITHOUT relocation processing — addresses are object-file- +// local, not final-image-local. A consumer that wants source-mapped +// addresses needs to either (a) re-run reloc against the linked +// section bases, or (b) use the relative offsets within their object +// scope. Better than nothing for a single-TU debug session. 
+//
+// path: sidecar file to create (opened in binary mode).
+// objs: the linker's input objects, visited in order.
+// Calls die() if the file cannot be opened, if a .debug_* section's
+// byte range lies outside its object's raw image, or if writing fails.
+// Prints a one-line summary (section count, byte count) to stderr.
+// NOTE(review): the element type of `objs` is spelled here as
+// std::unique_ptr<InputObject>; confirm against the declaration of
+// Linker::objs.
+static void writeDebugSidecar(
+    const std::string &path,
+    const std::vector<std::unique_ptr<InputObject>> &objs) {
+    std::ofstream f(path, std::ios::binary);
+    if (!f) die("cannot open '" + path + "' for writing");
+    f << "; llvm816 link816 DWARF sidecar v0\n";
+    f << "; Object-file-local addresses; not relocated to final image.\n";
+    size_t total = 0;
+    size_t kept = 0;
+    for (const auto &objPtr : objs) {
+        const InputObject &obj = *objPtr;
+        for (const Section &sec : obj.sections) {
+            if (sec.name.rfind(".debug_", 0) != 0) continue;
+            if (sec.size == 0) continue;
+            // Refuse to read past the end of the object image. Written
+            // as two comparisons so that offset + size cannot wrap.
+            const size_t rawSize = obj.raw.size();
+            if (sec.fileOffset > rawSize ||
+                sec.size > rawSize - sec.fileOffset)
+                die("section '" + sec.name + "' in '" + obj.path +
+                    "' extends past end of file");
+            f << "; OBJ " << obj.path << " SEC " << sec.name
+              << " SIZE " << sec.size << "\n";
+            f.write(reinterpret_cast<const char *>(obj.raw.data() +
+                                                   sec.fileOffset),
+                    static_cast<std::streamsize>(sec.size));
+            f << "\n";
+            total += sec.size;
+            ++kept;
+        }
+    }
+    // Surface short writes / full disk instead of reporting success.
+    f.flush();
+    if (!f) die("error writing '" + path + "'");
+    std::fprintf(stderr,
+                 "debug sidecar: %zu sections, %zu bytes -> %s\n",
+                 kept, total, path.c_str());
+}
+
 } // anonymous namespace int main(int argc, char **argv) { std::string outPath; std::string mapPath; + std::string debugOutPath; Linker linker; int i = 1; @@ -738,6 +783,9 @@ int main(int argc, char **argv) { } else if (a == "--map") { if (++i >= argc) usage(argv[0]); mapPath = argv[i++]; + } else if (a == "--debug-out") { + if (++i >= argc) usage(argv[0]); + debugOutPath = argv[i++]; } else if (a == "-h" || a == "--help") { usage(argv[0]); } else if (!a.empty() && a[0] == '-') { @@ -757,6 +805,7 @@ int main(int argc, char **argv) { f.write(reinterpret_cast(image.data()), image.size()); if (!mapPath.empty()) linker.writeMap(mapPath); + if (!debugOutPath.empty()) writeDebugSidecar(debugOutPath, linker.objs); std::fprintf(stderr, "linked: text=[0x%04x+%u] rodata=[0x%04x+%u] bss=[0x%04x+%u] " diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp index bf398d8..d936340 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp @@ -638,6 +638,17 @@ SDValue
W65816TargetLowering::LowerFormalArguments( bool I32FirstArg = Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0; + // True iff the FIRST original arg spans 4 i16s (i.e., is i64). Used + // below to choose the Img16-via-STX_DP X-arg path for i64 callees, + // which dodges greedy's TXA-bridge-clobbers-A spill bug. i32-first + // doesn't get the same treatment because the change pessimizes + // simple functions like `int add32(int a, int b) { return a+b; }` + // where greedy's regular A:X handling is fine. + bool I64FirstArg = + Ins.size() >= 4 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && + Ins[2].VT == MVT::i16 && Ins[3].VT == MVT::i16 && + Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0 && + Ins[2].OrigArgIndex == 0 && Ins[3].OrigArgIndex == 0; unsigned ArgIdx = 0; // Stack offset is measured from S+1 (the WDC convention) and grows @@ -655,8 +666,16 @@ SDValue W65816TargetLowering::LowerFormalArguments( MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); } else if (ArgIdx == 1 && I32FirstArg) { - // i32 first-arg hi half: in X. - Register VReg = MRI.createVirtualRegister(&W65816::Idx16RegClass); + // First-arg hi half (or arg0_ml for i64-first-arg): in X. + // For i64-first-arg signatures (4 i16s with OrigArgIndex 0), use + // Img16 so greedy parks the value in an IMG slot via STX_DP, + // dodging the TXA-bridge-clobbers-A spill bug. i32-first stays + // on the original Idx16 path because the change pessimizes + // simple cases (verified: vprintf's writeULong/__udivsi3 chain + // crashes if i32-first is also rerouted). Caught by udivmod. + const TargetRegisterClass *RC = I64FirstArg ? 
&W65816::Img16RegClass + : &W65816::Idx16RegClass; + Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::X, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); } else { diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index 81226fa..aff0df3 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -92,10 +92,10 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg); return; } - // X → IMGn / IMGn → X: STX dp / LDX dp. Avoids the A-bridge that - // TAX/TXA would impose; critical for i32-first-arg signatures - // (live-in $a + $x) where bridging X via A clobbers $a's value - // before it can be saved. Caught by udivmod and iterative qsort. + // X → IMGn / IMGn → X: STX dp / LDX dp. Used by the i64-first-arg + // entry COPY (LowerFormalArguments routes arg0_ml through Img16 to + // dodge the TXA-bridge-clobbers-A spill bug for udivmod-shaped + // signatures). if (dstImg >= 0 && SrcReg == W65816::X) { BuildMI(MBB, I, DL, get(W65816::STX_DP)).addImm(dstImg); return; @@ -104,15 +104,6 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::LDX_DP)).addImm(srcImg); return; } - // Y → IMGn / IMGn → Y: STY dp / LDY dp — symmetric. - if (dstImg >= 0 && SrcReg == W65816::Y) { - BuildMI(MBB, I, DL, get(W65816::STY_DP)).addImm(dstImg); - return; - } - if (DestReg == W65816::Y && srcImg >= 0) { - BuildMI(MBB, I, DL, get(W65816::LDY_DP)).addImm(srcImg); - return; - } // DPF0 → A: emit `LDA $F0`. DPF0 is the pseudo-physreg carrier // for an i64-returning call's high 16 bits; LowerCall builds a // CopyFromReg(DPF0) glued to the call so the SDAG combiner /