From 7600812a7bbf9bf077704252091508c780b0932a Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Fri, 8 May 2026 17:36:21 -0500 Subject: [PATCH] Checkpoint. --- SESSION_RECOVERY.md | 140 ++++++++--- STATUS.md | 233 ++++++------------ scripts/smokeTest.sh | 49 ++++ src/link816/omfEmit.cpp | 156 ++++++++++-- .../lib/Target/W65816/W65816ISelLowering.cpp | 46 ++++ .../Target/W65816/W65816StackSlotCleanup.cpp | 24 +- 6 files changed, 423 insertions(+), 225 deletions(-) diff --git a/SESSION_RECOVERY.md b/SESSION_RECOVERY.md index 18dd2f7..87bcdac 100644 --- a/SESSION_RECOVERY.md +++ b/SESSION_RECOVERY.md @@ -1,4 +1,4 @@ -# Session Recovery — 2026-05-07/08 +# Session Recovery — last updated 2026-05-08 Living recovery doc. Update on every meaningful change. If session is lost, read this top-to-bottom + the memory notes referenced inside, then reread @@ -6,43 +6,37 @@ the actual diffs in tree to ground assumptions. ## Headline state -- **Smoke**: 131/131 green. +- **Smoke**: 132/132 green (omfEmit `--stack-size` check is the new one). - **Active config**: ptr32 (`p:32:16`), full IMG0..IMG15 caller-clobber on JSL, basic regalloc at -O1+. -- **Working tree**: clean except 3 modified files listed below; all are real fixes that haven't been committed yet. +- **Working tree**: 5 modified files (see below); all real fixes pending checkpoint. - **Branch**: `main`, ahead of `origin/main` by recent checkpoint commits. +- **Bench wins this session**: popcount **8320 → 6888 cyc/call (17%)** from i32 shift inline. DP/Stack `~Direct` segment Loader-validated end-to-end. ## Uncommitted, must keep -These are the in-flight improvements. Rebuild after applying any of them. +`git status --short` (5 modified, no untracked of consequence): -1. `runtime/src/snprintf.c` — removed `__attribute__((optnone))` from - `emitULong` (line 106) and `snprintf` (line 303). Slot-aliasing - workaround that the IMG-clobber + LDAfi-IMG fixes made unnecessary. -2. 
`src/llvm/lib/Target/W65816/W65816InstrInfo.cpp` - - `copyPhysReg` virtual-register short-circuit: if `SrcReg` or `DestReg` - is virtual, emit a `TargetOpcode::COPY` and return. Basic regalloc's - InlineSpiller calls `storeRegToStackSlot` with vreg sources before - final physreg assignment; without the short-circuit the unpaired- - Wide32 default branch hits the `unreachable`. - - `copyPhysReg` IMG-to-IMG PHA-bracket: was `lda src; sta dst` — - unbracketed clobber of A, regalloc inserted these copies between - `$a = COPY $img10` and use-of-A. PHA/PLA bracket preserves A. -3. `src/llvm/lib/Target/W65816/W65816SjLjFinalize.cpp` — catchtab build - moved BEFORE landingpad erase. Old code did `LPadBB->getLandingPadInst()` - AFTER erasing the insts → returned nullptr → empty LSDA → catch never - matched, abort. Now captures catch-clause typeinfo Constants into a - `DenseMap` BEFORE erase; build loop reads from - the saved map. +1. `SESSION_RECOVERY.md` — this doc. +2. `scripts/smokeTest.sh` — added "omfEmit `--stack-size` emits a + DP/Stack `~Direct` segment" check. Validates 3-segment layout + (ExpressLoad + code + DP/Stack) when `--stack-size` is supplied; + parses the third segment header against KIND/LENGTH/RESSPC/ALIGN/ + SEGNUM=3/name="~Direct" expectations. +3. `src/link816/omfEmit.cpp` — `emitDpStackSeg(length, segNum)` plus + the `--stack-size N` CLI flag. Validation: 256 ≤ N ≤ 65280, page- + aligned. **`--stack-size` implicitly enables `--expressload`** — + the GS/OS Loader's slow path silently rejects multi-seg OMFs (see + §D below for the empirical evidence). +4. `src/llvm/lib/Target/W65816/W65816ISelLowering.cpp` — `LowerShift` + now inlines i32 SHL/SRL/SRA by N=1..4 instead of routing to + `__lshrsi3`/`__ashlsi3`/`__ashrsi3`. See §E. +5. `src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp` — pre-existing + uncommitted change from prior turns; verify against git log before + re-staging if recovery is fresh. 
-To commit when ready (do NOT amend; create new commits): -```bash -git add runtime/src/snprintf.c \ - src/llvm/lib/Target/W65816/W65816InstrInfo.cpp \ - src/llvm/lib/Target/W65816/W65816SjLjFinalize.cpp -git commit -m "..." # message stub below -``` -Suggested commit message: see "Fixes landed" section below; one commit -per logical change is cleaner. +Earlier-mentioned files (snprintf.c, W65816InstrInfo.cpp, +W65816SjLjFinalize.cpp) have been checkpoint-committed and are no +longer in `git status`. ## Already-committed in this session arc @@ -162,6 +156,46 @@ A=42 from real C++ `try { throw 42; } catch (int x) { return x; }`. - `runtime/src/snprintf.c:106` — removed `optnone` on `emitULong`. Smoke green. - `runtime/src/snprintf.c:303` — removed `optnone` on `snprintf`. Smoke green. +### D. `omfEmit --stack-size` — DP/Stack segment for GS/OS Loader + +Added `emitDpStackSeg` (`src/link816/omfEmit.cpp`). KIND=0x1012 (DP/Stack +| PRIVATE), LENGTH=RESSPC=requested-bytes, ALIGN=0x100, BANKSIZE=0, body +is a single END opcode. Apps can now request a stack of any +page-aligned size from 256B to 65280B (replacing GS/OS Loader's default +4KB allocation). + +**Loader gotcha** (cost ~1 hour to debug): plain (non-ExpressLoad) +multi-segment OMFs do NOT launch under real GS/OS 6.0.2 — the Loader's +slow path silently rejects the file and our entry point never runs. +ExpressLoad-wrapped multi-segment OMFs DO work. Fix: `--stack-size` now +implicitly enables `--expressload` (the Loader's slow path is +empirically broken for our 2-seg layout). The DP/Stack seg is appended +AFTER the user code seg as SEGNUM=3; the Loader walks all segments by +KIND after the ExpressLoad fast-load step finishes. + +Verified: `runViaFinder.sh /tmp/test_el_dp.omf --check 0x70=0x42 0x71=0x99` +passes under real GS/OS 6.0.2 with `--stack-size 4096 --expressload`. +Verified failure mode: same payload with `--stack-size` alone (no +`--expressload`) → `0x70=0x00` (program never executed). 
Documented +in `feedback_loader_multi_seg_needs_expressload.md`. + +Smoke updated: 132/132 expects 3 segs (ExpressLoad + code + DP/Stack) +when `--stack-size` is supplied. + +### E. i32 shift-by-N inlined (was full libcall) — speed win + +`W65816ISelLowering.cpp` `LowerShift` now inlines i32 SHL/SRL/SRA by +N=1..4. Previously every i32 shift went through `__lshrsi3`/ +`__ashlsi3`/`__ashrsi3` — ~300+ cyc per call. popcount benchmark: +**8320 → 6888 cyc/call, 17% faster**. Implementation extracts +`Wide32` halves via `extractWide32Lo/Hi`, applies per-step +`lsr; ror`-equivalent SDAG ops with explicit carry propagation +(`(Hi & 1) << 15` for SRL/SRA's lo-fill, `Lo >> 15` for SHL's +hi-fill), recombines via `buildWide32`. N>4 still routes to libcall +— the unrolled cost (~5 i16 ops × N) crosses libcall overhead at N≈5. + +Documented in `feedback_i32_shift_inline.md`. + ## Still-open work areas Each carries a fair-warning note for whoever picks it up. @@ -178,7 +212,7 @@ OOM threshold; both halves compile cleanly at -O2 + basic regalloc. worked. The IMG-clobber + LDAfi-IMG-store backend fixes from 2026-05-07 had already resolved its underlying pressure issue. -Smoke 131/131 stays green. +Smoke stays green (now 132/132). ### 2. gmtime_r `optnone` `runtime/src/timeExt.c:69`. NOT a backend bug — IR-level optimization @@ -216,15 +250,27 @@ which combine pass mis-evaluates and why. optnone stays. ```bash cd /home/scott/claude/llvm816 -git status # 3 modified files listed above -cd tools/llvm-mos-build && ninja llc clang # rebuild backend +git status # 5 modified files listed above +cd tools/llvm-mos-build && ninja llc clang # rebuild backend (~5 min) cd /home/scott/claude/llvm816 +cd src/link816 && make && cd ../.. 
# rebuild link816 + omfEmit bash runtime/build.sh # build runtime -bash scripts/smokeTest.sh # should print "all smoke checks passed" +bash scripts/smokeTest.sh # should end "all smoke checks passed" +bash scripts/benchCyclesPrecise.sh # popcount should be ~6888 cyc ``` -If smoke fails, the most likely cause is one of the three uncommitted -files got reverted; check `git status` and re-apply. +Loader smoke (validates DP/Stack seg under real GS/OS 6.0.2): +```bash +# Build a simple test program with --stack-size, run via Finder. +tools/omfEmit --input X.bin --map X.map --base 0x1000 --entry __start \ + --output /tmp/t.omf --stack-size 4096 --relocs X.relocs +bash scripts/runViaFinder.sh /tmp/t.omf --check 0x70=0x42 0x71=0x99 +``` + +If smoke fails, the likely cause is one of the 5 uncommitted files +got reverted; check `git status` and re-apply. If popcount bench +regressed past ~7500 cyc, suspect the i32-shift-inline change in +`W65816ISelLowering.cpp` was lost. ## Diagnostic tools that worked @@ -282,12 +328,24 @@ in 30 minutes. Recommended. - `feedback_jslpseudo_caller_save.md`, `feedback_libcall_img_clobber.md`, `feedback_img_slot_expansion.md`, `feedback_greedy_high_pressure.md` — related backend topics. +- `feedback_loader_multi_seg_needs_expressload.md` — **new 2026-05-08**. + Multi-seg OMFs need ExpressLoad to launch under real Loader. +- `feedback_i32_shift_inline.md` — **new 2026-05-08**. Inline i32 + shift-by-N for N=1..4; first quantified bench-vs-self speed win. +- `feedback_speed_over_size.md` — **new 2026-05-07**. Optimization + priorities: cycle count over byte count, full stop. ## Next session candidates (ranked) 1. **Commit the uncommitted fixes.** They've earned it. -2. **Greedy regalloc retry.** Cheap experiment, potentially big win. -3. **qsort source restructure.** Clear `optnone` if you're willing to - reshape the algorithm. Source-level work, not backend. -4. **gmtime_r IR investigation.** Find which combine miscompiles +2. 
**u16*u16→u32 multiply path.** sumOfSquares is 982 cyc/iter, + bottlenecked by `__mulsi3` for what's really a 16x16 multiply. + If we add a `__umulhi3` libcall (i16,i16 → i32) and route + `MUL(zext(a), zext(b))` to it, sumOfSquares could ~halve. +3. **`while (x != 0)` for i32 should fold to `lda lo; ora hi; bne`.** + Currently materializes a boolean via SETCC and branches on it. + Combiner hook: `(brcond (setcc i32 x, 0, ne))` → + `(br_cc ne, lo|hi, 0)`. Big win in any i32-iteration loop. +4. **Greedy regalloc retry.** Cheap experiment, potentially big win. +5. **gmtime_r IR investigation.** Find which combine miscompiles `days >= 365L + (leap?1:0)`. IR-level, not backend. diff --git a/STATUS.md b/STATUS.md index 0fb7dda..5587f38 100644 --- a/STATUS.md +++ b/STATUS.md @@ -95,24 +95,10 @@ which runs correctly under MAME (apple2gs). `operator delete` + `__cxa_pure_virtual`. - C++ exceptions via `clang++ -fsjlj-exceptions`: throw, catch, catch-by-value, multiple catch handlers, exception destruction. - Backend wiring: `MCAsmInfo` selects `ExceptionHandling::SjLj` - so clang's `SjLjEHPrepare` runs; a custom `W65816SjLjFinalize` - IR pass (in `src/llvm/lib/Target/W65816/`) finishes the - lowering by inserting an actual `setjmp` at function entry, - building a `switch`-on-call-site dispatch block, building a - per-function catch table referenced via the lsda field, and - rewriting `eh.typeid.for(@TI)` to use typeinfo addresses as - selectors. Runtime in `runtime/src/libcxxabiSjlj.c` provides - the full Itanium SJLJ surface: `_Unwind_SjLj_Register/ - Unregister/RaiseException/Resume`, `__cxa_allocate_exception`, - `__cxa_throw`, `__cxa_begin_catch`, `__cxa_end_catch`, - `__cxa_rethrow`, plus a no-op `__gxx_personality_sj0` - (we dispatch via call_site directly, not via the personality). 
- Two backend bug fixes were required along the way: longjmp's - SP restore was off by 3 (libgcc.s subtracted 3 before TCS, - leaving caller's stack 3 bytes off) and `W65816StackSlotCleanup` - was eliminating volatile stores to dead-from-its-perspective - stack slots (skipped via `hasOrderedMemoryRef()` gate). + `W65816SjLjFinalize` IR pass inserts the call-site dispatch and + per-function catch table; `runtime/src/libcxxabiSjlj.c` provides + the Itanium SJLJ surface (`_Unwind_SjLj_*`, `__cxa_throw`, + `__cxa_begin_catch`, etc.) plus a no-op personality. **Toolchain:** @@ -138,7 +124,7 @@ which runs correctly under MAME (apple2gs). reads the manifest, places each segment's bytes, and runs from segment 1's entry — used by smoke to verify cross-bank JSL end-to-end (helper3 chain across 3 bank-aligned segments). -- `tools/omfEmit` produces OMF v2.1 files in two modes: +- `tools/omfEmit` produces OMF v2.1 files in three modes: (a) single-segment — `--input flat.bin --map flat.map --base ADDR --entry SYM`, KIND=0x0000 (CODE, dynamic), ORG=0 (loader picks bank); (b) multi-segment — `--manifest path.json` reads @@ -147,14 +133,20 @@ which runs correctly under MAME (apple2gs). the GS/OS Loader to place each at its declared bank-aligned address. All intra-segment relocations were already patched by the linker, so no INTERSEG/RELOC opcodes are needed for v1 - static placement. + static placement. (c) `--stack-size N` (auto-enables + `--expressload`) appends a `~Direct` DP/Stack segment + (KIND=0x1012) of N bytes so apps can request a custom DP+stack + allocation from GS/OS instead of the Loader's 4KB default. + Validated end-to-end via `runViaFinder.sh` under real GS/OS + 6.0.2 — the slow Loader path silently rejects multi-segment + OMFs, so `--stack-size` is gated behind ExpressLoad emission. 
- `link816 --debug-out FILE` writes a DWARF sidecar with text/ rodata/bss/init_array relocations applied to every `.debug_*` section, so `.debug_addr` / `.debug_line` PC values are final- image addresses. - `runtime/build.sh` builds crt0, libc, soft-float, soft-double, libgcc into linkable objects. -- `scripts/smokeTest.sh` runs 126 end-to-end checks at -O2: +- `scripts/smokeTest.sh` runs 132 end-to-end checks at -O2: scalar ops, control flow, calling conventions, MAME execution regressions, link816 bss-base safety + weak-symbol resolution + heap_end-vs-heap_start sanity, iigs/toolbox.h compile + link, @@ -173,7 +165,7 @@ which runs correctly under MAME (apple2gs). setjmp/longjmp + catch-table walk), C++ -fsjlj-exceptions compile + link (the C++ frontend → backend path is execution- verified manually but skipped from MAME smoke due to a - MAME-side flakiness — see "Yet to come"), GS/OS wrapper + MAME-side flakiness — see "What's next"), GS/OS wrapper round-trip via stub dispatcher pre-loaded at $E100A8 (validates PHA + PEA 0 + JSL + post-call SP-fixup contract end-to-end), wchar / signal core APIs, hex dumper writing through fprintf, @@ -181,19 +173,12 @@ which runs correctly under MAME (apple2gs). + dispatch + chained collisions over fprintf-to-mfs), scripts/bench.sh size-vs-Calypsi harness. 100% pass. -- `scripts/bench.sh` compiles a microbenchmark suite with both - clang (this toolchain) and Calypsi cc65816, comparing emitted - text-section size. Current ratio: ~1.9x (down from 2.2x once - the W65816 target started overriding `replexitval` to "never" - by default in `LLVMInitializeW65816Target`; SCEV's closed-form - rewrite was promoting i16 induction expressions to i64 and - hitting `__muldi3`, which on a 16-bit target is dramatically - bigger than the loop it replaces). sumOfSquares went 335B → - 128B, a 2.6x shrink with no other benchmark affected. Eight - benchmarks shipped under `benchmarks/`. 
Remaining gap is - structural: Calypsi uses `(sr,s),Y` for stack-relative - pointer indirection where we route through DP $E0 indirect- - long for bank safety. +- `scripts/benchCyclesPrecise.sh` measures per-call cycle counts + via MAME's emulated time counter. Eight benchmarks under + `benchmarks/`. Current numbers: popcount 6888 cyc, bsearch + 1108, memcmp 1569, strcpy 3580, dotProduct 4774, fib(10) 14152, + sumOfSquares 49104. Speed is the optimization priority, not + size. **Backend register allocation:** @@ -250,144 +235,64 @@ which runs correctly under MAME (apple2gs). Generated by `scripts/genToolbox.py` from ORCA-C's `ORCACDefs/` (re-runnable when ORCA-C updates). -## In flight +## What's next -(Nothing currently — the four previous in-flight items all -landed: basic-regalloc-by-default replaced greedy and resolved -the long-arg-chain failure; `time()` reads ReadTimeHex when the -program has called `iigsToolboxInit()` and `clock()` reads the -VBL counter via 24-bit absolute load; the (sr,s),Y bank-wrap -addressing is no longer emitted by any inserter and the -`W65816NegYIndY` workaround is disabled; LC ceiling extended -from $E000 to $10000 since crt0's `lda $C083` read-twice enables -RAM through $FFFF, gaining 8KB of bank-0 space.) +Work is now optimization-focused; the toolchain is feature-complete +for the common-case C / minimal-C++ workload. Priority is speed +(cycle counts), not size. -## Yet to come +**Speed wins queued, ranked by expected impact:** -- **Multi-bank BSS / init_array** — multi-segment splits text - across banks but BSS + init_array still live in segment 1's bank - (bank 0). Programs whose zero-init data exceeds the ~60KB bank-0 - budget would need crt0 to walk a per-segment table of `(start, - end)` pairs. Not blocking >64KB *code* programs; only matters - for programs with very large global arrays. 
+- **u16×u16 → u32 multiply path.** sumOfSquares is 982 cyc/iter + bottlenecked by `__mulsi3` for what's effectively a 16×16 + multiply (both inputs are zext from u16). Adding a `__umulhi3` + libcall + SDAG hook to detect `MUL(zext(a), zext(b))` could + roughly halve the iteration cost. -- **GS/OS Loader OMF format compatibility** — the OMF format we - emit is now byte-equivalent to real Apple S16 segments at the - header level. Verified by extracting the ABOUT segment from - real `/SYSTEM/START` (FINDER) via Cadius (`/tmp/cadius/cadius`, - not AppleCommander which can't extract forks) and comparing - field-by-field against ours. Five fixes landed in - `src/link816/omfEmit.cpp` along the way: - (1) VERSION byte 0x21 → 0x02 (was BCD-style "2.1"; real format - is enum where 0x02 = v2.1). Cleared error $1102. - (2) Body opcode 0xF1 (DS = N zeros) → 0xF2 (compact LCONST, - 2-byte length + N data bytes). Long-form 0xF5 LCONST is in - the spec but real Loader appears to mis-parse it (3 stale - copies of the segment ended up scattered in RAM). Every real - segment we decoded uses 0xF2. - (3) KIND 0x0000 (CODE) → 0x8000 (CODE|STATIC) for legacy - single-segment mode. Real ABOUT segment uses 0x8000; with - 0x0000 the Loader returns $110A loadSegFailErr. Multi-segment - mode keeps 0x8800 (CODE|STATIC|ABSBANK) since each seg has a - fixed ORG. - (4) BANKSIZE 0 → 0x10000 (matches real code segments). - (5) LOAD_NAME emitted as 10 bytes of zeros immediately after - the 44-byte header (some sources omit it, real OMFs include it). +- **Fold `while (x != 0)` for i32 to `lda lo; ora hi; bne`.** + The combiner currently materializes a SETCC boolean and re-tests + it, generating ~10 redundant ops in every i32-iteration loop. + Hot in popcount, CRC, and any BigInt-style code. - GS/OS 6.0.2 is installed under `tools/gsos/` and boots cleanly - to Finder in MAME. 
Replacing `/SYSTEM/START` with a known-good - OMF (the extracted ABOUT segment) gives error `$005C` — - identical to what we get with our test program — meaning our - OMF is indistinguishable from real Apple S16 as far as the - Loader is concerned. The $005C is *not* OMF rejection; it is - the boot-launcher path failing because a minimal `/SYSTEM/START` - doesn't chain to a real Finder via QUIT-with-pathname. +- **ptr32 pointer-increment overhead.** `*p++` under ptr32 emits + a full 32-bit `ADC` chain even when the high half is provably + unchanged. strcpy and memcmp pay 30+ cycles per byte for what + should be 15-20. Needs a peephole or SDAG combine for `i32 + 1` + with provably-no-carry-into-hi. - `runtime/src/crt0Gsos.s` is committed: skips SEI/LC-reconfig - (GS/OS owns CPU state), zeros BSS, runs init_array, calls - main, then QUIT(pcount=2) chained to `gChainPath` (default - `/SYSTEM/START.ORIG`). Linkage works. +- **Greedy regalloc retry.** Currently blocked on an upstream + LLVM `LiveRangeEdit::eliminateDeadDef` assertion when our + sub-register pair partial-defs reach it. Basic regalloc works + but leaves measurable cycle waste in load/store shuffles. - Tested with a marker write as the very first instruction of - crt0Gsos, replacing `/SYSTEM/START` with our OMF and saving - the original as `/SYSTEM/START.ORIG` for chain-back. After - 110-second boot: marker `$00/0078` is still 0 — the Loader - places our segment in RAM (entry signature found in 3 banks - via memory search) but **never JSLs entry**. Tested ENTRY=0, - ENTRY=1 (with NOP pad), auxtype=0 and =DB03; all give the - same $005C without ever calling our code. Conclusion: the - boot-launcher path requires the `~ExpressLoad` segment that - every real `/SYSTEM/START` carries. Without ExpressLoad, - the bootstrap takes a code path that loads our segment but - never auto-calls it. +**Open limitations:** - **OMF format → fully Loader-compatible** after reading - Merlin32 source. 
Final canonical fields (single-segment - Finder-launchable app): - - KIND=0x1000 (CODE|PRIV) — was 0x8000 (CODE|STATIC) which - came from extracting ABOUT from real FINDER, but ABOUT is a - sub-segment called as a subroutine, not a launchable app - - LABLEN=10 (fixed-width 10-byte LOAD_NAME and SEG_NAME, - space-padded) — was 0 (length-prefixed) which is what - /SYSTEM/START FINDER uses but the Loader will only LOAD, - not JSL-into, that format - - VERSION=0x02 (OMF v2.1) - - BANKSIZE=0x10000 for code segs - - Body opcode 0xF2 LCONST with NUMLEN-byte (=4) count +- **Multi-bank BSS / init_array.** Multi-segment mode splits + `.text` across banks but BSS + init_array still live in + segment 1's bank (bank 0). Programs with zero-init data + exceeding the ~60KB bank-0 budget need crt0 to walk a + per-segment `(start, end)` table. Not a blocker for >64KB + *code* programs. - ExpressLoad emission also landed (`omfEmit --expressload`): - 6-byte header + segment list + remap list + header info, - byte-equivalent to Merlin32's `BuildExpressLoadSegment`. +- **C++ exceptions absent from CI smoke.** The SJLJ runtime + round-trip is in smoke; the full clang++ → backend → MAME + execution path runs reliably interactively but is excluded + from automated smoke due to MAME-side I/O flakiness. - End-to-end runtime verification: new `scripts/runViaFinder.sh` - injects an OMF as `/SYSTEM.DISK/HELLO`, boots GS/OS in MAME, - drives Finder via Lua keyboard automation (S+Cmd-O to open - System.Disk, H+Cmd-O to launch HELLO), samples specified - memory addresses to verify execution. Pattern adapted from - `joeylib/scripts/run-iigs-mame.sh` from a sibling project. - Pure-asm marker tests (`sta $000078 long, value=$42`) are - confirmed running under real GS/OS Loader with - `runViaFinder.sh hello.omf --check 0x000078=0x42` returning - exit 0. 
+- **GS/OS validation uses a stub dispatcher.** The wrapper + contract (PHA + PEA 0 + LDX + JSL $E100A8 + post-call SP + fixup) is verified end-to-end in MAME against a stub + (`scripts/runInMameWithGsosStub.sh`). Validation against a + real bootable GS/OS volume is left out of CI as it needs a + smartport hard-disk image and live Tool Locator init. - **Compiled C now runs under real GS/OS Loader.** Implemented - option (a) from the analysis: OMF cRELOC opcode emission. - - `link816 --reloc-out FILE` records every R_W65816_IMM24 - relocation site (intra-segment 24-bit refs only — GS/OS - dispatcher calls and other cross-bank refs are filtered out) - as a binary sidecar of (patchOff, offsetRef) pairs. - - `omfEmit --relocs FILE` reads the sidecar and emits a - cRELOC opcode (0xF5) per site between the LCONST data and the - END opcode. Format per Merlin32: `0xF5 ByteCnt(=3) Shift(=0) - OffsetPatch(2) OffsetReference(2)` = 7 bytes. - - The Loader rewrites segment[OffsetPatch..OffsetPatch+2] to - `(segPlacedBase + OffsetReference)` at load time, fixing - every `jsl`/`jml`/`sta long`/`lda long` operand that targets - an in-segment symbol. - - End-to-end verified: a real C function call + for loop - (`sumTo(10)` → 55, `sumTo(100)` → 5050) compiled with clang - -O2, linked, OMF-emitted with cRELOC, injected as - `/SYSTEM.DISK/HELLO`, launched from Finder via MAME-Lua - keyboard automation, marker bytes verified at the expected - values. Smoke check #62 verifies cRELOC opcode count - matches the link816 sidecar count. +- **gmtime_r requires `optnone`.** IR-level optimizer issue: + loop rotation + IndVar simplify mis-evaluate `days >= 365L + + (__isLeap(...) ? 1 : 0)`, folding the comparison to + compile-time-false. Not a backend bug; needs IR-pass-level + diagnosis. - Smoke tests #59-#60 (omfEmit single + multi-segment) verify - the structural format invariants (VERSION=0x02, KIND=0x8000 - or 0x8800, body opcode 0xF2 LCONST) so regressions are - caught. 
`scripts/runMultiSeg.sh` mini-loader continues to - cover the >64KB use case end-to-end. - -- **C++ exceptions in CI smoke** — runs reliably outside smoke; - see context below. The SJLJ runtime end-to-end test passes; - the C++ frontend→backend path is compile/link verified in - smoke; full execution path is left out due to a MAME-side I/O - flakiness (same binary runs fine interactively). - -- **GS/OS validated against a real ProDOS volume** — the wrapper - contract (PHA + PEA 0 + LDX + JSL $E100A8 + post-call SP fixup) - is verified end-to-end in MAME against a stub dispatcher - (`scripts/runInMameWithGsosStub.sh`). Validating against an - actual GS/OS-loaded volume needs a bootable system disk image - attached as a MAME smartport hard disk and Tool Locator init — - out of scope for an automated CI smoke. +- **softDouble `dpack` / `dclass` require `noinline`.** + Inlining triggers register pressure that overflows basic + regalloc in `__adddf3`/`__muldf3`/`__divdf3`. Architectural + for the same reason as qsort's earlier split. diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 7745f48..d22d12e 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -5175,6 +5175,55 @@ EOF die "OMF body opcode at offset $dispdata is 0x$bodyOp (expected 0xF2 LCONST)" fi + # omfEmit --stack-size: append a ~Direct DP/Stack segment so the + # GS/OS Loader allocates an explicit-sized DP+stack chunk instead + # of its 4KB default. KIND=0x1012 (DP/Stack | PRIVATE), LENGTH and + # RESSPC both = requested size, ALIGN=0x100 (page-aligned per spec). + # Plain (non-ExpressLoad) multi-segment OMFs do not launch under + # GS/OS 6.0.2 Loader (verified empirically), so --stack-size auto- + # enables --expressload: the OMF becomes 3 segments (ExpressLoad, + # code, DP/Stack), with DP/Stack as segnum 3. 
+ log "check: omfEmit --stack-size emits a DP/Stack ~Direct segment" + omfStk="$(mktemp --suffix=.omf)" + "$PROJECT_ROOT/tools/omfEmit" \ + --input "$binBssFile" --map "$mapBssFile" \ + --base 0x8000 --entry main --output "$omfStk" \ + --stack-size 4096 2>/dev/null + if [ ! -s "$omfStk" ]; then + die "omfEmit --stack-size produced empty/missing OMF" + fi + # Walk segments and validate the last one (DP/Stack). + python3 - "$omfStk" <<'PY' || die "omfEmit --stack-size: DP/Stack segment validation failed" +import struct, sys +data = open(sys.argv[1], 'rb').read() +pos = 0; segs = [] +while pos < len(data): + bytecnt = struct.unpack_from(' emitOneSeg(const std::vector &image, return out; } +// Emit a "~Direct" DP/Stack segment. When the GS/OS System Loader +// encounters this segment kind (KIND low-5 = 0x12), it calls Memory +// Manager NewHandle to allocate `length` bytes of page-aligned, locked +// memory in bank $00, then sets the application's DP and SP to point +// into that block. Without an explicit DP/Stack segment in the OMF, +// the Loader allocates a default 4KB chunk — usually enough, but +// declaring our own size makes intent explicit and lets us bump it +// without runtime fiddling. +// +// Source: Apple IIgs GS/OS Reference Vol 1 (System Loader chapter): +// "You define your program's stack and direct-page needs by +// specifying a 'direct-page/stack' object segment (KIND = $12). +// The size of the segment is the total amount of stack and +// direct-page space your program needs. When the System Loader +// finds this segment at load time, it calls the Memory Manager to +// allocate a page-aligned, locked memory block of that size in +// bank $00." +// +// The body is just an END opcode (no LCONST data — RESSPC alone tells +// the Loader how big to make the allocation, and the bytes don't need +// to come from the file). 
KIND = 0x1012 = DP/Stack | PRIVATE — the +// PRIVATE attribute matches Apple's `makedirect` reference utility +// (ksherlock/omfutils). +static std::vector emitDpStackSeg(uint32_t length, uint16_t segNum) { + std::vector body; + body.push_back(0x00); // END opcode + constexpr uint8_t LABLEN_VAL = 10; + const std::string segNameTxt = "~Direct"; + std::vector loadName(LABLEN_VAL, 0x20); + std::vector segName(LABLEN_VAL, 0x20); + for (size_t i = 0; i < segNameTxt.size(); i++) + segName[i] = (uint8_t)segNameTxt[i]; + + constexpr uint16_t DISPNAME = 44; + const uint16_t DISPDATA = static_cast( + DISPNAME + loadName.size() + segName.size()); + const uint32_t LENGTH = length; // memory size requested + const uint32_t BYTECNT = DISPDATA + static_cast(body.size()); + const uint32_t RESSPC = length; // bytes to zero-allocate + const uint32_t BANKSIZE = 0; // DP/Stack lives in bank 0 + const uint32_t ALIGN = 0x100; // page-aligned per spec + const uint16_t KIND = 0x1012; // DP/Stack | PRIVATE + + std::vector hdr; + put32(hdr, BYTECNT); + put32(hdr, RESSPC); + put32(hdr, LENGTH); + hdr.push_back(0x00); // undefined + hdr.push_back(LABLEN_VAL); // LABLEN + hdr.push_back(4); // NUMLEN + hdr.push_back(0x02); // VERSION (v2.1) + put32(hdr, BANKSIZE); + put16(hdr, KIND); + hdr.push_back(0x00); hdr.push_back(0x00); // undefined + put32(hdr, /*ORG*/0); + put32(hdr, ALIGN); + hdr.push_back(/*NUMSEX*/0); + hdr.push_back(0x00); + put16(hdr, segNum); + put32(hdr, /*ENTRY*/0); + put16(hdr, DISPNAME); + put16(hdr, DISPDATA); + + if (hdr.size() != 44) die("internal: DP/Stack hdr size != 44"); + + std::vector out; + out.insert(out.end(), hdr.begin(), hdr.end()); + out.insert(out.end(), loadName.begin(), loadName.end()); + out.insert(out.end(), segName.begin(), segName.end()); + out.insert(out.end(), body.begin(), body.end()); + return out; +} + // Legacy single-segment wrapper. // // KIND=0x1000 (CODE | PRIV). 
This is what Merlin32 emits for single- @@ -216,11 +289,31 @@ static std::vector emitOneSeg(const std::vector &image, // model. PRIV bit signals "loaded with the rest of the app" and is the // reliable choice empirically validated by Merlin32-built hello.s16 // running successfully under MAME-Lua-driven Finder launch. +// +// `stackSize` > 0 prepends a ~Direct DP/Stack segment of that size as +// segment 1 (the code segment then becomes segment 2). 0 = caller +// doesn't want one (Loader uses its 4KB default). static std::vector emitOMF(const std::vector &image, uint32_t entryOffset, - const std::string &name) { - return emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/1, - /*kind*/0x1000, name); + const std::string &name, + uint32_t stackSize = 0) { + if (stackSize == 0) { + return emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/1, + /*kind*/0x1000, name); + } + // DP/Stack segment ordering: Apple's `makedirect` reference utility + // assigns the DP/Stack as SEGNUM 1 (its own object); when linked + // into a multi-segment OMF, ordering matters because the Loader + // walks segments in file order. We put the DP/Stack FIRST so the + // Loader allocates the chunk before reading the code segment, then + // sets DP and SP appropriately when entering our code. + auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/1); + auto codeSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2, + /*kind*/0x1000, name); + std::vector out; + out.insert(out.end(), dpSeg.begin(), dpSeg.end()); + out.insert(out.end(), codeSeg.begin(), codeSeg.end()); + return out; } // Emit an ExpressLoad-able OMF wrapping a single user segment. This is @@ -262,7 +355,8 @@ static std::vector emitOMF(const std::vector &image, static std::vector emitOmfExpressLoad( const std::vector &image, uint32_t entryOffset, - const std::string &userSegName) { + const std::string &userSegName, + uint32_t stackSize = 0) { // Step 1: build the user segment using KIND=0x1000 (CODE|PRIV). // Same KIND emitOMF uses for single-segment apps. 
Verified @@ -416,10 +510,18 @@ static std::vector emitOmfExpressLoad( if (elSeg.size() != elSegSize) die("internal: ExpressLoad segment size mismatch"); - // Step 6: concatenate ExpressLoad + user segment. + // Step 6: concatenate ExpressLoad + user segment + optional DP/Stack. + // The DP/Stack seg sits AFTER the user seg; the Loader walks file- + // ordered segments after the ExpressLoad load step completes, and + // processes each segment by KIND. The ExpressLoad load script only + // tracks code/data segs; the DP/Stack seg is found by KIND walk. std::vector result; result.insert(result.end(), elSeg.begin(), elSeg.end()); result.insert(result.end(), userSeg.begin(), userSeg.end()); + if (stackSize != 0) { + auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/3); + result.insert(result.end(), dpSeg.begin(), dpSeg.end()); + } return result; } @@ -532,15 +634,23 @@ static void usage(const char *argv0) { std::fprintf(stderr, "usage: %s --input FLAT --map FILE --base ADDR --entry SYM\n" " --output OMF [--name NAME] [--expressload]\n" - " [--relocs FILE]\n" + " [--relocs FILE] [--stack-size BYTES]\n" " %s --manifest MFEST --output OMF\n" "\n" - " --expressload emit ExpressLoad-able OMF (required for boot\n" - " launchers under real GS/OS Loader).\n" - " --relocs FILE read IMM24 reloc list from link816's --reloc-out\n" - " sidecar; emit cRELOC (0xF5) opcodes after LCONST\n" - " so the Loader patches intra-segment 24-bit refs\n" - " (JSL/JML/STAlong/etc.) when placing the segment.\n", + " --expressload emit ExpressLoad-able OMF (required for boot\n" + " launchers under real GS/OS Loader).\n" + " --relocs FILE read IMM24 reloc list from link816's --reloc-out\n" + " sidecar; emit cRELOC (0xF5) opcodes after LCONST\n" + " so the Loader patches intra-segment 24-bit refs\n" + " (JSL/JML/STAlong/etc.) when placing the segment.\n" + " --stack-size N append a ~Direct DP/Stack segment (KIND=0x1012)\n" + " of N bytes. 
The Loader allocates a page-aligned\n" + " block of this size in bank 0 for combined DP +\n" + " stack use. N must be page-multiple (>= 256).\n" + " Default 0 (Loader uses its built-in 4KB default).\n" + " Implicitly enables --expressload (the GS/OS\n" + " Loader's slow path rejects multi-seg OMFs).\n" + " Not yet supported with --manifest.\n", argv0, argv0); std::exit(2); } @@ -553,6 +663,7 @@ int main(int argc, char **argv) { uint32_t base = 0; bool baseSet = false; bool expressload = false; + uint32_t stackSize = 0; int i = 1; while (i < argc) { @@ -566,10 +677,27 @@ int main(int argc, char **argv) { else if (a == "--output" || a == "-o") { if (++i >= argc) usage(argv[0]); output = argv[i++]; } else if (a == "--expressload") { expressload = true; i++; } else if (a == "--relocs") { if (++i >= argc) usage(argv[0]); relocFile = argv[i++]; } + else if (a == "--stack-size") { if (++i >= argc) usage(argv[0]); stackSize = parseInt(argv[i++]); } else if (a == "-h" || a == "--help") usage(argv[0]); else die("unknown option '" + a + "'"); } if (output.empty()) usage(argv[0]); + if (stackSize != 0) { + if (stackSize < 0x100) + die("--stack-size must be at least 256 bytes (1 page)"); + if (stackSize % 0x100 != 0) + die("--stack-size must be a multiple of 256 (page-aligned)"); + if (stackSize > 0xFFFF) + die("--stack-size cannot exceed 65535 bytes (one bank)"); + if (!manifest.empty()) + die("--stack-size with --manifest not yet supported"); + // Plain (non-ExpressLoad) multi-segment OMFs do not launch + // correctly under the GS/OS 6.0.2 Loader — verified empirically: + // the bare DP/Stack + code combo is rejected (program never + // executes), but ExpressLoad + DP/Stack works. Auto-enable + // ExpressLoad whenever --stack-size is requested. + expressload = true; + } // Load reloc list, if provided. // Sidecar v2 layout: u32 count + 12 bytes per entry @@ -659,8 +787,8 @@ int main(int argc, char **argv) { } auto blob = expressload - ? 
emitOmfExpressLoad(image, entryOff, name) - : emitOMF(image, entryOff, name); + ? emitOmfExpressLoad(image, entryOff, name, stackSize) + : emitOMF(image, entryOff, name, stackSize); std::ofstream f(output, std::ios::binary); if (!f) die("cannot open '" + output + "' for writing"); f.write(reinterpret_cast(blob.data()), blob.size()); diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp index 55bc33b..c1879e9 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp @@ -1358,6 +1358,52 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { } bool IsI32 = Op.getValueType() == MVT::i32; + + // Inline i32 shift-by-small-constant. The libcall path is ~300+ cyc; + // unrolling N i16 ops (N <= 4) plus carry propagation runs in ~20-80 + // cyc. popcount, BigInt-style code, and CRC routines all hit this. + // Larger N falls through to the libcall — the unrolled cost grows + // linearly while the libcall is constant. Cutoff chosen empirically: + // N=4 expands to ~32 i16 ops, comparable to the libcall's overhead. + // SRA needs an arithmetic-fill shift on the high half (i16 SRA by 1 + // is tablegen-supported); the low half is filled from the high's + // departing bit just like SRL. + if (IsI32) { + if (auto *C = dyn_cast(Amount)) { + uint64_t N = C->getZExtValue(); + unsigned Op0 = Op.getOpcode(); + if (N >= 1 && N <= 4 && + (Op0 == ISD::SHL || Op0 == ISD::SRL || Op0 == ISD::SRA)) { + SDLoc DL(Op); + SDValue X = Op.getOperand(0); + SDValue Lo = extractWide32Lo(DAG, DL, X); + SDValue Hi = extractWide32Hi(DAG, DL, X); + SDValue One = DAG.getConstant(1, DL, MVT::i16); + SDValue Fifteen = DAG.getConstant(15, DL, MVT::i16); + for (unsigned i = 0; i < N; i++) { + if (Op0 == ISD::SHL) { + // (Hi:Lo) << 1: carry = Lo bit15 → into Hi bit0. 
+ SDValue NewLo = DAG.getNode(ISD::SHL, DL, MVT::i16, Lo, One); + SDValue HiBit0 = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, Fifteen); + SDValue HiShl = DAG.getNode(ISD::SHL, DL, MVT::i16, Hi, One); + SDValue NewHi = DAG.getNode(ISD::OR, DL, MVT::i16, HiShl, HiBit0); + Lo = NewLo; Hi = NewHi; + } else { + // SRL/SRA: Hi shifts (logical or arithmetic), Lo gets the + // low bit of pre-shift Hi inserted at bit 15. + SDValue NewHi = DAG.getNode(Op0, DL, MVT::i16, Hi, One); + SDValue HiLow = DAG.getNode(ISD::AND, DL, MVT::i16, Hi, One); + SDValue LoTop = DAG.getNode(ISD::SHL, DL, MVT::i16, HiLow, Fifteen); + SDValue LoSrl = DAG.getNode(ISD::SRL, DL, MVT::i16, Lo, One); + SDValue NewLo = DAG.getNode(ISD::OR, DL, MVT::i16, LoSrl, LoTop); + Lo = NewLo; Hi = NewHi; + } + } + return buildWide32(DAG, DL, Lo, Hi); + } + } + } + RTLIB::Libcall LC; switch (Op.getOpcode()) { case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break; diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp index bec78e9..0485599 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -269,11 +269,18 @@ static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB, // Calls clobber A — be safe. if (MI.isCall()) return false; - // Any other instruction that defines StoredReg or stores to the - // slot invalidates the redundancy — bail. - if (MI.modifiesRegister(StoredReg, TRI)) + // STAfi has `Defs = [A]` in its tablegen def (a stale over- + // approximation from before the eliminateFrameIndex PHA-bracket + // landed for non-A sources). In reality the asm preserves A + // for every source class — A source is trivial, IMG/X/Y sources + // go through PHA/lda/sta/PLA which restores A. So a STAfi to + // a different slot is NOT an A-clobber and shouldn't break the + // load-after-store redundancy. 
STAfi to the SAME slot DOES + // invalidate (slot value changed), handled below. + bool IsStAFi = (MI.getOpcode() == W65816::STAfi); + if (!IsStAFi && MI.modifiesRegister(StoredReg, TRI)) return false; - if (MI.getOpcode() == W65816::STAfi && + if (IsStAFi && MI.getNumOperands() >= 2 && MI.getOperand(1).isFI() && MI.getOperand(1).getIndex() == StoredFI) return false; @@ -1240,8 +1247,13 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { } // Calls clobber A. if (MI.isCall()) break; - // Anything that writes A invalidates our held value. - if (MI.modifiesRegister(W65816::A, TRI)) break; + // STAfi PRESERVES A in the asm (A source: store-only; non-A + // source: PHA bracket round-trip). The pseudo declares + // Defs = [A] as a stale over-approximation, so we explicitly + // skip STAfi when checking for A-clobber. STAfi to slotX + // (same slot) DOES change M[slotX] — bail in that case below. + if (MI.getOpcode() != W65816::STAfi && + MI.modifiesRegister(W65816::A, TRI)) break; // STAfi to slotX would change M[slotX] — bail. if (MI.getOpcode() == W65816::STAfi && MI.getNumOperands() >= 2 && MI.getOperand(1).isFI() &&