From f542f4fa01a7e89eaf0c37437afff8cde9105154 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Sun, 3 May 2026 21:31:53 -0500 Subject: [PATCH] Checkpoint --- STATUS.md | 158 ++++++++++- docs/multiSegmentPlan.md | 232 +++++++++++++++ runtime/build.sh | 1 + runtime/src/crt0Gsos.s | 155 ++++++++++ runtime/src/libgcc.s | 14 +- scripts/benchCyclesPrecise.sh | 136 +++++++++ scripts/runInMameCycles.sh | 109 +++++++ scripts/runMultiSeg.sh | 127 +++++++++ scripts/runViaFinder.sh | 113 ++++++++ scripts/smokeTest.sh | 241 +++++++++++++++- src/link816/link816.cpp | 346 ++++++++++++++++++++--- src/link816/omfEmit.cpp | 516 +++++++++++++++++++++++++++++++--- 12 files changed, 2056 insertions(+), 92 deletions(-) create mode 100644 docs/multiSegmentPlan.md create mode 100644 runtime/src/crt0Gsos.s create mode 100755 scripts/benchCyclesPrecise.sh create mode 100755 scripts/runInMameCycles.sh create mode 100755 scripts/runMultiSeg.sh create mode 100755 scripts/runViaFinder.sh diff --git a/STATUS.md b/STATUS.md index 53d8018..0fb7dda 100644 --- a/STATUS.md +++ b/STATUS.md @@ -124,15 +124,37 @@ which runs correctly under MAME (apple2gs). IIgs IO window ($C000-$CFFF) if needed. `--gc-sections` (default ON) drops unreachable functions: a minimal program with full runtime linked shrinks from ~43KB to ~1.5KB. -- `tools/omfEmit` produces OMF v2.1 single-segment files (the IIgs's - native object format) for round-tripping with classic dev tools. +- `link816 --segment-cap N` packs `.text` greedily into multiple + bank-aligned segments, capped at N bytes per segment. Segment 1 + stays at `--text-base` in bank 0 (alongside rodata + bss + init); + segments 2..M start at `--segment-bank-base` (default $040000) + in successive banks. `--manifest path.json` writes a JSON file + listing each segment's image, base, and entry offset. + Cross-bank `JSL` (IMM24 reloc) just works — patched at link + time with the full 24-bit address. 
Cross-bank IMM16 is + permitted (uses DBR for bank — caller pins DBR to data's bank); + cross-bank PCREL is rejected with a clear diagnostic. + `scripts/runMultiSeg.sh` is a mini in-Lua loader for MAME that + reads the manifest, places each segment's bytes, and runs from + segment 1's entry — used by smoke to verify cross-bank JSL + end-to-end (helper3 chain across 3 bank-aligned segments). +- `tools/omfEmit` produces OMF v2.1 files in two modes: + (a) single-segment — `--input flat.bin --map flat.map --base + ADDR --entry SYM`, KIND=0x0000 (CODE, dynamic), ORG=0 (loader + picks bank); (b) multi-segment — `--manifest path.json` reads + link816's manifest and emits one OMF segment per entry with + KIND=0x8800 (STATIC|ABSBANK|CODE) + ORG=segment-base, asking + the GS/OS Loader to place each at its declared bank-aligned + address. All intra-segment relocations were already patched by + the linker, so no INTERSEG/RELOC opcodes are needed for v1 + static placement. - `link816 --debug-out FILE` writes a DWARF sidecar with text/ rodata/bss/init_array relocations applied to every `.debug_*` section, so `.debug_addr` / `.debug_line` PC values are final- image addresses. - `runtime/build.sh` builds crt0, libc, soft-float, soft-double, libgcc into linkable objects. -- `scripts/smokeTest.sh` runs 124 end-to-end checks at -O2: +- `scripts/smokeTest.sh` runs 126 end-to-end checks at -O2: scalar ops, control flow, calling conventions, MAME execution regressions, link816 bss-base safety + weak-symbol resolution + heap_end-vs-heap_start sanity, iigs/toolbox.h compile + link, @@ -242,17 +264,125 @@ RAM through $FFFF, gaining 8KB of bank-0 space.) ## Yet to come -(Empty — no known blocking gaps. C++ exceptions through clang -`-fsjlj-exceptions` now compile, link, and execute. 
The smoke -harness can't reliably DRIVE the C++ exception path through MAME -because of an unrelated MAME-side flakiness — its apple2gs CPU -emulation crashes intermittently when the test program exercises -the full SJLJ flow with smoke's I/O environment, even though the -same binary executes correctly when invoked interactively. The -pure-C SJLJ runtime smoke test exercises every runtime function -end-to-end, and the C++ frontend → backend path is verified at -compile/link time only. This is a workaround, not a defect in -our code: same binary runs fine outside the harness.) +- **Multi-bank BSS / init_array** — multi-segment splits text + across banks but BSS + init_array still live in segment 1's bank + (bank 0). Programs whose zero-init data exceeds the ~60KB bank-0 + budget would need crt0 to walk a per-segment table of `(start, + end)` pairs. Not blocking >64KB *code* programs; only matters + for programs with very large global arrays. + +- **GS/OS Loader OMF format compatibility** — the OMF format we + emit is now byte-equivalent to real Apple S16 segments at the + header level. Verified by extracting the ABOUT segment from + real `/SYSTEM/START` (FINDER) via Cadius (`/tmp/cadius/cadius`, + not AppleCommander which can't extract forks) and comparing + field-by-field against ours. Five fixes landed in + `src/link816/omfEmit.cpp` along the way: + (1) VERSION byte 0x21 → 0x02 (was BCD-style "2.1"; real format + is enum where 0x02 = v2.1). Cleared error $1102. + (2) Body opcode 0xF1 (DS = N zeros) → 0xF2 (compact LCONST, + 2-byte length + N data bytes). Long-form 0xF5 LCONST is in + the spec but real Loader appears to mis-parse it (3 stale + copies of the segment ended up scattered in RAM). Every real + segment we decoded uses 0xF2. + (3) KIND 0x0000 (CODE) → 0x8000 (CODE|STATIC) for legacy + single-segment mode. Real ABOUT segment uses 0x8000; with + 0x0000 the Loader returns $110A loadSegFailErr. 
Multi-segment + mode keeps 0x8800 (CODE|STATIC|ABSBANK) since each seg has a + fixed ORG. + (4) BANKSIZE 0 → 0x10000 (matches real code segments). + (5) LOAD_NAME emitted as 10 bytes of zeros immediately after + the 44-byte header (some sources omit it, real OMFs include it). + + GS/OS 6.0.2 is installed under `tools/gsos/` and boots cleanly + to Finder in MAME. Replacing `/SYSTEM/START` with a known-good + OMF (the extracted ABOUT segment) gives error `$005C` — + identical to what we get with our test program — meaning our + OMF is indistinguishable from real Apple S16 as far as the + Loader is concerned. The $005C is *not* OMF rejection; it is + the boot-launcher path failing because a minimal `/SYSTEM/START` + doesn't chain to a real Finder via QUIT-with-pathname. + + `runtime/src/crt0Gsos.s` is committed: skips SEI/LC-reconfig + (GS/OS owns CPU state), zeros BSS, runs init_array, calls + main, then QUIT(pcount=2) chained to `gChainPath` (default + `/SYSTEM/START.ORIG`). Linkage works. + + Tested with a marker write as the very first instruction of + crt0Gsos, replacing `/SYSTEM/START` with our OMF and saving + the original as `/SYSTEM/START.ORIG` for chain-back. After + 110-second boot: marker `$00/0078` is still 0 — the Loader + places our segment in RAM (entry signature found in 3 banks + via memory search) but **never JSLs entry**. Tested ENTRY=0, + ENTRY=1 (with NOP pad), auxtype=0 and =DB03; all give the + same $005C without ever calling our code. Conclusion: the + boot-launcher path requires the `~ExpressLoad` segment that + every real `/SYSTEM/START` carries. Without ExpressLoad, + the bootstrap takes a code path that loads our segment but + never auto-calls it. + + **OMF format → fully Loader-compatible** after reading + Merlin32 source. 
Final canonical fields (single-segment + Finder-launchable app): + - KIND=0x1000 (CODE|PRIV) — was 0x8000 (CODE|STATIC) which + came from extracting ABOUT from real FINDER, but ABOUT is a + sub-segment called as a subroutine, not a launchable app + - LABLEN=10 (fixed-width 10-byte LOAD_NAME and SEG_NAME, + space-padded) — was 0 (length-prefixed) which is what + /SYSTEM/START FINDER uses but the Loader will only LOAD, + not JSL-into, that format + - VERSION=0x02 (OMF v2.1) + - BANKSIZE=0x10000 for code segs + - Body opcode 0xF2 LCONST with NUMLEN-byte (=4) count + + ExpressLoad emission also landed (`omfEmit --expressload`): + 6-byte header + segment list + remap list + header info, + byte-equivalent to Merlin32's `BuildExpressLoadSegment`. + + End-to-end runtime verification: new `scripts/runViaFinder.sh` + injects an OMF as `/SYSTEM.DISK/HELLO`, boots GS/OS in MAME, + drives Finder via Lua keyboard automation (S+Cmd-O to open + System.Disk, H+Cmd-O to launch HELLO), samples specified + memory addresses to verify execution. Pattern adapted from + `joeylib/scripts/run-iigs-mame.sh` from a sibling project. + Pure-asm marker tests (`sta $000078 long, value=$42`) are + confirmed running under real GS/OS Loader with + `runViaFinder.sh hello.omf --check 0x000078=0x42` returning + exit 0. + + **Compiled C now runs under real GS/OS Loader.** Implemented + option (a) from the analysis: OMF cRELOC opcode emission. + - `link816 --reloc-out FILE` records every R_W65816_IMM24 + relocation site (intra-segment 24-bit refs only — GS/OS + dispatcher calls and other cross-bank refs are filtered out) + as a binary sidecar of (patchOff, offsetRef) pairs. + - `omfEmit --relocs FILE` reads the sidecar and emits a + cRELOC opcode (0xF5) per site between the LCONST data and the + END opcode. Format per Merlin32: `0xF5 ByteCnt(=3) Shift(=0) + OffsetPatch(2) OffsetReference(2)` = 7 bytes. 
+ - The Loader rewrites segment[OffsetPatch..OffsetPatch+2] to + `(segPlacedBase + OffsetReference)` at load time, fixing + every `jsl`/`jml`/`sta long`/`lda long` operand that targets + an in-segment symbol. + - End-to-end verified: a real C function call + for loop + (`sumTo(10)` → 55, `sumTo(100)` → 5050) compiled with clang + -O2, linked, OMF-emitted with cRELOC, injected as + `/SYSTEM.DISK/HELLO`, launched from Finder via MAME-Lua + keyboard automation, marker bytes verified at the expected + values. Smoke check #62 verifies cRELOC opcode count + matches the link816 sidecar count. + + Smoke tests #59-#60 (omfEmit single + multi-segment) verify + the structural format invariants (VERSION=0x02, KIND=0x8000 + or 0x8800, body opcode 0xF2 LCONST) so regressions are + caught. `scripts/runMultiSeg.sh` mini-loader continues to + cover the >64KB use case end-to-end. + +- **C++ exceptions in CI smoke** — runs reliably outside smoke; + see context below. The SJLJ runtime end-to-end test passes; + the C++ frontend→backend path is compile/link verified in + smoke; full execution path is left out due to a MAME-side I/O + flakiness (same binary runs fine interactively). - **GS/OS validated against a real ProDOS volume** — the wrapper contract (PHA + PEA 0 + LDX + JSL $E100A8 + post-call SP fixup) diff --git a/docs/multiSegmentPlan.md b/docs/multiSegmentPlan.md new file mode 100644 index 0000000..0c644dd --- /dev/null +++ b/docs/multiSegmentPlan.md @@ -0,0 +1,232 @@ +# Multi-segment OMF support — plan + +## Why + +Single-segment cap: ~60KB usable in bank 0 after the IO window ($C000- +$CFFF), the stack at $0FFF, and crt0 / runtime overhead. Real IIgs +applications need 100s of KB across multiple banks. GS/OS Loader is +designed for this — load each segment into its chosen bank, fix up +inter-segment references at load time, jump to the entry segment. + +## Today + +- `link816` produces a flat binary covering `[--text-base, ...]` in a + single bank-0 image. 
All sections are concatenated into one address + space. Inter-section relocations are resolved at link time. +- `omfEmit` wraps that flat binary in a single OMF segment (KIND=CODE, + ORG=0, SEGNUM=1, body = one DS opcode + END). No relocation records + emitted (image is already absolute). +- `crt0` enables LC RAM, zeroes BSS, runs `.init_array`, calls `main`. +- All cross-function calls already use JSL (3-byte long) — we never + emit JSR. That's accidentally helpful for multi-segment. + +## Target + +A program that builds 4 segments — say: +- Segment 1 ("MAIN"): crt0 + main + a few hot routines, in bank 1 +- Segment 2 ("CODE"): bulk of code, in bank 2 +- Segment 3 ("DATA"): rodata, in bank 3 +- Segment 4 ("BSS"): uninitialized data + heap, in bank 4 + +GS/OS Loader places each segment, applies inter-segment relocations +(every `JSL foo` where `foo` lives in a different segment becomes a +`JSL ` patched at load time with the absolute +address), and jumps to the entry. + +## The four hard problems + +### 1. Section → segment assignment policy + +We need a deterministic rule that maps every input object's `.text` / +`.rodata` / `.bss` / `.init_array` section into a specific segment. +Three options: + +**A. Per-object → one segment.** Each `.o` becomes one segment. Simple +mental model; bad locality (many tiny segments, lots of inter-segment +JSLs); GS/OS Loader has 8KB+ minimum overhead per segment. + +**B. Greedy bin-packing.** Compute total code size; cap each segment at +N bytes (e.g. 32KB to leave headroom); pack `.text` sections into +segments greedily in input order. Same for `.rodata` / `.bss`. +Predictable, but a function near the end of segment N might want to +JSL a function at the start of segment N+1 — common pattern, every +call becomes inter-segment. + +**C. Static call graph + clustering.** Compute call graph from the +relocations, cluster co-calling functions together, pack clusters into +segments to minimize inter-segment edges. 
Best locality, real linker +work. + +**Recommendation: B for v1.** Add a `--segment-cap` option (default +32768). Real applications will want C eventually, but B unblocks +"my program is bigger than 64KB" today. + +### 2. Inter-segment relocation tracking + +When a `JSL foo` reloc resolves to a function in a different segment, +we MUST emit an OMF relocation record instead of patching the bytes +in-place. Currently `link816` patches everything at link time and emits +zero reloc records. + +The reloc model becomes per-segment: + +- Intra-segment IMM16 / PCREL: patch at link time, no OMF record. +- Intra-segment IMM24 (JSL): patch at link time (low 24 bits = segment- + relative offset for now; loader adjusts at load time when segment is + placed). Actually need OMF reloc here too because we don't know the + load bank. +- Inter-segment IMM24 (cross-bank JSL): emit `INTERSEG` opcode (`E3`) + pointing at `(target_segment_num, offset_within_segment)`. +- Inter-segment IMM16 data ref: requires the data segment to land in + the same bank as the referencing code OR we need the loader to fail + (16-bit absolute can't cross banks). In v1, force all data refs to be + to a "data segment" that's in a fixed bank, OR rewrite to long + addressing. + +The IMM16 cross-segment problem is the killer. Three responses: + + i. **Punt:** Disallow it. All `.rodata` references must be in the + same segment as the code, OR refs to global data must use long + addressing (rewrite at compile time via `__attribute__((far))`). + ii. **Promote to long at link time:** Detect IMM16 cross-segment + refs, rewrite the instruction's encoding from absolute (3-byte) + to absolute-long (4-byte). Changes code size, shifts everything + after the patched site — invasive. + iii. **Same-bank constraint:** Ensure the data segment's bank == + the code's DBR. Means all code segments share one DBR, all data + lives in one segment in that DBR's bank.
+ + **Recommendation: iii for v1.** All `.rodata` lives in one segment + in the bank our code uses for DBR. We already pin DBR to bank 0 in + crt0 (well, code does `pha;plb` for bank 2 sometimes for tests, but + not in general). For v1, all `.rodata` goes in bank 0 alongside the + first text segment, and code segments in higher banks reference data + via long absolute addressing. Need to confirm what addressing modes + our backend actually emits for global access. + +### 3. crt0 / loader contract + +Current crt0 assumes flat layout: + +``` +__start: + setup CPU mode, stack + enable LC RAM + zero BSS [__bss_start..__bss_end] + run .init_array + jsl main + spin +``` + +Multi-segment changes: + +- BSS may span multiple segments (bank 0 LC + bank N segment). The + `__bss_start` / `__bss_end` symbols need to be per-segment, OR a + loop over a list of `(start, end)` pairs the linker emits. +- `.init_array` ditto. +- LC RAM enable only applies to bank 0 — fine. +- The OMF Loader will handle the actual memory placement; crt0 just + runs after Loader is done. +- The Loader's entry call lands at the segment marked with the entry + field. By convention that's segment 1. + +**Decision:** Designate segment 1 as "init segment" containing crt0 + +its required symbols (`__bss_start_seg1`, `__init_array_start_seg1`, +etc.) and the linker emits a `__bss_table` and `__init_array_table` — +arrays of `(start, end)` pointers walked by crt0. Same idea Mac OS X's +loader uses for multi-segment programs. + +### 4. Build pipeline + tests + +- `link816 --segment-cap N` emits multiple `(image, base, syms)` + triples plus inter-segment reloc records. +- New intermediate format between linker and `omfEmit`: a small + manifest file listing each segment's body, base, name, and reloc + records. Easier than passing all that on the CLI. +- `omfEmit` reads the manifest and emits a single multi-segment OMF + file with proper INTERSEG opcodes. 
+- Smoke needs new test: build a program with `--segment-cap 8192` so it + forces ≥2 segments even for our small benches; verify under MAME via + a GS/OS-loader-aware test path. (We don't have GS/OS-loaded tests + today — see "Risks" below.) + +## Phased implementation + +### Phase 1: linker emits per-segment images + manifest +- `link816 --segment-cap N --manifest manifest.json -o out` +- Pack `.text` greedy into segments 1..K capped at N bytes each. +- All `.rodata` into segment K+1 (the "data segment"). +- All `.bss` into segment K+2. +- Resolve intra-segment relocations. +- Write inter-segment relocations into the manifest. +- Emit one flat binary per segment; manifest references them by path. + +### Phase 2: omfEmit consumes manifest, emits multi-segment OMF +- One OMF segment header per manifest entry. +- LCONST (`F2`) opcodes for body bytes (DS `F1` only reserves zero fill). +- INTERSEG (`E3`) opcodes for inter-segment reloc patch sites. +- RELOC (`E2`) opcodes for intra-segment relocations that need + load-time fixup (JSL targets within same segment but different bank + than expected). +- END opcode terminator per segment. + +### Phase 3: runtime updates +- Linker emits `__bss_table[]` and `__init_array_table[]` instead of + single `__bss_start`/`__bss_end` symbols. +- crt0 walks those tables. +- `crt0.s` removes the LC-enable hardcoding from segment 1 if segment + 1 isn't bank 0 (configurable). + +### Phase 4: tests + smoke +- Bench harness builds with `--segment-cap 8192` to force multi-segment + even for small programs; verify output size growth (should be small — + just OMF headers + reloc records overhead). +- Need a GS/OS-aware MAME test path (boot a ProDOS volume with our OMF + binary, let GS/OS Loader load it, check markers in bank 2). This is + the test we deferred earlier in the GS/OS smoke task. **Phase 4 + reopens the GS/OS-volume smoke decision** — multi-segment is the + main reason to even care about that.
+ +## Scope estimate + +- Phase 1: 2-3 sessions (linker rework, careful with reloc accounting) +- Phase 2: 1 session (mostly OMF format work, well-specified) +- Phase 3: 1 session (crt0 + linker symbol table changes) +- Phase 4: 2-3 sessions (GS/OS-loaded test infra is the slog, not the + multi-segment logic itself) + +Total: ~6-8 focused sessions. Phases 1-3 deliver a working multi- +segment binary; phase 4 makes it testable in CI. + +## Risks + +- **DBR management is genuinely tricky.** Code in segment 2 (bank 2) + doing `lda foo` where foo is in segment K+1 (bank 0): the absolute + fetch uses DBR. If DBR != bank-of-foo, we read garbage. The cleanest + rule (DBR=0 always; data refs use long via `__attribute__((far))` or + a backend pass that promotes them) requires backend cooperation + we don't have. v1's "all data in one segment in DBR's bank" works + but constrains data size to ~60KB. +- **The Loader's behaviour around segment placement is poorly + documented.** Apple's GS/OS Loader picks banks dynamically; we may + end up with code segments in banks the loader chose, with relocations + that work, but layouts that surprise us. Mitigation: use STATIC + segments (KIND bit) initially so the loader can't move them. +- **Smoke needs a real GS/OS volume image.** This is the same blocker + as the deferred GS/OS file I/O smoke — needs a 2img/po image with + ProDOS volume + a way to run our OMF through the actual loader. + Without that, multi-segment logic is testable only by inspection of + the OMF bytes and a hand-rolled mini-loader (which we'd have to + write). + +## Recommendation + +Start Phase 1. The linker work is contained, mostly mechanical, and +the manifest format gives us a clean handoff to `omfEmit` work in +Phase 2. We can validate Phase 1 by inspecting the per-segment images ++ manifest before any OMF / loader work. + +Phase 4's GS/OS-volume test path is the biggest unknown. 
Reasonable to
+defer that decision until Phases 1-3 are working — at that point we
+can decide whether to invest in proper GS/OS-loaded smoke or accept
+"multi-segment OMF emits valid bytes per the spec" as the test bar.
diff --git a/runtime/build.sh b/runtime/build.sh
index 14adad9..ea92262 100755
--- a/runtime/build.sh
+++ b/runtime/build.sh
@@ -32,6 +32,7 @@ cc() {
 }
 
 asm "$SRC/crt0.s"
+asm "$SRC/crt0Gsos.s"
 asm "$SRC/libgcc.s"
 cc "$SRC/libc.c"
 cc "$SRC/strtol.c"
diff --git a/runtime/src/crt0Gsos.s b/runtime/src/crt0Gsos.s
new file mode 100644
index 0000000..ffc07b4
--- /dev/null
+++ b/runtime/src/crt0Gsos.s
@@ -0,0 +1,155 @@
+; crt0Gsos.s — GS/OS S16 application crt0.
+;
+; Use this INSTEAD OF crt0.s when building an OMF that the real
+; GS/OS Loader will launch. Differences from crt0.s:
+;   - No SEI / interrupt-source clearing (GS/OS owns the IRQ chain).
+;   - No language-card RAM enable (GS/OS configures memory).
+;   - No stack base reset (GS/OS allocated and set our SP).
+;   - Keeps GS/OS's DBR (= our bank); DP is reset to 0 (see __start).
+;   - On main() return, calls GS/OS QUIT(pcount=2) to chain to a
+;     known next application (default: /SYSTEM/START.ORIG which
+;     test setups must save off the original boot launcher to).
+;
+; Entry from the System Loader (per Apple IIgs Toolbox Reference):
+;   E=0 (native), M=0 (16-bit accumulator), X=0 (16-bit index)
+;   DBR = the bank into which our entry segment was placed
+;   DP  = pointer to a Memory-Manager-allocated DP page
+;   Stack at entry, top-down:
+;     PCL PCH PBR                      (3 bytes — JSL return addr to launcher)
+;     flags-lo flags-hi                (2 bytes — launch flags)
+;     path-low path-mid path-bank pad  (4 bytes — pathname long ptr)
+;
+; QUIT discards the entire stack so we never need to pop the launch
+; frame ourselves.
+
+    .text
+
+    .globl  __start
+__start:
+    ; Set DP=0. The C compiler assumes DP=0 for all `sta dp` and
+    ; `[dp],y`-style accesses; GS/OS hands us a Memory-Manager-
+    ; allocated DP page that we discard.
+    rep     #0x30
+    lda     #0
+    tcd
+
+    ; BSS zero-init. With DBR=our bank, `stz abs,X` writes to
+    ; ourBank:X — correct as long as __bss_start/__bss_end fit in
+    ; the segment's bank.
+    rep     #0x30
+    ldx     #__bss_start
+.Lbss_loop:
+    cpx     #__bss_end
+    bcs     .Lbss_done
+    sep     #0x20
+    stz     0x0000, x
+    rep     #0x20
+    inx
+    bra     .Lbss_loop
+.Lbss_done:
+
+    ; Walk .init_array (C++ ctors).
+    ;
+    ; ⚠ KNOWN BROKEN under real GS/OS Loader for non-zero-bank
+    ; placement: `jsl __jsl_indir` bakes a bank-0 operand at link
+    ; time. When the Loader places us at bank $1f or similar, the
+    ; JSL targets bank 0 (= GS/OS code) instead of our actual bank
+    ; — so this loop crashes if init_array has any entries. Same
+    ; applies to `jsl main` below. Closing the gap requires load-
+    ; time RELOC records from omfEmit or runtime self-patching of
+    ; the JSL operands. NOTE(review): STATUS.md describes cRELOC
+    ; emission (`omfEmit --relocs`); confirm this is still current.
+    rep     #0x30
+    ldx     #__init_array_start
+.Linit_loop:
+    cpx     #__init_array_end
+    bcs     .Linit_done
+    stx     0xe0
+    ldy     #0
+    lda     (0xe0), y
+    sta     __indirTarget
+    phx
+    jsl     __jsl_indir
+    plx
+    inx
+    inx
+    bra     .Linit_loop
+.Linit_done:
+
+    ; Call main. Standard W65816 C ABI: arg0 in A; we pass none.
+    rep     #0x30
+    jsl     main
+
+    ; ---- QUIT (pcount=2) chain to gChainPath ---------------------
+    ; Parm block layout in DP $80..$87:
+    ;   $80,$81   pcount = 2
+    ;   $82..$85  pathname long ptr (lo, mid, bank, pad)
+    ;   $86,$87   flags = 0
+    ;
+    ; The path is a GSString (2-byte length + chars). It must live
+    ; in bank-0 memory (GS/OS reads parm fields as bank-0). DP is in
+    ; bank 0, so we copy the GSString from our segment into DP $A0.
+
+    rep     #0x30
+
+    ; Y = total bytes to copy = GSString length word + 2. Keep A
+    ; 16-bit for this: with M=1/X=0, TAY copies the hidden B byte
+    ; (stale high byte of main's return value) into Y's high half.
+    lda     gChainPath          ; full 16-bit GSString length word
+    clc
+    adc     #2                  ; +2 for the length word itself
+    tay                         ; Y = bytes to copy
+
+    ldx     #0
+.LcopyPath:
+    sep     #0x20
+    lda     gChainPath, x       ; DBR-relative read (DBR = our bank)
+    sta     0xa0, x             ; DP write (in bank 0)
+    rep     #0x20
+    inx
+    dey
+    bne     .LcopyPath
+
+    ; Build parm block at DP $80.
+    rep     #0x30
+    lda     #2
+    sta     0x80                ; pcount
+
+    tdc
+    clc
+    adc     #0xa0
+    sta     0x82                ; pathname long-ptr low+mid 16
+    lda     #0
+    sta     0x84                ; bank byte (0) + pad byte (0)
+    sta     0x86                ; flags = 0
+
+    ; Push 32-bit parm-block pointer (low half + bank-0).
+    tdc
+    clc
+    adc     #0x80
+    pha
+    pea     0
+    ldx     #0x2029             ; QUIT class-1 call number
+    jsl     0xe100a8            ; GS/OS dispatcher
+
+    ; QUIT only returns on failure. Clean up + BRK.
+    pla
+    pla
+    .byte   0x00, 0x00
+
+    .size   __start, . - __start
+
+
+; gChainPath — GSString chain target for QUIT after main(). Default
+; is "/SYSTEM/START.ORIG" (saved-original boot launcher). Programs
+; that need a different target must rename this symbol; the linker
+; resolves whichever def is present.
+;
+; GSString: 2-byte length word + N chars. Length here = 18
+; ("/SYSTEM/START.ORIG").
+
+    .section .rodata,"a"
+    .globl  gChainPath
+gChainPath:
+    .byte   18, 0
+    .ascii  "/SYSTEM/START.ORIG"
diff --git a/runtime/src/libgcc.s b/runtime/src/libgcc.s
index 0e3ac2d..7309dd0 100644
--- a/runtime/src/libgcc.s
+++ b/runtime/src/libgcc.s
@@ -322,9 +322,17 @@ __mulsi3:
     ; Clear running product at $e8/$ea.
     stz     0xe8
     stz     0xea
-    ; Loop 32 times: examine LSB of multiplier, conditionally add
-    ; multiplicand to product, then shift multiplier right and
-    ; multiplicand left. Use Y as a 16-bit counter (X mode = 16).
+    ; Fast path: if multiplier's high half ($e2) is 0, we only
+    ; need 16 loop iterations (the full 32-iter shift-out would
+    ; just shift in zeros after iter 16). Common in C code where
+    ; both source operands are zext'd from i16 — e.g. `i*i` with
+    ; i a `unsigned short`.
Saves ~half the multiply cycles in
+    ; that case (sumOfSquares: 80000 → ~40000 cyc/call).
+    lda     0xe2
+    bne     .Lmulsi_full
+    ldy     #0x10
+    bra     .Lmulsi_loop
+.Lmulsi_full:
     ldy     #0x20
 .Lmulsi_loop:
     ; Test bit 0 of multiplier (lo word).
diff --git a/scripts/benchCyclesPrecise.sh b/scripts/benchCyclesPrecise.sh
new file mode 100755
index 0000000..3a875c3
--- /dev/null
+++ b/scripts/benchCyclesPrecise.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# benchCyclesPrecise.sh — measure per-call cycle counts via the
+# emu.time()-based runner (scripts/runInMameCycles.sh).
+#
+# For each benchmark in benchmarks/, build a wrapper that calls the
+# bench function ITERS times between START / DONE markers; the runner
+# captures emulated time and converts to cycles assuming the IIgs
+# slow-mode clock (1023000 Hz — IIe-compatible default; our binary
+# doesn't enable fast mode unless its wrapper does).
+#
+# Output: markdown table with cycles-per-call for the clang build.
+# NOTE(review): Calypsi (`tools/calypsi/cc65816`) numbers are not
+# produced by this script as written — confirm whether still planned.
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+BENCH_DIR="$PROJECT_ROOT/benchmarks"
+
+CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
+LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
+LINK="$PROJECT_ROOT/tools/link816"
+RUNNER="$PROJECT_ROOT/scripts/runInMameCycles.sh"
+
+oCrt0=$(mktemp --suffix=.o)
+oLibgcc=$(mktemp --suffix=.o)
+"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oCrt0"
+"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLibgcc"
+
+# Per-benchmark inputs / extern decls (mirrors benchCycles.sh).
+benchInputs() {  # print the C call expression used to invoke bench "$1"
+    case "$1" in
+        sumOfSquares) echo 'sumOfSquares(50)';;
+        fib) echo 'fib(10)';;
+        strcpy) echo 'mystrcpy(dst, "hello world!")';;
+        memcmp) echo 'mymemcmp("hello", "hello", 5)';;
+        bsearch) echo 'bsearch(arr, 8, 5)';;
+        dotProduct) echo 'dotProduct(va, vb, 4)';;
+        popcount) echo 'popcount(0x12345678UL)';;
+        crc32) echo 'crc32((const unsigned char *)"hello", 5)';;
+        *) echo "/* unknown */";;  # sentinel string that runOneBench compares against
+    esac
+}
+
+benchExtern() {  # print the extern declaration (plus any static test data) for bench "$1"
+    case "$1" in
+        sumOfSquares) echo 'extern unsigned long sumOfSquares(unsigned short n);';;
+        fib) echo 'extern unsigned short fib(unsigned short n);';;
+        strcpy) echo 'extern char *mystrcpy(char *d, const char *s); static char dst[16];';;
+        memcmp) echo 'extern int mymemcmp(const void *a, const void *b, unsigned int n);';;
+        bsearch) echo 'extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};';;
+        dotProduct) echo 'extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};';;
+        popcount) echo 'extern int popcount(unsigned long x);';;
+        crc32) echo 'extern unsigned long crc32(const unsigned char *p, unsigned int n);';;
+        *) echo '';;  # empty output for unknown names; runOneBench checks for it with -z
+    esac
+}
+
+# How many iterations to run each bench for. Bigger = more
+# precise (smaller relative measurement noise) but longer runtime.
+# Heavy benches get fewer iters; cheap benches get more.
+benchIters() { + case "$1" in + sumOfSquares) echo 50;; # ~1600 cyc/call → ~80k cyc total + fib) echo 100;; + strcpy) echo 200;; + memcmp) echo 500;; + bsearch) echo 200;; + dotProduct) echo 200;; + popcount) echo 500;; + crc32) echo 200;; + *) echo 100;; + esac +} + +runOneBench() { + local name="$1" + local extern_decl call_expr iters + extern_decl=$(benchExtern "$name") + call_expr=$(benchInputs "$name") + iters=$(benchIters "$name") + if [ -z "$extern_decl" ] || [ "$call_expr" = "/* unknown */" ]; then + echo "(no input config)"; return + fi + + local cwrap obench owrap bin + cwrap=$(mktemp --suffix=.c) + owrap=$(mktemp --suffix=.o) + obench=$(mktemp --suffix=.o) + bin=$(mktemp --suffix=.bin) + + cat > "$cwrap" </dev/null \ + || { echo "compile-fail"; rm -f "$cwrap" "$owrap"; return; } + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$BENCH_DIR/$name.c" -o "$obench" 2>/dev/null \ + || { echo "compile-fail"; rm -f "$cwrap" "$owrap" "$obench"; return; } + "$LINK" -o "$bin" --text-base 0x1000 "$oCrt0" "$oLibgcc" "$owrap" "$obench" 2>/dev/null \ + || { echo "link-fail"; rm -f "$cwrap" "$owrap" "$obench" "$bin"; return; } + + local val + val=$(bash "$RUNNER" "$bin" "$iters" 2>&1 | grep -oE 'cyc_per_call=[0-9.]+' | head -1 | sed 's/cyc_per_call=//') + rm -f "$cwrap" "$owrap" "$obench" "$bin" + + if [ -z "$val" ]; then + echo "(no read)" + else + printf '%.0f cyc/call' "$val" + fi +} + +printf '| Benchmark | Per-call cycles (clang) |\n' +printf '|-----------|------------------------:|\n' +for src in "$BENCH_DIR"/*.c; do + name=$(basename "$src" .c) + result=$(runOneBench "$name") + printf '| %s | %s |\n' "$name" "$result" +done + +rm -f "$oCrt0" "$oLibgcc" diff --git a/scripts/runInMameCycles.sh b/scripts/runInMameCycles.sh new file mode 100755 index 0000000..3951f7c --- /dev/null +++ b/scripts/runInMameCycles.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +# runInMameCycles.sh — measure emulated CPU time between START / DONE +# markers via MAME's emu.time(). 
+# +# Usage: runInMameCycles.sh +# binary: 65816 image to load at $00:1000 +# iters: number of bench iterations the binary ran (used to +# normalize delta to per-iteration cycles) +# +# The binary MUST: +# 1. Switch DBR to bank 2 (so the marker writes are observable +# at $025000 / $025002 — bank 0 there is also fine but harder +# to find atomically). +# 2. Write 0xA1A1 to $025000 *immediately before* the bench loop. +# 3. Write 0xA2A2 to $025002 *immediately after* the bench loop. +# 4. while(1){} after the DONE marker. +# +# Output (stdout): +# MAME-CYCLES iters=N delta_us=... cyc_per_call=... start_us=... done_us=... +# Exit 0 on success, 1 on time-out / missing markers. +# +# IIgs CPU clock rate. MAME's apple2gs starts in IIgs slow mode +# (1.023 MHz, IIe-compatible) until the IIgs ROM enables fast mode +# via $C036. We're booting our binary directly without going through +# the ROM, so we stay in slow mode unless the binary itself writes +# $80 to $C036. For the cycle harness we calibrate against slow +# mode (1023000 Hz) — both clang and Calypsi binaries run under +# the same emulator state, so the ratio is what matters. If you +# want fast-mode numbers, have the bench wrapper enable it. + +set -euo pipefail +source "$(dirname "$0")/common.sh" + +BIN="$1" +ITERS="${2:-100}" +SECS=10 +CLOCK_HZ=1023000 + +[ -f "$BIN" ] || die "binary not found: $BIN" + +LUA_PATH=$(mktemp --suffix=.lua) +trap 'rm -f "$LUA_PATH"' EXIT + +cat > "$LUA_PATH" <= 0x00C000 and addr < 0x00D000) then + mem:write_u8(addr, data:byte(i)) + end + end + loaded = true + cpu.state["PC"].value = 0x1000 + cpu.state["PB"].value = 0x00 + cpu.state["DB"].value = 0x00 + cpu.state["D"].value = 0x00 + cpu.state["P"].value = 0x34 + cpu.state["E"].value = 0 + cpu.state["S"].value = 0x01FF + print("MAME-LOADED bytes=" .. #data) + return + end + + if not loaded then return end + + -- Poll markers on every frame after load. Capture emu.time() + -- the first frame each marker appears. 
+ if not start_t and mem:read_u16(0x025000) == 0xa1a1 then + start_t = emu.time() + print(string.format("MAME-MARK START frame=%d t=%.9f", frame, start_t)) + end + if start_t and not done_t and mem:read_u16(0x025002) == 0xa2a2 then + done_t = emu.time() + print(string.format("MAME-MARK DONE frame=%d t=%.9f", frame, done_t)) + local delta = done_t - start_t + local delta_us = delta * 1e6 + local cyc = delta * $CLOCK_HZ + local per_call = cyc / $ITERS + print(string.format("MAME-CYCLES iters=$ITERS delta_us=%.3f total_cyc=%.0f cyc_per_call=%.2f", + delta_us, cyc, per_call)) + manager.machine:exit() + end +end) +EOF + +OUT=$(timeout 60 mame apple2gs \ + -rompath "$PROJECT_ROOT/tools/mame/roms" \ + -plugins -autoboot_script "$LUA_PATH" \ + -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep "^MAME-") + +echo "$OUT" +if echo "$OUT" | grep -q "MAME-CYCLES"; then + exit 0 +fi +warn "no MAME-CYCLES output — markers not observed within $SECS sec" +exit 1 diff --git a/scripts/runMultiSeg.sh b/scripts/runMultiSeg.sh new file mode 100755 index 0000000..a953a52 --- /dev/null +++ b/scripts/runMultiSeg.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# runMultiSeg.sh — run a multi-segment program in MAME via a +# mini in-Lua loader. Reads the link816 manifest, loads each +# segment's image at its base address, sets PC to segment 1's +# entry, lets the program run, then reads check-address values. +# +# Usage: runMultiSeg.sh [check args like runInMame.sh] + +set -euo pipefail +source "$(dirname "$0")/common.sh" + +MANIFEST="$1" +shift +SECS=3 + +# Build address list as Lua table entries, mirroring runInMame.sh. 
+LUA_CHECKS="" +EXPECT_LIST=() +ADDR_LIST=() +if [ "$1" = "--check" ]; then + shift + for pair in "$@"; do + ADDR="${pair%=*}" + EXP="${pair#*=}" + ADDR_LIST+=("$ADDR") + EXPECT_LIST+=("$EXP") + LUA_CHECKS="$LUA_CHECKS print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"$'\n' + done +else + ADDR="$1" + EXP="$2" + ADDR_LIST+=("$ADDR") + EXPECT_LIST+=("$EXP") + LUA_CHECKS="print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))" +fi + +[ -f "$MANIFEST" ] || die "manifest not found: $MANIFEST" + +# Parse manifest with python (every machine has it). Emit a Lua +# table of (image_path, base, entry_offset_from_seg1). +PARSED=$(python3 - < "$LUA_PATH" <> 16) & 0xff + cpu.state["DB"].value = 0x00 + cpu.state["D"].value = 0x00 + cpu.state["P"].value = 0x34 + cpu.state["E"].value = 0 + cpu.state["S"].value = 0x01FF + print('MAME-READY pc=0x' .. string.format('%06x', $ENTRY_BASE + $ENTRY_OFF)) + end + if frame == 60 then + local cpu = manager.machine.devices[":maincpu"] + local mem = cpu.spaces["program"] +$LUA_CHECKS + manager.machine:exit() + end +end) +EOF + +OUT=$(timeout 30 mame apple2gs \ + -rompath "$PROJECT_ROOT/tools/mame/roms" \ + -plugins -autoboot_script "$LUA_PATH" \ + -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep -E "^(MAME-|SEG-)") + +echo "$OUT" +mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//') +ok=1 +for i in "${!EXPECT_LIST[@]}"; do + if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then + warn "MAME mismatch at ${ADDR_LIST[$i]}: got 0x${GOT_LIST[$i]:-MISSING} expected 0x${EXPECT_LIST[$i]}" + ok=0 + fi +done +if [ $ok -eq 1 ]; then + log "MAME (multi-seg) OK: ${#EXPECT_LIST[@]} reads matched" + exit 0 +fi +exit 1 diff --git a/scripts/runViaFinder.sh b/scripts/runViaFinder.sh new file mode 100755 index 0000000..5e67c0b --- /dev/null +++ b/scripts/runViaFinder.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# runViaFinder.sh — boot 
real GS/OS 6.0.2 in MAME, drive Finder via +# Lua keyboard automation to launch a user OMF, sample memory at +# specific frames to verify the program executed. +# +# Usage: runViaFinder.sh --check =... +# The OMF file is injected as /SYSTEM.DISK/HELLO (top-level on the +# boot disk). Lua then waits for Finder, types S+Cmd-O to open the +# System.Disk volume window, then H+Cmd-O to launch HELLO. +# +# Memory checks happen at frame 5400 (~90s emulated, well after the +# launch path completes) and exit 0 / 1 depending on whether each +# requested address holds the requested value. +# +# Requires: +# - tools/gsos/sys602.po (GS/OS 6.0.2 boot disk) +# - /tmp/cadius/cadius (forked-file-aware ProDOS tool) +# - mame apple2gs in PATH + +set -euo pipefail + +OMF="$1" +shift +[ -f "$OMF" ] || { echo "missing: $OMF" >&2; exit 2; } +[ "${1:-}" = "--check" ] || { echo "usage: $0 --check =..." >&2; exit 2; } +shift + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CADIUS=${CADIUS:-/tmp/cadius/cadius} +SYSDISK=${SYSDISK:-$PROJECT_ROOT/tools/gsos/sys602.po} + +[ -x "$CADIUS" ] || { echo "cadius not found at $CADIUS" >&2; exit 2; } +[ -f "$SYSDISK" ] || { echo "sysdisk not found at $SYSDISK" >&2; exit 2; } + +WORK=$(mktemp -d -t finderlaunch.XXXXXX) +trap 'rm -rf "$WORK"' EXIT + +cp "$SYSDISK" "$WORK/disk.po" +cp "$OMF" "$WORK/HELLO#B30000" +"$CADIUS" ADDFILE "$WORK/disk.po" /SYSTEM.DISK "$WORK/HELLO#B30000" >/dev/null + +LUA_CHECKS="" +EXPECTS=() +for pair in "$@"; do + [ "$pair" = "--check" ] && continue + addr="${pair%=*}"; val="${pair#*=}" + EXPECTS+=("$pair") + LUA_CHECKS="$LUA_CHECKS print(string.format('MAME-READ %s=%02x', '$addr', mem:read_u8($addr)))"$'\n' +done + +cat > "$WORK/finder.lua" <= steps[idx][1] do + steps[idx][2]() + idx = idx + 1 + end +end) +LUA + +OUT=$(timeout 130 mame apple2gs -rompath "$PROJECT_ROOT/tools/mame/roms" \ + -window -nothrottle -sound none \ + -seconds_to_run 110 -flop3 "$WORK/disk.po" \ + -autoboot_script "$WORK/finder.lua" 
&1) + +# Verify each expected value. +fail=0 +for pair in "${EXPECTS[@]}"; do + addr="${pair%=*}"; want="${pair#*=}" + line=$(echo "$OUT" | grep "MAME-READ $addr=" | tail -1) + got=$(echo "$line" | sed -E 's/.*=([0-9a-f]+).*/\1/') + # Compare numerically (handles case differences and 0x prefix variants). + gotN=$(printf '%d' "0x$got" 2>/dev/null || echo -1) + wantN=$(printf '%d' "$want" 2>/dev/null || echo -2) + if [ "$gotN" = "$wantN" ]; then + echo " $addr = 0x$got (want $want) ✓" + else + echo " $addr = 0x$got (want $want) ✗" + fail=1 + fi +done +exit $fail diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index c34176d..ae027fe 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -4424,6 +4424,48 @@ EOF fi rm -f "$cShFile" "$oShFile" "$binShFile" + # Multi-segment link: --segment-cap forces >1 text segments + # at bank-aligned bases; mini multi-segment loader + # (scripts/runMultiSeg.sh) loads each + runs. helper3(10,20) + # chains compute → helper1 → helper2 → helper3 across + # whatever segment boundaries the packer landed on; result + # must be 0xBF ((31+61)*2+7 = 191). Verifies (a) text + # splitting at the cap, (b) bank-aligned segment placement, + # (c) cross-bank JSL works. 
+ log "check: link816 --segment-cap splits text + cross-bank JSL works" + cMsegFile="$(mktemp --suffix=.c)" + oMsegFile="$(mktemp --suffix=.o)" + binMseg="$(mktemp --suffix=.bin)" + mfMseg="$(mktemp --suffix=.json)" + cat > "$cMsegFile" <<'EOF' +__attribute__((noinline)) static void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +__attribute__((noinline)) static int compute(int x) { return x * 3 + 1; } +__attribute__((noinline)) static int helper1(int a, int b) { return compute(a) + compute(b); } +__attribute__((noinline)) static int helper2(int a, int b) { return helper1(a, b) * 2; } +__attribute__((noinline)) static int helper3(int a, int b) { return helper2(a, b) + 7; } +int main(void) { + switchToBank2(); + int r = helper3(10, 20); + *(volatile unsigned short *)0x5000 = (unsigned short)r; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cMsegFile" -o "$oMsegFile" + "$PROJECT_ROOT/tools/link816" -o "$binMseg" --text-base 0x1000 \ + --segment-cap 512 --manifest "$mfMseg" \ + "$oCrt0F" "$oLibgccFile" "$oMsegFile" >/dev/null 2>&1 + if ! grep -q '"num": 2' "$mfMseg"; then + die "link816 --segment-cap 512 did not split into multiple segments" + fi + if ! bash "$PROJECT_ROOT/scripts/runMultiSeg.sh" "$mfMseg" --check \ + 0x025000=00bf /dev/null 2>&1; then + die "MAME: multi-segment helper3(10,20) != 0xBF" + fi + rm -f "$cMsegFile" "$oMsegFile" "$binMseg" "$mfMseg" \ + "${binMseg%.bin}".seg*.bin + rm -f "$oLibcF" "$oStrtolF" "$oSnprintfF" "$oQsortF" \ "$oExtrasF" "$oStrtokF" "$oMathF" "$oSfF" "$oSdF" "$oCrt0F" else @@ -4921,12 +4963,203 @@ EOF if [ ! -s "$omfFile" ]; then die "omfEmit produced empty/missing OMF" fi - # Sanity-check the OMF: VERSION byte at offset 15 should be 0x21 - # (OMF v2.1). KIND at offset 20-21 should be 0x0000 (CODE). + # Sanity-check the OMF. VERSION byte at offset 15 is the OMF + # spec enum: 0x00=v1.0, 0x01=v2.0, 0x02=v2.1. 
Real GS/OS apps + # all have 0x02 — it is not BCD "2.1" as some online docs suggest. + # KIND at offset 20-21 should be 0x1000 (CODE|PRIV) — verified + # via Merlin32 reference: Merlin's hello.s16 with KIND=0x1000 + # ran successfully under MAME-Lua-driven Finder launch on real + # GS/OS 6.0.2 (marker bytes at $00/0078 set to $42/$99 confirmed). + # LABLEN must be 10 (fixed-width space-padded names) — LABLEN=0 + # (length-prefixed) is in the spec but not Loader-launchable. ver=$(od -An -tx1 -N 1 -j 15 "$omfFile" | tr -d ' ') - if [ "$ver" != "21" ]; then - die "OMF version byte at offset 15 is 0x$ver (expected 0x21 = v2.1)" + if [ "$ver" != "02" ]; then + die "OMF version byte at offset 15 is 0x$ver (expected 0x02 = v2.1)" fi + lablen=$(od -An -tu1 -N 1 -j 13 "$omfFile" | tr -d ' ') + if [ "$lablen" != "10" ]; then + die "OMF LABLEN is $lablen (expected 10 = fixed-width names)" + fi + kindLo=$(od -An -tx1 -N 1 -j 20 "$omfFile" | tr -d ' ') + kindHi=$(od -An -tx1 -N 1 -j 21 "$omfFile" | tr -d ' ') + if [ "$kindLo" != "00" ] || [ "$kindHi" != "10" ]; then + die "OMF KIND is 0x$kindHi$kindLo (expected 0x1000 = CODE|PRIV)" + fi + # Body opcode at offset DISPDATA: should be 0xF2 (LCONST, what + # every real GS/OS app segment uses). + dispdata=$(od -An -tu2 -N 2 -j 42 "$omfFile" | tr -d ' ') + bodyOp=$(od -An -tx1 -N 1 -j "$dispdata" "$omfFile" | tr -d ' ') + if [ "$bodyOp" != "f2" ]; then + die "OMF body opcode at offset $dispdata is 0x$bodyOp (expected 0xF2 LCONST)" + fi + + # omfEmit --manifest path: read a link816 multi-segment manifest + # and emit one OMF segment per entry. Each segment header has + # KIND=0x8800 (STATIC|ABSBANK|CODE), ORG=base address, SEGNUM + # 1..N. Smoke just verifies we get N>1 segments at the expected + # bank-aligned ORGs; the actual loader-side execution is covered + # by the in-tree mini-loader in the multi-segment MAME smoke + # check above. 
+ log "check: omfEmit --manifest produces valid multi-segment OMF" + cMomfFile="$(mktemp --suffix=.c)" + oMomfFile="$(mktemp --suffix=.o)" + binMomf="$(mktemp --suffix=.bin)" + mfMomf="$(mktemp --suffix=.json)" + omfMomf="$(mktemp --suffix=.omf)" + cCrt0Momf="$(mktemp --suffix=.o)" + oLgMomf="$(mktemp --suffix=.o)" + cat > "$cMomfFile" <<'EOF' +__attribute__((noinline)) static int compute(int x) { return x * 3 + 1; } +__attribute__((noinline)) static int helper1(int a, int b) { return compute(a) + compute(b); } +__attribute__((noinline)) static int helper2(int a, int b) { return helper1(a, b) * 2; } +int main(void) { return helper2(10, 20); } +EOF + "$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/crt0.s" -o "$cCrt0Momf" + "$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLgMomf" + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cMomfFile" -o "$oMomfFile" + "$PROJECT_ROOT/tools/link816" -o "$binMomf" --text-base 0x1000 \ + --segment-cap 256 --manifest "$mfMomf" \ + "$cCrt0Momf" "$oLgMomf" "$oMomfFile" >/dev/null 2>&1 + "$PROJECT_ROOT/tools/omfEmit" --manifest "$mfMomf" --output "$omfMomf" >/dev/null 2>&1 + if [ ! -s "$omfMomf" ]; then + die "omfEmit --manifest produced empty/missing OMF" + fi + # Walk segments, count + verify KIND + ORG. + nSeg=$(python3 -c " +import struct +data = open('$omfMomf','rb').read() +pos = 0; n = 0; bad = 0 +while pos < len(data): + n += 1 + bytecnt = struct.unpack_from('=2 segments with KIND=0x8800, got $nSeg" + fi + rm -f "$cMomfFile" "$oMomfFile" "$binMomf" "$mfMomf" "$omfMomf" \ + "$cCrt0Momf" "$oLgMomf" "${binMomf%.bin}".seg*.bin + + # omfEmit --expressload: emit a 2-segment OMF where seg 1 is + # ~ExpressLoad (KIND=0x8001 DATA|STATIC) and seg 2 is the user + # code (KIND=0x8000 CODE|STATIC). 
Verifies the ExpressLoad load + # script structure: 8-byte header, segment list with self-rel + # offset, remap list, header info entry containing data offset + # that points exactly at user seg's LCONST data start (= body + # opcode offset + 5 for 0xF2 + 4-byte length). + log "check: omfEmit --expressload produces valid 2-seg ExpressLoad OMF" + cElFile="$(mktemp --suffix=.c)" + oElFile="$(mktemp --suffix=.o)" + binEl="$(mktemp --suffix=.bin)" + mapEl="$(mktemp --suffix=.map)" + omfEl="$(mktemp --suffix=.omf)" + cat > "$cElFile" <<'EOF' +int main(void) { return 0; } +EOF + "$CLANG" --target=w65816 -O2 -c "$cElFile" -o "$oElFile" + "$PROJECT_ROOT/tools/link816" -o "$binEl" --text-base 0x1000 \ + --map "$mapEl" --no-gc-sections \ + "$PROJECT_ROOT/runtime/crt0Gsos.o" "$oElFile" \ + "$PROJECT_ROOT/runtime/libgcc.o" >/dev/null 2>&1 + "$PROJECT_ROOT/tools/omfEmit" --input "$binEl" --map "$mapEl" \ + --base 0x1000 --entry __start --output "$omfEl" \ + --name HELLO --expressload >/dev/null 2>&1 + if [ ! -s "$omfEl" ]; then + die "omfEmit --expressload produced empty/missing OMF" + fi + # Validate structure with Python. + python3 -c " +import struct, sys +b = open('$omfEl','rb').read() +seg1_bytecnt = struct.unpack_from(' "$cR1" <<'EOF' +__attribute__((noinline)) static int helper(int x) { return x + 1; } +void main(void) { + *(volatile unsigned char *)0x00007F = (unsigned char)helper(0x40); +} +EOF + "$CLANG" --target=w65816 -O2 -c "$cR1" -o "$oR1" + "$PROJECT_ROOT/tools/link816" -o "$binR1" --text-base 0x1000 \ + --map "$mapR1" --reloc-out "$relR1" --no-gc-sections \ + "$PROJECT_ROOT/runtime/crt0Gsos.o" "$oR1" \ + "$PROJECT_ROOT/runtime/libgcc.o" >/dev/null 2>&1 + "$PROJECT_ROOT/tools/omfEmit" --input "$binR1" --map "$mapR1" \ + --base 0x1000 --entry __start --output "$omfR1" \ + --name HELLO --relocs "$relR1" >/dev/null 2>&1 + if [ ! -s "$omfR1" ] || [ ! 
-s "$relR1" ]; then + die "link816 --reloc-out / omfEmit --relocs produced empty output" + fi + python3 -c " +import struct, sys +b = open('$omfR1','rb').read() +r = open('$relR1','rb').read() +nRel = struct.unpack_from('=1 reloc site, got {nRel}'); sys.exit(1) +# Body opcode at DISPDATA; LCONST data length follows. Walk body and +# count cRELOC opcodes (0xF5). +dispdata = struct.unpack_from(' body; // patched bytes ready to write +}; + struct Layout { - uint32_t textBase, textSize; + uint32_t textBase, textSize; // segment 1's text (bank 0) uint32_t rodataBase, rodataSize; uint32_t bssBase, bssSize; uint32_t initBase, initSize; + // segments[0] = segment 1 (bank 0); segments[1..] = bank-N+ overflow. + // Always at least one entry. + std::vector segments; }; +// One IMM24 (3-byte absolute) relocation site, recorded for OMF +// cRELOC emission. The Loader will rewrite the 3 bytes at `patchOff` +// to be (segPlacedBase + offsetRef) when the segment is placed at +// runtime — this is what makes our compiled C runnable from Finder +// when the segment lands at e.g. bank $1F instead of bank 0. +struct Imm24Site { + uint32_t patchOff; // offset within text image (== patchAddr - textBase) + uint32_t offsetRef; // offset within text image of target symbol +}; +static std::vector gImm24Sites; +static uint32_t gTextBaseForSites = 0; +static bool gRecordSites = false; + static void applyReloc(std::vector &buf, uint32_t off, uint32_t patchAddr, uint32_t target, uint8_t rtype, const std::string &symName) { @@ -309,9 +338,12 @@ static void applyReloc(std::vector &buf, uint32_t off, buf[off] = static_cast(target & 0xFF); break; case R_W65816_IMM16: - if (target > 0xFFFF) - die("R_W65816_IMM16 to '" + symName + "' = 0x" + - std::to_string(target) + " out of range"); + // Keep only the low 16 bits. 
In single-bank programs this + // is a tautology (target IS 16-bit); in multi-segment + // programs the target may live in a different bank, but + // IMM16 absolute uses DBR for the bank at runtime, so + // patching just the low 16 bits is correct as long as the + // caller's DBR points at the target's bank. buf[off] = static_cast(target & 0xFF); buf[off + 1] = static_cast((target >> 8) & 0xFF); break; @@ -322,6 +354,23 @@ static void applyReloc(std::vector &buf, uint32_t off, buf[off] = static_cast(target & 0xFF); buf[off + 1] = static_cast((target >> 8) & 0xFF); buf[off + 2] = static_cast((target >> 16) & 0xFF); + // Record the site for OMF cRELOC emission (only if recording is + // enabled — gRecordSites is set by the CLI when --reloc-out is + // requested). The patch offset is within the segment image; the + // reference offset is the in-segment offset of the target. + if (gRecordSites) { + // Only intra-segment refs need cRELOC; cross-bank refs (to + // GS/OS dispatcher etc.) target absolute fixed addresses + // and shouldn't be relocated by the Loader. + uint32_t targetBank = target & 0xFF0000; + uint32_t baseBank = gTextBaseForSites & 0xFF0000; + if (targetBank == baseBank) { + Imm24Site s; + s.patchOff = patchAddr - gTextBaseForSites; + s.offsetRef = target - gTextBaseForSites; + gImm24Sites.push_back(s); + } + } break; case R_W65816_PCREL8: Signed = static_cast(target) - (static_cast(patchAddr) + 1); @@ -357,6 +406,13 @@ struct Linker { uint32_t rodataBase = 0; uint32_t bssBase = 0x2000; bool gcSections = true; + // Multi-segment support. segmentCap == 0 means "no cap" — produce + // a single-segment image (existing behaviour). Non-zero caps the + // bytes per text segment; overflow text sections get bank-aligned + // bases starting at segmentBankBase. + uint32_t segmentCap = 0; + uint32_t segmentBankBase = 0x040000; + std::string manifestPath; // Per-section identity: (object index, section index within obj). 
using SecID = std::pair; @@ -453,12 +509,17 @@ struct Linker { } // Per-object, per-section: in-merged-text/rodata/bss offset. + // For text: textWithin gives the offset within the *segment* the + // section is placed in; textSegOf names which segment (1-based). + // Single-segment builds put everything in segment 1; multi-segment + // builds may scatter sections across segments. struct ObjOffsets { uint32_t textBaseInMerged = 0; uint32_t rodataBaseInMerged = 0; uint32_t bssBaseInMerged = 0; uint32_t initBaseInMerged = 0; - std::map textWithin; + std::map textWithin; // offset within its segment + std::map textSegOf; // section idx -> segment num (1-based) std::map rodataWithin; std::map bssWithin; std::map initWithin; @@ -494,8 +555,12 @@ struct Linker { uint32_t base = 0; if (kind == "text") { auto wIt = oo.textWithin.find(sym.shndx); - base = lastLayout.textBase + oo.textBaseInMerged - + (wIt == oo.textWithin.end() ? 0 : wIt->second); + auto sIt = oo.textSegOf.find(sym.shndx); + uint32_t segNum = (sIt == oo.textSegOf.end()) ? 1 : sIt->second; + uint32_t segBase = (segNum >= 1 && segNum <= lastLayout.segments.size()) + ? lastLayout.segments[segNum - 1].base + : lastLayout.textBase; + base = segBase + (wIt == oo.textWithin.end() ? 0 : wIt->second); } else if (kind == "rodata") { auto wIt = oo.rodataWithin.find(sym.shndx); base = lastLayout.rodataBase + oo.rodataBaseInMerged @@ -531,18 +596,39 @@ struct Linker { Layout link(std::vector &outImage) { // 1. Layout: each obj's sections at running offsets. + // Text is segment-aware: when --segment-cap is set and total + // text would exceed it, sections spill into segments 2, 3, ... + // each based at successive bank boundaries starting from + // segmentBankBase. Other sections (rodata/bss/init_array) + // stay in segment 1's bank for v1 — multi-bank data refs + // would need IMM16 promotion to long which we don't do yet. 
objOff.resize(objs.size()); uint32_t curText = 0, curRodata = 0, curBss = 0, curInit = 0; - // gc-sections: compute the live-section set before accumulating - // so dead sections drop out of every later layout/reloc step. + std::vector segSizes = {0}; // bytes packed into each segment (1-based; index 0 = seg 1) + uint32_t curSeg = 1; computeLiveSet(); for (size_t fi = 0; fi < objs.size(); ++fi) { ObjOffsets &oo = objOff[fi]; oo.textBaseInMerged = curText; for (uint32_t idx : objs[fi]->sectionsByKind("text")) { if (!isLive(fi, idx)) continue; - oo.textWithin[idx] = curText - oo.textBaseInMerged; - curText += objs[fi]->sections[idx].size; + uint32_t sz = objs[fi]->sections[idx].size; + // If adding this section would exceed the cap, start a + // new segment. Skip empty sections in the cap check + // (they fit anywhere). Sections larger than the cap + // get their own segment (we don't split a single + // section across banks — it'd violate intra-section + // PCREL and 16-bit absolute addressing). + if (segmentCap && sz > 0 && + segSizes[curSeg - 1] > 0 && + segSizes[curSeg - 1] + sz > segmentCap) { + curSeg++; + segSizes.push_back(0); + } + oo.textSegOf[idx] = curSeg; + oo.textWithin[idx] = segSizes[curSeg - 1]; + segSizes[curSeg - 1] += sz; + curText += sz; } oo.rodataBaseInMerged = curRodata; for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) { @@ -563,13 +649,25 @@ struct Linker { curInit += objs[fi]->sections[idx].size; } } + // Build the segment list with bases. 
+ std::vector segments; + segments.resize(segSizes.size()); + segments[0].segNum = 1; + segments[0].base = textBase; + segments[0].size = segSizes[0]; + for (size_t k = 1; k < segSizes.size(); ++k) { + segments[k].segNum = static_cast(k + 1); + segments[k].base = segmentBankBase + 0x10000u * (k - 1); + segments[k].size = segSizes[k]; + } Layout L; L.textBase = textBase; - L.textSize = curText; + L.textSize = segSizes[0]; // segment-1 text size (bank 0) L.bssSize = curBss; - L.rodataBase = rodataBase ? rodataBase : (textBase + curText); + L.rodataBase = rodataBase ? rodataBase : (textBase + segSizes[0]); L.rodataSize = curRodata; + L.segments = std::move(segments); // Reject a --rodata-base that overlaps text. Without this // check, the gap between text-end and rodata-base goes // negative, the unsigned subtraction wraps to ~4GB, and the @@ -739,8 +837,12 @@ struct Linker { uint32_t addr = 0; if (kind == "text") { auto it = oo.textWithin.find(sym.shndx); - addr = textBase + oo.textBaseInMerged - + (it == oo.textWithin.end() ? 0 : it->second) + auto sIt = oo.textSegOf.find(sym.shndx); + uint32_t segNum = (sIt == oo.textSegOf.end()) ? 1 : sIt->second; + uint32_t segBase = (segNum >= 1 && segNum <= L.segments.size()) + ? L.segments[segNum - 1].base + : textBase; + addr = segBase + (it == oo.textWithin.end() ? 0 : it->second) + sym.value; } else if (kind == "rodata") { auto it = oo.rodataWithin.find(sym.shndx); @@ -776,16 +878,20 @@ struct Linker { } } - // 3. Build text and rodata buffers. Skip dead sections under - // gc-sections (isLive() returns true for everything when gc - // is off). - std::vector textBuf; - textBuf.reserve(curText); + // 3. Build per-segment text buffers + rodata. Skip dead + // sections under gc-sections. 
+ std::vector> segTextBufs(L.segments.size()); + for (size_t k = 0; k < L.segments.size(); ++k) + segTextBufs[k].reserve(L.segments[k].size); for (size_t fi = 0; fi < objs.size(); ++fi) { + const auto &oo = objOff[fi]; for (uint32_t idx : objs[fi]->sectionsByKind("text")) { if (!isLive(fi, idx)) continue; + auto sIt = oo.textSegOf.find(idx); + uint32_t segNum = (sIt == oo.textSegOf.end()) ? 1 : sIt->second; const uint8_t *p = objs[fi]->sectionData(idx); - textBuf.insert(textBuf.end(), p, p + objs[fi]->sections[idx].size); + auto &buf = segTextBufs[segNum - 1]; + buf.insert(buf.end(), p, p + objs[fi]->sections[idx].size); } } std::vector rodataBuf; @@ -799,7 +905,7 @@ struct Linker { } } - // 4. Apply relocations to text buffer. + // 4. Apply relocations to text buffers (each in its own segment). for (size_t fi = 0; fi < objs.size(); ++fi) { const auto &obj = *objs[fi]; const auto &oo = objOff[fi]; @@ -807,20 +913,53 @@ struct Linker { if (!isLive(fi, textIdx)) continue; auto it = obj.relocs.find(textIdx); if (it == obj.relocs.end()) continue; - uint32_t inMerged = oo.textBaseInMerged + oo.textWithin.at(textIdx); + auto sIt = oo.textSegOf.find(textIdx); + uint32_t segNum = (sIt == oo.textSegOf.end()) ? 1 : sIt->second; + uint32_t inSeg = oo.textWithin.at(textIdx); + uint32_t segBase = L.segments[segNum - 1].base; + auto &textBuf = segTextBufs[segNum - 1]; for (const Reloc &r : it->second) { - uint32_t patchOff = inMerged + r.offset; - uint32_t patchAddr = textBase + patchOff; + uint32_t patchOff = inSeg + r.offset; + uint32_t patchAddr = segBase + patchOff; uint32_t target; std::string resolvedName; if (!resolveSym(obj, oo, r, target, resolvedName)) die(obj.path + ": .text reloc to unresolved '" + resolvedName + "'"); + // PCREL relocs can't span banks (the displacement + // is intra-bank only). Detect and report so the + // user can adjust packing. 
IMM16 cross-bank is + // tolerated: 16-bit absolute uses DBR for the + // bank, which we keep at 0 by default (so refs + // to bank-0 data work from any code segment), + // and we can't statically know the target bank + // intent anyway. + if (segmentCap && (r.type == R_W65816_PCREL16 || + r.type == R_W65816_PCREL8)) { + uint32_t targetSegBank = target & 0xFF0000; + uint32_t patchSegBank = segBase & 0xFF0000; + if (targetSegBank != patchSegBank) { + char msg[200]; + std::snprintf(msg, sizeof(msg), + "%s: cross-bank PCREL reloc to '%s' (target bank " + "0x%X, code bank 0x%X) — adjust --segment-cap " + "or pack referenced section into the same segment", + obj.path.c_str(), resolvedName.c_str(), + targetSegBank, patchSegBank); + die(msg); + } + } applyReloc(textBuf, patchOff, patchAddr, target, r.type, resolvedName); } } } + // Move per-segment patched text into the Layout for output. + for (size_t k = 0; k < L.segments.size(); ++k) + L.segments[k].body = std::move(segTextBufs[k]); + // Re-publish layout now that segment bodies are populated — + // writeMultiSegment reads from lastLayout. + lastLayout = L; // 4b. Apply relocations to rodata/data buffer. Globals like // `int *p = &v;` need their initializer patched at link time @@ -849,11 +988,16 @@ struct Linker { } } - // 5. Compose output: text || (gap) || rodata. bss is virtual. + // 5. Compose output: segment-1 text || (gap) || rodata. + // bss is virtual. Multi-segment builds emit additional text + // segments separately (see writeSegmentImages); the main -o + // output stays segment 1's image so existing single-segment + // smoke checks still work unchanged. 
outImage.clear(); - outImage = std::move(textBuf); - if (L.rodataBase != textBase + curText) { - uint32_t gap = L.rodataBase - (textBase + curText); + const uint32_t seg1TextSize = static_cast(L.segments[0].body.size()); + outImage = L.segments[0].body; + if (L.rodataBase != textBase + seg1TextSize) { + uint32_t gap = L.rodataBase - (textBase + seg1TextSize); outImage.insert(outImage.end(), gap, 0); } outImage.insert(outImage.end(), rodataBuf.begin(), rodataBuf.end()); @@ -1025,6 +1169,82 @@ struct Linker { } } + // Write per-segment images for segments 2..N (segment 1 is the + // main -o output) and a JSON manifest describing all segments. + // Image filename convention: .seg.bin where outBase is + // the -o path with any trailing extension stripped. Manifest JSON + // is at the user-supplied --manifest path. + void writeMultiSegment(const std::string &mainOutPath, + const std::string &mfPath, + const std::string &entrySym) const { + if (lastLayout.segments.empty()) return; + // Strip the extension from mainOutPath for per-segment names. + std::string outBase = mainOutPath; + size_t dot = outBase.find_last_of('.'); + size_t slash = outBase.find_last_of('/'); + if (dot != std::string::npos && + (slash == std::string::npos || dot > slash)) { + outBase = outBase.substr(0, dot); + } + // Per-segment images for K >= 2. + for (size_t k = 1; k < lastLayout.segments.size(); ++k) { + const auto &seg = lastLayout.segments[k]; + char name[256]; + std::snprintf(name, sizeof(name), "%s.seg%u.bin", + outBase.c_str(), seg.segNum); + std::ofstream f(name, std::ios::binary); + if (!f) die(std::string("cannot open '") + name + "' for writing"); + f.write(reinterpret_cast(seg.body.data()), + seg.body.size()); + } + // Manifest. Hand-rolled JSON (no external dep). 
+ if (mfPath.empty()) return; + std::ofstream mf(mfPath); + if (!mf) die("cannot open '" + mfPath + "' for writing"); + char buf[512]; + std::snprintf(buf, sizeof(buf), + "{\n" + " \"version\": 1,\n" + " \"main\": \"%s\",\n" + " \"entry\": \"%s\",\n" + " \"segments\": [\n", mainOutPath.c_str(), entrySym.c_str()); + mf << buf; + for (size_t k = 0; k < lastLayout.segments.size(); ++k) { + const auto &seg = lastLayout.segments[k]; + std::string imgPath = mainOutPath; + if (k > 0) { + char nm[256]; + std::snprintf(nm, sizeof(nm), "%s.seg%u.bin", + outBase.c_str(), seg.segNum); + imgPath = nm; + } + uint32_t entryOff = 0; + // Set entry_offset on whichever segment actually contains + // the entry symbol — usually segment 1 (crt0's __start) + // but could be any segment if user picks a non-standard + // entry point. + auto it = globalSyms.find(entrySym); + if (it != globalSyms.end() && it->second >= seg.base && + it->second < seg.base + seg.body.size()) { + entryOff = it->second - seg.base; + } + std::snprintf(buf, sizeof(buf), + " {\n" + " \"num\": %u,\n" + " \"name\": \"SEG%u\",\n" + " \"base\": \"0x%06x\",\n" + " \"size\": %zu,\n" + " \"image\": \"%s\",\n" + " \"entry_offset\": \"0x%04x\"\n" + " }%s\n", + seg.segNum, seg.segNum, seg.base, seg.body.size(), + imgPath.c_str(), entryOff, + (k + 1 < lastLayout.segments.size()) ? "," : ""); + mf << buf; + } + mf << " ]\n}\n"; + } + // Stash the last layout so writeMap can use it. 
Layout lastLayout; }; @@ -1047,8 +1267,13 @@ static void usage(const char *argv0) { std::fprintf(stderr, "usage: %s -o [--text-base ADDR] [--rodata-base ADDR]\n" " [--bss-base ADDR] [--map FILE] [--debug-out FILE]\n" - " [--no-gc-sections]\n" - " ...\n", + " [--reloc-out FILE] [--no-gc-sections]\n" + " ...\n" + "\n" + " --reloc-out FILE write IMM24 relocation site list (binary:\n" + " ...)\n" + " consumed by omfEmit --relocs to emit cRELOC\n" + " opcodes for runtime bank-byte fixup.\n", argv0); std::exit(2); } @@ -1059,6 +1284,7 @@ int main(int argc, char **argv) { std::string outPath; std::string mapPath; std::string debugOutPath; + std::string relocOutPath; Linker linker; int i = 1; @@ -1082,6 +1308,9 @@ int main(int argc, char **argv) { } else if (a == "--debug-out") { if (++i >= argc) usage(argv[0]); debugOutPath = argv[i++]; + } else if (a == "--reloc-out") { + if (++i >= argc) usage(argv[0]); + relocOutPath = argv[i++]; } else if (a == "--gc-sections") { // Drop sections not reachable from __start / main / // init_array. Requires `-ffunction-sections` (so each @@ -1094,6 +1323,15 @@ int main(int argc, char **argv) { } else if (a == "--no-gc-sections") { linker.gcSections = false; i++; + } else if (a == "--segment-cap") { + if (++i >= argc) usage(argv[0]); + linker.segmentCap = parseInt(argv[i++]); + } else if (a == "--segment-bank-base") { + if (++i >= argc) usage(argv[0]); + linker.segmentBankBase = parseInt(argv[i++]); + } else if (a == "--manifest") { + if (++i >= argc) usage(argv[0]); + linker.manifestPath = argv[i++]; } else if (a == "-h" || a == "--help") { usage(argv[0]); } else if (!a.empty() && a[0] == '-') { @@ -1105,6 +1343,14 @@ int main(int argc, char **argv) { } if (outPath.empty() || linker.objs.empty()) usage(argv[0]); + // Enable IMM24 site recording before linking, so applyReloc populates + // gImm24Sites for cRELOC sidecar emission. 
+ if (!relocOutPath.empty()) { + gRecordSites = true; + gTextBaseForSites = linker.textBase; + gImm24Sites.clear(); + } + std::vector image; Layout L = linker.link(image); @@ -1114,13 +1360,49 @@ int main(int argc, char **argv) { if (!mapPath.empty()) linker.writeMap(mapPath); if (!debugOutPath.empty()) linker.writeDebugSidecar(debugOutPath); + if (!relocOutPath.empty()) { + // Sidecar binary format: + // u32 count + // { u32 patchOff; u32 offsetRef; } × count + // Both offsets are within the segment image (== link-time addr + // minus textBase). Consumed by omfEmit --relocs to emit cRELOC + // opcodes after the LCONST data. + std::ofstream rf(relocOutPath, std::ios::binary); + if (!rf) die("cannot open '" + relocOutPath + "' for writing"); + uint32_t count = (uint32_t)gImm24Sites.size(); + rf.write(reinterpret_cast(&count), 4); + for (const auto &s : gImm24Sites) { + uint32_t po = s.patchOff, off = s.offsetRef; + rf.write(reinterpret_cast(&po), 4); + rf.write(reinterpret_cast(&off), 4); + } + } + // Multi-segment: write per-segment images + manifest if there's + // more than one segment OR --manifest was requested. + if (L.segments.size() > 1 || !linker.manifestPath.empty()) { + // Default entry symbol is __start (crt0's program entry, + // which calls main). GS/OS Loader runs from segment 1's + // entry; crt0 lives in segment 1 by convention (first + // input object is typically the runtime/crt0). 
+ linker.writeMultiSegment(outPath, linker.manifestPath, "__start"); + } std::fprintf(stderr, "linked: text=[0x%04x+%u] rodata=[0x%04x+%u] bss=[0x%04x+%u] " - "-> %s (%zu bytes)\n", + "-> %s (%zu bytes)", L.textBase, L.textSize, L.rodataBase, L.rodataSize, L.bssBase, L.bssSize, outPath.c_str(), image.size()); + if (L.segments.size() > 1) { + std::fprintf(stderr, " + %zu extra segments", + L.segments.size() - 1); + for (size_t k = 1; k < L.segments.size(); ++k) { + std::fprintf(stderr, " seg%u=[0x%06x+%zu]", + L.segments[k].segNum, L.segments[k].base, + L.segments[k].body.size()); + } + } + std::fprintf(stderr, "\n"); return 0; } diff --git a/src/link816/omfEmit.cpp b/src/link816/omfEmit.cpp index 5f1c1df..8501573 100644 --- a/src/link816/omfEmit.cpp +++ b/src/link816/omfEmit.cpp @@ -1,14 +1,25 @@ -// omfEmit — wrap a flat binary in a minimal Apple IIgs OMF v2.1 -// container so GS/OS can load and execute it. +// omfEmit — wrap a flat binary (or a multi-segment manifest from +// link816) in an Apple IIgs OMF v2.1 container. // -// Single-segment output (CODE, kind=0), no INTERSEG opcodes (multi- -// segment output is a follow-on). Header layout per OMF 2.1 spec: -// 44-byte fixed header + 10-byte LOAD_NAME + 32-byte SEG_NAME, then -// the body (DS opcode for the payload, END opcode terminator). +// Single-segment mode (legacy): one CODE segment with KIND=0, +// no INTERSEG opcodes, ORG=0 (loader picks bank). Header layout +// per OMF 2.1 spec: 44-byte fixed header + 10-byte LOAD_NAME + +// 32-byte SEG_NAME, then the body (DS opcode for the payload, +// END opcode terminator). // -// CLI mirrors the Python tool exactly: // omfEmit --input flat.bin --map flat.map --base 0x8000 // --entry main --output prog.omf [--name SEG] +// +// Multi-segment mode: read the JSON manifest emitted by +// `link816 --manifest`, write one OMF segment per manifest entry. 
+// Each segment's ORG is set to its declared base (bank-aligned) +// so the loader places it at the exact address the linker assumed +// when it patched intra-segment IMM24 / IMM16 relocations. KIND +// uses the STATIC + ABSBANK attributes to ask the loader not to +// move segments around — necessary because all relocs were already +// baked in at link time (no INTERSEG opcodes emitted yet). +// +// omfEmit --manifest manifest.json --output prog.omf #include #include @@ -26,6 +37,14 @@ namespace { std::exit(1); } +// Populated by --relocs from a link816 sidecar. Each entry is +// (OffsetPatch, OffsetReference) — the in-segment offset to patch +// (3 bytes wide) and the in-segment offset of the target. Consumed +// by emitOneSeg to write cRELOC opcodes between LCONST and END. +} // close namespace +std::vector> gReloc24Sites; +namespace { + static std::vector readFile(const std::string &path) { std::ifstream f(path, std::ios::binary); if (!f) die("cannot open '" + path + "' for reading"); @@ -67,59 +86,107 @@ static void put16(std::vector &v, uint16_t x) { v.push_back((x >> 8) & 0xFF); } -static std::vector emitOMF(const std::vector &image, - uint32_t entryOffset, - const std::string &name) { - // Body: DS (literal data) + END. +// Emit one OMF segment record. Caller composes multiple records +// back-to-back to form a multi-segment OMF file. +// +// `org` : absolute load address. 0 means "loader picks" (single- +// segment mode). Non-zero (typical for multi-segment) +// requests STATIC ABSBANK placement at that exact address. +// `segNum` : 1-based segment number. +// `entryOff`: offset within this segment to the program entry point; +// only meaningful for the entry segment (typically 1), +// ignored otherwise. +// `kind` : OMF KIND field. Caller picks; v1 uses 0x8800 (STATIC | +// ABSBANK | CODE) for multi-segment static placement, or +// 0x0000 (CODE, dynamic) for single-segment legacy mode. 
+static std::vector emitOneSeg(const std::vector &image, + uint32_t entryOff, + uint32_t org, + uint16_t segNum, + uint16_t kind, + const std::string &name) { std::vector body; if (!image.empty()) { - body.push_back(0xF1); // DS opcode + // LCONST opcode 0xF2: takes a NUMLEN-byte count followed by N + // literal bytes. With NUMLEN=4 (standard for v2.1), the count + // field is 4 bytes. Verified empirically against real /SYSTEM/ + // START on GS/OS 6.0.2: every segment uses 0xF2 + 4-byte count. + body.push_back(0xF2); // LCONST opcode put32(body, static_cast(image.size())); body.insert(body.end(), image.begin(), image.end()); } + // cRELOC opcodes (0xF5): one per IMM24 reloc site. Format per + // Merlin32's BuildOMFFile: + // 1B opcode (0xF5) + // 1B ByteCnt (3 for IMM24) + // 1B BitShift (0 = no shift) + // 2B OffsetPatch (offset in segment to patch) + // 2B OffsetReference (in-segment offset of target) + // The Loader rewrites segment[OffsetPatch..OffsetPatch+2] to be + // (segPlacedBase + OffsetReference) at load time. This is what + // makes JSL/JML/STAlong/etc. with intra-segment targets work when + // the Loader places us at non-zero bank. + for (const auto &s : ::gReloc24Sites) { + body.push_back(0xF5); + body.push_back(3); // ByteCnt + body.push_back(0); // BitShift + put16(body, s.first); // OffsetPatch + put16(body, s.second); // OffsetReference + } body.push_back(0x00); // END opcode - // LOAD_NAME: 10 bytes, space-padded. - std::string loadName = name.substr(0, 10); - while (loadName.size() < 10) loadName += ' '; - - // SEG_NAME: 1-byte length prefix + 31 bytes (truncated, padded with NUL). 
- std::string segNameTxt = name.substr(0, 31); - std::vector segName; - segName.push_back(static_cast(segNameTxt.size())); - for (char c : segNameTxt) segName.push_back((uint8_t)c); - while (segName.size() < 32) segName.push_back(0); + // Real OMF format (Merlin32 convention, verified GS/OS Loader-launchable): + // - LABLEN = 10: both LOAD_NAME and SEG_NAME are 10 bytes wide, + // space-padded. This is what Merlin32 emits and what GS/OS + // Loader accepts when launching from Finder. Length-prefixed + // names (LABLEN=0, what /SYSTEM/START FINDER and TOOL.SETUP + // use) is documented in the OMF spec but NOT accepted by the + // Loader for app launch — empirical finding: switching from + // LABLEN=0 to LABLEN=10 was the key change that took our hello + // from "OMF loaded but entry never JSL'd → $005C error" to + // "marker $0078 = $42 set, code ran". + constexpr uint8_t LABLEN_VAL = 10; + std::vector loadName(10, 0x20); // 10 spaces + std::string segNameTxt = name.substr(0, 10); // truncate to LABLEN + std::vector segName(LABLEN_VAL, 0x20); // 10-byte field, space-padded + for (size_t i = 0; i < segNameTxt.size(); i++) + segName[i] = (uint8_t)segNameTxt[i]; constexpr uint16_t DISPNAME = 44; - const uint16_t DISPDATA = DISPNAME + 10 + 32; + const uint16_t DISPDATA = static_cast( + DISPNAME + loadName.size() + segName.size()); const uint32_t LENGTH = static_cast(image.size()); const uint32_t BYTECNT = DISPDATA + static_cast(body.size()); const uint32_t RESSPC = 0; + // BANKSIZE = 0x10000 — segment fits in one 64KB bank. + // Earlier I tried 0 (matched one decoded file) but real + // executable code segments use 0x10000. 
const uint32_t BANKSIZE = 0x10000; - const uint16_t KIND = 0x0000; // CODE - const uint32_t ORG = 0; const uint32_t ALIGN = 0; const uint8_t NUMSEX = 0; - const uint16_t SEGNUM = 1; - const uint32_t ENTRY = entryOffset; std::vector hdr; put32(hdr, BYTECNT); put32(hdr, RESSPC); put32(hdr, LENGTH); hdr.push_back(0x00); // undefined - hdr.push_back(10); // LABLEN + hdr.push_back(LABLEN_VAL); // LABLEN (10 = fixed-width names) hdr.push_back(4); // NUMLEN - hdr.push_back(0x21); // VERSION 2.1 + hdr.push_back(0x02); // VERSION (0x02 = OMF v2.1; 0x01 = v2.0) + // Earlier we used 0x21 here thinking it was BCD-encoded "2.1" — + // it's not. The VERSION byte uses an enum: 0x00=v1.0, 0x01=v2.0, + // 0x02=v2.1. Real GS/OS apps decoded from a system disk have + // 0x02 here. GS/OS Loader rejects 0x21 with error $1102 because + // there's no version with that code. put32(hdr, BANKSIZE); - put16(hdr, KIND); + put16(hdr, kind); hdr.push_back(0x00); hdr.push_back(0x00); // undefined (2 bytes) - put32(hdr, ORG); + put32(hdr, org); put32(hdr, ALIGN); hdr.push_back(NUMSEX); hdr.push_back(0x00); // undefined - put16(hdr, SEGNUM); - put32(hdr, ENTRY); + put16(hdr, segNum); + put32(hdr, entryOff); put16(hdr, DISPNAME); put16(hdr, DISPDATA); @@ -133,6 +200,303 @@ static std::vector emitOMF(const std::vector &image, return out; } +// Legacy single-segment wrapper. +// +// KIND=0x1000 (CODE | PRIV). This is what Merlin32 emits for single- +// segment GS/OS apps and what GS/OS Loader actually launches via +// Finder double-click. KIND=0x8000 (CODE|STATIC) was earlier hypothesis +// based on extracting ABOUT from real FINDER, but ABOUT is a sub- +// segment of FINDER, not a standalone app — so its KIND isn't a valid +// model. PRIV bit signals "loaded with the rest of the app" and is the +// reliable choice empirically validated by Merlin32-built hello.s16 +// running successfully under MAME-Lua-driven Finder launch. 
+static std::vector emitOMF(const std::vector &image, + uint32_t entryOffset, + const std::string &name) { + return emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/1, + /*kind*/0x1000, name); +} + +// Emit an ExpressLoad-able OMF wrapping a single user segment. This is +// what real GS/OS apps look like: a `~ExpressLoad` segment as seg 1, +// then the actual code as seg 2. +// +// Why we need ExpressLoad: replacing /SYSTEM/START with a single- +// segment OMF (no ExpressLoad) makes the GS/OS Loader place our +// segment in RAM but never JSL the entry — verified by writing a +// marker as the first instruction of crt0Gsos and observing the +// marker remained 0 across the entire boot. +// +// ExpressLoad format reverse-engineered from real /SYSTEM/START +// (FINDER) on GS/OS 6.0.2 disk. Each ExpressLoad-able file's seg 1 +// is a `~ExpressLoad` data segment containing a load script. +// +// The load script (stored as the LCONST data of the ExpressLoad seg): +// +0..1 word file_ref = 0 +// +2..3 word reserved = 0 +// +4..5 word extra = 0 (Neil Parker's docs omit this) +// +6..7 word count = N - 2 where N = total segs +// +8.. 8B/seg segment list = (N - 1) entries: +// +0..1: self-rel offset to header info entry +// +2..3: flags = 0 +// +4..7: handle = 0 +// +Var 2B/seg remap list = (N - 1) words: +// new segment number for old position +// +Var Var/seg header info entries: +// +0..3: data offset in file (= body op + 5) +// +4..7: data length (= seg LENGTH field) +// +8..11: reloc offset in file (0 if no relocs) +// +12..15: reloc length (0 if no relocs) +// +16..47: header copy bytes [12..43] of the +// user segment, with DISPDATA zeroed +// +48..57: LOAD_NAME (10 bytes) +// +58.. : SEG_NAME (length-prefixed) +// +// All counts use NUMLEN=4 (4-byte length on LCONST opcodes). 
+static std::vector emitOmfExpressLoad( + const std::vector &image, + uint32_t entryOffset, + const std::string &userSegName) { + + // Step 1: build the user segment using KIND=0x1000 (CODE|PRIV). + // Same KIND emitOMF uses for single-segment apps. Verified + // Loader-launchable via the Finder smoke path. + auto userSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2, + /*kind*/0x1000, userSegName); + + // Step 2: figure out the file offsets we'll need to bake into the + // load script. We don't know the ExpressLoad segment's total size + // yet — but we can compute it because each component is a fixed + // function of the user segment name length. + // + // ExpressLoad LCONST data layout (matches Merlin32 source — see + // BuildExpressLoadSegment in Merlin32's a65816_OMF.c): + // 6 bytes header (4-byte reserved DWORD + 2-byte count WORD) + // 8 bytes segment list (1 entry per non-ExpressLoad segment) + // 2 bytes remap list (1 entry per non-ExpressLoad segment) + // 16 bytes header info offsets (data_off, data_len, reloc_off, reloc_len) + // + header_xpress: bytes [12..43] of user header (32 bytes) + LOAD_NAME (10) + SEG_NAME (1+N) + // = 6 + 8 + 2 + 16 + 32 + 10 + 1 + N = 75 + N bytes + // + // KEY FIX from earlier emitter version: header is 6 bytes, NOT 8. + // I had written 8 bytes (file_ref WORD + reserved WORD + extra WORD + + // count WORD) based on misreading /SYSTEM/START's bytes. Merlin32 + // uses (reserved DWORD + count WORD) = 6 bytes total. /SYSTEM/START + // has count=0 in the 6-byte interpretation which means it uses some + // other variant (maybe APW Express's older format), but Merlin32's + // format is what we know is GS/OS-loader-accepted today. 
+ constexpr uint32_t HDR_SIZE = 44; + constexpr uint32_t LOAD_NAME_SIZE = 10; + constexpr uint32_t SEG_NAME_SIZE = 10; // LABLEN=10 → fixed-width SEG_NAME + const uint32_t userNameLen = (uint32_t)userSegName.size(); + const uint32_t userNameAreaSize = LOAD_NAME_SIZE + SEG_NAME_SIZE; + + // ExpressLoad's own segment metrics. The name "~ExpressLoad" is 12 + // chars and won't fit in a LABLEN=10 field, so the ExpressLoad seg + // uses LABLEN=0 (length-prefixed name): 1 length byte + 12 chars. + const std::string elName = "~ExpressLoad"; + const uint32_t elNameAreaSize = LOAD_NAME_SIZE + 1 + (uint32_t)elName.size(); + // header_xpress_length = (header bytes 12..43) + LOAD_NAME + SEG_NAME + // = 32 + 10 + 10 = 52 bytes + // Per-segment ExpressLoad data: 8 (table) + 2 (remap) + 16 (offsets) + 52 = 78 bytes + // Header (6 bytes) + per-segment data: 6 + 78 = 84 + const uint32_t elDataSize = 84; + (void)userNameLen; // truncated in user seg name; LABLEN=10 fixed + // Body size = 1 byte LCONST opcode + 4 byte length + data + 1 byte END + const uint32_t elBodySize = 1 + 4 + elDataSize + 1; + const uint32_t elSegSize = HDR_SIZE + elNameAreaSize + elBodySize; + + // User segment file offsets (after ExpressLoad seg). + const uint32_t userSegStart = elSegSize; + const uint32_t userBodyOpOff = userSegStart + HDR_SIZE + userNameAreaSize; + const uint32_t userDataOff = userBodyOpOff + 5; // 1 op + 4 length + + // Step 3: build the ExpressLoad LCONST data. 
+ std::vector elData; + // Header (6 bytes): reserved DWORD + count WORD + put32(elData, 0); // reserved + put16(elData, 0); // count = N-2 = 0 (for 2 segs) + + // Segment list (1 × 8 bytes) + // Self-rel offset = (header info offset within elData) - (this entry pos) + // = 16 - 6 = 10 + constexpr uint32_t segListEntryOff = 6; + const uint32_t headerInfoOff = 6 + 8 + 2; // header + segtable + remap + put16(elData, (uint16_t)(headerInfoOff - segListEntryOff)); + put16(elData, 0); // flags + put32(elData, 0); // handle + + // Remap list: old seg 1 (which would be our user seg without + // ExpressLoad) maps to new seg 2 (since ExpressLoad takes seg 1). + put16(elData, 2); + + // Header info entry for the user segment. + put32(elData, userDataOff); // data offset in file + put32(elData, (uint32_t)image.size()); // data length + put32(elData, 0); // reloc offset (0 = no relocs) + put32(elData, 0); // reloc length + + // Header copy: bytes [12..43] of user segment header, DISPDATA → 0. + if (userSeg.size() < HDR_SIZE) die("internal: user seg too small"); + elData.insert(elData.end(), userSeg.begin() + 12, userSeg.begin() + HDR_SIZE); + // DISPDATA is at offset 42..43 of the original header; in the copy + // (which omits the first 12 bytes), it lands at offset 30..31. + elData[elData.size() - 32 + 30] = 0; + elData[elData.size() - 32 + 31] = 0; + + // LOAD_NAME (10 bytes, space-padded — matches Merlin convention) + for (int i = 0; i < (int)LOAD_NAME_SIZE; i++) elData.push_back(0x20); + // SEG_NAME (10 bytes fixed-width, space-padded) + std::string truncated = userSegName.substr(0, SEG_NAME_SIZE); + for (size_t i = 0; i < SEG_NAME_SIZE; i++) { + elData.push_back(i < truncated.size() ? (uint8_t)truncated[i] : 0x20); + } + + if (elData.size() != elDataSize) + die("internal: ExpressLoad data size mismatch"); + + // Step 4: build the ExpressLoad segment header. + // KIND=0x8001 (DATA|STATIC), BANKSIZE=0 (DATA segs use 0, not 0x10000). 
+ std::vector elHdr; + const uint32_t elBytecnt = HDR_SIZE + elNameAreaSize + elBodySize; + put32(elHdr, elBytecnt); // BYTECNT + put32(elHdr, 0); // RESSPC + put32(elHdr, elDataSize); // LENGTH (= LCONST data size) + elHdr.push_back(0); // undef + elHdr.push_back(0); // LABLEN + elHdr.push_back(4); // NUMLEN + elHdr.push_back(2); // VERSION (0x02 = v2.1) + put32(elHdr, 0); // BANKSIZE = 0 for DATA seg + put16(elHdr, 0x8001); // KIND = DATA|STATIC + elHdr.push_back(0); elHdr.push_back(0); // undef + put32(elHdr, 0); // ORG + put32(elHdr, 0); // ALIGN + elHdr.push_back(0); // NUMSEX + elHdr.push_back(0); // undef + put16(elHdr, 1); // SEGNUM = 1 + put32(elHdr, 0); // ENTRY = 0 + put16(elHdr, (uint16_t)HDR_SIZE); // DISPNAME = 44 + put16(elHdr, (uint16_t)(HDR_SIZE + elNameAreaSize)); // DISPDATA + + if (elHdr.size() != HDR_SIZE) die("internal: el hdr size != 44"); + + // Step 5: assemble the ExpressLoad segment. + std::vector elSeg; + elSeg.insert(elSeg.end(), elHdr.begin(), elHdr.end()); + for (int i = 0; i < (int)LOAD_NAME_SIZE; i++) elSeg.push_back(0); + elSeg.push_back((uint8_t)elName.size()); + for (char c : elName) elSeg.push_back((uint8_t)c); + // Body: LCONST opcode + 4-byte length + data + END + elSeg.push_back(0xF2); + put32(elSeg, elDataSize); + elSeg.insert(elSeg.end(), elData.begin(), elData.end()); + elSeg.push_back(0x00); + + if (elSeg.size() != elSegSize) + die("internal: ExpressLoad segment size mismatch"); + + // Step 6: concatenate ExpressLoad + user segment. + std::vector result; + result.insert(result.end(), elSeg.begin(), elSeg.end()); + result.insert(result.end(), userSeg.begin(), userSeg.end()); + return result; +} + +// Bare-bones manifest parser. link816's manifest is structured as +// `{ "segments": [ { "num": N, "base": "0xHHHHHH", "size": N, +// "image": "PATH", "entry_offset": "0xHHHH" }, ... ] }` with strict +// formatting (one field per line, no nested whitespace tricks). 
We +// match each field with simple regex/find — good enough since we're +// the only producer of this format. +struct ManifestSeg { + uint32_t num = 0; + uint32_t base = 0; + uint32_t entryOff = 0; + std::string image; + std::string name; +}; + +static std::string extractStringField(const std::string &block, + const std::string &key) { + std::string needle = "\"" + key + "\":"; + size_t p = block.find(needle); + if (p == std::string::npos) return {}; + // Skip whitespace after the colon. If the next non-space char + // isn't a quote, the value is a bare number — return empty so + // the caller falls through to the bare-number path (without + // accidentally consuming the next field's quoted string). + p += needle.size(); + while (p < block.size() && std::isspace((unsigned char)block[p])) p++; + if (p >= block.size() || block[p] != '"') return {}; + size_t e = block.find('"', p + 1); + if (e == std::string::npos) return {}; + return block.substr(p + 1, e - p - 1); +} +static uint32_t extractNumberField(const std::string &block, + const std::string &key) { + // Number can appear bare (size: 1234) or as a hex string ("0x..."). 
+ std::string s = extractStringField(block, key); + if (!s.empty()) { + return static_cast(std::stoul(s, nullptr, 0)); + } + std::string needle = "\"" + key + "\":"; + size_t p = block.find(needle); + if (p == std::string::npos) return 0; + p += needle.size(); + while (p < block.size() && std::isspace((unsigned char)block[p])) p++; + size_t e = p; + while (e < block.size() && + (std::isdigit((unsigned char)block[e]) || + block[e] == 'x' || block[e] == 'X' || + (block[e] >= 'a' && block[e] <= 'f') || + (block[e] >= 'A' && block[e] <= 'F'))) e++; + if (e == p) return 0; + return static_cast(std::stoul(block.substr(p, e - p), + nullptr, 0)); +} + +static std::vector parseManifest(const std::string &path) { + std::ifstream f(path); + if (!f) die("cannot open '" + path + "' for reading"); + std::string text((std::istreambuf_iterator(f)), + std::istreambuf_iterator()); + std::vector segs; + // Find "segments": [ ... ] then split into per-segment {} blocks. + size_t arrStart = text.find("\"segments\""); + if (arrStart == std::string::npos) die("manifest missing 'segments'"); + arrStart = text.find('[', arrStart); + if (arrStart == std::string::npos) die("manifest 'segments' not array"); + size_t pos = arrStart + 1; + while (pos < text.size()) { + size_t obStart = text.find('{', pos); + if (obStart == std::string::npos) break; + // Match closing } via brace depth. 
+ int depth = 1; + size_t obEnd = obStart + 1; + while (obEnd < text.size() && depth > 0) { + if (text[obEnd] == '{') depth++; + else if (text[obEnd] == '}') depth--; + if (depth > 0) obEnd++; + } + if (depth != 0) die("manifest segment block unterminated"); + std::string block = text.substr(obStart, obEnd - obStart + 1); + ManifestSeg seg; + seg.num = extractNumberField(block, "num"); + seg.base = extractNumberField(block, "base"); + seg.entryOff = extractNumberField(block, "entry_offset"); + seg.image = extractStringField(block, "image"); + seg.name = extractStringField(block, "name"); + if (seg.image.empty()) die("manifest segment missing 'image'"); + if (seg.name.empty()) seg.name = "SEG" + std::to_string(seg.num); + segs.push_back(std::move(seg)); + pos = obEnd + 1; + size_t closing = text.find_first_not_of(" \t\n\r,", pos); + if (closing != std::string::npos && text[closing] == ']') break; + } + if (segs.empty()) die("manifest has no segments"); + return segs; +} + static uint32_t parseInt(const std::string &s) { char *end = nullptr; unsigned long v = std::strtoul(s.c_str(), &end, 0); @@ -146,17 +510,28 @@ static uint32_t parseInt(const std::string &s) { static void usage(const char *argv0) { std::fprintf(stderr, "usage: %s --input FLAT --map FILE --base ADDR --entry SYM\n" - " --output OMF [--name NAME]\n", - argv0); + " --output OMF [--name NAME] [--expressload]\n" + " [--relocs FILE]\n" + " %s --manifest MFEST --output OMF\n" + "\n" + " --expressload emit ExpressLoad-able OMF (required for boot\n" + " launchers under real GS/OS Loader).\n" + " --relocs FILE read IMM24 reloc list from link816's --reloc-out\n" + " sidecar; emit cRELOC (0xF5) opcodes after LCONST\n" + " so the Loader patches intra-segment 24-bit refs\n" + " (JSL/JML/STAlong/etc.) 
when placing the segment.\n", + argv0, argv0); std::exit(2); } } // namespace int main(int argc, char **argv) { - std::string input, mapFile, output, entry = "main", name; + std::string input, mapFile, output, entry = "main", name, manifest; + std::string relocFile; uint32_t base = 0; bool baseSet = false; + bool expressload = false; int i = 1; while (i < argc) { @@ -166,11 +541,70 @@ int main(int argc, char **argv) { else if (a == "--base") { if (++i >= argc) usage(argv[0]); base = parseInt(argv[i++]); baseSet = true; } else if (a == "--entry") { if (++i >= argc) usage(argv[0]); entry = argv[i++]; } else if (a == "--name") { if (++i >= argc) usage(argv[0]); name = argv[i++]; } + else if (a == "--manifest") { if (++i >= argc) usage(argv[0]); manifest = argv[i++]; } else if (a == "--output" || a == "-o") { if (++i >= argc) usage(argv[0]); output = argv[i++]; } + else if (a == "--expressload") { expressload = true; i++; } + else if (a == "--relocs") { if (++i >= argc) usage(argv[0]); relocFile = argv[i++]; } else if (a == "-h" || a == "--help") usage(argv[0]); else die("unknown option '" + a + "'"); } - if (input.empty() || mapFile.empty() || !baseSet || output.empty()) + if (output.empty()) usage(argv[0]); + + // Load IMM24 reloc list, if provided. 
+ if (!relocFile.empty()) { + auto raw = readFile(relocFile); + if (raw.size() < 4) die("--relocs file too small"); + uint32_t cnt = (uint32_t)raw[0] | ((uint32_t)raw[1] << 8) + | ((uint32_t)raw[2] << 16) | ((uint32_t)raw[3] << 24); + if (raw.size() != 4 + 8 * cnt) + die("--relocs file size mismatch: count=" + std::to_string(cnt) + + " expected " + std::to_string(4 + 8*cnt) + " bytes, got " + + std::to_string(raw.size())); + gReloc24Sites.reserve(cnt); + for (uint32_t k = 0; k < cnt; k++) { + size_t off = 4 + k * 8; + uint32_t patchOff = (uint32_t)raw[off] | ((uint32_t)raw[off+1] << 8) + | ((uint32_t)raw[off+2] << 16) | ((uint32_t)raw[off+3] << 24); + uint32_t offRef = (uint32_t)raw[off+4] | ((uint32_t)raw[off+5] << 8) + | ((uint32_t)raw[off+6] << 16) | ((uint32_t)raw[off+7] << 24); + if (patchOff > 0xFFFF || offRef > 0xFFFF) + die("reloc site out of 16-bit range — segment too large?"); + gReloc24Sites.emplace_back((uint16_t)patchOff, (uint16_t)offRef); + } + } + + // Multi-segment mode. + if (!manifest.empty()) { + auto segs = parseManifest(manifest); + std::vector blob; + size_t totalPayload = 0; + for (size_t k = 0; k < segs.size(); ++k) { + const auto &s = segs[k]; + auto img = readFile(s.image); + // Multi-segment: STATIC | ABSBANK | CODE. STATIC tells + // the loader not to relocate the segment (we baked all + // intra-segment relocations at link time and have no + // INTERSEG / RELOC opcodes); ABSBANK + ORG=base pins it + // to a specific bank. CODE is the default (type 0). + uint16_t kind = (k == 0) ? 0x8800u : 0x8800u; + uint32_t entryOff = (k == 0) ? 
s.entryOff : 0; + auto seg = emitOneSeg(img, entryOff, s.base, + static_cast(s.num), + kind, s.name); + blob.insert(blob.end(), seg.begin(), seg.end()); + totalPayload += img.size(); + } + std::ofstream f(output, std::ios::binary); + if (!f) die("cannot open '" + output + "' for writing"); + f.write(reinterpret_cast(blob.data()), blob.size()); + std::fprintf(stderr, + "OMF: %zu segments, %zu bytes payload -> %s (%zu bytes total)\n", + segs.size(), totalPayload, output.c_str(), blob.size()); + return 0; + } + + // Legacy single-segment mode (--input/--map/--base). + if (input.empty() || mapFile.empty() || !baseSet) usage(argv[0]); auto image = readFile(input); @@ -193,14 +627,18 @@ int main(int argc, char **argv) { name = (dot == std::string::npos) ? base_n : base_n.substr(0, dot); } - auto blob = emitOMF(image, entryOff, name); + auto blob = expressload + ? emitOmfExpressLoad(image, entryOff, name) + : emitOMF(image, entryOff, name); std::ofstream f(output, std::ios::binary); if (!f) die("cannot open '" + output + "' for writing"); f.write(reinterpret_cast(blob.data()), blob.size()); std::fprintf(stderr, - "OMF: 1 segment, %zu bytes payload, entry='%s' at +0x%x -> %s " + "OMF: %d segment%s%s, %zu bytes payload, entry='%s' at +0x%x -> %s " "(%zu bytes total)\n", + expressload ? 2 : 1, expressload ? "s" : "", + expressload ? " (ExpressLoad)" : "", image.size(), entry.c_str(), entryOff, output.c_str(), blob.size()); return 0;