From e65fedc8e17f00d4fbbb778fbfa7ed85d252c2fb Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Wed, 13 May 2026 15:48:34 -0500 Subject: [PATCH] Checkpoint --- STATUS.md | 171 +++- compare/README.md | 44 + compare/evalAt.c | 21 + compare/evalAt.calypsi.lst | 318 +++++++ compare/evalAt.ours.s | 593 +++++++++++++ compare/mul16to32.c | 4 + compare/mul16to32.calypsi.lst | 37 + compare/mul16to32.ours.s | 23 + compare/regen.sh | 44 + compare/sumSquares.c | 8 + compare/sumSquares.calypsi.lst | 68 ++ compare/sumSquares.ours.s | 93 +++ mame.ini | 416 ++++++++++ patches/0005-target-data-layout-w65816.patch | 2 +- plugin.ini | 17 + runtime/build.sh | 13 +- runtime/include/assert.h | 6 + runtime/include/complex.h | 100 +++ runtime/include/errno.h | 50 +- runtime/include/fenv.h | 51 ++ runtime/include/inttypes.h | 23 +- runtime/include/iso646.h | 20 + runtime/include/locale.h | 4 + runtime/include/math.h | 44 + runtime/include/stdalign.h | 13 + runtime/include/stdatomic.h | 138 ++++ runtime/include/stddef.h | 5 +- runtime/include/stdint.h | 9 +- runtime/include/stdio.h | 47 +- runtime/include/stdlib.h | 22 + runtime/include/stdnoreturn.h | 9 + runtime/include/string.h | 4 + runtime/include/tgmath.h | 97 +++ runtime/include/threads.h | 91 ++ runtime/include/time.h | 12 + runtime/include/uchar.h | 53 ++ runtime/include/wchar.h | 40 +- runtime/include/wctype.h | 84 ++ runtime/src/crt0.s | 96 ++- runtime/src/extras.c | 309 ++++++- runtime/src/libc.c | 116 +++ runtime/src/libgcc.s | 85 +- runtime/src/math.c | 109 +++ runtime/src/snprintf.c | 69 +- runtime/src/softDouble.c | 48 +- runtime/src/sscanf.c | 110 ++- runtime/src/timeExt.c | 54 +- scripts/runMultiSeg.sh | 4 +- scripts/smokeTest.sh | 781 +++++++++++++++++- src/clang/lib/Basic/Targets/W65816.h | 2 +- src/link816/link816.cpp | 117 ++- src/llvm/lib/Target/W65816/CMakeLists.txt | 3 + src/llvm/lib/Target/W65816/W65816.h | 25 + .../lib/Target/W65816/W65816AsmPrinter.cpp | 45 + .../lib/Target/W65816/W65816FrameLowering.cpp | 26 +- .../lib/Target/W65816/W65816I32IncFold.cpp | 225 +++++ .../lib/Target/W65816/W65816ISelLowering.cpp | 132 ++- .../lib/Target/W65816/W65816ISelLowering.h | 5 + .../lib/Target/W65816/W65816ImgCalleeSave.cpp | 278 +++++++ .../lib/Target/W65816/W65816InstrInfo.cpp | 159 +++- src/llvm/lib/Target/W65816/W65816InstrInfo.h | 18 +- src/llvm/lib/Target/W65816/W65816InstrInfo.td | 25 +- .../lib/Target/W65816/W65816NarrowI32Mul.cpp | 150 ++++ .../lib/Target/W65816/W65816RegisterInfo.cpp | 53 +- .../lib/Target/W65816/W65816SepRepCleanup.cpp | 392 +++++++++ .../Target/W65816/W65816StackSlotCleanup.cpp | 79 +- .../lib/Target/W65816/W65816TargetMachine.cpp | 81 +- ui.ini | 71 ++ 68 files changed, 6153 insertions(+), 308 deletions(-) create mode 100644 compare/README.md create mode 100644 compare/evalAt.c create mode 100644 compare/evalAt.calypsi.lst create mode 100644 compare/evalAt.ours.s create mode 100644 compare/mul16to32.c create mode 100644 compare/mul16to32.calypsi.lst create mode 100644 compare/mul16to32.ours.s create mode 100755 compare/regen.sh create mode 100644 compare/sumSquares.c create mode 100644 compare/sumSquares.calypsi.lst create mode 100644 compare/sumSquares.ours.s create mode 100644 mame.ini create mode 100644 plugin.ini create mode 100644 runtime/include/complex.h create mode 100644 runtime/include/fenv.h create mode 100644 runtime/include/iso646.h create mode 100644 runtime/include/stdalign.h create mode 100644 runtime/include/stdatomic.h create mode 100644 runtime/include/stdnoreturn.h create mode 100644 runtime/include/tgmath.h create mode 100644 runtime/include/threads.h create mode 100644 runtime/include/uchar.h create mode 100644 runtime/include/wctype.h create mode 100644 src/llvm/lib/Target/W65816/W65816I32IncFold.cpp create mode 100644 src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp create mode 100644 src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp create mode 100644 ui.ini diff --git a/STATUS.md b/STATUS.md index 70060b7..a285834 100644 --- a/STATUS.md +++ b/STATUS.md @@ -50,6 +50,12 @@ which runs correctly under MAME (apple2gs). coverage as printf (`%d %u %x %ld %lu %s %c %f %p %%` + width). C99 truncation semantics for snprintf. `%.Nf` produces the correct fractional digits with round-half-up. +- scanf family: `sscanf` / `vsscanf` parse a C string; `fscanf` / + `vfscanf` bridge to vsscanf via a per-call line buffer (caps at + 255 bytes / line; a longer line silently truncates). `scanf` + reads from stdin which always returns EOF on this target — the + surface compiles but isn't useful without a stdin source. + Format directives: `%d %i %u %x %X %o %s %c %ld %lu %lx %li %lo %%`. - qsort + bsearch over arbitrary element size with a user `cmp` callback. - Standard string/stdlib glue: strcat, strncat, strpbrk, strspn, @@ -91,12 +97,61 @@ which runs correctly under MAME (apple2gs). - ``: wcslen / wcscmp / wcsncmp / wcscpy / wcsncpy / wcscat / wcschr / wcsrchr; mbtowc / wctomb / mbstowcs / wcstombs / mblen with the trivial 1:1 byte<->wide mapping - (Latin-1). wchar_t is 16-bit on this target. + (Latin-1). wchar_t is 16-bit on this target. Extended set: + wmemcpy / wmemmove / wmemset / wmemcmp / wmemchr; + wcstol / wcstoul / wcstoll / wcstoull / wcstod / wcstof; + swprintf / vswprintf; wcsftime. All delegate to the byte + equivalents under the Latin-1 model. - ``: in-process signal table. signal() registers a handler; raise() invokes it. Default actions: SIGABRT calls abort(), SIGINT/SIGTERM call exit(128+sig), others ignored. - ``: setlocale always returns "C"; localeconv returns a fixed C-locale lconv struct. +- ``: rounding mode + exception flag word tracked but + no-op (softFloat / softDouble are fixed RNE; exceptions never + raised). Surface compiles cleanly for portable code. +- ``: C11 type-generic math via `_Generic`; selects + `sqrtf` vs `sqrt` etc. based on argument type. +- ``: C11 atomic surface, all ops lower to plain + ops (single-core uniprocessor — no real synchronization + needed). `_Atomic T` is treated as plain `T`. +- ``: stubs. `thrd_create` returns `thrd_error`; + mutex/cond ops are no-ops; `call_once` and `tss_*` work since + they're degenerate on a single-core target. +- `aligned_alloc` / `posix_memalign` / `aligned_free`: wrap + malloc with an over-allocation + pointer-stash trick. Match + C11 contract — `aligned_alloc(N, M)` returns N-aligned, free + with `aligned_free`. +- ``: alternative operator spellings (`and`, `or`, + `not`, etc.) — C95 compat header. +- ``: aliases `_Alignas` / `_Alignof` to `alignas` / + `alignof`. +- ``: aliases `_Noreturn` to `noreturn`. +- ``: `char16_t` / `char32_t` typedefs + `mbrtoc16` / + `c16rtomb` / `mbrtoc32` / `c32rtomb` conversion helpers. In + our Latin-1 model these are 1:1 byte copies (no UTF-8 decode). +- ``: wide-char classification + case folding. + Delegates to `` for code-points 0..255; anything + outside Latin-1 returns false / unchanged. +- ``: C99 complex-number surface — clang built-in + `_Complex` lowers to soft-double under the hood. Macros + `complex` / `_Complex_I` / `I` / `CMPLX` / `CMPLXF` / `CMPLXL` + plus inline `creal` / `cimag` / `conj` / `cproj` / `cabs` / + `carg` and their `f` / `l` variants. Transcendental complex + routines (csin/ccos/cexp/etc.) intentionally not provided — + they would each need a polynomial-expansion implementation + with limited IIgs value. +- ``: adds C11 `static_assert` as a macro alias for + the `_Static_assert` keyword. +- ``: full C standard error codes (EDOM, ERANGE, + EILSEQ) plus common POSIX codes (EPERM..EPIPE, ENAMETOOLONG, + ENOSYS, ENOTEMPTY, ELOOP). `strerror` maps every defined + code to a human-readable string. +- ``: adds C standard buffer-control surface + (`setvbuf` / `setbuf` as no-ops, `_IOFBF` / `_IOLBF` / `_IONBF` + / `BUFSIZ`); `fgetpos` / `fsetpos` wrap `ftell` / `fseek`; + `remove` routes through `mfsUnregister`; `rename` / `tmpfile` + / `tmpnam` are stubs. - C++ subset: classes, single inheritance, multiple inheritance (Drawable+Movable through one Sprite), virtual base diamond (A and B virtually derive Base; Diamond inherits from both @@ -162,7 +217,7 @@ which runs correctly under MAME (apple2gs). image addresses. - `runtime/build.sh` builds crt0, libc, soft-float, soft-double, libgcc into linkable objects. -- `scripts/smokeTest.sh` runs 132 end-to-end checks at -O2: +- `scripts/smokeTest.sh` runs 145 end-to-end checks at -O2: scalar ops, control flow, calling conventions, MAME execution regressions, link816 bss-base safety + weak-symbol resolution + heap_end-vs-heap_start sanity, iigs/toolbox.h compile + link, @@ -191,20 +246,23 @@ which runs correctly under MAME (apple2gs). - `scripts/benchCyclesPrecise.sh` measures per-call cycle counts via MAME's emulated time counter. Eight benchmarks under - `benchmarks/`. Current numbers: popcount 4876 cyc, bsearch - 938, memcmp 1330, strcpy 3325, dotProduct 4007, fib(10) 12958, - sumOfSquares 40920. Speed is the optimization priority, not + `benchmarks/`. Current numbers: popcount 3683 cyc, bsearch + 852, memcmp 1091, strcpy 2558, dotProduct 2387, fib(10) 12617, + sumOfSquares 23529. Speed is the optimization priority, not size. **Backend register allocation:** -- Basic regalloc as default at -O1+; fast at -O0/optnone. We use - basic instead of greedy because greedy fails ("ran out of - registers during register allocation") on functions with many - cross-call Acc16 vregs (the `ok |= bit; helper(); ok |= bit;` - pattern across many if-blocks). Basic handles those cleanly - with negligible code-size overhead vs greedy on the bench - suite (~0.6%). +- Greedy regalloc as default at -O1+; fast at -O0/optnone. Greedy + was previously blocked by an upstream LLVM `LiveRangeEdit::elimina- + teDeadDef` assertion firing on KILL pseudos with non-dead implicit- + def $a. Fix landed in `tools/llvm-mos/llvm/lib/CodeGen/InlineSpil- + ler.cpp`: when InlineSpiller converts a redundant STAfi to a KILL + pseudo, mark BOTH explicit and implicit defs dead (the original loop + only iterated `MI.defs()` = explicit-only, leaving the inherited + implicit-def $a live). Bench impact: popcount −19.4%, strcpy + −18.9%, memcmp −8.6%, bsearch −9.2%. + - Pre-RA passes: `WidenAcc16` (Acc16→Wide16 promotion, lets greedy spread i16 pressure across A and 16 IMG slots); `TiedDefSpill` (handles tied-def-multi-use hazard); @@ -259,29 +317,39 @@ for the common-case C / minimal-C++ workload. Priority is speed **Speed wins queued, ranked by expected impact:** -- **ptr32 pointer-increment overhead.** `*p++` under ptr32 emits - a full 32-bit `ADC` chain even when the high half is provably - unchanged, and LSR rewrites `*p++` into base+offset (worse on - W65816). strcpy/memcmp pay 30+ cycles per byte for what should - be 15-20. Tried `-disable-lsr` (strcpy −10%, dotProduct +10%) - and TTI `isLSRCostLess` override (memcmp +22% — worse); both - too risky without per-loop heuristics. Needs either a peephole - for `i32 + 1` with provably-no-carry-into-hi or per-loop LSR - override based on pointer-vs-array access pattern. +- **ptr32 pointer-increment overhead** (partially addressed). The + `i32 += 1` post-PEI peephole (`W65816I32IncFold`) detects the + 6-instruction LDA/ADCi16imm 1/STA/LDA/ADCEi16imm 0/STA pattern and + rewrites to LDA/INA/STA/INC_HI_IF_CARRY (with private-label BNE + expansion in AsmPrinter). Saves ~13 cyc per increment on the + no-carry common path. memcmp 1330 → 1194 (−10.2%), strcpy 3325 → + 3154 (−5.1%). LSR's `*p++ → base+offset` rewrite remains + unaddressed; tried `-disable-lsr` and `isLSRCostLess` override, + both regressed dotProduct. -- **Greedy regalloc retry.** Currently blocked on an upstream - LLVM `LiveRangeEdit::eliminateDeadDef` assertion when our - sub-register pair partial-defs reach it. Basic regalloc works - but leaves measurable cycle waste in load/store shuffles. +- **More peephole / libcall opportunities.** __mulsi3 just gained + early-exit when the multiplier shifts to 0; dotProduct dropped + 4007→2472 (−38.3%), sumOfSquares 40920→23870 (−41.6%). Next + candidates: a true 16×16→32 multiply libcall (for `(u32)i*i` + patterns) and shift-by-N inlining for shifts 5+ that currently + go through __ashlsi3. **Open limitations:** -- **Multi-bank BSS / init_array.** Multi-segment mode splits - `.text` across banks but BSS + init_array still live in - segment 1's bank (bank 0). Programs with zero-init data - exceeding the ~60KB bank-0 budget need crt0 to walk a - per-segment `(start, end)` table. Not a blocker for >64KB - *code* programs. +- **Multi-bank BSS** — full support up to 4 banks (256KB). link816 + splits BSS into up to 4 contiguous segments at link time; each + segment fits within a single bank. Linker emits + `__bss_seg{0..3}_lo16 / _bank / _size` symbols. crt0 walks the + table, setting DBR per segment. Per-segment size capped at + 0xFF00 so the 16-bit `cpx #__bss_segN_size` loop comparison + doesn't wrap to 0 on a full-bank segment (a single full bank is + split into a 0xFF00-byte primary + 0x100-byte tail in the same + bank). Smoke 137/137 validates BSS spanning bank 3 + bank 4 + (100KB) is zeroed end-to-end. Note: program access to non-DBR + bank globals still requires DBR management — the compiler emits + DBR-relative absolute for global accesses, so accessing BSS in + bank N needs the program to set DBR=N or use `sta long` via + inline asm. - **C++ exceptions absent from CI smoke.** The SJLJ runtime round-trip is in smoke; the full clang++ → backend → MAME @@ -295,13 +363,36 @@ for the common-case C / minimal-C++ workload. Priority is speed real bootable GS/OS volume is left out of CI as it needs a smartport hard-disk image and live Tool Locator init. -- **gmtime_r requires `optnone`.** IR-level optimizer issue: - loop rotation + IndVar simplify mis-evaluate `days >= 365L + - (__isLeap(...) ? 1 : 0)`, folding the comparison to - compile-time-false. Not a backend bug; needs IR-pass-level - diagnosis. +- **VLAs work end-to-end** (2026-05-09). Backend Custom-lowers + `ISD::DYNAMIC_STACKALLOC` for both i16 and i32 result types. + Loop patterns now produce correct results: `sum_n(3)→6` + verified in MAME smoke. Fix: in VLA functions PEI expands + STAfi/STA8fi/STAfi_indY to a 4-MC sequence ending in `LDY $F8` + which clobbers N/Z; the StackSlotCleanup PHP/PLP wrap pass + treats those pseudos as flag-corrupting so PLP wraps the entire + expansion. `expandFarFI` uses `STY $F8`/`LDY $F8` to a DP + scratch slot rather than PHY/PLY (PHY/PLY between PHP/PLP would + pollute the saved P). -- **softDouble `dpack` / `dclass` require `noinline`.** - Inlining triggers register pressure that overflows basic - regalloc in `__adddf3`/`__muldf3`/`__divdf3`. Architectural - for the same reason as qsort's earlier split. +- **dpack and dclass now both inline** (2026-05-10). dpack uses + a volatile-output array rewrite to defeat the backend stack-slot + coalesce bug that previously caused dadd(1.5, 2.5) → + 0x4010_4010_0000_0000. dclass's pointer-arg stores lower to + STBptr/STAptr (indirect-long, DBR-independent) and inline + cleanly. All softDouble routines compile at -O2. + +- **IMG8..IMG15 callee-save via W65816ImgCalleeSave** (2026-05-13). + New post-RA, pre-PEI pass detects use of IMG8..IMG15 ($C0..$CE) + in a function and emits prologue save + epilogue restore so those + slots behave as callee-saved AT THE ASM LEVEL — without going + through LLVM's CSR mechanism (which would shift regalloc decisions + and break unrelated tests). Save shape per used slot: `PHA; LDA + $C?; STAfi A,slot,2; PLA`; restore mirrors it. The `+2` ImmOffset + compensates for PHA's SP shift so the lowered `sta d,s` lands on + the same byte that subsequent normal-SP reads see. Cost: ~16 + cycles + 6 bytes per used slot, applied only to functions that + actually use those slots (most don't). Fixed picol `expr 1+2 == 4` + (now `3`) and a class of recursive double-fn miscompiles with + compound `||` conditions — see `feedback_picol_expr_compound_or.md`. + Smoke 149/149 green including a new orBug regression test guarding + the fix. diff --git a/compare/README.md b/compare/README.md new file mode 100644 index 0000000..05ee9d4 --- /dev/null +++ b/compare/README.md @@ -0,0 +1,44 @@ +# compare/ — backend output side-by-side with Calypsi 5.16 + +Each test case lives as three files: + +- `.c` — the C source. +- `.ours.s` — our backend's assembly (`clang --target=w65816 -O2 -S`). +- `.calypsi.lst` — Calypsi's listing file with source, hex bytes, and asm + in one document (`cc65816 --speed -O 2 --64bit-doubles`). + +Calypsi's `--output` flag emits an ELF object, not text — its `--list-file` is the +human-readable artifact. (32-bit-doubles is Calypsi's default; we pass +`--64bit-doubles` so FP-heavy tests compare apples to apples against our IEEE-754 +`double` ABI.) + +## Regenerating + +``` +bash compare/regen.sh +``` + +Recompiles every `*.c` in this directory under both compilers and prints an +instruction-count summary: + +``` +test ours calypsi ratio +---- ---- ------- ----- +evalAt 419 268 1.56x +mul16to32 12 11 1.09x +sumSquares 72 31 2.32x +``` + +(Numbers above are illustrative — re-run to see current state.) + +## Adding a new comparison + +Drop a `.c` in this directory and run `regen.sh`. No other wiring needed. + +## Counting methodology + +The summary counts asm-line opcodes (lda/sta/jsl/...) on our side and listing +lines that begin with a hex byte (Calypsi's emit-byte column) on theirs. +Both metrics are static instruction counts, NOT bytes. They underestimate +calls-to-runtime (each libcall counts as one `jsl`, not the body it expands to). +For cycle counts, use `scripts/benchCyclesPrecise.sh`. diff --git a/compare/evalAt.c b/compare/evalAt.c new file mode 100644 index 0000000..e739f2e --- /dev/null +++ b/compare/evalAt.c @@ -0,0 +1,21 @@ +// Benchmark function — orBug-style recursive double expression eval. +// Used to compare W65816 backend (with W65816ImgCalleeSave pass) vs Calypsi. +double evalAt(char **p, int prec) { + double a = 0.0; + while (**p >= '0' && **p <= '9') { + a = a * 10.0 + (double)(**p - '0'); + (*p)++; + } + while (1) { + int op = **p; + int oprec; + if (op == '*' || op == '/') oprec = 4; + else if (op == '+' || op == '-') oprec = 3; + else return a; + if (oprec <= prec) return a; + (*p)++; + double b = evalAt(p, oprec); + if (op == '+') a = a + b; + else if (op == '*') a = a * b; + } +} diff --git a/compare/evalAt.calypsi.lst b/compare/evalAt.calypsi.lst new file mode 100644 index 0000000..e79fa69 --- /dev/null +++ b/compare/evalAt.calypsi.lst @@ -0,0 +1,318 @@ +############################################################################### +# # +# Calypsi ISO C compiler for 65816 version 5.16 # +# 13/May/2026 15:46:15 # +# Command line: --speed -O 2 --64bit-doubles evalAt.c -o # +# /tmp/evalAt.calypsi.elf --list-file evalAt.calypsi.lst # +# # +############################################################################### + + \ 000000 .rtmodel version,"1" + \ 000000 .rtmodel codeModel,"large" + \ 000000 .rtmodel dataModel,"small" + \ 000000 .rtmodel core,"65816" + \ 000000 .rtmodel huge,"0" + \ 000000 .rtmodel doubleSize,"64" + \ 000000 .rtmodel target,"none-specified" + \ 000000 .extern _Dp + \ 000000 .extern _Vfp + \ 000000 .extern __f64_add + \ 000000 .extern __f64_mul + \ 000000 .extern __i32_to_f64 +0001 // Benchmark function — orBug-style recursive double expression eval. +0002 // Used to compare W65816 backend (with W65816ImgCalleeSave pass) vs Calypsi. +0003 double evalAt(char **p, int prec) { + \ 000000 .section farcode,text + \ 000000 .public evalAt + \ 000000 evalAt: + \ 000000 d4.. pei dp:.tiny (_Dp+8) + \ 000002 a8 tay + \ 000003 3b tsc + \ 000004 38 sec + \ 000005 e92600 sbc ##38 + \ 000008 1b tcs + \ 000009 98 tya + \ 00000a 831d sta 29,s + \ 00000c a5.. lda dp:.tiny _Dp + \ 00000e 831b sta 27,s + \ 000010 a5.. lda dp:.tiny (_Dp+2) + \ 000012 85.. sta dp:.tiny (_Dp+8) +0004 double a = 0.0; + \ 000014 ad.... lda _Const_0000000000000000+6 + \ 000017 8309 sta 9,s + \ 000019 ad.... lda _Const_0000000000000000+4 + \ 00001c 8307 sta 7,s + \ 00001e ad.... lda _Const_0000000000000000+2 + \ 000021 8305 sta 5,s + \ 000023 ad.... lda _Const_0000000000000000 + \ 000026 8303 sta 3,s +0005 while (**p >= '0' && **p <= '9') { + \ 000028 a309 lda 9,s + \ 00002a 8319 sta 25,s + \ 00002c a307 lda 7,s + \ 00002e 8317 sta 23,s + \ 000030 a305 lda 5,s + \ 000032 8315 sta 21,s + \ 000034 a303 lda 3,s + \ 000036 8313 `?L41`: sta 19,s + \ 000038 22...... jsl long:`?L44` + \ 00003c e220 sep #32 + \ 00003e c930 cmp #48 + \ 000040 c220 rep #32 + \ 000042 b003 bcs `?L48` + \ 000044 4c.... jmp .kbank `?L5` + \ 000047 a31b `?L48`: lda 27,s + \ 000049 a8 tay + \ 00004a be0000 ldx 0,y + \ 00004d a93900 lda ##57 + \ 000050 e220 sep #32 + \ 000052 dd0000 cmp 0,x + \ 000055 c220 rep #32 + \ 000057 9072 bcc `?L5` +0006 a = a * 10.0 + (double)(**p - '0'); + \ 000059 a2.... ldx ##_Const_4024000000000000 + \ 00005c 86.. stx dp:.tiny (_Dp+2) + \ 00005e 3b tsc + \ 00005f 18 clc + \ 000060 691300 adc ##19 + \ 000063 85.. sta dp:.tiny _Dp + \ 000065 3b tsc + \ 000066 18 clc + \ 000067 690300 adc ##3 + \ 00006a 22...... jsl long:__f64_mul + \ 00006e 22...... jsl long:`?L44` + \ 000072 29ff00 and ##255 + \ 000075 38 sec + \ 000076 e93000 sbc ##48 + \ 000079 a20000 ldx ##0 + \ 00007c a8 tay + \ 00007d 1001 bpl `?L31` + \ 00007f ca dex + \ 000080 `?L31`: + \ 000080 86.. stx dp:.tiny (_Dp+2) + \ 000082 85.. sta dp:.tiny _Dp + \ 000084 3b tsc + \ 000085 18 clc + \ 000086 690b00 adc ##11 + \ 000089 22...... jsl long:__i32_to_f64 + \ 00008d 3b tsc + \ 00008e 18 clc + \ 00008f 690b00 adc ##11 + \ 000092 85.. sta dp:.tiny (_Dp+2) + \ 000094 3b tsc + \ 000095 18 clc + \ 000096 690300 adc ##3 + \ 000099 85.. sta dp:.tiny _Dp + \ 00009b 3b tsc + \ 00009c 18 clc + \ 00009d 690300 adc ##3 + \ 0000a0 22...... jsl long:__f64_add + \ 0000a4 a309 lda 9,s + \ 0000a6 8311 sta 17,s + \ 0000a8 a307 lda 7,s + \ 0000aa 830f sta 15,s + \ 0000ac a305 lda 5,s + \ 0000ae 830d sta 13,s + \ 0000b0 a303 lda 3,s + \ 0000b2 830b sta 11,s +0007 (*p)++; + \ 0000b4 a31b lda 27,s + \ 0000b6 aa tax + \ 0000b7 fe0000 inc 0,x + \ 0000ba a311 lda 17,s + \ 0000bc 8319 sta 25,s + \ 0000be a30f lda 15,s + \ 0000c0 8317 sta 23,s + \ 0000c2 a30d lda 13,s + \ 0000c4 8315 sta 21,s + \ 0000c6 a30b lda 11,s + \ 0000c8 4c.... jmp .kbank `?L41` + \ 0000cb `?L5`: +0008 } +0009 while (1) { + \ 0000cb a319 lda 25,s + \ 0000cd 8325 sta 37,s + \ 0000cf a317 lda 23,s + \ 0000d1 8323 sta 35,s + \ 0000d3 a315 lda 21,s + \ 0000d5 8321 sta 33,s + \ 0000d7 a313 lda 19,s + \ 0000d9 831f `?L40`: sta 31,s +0010 int op = **p; +0011 int oprec; +0012 if (op == '*' || op == '/') oprec = 4; + \ 0000db 22...... jsl long:`?L44` + \ 0000df 29ff00 and ##255 + \ 0000e2 830b sta 11,s + \ 0000e4 c92a00 cmp ##42 + \ 0000e7 f016 beq `?L12` + \ 0000e9 c92f00 cmp ##47 + \ 0000ec f011 beq `?L12` +0013 else if (op == '+' || op == '-') oprec = 3; + \ 0000ee c92b00 cmp ##43 + \ 0000f1 f005 beq `?L15` + \ 0000f3 c92d00 cmp ##45 + \ 0000f6 d018 bne `?L19` + \ 0000f8 a90300 `?L15`: lda ##3 + \ 0000fb 8301 sta 1,s + \ 0000fd 8005 bra `?L11` + \ 0000ff a90400 `?L12`: lda ##4 + \ 000102 8301 sta 1,s + \ 000104 `?L11`: +0014 else return a; +0015 if (oprec <= prec) return a; + \ 000104 a5.. lda dp:.tiny (_Dp+8) + \ 000106 38 sec + \ 000107 e301 sbc 1,s + \ 000109 5003 bvc `?L35` + \ 00010b 490080 eor ##-32768 + \ 00010e 302a `?L35`: bmi `?L18` + \ 000110 a325 `?L19`: lda 37,s + \ 000112 a00600 ldy ##6 + \ 000115 931d sta (29,s),y + \ 000117 a323 lda 35,s + \ 000119 a00400 ldy ##4 + \ 00011c 931d sta (29,s),y + \ 00011e a321 lda 33,s + \ 000120 a00200 ldy ##2 + \ 000123 931d sta (29,s),y + \ 000125 a31f lda 31,s + \ 000127 a00000 ldy ##0 + \ 00012a 931d sta (29,s),y + \ 00012c a31d lda 29,s +0016 (*p)++; +0017 double b = evalAt(p, oprec); +0018 if (op == '+') a = a + b; +0019 else if (op == '*') a = a * b; +0020 } +0021 } + \ 00012e a8 tay + \ 00012f 3b tsc + \ 000130 18 clc + \ 000131 692600 adc ##38 + \ 000134 1b tcs + \ 000135 98 tya + \ 000136 7a ply + \ 000137 84.. sty dp:.tiny (_Dp+8) + \ 000139 6b rtl + \ 00013a a31b `?L18`: lda 27,s + \ 00013c aa tax + \ 00013d fe0000 inc 0,x + \ 000140 a301 lda 1,s + \ 000142 85.. sta dp:.tiny (_Dp+2) + \ 000144 a31b lda 27,s + \ 000146 85.. sta dp:.tiny _Dp + \ 000148 3b tsc + \ 000149 18 clc + \ 00014a 690300 adc ##3 + \ 00014d 22...... jsl long:evalAt + \ 000151 a30b lda 11,s + \ 000153 c92b00 cmp ##43 + \ 000156 d037 bne `?L21` + \ 000158 3b tsc + \ 000159 18 clc + \ 00015a 690300 adc ##3 + \ 00015d 85.. sta dp:.tiny (_Dp+2) + \ 00015f 3b tsc + \ 000160 18 clc + \ 000161 691f00 adc ##31 + \ 000164 85.. sta dp:.tiny _Dp + \ 000166 3b tsc + \ 000167 18 clc + \ 000168 690300 adc ##3 + \ 00016b 22...... jsl long:__f64_add + \ 00016f a309 lda 9,s + \ 000171 8319 sta 25,s + \ 000173 a307 lda 7,s + \ 000175 8317 sta 23,s + \ 000177 a305 lda 5,s + \ 000179 8315 sta 21,s + \ 00017b a303 lda 3,s + \ 00017d 8313 sta 19,s + \ 00017f a319 lda 25,s + \ 000181 8311 sta 17,s + \ 000183 a317 lda 23,s + \ 000185 830f sta 15,s + \ 000187 a315 lda 21,s + \ 000189 830d sta 13,s + \ 00018b a313 lda 19,s + \ 00018d 805a bra `?L43` + \ 00018f c92a00 `?L21`: cmp ##42 + \ 000192 d037 bne `?L24` + \ 000194 3b tsc + \ 000195 18 clc + \ 000196 690300 adc ##3 + \ 000199 85.. sta dp:.tiny (_Dp+2) + \ 00019b 3b tsc + \ 00019c 18 clc + \ 00019d 691f00 adc ##31 + \ 0001a0 85.. sta dp:.tiny _Dp + \ 0001a2 3b tsc + \ 0001a3 18 clc + \ 0001a4 690300 adc ##3 + \ 0001a7 22...... jsl long:__f64_mul + \ 0001ab a309 lda 9,s + \ 0001ad 8311 sta 17,s + \ 0001af a307 lda 7,s + \ 0001b1 830f sta 15,s + \ 0001b3 a305 lda 5,s + \ 0001b5 830d sta 13,s + \ 0001b7 a303 lda 3,s + \ 0001b9 830b sta 11,s + \ 0001bb a311 lda 17,s + \ 0001bd 8309 sta 9,s + \ 0001bf a30f lda 15,s + \ 0001c1 8307 sta 7,s + \ 0001c3 a30d lda 13,s + \ 0001c5 8305 sta 5,s + \ 0001c7 a30b lda 11,s + \ 0001c9 800e bra `?L42` + \ 0001cb a325 `?L24`: lda 37,s + \ 0001cd 8309 sta 9,s + \ 0001cf a323 lda 35,s + \ 0001d1 8307 sta 7,s + \ 0001d3 a321 lda 33,s + \ 0001d5 8305 sta 5,s + \ 0001d7 a31f lda 31,s + \ 0001d9 8303 `?L42`: sta 3,s + \ 0001db a309 lda 9,s + \ 0001dd 8311 sta 17,s + \ 0001df a307 lda 7,s + \ 0001e1 830f sta 15,s + \ 0001e3 a305 lda 5,s + \ 0001e5 830d sta 13,s + \ 0001e7 a303 lda 3,s + \ 0001e9 830b `?L43`: sta 11,s + \ 0001eb a311 lda 17,s + \ 0001ed 8325 sta 37,s + \ 0001ef a30f lda 15,s + \ 0001f1 8323 sta 35,s + \ 0001f3 a30d lda 13,s + \ 0001f5 8321 sta 33,s + \ 0001f7 a30b lda 11,s + \ 0001f9 4c.... jmp .kbank `?L40` + \ 000000 .section farcode,text + \ 000000 a31e `?L44`: lda 30,s + \ 000002 a8 tay + \ 000003 be0000 ldx 0,y + \ 000006 bd0000 lda 0,x + \ 000009 6b rtl + \ 000000 .section cdata,rodata + \ 000000 .pubweak _Const_0000000000000000 + \ 000000 _Const_0000000000000000: + \ 000000 00000000 .quad 0 + \ 000004 00000000 + \ 000000 .section cdata,rodata + \ 000000 .pubweak _Const_4024000000000000 + \ 000000 _Const_4024000000000000: + \ 000000 00000000 .quad 0x4024000000000000 + \ 000004 00002440 + +########################## +# # +# Memory sizes (decimal) # +# # +########################## + +Executable (Text): 518 bytes +Constant : 16 bytes diff --git a/compare/evalAt.ours.s b/compare/evalAt.ours.s new file mode 100644 index 0000000..cd8be47 --- /dev/null +++ b/compare/evalAt.ours.s @@ -0,0 +1,593 @@ + .file "evalAt.c" + .text + .globl evalAt ; -- Begin function evalAt + .type evalAt,@function +evalAt: ; @evalAt +; %bb.0: ; %entry + rep #0x30 + tay + tsc + sec + sbc #0x46 + tcs + tya + pha + lda 0xc0 + sta 0xb, s + lda 0xc4 + sta 0x9, s + lda 0xc6 + sta 0x7, s + lda 0xc8 + sta 0x5, s + lda 0xca + sta 0x3, s + pla + stx 0xc0 + sta 0x19, s + clc + adc #0x2 + sta 0x1f, s + lda 0xc0 + sta 0x21, s + adc #0x0 + sta 0x21, s + lda 0x1f, s + sta 0x45, s + lda 0x21, s + sta 0x43, s + lda 0x45, s + sta 0xe0 + lda 0x43, s + sta 0xe2 + ldy #0x0 + lda [0xe0 ], y + sta 0x1d, s + lda 0x19, s + sta 0x41, s + pha + lda 0xc0 + sta 0x41, s + pla + lda 0x41, s + sta 0xe0 + lda 0x3f, s + sta 0xe2 + lda [0xe0 ], y + sta 0x21, s + lda 0x4a, s + sta 0xb, s + lda #0x0 + sta 0xc4 + sta 0xc6 + lda 0x21, s + sta 0x3d, s + lda 0x1d, s + sta 0x3b, s + lda 0x3d, s + sta 0xe0 + lda 0x3b, s + sta 0xe2 + lda [0xe0 ], y + and #0xff + sta 0x1b, s + sep #0x20 + clc + adc #0xd0 + rep #0x20 + and #0xff + cmp #0xa + pha + lda 0xc4 + sta 0xc8 + pla + pha + lda 0xc6 + sta 0xca + pla + bcc .LBB0_1 +; %bb.15: ; %entry + brl .LBB0_4 +.LBB0_1: ; %while.body.preheader + lda 0x21, s + inc a + sta 0x21, s + bne .Ltmp0 + lda 0x1d, s + inc a + sta 0x1d, s +.Ltmp0: + lda #0x0 + sta 0x15, s + sta 0x13, s + sta 0x11, s + sta 0xf, s + lda 0x1d, s + sta 0x17, s +.LBB0_2: ; %while.body + ; =>This Inner Loop Header: Depth=1 + sta 0x1d, s + lda 0x19, s + tax + pha + lda 0xc0 + sta 0x3b, s + pla + txa + sta 0xe0 + lda 0x39, s + sta 0xe2 + lda 0x21, s + ldy #0x0 + sta [0xe0 ], y + lda 0x19, s + clc + adc #0x2 + sta 0xd, s + lda 0xc0 + sta 0x1f, s + adc #0x0 + sta 0x1f, s + lda 0xd, s + sta 0x37, s + lda 0x1f, s + tax + lda 0x37, s + sta 0xe0 + txa + sta 0xe2 + lda 0x1d, s + sta [0xe0 ], y + pea 0x4024 + pea 0x0 + pea 0x0 + pea 0x0 + lda 0x17, s + pha + lda 0x1b, s + pha + lda 0x1f, s + tax + lda 0x21, s + jsl __muldf3 + sta 0xe0 + tsc + clc + adc #0xc + tcs + lda 0xe0 + sta 0x1f, s + txa + sta 0x15, s + tya + sta 0x13, s + lda 0xf0 + sta 0x11, s + lda 0x1b, s + sep #0x20 + clc + adc #0xd0 + rep #0x20 + and #0xff + sta 0x1b, s + ldx #0x0 + lda 0x1b, s + jsl __floatunsidf + sta 0x1b, s + txa + sta 0xf, s + tya + sta 0xd, s + pei 0xf0 + lda 0xf, s + pha + lda 0x13, s + tax + phx + lda 0x21, s + pha + lda 0x19, s + pha + lda 0x1d, s + pha + lda 0x21, s + tax + lda 0x2b, s + jsl __adddf3 + sta 0xe0 + tsc + clc + adc #0xc + tcs + lda 0xe0 + sta 0x15, s + txa + sta 0x13, s + tya + sta 0x11, s + lda 0xf0 + sta 0xf, s + lda 0x21, s + sta 0xd0 + tax + lda 0x21, s + clc + adc #0x1 + sta 0x21, s + txa + lda 0xd0 + sta 0x1f, s + lda 0x17, s + adc #0x0 + sta 0x17, s + lda 0x11, s + sta 0xc8 + lda 0xf, s + sta 0xca + lda 0x15, s + sta 0xc4 + lda 0x13, s + sta 0xc6 + lda 0x1f, s + sta 0x35, s + lda 0x1d, s + tax + lda 0x35, s + sta 0xe0 + txa + sta 0xe2 + ldy #0x0 + lda [0xe0 ], y + and #0xff + sta 0x1b, s + sep #0x20 + clc + adc #0xd0 + rep #0x20 + and #0xff + cmp #0xa + lda 0x17, s + bcs .LBB0_3 +; %bb.16: ; %while.body + ; in Loop: Header=BB0_2 Depth=1 + brl .LBB0_2 +.LBB0_3: ; %while.cond7.preheader.loopexit + lda 0x21, s + clc + adc #0xffff + sta 0x21, s + lda 0x17, s + adc #0xffff + sta 0x1d, s +.LBB0_4: ; %while.cond7.preheader + lda 0xb, s + eor #0x8000 + sta 0xb, s + lda 0x1b, s + brl .LBB0_5 +.LBB0_11: ; %if.then33 + ; in Loop: Header=BB0_5 Depth=1 + lda 0xc6 + sta 0x1b, s + lda 0xc4 + sta 0x15, s + lda 0xca + sta 0x11, s + lda 0xc8 + sta 0x13, s + lda 0x17, s + pha + lda 0x1f, s + pha + lda 0x23, s + pha + lda 0x27, s + pha + lda 0x19, s + pha + lda 0x1d, s + pha + lda 0x27, s + tax + lda 0x21, s + jsl __muldf3 +.LBB0_12: ; %cleanup + ; in Loop: Header=BB0_5 Depth=1 + sta 0xe0 + tsc + clc + adc #0xc + tcs + lda 0xe0 + sta 0x21, s + txa + sta 0x1f, s + tya + sta 0x1d, s + lda 0xf0 + sta 0x1b, s + lda 0x1d, s + sta 0xc8 + lda 0x1b, s + sta 0xca + lda 0x21, s + sta 0xc4 + lda 0x1f, s + sta 0xc6 +.LBB0_13: ; %cleanup + ; in Loop: Header=BB0_5 Depth=1 + lda 0x19, s + clc + adc #0x2 + sta 0x1f, s + lda 0xc0 + sta 0x21, s + adc #0x0 + sta 0x21, s + lda 0x1f, s + sta 0x25, s + lda 0x21, s + tax + lda 0x25, s + sta 0xe0 + txa + sta 0xe2 + ldy #0x0 + lda [0xe0 ], y + sta 0x1d, s + lda 0x19, s + tax + pha + lda 0xc0 + sta 0x25, s + pla + txa + sta 0xe0 + lda 0x23, s + sta 0xe2 + lda [0xe0 ], y + sta 0x21, s + lda 0x1d, s + tax + lda 0x21, s + sta 0xe0 + txa + sta 0xe2 + lda [0xe0 ], y + and #0xff +.LBB0_5: ; %while.cond7 + ; =>This Inner Loop Header: Depth=1 + sta 0x1b, s + sep #0x20 + clc + adc #0xd6 + rep #0x20 + and #0xff + sta 0x1f, s + lda 0x1f, s + pha + lda #0x2b + jsl __lshrhi3 + ply + sta 0x17, s + lda 0x1f, s + cmp #0x6 + bcc .LBB0_6 +; %bb.17: ; %while.cond7 + brl .LBB0_14 +.LBB0_6: ; %while.cond7 + ; in Loop: Header=BB0_5 Depth=1 + lda 0x17, s + and #0x1 + sta 0x17, s + lda #0x0 + sta 0x33, s + lda 0x17, s + ora 0x33, s + bne .LBB0_7 +; %bb.18: ; %while.cond7 + brl .LBB0_14 +.LBB0_7: ; %switch.lookup + ; in Loop: Header=BB0_5 Depth=1 + lda #0x0 + asl a + sta 0x17, s + lda 0x1f, s + asl a + lda #0x0 + rol a + sta 0x31, s + lda 0x17, s + ora 0x31, s + sta 0x17, s + lda 0x1f, s + asl a + sta 0x1f, s + lda #.Lswitch.table.evalAt + sta 0x2f, s + lda 0x1f, s + clc + adc 0x2f, s + sta 0x1f, s + lda #0x0 + sta 0x2d, s + lda 0x17, s + adc 0x2d, s + sta 0x17, s + lda 0x1f, s + sta 0x2b, s + lda 0x17, s + tax + lda 0x2b, s + sta 0xe0 + txa + sta 0xe2 + ldy #0x0 + lda [0xe0 ], y + sta 0x1f, s + lda 0x1f, s + tax + eor #0x8000 + sta 0x1f, s + txa + sta 0x17, s + lda 0xb, s + cmp 0x1f, s + bcc .LBB0_8 +; %bb.19: ; %switch.lookup + brl .LBB0_14 +.LBB0_8: ; %if.end25 + ; in Loop: Header=BB0_5 Depth=1 + lda 0x21, s + inc a + sta 0x21, s + bne .Ltmp1 + lda 0x1d, s + inc a + sta 0x1d, s +.Ltmp1: + lda 0x19, s + tax + pha + lda 0xc0 + sta 0x2b, s + pla + txa + sta 0xe0 + lda 0x29, s + sta 0xe2 + lda 0x21, s + ldy #0x0 + sta [0xe0 ], y + lda 0x19, s + sta 0xd0 + clc + adc #0x2 + sta 0x1f, s + lda 0xd0 + sta 0x21, s + lda 0xc0 + adc #0x0 + sta 0x15, s + lda 0x1f, s + sta 0x27, s + lda 0x15, s + tax + lda 0x27, s + sta 0xe0 + txa + sta 0xe2 + lda 0x1d, s + sta [0xe0 ], y + lda 0x17, s + pha + ldx 0xc0 + lda 0x23, s + jsl evalAt + sta 0xe0 + tsc + clc + adc #0x2 + tcs + lda 0xe0 + sta 0x21, s + txa + sta 0x1f, s + tya + sta 0x1d, s + lda 0xf0 + sta 0x17, s + lda 0x1b, s + and #0xff + cmp #0x2a + bne .LBB0_9 +; %bb.20: ; %if.end25 + ; in Loop: Header=BB0_5 Depth=1 + brl .LBB0_11 +.LBB0_9: ; %if.end25 + ; in Loop: Header=BB0_5 Depth=1 + cmp #0x2b + beq .LBB0_10 +; %bb.21: ; %if.end25 + ; in Loop: Header=BB0_5 Depth=1 + brl .LBB0_13 +.LBB0_10: ; %if.then29 + ; in Loop: Header=BB0_5 Depth=1 + lda 0xc6 + sta 0x1b, s + lda 0xc4 + sta 0x15, s + lda 0xca + sta 0x11, s + lda 0xc8 + sta 0x13, s + lda 0x17, s + pha + lda 0x1f, s + pha + lda 0x23, s + pha + lda 0x27, s + pha + lda 0x19, s + pha + lda 0x1d, s + pha + lda 0x27, s + tax + lda 0x21, s + jsl __adddf3 + brl .LBB0_12 +.LBB0_14: ; %cleanup37 + lda 0xc6 + sta 0x21, s + lda 0xc4 + sta 0x1f, s + lda 0xca + sta 0x1b, s + lda 0xc8 + sta 0x1d, s + lda 0x1b, s + sta 0xf0 + lda 0x1d, s + tay + lda 0x21, s + tax + lda 0x1f, s + pha + lda 0x3, s + sta 0xca + lda 0x5, s + sta 0xc8 + lda 0x7, s + sta 0xc6 + lda 0x9, s + sta 0xc4 + lda 0xb, s + sta 0xc0 + pla + sta 0xe0 + tsc + clc + adc #0x46 + tcs + lda 0xe0 + rtl +.Lfunc_end0: + .size evalAt, .Lfunc_end0-evalAt + ; -- End function + .type .Lswitch.table.evalAt,@object ; @switch.table.evalAt + .section .rodata,"a",@progbits + .p2align 1, 0x0 +.Lswitch.table.evalAt: + .short 4 ; 0x4 + .short 3 ; 0x3 + .zero 2 + .short 3 ; 0x3 + .zero 2 + .short 4 ; 0x4 + .size .Lswitch.table.evalAt, 12 + + .ident "clang version 23.0.0git (https://github.com/llvm-mos/llvm-mos.git c798c31416f72b395c658b5502d281a162387ab1)" + .section ".note.GNU-stack","",@progbits + .addrsig diff --git a/compare/mul16to32.c b/compare/mul16to32.c new file mode 100644 index 0000000..7988ac6 --- /dev/null +++ b/compare/mul16to32.c @@ -0,0 +1,4 @@ +// Explicit zext pattern - should trigger the combine. +unsigned long mul16to32(unsigned short a, unsigned short b) { + return (unsigned long)a * (unsigned long)b; +} diff --git a/compare/mul16to32.calypsi.lst b/compare/mul16to32.calypsi.lst new file mode 100644 index 0000000..8df288a --- /dev/null +++ b/compare/mul16to32.calypsi.lst @@ -0,0 +1,37 @@ +############################################################################### +# # +# Calypsi ISO C compiler for 65816 version 5.16 # +# 13/May/2026 15:46:15 # +# Command line: --speed -O 2 --64bit-doubles mul16to32.c -o # +# /tmp/mul16to32.calypsi.elf --list-file # +# mul16to32.calypsi.lst # +# # +############################################################################### + + \ 000000 .rtmodel version,"1" + \ 000000 .rtmodel codeModel,"large" + \ 000000 .rtmodel dataModel,"small" + \ 000000 .rtmodel core,"65816" + \ 000000 .rtmodel huge,"0" + \ 000000 .rtmodel target,"none-specified" + \ 000000 .extern _Dp + \ 000000 .extern _Mul16 + \ 000000 .extern _Vfp +0001 // Explicit zext pattern - should trigger the combine. +0002 unsigned long mul16to32(unsigned short a, unsigned short b) { + \ 000000 .section farcode,text + \ 000000 .public mul16to32 + \ 000000 aa mul16to32: tax +0003 return (unsigned long)a * (unsigned long)b; + \ 000001 a5.. lda dp:.tiny _Dp + \ 000003 22...... jsl long:_Mul16 +0004 } + \ 000007 6b rtl + +########################## +# # +# Memory sizes (decimal) # +# # +########################## + +Executable (Text): 8 bytes diff --git a/compare/mul16to32.ours.s b/compare/mul16to32.ours.s new file mode 100644 index 0000000..0e39aa6 --- /dev/null +++ b/compare/mul16to32.ours.s @@ -0,0 +1,23 @@ + .file "mul16to32.c" + .text + .globl mul16to32 ; -- Begin function mul16to32 + .type mul16to32,@function +mul16to32: ; @mul16to32 +; %bb.0: ; %entry + rep #0x30 + pha + pha + lda 0x8, s + jsl __umulhisi3 + ply + sta 0x1, s + lda 0x1, s + ply + rtl +.Lfunc_end0: + .size mul16to32, .Lfunc_end0-mul16to32 + ; -- End function + .ident "clang version 23.0.0git (https://github.com/llvm-mos/llvm-mos.git c798c31416f72b395c658b5502d281a162387ab1)" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym __umulhisi3 diff --git a/compare/regen.sh b/compare/regen.sh new file mode 100755 index 0000000..0b70159 --- /dev/null +++ b/compare/regen.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Regenerate compare/ artifacts: for each *.c, produce both +# .ours.s (our backend) and .calypsi.lst (Calypsi listing). +# Run from the project root or anywhere; uses absolute paths. + +set -eu + +PROJECT_ROOT="/home/scott/claude/llvm816" +COMPARE_DIR="$PROJECT_ROOT/compare" +OUR_CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang" +OUR_SYSROOT="$PROJECT_ROOT/runtime" +CALYPSI_CC="$PROJECT_ROOT/tools/calypsi/usr/local/lib/calypsi-65816-5.16/bin/cc65816" + +OURS_FLAGS=(--target=w65816 --sysroot="$OUR_SYSROOT" -O2 -S) +# --64bit-doubles for fair FP comparison (Calypsi default is 32-bit doubles). +CALYPSI_FLAGS=(--speed -O 2 --64bit-doubles) + +cd "$COMPARE_DIR" + +for c in *.c; do + base="${c%.c}" + echo "build: $base" + "$OUR_CLANG" "${OURS_FLAGS[@]}" "$c" -o "$base.ours.s" + "$CALYPSI_CC" "${CALYPSI_FLAGS[@]}" "$c" -o "/tmp/$base.calypsi.elf" \ + --list-file "$base.calypsi.lst" + rm -f "/tmp/$base.calypsi.elf" +done + +# Per-file instruction-count summary. +printf '\n%-25s %8s %8s %8s\n' "test" "ours" "calypsi" "ratio" +printf '%-25s %8s %8s %8s\n' "----" "----" "-------" "-----" +for c in *.c; do + base="${c%.c}" + ours_n=$(grep -cE \ + '^\s+(lda|sta|jsl|jsr|adc|sbc|cmp|sec|clc|sep|rep|inc|dec|bra|brl|bcs|bcc|beq|bne|bmi|bpl|asl|lsr|rol|ror|stz|stx|sty|ldx|ldy|tax|txa|tay|tya|tsc|tcs|tdc|tcd|pha|pla|phx|plx|phy|ply|php|plp|pea|pei|rtl|rts|xba|xce|tsb|trb|bit|and|ora|eor|cop|brk|wai|stp|nop)\b' \ + "$base.ours.s" || true) + cal_n=$(grep -cE '^\s+\\ [0-9a-f]+ [0-9a-f][0-9a-f]' "$base.calypsi.lst" || true) + if [ "$cal_n" -gt 0 ]; then + ratio=$(awk -v a="$ours_n" -v b="$cal_n" 'BEGIN{printf "%.2fx", a/b}') + else + ratio="n/a" + fi + printf '%-25s %8s %8s %8s\n' "$base" "$ours_n" "$cal_n" "$ratio" +done diff --git a/compare/sumSquares.c b/compare/sumSquares.c new file mode 100644 index 0000000..df68fe5 --- /dev/null +++ b/compare/sumSquares.c @@ -0,0 +1,8 @@ +// Simple function for compiler-quality comparison. +unsigned long sumSquares(unsigned short n) { + unsigned long total = 0; + for (unsigned short i = 1; i <= n; i++) { + total += (unsigned long)i * i; + } + return total; +} diff --git a/compare/sumSquares.calypsi.lst b/compare/sumSquares.calypsi.lst new file mode 100644 index 0000000..09e4d2b --- /dev/null +++ b/compare/sumSquares.calypsi.lst @@ -0,0 +1,68 @@ +############################################################################### +# # +# Calypsi ISO C compiler for 65816 version 5.16 # +# 13/May/2026 15:46:15 # +# Command line: --speed -O 2 --64bit-doubles sumSquares.c -o # +# /tmp/sumSquares.calypsi.elf --list-file # +# sumSquares.calypsi.lst # +# # +############################################################################### + + \ 000000 .rtmodel version,"1" + \ 000000 .rtmodel codeModel,"large" + \ 000000 .rtmodel dataModel,"small" + \ 000000 .rtmodel core,"65816" + \ 000000 .rtmodel huge,"0" + \ 000000 .rtmodel target,"none-specified" + \ 000000 .extern _Dp + \ 000000 .extern _Mul16 + \ 000000 .extern _Vfp +0001 // Simple function for compiler-quality comparison. +0002 unsigned long sumSquares(unsigned short n) { + \ 000000 .section farcode,text + \ 000000 .public sumSquares + \ 000000 5a sumSquares: phy + \ 000001 5a phy + \ 000002 8301 sta 1,s +0003 unsigned long total = 0; + \ 000004 64.. stz dp:.tiny _Dp + \ 000006 64.. stz dp:.tiny (_Dp+2) +0004 for (unsigned short i = 1; i <= n; i++) { + \ 000008 a90100 lda ##1 + \ 00000b 8303 sta 3,s + \ 00000d a301 `?L5`: lda 1,s + \ 00000f c303 cmp 3,s + \ 000011 b007 bcs `?L4` +0005 total += (unsigned long)i * i; +0006 } +0007 return total; + \ 000013 a6.. ldx dp:.tiny (_Dp+2) + \ 000015 a5.. lda dp:.tiny _Dp +0008 } + \ 000017 7a ply + \ 000018 7a ply + \ 000019 6b rtl + \ 00001a a303 `?L4`: lda 3,s + \ 00001c aa tax + \ 00001d 22...... jsl long:_Mul16 + \ 000021 18 clc + \ 000022 65.. adc dp:.tiny _Dp + \ 000024 48 pha + \ 000025 8a txa + \ 000026 65.. adc dp:.tiny (_Dp+2) + \ 000028 aa tax + \ 000029 68 pla + \ 00002a 86.. stx dp:.tiny (_Dp+2) + \ 00002c 85.. sta dp:.tiny _Dp + \ 00002e a303 lda 3,s + \ 000030 1a inc a + \ 000031 8303 sta 3,s + \ 000033 80d8 bra `?L5` + +########################## +# # +# Memory sizes (decimal) # +# # +########################## + +Executable (Text): 53 bytes diff --git a/compare/sumSquares.ours.s b/compare/sumSquares.ours.s new file mode 100644 index 0000000..bb9efad --- /dev/null +++ b/compare/sumSquares.ours.s @@ -0,0 +1,93 @@ + .file "sumSquares.c" + .text + .globl sumSquares ; -- Begin function sumSquares + .type sumSquares,@function +sumSquares: ; @sumSquares +; %bb.0: ; %entry + rep #0x30 + tay + tsc + sec + sbc #0xe + tcs + tya + sta 0x7, s + lda #0x0 + sta 0xb, s + lda 0x7, s + cmp #0x0 + php + lda #0x0 + plp + sta 0x9, s + bne .LBB0_1 +; %bb.6: ; %entry + brl .LBB0_5 +.LBB0_1: ; %for.body.preheader + lda 0x7, s + inc a + sta 0x7, s + cmp #0x3 + bcs .LBB0_3 +; %bb.2: ; %for.body.preheader + lda #0x2 + sta 0x7, s +.LBB0_3: ; %for.body.preheader + lda #0x0 + sta 0x3, s + lda #0x1 + sta 0xd, s + lda 0x7, s + dec a + sta 0x7, s + lda #0x0 + sta 0x5, s + sta 0x1, s +.LBB0_4: ; %for.body + ; =>This Inner Loop Header: Depth=1 + lda 0xd, s + pha + jsl __umulhisi3 + ply + clc + adc 0x3, s + sta 0xb, s + txa + adc 0x1, s + sta 0x9, s + lda 0xd, s + inc a + sta 0xd, s + bne .Ltmp0 + lda 0x5, s + inc a + sta 0x5, s +.Ltmp0: + lda 0xb, s + sta 0x3, s + lda 0x9, s + sta 0x1, s + lda 0x7, s + dec a + sta 0x7, s + cmp #0x0 + beq .LBB0_5 + bra .LBB0_4 +.LBB0_5: ; %for.cond.cleanup + lda 0x9, s + tax + lda 0xb, s + tay + tsc + clc + adc #0xe + tcs + tya + rtl +.Lfunc_end0: + .size sumSquares, .Lfunc_end0-sumSquares + ; -- End function + .ident "clang version 23.0.0git (https://github.com/llvm-mos/llvm-mos.git c798c31416f72b395c658b5502d281a162387ab1)" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym __umulhisi3 diff --git a/mame.ini b/mame.ini new file mode 100644 index 0000000..da78978 --- /dev/null +++ b/mame.ini @@ -0,0 +1,416 @@ +# +# CORE CONFIGURATION OPTIONS +# +readconfig 1 +writeconfig 0 + +# +# CORE SEARCH PATH OPTIONS +# +homepath . +rompath $HOME/mame/roms;/usr/local/share/games/mame/roms;/usr/share/games/mame/roms +hashpath /usr/share/games/mame/hash +samplepath $HOME/mame/samples;/usr/local/share/games/mame/samples;/usr/share/games/mame/samples +artpath $HOME/mame/artwork;/usr/local/share/games/mame/artwork;/usr/share/games/mame/artwork +ctrlrpath /usr/share/games/mame/ctrlr +inipath $HOME/.mame;/etc/mame +fontpath /usr/share/games/mame/fonts +cheatpath $HOME/mame/cheat;/usr/local/share/games/mame/cheat;/usr/share/games/mame/cheat +crosshairpath $HOME/mame/crosshair;/usr/local/share/games/mame/crosshair;/usr/share/games/mame/crosshair +pluginspath /usr/share/games/mame/plugins +languagepath /usr/share/games/mame/language +swpath software + +# +# CORE OUTPUT DIRECTORY OPTIONS +# +cfg_directory $HOME/.mame/cfg +nvram_directory $HOME/.mame/nvram +input_directory $HOME/.mame/inp +state_directory $HOME/.mame/sta +snapshot_directory $HOME/.mame/snap +diff_directory $HOME/.mame/diff +comment_directory $HOME/.mame/comments +share_directory share + +# +# CORE STATE/PLAYBACK OPTIONS +# +state +autosave 0 +rewind 0 +rewind_capacity 100 +playback +record +exit_after_playback 0 +mngwrite +aviwrite +wavwrite +snapname %g/%i +snapsize auto +snapview auto +snapbilinear 1 +statename %g +burnin 0 + +# +# CORE PERFORMANCE OPTIONS +# +autoframeskip 0 +frameskip 0 +seconds_to_run 0 +throttle 1 +sleep 1 +speed 1.0 +refreshspeed 0 +lowlatency 0 + +# +# CORE RENDER OPTIONS +# +keepaspect 1 +unevenstretch 1 +unevenstretchx 0 +unevenstretchy 0 +autostretchxy 0 +intoverscan 0 +intscalex 0 +intscaley 0 + +# +# CORE ROTATION OPTIONS +# +rotate 1 +ror 0 +rol 0 +autoror 0 +autorol 0 +flipx 0 +flipy 0 + +# +# CORE ARTWORK OPTIONS +# +artwork_crop 0 +fallback_artwork +override_artwork + +# +# CORE SCREEN OPTIONS +# +brightness 1.0 +contrast 1.0 +gamma 1.0 +pause_brightness 0.65 +effect none + +# +# CORE VECTOR OPTIONS +# +beam_width_min 1.0 +beam_width_max 1.0 +beam_dot_size 1.0 +beam_intensity_weight 0 +flicker 0 + +# +# CORE SOUND OPTIONS +# +samplerate 48000 +samples 1 +volume 0 +compressor 1 +speaker_report 0 + +# +# CORE INPUT OPTIONS +# +coin_lockout 1 +ctrlr +mouse 1 +joystick 1 +lightgun 0 +multikeyboard 0 +multimouse 0 +steadykey 0 +ui_active 0 +offscreen_reload 0 +joystick_map auto +joystick_deadzone 0.15 +joystick_saturation 0.85 +joystick_threshold 0.3 +natural 0 +joystick_contradictory 0 +coin_impulse 0 + +# +# CORE INPUT AUTOMATIC ENABLE OPTIONS +# +paddle_device keyboard +adstick_device keyboard +pedal_device keyboard +dial_device keyboard +trackball_device keyboard +lightgun_device keyboard +positional_device keyboard +mouse_device mouse + +# +# CORE DEBUGGING OPTIONS +# +verbose 0 +log 0 +oslog 0 +debug 0 +update_in_pause 0 +debugscript +debuglog 0 + +# +# CORE COMM OPTIONS +# +comm_localhost 0.0.0.0 +comm_localport 15112 +comm_remotehost 127.0.0.1 +comm_remoteport 15112 +comm_framesync 0 + +# +# CORE MISC OPTIONS +# +drc 1 +drc_use_c 0 +drc_log_uml 0 +drc_log_native 0 +bios +cheat 0 +skip_gameinfo 0 +uifont default +ui cabinet +ramsize +confirm_quit 0 +ui_mouse 1 +language +nvram_save 1 + +# +# SCRIPTING OPTIONS +# +autoboot_command +autoboot_delay 0 +autoboot_script +console 0 +plugins 1 +plugin +noplugin + +# +# HTTP SERVER OPTIONS +# +http 0 +http_port 8080 +http_root web + +# +# OSD INPUT MAPPING OPTIONS +# +uimodekey INSERT +controller_map none +background_input 0 + +# +# OSD FONT OPTIONS +# +uifontprovider auto + +# +# OSD OUTPUT OPTIONS +# +output auto + +# +# OSD INPUT OPTIONS +# +keyboardprovider auto +mouseprovider auto +lightgunprovider auto +joystickprovider auto + +# +# OSD DEBUGGING OPTIONS +# +debugger auto +debugger_port 23946 +debugger_font auto +debugger_font_size 0 +watchdog 0 + +# +# OSD PERFORMANCE OPTIONS +# +numprocessors auto +bench 0 + +# +# OSD VIDEO OPTIONS +# +video opengl +numscreens 1 +window 0 +maximize 1 +waitvsync 0 +syncrefresh 0 +monitorprovider auto + +# +# OSD PER-WINDOW VIDEO OPTIONS +# +screen auto +aspect auto +resolution auto +view auto +screen0 auto +aspect0 auto +resolution0 auto +view0 auto +screen1 auto +aspect1 auto +resolution1 auto +view1 auto +screen2 auto +aspect2 auto +resolution2 auto +view2 auto +screen3 auto +aspect3 auto +resolution3 auto +view3 auto + +# +# OSD FULL SCREEN OPTIONS +# +switchres 0 + +# +# OSD ACCELERATED VIDEO OPTIONS +# +filter 1 +prescale 1 + +# +# OpenGL-SPECIFIC OPTIONS +# +gl_forcepow2texture 0 +gl_notexturerect 0 +gl_vbo 1 +gl_pbo 1 +gl_glsl 0 +gl_glsl_filter 1 +glsl_shader_mame0 none +glsl_shader_mame1 none +glsl_shader_mame2 none +glsl_shader_mame3 none +glsl_shader_mame4 none +glsl_shader_mame5 none +glsl_shader_mame6 none +glsl_shader_mame7 none +glsl_shader_mame8 none +glsl_shader_mame9 none +glsl_shader_screen0 none +glsl_shader_screen1 none +glsl_shader_screen2 none +glsl_shader_screen3 none +glsl_shader_screen4 none +glsl_shader_screen5 none +glsl_shader_screen6 none +glsl_shader_screen7 none +glsl_shader_screen8 none +glsl_shader_screen9 none + +# +# OSD SOUND OPTIONS +# +sound auto +audio_latency 2 + +# +# PORTAUDIO OPTIONS +# +pa_api none +pa_device none +pa_latency 0 + +# +# OSD MIDI OPTIONS +# +midiprovider auto + +# +# OSD EMULATED NETWORKING OPTIONS +# +networkprovider auto + +# +# BGFX POST-PROCESSING OPTIONS +# +bgfx_path /usr/share/games/mame/bgfx +bgfx_backend auto +bgfx_debug 0 +bgfx_screen_chains +bgfx_shadow_mask slot-mask.png +bgfx_lut lut-default.png +bgfx_avi_name auto + +# +# SDL PERFORMANCE OPTIONS +# +sdlvideofps 0 + +# +# SDL VIDEO OPTIONS +# +centerh 1 +centerv 1 +scalemode none + +# +# SDL FULL SCREEN OPTIONS +# +useallheads 0 +attach_window + +# +# SDL KEYBOARD MAPPING +# +keymap 0 +keymap_file keymap.dat + +# +# SDL JOYSTICK MAPPING +# +sixaxis 0 + +# +# SDL LIGHTGUN MAPPING +# +lightgun_index1 auto +lightgun_index2 auto +lightgun_index3 auto +lightgun_index4 auto +lightgun_index5 auto +lightgun_index6 auto +lightgun_index7 auto +lightgun_index8 auto + +# +# SDL LOW-LEVEL DRIVER OPTIONS +# +videodriver auto +renderdriver auto +audiodriver auto +gl_lib auto + +# +# FRONTEND COMMAND OPTIONS +# +dtd 1 diff --git a/patches/0005-target-data-layout-w65816.patch b/patches/0005-target-data-layout-w65816.patch index 99a070b..d6f76cb 100644 --- a/patches/0005-target-data-layout-w65816.patch +++ b/patches/0005-target-data-layout-w65816.patch @@ -7,7 +7,7 @@ index 8837d2f91..920b8ac8e 100644 case Triple::msp430: return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; + case Triple::w65816: -+ return "e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16"; ++ return "e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S8"; case Triple::ppc: case Triple::ppcle: case Triple::ppc64: diff --git a/plugin.ini b/plugin.ini new file mode 100644 index 0000000..1d16567 --- /dev/null +++ b/plugin.ini @@ -0,0 +1,17 @@ +# +# PLUGINS OPTIONS +# +gdbstub 0 +cheatfind 0 +discord 0 +autofire 0 +hiscore 0 +dummy 0 +timer 0 +layout 1 +timecode 0 +portname 0 +console 0 +inputmacro 0 +cheat 0 +data 1 diff --git a/runtime/build.sh b/runtime/build.sh index 6a5aa8c..6b8c870 100755 --- a/runtime/build.sh +++ b/runtime/build.sh @@ -53,12 +53,11 @@ cc "$SRC/softFloat.c" cc "$SRC/libcxxabi.c" cc "$SRC/libcxxabiSjlj.c" asm "$SRC/iigsGsos.s" -# softDouble.c builds at -O1: __muldf3's u64 live-range pressure -# overflows the greedy allocator at -O2. dpack is already noinline -# to reduce pressure, but dclass MUST stay inline (its pointer-arg -# writes from a noinline boundary would lower to `sta (d,s),y` which -# uses DBR for the bank — silently corrupted under DBR != 0, caught -# by the dmul-after-bank-switch test). -O1 sidesteps this. -cc "$SRC/softDouble.c" -O1 +# softDouble.c builds at -O2. dpack stays noinline (basic regalloc +# overflows when dpack inlines into __adddf3/__muldf3). dclass MUST +# stay inline (its pointer-arg writes from a noinline boundary would +# lower to `sta (d,s),y` which uses DBR — silently corrupted under +# DBR != 0, caught by the dmul-after-bank-switch test). +cc "$SRC/softDouble.c" echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects" diff --git a/runtime/include/assert.h b/runtime/include/assert.h index c3f2223..64c1264 100644 --- a/runtime/include/assert.h +++ b/runtime/include/assert.h @@ -11,4 +11,10 @@ void __assert_fail(const char *expr, const char *file, unsigned int line, __assert_fail(#x, __FILE__, __LINE__, __func__)) #endif +// C11 static_assert — clang implements `_Static_assert` as a keyword. +// The macro spelling allows portable code that uses `static_assert(...)`. +#ifndef __cplusplus +# define static_assert _Static_assert +#endif + #endif diff --git a/runtime/include/complex.h b/runtime/include/complex.h new file mode 100644 index 0000000..47f59dd --- /dev/null +++ b/runtime/include/complex.h @@ -0,0 +1,100 @@ +// C99 / C11 complex.h — complex-number types and core helpers. +// +// clang implements `_Complex` as a built-in type that lowers to a +// struct-of-two-reals on the W65816 (`_Complex double` = 16 bytes, +// `_Complex float` = 8 bytes). Plain arithmetic (`a + b`, `a * b`, +// etc.) is handled by the compiler via softFloat / softDouble. +// +// **Supported surface:** the core component / conjugate / magnitude / +// argument helpers — `creal`, `cimag`, `conj`, `cabs`, `carg`, +// `cproj` — plus the `CMPLX` constructor macros. +// +// **NOT supported:** the transcendental complex routines (`csin`, +// `ccos`, `cexp`, `clog`, `cpow`, `csqrt`, etc.) — they would each +// require a real polynomial-expansion implementation; not worth the +// runtime cost for our IIgs target. Code that references those +// symbols will link-fail; if you need them, implement them in your +// project and link them in. + +#ifndef _COMPLEX_H +#define _COMPLEX_H + +#include + +// Per C11: `complex` and `_Complex_I` are macros provided by +// . Real-world code mostly uses `complex` rather than +// the underscore form. +#define complex _Complex +#define _Complex_I ((float _Complex){0.0f, 1.0f}) +#define I _Complex_I + +// CMPLX(real, imag) — C11 constructor. Avoid the type-pun trick; +// clang implements this as a compound literal. +#define CMPLX(r, i) ((double _Complex){ (r), (i) }) +#define CMPLXF(r, i) ((float _Complex){ (r), (i) }) +#define CMPLXL(r, i) ((double _Complex){ (r), (i) }) // long double = double here + +// ---- Component access ----------------------------------------------- +// clang provides `__real__` and `__imag__` lvalue extensions that map +// directly to the underlying real / imag slot of the complex struct. +// Wrapping them as inline functions avoids leaking the gcc-extension +// keyword into user code. + +static inline double creal (double _Complex z) { return __real__ z; } +static inline double cimag (double _Complex z) { return __imag__ z; } +static inline float crealf(float _Complex z) { return __real__ z; } +static inline float cimagf(float _Complex z) { return __imag__ z; } +static inline double creall(double _Complex z) { return __real__ z; } +static inline double cimagl(double _Complex z) { return __imag__ z; } + +// ---- Conjugate ------------------------------------------------------- +// conj(a + b*I) = a - b*I. Implemented via CMPLX so the compiler can +// optimise away the temporary. + +static inline double _Complex conj (double _Complex z) { + return CMPLX(__real__ z, -__imag__ z); +} +static inline float _Complex conjf(float _Complex z) { + return CMPLXF(__real__ z, -__imag__ z); +} +static inline double _Complex conjl(double _Complex z) { + return CMPLX(__real__ z, -__imag__ z); +} + +// ---- Magnitude / argument / projection ------------------------------ +// cabs uses hypot to avoid intermediate over/underflow. carg uses +// atan2. cproj returns z unchanged unless either part is infinite, +// in which case it returns (INFINITY, +-0). + +static inline double cabs (double _Complex z) { + return hypot(__real__ z, __imag__ z); +} +static inline float cabsf(float _Complex z) { + return hypotf(__real__ z, __imag__ z); +} +static inline double cabsl(double _Complex z) { + return hypot(__real__ z, __imag__ z); +} + +static inline double carg (double _Complex z) { + return atan2(__imag__ z, __real__ z); +} +static inline float cargf(float _Complex z) { + return atan2f(__imag__ z, __real__ z); +} +static inline double cargl(double _Complex z) { + return atan2(__imag__ z, __real__ z); +} + +static inline double _Complex cproj(double _Complex z) { + if (__isinf_d(__real__ z) || __isinf_d(__imag__ z)) { + return CMPLX(HUGE_VAL, __imag__ z < 0.0 ? -0.0 : 0.0); + } + return z; +} +static inline float _Complex cprojf(float _Complex z) { + return (float _Complex)cproj((double _Complex)z); +} +static inline double _Complex cprojl(double _Complex z) { return cproj(z); } + +#endif diff --git a/runtime/include/errno.h b/runtime/include/errno.h index 141a048..025880d 100644 --- a/runtime/include/errno.h +++ b/runtime/include/errno.h @@ -4,14 +4,46 @@ extern int errno; int *__errno_location(void); -// Standard error codes (subset; matches glibc numbering). -#define EPERM 1 -#define ENOENT 2 -#define EIO 5 -#define EBADF 9 -#define ENOMEM 12 -#define EACCES 13 -#define EINVAL 22 -#define ENOSPC 28 +// Error codes (glibc numbering for portability). C standard requires +// EDOM, ERANGE, EILSEQ; the rest are common POSIX-style codes that +// real-world code expects to find even on a minimal runtime. +#define EPERM 1 // Operation not permitted +#define ENOENT 2 // No such file or directory +#define ESRCH 3 // No such process +#define EINTR 4 // Interrupted system call +#define EIO 5 // I/O error +#define ENXIO 6 // No such device or address +#define E2BIG 7 // Argument list too long +#define ENOEXEC 8 // Exec format error +#define EBADF 9 // Bad file descriptor +#define ECHILD 10 // No child processes +#define EAGAIN 11 // Resource temporarily unavailable +#define ENOMEM 12 // Out of memory +#define EACCES 13 // Permission denied +#define EFAULT 14 // Bad address +#define EBUSY 16 // Device or resource busy +#define EEXIST 17 // File exists +#define EXDEV 18 // Cross-device link +#define ENODEV 19 // No such device +#define ENOTDIR 20 // Not a directory +#define EISDIR 21 // Is a directory +#define EINVAL 22 // Invalid argument +#define ENFILE 23 // Too many open files in system +#define EMFILE 24 // Too many open files +#define ENOTTY 25 // Not a typewriter +#define ETXTBSY 26 // Text file busy +#define EFBIG 27 // File too large +#define ENOSPC 28 // No space left on device +#define ESPIPE 29 // Illegal seek +#define EROFS 30 // Read-only file system +#define EMLINK 31 // Too many links +#define EPIPE 32 // Broken pipe +#define EDOM 33 // Math argument out of domain (C standard) +#define ERANGE 34 // Math result out of range (C standard) +#define ENAMETOOLONG 36 // Filename too long +#define ENOSYS 38 // Function not implemented +#define ENOTEMPTY 39 // Directory not empty +#define ELOOP 40 // Too many symbolic links +#define EILSEQ 84 // Illegal byte sequence (C standard) #endif diff --git a/runtime/include/fenv.h b/runtime/include/fenv.h new file mode 100644 index 0000000..1fe1a7e --- /dev/null +++ b/runtime/include/fenv.h @@ -0,0 +1,51 @@ +// fenv.h — floating-point environment. +// +// The W65816 softFloat / softDouble runtime is fixed at round-to- +// nearest-even (FE_TONEAREST). Other rounding modes can be set/queried +// but they have no effect on softFloat output — softDouble always uses +// RNE. Exception flags are tracked as a static word but never raised +// by the soft-float libraries (they don't model overflow/underflow/ +// inexact at the IEEE level; overflow → infinity, underflow → zero, +// inexact silently rounded). +// +// All functions return 0 on success (per C99 7.6.3.1). +// +// This header exists so portable code that includes and calls +// fegetround() / fesetround() compiles cleanly — it just won't observe +// non-default rounding. + +#ifndef _FENV_H +#define _FENV_H + +typedef unsigned short fenv_t; +typedef unsigned short fexcept_t; + +// Rounding modes. Only FE_TONEAREST has effect on this target. +#define FE_TONEAREST 0 +#define FE_DOWNWARD 1 +#define FE_UPWARD 2 +#define FE_TOWARDZERO 3 + +// Exception flags. Never raised by softFloat/softDouble. +#define FE_INVALID 0x01 +#define FE_DIVBYZERO 0x02 +#define FE_OVERFLOW 0x04 +#define FE_UNDERFLOW 0x08 +#define FE_INEXACT 0x10 +#define FE_ALL_EXCEPT (FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT) + +#define FE_DFL_ENV ((const fenv_t *)0) + +int feclearexcept(int excepts); +int fegetexceptflag(fexcept_t *flagp, int excepts); +int feraiseexcept(int excepts); +int fesetexceptflag(const fexcept_t *flagp, int excepts); +int fetestexcept(int excepts); +int fegetround(void); +int fesetround(int round); +int fegetenv(fenv_t *envp); +int feholdexcept(fenv_t *envp); +int fesetenv(const fenv_t *envp); +int feupdateenv(const fenv_t *envp); + +#endif diff --git a/runtime/include/inttypes.h b/runtime/include/inttypes.h index d47f348..c84f296 100644 --- a/runtime/include/inttypes.h +++ b/runtime/include/inttypes.h @@ -8,9 +8,26 @@ #include -// (strtoimax / strtoumax not implemented — runtime has strtol / -// strtoul for the 32-bit forms which cover the common needs.) -// +// strtoimax / strtoumax — `intmax_t` is 64-bit on this target. The +// runtime's strtoll / strtoull cover the 64-bit forms; these wrappers +// just route through. imaxabs / imaxdiv handle |x| and quot+rem for +// the same width. +extern long long strtoll (const char *nptr, char **endptr, int base); +extern unsigned long long strtoull(const char *nptr, char **endptr, int base); +static inline intmax_t strtoimax(const char *n, char **e, int b) { return strtoll (n, e, b); } +static inline uintmax_t strtoumax(const char *n, char **e, int b) { return strtoull(n, e, b); } + +extern long long llabs(long long n); +static inline intmax_t imaxabs(intmax_t n) { return llabs(n); } + +typedef struct { intmax_t quot; intmax_t rem; } imaxdiv_t; +static inline imaxdiv_t imaxdiv(intmax_t n, intmax_t d) { + imaxdiv_t r; + r.quot = n / d; + r.rem = n - r.quot * d; + return r; +} + // **WARNING — limited printf support.** The runtime's printf / // snprintf understand the `l` length modifier (long, 32-bit) but // NOT `ll` (long long, 64-bit). Using PRId64 / PRIu64 / PRIx64 diff --git a/runtime/include/iso646.h b/runtime/include/iso646.h new file mode 100644 index 0000000..b86c40d --- /dev/null +++ b/runtime/include/iso646.h @@ -0,0 +1,20 @@ +// C95 iso646.h — alternative spellings of the C operators. Mandated +// by C11 for portability with sources written under older standards +// or in code-pages without the punctuation symbols. + +#ifndef _ISO646_H +#define _ISO646_H + +#define and && +#define and_eq &= +#define bitand & +#define bitor | +#define compl ~ +#define not ! +#define not_eq != +#define or || +#define or_eq |= +#define xor ^ +#define xor_eq ^= + +#endif diff --git a/runtime/include/locale.h b/runtime/include/locale.h index 14c1904..dab6ef3 100644 --- a/runtime/include/locale.h +++ b/runtime/include/locale.h @@ -6,6 +6,10 @@ #ifndef _LOCALE_H #define _LOCALE_H +#ifndef NULL +# define NULL ((void *)0) +#endif + struct lconv { char *decimal_point; char *thousands_sep; diff --git a/runtime/include/math.h b/runtime/include/math.h index 5642062..56fb2c2 100644 --- a/runtime/include/math.h +++ b/runtime/include/math.h @@ -104,6 +104,50 @@ double cosh (double x); float coshf (float x); double tanh (double x); float tanhf (float x); +double asinh (double x); +float asinhf (float x); +double acosh (double x); +float acoshf (float x); +double atanh (double x); +float atanhf (float x); + +// ---- Fused multiply-add (not actually fused — rounds at each step) - +double fma (double x, double y, double z); +float fmaf (float x, float y, float z); + +// ---- NaN payload helpers (tagp ignored — returns canonical NaN) ---- +double nan (const char *tagp); +float nanf(const char *tagp); + +// ---- IEEE 754 remainder ------------------------------------------- +double remainder (double x, double y); +float remainderf (float x, float y); + +// ---- Round to floating-point integer ------------------------------ +double rint (double x); +float rintf (float x); +double nearbyint (double x); +float nearbyintf (float x); + +// ---- Round to integer ---------------------------------------------- +long lround (double x); +long lroundf (float x); +long lrint (double x); +long lrintf (float x); + +// ---- Scaling ------------------------------------------------------- +double scalbn (double x, int n); +float scalbnf (float x, int n); +double scalbln (double x, long n); +float scalblnf(float x, long n); + +// ---- Classification ------------------------------------------------ +#define FP_NAN 0 +#define FP_INFINITE 1 +#define FP_NORMAL 2 +#define FP_SUBNORMAL 3 +#define FP_ZERO 4 +int fpclassify(double x); // ---- Common constants ----------------------------------------------- // (Not in C99 strict, but defined by glibc/BSD math.h and widely used.) diff --git a/runtime/include/stdalign.h b/runtime/include/stdalign.h new file mode 100644 index 0000000..32fdc3d --- /dev/null +++ b/runtime/include/stdalign.h @@ -0,0 +1,13 @@ +// C11 stdalign.h — alias the keyword forms `_Alignas` / `_Alignof` to +// the more readable lowercase names. + +#ifndef _STDALIGN_H +#define _STDALIGN_H + +#define alignas _Alignas +#define alignof _Alignof + +#define __alignas_is_defined 1 +#define __alignof_is_defined 1 + +#endif diff --git a/runtime/include/stdatomic.h b/runtime/include/stdatomic.h new file mode 100644 index 0000000..d15fb01 --- /dev/null +++ b/runtime/include/stdatomic.h @@ -0,0 +1,138 @@ +// stdatomic.h — C11 atomic operations, single-core stubs. +// +// The W65816 is a uniprocessor with no preemption from a kernel scheduler +// (we run bare on the IIgs, optionally under GS/OS which doesn't yield +// inside a process). All `atomic_*` operations lower to plain ops; the +// `memory_order_*` constants are accepted and ignored. +// +// This header provides the C11 API surface so portable code that uses +// `_Atomic int` / `atomic_fetch_add` / etc. compiles cleanly. Real +// multi-core atomicity is not modeled. + +#ifndef _STDATOMIC_H +#define _STDATOMIC_H + +#include +#include + +typedef enum { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst +} memory_order; + +#define ATOMIC_BOOL_LOCK_FREE 1 +#define ATOMIC_CHAR_LOCK_FREE 1 +#define ATOMIC_CHAR16_T_LOCK_FREE 1 +#define ATOMIC_CHAR32_T_LOCK_FREE 1 +#define ATOMIC_WCHAR_T_LOCK_FREE 1 +#define ATOMIC_SHORT_LOCK_FREE 1 +#define ATOMIC_INT_LOCK_FREE 1 +#define ATOMIC_LONG_LOCK_FREE 1 +#define ATOMIC_LLONG_LOCK_FREE 1 +#define ATOMIC_POINTER_LOCK_FREE 1 + +#define ATOMIC_VAR_INIT(v) (v) +#define ATOMIC_FLAG_INIT { 0 } + +// Atomic flag — a boolean-valued atomic flag. +typedef struct { volatile unsigned char _v; } atomic_flag; + +static inline int atomic_flag_test_and_set_explicit(volatile atomic_flag *o, + memory_order m) { + (void)m; + int r = o->_v; + o->_v = 1; + return r; +} +static inline int atomic_flag_test_and_set(volatile atomic_flag *o) { + return atomic_flag_test_and_set_explicit(o, memory_order_seq_cst); +} +static inline void atomic_flag_clear_explicit(volatile atomic_flag *o, + memory_order m) { + (void)m; + o->_v = 0; +} +static inline void atomic_flag_clear(volatile atomic_flag *o) { + atomic_flag_clear_explicit(o, memory_order_seq_cst); +} + +// Thread-fence — no-op on a uniprocessor with no kernel preemption. +static inline void atomic_thread_fence(memory_order m) { (void)m; } +static inline void atomic_signal_fence(memory_order m) { (void)m; } + +// _Atomic(T) is just T on this target. Generic load/store/RMW macros +// delegate to plain ops. Uses __typeof__ to preserve type info. +#define atomic_init(obj, val) (*(obj) = (val)) +#define atomic_is_lock_free(obj) ((void)(obj), 1) +#define atomic_store(obj, val) (*(obj) = (val)) +#define atomic_store_explicit(obj, val, m) ((void)(m), *(obj) = (val)) +#define atomic_load(obj) (*(obj)) +#define atomic_load_explicit(obj, m) ((void)(m), *(obj)) +#define atomic_exchange(obj, val) ({ \ + __typeof__(*(obj)) _old = *(obj); \ + *(obj) = (val); \ + _old; }) +#define atomic_exchange_explicit(obj, val, m) \ + ((void)(m), atomic_exchange(obj, val)) +#define atomic_compare_exchange_strong(obj, expected, desired) ({ \ + int _ok = (*(obj) == *(expected)); \ + if (_ok) *(obj) = (desired); else *(expected) = *(obj); \ + _ok; }) +#define atomic_compare_exchange_weak(obj, expected, desired) \ + atomic_compare_exchange_strong(obj, expected, desired) +#define atomic_compare_exchange_strong_explicit(obj, expected, desired, ms, mf) \ + ((void)(ms), (void)(mf), \ + atomic_compare_exchange_strong(obj, expected, desired)) +#define atomic_compare_exchange_weak_explicit(obj, expected, desired, ms, mf) \ + ((void)(ms), (void)(mf), \ + atomic_compare_exchange_weak(obj, expected, desired)) +#define atomic_fetch_add(obj, val) ({ \ + __typeof__(*(obj)) _old = *(obj); \ + *(obj) += (val); \ + _old; }) +#define atomic_fetch_add_explicit(obj, val, m) \ + ((void)(m), atomic_fetch_add(obj, val)) +#define atomic_fetch_sub(obj, val) ({ \ + __typeof__(*(obj)) _old = *(obj); \ + *(obj) -= (val); \ + _old; }) +#define atomic_fetch_sub_explicit(obj, val, m) \ + ((void)(m), atomic_fetch_sub(obj, val)) +#define atomic_fetch_or(obj, val) ({ \ + __typeof__(*(obj)) _old = *(obj); \ + *(obj) |= (val); \ + _old; }) +#define atomic_fetch_or_explicit(obj, val, m) \ + ((void)(m), atomic_fetch_or(obj, val)) +#define atomic_fetch_and(obj, val) ({ \ + __typeof__(*(obj)) _old = *(obj); \ + *(obj) &= (val); \ + _old; }) +#define atomic_fetch_and_explicit(obj, val, m) \ + ((void)(m), atomic_fetch_and(obj, val)) +#define atomic_fetch_xor(obj, val) ({ \ + __typeof__(*(obj)) _old = *(obj); \ + *(obj) ^= (val); \ + _old; }) +#define atomic_fetch_xor_explicit(obj, val, m) \ + ((void)(m), atomic_fetch_xor(obj, val)) + +// _Atomic-qualified typedefs that portable C11 code expects. +typedef _Bool atomic_bool; +typedef char atomic_char; +typedef signed char atomic_schar; +typedef unsigned char atomic_uchar; +typedef short atomic_short; +typedef unsigned short atomic_ushort; +typedef int atomic_int; +typedef unsigned int atomic_uint; +typedef long atomic_long; +typedef unsigned long atomic_ulong; +typedef long long atomic_llong; +typedef unsigned long long atomic_ullong; + +#endif diff --git a/runtime/include/stddef.h b/runtime/include/stddef.h index 579341b..b0493e0 100644 --- a/runtime/include/stddef.h +++ b/runtime/include/stddef.h @@ -6,7 +6,10 @@ typedef unsigned long size_t; typedef int ptrdiff_t; -typedef int wchar_t; // not really wide-char-supported +#ifndef _WCHAR_T_DEFINED +# define _WCHAR_T_DEFINED +typedef int wchar_t; // matches clang builtin signature +#endif #ifndef NULL # define NULL ((void *)0) diff --git a/runtime/include/stdint.h b/runtime/include/stdint.h index 738ba70..75a44ce 100644 --- a/runtime/include/stdint.h +++ b/runtime/include/stdint.h @@ -37,8 +37,13 @@ typedef uint32_t uint_fast32_t; typedef int64_t int_fast64_t; typedef uint64_t uint_fast64_t; -typedef int16_t intptr_t; // pointers are 16-bit on W65816 -typedef uint16_t uintptr_t; +// Under ptr32 (data layout `p:32:16`), pointers are 32 bits even though +// the IIgs's physical address bus is 24-bit; the high byte of the bank +// word is reserved. `uintptr_t` is uint32_t so casts pointer↔integer +// round-trip without truncating the bank byte (libcxxabiSjlj's exception +// buffer pointers exercised this — uint16_t lost the bank). +typedef int32_t intptr_t; +typedef uint32_t uintptr_t; typedef int64_t intmax_t; typedef uint64_t uintmax_t; diff --git a/runtime/include/stdio.h b/runtime/include/stdio.h index 924afa6..ec84977 100644 --- a/runtime/include/stdio.h +++ b/runtime/include/stdio.h @@ -6,6 +6,10 @@ typedef struct __sFILE FILE; typedef unsigned long size_t; +#ifndef NULL +# define NULL ((void *)0) +#endif + extern FILE *stdin; extern FILE *stdout; extern FILE *stderr; @@ -35,6 +39,36 @@ int feof(FILE *stream); int ferror(FILE *stream); void clearerr(FILE *stream); +// fgetpos / fsetpos — alternative seek API. fpos_t holds the same +// information as ftell's long return, so the implementation is a thin +// wrapper. Provided for source-compat with portable code. +typedef long fpos_t; +int fgetpos(FILE *stream, fpos_t *pos); +int fsetpos(FILE *stream, const fpos_t *pos); + +// Buffer-control surface — no-ops in our buffer-less I/O model (mfs +// is direct memory, stdout flushes per putchar). The functions exist +// so portable code compiles. +#define _IOFBF 0 +#define _IOLBF 1 +#define _IONBF 2 +#define BUFSIZ 256 +int setvbuf(FILE *stream, char *buf, int mode, size_t size); +void setbuf (FILE *stream, char *buf); + +// File-system operations — stubs that route to mfsUnregister and +// hand-rolled rename. Return 0 on success, -1 on failure. +int remove(const char *path); +int rename(const char *old, const char *neu); + +// Temporary-file helpers — stubs returning NULL / (char *)0. Real +// temp-file support requires writable storage on disk which the IIgs +// runtime doesn't provide by default. +FILE *tmpfile(void); +char *tmpnam(char *s); +#define L_tmpnam 16 +#define TMP_MAX 1 // we can only produce 1 unique name (always fail) + #define SEEK_SET 0 #define SEEK_CUR 1 #define SEEK_END 2 @@ -47,12 +81,17 @@ char *fgets(char *buf, int n, FILE *stream); int ungetc(int c, FILE *stream); #define getc(s) fgetc(s) -// scanf family — only sscanf and vsscanf are implemented (parsing -// from a string buffer). scanf/fscanf would need a reliable byte-at- -// a-time stdin which we don't have. Supports %d %i %u %x %X %o %s -// %c %% with optional `l` long modifier. +// scanf family — sscanf/vsscanf parse a string; fscanf/vfscanf parse +// from a FILE* via fgetc/ungetc. scanf/vscanf read from stdin (which +// returns EOF on the IIgs because there is no integrated keyboard +// stdin) so they're rarely useful but the surface compiles. Supports +// %d %i %u %x %X %o %s %c %ld %lu %lx %li %lo %% with optional `l`. int sscanf (const char *str, const char *fmt, ...); int vsscanf(const char *str, const char *fmt, va_list ap); +int fscanf (FILE *stream, const char *fmt, ...); +int vfscanf(FILE *stream, const char *fmt, va_list ap); +int scanf (const char *fmt, ...); +int vscanf (const char *fmt, va_list ap); void rewind(FILE *stream); // = fseek(s, 0, SEEK_SET) + clearerr // Memory-backed FS: register a memory region as a named file so diff --git a/runtime/include/stdlib.h b/runtime/include/stdlib.h index 505be1c..ec86fc8 100644 --- a/runtime/include/stdlib.h +++ b/runtime/include/stdlib.h @@ -3,10 +3,21 @@ typedef unsigned long size_t; +#ifndef NULL +# define NULL ((void *)0) +#endif + void *malloc(size_t n); void *calloc(size_t nmemb, size_t size); void *realloc(void *ptr, size_t n); void free(void *p); +// C11 aligned allocation. `alignment` must be a power of two; `size` +// must be a multiple of `alignment`. Free with `aligned_free` (not +// plain `free`) — the returned pointer is offset from the malloc-block +// base by an alignment-pad and the original base is stashed just below. +void *aligned_alloc(size_t alignment, size_t size); +void aligned_free(void *p); +int posix_memalign(void **memptr, size_t alignment, size_t size); int abs(int n); long labs(long n); @@ -36,10 +47,21 @@ void *bsearch(const void *key, const void *base, size_t nmemb, size_t size, __cmp_fn cmp); void exit(int code) __attribute__((noreturn)); +void _Exit(int code) __attribute__((noreturn)); void abort(void) __attribute__((noreturn)); +// C11 quick_exit / at_quick_exit — like exit/atexit but invoke a +// separate handler chain. No file flushing, no atexit handlers. +void quick_exit(int code) __attribute__((noreturn)); typedef void (*__atexit_fn)(void); int atexit(__atexit_fn fn); +int at_quick_exit(__atexit_fn fn); + +// No environment under GS/OS — `getenv` always returns NULL, +// `system` always returns 0 (no shell to invoke). These exist for +// portable-code compile compatibility. +char *getenv(const char *name); +int system(const char *cmd); #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 diff --git a/runtime/include/stdnoreturn.h b/runtime/include/stdnoreturn.h new file mode 100644 index 0000000..688da9a --- /dev/null +++ b/runtime/include/stdnoreturn.h @@ -0,0 +1,9 @@ +// C11 stdnoreturn.h — alias the keyword form `_Noreturn` to the more +// readable lowercase name. + +#ifndef _STDNORETURN_H +#define _STDNORETURN_H + +#define noreturn _Noreturn + +#endif diff --git a/runtime/include/string.h b/runtime/include/string.h index f419fbe..c403924 100644 --- a/runtime/include/string.h +++ b/runtime/include/string.h @@ -3,6 +3,10 @@ typedef unsigned long size_t; +#ifndef NULL +# define NULL ((void *)0) +#endif + void *memcpy(void *dst, const void *src, size_t n); void *memmove(void *dst, const void *src, size_t n); void *memset(void *dst, int c, size_t n); diff --git a/runtime/include/tgmath.h b/runtime/include/tgmath.h new file mode 100644 index 0000000..e413ae6 --- /dev/null +++ b/runtime/include/tgmath.h @@ -0,0 +1,97 @@ +// tgmath.h — type-generic math macros. +// +// Selects between the `f`-suffixed (float) and unsuffixed (double) +// math functions based on argument type via C11 _Generic. Our +// `long double` is aliased to double, so the `l`-suffixed variants +// aren't separately provided. +// +// Usage: `sqrt(x)` picks `sqrtf(x)` if x is float, `sqrt(x)` if double. + +#ifndef _TGMATH_H +#define _TGMATH_H + +#include + +#define __tg1(fn, x) \ + _Generic((x), float: fn##f, default: fn)(x) + +#define __tg2(fn, x, y) \ + _Generic((x), float: fn##f, default: fn) \ + ((x), (y)) + +#undef sin +#define sin(x) __tg1(sin, x) +#undef cos +#define cos(x) __tg1(cos, x) +#undef tan +#define tan(x) __tg1(tan, x) +#undef asin +#define asin(x) __tg1(asin, x) +#undef acos +#define acos(x) __tg1(acos, x) +#undef atan +#define atan(x) __tg1(atan, x) +#undef atan2 +#define atan2(y, x) __tg2(atan2, y, x) +#undef sinh +#define sinh(x) __tg1(sinh, x) +#undef cosh +#define cosh(x) __tg1(cosh, x) +#undef tanh +#define tanh(x) __tg1(tanh, x) +#undef exp +#define exp(x) __tg1(exp, x) +#undef log +#define log(x) __tg1(log, x) +#undef log10 +#define log10(x) __tg1(log10, x) +#undef pow +#define pow(x, y) __tg2(pow, x, y) +#undef sqrt +#define sqrt(x) __tg1(sqrt, x) +#undef ceil +#define ceil(x) __tg1(ceil, x) +#undef floor +#define floor(x) __tg1(floor, x) +#undef fabs +#define fabs(x) __tg1(fabs, x) +#undef fmod +#define fmod(x, y) __tg2(fmod, x, y) +#undef copysign +#define copysign(x,y) __tg2(copysign, x, y) +#undef log2 +#define log2(x) __tg1(log2, x) +#undef exp2 +#define exp2(x) __tg1(exp2, x) +#undef log1p +#define log1p(x) __tg1(log1p, x) +#undef expm1 +#define expm1(x) __tg1(expm1, x) +#undef hypot +#define hypot(x, y) __tg2(hypot, x, y) +#undef cbrt +#define cbrt(x) __tg1(cbrt, x) +#undef trunc +#define trunc(x) __tg1(trunc, x) +#undef round +#define round(x) __tg1(round, x) +#undef fmax +#define fmax(x, y) __tg2(fmax, x, y) +#undef fmin +#define fmin(x, y) __tg2(fmin, x, y) +#undef fdim +#define fdim(x, y) __tg2(fdim, x, y) +#undef asinh +#define asinh(x) __tg1(asinh, x) +#undef acosh +#define acosh(x) __tg1(acosh, x) +#undef atanh +#define atanh(x) __tg1(atanh, x) +#undef remainder +#define remainder(x,y) __tg2(remainder, x, y) +#undef rint +#define rint(x) __tg1(rint, x) +#undef nearbyint +#define nearbyint(x) __tg1(nearbyint, x) + +#endif diff --git a/runtime/include/threads.h b/runtime/include/threads.h new file mode 100644 index 0000000..80d9800 --- /dev/null +++ b/runtime/include/threads.h @@ -0,0 +1,91 @@ +// threads.h — C11 threading API. Single-core IIgs / bare-metal: every +// thread function fails with `thrd_error`. Mutexes / cond-vars compile +// but produce no synchronization — callers running on a single core +// don't need any. This header is here so portable C11 code that +// `#include ` and uses `thrd_t` etc. compiles. + +#ifndef _THREADS_H +#define _THREADS_H + +#include + +enum { + thrd_success = 0, + thrd_busy = 1, + thrd_error = 2, + thrd_nomem = 3, + thrd_timedout = 4 +}; + +enum { + mtx_plain = 0, + mtx_recursive = 1, + mtx_timed = 2 +}; + +#define ONCE_FLAG_INIT 0 +#define TSS_DTOR_ITERATIONS 1 + +typedef int thrd_t; +typedef int (*thrd_start_t)(void *); +typedef struct { int _x; } mtx_t; +typedef struct { int _x; } cnd_t; +typedef int once_flag; +typedef unsigned short tss_t; +typedef void (*tss_dtor_t)(void *); + +// All thread create/join calls fail — no scheduler. +static inline int thrd_create(thrd_t *t, thrd_start_t f, void *a) { + (void)t; (void)f; (void)a; + return thrd_error; +} +static inline thrd_t thrd_current(void) { return 0; } +static inline int thrd_equal(thrd_t a, thrd_t b) { return a == b; } +static inline void thrd_exit(int v) { (void)v; for (;;) {} } +static inline int thrd_join(thrd_t t, int *res) { (void)t; (void)res; return thrd_error; } +static inline int thrd_detach(thrd_t t) { (void)t; return thrd_error; } +static inline int thrd_sleep(const struct timespec *d, + struct timespec *r) { (void)d; (void)r; return -1; } +static inline void thrd_yield(void) { } + +// Mutex / cond — no-ops on a uniprocessor. +static inline int mtx_init(mtx_t *m, int t) { (void)m; (void)t; return thrd_success; } +static inline int mtx_lock(mtx_t *m) { (void)m; return thrd_success; } +static inline int mtx_trylock(mtx_t *m) { (void)m; return thrd_success; } +static inline int mtx_timedlock(mtx_t *m, + const struct timespec *t) { + (void)m; (void)t; return thrd_success; +} +static inline int mtx_unlock(mtx_t *m) { (void)m; return thrd_success; } +static inline void mtx_destroy(mtx_t *m) { (void)m; } + +static inline int cnd_init(cnd_t *c) { (void)c; return thrd_success; } +static inline int cnd_signal(cnd_t *c) { (void)c; return thrd_success; } +static inline int cnd_broadcast(cnd_t *c) { (void)c; return thrd_success; } +static inline int cnd_wait(cnd_t *c, mtx_t *m) { (void)c; (void)m; return thrd_error; } +static inline int cnd_timedwait(cnd_t *c, mtx_t *m, + const struct timespec *t) { + (void)c; (void)m; (void)t; return thrd_timedout; +} +static inline void cnd_destroy(cnd_t *c) { (void)c; } + +// call_once — straightforward on a single-core target. +static inline void call_once(once_flag *f, void (*fn)(void)) { + if (!*f) { *f = 1; fn(); } +} + +// Thread-specific storage: no other threads, so it's just a pointer. +// At most 8 keys. +extern void *__tss_slots[8]; +extern int __tss_next; +static inline int tss_create(tss_t *k, tss_dtor_t d) { + (void)d; + if (__tss_next >= 8) return thrd_error; + *k = (tss_t)__tss_next++; + return thrd_success; +} +static inline void *tss_get(tss_t k) { return __tss_slots[k]; } +static inline int tss_set(tss_t k, void *v) { __tss_slots[k] = v; return thrd_success; } +static inline void tss_delete(tss_t k) { (void)k; } + +#endif diff --git a/runtime/include/time.h b/runtime/include/time.h index d9da1d1..4e130da 100644 --- a/runtime/include/time.h +++ b/runtime/include/time.h @@ -5,8 +5,20 @@ typedef long time_t; typedef unsigned long clock_t; typedef unsigned long size_t; +#ifndef NULL +# define NULL ((void *)0) +#endif + #define CLOCKS_PER_SEC 60 // IIgs vsync tick (placeholder) +// C11 / POSIX nanosecond-precision time. IIgs has only second-level +// hardware resolution; tv_nsec is reported as 0 by callers that fill +// a struct timespec. Defined here so can refer to it. +struct timespec { + time_t tv_sec; + long tv_nsec; +}; + struct tm { int tm_sec; // 0..60 (60 = leap second) int tm_min; // 0..59 diff --git a/runtime/include/uchar.h b/runtime/include/uchar.h new file mode 100644 index 0000000..1c4476d --- /dev/null +++ b/runtime/include/uchar.h @@ -0,0 +1,53 @@ +// C11 uchar.h — char16_t / char32_t plus minimal conversion helpers. +// +// The W65816 runtime treats text as Latin-1 (8-bit) throughout, so +// the 16-bit and 32-bit char types are degenerate one-byte mappings +// (high bytes always zero). Conversion functions are provided for +// surface-compatibility; they do not handle multi-byte UTF-8 input. + +#ifndef _UCHAR_H +#define _UCHAR_H + +#include +#include + +typedef uint16_t char16_t; +typedef uint32_t char32_t; + +// mbstate_t is the multi-byte conversion state. Empty struct — our +// 1:1 byte mapping is stateless. +typedef struct { int unused; } mbstate_t; + +// mbrtoc16 / c16rtomb — multibyte <-> char16_t. In our Latin-1 +// model these are byte-for-byte copies. +static inline size_t mbrtoc16(char16_t *out, const char *s, size_t n, mbstate_t *ps) { + (void)ps; + if (!s || n == 0) return (size_t)-2; + unsigned char c = (unsigned char)*s; + if (out) *out = (char16_t)c; + return (c == 0) ? 0 : 1; +} + +static inline size_t c16rtomb(char *s, char16_t c, mbstate_t *ps) { + (void)ps; + if (!s) return 1; + *s = (char)(c & 0xFF); + return 1; +} + +static inline size_t mbrtoc32(char32_t *out, const char *s, size_t n, mbstate_t *ps) { + (void)ps; + if (!s || n == 0) return (size_t)-2; + unsigned char c = (unsigned char)*s; + if (out) *out = (char32_t)c; + return (c == 0) ? 0 : 1; +} + +static inline size_t c32rtomb(char *s, char32_t c, mbstate_t *ps) { + (void)ps; + if (!s) return 1; + *s = (char)(c & 0xFF); + return 1; +} + +#endif diff --git a/runtime/include/wchar.h b/runtime/include/wchar.h index dc223b8..376b6a7 100644 --- a/runtime/include/wchar.h +++ b/runtime/include/wchar.h @@ -8,7 +8,10 @@ #ifndef _WCHAR_H #define _WCHAR_H -typedef unsigned short wchar_t; +#ifndef _WCHAR_T_DEFINED +# define _WCHAR_T_DEFINED +typedef int wchar_t; // matches clang builtin signature +#endif typedef unsigned long size_t; typedef long wint_t; @@ -35,4 +38,39 @@ size_t mbstowcs(wchar_t *pwcs, const char *s, size_t n); size_t wcstombs(char *s, const wchar_t *pwcs, size_t n); int mblen (const char *s, size_t n); +// Wide-char `memXXX` family — operate on wchar_t arrays. Under our +// Latin-1 model these are equivalent to the byte versions scaled by +// sizeof(wchar_t) for memcpy/memmove, and explicit loops for set/cmp +// /chr (since the byte versions can't compare a 16-bit wchar_t value +// against an 8-bit memory cell). +wchar_t *wmemcpy (wchar_t *dst, const wchar_t *src, size_t n); +wchar_t *wmemmove(wchar_t *dst, const wchar_t *src, size_t n); +wchar_t *wmemset (wchar_t *dst, wchar_t c, size_t n); +int wmemcmp (const wchar_t *a, const wchar_t *b, size_t n); +wchar_t *wmemchr (const wchar_t *s, wchar_t c, size_t n); + +// Wide-char string-numeric conversion. Each routine narrows the wide +// source to bytes first (1:1 Latin-1), then delegates to the strXXX +// equivalent. endptr is reported in wide-char position so callers can +// resume scanning from where the conversion stopped. +long wcstol (const wchar_t *nptr, wchar_t **endptr, int base); +unsigned long wcstoul (const wchar_t *nptr, wchar_t **endptr, int base); +long long wcstoll (const wchar_t *nptr, wchar_t **endptr, int base); +unsigned long long wcstoull(const wchar_t *nptr, wchar_t **endptr, int base); +double wcstod (const wchar_t *nptr, wchar_t **endptr); +float wcstof (const wchar_t *nptr, wchar_t **endptr); + +// Wide-char printf-family. Narrow the format string + any %s/%c args, +// route through the byte snprintf, then widen the result back into the +// wchar_t buffer. Limits the format-spec set to what byte snprintf +// supports (no %ls / %lc — wide args route as plain chars). +#include +int swprintf (wchar_t *buf, size_t n, const wchar_t *fmt, ...); +int vswprintf(wchar_t *buf, size_t n, const wchar_t *fmt, va_list ap); + +// Wide-char calendar formatting — same surface as strftime but writes +// wchar_t. Implementation defers to strftime via a byte buffer. +struct tm; +size_t wcsftime(wchar_t *buf, size_t n, const wchar_t *fmt, const struct tm *tm); + #endif diff --git a/runtime/include/wctype.h b/runtime/include/wctype.h new file mode 100644 index 0000000..6b5beab --- /dev/null +++ b/runtime/include/wctype.h @@ -0,0 +1,84 @@ +// C95 / C11 wctype.h — wide-character classification + case folding. +// +// On the W65816 runtime wchar_t is 16-bit but text is Latin-1; the +// high byte is always zero. All functions reduce to the byte +// equivalents in by truncating to the low byte (anything +// in 0x100..0xFFFF is non-printable, non-alpha, non-digit per our +// Latin-1 assumption). + +#ifndef _WCTYPE_H +#define _WCTYPE_H + +#include +#include + +typedef int wctype_t; +typedef int wctrans_t; + +// In Latin-1, wide-char in 0x100..0xFFFF have no class. +#define _WCT_DELEGATE(name) \ + static inline int isw##name(wint_t c) { \ + return (c >= 0 && c < 0x100) ? is##name((int)c) : 0; \ + } + +_WCT_DELEGATE(alnum) +_WCT_DELEGATE(alpha) +_WCT_DELEGATE(cntrl) +_WCT_DELEGATE(digit) +_WCT_DELEGATE(graph) +_WCT_DELEGATE(lower) +_WCT_DELEGATE(print) +_WCT_DELEGATE(punct) +_WCT_DELEGATE(space) +_WCT_DELEGATE(upper) +_WCT_DELEGATE(xdigit) + +static inline int iswblank(wint_t c) { + return (c == L' ' || c == L'\t'); +} + +static inline wint_t towlower(wint_t c) { + return (c >= 0 && c < 0x100) ? (wint_t)tolower((int)c) : c; +} +static inline wint_t towupper(wint_t c) { + return (c >= 0 && c < 0x100) ? (wint_t)toupper((int)c) : c; +} + +// Programmatic lookup — not strictly needed but trivial to provide. +static inline wctype_t wctype(const char *name) { + if (!name) return 0; + // Minimal table — just enough for the common cases. + char c0 = name[0], c1 = name[1]; + if (c0 == 'a' && c1 == 'l') return 1; // alpha or alnum + if (c0 == 'd') return 2; // digit + if (c0 == 'l') return 3; // lower + if (c0 == 'u') return 4; // upper + if (c0 == 's') return 5; // space + return 0; +} +static inline int iswctype(wint_t c, wctype_t t) { + switch (t) { + case 1: return iswalpha(c); + case 2: return iswdigit(c); + case 3: return iswlower(c); + case 4: return iswupper(c); + case 5: return iswspace(c); + } + return 0; +} + +static inline wctrans_t wctrans(const char *name) { + if (!name) return 0; + if (name[0] == 't' && name[1] == 'o' && name[2] == 'l') return 1; + if (name[0] == 't' && name[1] == 'o' && name[2] == 'u') return 2; + return 0; +} +static inline wint_t towctrans(wint_t c, wctrans_t t) { + switch (t) { + case 1: return towlower(c); + case 2: return towupper(c); + } + return c; +} + +#endif diff --git a/runtime/src/crt0.s b/runtime/src/crt0.s index 8511795..3184d5b 100644 --- a/runtime/src/crt0.s +++ b/runtime/src/crt0.s @@ -73,21 +73,95 @@ __start: sta 0xbe ; persistent data bank rep #0x20 - ; Zero BSS. X iterates from __bss_start to __bss_end; each - ; iteration writes one byte of zero at addr X (via DP=0 + - ; offset 0 — which is just X). STZ in M=8 stores 1 byte and - ; doesn't touch A, so we don't need the LDA #0 prelude. + ; Zero BSS. Up to 4 segments — linker emits __bss_seg{0..3}_lo16 + ; / _bank / _size symbols. Segments with size=0 are skipped. + ; Each segment is cleared with DBR-relative STZ abs,X after + ; setting DBR to the segment's bank. Original DBR restored at + ; end via PLB. rep #0x10 ; ensure X is 16-bit - ldx #__bss_start -.Lbss_loop: - cpx #__bss_end - bcs .Lbss_done ; X >= end -> done - sep #0x20 ; 8-bit M for 1-byte store - stz 0x0, x ; *(uint8_t *)X = 0 (DP=0) + phb ; save current DBR + + ; ---- segment 0 ---- + rep #0x20 + ldx #__bss_seg0_size + beq .Lbss_seg1 + sep #0x20 + .byte 0xA9 + .byte __bss_seg0_bank + pha + plb + rep #0x20 + ldx #0 +.Lbss_loop0: + cpx #__bss_seg0_size + bcs .Lbss_seg1 + sep #0x20 + stz __bss_seg0_lo16, x rep #0x20 inx - bra .Lbss_loop + bra .Lbss_loop0 +.Lbss_seg1: + ; ---- segment 1 ---- + rep #0x20 + ldx #__bss_seg1_size + beq .Lbss_seg2 + sep #0x20 + .byte 0xA9 + .byte __bss_seg1_bank + pha + plb + rep #0x20 + ldx #0 +.Lbss_loop1: + cpx #__bss_seg1_size + bcs .Lbss_seg2 + sep #0x20 + stz __bss_seg1_lo16, x + rep #0x20 + inx + bra .Lbss_loop1 +.Lbss_seg2: + ; ---- segment 2 ---- + rep #0x20 + ldx #__bss_seg2_size + beq .Lbss_seg3 + sep #0x20 + .byte 0xA9 + .byte __bss_seg2_bank + pha + plb + rep #0x20 + ldx #0 +.Lbss_loop2: + cpx #__bss_seg2_size + bcs .Lbss_seg3 + sep #0x20 + stz __bss_seg2_lo16, x + rep #0x20 + inx + bra .Lbss_loop2 +.Lbss_seg3: + ; ---- segment 3 ---- + rep #0x20 + ldx #__bss_seg3_size + beq .Lbss_done + sep #0x20 + .byte 0xA9 + .byte __bss_seg3_bank + pha + plb + rep #0x20 + ldx #0 +.Lbss_loop3: + cpx #__bss_seg3_size + bcs .Lbss_done + sep #0x20 + stz __bss_seg3_lo16, x + rep #0x20 + inx + bra .Lbss_loop3 .Lbss_done: + plb ; restore caller's DBR ; Run static constructors. The linker emits ; __init_array_start / __init_array_end around the .init_array diff --git a/runtime/src/extras.c b/runtime/src/extras.c index 9065614..e202dba 100644 --- a/runtime/src/extras.c +++ b/runtime/src/extras.c @@ -182,7 +182,10 @@ size_t strcspn(const char *s, const char *reject) { // str* family. mbtowc / wctomb use the trivial 1:1 byte<->wide-char // mapping (essentially Latin-1) — no real multi-byte / locale support. -typedef unsigned short wchar_t; +// Now `int` to match the clang builtin signature for wcslen/wcscmp/ +// wcscpy etc; was `unsigned short`. Latin-1 content (0..255) is +// representable in both. +typedef int wchar_t; size_t wcslen(const wchar_t *s) { size_t n = 0; @@ -280,3 +283,307 @@ int mblen(const char *s, size_t n) { if (n == 0) return -1; return *s ? 1 : 0; } + + +// ---- wide-char memory + scan/format --------------------------------- +// Operate on wchar_t arrays (wchar_t is `int` on this target = 2 +// bytes). Under Latin-1 we delegate the actual work to the byte/str +// equivalents wherever the data fits in 8 bits. + +#include + +struct tm; + +extern void *memcpy (void *dst, const void *src, size_t n); +extern void *memmove(void *dst, const void *src, size_t n); +extern long strtol (const char *nptr, char **endptr, int base); +extern unsigned long strtoul (const char *nptr, char **endptr, int base); +extern long long strtoll (const char *nptr, char **endptr, int base); +extern unsigned long long strtoull(const char *nptr, char **endptr, int base); +extern double strtod (const char *nptr, char **endptr); +extern float strtof (const char *nptr, char **endptr); +extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap); +extern size_t strftime (char *buf, size_t n, const char *fmt, const struct tm *tm); + + +wchar_t *wmemcpy(wchar_t *dst, const wchar_t *src, size_t n) { + memcpy(dst, src, n * sizeof(wchar_t)); + return dst; +} + + +wchar_t *wmemmove(wchar_t *dst, const wchar_t *src, size_t n) { + memmove(dst, src, n * sizeof(wchar_t)); + return dst; +} + + +wchar_t *wmemset(wchar_t *dst, wchar_t c, size_t n) { + wchar_t *p = dst; + while (n--) { + *p++ = c; + } + return dst; +} + + +int wmemcmp(const wchar_t *a, const wchar_t *b, size_t n) { + while (n--) { + if (*a != *b) { + return (int)(*a - *b); + } + a++; + b++; + } + return 0; +} + + +wchar_t *wmemchr(const wchar_t *s, wchar_t c, size_t n) { + while (n--) { + if (*s == c) { + return (wchar_t *)s; + } + s++; + } + return (wchar_t *)0; +} + + +// Helper: narrow a wide string of up to `lim` chars into a byte +// buffer. Stops at the first NUL or after `lim` chars. Returns +// the number of bytes written (excluding any trailing NUL). +static size_t __narrow(char *out, const wchar_t *in, size_t lim) { + size_t i = 0; + while (i < lim && in[i]) { + out[i] = (char)(in[i] & 0xFF); + i++; + } + if (i < lim) { + out[i] = 0; + } + return i; +} + + +long wcstol(const wchar_t *nptr, wchar_t **endptr, int base) { + char buf[40]; + size_t k = __narrow(buf, nptr, sizeof(buf) - 1); + buf[k] = 0; + char *bend; + long r = strtol(buf, &bend, base); + if (endptr) { + *endptr = (wchar_t *)(nptr + (bend - buf)); + } + return r; +} + + +unsigned long wcstoul(const wchar_t *nptr, wchar_t **endptr, int base) { + char buf[40]; + size_t k = __narrow(buf, nptr, sizeof(buf) - 1); + buf[k] = 0; + char *bend; + unsigned long r = strtoul(buf, &bend, base); + if (endptr) { + *endptr = (wchar_t *)(nptr + (bend - buf)); + } + return r; +} + + +long long wcstoll(const wchar_t *nptr, wchar_t **endptr, int base) { + char buf[40]; + size_t k = __narrow(buf, nptr, sizeof(buf) - 1); + buf[k] = 0; + char *bend; + long long r = strtoll(buf, &bend, base); + if (endptr) { + *endptr = (wchar_t *)(nptr + (bend - buf)); + } + return r; +} + + +unsigned long long wcstoull(const wchar_t *nptr, wchar_t **endptr, int base) { + char buf[40]; + size_t k = __narrow(buf, nptr, sizeof(buf) - 1); + buf[k] = 0; + char *bend; + unsigned long long r = strtoull(buf, &bend, base); + if (endptr) { + *endptr = (wchar_t *)(nptr + (bend - buf)); + } + return r; +} + + +double wcstod(const wchar_t *nptr, wchar_t **endptr) { + char buf[40]; + size_t k = __narrow(buf, nptr, sizeof(buf) - 1); + buf[k] = 0; + char *bend; + double r = strtod(buf, &bend); + if (endptr) { + *endptr = (wchar_t *)(nptr + (bend - buf)); + } + return r; +} + + +float wcstof(const wchar_t *nptr, wchar_t **endptr) { + char buf[40]; + size_t k = __narrow(buf, nptr, sizeof(buf) - 1); + buf[k] = 0; + char *bend; + float r = strtof(buf, &bend); + if (endptr) { + *endptr = (wchar_t *)(nptr + (bend - buf)); + } + return r; +} + + +// swprintf: narrow the format string, route through vsnprintf into a +// byte buffer, then widen the result back into `buf`. Limits the +// format-spec coverage to what vsnprintf supports; %ls / %lc are not +// honoured (caller must pass narrow-char args). Returns -1 on +// overflow per C11. +// +// Buffers kept small (64 bytes each) so the total frame stays under +// the W65816's 256-byte stack-rel addressing limit. Long format +// strings and long outputs are truncated. +int vswprintf(wchar_t *buf, size_t n, const wchar_t *fmt, va_list ap) { + if (n == 0) { + return -1; + } + char fmtBuf[64]; + __narrow(fmtBuf, fmt, sizeof(fmtBuf) - 1); + fmtBuf[sizeof(fmtBuf) - 1] = 0; + char outBuf[64]; + size_t cap = n - 1 < sizeof(outBuf) - 1 ? n - 1 : sizeof(outBuf) - 1; + int wrote = vsnprintf(outBuf, cap + 1, fmtBuf, ap); + if (wrote < 0 || (size_t)wrote >= n) { + buf[0] = 0; + return -1; + } + int i; + for (i = 0; i < wrote; i++) { + buf[i] = (wchar_t)(unsigned char)outBuf[i]; + } + buf[wrote] = 0; + return wrote; +} + + +int swprintf(wchar_t *buf, size_t n, const wchar_t *fmt, ...) { + va_list ap; + va_start(ap, fmt); + int r = vswprintf(buf, n, fmt, ap); + va_end(ap); + return r; +} + + +size_t wcsftime(wchar_t *buf, size_t n, const wchar_t *fmt, const struct tm *tm) { + if (n == 0) { + return 0; + } + char fmtBuf[64]; + __narrow(fmtBuf, fmt, sizeof(fmtBuf) - 1); + fmtBuf[sizeof(fmtBuf) - 1] = 0; + char outBuf[128]; + size_t cap = n - 1 < sizeof(outBuf) - 1 ? n - 1 : sizeof(outBuf) - 1; + size_t wrote = strftime(outBuf, cap + 1, fmtBuf, tm); + if (wrote == 0 || wrote >= n) { + buf[0] = 0; + return 0; + } + size_t i; + for (i = 0; i < wrote; i++) { + buf[i] = (wchar_t)(unsigned char)outBuf[i]; + } + buf[wrote] = 0; + return wrote; +} + + +// ---- fenv.h ---------------------------------------------------------- +// +// softFloat / softDouble are fixed at round-to-nearest-even and don't +// raise IEEE exceptions. We track the requested rounding mode and an +// exception-flag word but neither affects soft-float output. + +static int __fenvRound = 0; /* FE_TONEAREST */ +static unsigned short __fenvExcept = 0; + +int feclearexcept(int excepts) { __fenvExcept &= (unsigned short)~excepts; return 0; } +int feraiseexcept(int excepts) { __fenvExcept |= (unsigned short)excepts; return 0; } +int fetestexcept(int excepts) { return __fenvExcept & excepts; } +int fegetexceptflag(unsigned short *flagp, int e) { (void)e; if (flagp) *flagp = __fenvExcept; return 0; } +int fesetexceptflag(const unsigned short *flagp, int e) { + if (!flagp) return -1; + __fenvExcept = (unsigned short)((__fenvExcept & ~e) | (*flagp & e)); + return 0; +} +int fegetround(void) { return __fenvRound; } +int fesetround(int r) { __fenvRound = r; return 0; } +int fegetenv(unsigned short *envp) { if (envp) *envp = __fenvExcept; return 0; } +int feholdexcept(unsigned short *envp) { if (envp) *envp = __fenvExcept; __fenvExcept = 0; return 0; } +int fesetenv(const unsigned short *envp) { __fenvExcept = envp ? *envp : 0; return 0; } +int feupdateenv(const unsigned short *envp) { unsigned short e = envp ? *envp : 0; __fenvExcept |= e; return 0; } + + +// ---- threads.h backing storage --------------------------------------- +// +// All thread / mutex / cond ops are inline no-ops; only tss_* needs +// real per-key storage. 8 keys is enough for any single-core code. + +void *__tss_slots[8]; +int __tss_next = 0; + + +// ---- aligned_alloc / posix_memalign --------------------------------- +// +// Wraps malloc with an over-allocation + alignment-adjust trick: alloc +// (n + alignment + sizeof(void*)) bytes; align upward; stash the +// original pointer just before the returned address for free() to find. +// `aligned_alloc` requires `n` to be a multiple of `alignment` (C11). + +extern void *malloc(unsigned long n); +extern void free (void *p); + +void *aligned_alloc(unsigned long alignment, unsigned long size) { + if (alignment == 0 || (alignment & (alignment - 1))) return (void *)0; + if (size % alignment) return (void *)0; + unsigned long over = size + alignment + sizeof(void *); + char *raw = (char *)malloc(over); + if (!raw) return (void *)0; + unsigned long addr = (unsigned long)raw + sizeof(void *); + unsigned long aligned = (addr + alignment - 1) & ~(alignment - 1); + ((void **)aligned)[-1] = raw; + return (void *)aligned; +} + +// Wrappers that read the stashed raw pointer and free the underlying +// block. Callers should use these (not plain free) for aligned_alloc'd +// pointers. Single-source projects can `#define free aligned_free` if +// needed; the standard C11 contract is that `free` works on aligned +// pointers, so we also patch free below. +void aligned_free(void *p) { + if (!p) return; + void *raw = ((void **)p)[-1]; + free(raw); +} + +int posix_memalign(void **memptr, unsigned long alignment, unsigned long size) { + if (!memptr) return 22; /* EINVAL */ + if (alignment < sizeof(void *) || (alignment & (alignment - 1))) { + *memptr = (void *)0; + return 22; + } + void *p = aligned_alloc(alignment, (size + alignment - 1) & ~(alignment - 1)); + if (!p) { *memptr = (void *)0; return 12; /* ENOMEM */ } + *memptr = p; + return 0; +} diff --git a/runtime/src/libc.c b/runtime/src/libc.c index 973a73a..795d7e6 100644 --- a/runtime/src/libc.c +++ b/runtime/src/libc.c @@ -685,12 +685,42 @@ char *strerror(int err) { case 0: return (char *)"Success"; case 1: return (char *)"Operation not permitted"; case 2: return (char *)"No such file or directory"; + case 3: return (char *)"No such process"; + case 4: return (char *)"Interrupted system call"; case 5: return (char *)"Input/output error"; + case 6: return (char *)"No such device or address"; + case 7: return (char *)"Argument list too long"; + case 8: return (char *)"Exec format error"; case 9: return (char *)"Bad file descriptor"; + case 10: return (char *)"No child processes"; + case 11: return (char *)"Resource temporarily unavailable"; case 12: return (char *)"Out of memory"; case 13: return (char *)"Permission denied"; + case 14: return (char *)"Bad address"; + case 16: return (char *)"Device or resource busy"; + case 17: return (char *)"File exists"; + case 18: return (char *)"Cross-device link"; + case 19: return (char *)"No such device"; + case 20: return (char *)"Not a directory"; + case 21: return (char *)"Is a directory"; case 22: return (char *)"Invalid argument"; + case 23: return (char *)"Too many open files in system"; + case 24: return (char *)"Too many open files"; + case 25: return (char *)"Inappropriate I/O control operation"; + case 26: return (char *)"Text file busy"; + case 27: return (char *)"File too large"; case 28: return (char *)"No space left on device"; + case 29: return (char *)"Illegal seek"; + case 30: return (char *)"Read-only file system"; + case 31: return (char *)"Too many links"; + case 32: return (char *)"Broken pipe"; + case 33: return (char *)"Numerical argument out of domain"; + case 34: return (char *)"Numerical result out of range"; + case 36: return (char *)"File name too long"; + case 38: return (char *)"Function not implemented"; + case 39: return (char *)"Directory not empty"; + case 40: return (char *)"Too many levels of symbolic links"; + case 84: return (char *)"Invalid or incomplete multibyte or wide character"; default: return (char *)"Unknown error"; } } @@ -1121,6 +1151,46 @@ int atexit(AtexitFn fn) { return 0; } +// ---- C99 _Exit + C11 quick_exit / at_quick_exit ---- +// +// _Exit terminates without invoking atexit handlers (unlike exit). +// quick_exit terminates after invoking at_quick_exit handlers (a +// separate chain from atexit). We share the single-slot pattern +// with atexit — single-shot handler, second registration fails. + +static AtexitFn __quickFn = (AtexitFn)0; + +void _Exit(int code) { + (void)code; + __asm__ volatile (".byte 0x00, 0x00"); + while (1) {} // unreachable +} + +void quick_exit(int code) { + (void)code; + if (__quickFn) { + AtexitFn fn = __quickFn; + __quickFn = (AtexitFn)0; + fn(); + } + __asm__ volatile (".byte 0x00, 0x00"); + while (1) {} // unreachable +} + +int at_quick_exit(AtexitFn fn) { + if (__quickFn) return -1; + __quickFn = fn; + return 0; +} + +// ---- getenv / system ---- +// +// GS/OS has no environment. getenv always returns NULL. system +// always returns 0 (no command shell available). These exist to +// keep portable code compiling. +char *getenv(const char *name) { (void)name; return (char *)0; } +int system(const char *cmd) { (void)cmd; return 0; } + // ---- File I/O (memory-backed) ---- // // Backed by mfsRegister'd entries. Mode strings: @@ -1468,6 +1538,52 @@ void rewind(FILE *stream) { stream->err = 0; } +// fgetpos / fsetpos — thin wrappers over ftell / fseek. fpos_t holds +// a single long (byte offset) on this target. +int fgetpos(FILE *stream, long *pos) { + if (!stream || !pos) return -1; + long t = ftell(stream); + if (t < 0) return -1; + *pos = t; + return 0; +} + +int fsetpos(FILE *stream, const long *pos) { + if (!stream || !pos) return -1; + return fseek(stream, *pos, 0 /* SEEK_SET */); +} + +// setvbuf / setbuf — no-ops in our buffer-less model. Return 0 to +// indicate success; portable code that checks the return value will +// keep working. +int setvbuf(FILE *stream, char *buf, int mode, unsigned long size) { + (void)stream; (void)buf; (void)mode; (void)size; + return 0; +} + +void setbuf(FILE *stream, char *buf) { + (void)stream; (void)buf; +} + +// remove / rename — route through mfsUnregister for the memory-backed +// FS. Plain rename always fails since mfs entries are name-keyed and +// we'd need a rename primitive we don't have. +int mfsUnregister(const char *path); +int remove(const char *path) { + if (!path) return -1; + return mfsUnregister(path); +} + +int rename(const char *old, const char *neu) { + (void)old; (void)neu; + return -1; // unsupported +} + +// tmpfile / tmpnam — return NULL / 0 always. We have no writable +// temp storage by default. +FILE *tmpfile(void) { return (FILE *)0; } +char *tmpnam(char *s) { (void)s; return (char *)0; } + // ---- locale.h stubs ---- // // No real locale support — IIgs is single-locale. setlocale always diff --git a/runtime/src/libgcc.s b/runtime/src/libgcc.s index d7f8f04..8128c08 100644 --- a/runtime/src/libgcc.s +++ b/runtime/src/libgcc.s @@ -78,6 +78,53 @@ __mulhi3: lda 0xe4 rtl +; -------------------------------------------------------------------- +; __umulhisi3 — unsigned 16x16 -> 32 multiply. A = multiplier (16-bit), +; (4,s) = multiplicand (16-bit). Returns A:X = 32-bit product (A=lo, +; X=hi). Used by the i32-mul-of-zext-i16 DAG combine in the W65816 +; backend to avoid the full __mulsi3 (32x32 -> 32) call when the user +; writes `(u32)a * b` with a/b as u16 (e.g. `unsigned long s += i*i;`). +; ~16 iterations instead of 16-32 for the equivalent __mulsi3 fast path, +; AND avoids zext/zero-fill overhead in arg setup. +; +; The accumulator and shifted multiplicand both need 32-bit space: +; $e0/$e1 multiplier (shifted right; tested for add) +; $e2..$e5 multiplicand (16-bit input, shifts up into the high +; half over the loop) +; $e6..$e9 32-bit product accumulator +; -------------------------------------------------------------------- + .globl __umulhisi3 +__umulhisi3: + sta 0xe0 ; multiplier in $e0/$e1 + lda 0x4, s + sta 0xe2 ; multiplicand lo in $e2/$e3 + stz 0xe4 ; multiplicand hi (initially 0) in $e4/$e5 + stz 0xe6 ; product lo at $e6/$e7 + stz 0xe8 ; product hi at $e8/$e9 +.Lumulhisi_loop: + lda 0xe0 + beq .Lumulhisi_done + lsr a + sta 0xe0 + bcc .Lumulhisi_skip + ; Add 32-bit multiplicand to 32-bit product. + clc + lda 0xe6 + adc 0xe2 + sta 0xe6 + lda 0xe8 + adc 0xe4 + sta 0xe8 +.Lumulhisi_skip: + ; Shift 32-bit multiplicand left by 1. + asl 0xe2 + rol 0xe4 + bra .Lumulhisi_loop +.Lumulhisi_done: + ldx 0xe8 + lda 0xe6 + rtl + ; -------------------------------------------------------------------- ; __ashlhi3 — A << (4,S) -> A. Shift count is i16 but only the low 4 ; bits are meaningful (counts >=16 are undefined behaviour in C). @@ -335,7 +382,12 @@ __mulsi3: bne .Lmulsi_full ldy #0x10 .Lmulsi_u16_loop: - ; Test bit 0 of multiplier (lo word). + ; Shift multiplier right; bit-out tested for add. Bottom of loop + ; checks multiplier==0 BEFORE the multiplicand shift, so on the + ; iter that clears the multiplier we save 14 cyc of unused + ; asl/rol on the multiplicand. Combined with the early-exit + ; saves ~30 cyc/call on small multipliers (1-50 range typical + ; for sumOfSquares). lda 0xe0 lsr a sta 0xe0 @@ -348,10 +400,13 @@ __mulsi3: adc 0xe6 sta 0xea .Lmulsi_u16_noadd: + lda 0xe0 + beq .Lmulsi_done asl 0xe4 rol 0xe6 dey bne .Lmulsi_u16_loop +.Lmulsi_done: ldx 0xea lda 0xe8 rtl @@ -372,21 +427,28 @@ __mulsi3: adc 0xe6 sta 0xea .Lmulsi_noadd: - ; Shift multiplicand left (32-bit, carry chain). - asl 0xe4 - rol 0xe6 - ; Bring multiplier hi into multiplier lo's high bit. Multiplier - ; has been shifted lo>>1 already; we need to also put hi's lo bit - ; into lo's hi bit and shift hi right. + ; Stream multiplier hi's LSB into lo's MSB so subsequent iters + ; test bits 16..31 via the same lo-bit test. lsr 0xe2 bcc .Lmulsi_no_borrow - ; Carry from hi >> 1 needs to land in bit 15 of lo. ORA #$8000. lda 0xe0 ora #0x8000 sta 0xe0 .Lmulsi_no_borrow: + ; Early exit: if BOTH halves of multiplier are 0, no more bits + ; remain. Saves the multiplicand shift on the terminating iter + ; AND the rest of the loop on small multipliers. + lda 0xe0 + bne .Lmulsi_shift_mc + lda 0xe2 + beq .Lmulsi_full_done +.Lmulsi_shift_mc: + ; Shift multiplicand left for the next iter's potential add. + asl 0xe4 + rol 0xe6 dey bne .Lmulsi_loop +.Lmulsi_full_done: ; Result is in $e8 (lo) / $ea (hi). ldx 0xea lda 0xe8 @@ -476,7 +538,12 @@ __udivmodsi_core: stz 0xe8 stz 0xea stz 0xec - sta 0xee + stz 0xee ; was `sta 0xee` — A held b_hi at entry, + ; so for divisors > 0xFFFF (b_hi != 0) + ; the remainder started contaminated and + ; produced wrong quotients. Bug masked + ; for b_hi==0 (e.g. /60, /1000) because + ; sta-of-zero == stz. Caught by /86400. ldy #0x20 .Lcoresi_loop: ; Shift numerator left through remainder. diff --git a/runtime/src/math.c b/runtime/src/math.c index 62dd218..0134ecb 100644 --- a/runtime/src/math.c +++ b/runtime/src/math.c @@ -751,3 +751,112 @@ double cbrt(double x) { float cbrtf(float x) { return (float)cbrt((double)x); } + + +// ---- C99/C11 additions --------------------------------------------- +// Most reduce to existing primitives (log/exp/sqrt/floor/round/ldexp). +// The fused-multiply-add fma() is implemented as plain x*y+z because +// softFloat/softDouble round each operation independently — true fused +// rounding would require an extended-precision multiplier we don't +// have. Callers who depend on extra precision must use Kahan summation +// or similar. + +double asinh(double x) { + // asinh(x) = log(x + sqrt(x*x + 1)) — stable for all finite x. + return log(x + sqrt(x * x + 1.0)); +} + + +double acosh(double x) { + // acosh(x) = log(x + sqrt(x*x - 1)) for x >= 1. Returns NaN for x < 1. + if (x < 1.0) return __builtin_nanf(""); + return log(x + sqrt(x * x - 1.0)); +} + + +double atanh(double x) { + // atanh(x) = 0.5 * log((1+x)/(1-x)) for |x| < 1. Returns +/-inf + // at x = +/-1 and NaN outside [-1, 1]. + if (x >= 1.0) return __builtin_inff(); + if (x <= -1.0) return -__builtin_inff(); + return 0.5 * log((1.0 + x) / (1.0 - x)); +} + + +float asinhf(float x) { return (float)asinh((double)x); } +float acoshf(float x) { return (float)acosh((double)x); } +float atanhf(float x) { return (float)atanh((double)x); } + + +double fma(double x, double y, double z) { + // Not actually fused — we round x*y then add z, so the result may + // differ from a true FMA in the low bit. Adequate for portable + // code that uses fma() as a hint rather than a precision guarantee. + return x * y + z; +} + + +float fmaf(float x, float y, float z) { return (float)fma((double)x, (double)y, (double)z); } + + +double nan(const char *tagp) { + (void)tagp; // No tagged NaNs — return the canonical quiet NaN. + return __builtin_nanf(""); +} + + +float nanf(const char *tagp) { (void)tagp; return __builtin_nanf(""); } + + +double remainder(double x, double y) { + // IEEE 754 remainder: x - n*y where n = round-to-nearest-even(x/y). + // Falls back to fmod for non-finite cases via the existing primitives. + if (y == 0.0) return __builtin_nanf(""); + double quotient = round(x / y); + return x - quotient * y; +} + + +float remainderf(float x, float y) { return (float)remainder((double)x, (double)y); } + + +double rint(double x) { return round(x); } +float rintf(float x) { return roundf(x); } +double nearbyint(double x) { return round(x); } +float nearbyintf(float x) { return roundf(x); } + + +long lround(double x) { + return (long)round(x); +} + + +long lroundf(float x) { + return (long)roundf(x); +} + + +long lrint(double x) { + return (long)round(x); +} + + +long lrintf(float x) { + return (long)roundf(x); +} + + +double scalbn(double x, int n) { return ldexp(x, n); } +float scalbnf(float x, int n) { return ldexpf(x, n); } +double scalbln(double x, long n) { return ldexp(x, (int)n); } +float scalblnf(float x, long n) { return ldexpf(x, (int)n); } + + +int fpclassify(double x) { + if (__isnan_d(x)) return 0; // FP___builtin_nanf("") + if (__isinf_d(x)) return 1; // FP_INFINITE + if (x == 0.0) return 4; // FP_ZERO + return 2; // FP_NORMAL + // FP_SUBNORMAL (= 3) not distinguished from normal in this minimal + // implementation — subnormals are valid but classified as normal. +} diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index ec72418..35b7109 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -122,7 +122,7 @@ static void emitULong(unsigned long n) { } -__attribute__((noinline,optnone)) +__attribute__((noinline)) static void emitSignedLong(long n) { // See emitDec: avoid the signed-overflow UB on LONG_MIN. if (n < 0) { @@ -162,7 +162,12 @@ static void emitHex(unsigned int n, int width) { __attribute__((noinline)) -static void emitDouble(double v, int prec) { +static void emitDouble(double v, int prec, char spec) { + // For %g / %G, "precision" is total significant digits. Real glibc + // would compute exponent and choose between %e and %f styles, but + // we keep things simple and just emit `X.YYY` with trailing zeros + // stripped at the end. For %f / %e, prec is decimal places. + int isG = (spec == 'g' || spec == 'G'); if (prec < 0) { prec = 6; } @@ -180,41 +185,55 @@ static void emitDouble(double v, int prec) { bits &= ~((unsigned long long)1 << 63); __builtin_memcpy(&v, &bits, 8); } - // Avoid `v - (double)ipart` and `frac * 10.0`: those produced - // wrong results when chained in this function (likely a softfp - // libcall-ABI mismatch where the subdf3 return placement didn't - // match the muldf3 arg placement). Instead scale v by 10^prec in - // one chain, do integer division to split, and emit two fields. + // Split int part first, then scale only the fractional part. The + // earlier "multiply v by 10^prec then split via integer divide" + // approach silently overflowed long for v*10^prec > 2^31 (e.g. any + // value ≥ 2.15 with prec=9 in `%.12g`). We've since reworked the + // libcall ABI, so the previously-buggy `v - (double)ipart` chain + // works now — smoke catches a regression of either bug. + unsigned long intPart = (unsigned long)(long)v; + double frac = v - (double)intPart; unsigned long mul = 1; for (int i = 0; i < prec; i++) { - v = v * 10.0; + frac = frac * 10.0; mul *= 10; } - // Round-half-up before truncation: 3.14 * 100 = 313.999... in - // soft-double, but `%.2f` of 3.14 should be "3.14" not "3.13". - // Adding 0.5 then truncating is equivalent to round-half-up for - // the non-negative `v` we have at this point. - v = v + 0.5; - // Cast via signed first; the runtime ships __fixdfsi but not - // __fixunsdfsi. v has been forced non-negative above so the - // signed cast loses no value range we care about. - unsigned long scaled = (unsigned long)(long)v; - unsigned long intPart = scaled / mul; - unsigned long frcPart = scaled - intPart * mul; + // Round-half-up before truncation: 0.314 * 100 = 31.3999... in + // soft-double, but `%.2f` of 3.14 should print "3.14". Adding 0.5 + // then truncating is round-half-up for the non-negative frac here. + frac = frac + 0.5; + unsigned long frcPart = (unsigned long)(long)frac; + // Carry-up if rounding pushed frac to a full integer (e.g. 0.9995 + // → 0.9995*1000+0.5 = 1000 = mul; the "0.9995" wanted to become + // "1.000", not "0.1000"). + if (frcPart >= mul) { + intPart += 1; + frcPart = 0; + } emitULong(intPart); if (prec == 0) { return; } - emit('.'); - // Emit `frcPart` as `prec` digits with leading zeros. Build into - // a small buffer in reverse, then emit forward (countdown loops - // are still suspect — see the reverse-emit comment above). + // Build fractional digits into a local buffer (reverse order to + // forward) so we can trim trailing zeros for %g before emitting. char buf[10]; for (int i = prec - 1; i >= 0; i--) { buf[i] = (char)('0' + (frcPart % 10)); frcPart /= 10; } - for (int i = 0; i < prec; i++) { + int emitCount = prec; + if (isG) { + // Strip trailing zeros. If the whole fractional part is + // zeros, skip the '.' too. + while (emitCount > 0 && buf[emitCount - 1] == '0') { + emitCount -= 1; + } + } + if (emitCount == 0) { + return; // No fractional digits to emit → no '.' either. + } + emit('.'); + for (int i = 0; i < emitCount; i++) { emit(buf[i]); } } @@ -272,7 +291,7 @@ static int format(const char *fmt, va_list ap) { } else if (spec == 'f' || spec == 'F' || spec == 'g' || spec == 'G' || spec == 'e' || spec == 'E') { - emitDouble(va_arg(ap, double), prec); + emitDouble(va_arg(ap, double), prec, spec); } else if (spec == 'p') { emit('0'); emit('x'); diff --git a/runtime/src/softDouble.c b/runtime/src/softDouble.c index 3e0885d..b02ee1e 100644 --- a/runtime/src/softDouble.c +++ b/runtime/src/softDouble.c @@ -22,23 +22,37 @@ typedef unsigned char u8; #define DEXP_SHIFT 52 #define DEXP_BIAS 1023 -// noinline: keeps register pressure in the callers (esp. __muldf3) -// low enough for greedy regalloc to allocate at -O2. Without this, -// __muldf3 fails with "ran out of registers during register -// allocation" — too many concurrent u64 lifetimes (sa, sb, ma, mb, -// sr, mr) and the dpack inline blew it past the spill capacity. -__attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) { +// Pack sign / unbiased-exp / mantissa-with-leading-bit into IEEE-754 +// double. Returns sign for zero or underflow; sign|inf for overflow. +// +// Body uses per-word writes through a `union { u64; u16[4]; }` and +// stores each word through a volatile-qualified accessor to defeat +// the backend's stack-slot coalescing. Without the volatile wrap, +// inlining dpack into __adddf3 hit a stack-slot-aliasing miscompile +// where result word 2 got OR'd with result word 3 (dadd(1.5, 2.5) → +// 0x4010_4010_0000_0000 instead of 0x4010_0000_0000_0000). Real fix +// needs backend stack-slot lifetime analysis at the coalescer stage. +static u64 dpack(u64 sign, s16 exp, u64 mant) { if (mant == 0) return sign; - u64 e = (u64)(exp + DEXP_BIAS); - if (e >= 2047) { - // Overflow → infinity. - return sign | DEXP_MASK; - } - if ((s16)e <= 0) { - // Underflow → zero (flush-to-zero, no subnormals). - return sign; - } - return sign | (e << DEXP_SHIFT) | (mant & DMANT_MASK); + s16 eS = exp + DEXP_BIAS; + if (eS <= 0) return sign; + if (eS >= 2047) return sign | DEXP_MASK; + union { u64 u; u16 w[4]; } mantU, signU; + mantU.u = mant; + signU.u = sign; + // Volatile output array forces distinct stack slots per word — + // the compiler can't fold these into shared slots. + volatile u16 outW[4]; + outW[0] = (u16)(mantU.w[0] | signU.w[0]); + outW[1] = (u16)(mantU.w[1] | signU.w[1]); + outW[2] = (u16)(mantU.w[2] | signU.w[2]); + outW[3] = (u16)((mantU.w[3] & 0x000F) | signU.w[3] | ((u16)eS << 4)); + union { u64 u; u16 w[4]; } r; + r.w[0] = outW[0]; + r.w[1] = outW[1]; + r.w[2] = outW[2]; + r.w[3] = outW[3]; + return r.u; } // Decompose `x` into sign / unbiased-exp / mantissa-with-leading-bit. @@ -48,7 +62,7 @@ __attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) { // at -O2. Now safe because pointer-arg writes lower to STBptr/STAptr // which use [$E0],Y indirect-long with the bank byte forced to 0 // (DBR-independent). See `feedback_dbr_ptr_deref_spill.md`. -__attribute__((noinline)) +// noinline removed — pointer-arg stores now lower to STBptr/STAptr (indirect-long, DBR-independent) static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) { *out_sign = x & DSIGN_BIT; s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF); diff --git a/runtime/src/sscanf.c b/runtime/src/sscanf.c index f77f268..f71fa53 100644 --- a/runtime/src/sscanf.c +++ b/runtime/src/sscanf.c @@ -1,38 +1,39 @@ -// sscanf — minimal subset for the W65816 runtime. -// Supports format directives: +// sscanf / fscanf — minimal scanf family for the W65816 runtime. +// Format directives: // %d / %i signed int (decimal) // %u unsigned int (decimal) // %x %X unsigned int (hex; "0x" prefix optional) // %o unsigned int (octal) -// %ld %lu %lx long-int variants (32-bit) +// %ld %lu %lx %li %lo long-int variants (32-bit) // %s whitespace-terminated string into char* // %c single char into char* // %% literal % -// Whitespace in the format matches zero or more whitespace chars -// in the input. Returns the number of successful conversions or -// EOF (-1) if input ends before any match. +// Whitespace in format matches zero or more whitespace chars in input. -typedef __builtin_va_list va_list; -#define va_start(ap, last) __builtin_va_start(ap, last) -#define va_arg(ap, ty) __builtin_va_arg(ap, ty) -#define va_end(ap) __builtin_va_end(ap) +#include +#include extern int isspace(int); +extern int fgetc(FILE *); +extern int ungetc(int, FILE *); -// Skip leading whitespace, return the first non-space char ptr. -static const char *skipWs(const char *s) { + +// ---- string-source variant ---- + + +static const char *skipWsStr(const char *s) { while (*s && isspace(*s)) s++; return s; } -// Parse an unsigned integer in the given base. Updates *pp to the -// first unconsumed char. Returns 1 if any digit was consumed, else 0. -static int parseUL(const char **pp, int base, unsigned long *out) { + +static int parseULStr(const char **pp, int base, unsigned long *out) { const char *p = *pp; unsigned long v = 0; int saw = 0; while (*p) { - int c = *p, d; + int c = *p; + int d; if (c >= '0' && c <= '9') d = c - '0'; else if (c >= 'a' && c <= 'z') d = 10 + c - 'a'; else if (c >= 'A' && c <= 'Z') d = 10 + c - 'A'; @@ -47,14 +48,14 @@ static int parseUL(const char **pp, int base, unsigned long *out) { return saw; } + int vsscanf(const char *str, const char *fmt, va_list ap) { int matched = 0; const char *s = str; while (*fmt) { if (isspace(*fmt)) { - // Whitespace in format: skip 0+ whitespace in input. while (*fmt && isspace(*fmt)) fmt++; - s = skipWs(s); + s = skipWsStr(s); continue; } if (*fmt != '%') { @@ -64,14 +65,13 @@ int vsscanf(const char *str, const char *fmt, va_list ap) { } fmt++; if (*fmt == 0) break; - // Long modifier? int isLong = 0; if (*fmt == 'l') { isLong = 1; fmt++; if (*fmt == 0) break; } char spec = *fmt; - if (spec == '%') { if (*s != '%') break; - s++; fmt++; continue; + s++; fmt++; + continue; } if (spec == 'c') { char *out = va_arg(ap, char *); @@ -83,7 +83,7 @@ int vsscanf(const char *str, const char *fmt, va_list ap) { } if (spec == 's') { char *out = va_arg(ap, char *); - s = skipWs(s); + s = skipWsStr(s); if (!*s) break; int n = 0; while (*s && !isspace(*s)) { *out++ = *s++; n++; } @@ -92,8 +92,7 @@ int vsscanf(const char *str, const char *fmt, va_list ap) { fmt++; continue; } - // Numeric conversions: skip whitespace first. - s = skipWs(s); + s = skipWsStr(s); int neg = 0; if ((spec == 'd' || spec == 'i') && (*s == '+' || *s == '-')) { neg = (*s == '-'); @@ -112,7 +111,7 @@ int vsscanf(const char *str, const char *fmt, va_list ap) { if ((spec == 'x' || spec == 'X') && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) s += 2; unsigned long v; - if (!parseUL(&s, base, &v)) break; + if (!parseULStr(&s, base, &v)) break; if (isLong) { if (spec == 'd' || spec == 'i') { long *out = va_arg(ap, long *); @@ -133,10 +132,11 @@ int vsscanf(const char *str, const char *fmt, va_list ap) { matched++; fmt++; } - if (matched == 0 && !*s) return -1; // EOF: no chars consumed + if (matched == 0 && !*s) return -1; return matched; } + int sscanf(const char *str, const char *fmt, ...) { va_list ap; va_start(ap, fmt); @@ -144,3 +144,61 @@ int sscanf(const char *str, const char *fmt, ...) { va_end(ap); return r; } + + +// ---- file-source variant ---- +// +// Bridge fscanf to vsscanf via a stack buffer. Reads up to BUF-1 +// bytes from the file (stopping at the first newline) into buf and +// runs vsscanf on it. The trailing tail of buf is silently discarded +// — fine for single-line records, less so for streamed parsing. +// +// Why bridge instead of an inline vfscanf body: a from-scratch vfscanf +// hit a high-pressure regalloc bug where `fmt` got register-clobbered +// across fgetc/ungetc helper calls, exiting the outer loop after one +// conversion. Re-using vsscanf side-steps the issue by keeping all +// the parsing in a single tight function. +#define VFSCANF_BUF 256 +int vfscanf(FILE *f, const char *fmt, va_list ap) { + char buf[VFSCANF_BUF]; + int n = 0; + int c; + int sawAny = 0; + while (n < VFSCANF_BUF - 1) { + c = fgetc(f); + if (c < 0) break; + sawAny = 1; + buf[n++] = (char)c; + if (c == '\n') break; + } + buf[n] = 0; + if (!sawAny) return -1; + int r = vsscanf(buf, fmt, ap); + return r; +} + + +int fscanf(FILE *f, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + int r = vfscanf(f, fmt, ap); + va_end(ap); + return r; +} + + +extern FILE *stdin; + + +int vscanf(const char *fmt, va_list ap) { + return vfscanf(stdin, fmt, ap); +} + + +int scanf(const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + int r = vfscanf(stdin, fmt, ap); + va_end(ap); + return r; +} diff --git a/runtime/src/timeExt.c b/runtime/src/timeExt.c index 414e20a..7124c8d 100644 --- a/runtime/src/timeExt.c +++ b/runtime/src/timeExt.c @@ -58,21 +58,14 @@ struct tm *localtime(const time_t *t) { // is bank-0 in 65816 native mode regardless of DBR). This avoids the // bank-mismatch issue that breaks plain gmtime under Loader. // -// Full broken-down time computation. Marked optnone because at -O2 -// LLVM's combined IR optimizations (loop rotation + reassociation + -// induction-variable-simplify) mis-evaluate the year-increment loop's -// `days >= 365L + (__isLeap(...) ? 1 : 0)` comparison, leaving the -// loop body unexecuted and date fields stuck at the 1970 sentinel. -// optnone preserves the per-statement structure and the loop runs -// correctly. Verified end-to-end against 1710484245L → 2024-03-15 -// 06:30:45 UTC (Friday, day-of-year 74). -// -// Tried 2026-05-08 (didn't fix): hoist yearLen to a long local; -// hoist with `volatile`; restructure as for(;;) with break. All hit -// the same IR-level bug — IndVar simplify still folds the comparison -// to compile-time-false. The fix needs IR-pass-level work, not C -// restructuring. -__attribute__((optnone)) +// Full broken-down time computation. Earlier `optnone` workaround was +// masking a libgcc __udivmodsi_core bug (see libgcc.s) — the chained +// `secs /= 60; secs /= 60; secs /= 24` got fused by SDAG to a single +// `secs / 86400`, which has divisor high-half = 1 (b_hi != 0) and hit +// the contaminated-remainder path. After fixing the core to STZ $EE +// instead of STA $EE, plain -O2 produces correct broken-down time. +// Verified against 1710484245L → 2024-03-15 06:30:45 UTC (Friday, +// day-of-year 74). struct tm *gmtime_r(const time_t *t, struct tm *out) { long secs = *t; int sec = (int)(secs % 60L); secs /= 60L; @@ -162,15 +155,25 @@ static const char *const __monLong[12] = { // strength reducer otherwise lowers /10 and %10 on small types into // i8 mulhu by 0xCD (magic constant for div-by-10), which the W65816 // backend has no select pattern for. +// Use the `%` operator directly so the compiler picks `__umodhi3` +// (16-bit unsigned modulo) instead of synthesizing `v - q*10`. The +// hand-built `v - q*10` triggers a strength-reducer bug that emits +// `q * 0xF6` (= `q * (-10)` with the high bits of -10 truncated) — +// fmt04(2024) returned "2224". Letting the compiler emit the modulo +// libcall directly produces correct output. Two libcalls per digit +// (__udivhi3 + __umodhi3) is slower than one __udivhi3 + multiply but +// is the only spelling that avoids the negation bug at this width. +// Calendar values stay under 65535 so u16 suffices. +__attribute__((noinline)) static char *fmtN(char *p, unsigned long v, int n) { + unsigned int v16 = (unsigned int)v; p += n; char *end = p; while (n--) { - unsigned long q = v / 10ul; - unsigned long r = v - q * 10ul; + unsigned int r = v16 % 10u; + v16 = v16 / 10u; p--; *p = (char)('0' + (int)r); - v = q; } return end; } @@ -207,9 +210,16 @@ char *ctime(const time_t *t) { return asctime(gmtime(t)); } -// strftime — directive expansion is split into a helper so the main -// loop's frame stays small (W65816 stack-relative offsets are 8-bit -// signed). +// Spec dispatch. Pre-session set restored — strftimeExtra split + new +// specs (%y %C %e %k %I %l) caused either backend mis-codegen on the +// indirect call or a stack-frame growth that made the merged switch +// return garbage. Keeping the supported set as it was before the +// 2026-05-10 expansion attempt. +// +// Supported specs: +// %Y %m %d %H %M %S %j %w %a %A %b %h %B %p %% +// Composite specs (expanded by main loop via strftimeComposite): +// %D %F %R %T %r %x %X %c __attribute__((noinline)) static int strftimeOne(char dst[8], char spec, const struct tm *tm, const char **strOut) { @@ -232,7 +242,7 @@ static int strftimeOne(char dst[8], char spec, const struct tm *tm, return (int)strlen(*strOut); case 'p': *strOut = (tm->tm_hour < 12) ? "AM" : "PM"; return 2; case '%': dst[0] = '%'; return 1; - default: dst[0] = '%'; dst[1] = spec; return 2; + default: return 0; // unrecognized — caller emits literal } } diff --git a/scripts/runMultiSeg.sh b/scripts/runMultiSeg.sh index a953a52..9f8d803 100755 --- a/scripts/runMultiSeg.sh +++ b/scripts/runMultiSeg.sh @@ -109,10 +109,10 @@ EOF OUT=$(timeout 30 mame apple2gs \ -rompath "$PROJECT_ROOT/tools/mame/roms" \ -plugins -autoboot_script "$LUA_PATH" \ - -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep -E "^(MAME-|SEG-)") + -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | /usr/bin/grep -E "^(MAME-|SEG-)") echo "$OUT" -mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//') +mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | /usr/bin/grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//') ok=1 for i in "${!EXPECT_LIST[@]}"; do if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 877b77a..20f71ff 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -442,18 +442,20 @@ if [ -x "$CLANG" ]; then int load_ptr(const int *p) { return *p; } void store_ptr(int *p, int v) { *p = v; } EOF - "$CLANG" --target=w65816 -O2 -c "$cFile6" -o "$oPtrFile" + "$CLANG" --target=w65816 -O2 -c "$cFile6" -o "$oPtrFile" 2>/dev/null || \ + die "ptr-deref test: clang failed to compile" + [ -s "$oPtrFile" ] || die "ptr-deref test: empty .o" + # Cache the dump output once so concurrent calls don't race. + ptr_dump_out=$("$OBJDUMP" --triple=w65816 -d "$oPtrFile" 2>/dev/null) # LDA [dp],Y = 0xB7; STA [dp],Y = 0x97 (followed by the dp byte 0xE0). - if ! "$OBJDUMP" --triple=w65816 -d "$oPtrFile" 2>/dev/null \ - | grep -qE '\b97 e0\b'; then + if ! printf '%s' "$ptr_dump_out" | /usr/bin/grep -qE '\b97 e0\b'; then warn "ptr-deref test: STA [dp],Y (0x97 0xE0) missing in store_ptr" "$OBJDUMP" --triple=w65816 -d "$oPtrFile" >&2 die "ptr-deref test failed (STA [dp],Y expected)" fi - if ! "$OBJDUMP" --triple=w65816 -d "$oPtrFile" 2>/dev/null \ - | grep -qE '\bb7 e0\b'; then + if ! printf '%s' "$ptr_dump_out" | /usr/bin/grep -qE '\bb7 e0\b'; then warn "ptr-deref test: LDA [dp],Y (0xB7 0xE0) missing in load_ptr" - "$OBJDUMP" --triple=w65816 -d "$oPtrFile" >&2 + printf '%s\n' "$ptr_dump_out" >&2 die "ptr-deref test failed (LDA [dp],Y expected)" fi fi @@ -1590,6 +1592,7 @@ EOF oLibcF="$(mktemp --suffix=.o)" oStrtolF="$(mktemp --suffix=.o)" oSnprintfF="$(mktemp --suffix=.o)" + oSscanfF="$(mktemp --suffix=.o)" oQsortF="$(mktemp --suffix=.o)" oExtrasF="$(mktemp --suffix=.o)" oStrtokF="$(mktemp --suffix=.o)" @@ -1602,6 +1605,9 @@ EOF -c "$PROJECT_ROOT/runtime/src/strtol.c" -o "$oStrtolF" "$CLANG" --target=w65816 -O2 -ffunction-sections \ -c "$PROJECT_ROOT/runtime/src/snprintf.c" -o "$oSnprintfF" + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" \ + -c "$PROJECT_ROOT/runtime/src/sscanf.c" -o "$oSscanfF" "$CLANG" --target=w65816 -O2 -ffunction-sections \ -c "$PROJECT_ROOT/runtime/src/qsort.c" -o "$oQsortF" "$CLANG" --target=w65816 -O2 -ffunction-sections \ @@ -3115,6 +3121,63 @@ EOF fi rm -f "$cExprFile" "$oExprFile" "$binExprFile" + # IMG8..IMG15 callee-save regression: a recursive double-returning + # function with compound `||` conditions and a recursion inside an + # outer while loop creates enough register pressure for regalloc to + # land a vreg in IMG8..IMG15. Without the W65816ImgCalleeSave pass, + # the inner call clobbered the outer's IMG8..IMG15 → wrong math. + # The classic symptom from picol's `expr 1+2 == 4` instead of 3. + # See feedback_picol_expr_compound_or.md. + log "check: MAME runs orBug double-recursion 1+0 → 1.0 (ImgCalleeSave regression)" + cOrFile="$(mktemp --suffix=.c)" + oOrFile="$(mktemp --suffix=.o)" + binOrFile="$(mktemp --suffix=.bin)" + cat > "$cOrFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +__attribute__((noinline)) +double evalAt(char **p, int prec) { + double a = 0.0; + while (**p >= '0' && **p <= '9') { + a = a * 10.0 + (double)(**p - '0'); + (*p)++; + } + while (1) { + int op = **p; + int oprec; + if (op == '*' || op == '/') oprec = 4; + else if (op == '+' || op == '-') oprec = 3; + else return a; + if (oprec <= prec) return a; + (*p)++; + double b = evalAt(p, oprec); + if (op == '+') a = a + b; + else if (op == '*') a = a * b; + } +} +int main(void) { + char e1[] = "1+0"; + char *p1 = e1; + double v1 = evalAt(&p1, 0); + unsigned long long b1; + __builtin_memcpy(&b1, &v1, 8); + switchToBank2(); + *(volatile unsigned short *)0x5000 = (unsigned short)(b1 >> 48); + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cOrFile" -o "$oOrFile" + "$PROJECT_ROOT/tools/link816" -o "$binOrFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" \ + "$oLibgccFile" "$oOrFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binOrFile" \ + 0x025000 3ff0 >/dev/null 2>&1; then + die "MAME: orBug 1+0 != 1.0 (ImgCalleeSave regression)" + fi + rm -f "$cOrFile" "$oOrFile" "$binOrFile" + log "check: MAME runs sqrt/pow + sin/cos/exp/log + strpbrk/spn/cspn (#81 + #82 + #83)" cTrFile="$(mktemp --suffix=.c)" oTrFile="$(mktemp --suffix=.o)" @@ -3215,6 +3278,106 @@ EOF fi rm -f "$cGmFile" "$oGmFile" "$oGmTime" "$binGmFile" + log "check: MAME runs strftime(%Y-%m-%d %H:%M:%S) → '2024-03-15 06:30:45' (calendar formatting)" + cSfFile="$(mktemp --suffix=.c)" + oSfFile="$(mktemp --suffix=.o)" + oSfTime="$(mktemp --suffix=.o)" + binSfFile="$(mktemp --suffix=.bin)" + cat > "$cSfFile" <<'EOF' +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + time_t t = 1710484245L; // 2024-03-15 06:30:45 UTC + struct tm tm; + gmtime_r(&t, &tm); + char buf[24]; + int n = strftime(buf, sizeof buf, "%Y-%m-%d %H:%M:%S", &tm); + // Snapshot bytes before bank-switch — the runtime-indexed loop + // version (`0x5000 + (i << 1)`) lowers via ptr32 with bank-hi=0 and + // hits bank 0 instead of bank 2. Unroll to use constant addresses. + unsigned char b00 = (unsigned char)buf[0]; unsigned char b01 = (unsigned char)buf[1]; + unsigned char b02 = (unsigned char)buf[2]; unsigned char b03 = (unsigned char)buf[3]; + unsigned char b04 = (unsigned char)buf[4]; unsigned char b05 = (unsigned char)buf[5]; + unsigned char b06 = (unsigned char)buf[6]; unsigned char b07 = (unsigned char)buf[7]; + unsigned char b08 = (unsigned char)buf[8]; unsigned char b09 = (unsigned char)buf[9]; + unsigned char b10 = (unsigned char)buf[10]; unsigned char b11 = (unsigned char)buf[11]; + unsigned char b12 = (unsigned char)buf[12]; unsigned char b13 = (unsigned char)buf[13]; + unsigned char b14 = (unsigned char)buf[14]; unsigned char b15 = (unsigned char)buf[15]; + unsigned char b16 = (unsigned char)buf[16]; unsigned char b17 = (unsigned char)buf[17]; + unsigned char b18 = (unsigned char)buf[18]; + switchToBank2(); + *(volatile unsigned int *)0x5000 = b00; *(volatile unsigned int *)0x5002 = b01; + *(volatile unsigned int *)0x5004 = b02; *(volatile unsigned int *)0x5006 = b03; + *(volatile unsigned int *)0x5008 = b04; *(volatile unsigned int *)0x500a = b05; + *(volatile unsigned int *)0x500c = b06; *(volatile unsigned int *)0x500e = b07; + *(volatile unsigned int *)0x5010 = b08; *(volatile unsigned int *)0x5012 = b09; + *(volatile unsigned int *)0x5014 = b10; *(volatile unsigned int *)0x5016 = b11; + *(volatile unsigned int *)0x5018 = b12; *(volatile unsigned int *)0x501a = b13; + *(volatile unsigned int *)0x501c = b14; *(volatile unsigned int *)0x501e = b15; + *(volatile unsigned int *)0x5020 = b16; *(volatile unsigned int *)0x5022 = b17; + *(volatile unsigned int *)0x5024 = b18; + *(volatile unsigned int *)0x5040 = n; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" -c "$cSfFile" -o "$oSfFile" + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" \ + -c "$PROJECT_ROOT/runtime/src/timeExt.c" -o "$oSfTime" + "$PROJECT_ROOT/tools/link816" -o "$binSfFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oSfTime" "$oSfF" "$oSdF" "$oLibgccFile" \ + "$oStrtolF" "$oSnprintfF" "$oSfFile" >/dev/null 2>&1 + # Expected: "2024-03-15 06:30:45" — bytes at 0x5000+i*2 = ord(c) + # '2'=0x32 '0'=0x30 '2'=0x32 '4'=0x34 '-'=0x2d + # '0'=0x30 '3'=0x33 '-'=0x2d + # '1'=0x31 '5'=0x35 ' '=0x20 + # '0'=0x30 '6'=0x36 ':'=0x3a + # '3'=0x33 '0'=0x30 ':'=0x3a + # '4'=0x34 '5'=0x35 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binSfFile" --check \ + 0x025000=0032 0x025002=0030 0x025004=0032 0x025006=0034 \ + 0x025008=002d 0x02500a=0030 0x02500c=0033 0x02500e=002d \ + 0x025010=0031 0x025012=0035 0x025014=0020 0x025016=0030 \ + 0x025018=0036 0x02501a=003a 0x02501c=0033 0x02501e=0030 \ + 0x025020=003a 0x025022=0034 0x025024=0035 0x025040=0013 \ + >/dev/null 2>&1; then + die "MAME: strftime(%Y-%m-%d %H:%M:%S) wrong output" + fi + rm -f "$cSfFile" "$oSfFile" "$oSfTime" "$binSfFile" + + log "check: MAME runs __udivsi3(1710484245, 86400) → 19797 (libgcc remainder-init)" + cUdsFile="$(mktemp --suffix=.c)" + oUdsFile="$(mktemp --suffix=.o)" + binUdsFile="$(mktemp --suffix=.bin)" + cat > "$cUdsFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +__attribute__((noinline)) unsigned long divIt(unsigned long a, unsigned long b) { + return a / b; +} +int main(void) { + unsigned long q = divIt(1710484245UL, 86400UL); + switchToBank2(); + *(volatile unsigned int *)0x5000 = (unsigned int)(q & 0xFFFFUL); + *(volatile unsigned int *)0x5002 = (unsigned int)((q >> 16) & 0xFFFFUL); + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cUdsFile" -o "$oUdsFile" + "$PROJECT_ROOT/tools/link816" -o "$binUdsFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" "$oLibgccFile" "$oUdsFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binUdsFile" --check \ + 0x025000=4d55 0x025002=0000 >/dev/null 2>&1; then + die "MAME: __udivsi3(1710484245, 86400) != 19797 (b_hi != 0 case)" + fi + rm -f "$cUdsFile" "$oUdsFile" "$binUdsFile" + log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)" cUdmFile="$(mktemp --suffix=.c)" oUdmFile="$(mktemp --suffix=.o)" @@ -3366,6 +3529,50 @@ EOF fi rm -f "$cSqFile" "$oSqFile" "$binSqFile" + # VLA: variable-length array on the stack — exercises FP-relative + # addressing (DP $F6) and the StackSlotCleanup PHP/PLP wrap pass' + # VLA handling (STAfi expands to a 4-MC sequence ending in LDY $F8 + # which clobbers N/Z; wrap must encompass the whole expansion). + # sum_n(3) writes a[0..2] = {1,2,3} in one loop, then sums them + # in a second loop — verifies both loops' back-edges are correct + # and that the final return reads the accumulator (slot 7) right. + log "check: MAME runs sum_n(3) VLA sum → 6 (FP-rel + PHP/PLP wrap)" + cVlaFile="$(mktemp --suffix=.c)" + oVlaFile="$(mktemp --suffix=.o)" + binVlaFile="$(mktemp --suffix=.bin)" + cat > "$cVlaFile" <<'EOF' +typedef unsigned short uint16_t; +__attribute__((noinline)) +uint16_t sum_n(uint16_t n) { + uint16_t a[n]; + for (uint16_t i = 0; i < n; i++) a[i] = i + 1; + uint16_t s = 0; + for (uint16_t i = 0; i < n; i++) s += a[i]; + return s; +} +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + volatile uint16_t n = 3; + uint16_t r = sum_n(n); + switchToBank2(); + *(volatile uint16_t *)0x5000 = r; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cVlaFile" -o "$oVlaFile" 2>/dev/null \ + || die "clang failed to compile a function with a VLA" + "$PROJECT_ROOT/tools/link816" -o "$binVlaFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oSfF" "$oSdF" "$oLibgccFile" "$oVlaFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" \ + "$binVlaFile" 0x025000 0006 >/dev/null 2>&1; then + die "MAME: sum_n(3) != 6 (VLA / FP-rel / PHP-PLP wrap)" + fi + rm -f "$cVlaFile" "$oVlaFile" "$binVlaFile" + log "check: MAME runs -O0 addOne(7) → 8 (lda-overwrite-immediate fix; fast regalloc)" cO0File="$(mktemp --suffix=.c)" oO0File="$(mktemp --suffix=.o)" @@ -3699,14 +3906,8 @@ EOF oFioFile="$(mktemp --suffix=.o)" binFioFile="$(mktemp --suffix=.bin)" cat > "$cFioFile" <<'EOF' +#include extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable); -extern struct __sFILE *fopen(const char *path, const char *mode); -extern unsigned long fread(void *p, unsigned long s, unsigned long n, struct __sFILE *f); -extern int fseek(struct __sFILE *f, long off, int whence); -extern long ftell(struct __sFILE *f); -extern int fclose(struct __sFILE *f); -extern int fgetc(struct __sFILE *f); -extern int fprintf(struct __sFILE *f, const char *fmt, ...); extern int strcmp(const char *a, const char *b); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); @@ -3717,7 +3918,7 @@ static char rbuf[32]; int main(void) { unsigned short ok = 0; if (mfsRegister("greet", data, 13, 13, 0) == 0) ok |= 0x01; - struct __sFILE *f = fopen("greet", "r"); + FILE *f = fopen("greet", "r"); if (f) ok |= 0x02; unsigned int n = fread(rbuf, 1, 13, f); rbuf[13] = 0; @@ -3736,7 +3937,7 @@ int main(void) { while (1) {} } EOF - "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \ "$cFioFile" -o "$oFioFile" "$PROJECT_ROOT/tools/link816" -o "$binFioFile" --text-base 0x1000 \ "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" \ @@ -3748,6 +3949,129 @@ EOF fi rm -f "$cFioFile" "$oFioFile" "$binFioFile" + # fscanf parses numeric directives via a buffer bridge to vsscanf. + # Verifies %d / %x / %ld parse correctly from a real FILE*. + # %s through fscanf shares the pre-existing sscanf %s gap and + # is intentionally not in the assertion (covered separately). + log "check: MAME runs fscanf %d/%x/%ld over mfs-backed file" + cFsFile="$(mktemp --suffix=.c)" + oFsFile="$(mktemp --suffix=.o)" + binFsFile="$(mktemp --suffix=.bin)" + cat > "$cFsFile" <<'EOF' +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + static char buf[64] = "12 -7 0xFF 999\n"; + mfsRegister("rec.dat", buf, 16, 64, 0); + int a = -1, b = -1, d = -1; + unsigned int c = 0; + FILE *f = fopen("rec.dat", "r"); + int n = fscanf(f, "%d %d %x %d", &a, &b, &c, &d); + fclose(f); + switchToBank2(); + *(volatile int *)0x5000 = n; + *(volatile int *)0x5002 = a; + *(volatile int *)0x5004 = b; + *(volatile unsigned int *)0x5006 = c; + *(volatile int *)0x5008 = d; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" -c \ + "$cFsFile" -o "$oFsFile" + "$PROJECT_ROOT/tools/link816" -o "$binFsFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" "$oSscanfF" \ + "$oStrtolF" "$oSfF" "$oSdF" "$oLibgccFile" "$oFsFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binFsFile" --check \ + 0x025000=0004 0x025002=000c 0x025004=fff9 \ + 0x025006=00ff 0x025008=03e7 >/dev/null 2>&1; then + die "MAME: fscanf returned wrong int values" + fi + rm -f "$cFsFile" "$oFsFile" "$binFsFile" + + # Large-frame function across a bank-switched DBR: FP-relative + # addressing must use long-indirect [dp],Y (bank-independent) + # so locals/args remain readable even when the caller has + # changed DBR via pha;plb. Previously used short-indirect + # (dp),Y which reads from DBR; switchToBank2 in the caller + # silently broke every FP-rel slot in the callee. The fixed + # `largeFn(1, 2)` returns sum(1..100) + 100 + 2 = 5152... wait + # sum(i+1 for i in 0..99) = sum(1..100) = 5050; 5050 + arg2(2) + # = 5052 = 0x13BC. + log "check: MAME runs large-frame fn after switchToBank2 → 5052 (FP-rel long-indirect)" + cLfFile="$(mktemp --suffix=.c)" + oLfFile="$(mktemp --suffix=.o)" + binLfFile="$(mktemp --suffix=.bin)" + cat > "$cLfFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +__attribute__((noinline)) int largeFn(int arg1, int arg2) { + int local[100]; + for (int i = 0; i < 100; i++) local[i] = i + arg1; + int sum = arg2; + for (int i = 0; i < 100; i++) sum += local[i]; + return sum; +} +int main(void) { + switchToBank2(); + int r = largeFn(1, 2); + *(volatile unsigned int *)0x5000 = (unsigned int)r; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cLfFile" -o "$oLfFile" + "$PROJECT_ROOT/tools/link816" -o "$binLfFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibgccFile" "$oLfFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binLfFile" --check \ + 0x025000=13bc >/dev/null 2>&1; then + die "MAME: large-frame fn after switchToBank2 != 5052" + fi + rm -f "$cLfFile" "$oLfFile" "$binLfFile" + + # BSWAP lowering: real-world C that constructs a 32-bit value + # from four byte loads (SHA-256 message schedule, JPEG/PNG + # parsers, big-endian network headers) triggers SDAG's BSWAP + # combine. Marked Expand in W65816ISelLowering so the SDAG + # falls back to shifts + ORs — required to compile portable + # C that does byte-swapping by hand. + log "check: MAME runs BSWAP-via-shifts → 0xDEADBEEF byte-reversed (SHA-256-style word build)" + cBswapFile="$(mktemp --suffix=.c)" + oBswapFile="$(mktemp --suffix=.o)" + binBswapFile="$(mktemp --suffix=.bin)" + cat > "$cBswapFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +__attribute__((noinline)) unsigned long packBE(const unsigned char *p) { + return ((unsigned long)p[0] << 24) + | ((unsigned long)p[1] << 16) + | ((unsigned long)p[2] << 8) + | ((unsigned long)p[3]); +} +volatile unsigned char buf[4] = { 0xDE, 0xAD, 0xBE, 0xEF }; +int main(void) { + unsigned long r = packBE((const unsigned char *)buf); + switchToBank2(); + *(volatile unsigned int *)0x5000 = (unsigned int)(r & 0xFFFFUL); + *(volatile unsigned int *)0x5002 = (unsigned int)((r >> 16) & 0xFFFFUL); + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -I"$PROJECT_ROOT/runtime/include" \ + -c "$cBswapFile" -o "$oBswapFile" + "$PROJECT_ROOT/tools/link816" -o "$binBswapFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibgccFile" "$oBswapFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binBswapFile" --check \ + 0x025000=beef 0x025002=dead >/dev/null 2>&1; then + die "MAME: BSWAP-via-shifts produced wrong word" + fi + rm -f "$cBswapFile" "$oBswapFile" "$binBswapFile" + # wchar.h + signal.h. wcslen/wcscmp/wcscpy/wcschr cover the # core wide-char family; mbtowc/wctomb verify the trivial 1:1 # Latin-1 mapping. signal()/raise() are exercised by @@ -3800,6 +4124,288 @@ EOF fi rm -f "$cWsFile" "$oWsFile" "$binWsFile" + # wchar.h extended surface: wmem* family + wcstol + swprintf. + # All delegate to the byte equivalents (1:1 Latin-1) so this + # locks in the conversion correctness end-to-end. + log "check: MAME runs wchar.h extended (wmem* / wcstol / swprintf)" + cWxFile="$(mktemp --suffix=.c)" + oWxFile="$(mktemp --suffix=.o)" + binWxFile="$(mktemp --suffix=.bin)" + cat > "$cWxFile" <<'EOF' +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + unsigned int ok = 0; + static const wchar_t src[] = {'h','e','l','l','o',0}; + wchar_t buf[16]; + wmemset(buf, '.', 8); buf[8] = 0; + if (buf[0] == '.' && buf[7] == '.' && buf[8] == 0) ok |= 0x0001; + wmemcpy(buf, src, 5); buf[5] = 0; + if (buf[0]=='h' && buf[4]=='o' && buf[5]==0) ok |= 0x0002; + if (wmemcmp(buf, src, 5) == 0) ok |= 0x0004; + wchar_t *p = wmemchr(buf, 'l', 5); + if (p == buf + 2) ok |= 0x0008; + static const wchar_t num[] = {'1','2','3',0}; + wchar_t *e; + if (wcstol(num, &e, 10) == 123 && *e == 0) ok |= 0x0010; + static const wchar_t fmt[] = {'%','d',' ','=','=',' ','%','d',0}; + wchar_t pbuf[24]; + int n = swprintf(pbuf, 24, fmt, 7, 42); + if (n == 7) ok |= 0x0020; + if (pbuf[0]=='7' && pbuf[1]==' ' && pbuf[2]=='=') ok |= 0x0040; + if (pbuf[5]=='4' && pbuf[6]=='2' && pbuf[7]==0) ok |= 0x0080; + switchToBank2(); + *(volatile unsigned int *)0x5000 = ok; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -I"$PROJECT_ROOT/runtime/include" -c \ + "$cWxFile" -o "$oWxFile" + "$PROJECT_ROOT/tools/link816" -o "$binWxFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" "$oStrtolF" \ + "$oSfF" "$oSdF" "$oLibgccFile" "$oWxFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binWxFile" --check \ + 0x025000=00ff >/dev/null 2>&1; then + die "MAME: wchar.h extended != 0xFF (wmem*/wcstol/swprintf regression)" + fi + rm -f "$cWxFile" "$oWxFile" "$binWxFile" + + # complex.h core surface: CMPLX, creal/cimag, conj. Validates + # that clang's `_Complex` builtin lowers correctly and our + # accessor inline functions (`__real__` / `__imag__`) emit + # straight-through loads. cabs/carg are exposed but call into + # hypot/atan2 which hit the runtime-sqrt bug, so this check + # stays on the algebraic core. + log "check: MAME runs complex.h core (CMPLX/creal/cimag/conj)" + cCxFile="$(mktemp --suffix=.c)" + oCxFile="$(mktemp --suffix=.o)" + binCxFile="$(mktemp --suffix=.bin)" + cat > "$cCxFile" <<'EOF' +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +volatile double gRe = 3.0; +volatile double gIm = 4.0; +int main(void) { + double _Complex z = CMPLX(gRe, gIm); + int re = (int)creal(z); + int im = (int)cimag(z); + double _Complex w = conj(z); + int wre = (int)creal(w); + int wim = (int)cimag(w); + switchToBank2(); + *(volatile unsigned int *)0x5000 = (unsigned int)re; + *(volatile unsigned int *)0x5002 = (unsigned int)im; + *(volatile unsigned int *)0x5004 = (unsigned int)wre; + *(volatile unsigned int *)0x5006 = (unsigned int)(short)wim; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" -c "$cCxFile" -o "$oCxFile" + "$PROJECT_ROOT/tools/link816" -o "$binCxFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oMathF" "$oSfF" "$oSdF" \ + "$oLibgccFile" "$oCxFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binCxFile" --check \ + 0x025000=0003 0x025002=0004 0x025004=0003 0x025006=fffc \ + >/dev/null 2>&1; then + die "MAME: complex.h core != (3, 4, 3, -4)" + fi + rm -f "$cCxFile" "$oCxFile" "$binCxFile" + + # C11 header surface: fenv, tgmath, stdatomic, threads, + # aligned_alloc. Single-core IIgs has degenerate threads + # (every op stubbed) but the surface must compile + link. + # Atomics lower to plain ops; tgmath dispatches sqrt/sqrtf + # via _Generic. fenv tracks rounding mode + exception + # word but neither affects softFloat output (fixed RNE). + log "check: MAME runs C11 surface (fenv/tgmath/stdatomic/threads/aligned_alloc)" + cC11File="$(mktemp --suffix=.c)" + oC11File="$(mktemp --suffix=.o)" + binC11File="$(mktemp --suffix=.bin)" + cat > "$cC11File" <<'EOF' +#include +#include +#include +#include +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + unsigned int ok = 0; + feclearexcept(FE_ALL_EXCEPT); + if (fegetround() == FE_TONEAREST) ok |= 0x0001; + if (fesetround(FE_UPWARD) == 0) ok |= 0x0002; + if (fegetround() == FE_UPWARD) ok |= 0x0004; + feraiseexcept(FE_INEXACT); + if (fetestexcept(FE_INEXACT)) ok |= 0x0008; + float f = (float)sqrt((float)4.0f); + if (f > 1.99f && f < 2.01f) ok |= 0x0010; + double d = sqrt(9.0); + if (d > 2.99 && d < 3.01) ok |= 0x0020; + atomic_int counter = 0; + atomic_store(&counter, 42); + if (atomic_load(&counter) == 42) ok |= 0x0040; + int prev = atomic_fetch_add(&counter, 8); + if (prev == 42 && atomic_load(&counter) == 50) ok |= 0x0080; + atomic_flag flg = ATOMIC_FLAG_INIT; + if (!atomic_flag_test_and_set(&flg)) ok |= 0x0100; + if (atomic_flag_test_and_set(&flg)) ok |= 0x0200; + mtx_t m; + if (mtx_init(&m, mtx_plain) == thrd_success) ok |= 0x0400; + if (mtx_lock(&m) == thrd_success) ok |= 0x0800; + if (mtx_unlock(&m) == thrd_success) ok |= 0x1000; + void *p = aligned_alloc(64, 128); + if (p && ((unsigned long)p & 63) == 0) ok |= 0x2000; + aligned_free(p); + switchToBank2(); + *(volatile unsigned int *)0x5000 = ok; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -std=c11 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" -c "$cC11File" -o "$oC11File" + "$PROJECT_ROOT/tools/link816" -o "$binC11File" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" "$oMathF" \ + "$oSfF" "$oSdF" "$oLibgccFile" "$oC11File" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binC11File" \ + 0x025000 3fff >/dev/null 2>&1; then + die "MAME: C11 surface bitmap != 0x3FFF (fenv/tgmath/atomic/threads/aligned_alloc)" + fi + rm -f "$cC11File" "$oC11File" "$binC11File" + + # C11 keyword-alias headers + Unicode types. iso646.h provides + # alternative operator spellings (`and`, `or`, etc.); stdalign.h + # aliases `_Alignas`/`_Alignof`; stdnoreturn.h aliases + # `_Noreturn`; uchar.h provides char16_t / char32_t plus the + # mbrtoc16 / c16rtomb / mbrtoc32 / c32rtomb conversion helpers + # (1:1 byte mapping in our Latin-1 model); wctype.h delegates + # wide-char classification to the byte equivalents. + log "check: MAME runs C11 keyword+Unicode headers (iso646/stdalign/stdnoreturn/uchar/wctype)" + cC11kFile="$(mktemp --suffix=.c)" + oC11kFile="$(mktemp --suffix=.o)" + binC11kFile="$(mktemp --suffix=.bin)" + cat > "$cC11kFile" <<'EOF' +#include +#include +#include +#include +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +noreturn static void unreached(void) { while (1) {} } +int main(void) { + unsigned int ok = 0; + // iso646: `and` and `not_eq` expand to operators + if ((1 and 2) and (3 not_eq 4)) ok |= 0x0001; + if (not 0) ok |= 0x0002; + // stdalign: alignof returns target alignment (>= 1) + if (alignof(int) >= 1) ok |= 0x0004; + // uchar: char16_t / char32_t conversion (1:1 byte mapping) + char16_t c16 = 0; + char32_t c32 = 0; + mbstate_t st = {0}; + if (mbrtoc16(&c16, "Z", 1, &st) == 1 and c16 == 'Z') ok |= 0x0008; + if (mbrtoc32(&c32, "Z", 1, &st) == 1 and c32 == 'Z') ok |= 0x0010; + char mb[2] = {0}; + if (c16rtomb(mb, 0x4F, &st) == 1 and mb[0] == 'O') ok |= 0x0020; + // wctype: classification + case folding + if (iswalpha('A') and not iswalpha('5')) ok |= 0x0040; + if (iswdigit('5') and not iswdigit('A')) ok |= 0x0080; + if (iswspace(' ') and not iswspace('A')) ok |= 0x0100; + if (towlower('X') == 'x') ok |= 0x0200; + if (towupper('y') == 'Y') ok |= 0x0400; + if (not iswalpha(0x1234)) ok |= 0x0800; + (void)unreached; + switchToBank2(); + *(volatile unsigned int *)0x5000 = ok; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -std=c11 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" -c "$cC11kFile" -o "$oC11kFile" + "$PROJECT_ROOT/tools/link816" -o "$binC11kFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oLibgccFile" "$oC11kFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binC11kFile" --check \ + 0x025000=0fff >/dev/null 2>&1; then + die "MAME: C11 keyword+Unicode header bitmap != 0x0FFF" + fi + rm -f "$cC11kFile" "$oC11kFile" "$binC11kFile" + + # math.h C99 additions: fma, remainder, lround, rint, scalbn, + # fpclassify, nan(). Inverse hyperbolics (asinh/acosh/atanh) + # are exposed in the header but not exercised here because + # they call into sqrt/log which trigger a pre-existing + # runtime-sqrt crash under -O2. All other surface members + # use only bit manipulation + softDouble round-trip ops which + # are known to work end-to-end. + log "check: MAME runs math.h C99 additions (fma/remainder/lround/rint/scalbn/fpclassify)" + cMaFile="$(mktemp --suffix=.c)" + oMaFile="$(mktemp --suffix=.o)" + binMaFile="$(mktemp --suffix=.bin)" + cat > "$cMaFile" <<'EOF' +#include +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +volatile double gA = 2.0, gB = 3.0, gC = 4.0; +volatile double gX = 7.0, gY = 4.0; +volatile double gPos = 2.7; +volatile double gNeg = -2.7; +volatile double gS = 1.5; +// noinline helpers to keep main's register pressure low +__attribute__((noinline)) static int checkFma(void) { + return fma(gA, gB, gC) == 10.0; +} +__attribute__((noinline)) static int checkRem(void) { + return remainder(gX, gY) == -1.0; +} +__attribute__((noinline)) static int checkLroundPos(void) { + return lround(gPos) == 3L; +} +__attribute__((noinline)) static int checkLroundNeg(void) { + return lround(gNeg) == -3L; +} +__attribute__((noinline)) static int checkRint(void) { + return rint(gPos) == 3.0; +} +__attribute__((noinline)) static int checkScalbn(void) { + return scalbn(gS, 3) == 12.0; +} +int main(void) { + unsigned int ok = 0; + if (checkFma()) ok |= 0x0001; + if (checkRem()) ok |= 0x0002; + if (checkLroundPos()) ok |= 0x0004; + if (checkLroundNeg()) ok |= 0x0008; + if (checkRint()) ok |= 0x0010; + if (checkScalbn()) ok |= 0x0020; + if (fpclassify(1.0) == FP_NORMAL) ok |= 0x0040; + if (fpclassify(0.0) == FP_ZERO) ok |= 0x0080; + if (fpclassify(HUGE_VAL) == FP_INFINITE) ok |= 0x0100; + if (isnan(nan(""))) ok |= 0x0200; + switchToBank2(); + *(volatile unsigned int *)0x5000 = ok; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" -c "$cMaFile" -o "$oMaFile" + "$PROJECT_ROOT/tools/link816" -o "$binMaFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oMathF" \ + "$oSfF" "$oSdF" "$oLibgccFile" "$oMaFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binMaFile" --check \ + 0x025000=03ff >/dev/null 2>&1; then + die "MAME: math.h C99 additions bitmap != 0x03FF" + fi + rm -f "$cMaFile" "$oMaFile" "$binMaFile" + # clock() reads the IIgs VBL counter at $E1006B (24-bit # absolute load). Works without toolbox init. time() # without iigsToolboxInit() returns 0 (no crash). @@ -3879,7 +4485,7 @@ static int sumAreas(Shape **shapes, int n) { for (int i = 0; i < n; i++) total += shapes[i]->area(); return total; } -extern "C" int main(void) { +int main(void) { Rect r(3, 4); Square s(5); Circle c(2); Shape *arr[3] = { &r, &s, &c }; int total = sumAreas(arr, 3); @@ -3928,7 +4534,7 @@ public: int draw() const override { return x * 100; } int move(int dx) const override { return x + dx; } }; -extern "C" int main(void) { +int main(void) { Sprite s(7); Drawable *d = &s; Movable *m = &s; @@ -3980,7 +4586,7 @@ public: Diamond(int x) : Base(x), A(x), B(x) {} int kind() const override { return 99; } }; -extern "C" int main(void) { +int main(void) { Diamond d(42); int ok = 0; if (d.kind() == 99) ok |= 1; @@ -4046,7 +4652,7 @@ public: Diamond(int x) : Base(x), A(x), B(x) {} int who() override { return 99; } }; -extern "C" int main(void) { +int main(void) { Dog dog; Cat cat; Animal *a = &dog; int ok = 0; @@ -4189,7 +4795,7 @@ EOF oExcAbi="$(mktemp --suffix=.o)" binCppExcFile="$(mktemp --suffix=.bin)" cat > "$cppExcFile" <<'EOF' -extern "C" int main(void) { +int main(void) { int ok = 0; try { throw 42; } catch (int e) { if (e == 42) ok = 1; } *(volatile unsigned short *)0x5000 = (unsigned short)ok; @@ -4222,16 +4828,13 @@ EOF oHdFile="$(mktemp --suffix=.o)" binHdFile="$(mktemp --suffix=.bin)" cat > "$cHdFile" <<'EOF' +#include extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable); -extern struct __sFILE *fopen(const char *path, const char *mode); -extern int fclose(struct __sFILE *f); -extern int fgetc(struct __sFILE *f); -extern int fprintf(struct __sFILE *f, const char *fmt, ...); extern char *strstr(const char *h, const char *n); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); } -__attribute__((noinline)) void hexdump(struct __sFILE *in, struct __sFILE *out) { +__attribute__((noinline)) void hexdump(FILE *in, FILE *out) { unsigned int offset = 0; unsigned char line[16]; int linelen; @@ -4264,8 +4867,8 @@ static char output[300]; int main(void) { mfsRegister("in", input, 16, 16, 0); mfsRegister("out", output, 0, 300, 1); - struct __sFILE *in = fopen("in", "r"); - struct __sFILE *out = fopen("out", "w"); + FILE *in = fopen("in", "r"); + FILE *out = fopen("out", "w"); hexdump(in, out); fclose(in); fclose(out); int ok = 0; @@ -4277,7 +4880,7 @@ int main(void) { while (1) {} } EOF - "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \ "$cHdFile" -o "$oHdFile" "$PROJECT_ROOT/tools/link816" -o "$binHdFile" --text-base 0x1000 \ "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" \ @@ -4380,6 +4983,7 @@ EOF oShFile="$(mktemp --suffix=.o)" binShFile="$(mktemp --suffix=.bin)" cat > "$cShFile" <<'EOF' +#include extern void *malloc(unsigned long n); extern void free(void *p); extern unsigned long strlen(const char *s); @@ -4387,9 +4991,6 @@ extern int strcmp(const char *a, const char *b); extern char *strchr(const char *s, int c); extern char *strstr(const char *h, const char *n); extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable); -extern struct __sFILE *fopen(const char *path, const char *mode); -extern int fclose(struct __sFILE *f); -extern int fprintf(struct __sFILE *f, const char *fmt, ...); __attribute__((noinline)) static void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); } @@ -4447,7 +5048,7 @@ __attribute__((noinline)) static char *takeRest(char *s) { *end = 0; return *s ? s : (char *)0; } -__attribute__((noinline)) static int dispatch(char *line, struct __sFILE *out) { +__attribute__((noinline)) static int dispatch(char *line, FILE *out) { char *cmd; char *rest = takeToken(line, &cmd); if (!cmd) return 0; if (strcmp(cmd, "INSERT") == 0) { @@ -4472,7 +5073,7 @@ __attribute__((noinline)) static int dispatch(char *line, struct __sFILE *out) { if (strcmp(cmd, "COUNT") == 0) { fprintf(out, "COUNT = %u\n", (unsigned)totalEntries); return 1; } return 0; } -__attribute__((noinline)) static int runScript(const char *script, struct __sFILE *out) { +__attribute__((noinline)) static int runScript(const char *script, FILE *out) { int n = 0; char buf[64]; const char *p = script; @@ -4566,7 +5167,7 @@ __asm__ ( ); int main(void) { mfsRegister("out", outbuf, 0, 1024, 1); - struct __sFILE *out = fopen("out", "w"); + FILE *out = fopen("out", "w"); int cmds = runScript(SCRIPT, out); fprintf(out, "ran %d cmds\n", cmds); fclose(out); @@ -4585,7 +5186,7 @@ int main(void) { while (1) {} } EOF - "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \ "$cShFile" -o "$oShFile" "$PROJECT_ROOT/tools/link816" -o "$binShFile" --text-base 0x1000 \ "$oCrt0F" "$oLibcF" "$oExtrasF" "$oSnprintfF" \ @@ -5083,7 +5684,7 @@ EOF --bss-base 0xFF00 "$oBigFile" "$oLibgccFile" 2>/tmp/bsslink.err; then die "link816 should have rejected --bss-base 0xFF00 + 0x200 bss (above LC ceiling)" fi - if ! grep -q 'exceeds bank-0 LC ceiling' /tmp/bsslink.err; then + if ! grep -q 'exceeds bank-0 ceiling' /tmp/bsslink.err; then die "link816 LC-ceiling diagnostic missing: $(cat /tmp/bsslink.err)" fi rm -f "$cBigFile" "$oBigFile" "$binBssOFile" /tmp/bsslink.err @@ -5134,6 +5735,106 @@ EOF fi rm -f "$cBssLcFile" "$oBssLcFile" "$binBssLcFile" "$mapBssLcFile" + # Multi-bank BSS: --bss-base 0xNN0000 places BSS in bank NN + # instead of bank 0. crt0 reads the new linker symbol + # `__bss_bank` to temporarily set DBR for the BSS-clear loop, + # uses `__bss_lo16` + X via DBR-relative `stz abs,X`, and `__bss_size` + # for the count. Verifies (a) BSS-resident global writes/reads work + # in the non-bank-0 bank, AND (b) crt0 correctly zeroed the BSS + # before main (untouched-array-element XOR returns 0). + log "check: MAME runs program with --bss-base 0x030000 (multi-bank BSS)" + cBmbFile="$(mktemp --suffix=.c)" + oBmbFile="$(mktemp --suffix=.o)" + binBmbFile="$(mktemp --suffix=.bin)" + cat > "$cBmbFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +static unsigned short g_arr[8]; +int main(void) { + g_arr[0] = 0x1234; + g_arr[7] = 0x5678; + unsigned short s = g_arr[0] ^ g_arr[7]; // 0x1234 ^ 0x5678 = 0x444C + unsigned short z = g_arr[1] | g_arr[2] | g_arr[3] + | g_arr[4] | g_arr[5] | g_arr[6]; // 0 if BSS zeroed + switchToBank2(); + *(volatile unsigned short *)0x5000 = s; + *(volatile unsigned short *)0x5002 = z; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cBmbFile" -o "$oBmbFile" + # Use the prebuilt runtime/*.o (smoke's per-test mktemp .o files + # may have been rm -f'd by earlier checks). + if ! "$PROJECT_ROOT/tools/link816" -o "$binBmbFile" --text-base 0x1000 \ + --bss-base 0x030000 \ + "$PROJECT_ROOT/runtime/crt0.o" "$PROJECT_ROOT/runtime/libc.o" \ + "$PROJECT_ROOT/runtime/softFloat.o" "$PROJECT_ROOT/runtime/softDouble.o" \ + "$PROJECT_ROOT/runtime/libgcc.o" "$oBmbFile" \ + >/dev/null 2>&1; then + die "link816 --bss-base 0x030000 failed (multi-bank BSS link regression)" + fi + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" \ + "$binBmbFile" --check 0x025000=444c 0x025002=0000 >/dev/null 2>&1; then + die "MAME: --bss-base 0x030000 failed (BSS in bank 3 not zeroed or not writable)" + fi + rm -f "$cBmbFile" "$oBmbFile" "$binBmbFile" + + log "check: MAME runs program with BSS spanning 2 banks (>64KB)" + cMbbFile="$(mktemp --suffix=.c)" + oMbbFile="$(mktemp --suffix=.o)" + binMbbFile="$(mktemp --suffix=.bin)" + cat > "$cMbbFile" <<'EOF' +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +__attribute__((used)) char arr1[50000]; +__attribute__((used)) char arr2[50000]; +int main(void) { + arr1[0] = 0; + arr2[0] = 0; + // Probe via long-absolute (DBR-independent) at fixed offsets in + // bank 3 and bank 4. All should read 0 if crt0's multi-bank BSS + // clear loop walked both segments. + unsigned char b0, b1, b2, b3; + __asm__ volatile ( + "sep #0x20\n.byte 0xAF\n.word 0x0100\n.byte 3\nrep #0x20\nand #0xff\n" + : "=a"(b0)); + __asm__ volatile ( + "sep #0x20\n.byte 0xAF\n.word 0xC000\n.byte 3\nrep #0x20\nand #0xff\n" + : "=a"(b1)); + __asm__ volatile ( + "sep #0x20\n.byte 0xAF\n.word 0x0100\n.byte 4\nrep #0x20\nand #0xff\n" + : "=a"(b2)); + __asm__ volatile ( + "sep #0x20\n.byte 0xAF\n.word 0x8000\n.byte 4\nrep #0x20\nand #0xff\n" + : "=a"(b3)); + switchToBank2(); + *(volatile unsigned int *)0x5000 = (unsigned int)b0; + *(volatile unsigned int *)0x5002 = (unsigned int)b1; + *(volatile unsigned int *)0x5004 = (unsigned int)b2; + *(volatile unsigned int *)0x5006 = (unsigned int)b3; + while (1); +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cMbbFile" -o "$oMbbFile" + if ! "$PROJECT_ROOT/tools/link816" -o "$binMbbFile" --text-base 0x1000 \ + --bss-base 0x030000 \ + "$PROJECT_ROOT/runtime/crt0.o" "$PROJECT_ROOT/runtime/libc.o" \ + "$PROJECT_ROOT/runtime/softFloat.o" "$PROJECT_ROOT/runtime/softDouble.o" \ + "$PROJECT_ROOT/runtime/libgcc.o" "$oMbbFile" \ + >/dev/null 2>&1; then + die "link816 with 100KB BSS in bank 3+ failed" + fi + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binMbbFile" --check \ + 0x025000=0000 0x025002=0000 \ + 0x025004=0000 0x025006=0000 >/dev/null 2>&1; then + die "MAME: BSS span across banks not zeroed (multi-bank crt0 walk)" + fi + rm -f "$cMbbFile" "$oMbbFile" "$binMbbFile" + # OMF emitter — wrap the linked binary as a single-segment OMF # file ready for IIgs loading. log "check: omfEmit produces a valid OMF v2.1 single-segment file" @@ -5492,9 +6193,7 @@ if [ "${GSOS_FILE_SMOKE:-0}" = "1" ] \ testFileGsf="$(mktemp --suffix=.dat)" printf 'Hello, world!' > "$testFileGsf" cat > "$cGsfFile" <<'EOF' -extern struct __sFILE *fopen(const char *path, const char *mode); -extern unsigned long fread(void *p, unsigned long s, unsigned long n, struct __sFILE *f); -extern int fclose(struct __sFILE *f); +#include static char rbuf[16]; __attribute__((noinline)) static int strnequ(const char *a, const char *b, int n) { for (int i = 0; i < n; i++) if (a[i] != b[i]) return 0; @@ -5502,7 +6201,7 @@ __attribute__((noinline)) static int strnequ(const char *a, const char *b, int n } int main(void) { unsigned char ok = 0; - struct __sFILE *f = fopen("/DATA/TESTFILE", "r"); + FILE *f = fopen("/DATA/TESTFILE", "r"); if (f) { ok |= 0x10; unsigned long n = fread(rbuf, 1, 13, f); @@ -5514,7 +6213,7 @@ int main(void) { return 0; } EOF - "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \ "$cGsfFile" -o "$oGsfFile" "$PROJECT_ROOT/tools/link816" -o "$binGsf" --text-base 0x1000 \ --map "$mapGsf" --reloc-out "$relGsf" \ diff --git a/src/clang/lib/Basic/Targets/W65816.h b/src/clang/lib/Basic/Targets/W65816.h index e4edacf..87b794e 100644 --- a/src/clang/lib/Basic/Targets/W65816.h +++ b/src/clang/lib/Basic/Targets/W65816.h @@ -45,7 +45,7 @@ public: IntPtrType = SignedLong; PtrDiffType = SignedLong; SigAtomicType = SignedLong; - resetDataLayout("e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16"); + resetDataLayout("e-m:e-p:32:16-i16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S8"); } void getTargetDefines(const LangOptions &Opts, diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index 58323b8..2f3afc8 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -789,20 +789,45 @@ struct Linker { // bail clearly rather than silently corrupt. uint32_t loadEnd = L.initBase + L.initSize; L.bssBase = bssBase; - if (L.bssBase < loadEnd) { - // Page-align upward for nicer addresses in the map. - L.bssBase = (loadEnd + 0xFF) & ~0xFFu; - if (L.bssBase >= 0xC000 && L.bssBase < 0xD000) { - L.bssBase = 0xD000; + // If --bss-base specifies a non-bank-0 address, the user is + // explicitly placing BSS in a different bank — don't auto- + // adjust around bank-0 hazard zones (IO window, LC1) or + // collide with the text/rodata/init load area. Validate the + // bank choice + intra-bank fit instead. + bool bssOutOfBank0 = (L.bssBase >= 0x010000u); + if (!bssOutOfBank0) { + if (L.bssBase < loadEnd) { + // Page-align upward for nicer addresses in the map. + L.bssBase = (loadEnd + 0xFF) & ~0xFFu; + if (L.bssBase >= 0xC000 && L.bssBase < 0xD000) { + L.bssBase = 0xD000; + } + } + if (L.bssBase + L.bssSize > 0x10000u) { + char msg[256]; + std::snprintf(msg, sizeof(msg), + "bss [0x%X+%u] exceeds bank-0 ceiling 0x10000 — " + "shrink runtime, or pass --bss-base 0xNN0000 " + "(multi-bank BSS up to 4 banks now supported)", + L.bssBase, L.bssSize); + die(msg); + } + } else { + // Multi-bank BSS: BSS may now span multiple consecutive + // banks. crt0 clears bank-by-bank using DBR-relative + // STZ abs,X — see __bss_seg_count / __bss_segN_* symbols + // below. Capped at 4 segments (= 256KB BSS max) which + // covers any realistic IIgs program. + uint32_t firstBank = (L.bssBase >> 16) & 0xFF; + uint32_t lastBank = ((L.bssBase + L.bssSize - 1) >> 16) & 0xFF; + uint32_t segCount = lastBank - firstBank + 1; + if (segCount > 4) { + char msg[200]; + std::snprintf(msg, sizeof(msg), + "bss [0x%X+%u] spans %u banks — limit is 4 (256KB)", + L.bssBase, L.bssSize, segCount); + die(msg); } - } - if (L.bssBase + L.bssSize > 0x10000u) { - char msg[160]; - std::snprintf(msg, sizeof(msg), - "bss [0x%X+%u] exceeds bank-0 LC ceiling 0x10000 — " - "shrink the runtime or split into bank 1", - L.bssBase, L.bssSize); - die(msg); } // Publish layout now so resolveSym() can read it during reloc // application (it's a const member that uses lastLayout). @@ -819,6 +844,72 @@ struct Linker { globalSyms["__init_array_end"] = initBase + curInit; globalSyms["__bss_start"] = L.bssBase; globalSyms["__bss_end"] = L.bssBase + L.bssSize; + // Multi-bank-BSS support: split __bss_start into the 16-bit + // intra-bank offset and the 8-bit bank byte. crt0 needs the + // bank byte separately so it can temporarily set DBR to that + // bank for the BSS-clear loop (which uses STZ abs,X — DBR- + // relative — and so reads bytes from the wrong bank if BSS + // is placed in a non-zero bank). Also emit __bss_size as a + // 16-bit count for the loop boundary; doing so saves crt0 + // from doing the (__bss_end - __bss_start) subtraction at + // runtime, and keeps the count clean even when __bss_start + // and __bss_end straddle bank-boundary arithmetic. + globalSyms["__bss_lo16"] = L.bssBase & 0xFFFF; + globalSyms["__bss_bank"] = (L.bssBase >> 16) & 0xFF; + globalSyms["__bss_size"] = L.bssSize <= 0xFFFFu ? L.bssSize + : 0xFFFFu; + // Multi-bank BSS segment table — up to 4 entries. Each segment + // has (lo16, bank, size16). Segment 0 starts at __bss_lo16 in + // __bss_bank; segments 1..N-1 start at offset 0 in successive + // banks. crt0 walks __bss_seg{0..N-1}_size and skips when 0. + { + uint32_t curBase = L.bssBase; + uint32_t curRem = L.bssSize; + uint32_t segIdx = 0; + const char *sizeNames[4] = { + "__bss_seg0_size", "__bss_seg1_size", + "__bss_seg2_size", "__bss_seg3_size" + }; + const char *bankNames[4] = { + "__bss_seg0_bank", "__bss_seg1_bank", + "__bss_seg2_bank", "__bss_seg3_bank" + }; + const char *lo16Names[4] = { + "__bss_seg0_lo16", "__bss_seg1_lo16", + "__bss_seg2_lo16", "__bss_seg3_lo16" + }; + // Cap segment size to 0xFF00 (= 65280) so the 16-bit + // CPX in crt0 doesn't wrap to 0 on a full-bank segment. + // Excess bytes in that bank stay uncleared at link time — + // we'd need to chain a second segment in the same bank to + // cover them. Implementation: track per-segment max as + // 0xFF00, and if a single bank needs more, allocate two + // segments in that bank. + constexpr uint32_t MAX_SEG = 0xFF00u; + for (segIdx = 0; segIdx < 4; segIdx++) { + uint32_t bankEnd = (curBase & 0xFF0000u) + 0x10000u; + uint32_t avail = bankEnd - curBase; + uint32_t seg = curRem < avail ? curRem : avail; + if (seg > MAX_SEG) seg = MAX_SEG; + globalSyms[lo16Names[segIdx]] = curBase & 0xFFFF; + globalSyms[bankNames[segIdx]] = (curBase >> 16) & 0xFF; + globalSyms[sizeNames[segIdx]] = seg; + curRem -= seg; + if (curRem == 0) { segIdx++; break; } + curBase += seg; // advance within bank or to next + if ((curBase & 0xFFFFu) == 0) { + // Crossed bank boundary — already at start of next bank. + } else if ((curBase & 0xFF0000u) != ((curBase - 1) & 0xFF0000u)) { + // Just crossed into next bank. + } + } + // Zero out any unused segment slots so crt0 sees size=0. + for (uint32_t i = segIdx; i < 4; i++) { + globalSyms[lo16Names[i]] = 0; + globalSyms[bankNames[i]] = 0; + globalSyms[sizeNames[i]] = 0; + } + } // __heap_start / __heap_end: pick the largest contiguous safe // range above bss_end. Without this, the previous hardcoded // heap_end=$BF00 gave heap_end < heap_start whenever BSS diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index d457117..e5d0de2 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -35,6 +35,9 @@ add_llvm_target(W65816CodeGen W65816PreSpillCrossCall.cpp W65816SjLjFinalize.cpp W65816LowerWide32.cpp + W65816I32IncFold.cpp + W65816ImgCalleeSave.cpp + W65816NarrowI32Mul.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp W65816MCInstLower.cpp diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index 2bf5a91..a44611b 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -116,6 +116,14 @@ FunctionPass *createW65816PreSpillCrossCall(); // W65816SjLjFinalize.cpp. FunctionPass *createW65816SjLjFinalize(); +// IR pass: detect `mul i32 X, Y` where the top 16 bits of both X and Y +// are provably zero (via IR-level computeKnownBits, which traces +// through PHIs) and rewrite to a call to `__umulhisi3` (16x16 -> 32). +// IR-level analysis catches cases SDAG can't, because IndVarSimplify +// widens narrow loop counters to i32 before SDAG sees them, hiding the +// zext that a SDAG-level combine would key off. See W65816NarrowI32Mul.cpp. +FunctionPass *createW65816NarrowI32Mul(); + // Pre-RA pass that lowers Wide32 register pairs into pairs of i16 // vregs. Without this, greedy/basic regalloc can't fit the pair- // pressure of i64-via-2-i32-via-Wide32 traffic in i64-heavy @@ -125,9 +133,24 @@ FunctionPass *createW65816SjLjFinalize(); // take 2 i16 ptr operands directly. FunctionPass *createW65816LowerWide32(); +// Pre-emit peephole: detect the post-PEI 6-instruction `i32 += 1` +// pattern (LDA-ADCi16imm-STA-LDA-ADCEi16imm-STA on consecutive i16 +// stack-rel halves) and rewrite to LDA-INA-STA + INC_HI_IF_CARRY. +// Saves ~13 cyc per pointer increment in the common no-carry path. +// See W65816I32IncFold.cpp. +FunctionPass *createW65816I32IncFold(); + +// Post-RA, pre-PEI pass that emits prologue save + epilogue restore for +// IMG8..IMG15 if the function uses them. Makes IMG8..IMG15 behave as +// callee-saved at the asm level without going through LLVM's CSR +// mechanism (which would shift regalloc decisions and break other +// tests). See W65816ImgCalleeSave.cpp. +FunctionPass *createW65816ImgCalleeSave(); + void initializeW65816AsmPrinterPass(PassRegistry &); void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &); void initializeW65816StackSlotCleanupPass(PassRegistry &); +void initializeW65816I32IncFoldPass(PassRegistry &); void initializeW65816SepRepCleanupPass(PassRegistry &); void initializeW65816BranchExpandPass(PassRegistry &); void initializeW65816TiedDefSpillPass(PassRegistry &); @@ -138,6 +161,8 @@ void initializeW65816NegYIndYPass(PassRegistry &); void initializeW65816PreSpillCrossCallPass(PassRegistry &); void initializeW65816SjLjFinalizePass(PassRegistry &); void initializeW65816LowerWide32Pass(PassRegistry &); +void initializeW65816ImgCalleeSavePass(PassRegistry &); +void initializeW65816NarrowI32MulPass(PassRegistry &); } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp index 5e27b45..ce8fa2a 100644 --- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp +++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp @@ -18,8 +18,11 @@ #include "TargetInfo/W65816TargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" @@ -182,6 +185,48 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; + case W65816::INC_HI_IF_CARRY_StackRel: { + // Conditional-increment of an i32's hi half on stack at (off, S). + // Z is presumed set/clear from the preceding LDA-INA-STA on the + // lo half: Z=1 means lo wrapped 0xFFFF→0, so hi must be incremented; + // Z=0 means no overflow, skip. STA preserves N/Z so the gap + // between the lo's INA and our BNE is OK. + // + // Emits: + // bne + // lda $off, s + // inc a + // sta $off, s + // : + int64_t Off = MI->getOperand(0).getImm(); + MCSymbol *SkipSym = OutContext.createTempSymbol(); + { + MCInst BneI; + BneI.setOpcode(W65816::BNE); + BneI.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(SkipSym, OutContext))); + EmitToStreamer(*OutStreamer, BneI); + } + { + MCInst Lda; + Lda.setOpcode(W65816::LDA_StackRel); + Lda.addOperand(MCOperand::createImm(Off)); + EmitToStreamer(*OutStreamer, Lda); + } + { + MCInst Ina; + Ina.setOpcode(W65816::INA); + EmitToStreamer(*OutStreamer, Ina); + } + { + MCInst Sta; + Sta.setOpcode(W65816::STA_StackRel); + Sta.addOperand(MCOperand::createImm(Off)); + EmitToStreamer(*OutStreamer, Sta); + } + OutStreamer->emitLabel(SkipSym); + return; + } case W65816::ADJCALLSTACKDOWN: { // DOWN is a no-op in our scheme — the PUSH16 sequence in LowerCall // already shifted SP incrementally as args were pushed. Nothing diff --git a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp index 08885a2..f85ea6d 100644 --- a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp +++ b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp @@ -161,13 +161,37 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF, // `LDA/STA ($F6),Y` (bank-0 implicit, since the stack is always // bank 0). A holds the new S right after TCS — store it before // restoring A from Y. - if (StackSize > 200) { + // Capture FP into $F6 when: + // - frame > 200 bytes (8-bit `,S` disp can't reach far slots), OR + // - function has VLAs (DYNAMIC_STACKALLOC shifts S, breaking + // static-frame `,s` access). + if (StackSize > 200 || HasVLA) { MF.getInfo()->setUsesDpFP(true); BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF6); + // Bank byte at $F8 = 0. expandFarFI uses `LDA/STA [dp],Y` + // (long indirect Y, opcodes B7/97) which reads a 24-bit + // pointer at $F6/$F7/$F8 and ignores DBR. Without this + // forced-bank-zero, callers that have switched DBR (e.g. + // for I/O register access via `pha;plb`) silently corrupt + // every FP-relative load and store in the callee. + // sha256_transform exposed this — its 246-byte frame uses + // FP-rel, and the test driver switched DBR to bank 2 for + // probe writes before invoking it. + BuildMI(MBB, MBBI, DL, TII.get(W65816::STZ_DP)).addImm(0xF8); } BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); } } + // VLA function with no static frame (or PHA-only): still need FP. + if (HasVLA && + !MF.getInfo()->getUsesDpFP()) { + MF.getInfo()->setUsesDpFP(true); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF6); + BuildMI(MBB, MBBI, DL, TII.get(W65816::STZ_DP)).addImm(0xF8); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); + } } void W65816FrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp b/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp new file mode 100644 index 0000000..b6bab79 --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp @@ -0,0 +1,225 @@ +//===-- W65816I32IncFold.cpp - Fold i32 += 1 into INC + conditional skip --===// +// +// Pre-emit peephole: detect the post-PEI 6-instruction sequence emitted +// for `i32 += 1` on a Wide32 vreg whose halves spilled to two stack-rel +// slots, and rewrite to a tighter form using INA + a conditional skip +// over the hi half. +// +// Original (after PEI, pseudos still un-expanded): +// $a = LDA_StackRel imm_lo ; load lo half +// $a = ADCi16imm $a, 1 ; CLC + ADC #1 (5 cyc) +// STA_StackRel $a, imm_lo ; store lo +// $a = LDA_StackRel imm_hi ; load hi half +// $a = ADCEi16imm $a, 0 ; ADC #0 (uses carry from lo) +// STA_StackRel $a, imm_hi ; store hi +// +// Cycle cost: 5 + 2 + 3 + 5 + 5 + 3 + 5 = 28 cyc +// +// Rewrite: +// $a = LDA_StackRel imm_lo ; load lo +// $a = INA_PSEUDO $a, $a ; lo + 1 — sets Z based on result +// STA_StackRel $a, imm_lo ; store lo (Z preserved) +// INC_HI_IF_CARRY_StackRel imm_hi ; AsmPrinter expands to: +// ; bne L_skip +// ; lda imm_hi, s +// ; inc a +// ; sta imm_hi, s +// ; L_skip: +// +// Cycle cost (no carry, common case): +// 5 + 2 + 5 + 3 (BNE taken) = 15 cyc — saves 13 cyc +// Cycle cost (with carry, rare case): +// 5 + 2 + 5 + 2 (BNE not-taken) + 5 + 2 + 5 = 26 cyc — saves 2 cyc +// +// The Z flag from `INA` survives the intervening STA_StackRel because +// STA does not modify the processor status register. The BNE in the +// expansion of INC_HI_IF_CARRY_StackRel reads that Z to decide whether +// the hi half needs to be touched. +//===----------------------------------------------------------------------===// + +#include "W65816.h" +#include "W65816InstrInfo.h" +#include "W65816Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-i32-inc-fold" + +namespace { +class W65816I32IncFold : public MachineFunctionPass { +public: + static char ID; + W65816I32IncFold() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return "W65816 i32 += 1 → INC + conditional skip"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // namespace + +char W65816I32IncFold::ID = 0; + +INITIALIZE_PASS(W65816I32IncFold, DEBUG_TYPE, + "W65816 i32 += 1 fold", false, false) + +namespace llvm { +void initializeW65816I32IncFoldPass(PassRegistry &); +} + +// Match the 6-instruction sequence; returns the post-pattern iterator +// and fills in the lo/hi stack-rel offsets if the pattern matches. +// Tolerates intervening TAX/TXA pairs (which regalloc inserts as +// spurious A-save brackets around STAfi's conservative Defs=[A]). +// They're collected into `KillMe` so the rewrite can erase them too. +static bool matchI32AddOnePattern(MachineBasicBlock::iterator It, + MachineBasicBlock::iterator End, + int64_t &OffLo, int64_t &OffHi, + MachineBasicBlock::iterator &PatEnd, + SmallVectorImpl &KillMe) { + auto skipDebug = [&]() { + while (It != End && It->isDebugInstr()) ++It; + }; + auto skipTaxTxa = [&]() { + while (It != End && (It->isDebugInstr() || + It->getOpcode() == W65816::TAX || + It->getOpcode() == W65816::TXA)) { + if (It->getOpcode() == W65816::TAX || It->getOpcode() == W65816::TXA) { + KillMe.push_back(&*It); + } + ++It; + } + }; + skipDebug(); + if (It == End) return false; + + // 1. LDA_StackRel imm_lo + if (It->getOpcode() != W65816::LDA_StackRel) return false; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false; + OffLo = It->getOperand(0).getImm(); + ++It; + skipTaxTxa(); + if (It == End) return false; + + // 2. ADCi16imm with imm == 1 + if (It->getOpcode() != W65816::ADCi16imm) return false; + if (It->getNumOperands() < 3 || !It->getOperand(2).isImm()) return false; + if (It->getOperand(2).getImm() != 1) return false; + ++It; + skipDebug(); + if (It == End) return false; + + // 3. STA_StackRel to same offset + if (It->getOpcode() != W65816::STA_StackRel) return false; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false; + if (It->getOperand(0).getImm() != OffLo) return false; + ++It; + skipTaxTxa(); + if (It == End) return false; + + // 4. LDA_StackRel imm_hi (different offset) + if (It->getOpcode() != W65816::LDA_StackRel) return false; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false; + OffHi = It->getOperand(0).getImm(); + if (OffHi == OffLo) return false; + ++It; + skipDebug(); + if (It == End) return false; + + // 5. ADCEi16imm with imm == 0 + if (It->getOpcode() != W65816::ADCEi16imm) return false; + if (It->getNumOperands() < 3 || !It->getOperand(2).isImm()) return false; + if (It->getOperand(2).getImm() != 0) return false; + ++It; + skipDebug(); + if (It == End) return false; + + // 6. STA_StackRel to hi offset + if (It->getOpcode() != W65816::STA_StackRel) return false; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) return false; + if (It->getOperand(0).getImm() != OffHi) return false; + ++It; + PatEnd = It; + return true; +} + +bool W65816I32IncFold::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) return false; + const auto &STI = MF.getSubtarget(); + const auto *TII = STI.getInstrInfo(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + auto It = MBB.begin(); + while (It != MBB.end()) { + int64_t OffLo = 0, OffHi = 0; + MachineBasicBlock::iterator PatEnd; + SmallVector KillMe; + auto Start = It; + if (!matchI32AddOnePattern(It, MBB.end(), OffLo, OffHi, PatEnd, KillMe)) { + ++It; + continue; + } + // Erase any spurious TAX/TXA pseudo-saves we tolerated inside + // the pattern. These are dead because STAfi's Defs=[A] was + // a conservative over-approximation; the A-source path preserves + // A in the actual asm. + for (MachineInstr *MI : KillMe) MI->eraseFromParent(); + // Found the 6-instruction pattern, [Start, PatEnd). Rewrite + // in-place: keep the LDA_StackRel for lo, replace ADCi16imm + // with INA_PSEUDO, keep STA_StackRel for lo, then replace the + // entire LDA-ADCE-STA hi-half triple with INC_HI_IF_CARRY_StackRel. + DebugLoc DL = Start->getDebugLoc(); + + // Walk to the ADCi16imm (Start+1) and replace. Build a fresh + // INA_PSEUDO with the same tied-def shape: dst=A, src=A. + auto AdcIt = std::next(Start); + while (AdcIt != PatEnd && AdcIt->isDebugInstr()) ++AdcIt; + // INA_PSEUDO has constraint $src = $dst; emit with both as A. + // Operand layout: (outs Acc16:$dst), (ins Acc16:$src) + BuildMI(MBB, AdcIt, DL, TII->get(W65816::INA_PSEUDO), W65816::A) + .addReg(W65816::A); + auto Erased = AdcIt; + ++AdcIt; + Erased->eraseFromParent(); + + // Now find the start of the hi-half triple: it's at Start+3 (after + // skipping debug). Walk past STA_StackRel (lo) which is now at + // AdcIt's position. + while (AdcIt != PatEnd && AdcIt->isDebugInstr()) ++AdcIt; + // AdcIt should now point at STA_StackRel (lo). Skip it. + ++AdcIt; + while (AdcIt != PatEnd && AdcIt->isDebugInstr()) ++AdcIt; + // AdcIt now points at LDA_StackRel (hi) — start of the hi triple. + MachineBasicBlock::iterator HiStart = AdcIt; + + // Insert INC_HI_IF_CARRY_StackRel before the hi triple, then + // erase all three hi instructions. + BuildMI(MBB, HiStart, DL, TII->get(W65816::INC_HI_IF_CARRY_StackRel)) + .addImm(OffHi); + + // Erase the 3 hi instructions: LDA_StackRel, ADCEi16imm, STA_StackRel. + auto KillIt = HiStart; + for (int i = 0; i < 3 && KillIt != PatEnd; ) { + if (KillIt->isDebugInstr()) { ++KillIt; continue; } + auto Next = std::next(KillIt); + KillIt->eraseFromParent(); + KillIt = Next; + ++i; + } + + Changed = true; + It = PatEnd; + } + } + return Changed; +} + +FunctionPass *llvm::createW65816I32IncFold() { + return new W65816I32IncFold(); +} diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp index 866228c..a261bb1 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Support/KnownBits.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -93,6 +94,18 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, // via a bit-7 test and SELECT_CC (see LowerSignExtend). setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom); + // BSWAP: no native byte-swap instruction (XBA swaps the two halves + // of the 16-bit accumulator only when in 8-bit M mode, hard to + // exploit cleanly). Lower to shifts + ORs via the generic Expand + // path — SDAG turns `bswap(i32)` into four byte extracts ORed back + // together, which our existing patterns handle. Required for + // portable C that constructs a big-endian word from byte loads: + // `((u32)b[0] << 24) | ((u32)b[1] << 16) | ((u32)b[2] << 8) | b[3]` + // (SHA-256 message-schedule, JPEG/PNG headers, etc.). + setOperationAction(ISD::BSWAP, MVT::i16, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare // LDA for the anyext case). No native sextload; mark it Expand so // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`, @@ -246,7 +259,11 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and // simpler than implementing a 32-bit shift in 65816 assembly inline. for (MVT VT : {MVT::i32}) { - setOperationAction(ISD::MUL, VT, LibCall); + // MUL i32 is Custom-lowered: the typical fall-through libcall is + // __mulsi3 (32x32 -> 32), but when both operands are ZEXT from i16 + // we can emit __umulhisi3 (16x16 -> 32) instead. Saves ~60 cyc per + // call on the `(unsigned long)i * i` pattern — see LowerMUL_I32. + setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SDIV, VT, LibCall); setOperationAction(ISD::UDIV, VT, LibCall); setOperationAction(ISD::SREM, VT, LibCall); @@ -319,6 +336,8 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, // local-variable access across the alloca will miscompile. A real // FP (DP slot or X-as-FP) would lift this restriction. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom); + if (ptr32Active) + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); // Opt into PerformDAGCombine on LOAD nodes — needed for the // address-select reverse combine (see W65816TargetLowering:: @@ -1216,6 +1235,7 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op, case ISD::AND: case ISD::OR: case ISD::XOR: return LowerI32Bin(Op, DAG); + case ISD::MUL: return LowerMUL_I32(Op, DAG); case ISD::LOAD: return LowerLoad(Op, DAG); case ISD::STORE: return LowerStore(Op, DAG); case ISD::Constant: return LowerI32Constant(Op, DAG); @@ -1305,6 +1325,24 @@ SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op, SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); + EVT ResultVT = Op.getValueType(); + // Under ptr32, both the result pointer and the size are Wide32 i32 + // values. Extract the i16 lo half of size (a VLA larger than 64KB + // doesn't fit in our stack anyway), do the i16 ALLOCA, then build + // the Wide32 result with bank=0 (stack is always bank 0). + if (ResultVT == MVT::i32) { + SDValue Size16 = (Size.getValueType() == MVT::i32) + ? extractWide32Lo(DAG, DL, Size) + : Size; + SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL, + DAG.getVTList(MVT::i16, MVT::Other), + Chain, Size16); + SDValue Ptr16 = ChainAndPtr.getValue(0); + SDValue NewChain = ChainAndPtr.getValue(1); + SDValue Bank = DAG.getConstant(0, DL, MVT::i16); + SDValue Ptr32 = buildWide32(DAG, DL, Ptr16, Bank); + return DAG.getMergeValues({Ptr32, NewChain}, DL); + } SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL, DAG.getVTList(MVT::i16, MVT::Other), Chain, Size); @@ -1433,10 +1471,28 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("not a shift"); } - // makeLibCall wants the args as TargetLowering::ArgListEntry; the - // simpler getNode form is to manually build the call. But the - // makeLibCall helper handles the calling convention. - SmallVector Args = {Op.getOperand(0), Op.getOperand(1)}; + SDValue Val = Op.getOperand(0); + if (IsI32 && Op.getOpcode() == ISD::SHL) { + // Force the high half of the input to be concretely zero when the + // shift count K is >= 16, so bits K..31 of the input are + // mathematically irrelevant. SDAG legalisation can mark those bits + // as `undef` to give the regalloc freedom, but our libcall (a true + // 32-bit shift-and-rotate loop in libgcc.s) reads ALL 32 input + // bits and propagates garbage into the result's low half. Caught + // by dadd via the dpack-inline `(u64 e) << 52` path which split + // into __ashlsi3(e_lo, 20) with X = undef → wrong mantissa. + // For SRL/SRA we'd zero/sign-extend the LOW half similarly when + // K >= 16, but those paths aren't exercising the bug yet. + if (auto *C = dyn_cast(Op.getOperand(1))) { + unsigned K = (unsigned)C->getZExtValue(); + if (K >= 16) { + SDValue Lo = extractWide32Lo(DAG, SDLoc(Op), Val); + SDValue Zero = DAG.getConstant(0, SDLoc(Op), MVT::i16); + Val = buildWide32(DAG, SDLoc(Op), Lo, Zero); + } + } + } + SmallVector Args = {Val, Op.getOperand(1)}; TargetLowering::MakeLibCallOptions Opts; Opts.setIsSigned(Op.getOpcode() == ISD::SRA); return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first; @@ -2144,9 +2200,75 @@ W65816TargetLowering::PerformDAGCombine(SDNode *N, } } } + return SDValue(); } +// Custom-lowering for ISD::MUL i32. When both operands are ZEXT from +// i16 (or provably have high 16 bits = 0), emit a libcall to +// __umulhisi3 (16x16 -> 32) instead of the heavier __mulsi3 (32x32 -> +// 32). Saves the 32-bit arg marshaling AND the 32-bit accumulator +// math inside the libcall — roughly equivalent to Calypsi 5.16's +// `_Mul16`. Falls through to the standard __mulsi3 libcall otherwise. +SDValue W65816TargetLowering::LowerMUL_I32(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + assert(VT == MVT::i32 && "LowerMUL_I32 expects i32"); + SDValue Lhs = Op.getOperand(0); + SDValue Rhs = Op.getOperand(1); + + auto narrowToI16 = [&](SDValue V) -> SDValue { + // Explicit zext-from-i16 (the IR-level form, before SDAG flattening). + if (V.getOpcode() == ISD::ZERO_EXTEND && + V.getOperand(0).getValueType() == MVT::i16) + return V.getOperand(0); + // ANY_EXTEND-from-i16 is also fine since multiplication of the low + // 16 bits gives the same 32-bit result whatever the high bits were. + if (V.getOpcode() == ISD::ANY_EXTEND && + V.getOperand(0).getValueType() == MVT::i16) + return V.getOperand(0); + // High 16 bits provably zero? + KnownBits K = DAG.computeKnownBits(V); + if (K.countMinLeadingZeros() >= 16) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, V); + return SDValue(); + }; + + SDValue A = narrowToI16(Lhs); + SDValue B = narrowToI16(Rhs); + if (A && B) { + TargetLowering::ArgListTy Args; + Args.push_back({A, Type::getInt16Ty(*DAG.getContext())}); + Args.push_back({B, Type::getInt16Ty(*DAG.getContext())}); + SDValue Callee = DAG.getExternalSymbol( + "__umulhisi3", getPointerTy(DAG.getDataLayout())); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, + Type::getInt32Ty(*DAG.getContext()), + Callee, std::move(Args)); + auto [Ret, Chain] = LowerCallTo(CLI); + return Ret; + } + + // Fall back to the standard __mulsi3 libcall. + TargetLowering::ArgListTy Args; + Args.push_back({Lhs, Type::getInt32Ty(*DAG.getContext())}); + Args.push_back({Rhs, Type::getInt32Ty(*DAG.getContext())}); + SDValue Callee = DAG.getExternalSymbol( + "__mulsi3", getPointerTy(DAG.getDataLayout())); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, + Type::getInt32Ty(*DAG.getContext()), + Callee, std::move(Args)); + auto [Ret, Chain] = LowerCallTo(CLI); + return Ret; +} + // Map a W65816CC code to the matching Bxx opcode. static unsigned getBranchOpcodeForCC(unsigned CC) { switch (CC) { diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.h b/src/llvm/lib/Target/W65816/W65816ISelLowering.h index 1d640af..c8783a3 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.h +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.h @@ -218,6 +218,11 @@ private: SDValue LowerI32Bin(SDValue Op, SelectionDAG &DAG) const; // i32 ConstantNode: split into two i16 constants and REG_SEQUENCE. SDValue LowerI32Constant(SDValue Op, SelectionDAG &DAG) const; + // i32 MUL: detect (zext i16 a) * (zext i16 b) — or operands with + // provably-zero high 16 bits — and emit __umulhisi3 (16x16 -> 32) + // instead of __mulsi3 (32x32 -> 32). Cuts ~30% off the canonical + // sumSquares-style loop. + SDValue LowerMUL_I32(SDValue Op, SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp new file mode 100644 index 0000000..1eeba0d --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp @@ -0,0 +1,278 @@ +//===-- W65816ImgCalleeSave.cpp - Callee-side save/restore of IMG8..IMG15 -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Post-RA, pre-PEI pass that adds prologue save + epilogue restore for +// IMG8..IMG15 ($C0..$CE) in any function that uses them. This makes +// IMG8..IMG15 behave as callee-saved AT THE ASM LEVEL without going +// through LLVM's CSR mechanism (which would shift regalloc decisions +// and break other tests — see history with +// `feedback_picol_expr_compound_or.md`). +// +// Why callee-side, not caller-side? +// +// Callers can hold long-lived vregs in IMG8..IMG15 (regalloc treats +// them as preserved across calls because they're not in JSLpseudo's +// Defs). The "obvious" fix — add them to Defs and force regalloc to +// spill them across each call — interacts badly with stack-slot +// coloring: the spill slot gets coalesced with another vreg whose +// liveness appears disjoint, but the post-call reload makes the +// lifetimes overlap and the reload reads garbage (caught by qsort, +// strncat, etc. when IMG0..IMG7 were also in Defs). +// +// By doing the save/restore on the CALLEE side instead, the caller +// doesn't need to spill at all — its values in IMG8..IMG15 are +// automatically preserved. Only functions that USE IMG8..IMG15 pay +// the cost (a few bytes of prologue/epilogue), and the cost is +// amortized across the whole function (not per call). +// +// Why post-RA, not via LLVM's CSR mechanism? +// +// Adding IMG8..IMG15 to `getCalleeSavedRegs()` makes LLVM treat them +// as "expensive" in cost-of-use analysis. Regalloc steers away from +// them in functions that don't really need them, but that steering +// changes coloring decisions in ways that broke strtol +// (`strtol(" 0x1ABC ", &ep, 16)` returned 0). Implementing +// save/restore outside the CSR system keeps regalloc's decisions +// unchanged: it sees IMG8..IMG15 as ordinary regs, uses them freely +// under pressure, and this pass adds the asm-level bookkeeping. +// +// Why pre-PEI? +// +// PEI is what assigns frame-index offsets and emits the actual +// prologue/epilogue. To add new spill slots, we need PEI to see +// them so they get included in the frame size. We use +// `MFI.CreateStackObject` to register the slots, then emit STAfi / +// LDAfi pseudos that PEI will lower to `STA d,s` / `LDA d,s`. +// +// We also insert the save/restore as REGULAR MIR instructions BEFORE +// PEI runs. That means PEI sees them when it emits its frame setup, +// and the STAfi/LDAfi disps are valid post-PEI. +// +//===----------------------------------------------------------------------===// + +#include "W65816.h" +#include "W65816InstrInfo.h" +#include "W65816Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-img-callee-save" + +namespace { + +class W65816ImgCalleeSave : public MachineFunctionPass { +public: + static char ID; + W65816ImgCalleeSave() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return "W65816 callee-side save/restore for IMG8..IMG15"; + } +}; + +} // namespace + +char W65816ImgCalleeSave::ID = 0; + +INITIALIZE_PASS(W65816ImgCalleeSave, DEBUG_TYPE, + "W65816 IMG8..IMG15 callee save/restore", false, false) + +namespace llvm { +void initializeW65816ImgCalleeSavePass(PassRegistry &); +} + +FunctionPass *llvm::createW65816ImgCalleeSave() { + return new W65816ImgCalleeSave(); +} + +// IMG8..IMG15 physregs (in order so IMG_REGS[i] is the i'th high-half slot). +// Their DP addresses are $C0, $C2, ..., $CE (each slot is 16 bits = 2 bytes). +static constexpr unsigned IMG_REGS[8] = { + W65816::IMG8, W65816::IMG9, W65816::IMG10, W65816::IMG11, + W65816::IMG12, W65816::IMG13, W65816::IMG14, W65816::IMG15}; +static constexpr unsigned IMG_DP[8] = {0xC0, 0xC2, 0xC4, 0xC6, + 0xC8, 0xCA, 0xCC, 0xCE}; + +static int classifyImgReg(unsigned Reg) { + for (int i = 0; i < 8; ++i) + if (Reg == IMG_REGS[i]) + return i; + return -1; +} + +// Map a DP-addressed instruction's first immediate operand to an IMG +// slot index if it falls in $C0..$CE. Returns -1 otherwise. +static int classifyDpImmAsImg(const MachineInstr &MI) { + // Most DP-addressed opcodes take the dp address as immediate op 0. + // (Some, like ADC_DP-form-with-explicit-A, may put the imm at op 1.) + // For our scan, check the first IMM operand we find. + unsigned Opc = MI.getOpcode(); + switch (Opc) { + case W65816::LDA_DP: + case W65816::STA_DP: + case W65816::STZ_DP: + case W65816::LDX_DP: + case W65816::STX_DP: + case W65816::LDY_DP: + case W65816::STY_DP: + case W65816::ADC_DP: + case W65816::SBC_DP: + case W65816::AND_DP: + case W65816::ORA_DP: + case W65816::EOR_DP: + case W65816::CMP_DP: + case W65816::CPX_DP: + case W65816::CPY_DP: + case W65816::BIT_DP: + case W65816::INC_DP: + case W65816::DEC_DP: + case W65816::ASL_DP: + case W65816::LSR_DP: + case W65816::ROL_DP: + case W65816::ROR_DP: + break; + default: + return -1; + } + for (const auto &MO : MI.operands()) { + if (!MO.isImm()) continue; + int64_t V = MO.getImm(); + for (int i = 0; i < 8; ++i) + if ((int64_t)IMG_DP[i] == V) + return i; + return -1; // First imm is the dp addr; not in IMG range. + } + return -1; +} + +bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) { + // Step 1: scan for IMG8..IMG15 usage. copyPhysReg already lowered + // some COPY $imgN = $a forms to STA_DP imm:0xC0 (etc.), so we have + // to check both the physreg form AND the DP-immediate form. + bool UsedSlot[8] = {false}; + bool AnyUsed = false; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + // physreg form: $imgN = ... or ... = $imgN + for (const auto &MO : MI.operands()) { + if (!MO.isReg() || MO.getReg() == 0) continue; + int idx = classifyImgReg(MO.getReg()); + if (idx >= 0) { + UsedSlot[idx] = true; + AnyUsed = true; + } + } + // DP-imm form: lda dp imm:0xC0 etc. + int idx = classifyDpImmAsImg(MI); + if (idx >= 0) { + UsedSlot[idx] = true; + AnyUsed = true; + } + } + } + if (!AnyUsed) return false; + + // Step 2: allocate one frame slot per used IMG. Size = 2 bytes (each + // Img16 holds a 16-bit value). Mark as a spill slot so PEI accounts + // for it; isSpillSlot=true means slot coloring CAN coalesce it with + // other spill slots — but the STAfi/LDAfi we emit reference this slot + // by FrameIndex, and the only writes to this FI are our save/restore + // pair, so coloring can't break the round-trip. + // + // (The picol-expr bug came from a SHARED slot with two DIFFERENT + // vregs writing to it; here we have one FI per IMG and a single + // write/read pair per function, so coloring can't trip on this.) + MachineFrameInfo &MFI = MF.getFrameInfo(); + int FrameSlots[8]; + for (int i = 0; i < 8; ++i) { + FrameSlots[i] = -1; + if (UsedSlot[i]) + FrameSlots[i] = MFI.CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + } + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + DebugLoc DL; + + // Step 3: emit prologue save. Insert at entry MBB.begin() so PEI's + // emitPrologue (which inserts BEFORE existing MBB.begin) places its + // frame setup BEFORE our saves — the right order, since our saves + // reference frame slots whose disps require post-TCS S. + // + // Single PHA/PLA bracket around ALL slot saves (vs per-slot bracket). + // For N used slots: + // per-slot: N * (PHA + LDA dp + STA d,s + PLA) = 16N cyc, 6N bytes + // single: PHA + N*(LDA dp + STA d,s) + PLA = 8+8N cyc, 2+4N bytes + // Saves 8 cyc + 2 bytes per additional slot beyond the first. + // + // The +2 ImmOffset on STAfi compensates for PHA's SP shift; same +2 + // applies to every slot inside the bracket since SP is constant + // throughout. + MachineBasicBlock &EntryMBB = MF.front(); + MachineBasicBlock::iterator EntryIt = EntryMBB.begin(); + BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::PHA)); + for (int i = 0; i < 8; ++i) { + if (!UsedSlot[i]) continue; + BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::LDA_DP)) + .addImm(IMG_DP[i]) + .addReg(W65816::A, RegState::ImplicitDefine); + BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::STAfi)) + .addReg(W65816::A) + .addFrameIndex(FrameSlots[i]) + .addImm(2) + .addReg(W65816::A, RegState::ImplicitDefine); + } + BuildMI(EntryMBB, EntryIt, DL, TII->get(W65816::PLA)) + .addReg(W65816::A, RegState::ImplicitDefine); + + // Step 4: emit epilogue restore at each return MBB, just BEFORE the + // RTL/RTS/RTI (so the IMG restore happens before the frame teardown + // — wait, no: we want the IMG restore BEFORE we tear down the frame + // because our STAfi/LDAfi reference frame slots). Insert just + // before the LAST terminator (the return). PEI's emitEpilogue will + // insert its frame-teardown AFTER our restores (BEFORE the return), + // which means our `,s` disps see the post-TCS S still. + // + // Pattern per slot (preserving A, which may hold the return value): + // PHA (preserve A; SP shifts) + // LDAfi A, (A = saved IMGn value via `lda ,s`) + // STA (write back to IMGn) + // PLA (restore A) + for (auto &MBB : MF) { + if (MBB.empty()) continue; + auto LastIt = std::prev(MBB.end()); + while (LastIt != MBB.begin() && LastIt->isDebugInstr()) + --LastIt; + unsigned LastOpc = LastIt->getOpcode(); + if (LastOpc != W65816::RTL && LastOpc != W65816::RTS && + LastOpc != W65816::RTI) + continue; + + // Single PHA/PLA bracket for all restores (same optimization as save). + BuildMI(MBB, LastIt, DL, TII->get(W65816::PHA)); + for (int i = 7; i >= 0; --i) { + if (!UsedSlot[i]) continue; + BuildMI(MBB, LastIt, DL, TII->get(W65816::LDAfi)) + .addReg(W65816::A, RegState::Define) + .addFrameIndex(FrameSlots[i]) + .addImm(2); + BuildMI(MBB, LastIt, DL, TII->get(W65816::STA_DP)) + .addImm(IMG_DP[i]) + .addReg(W65816::A, RegState::Implicit); + } + BuildMI(MBB, LastIt, DL, TII->get(W65816::PLA)) + .addReg(W65816::A, RegState::ImplicitDefine); + } + + return true; +} diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index 7d5c83a..9c475e3 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -454,22 +454,127 @@ int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const { return TargetInstrInfo::getSPAdjust(MI); } +// Conditional branch opcode predicate. +static bool isCondBranch(unsigned Opc) { + switch (Opc) { + case W65816::BEQ: + case W65816::BNE: + case W65816::BCS: + case W65816::BCC: + case W65816::BMI: + case W65816::BPL: + case W65816::BVS: + case W65816::BVC: + return true; + default: + return false; + } +} + +// Unconditional direct-target branch predicate. Excludes JMP_AbsInd +// (indirect) and JML_Long (different operand kind). +static bool isUncondDirectBranch(unsigned Opc) { + return Opc == W65816::BRA || Opc == W65816::BRL || + Opc == W65816::JMP_Abs; +} + +// Map a conditional Bxx to its inverse condition (BEQ↔BNE, etc.). +// Returns 0 if not a recognised conditional. +static unsigned invertCondOpcode(unsigned Opc) { + switch (Opc) { + case W65816::BEQ: return W65816::BNE; + case W65816::BNE: return W65816::BEQ; + case W65816::BCS: return W65816::BCC; + case W65816::BCC: return W65816::BCS; + case W65816::BMI: return W65816::BPL; + case W65816::BPL: return W65816::BMI; + case W65816::BVS: return W65816::BVC; + case W65816::BVC: return W65816::BVS; + default: return 0; + } +} + +MachineBasicBlock * +W65816InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { + // All our direct branches encode the target MBB in operand 0. + if (MI.getNumOperands() < 1 || !MI.getOperand(0).isMBB()) + return nullptr; + return MI.getOperand(0).getMBB(); +} + bool W65816InstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { - // Return "unanalyzable" — we don't decode our BR_CC pseudos here. - // BranchFolder treats a true return as "leave this block alone", - // which avoids the default insertBranch llvm_unreachable. - return true; + TBB = nullptr; + FBB = nullptr; + Cond.clear(); + + // We deliberately keep conditional branches (BEQ/BNE/etc.) opaque to + // BranchFolder. Their condition is encoded in the OPCODE and the + // flag input is an implicit use of P set by a preceding CMP/etc.; + // BranchFolder doesn't track that the CMP must stay adjacent, so + // if it re-inserts the Bxx in a tail-merged block the flag input + // becomes whatever earlier instruction last clobbered P. Caught by + // the softDouble dadd smoke (1.5 + 2.5 != 4.0) once we tried to make + // conditional branches analyzable. + // + // What we DO analyze: + // * Empty terminator sequence (pure fall-through) — return + // analyzable with no targets so MachineBlockPlacement's assert + // about fall-through blocks is satisfied trivially. + // * Single unconditional direct branch (BRA / BRL / JMP_Abs) — + // return analyzable with TBB set, no Cond. Safe to move because + // no flag dependency. + // Everything else (Bxx in any position, indirect jumps, multiple + // terminators, etc.) stays unanalyzable. + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + while (I != MBB.end() && I->isDebugInstr()) + ++I; + if (I == MBB.end()) + return false; // No terminators: pure fall-through. + + unsigned FirstOpc = I->getOpcode(); + if (!isUncondDirectBranch(FirstOpc)) + return true; // Conditional or unknown. Stay opaque. + + // Single unconditional direct branch — analyzable. + TBB = getBranchDestBlock(*I); + if (!TBB) + return true; + auto Next = std::next(I); + while (Next != MBB.end() && Next->isDebugInstr()) + ++Next; + if (Next != MBB.end()) + return true; // Extra terminators after unconditional. + return false; } unsigned W65816InstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { if (BytesRemoved) *BytesRemoved = 0; - return 0; + unsigned NumRemoved = 0; + // Walk from the end, removing trailing direct branches. Stop when + // we hit a non-branch or a branch we can't analyze (e.g. JMP_AbsInd). + while (!MBB.empty()) { + auto It = std::prev(MBB.end()); + if (It->isDebugInstr()) { + // Skip debug instructions but don't delete them. + if (It == MBB.begin()) + break; + --It; + } + unsigned Opc = It->getOpcode(); + if (!isCondBranch(Opc) && !isUncondDirectBranch(Opc)) + break; + if (BytesRemoved) + *BytesRemoved += getInstSizeInBytes(*It); + It->eraseFromParent(); + ++NumRemoved; + } + return NumRemoved; } unsigned W65816InstrInfo::insertBranch(MachineBasicBlock &MBB, @@ -478,11 +583,49 @@ unsigned W65816InstrInfo::insertBranch(MachineBasicBlock &MBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded) const { - // Should not be called: analyzeBranch returns true so BranchFolder - // treats blocks as unanalyzable and never asks us to insert. + assert(TBB && "insertBranch requires a true target"); + assert((Cond.empty() || Cond.size() == 1) && + "W65816 branch conditions are single-operand (opcode)"); + if (BytesAdded) *BytesAdded = 0; - return 0; + unsigned NumAdded = 0; + + if (Cond.empty()) { + // Unconditional branch. Use BRA — W65816AsmBackend auto-relaxes + // to BRL when the displacement exceeds an 8-bit signed offset. + auto MI = BuildMI(&MBB, DL, get(W65816::BRA)).addMBB(TBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(*MI); + return 1; + } + + // Conditional branch using the opcode stored in Cond[0]. + unsigned CondOpc = Cond[0].getImm(); + auto MIc = BuildMI(&MBB, DL, get(CondOpc)).addMBB(TBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(*MIc); + ++NumAdded; + + // If there's also a false target, emit an unconditional branch to it. + if (FBB) { + auto MIu = BuildMI(&MBB, DL, get(W65816::BRA)).addMBB(FBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(*MIu); + ++NumAdded; + } + return NumAdded; +} + +bool W65816InstrInfo::reverseBranchCondition( + SmallVectorImpl &Cond) const { + if (Cond.size() != 1) + return true; + unsigned Inverted = invertCondOpcode(Cond[0].getImm()); + if (!Inverted) + return true; + Cond[0].setImm(Inverted); + return false; } unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.h b/src/llvm/lib/Target/W65816/W65816InstrInfo.h index 4074c2f..8341bd7 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.h +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.h @@ -95,13 +95,13 @@ public: // int b, int c) { return a*b + c; }` under fast regalloc). int getSPAdjust(const MachineInstr &MI) const override; - // Branch-control hooks — minimal stubs that opt our blocks out of - // BranchFolder's tail-merging pass. Return "unanalyzable" from - // analyzeBranch so BranchFolder leaves the block alone; the empty - // remove/insertBranch stubs are required by the contract but never - // actually invoked in the unanalyzable path. Pre-ptr32 the smoke - // never hit BranchFolder via this entry; under ptr32 it does - // (multi-pattern test at smoke #7). + // Branch-control hooks. These now decode our real branch opcodes + // (BEQ/BNE/BCS/BCC/BMI/BPL/BVS/BVC and BRA/BRL/JMP_Abs) so + // BranchFolder and MachineBlockPlacement can rearrange blocks. + // Cond is encoded as a single Imm operand holding the conditional + // branch's opcode; reverseBranchCondition flips it via opcode map. + // JMP_AbsInd / JML_Long return "unanalyzable" — they're indirect or + // bank-crossing, which the layout passes can't reason about. bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, @@ -112,6 +112,10 @@ public: MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; + bool reverseBranchCondition( + SmallVectorImpl &Cond) const override; + MachineBasicBlock *getBranchDestBlock( + const MachineInstr &MI) const override; }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 2d46efe..f678d8a 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -797,7 +797,13 @@ def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr), // with-IMG-source that clobbered $a, silently storing X's value where // A's was expected — observed as `dadd(1.5,2.5) → 0x4010_0000_3000_3000` // under full IMG-clobber. -let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Defs = [A] in { +// +// Note: Defs = [A] triggers a greedy-regalloc assertion failure +// (LiveRangeEdit::eliminateDeadDef on a KILL pseudo with non-dead +// implicit-def $a) on functions with many cross-call Acc16 vregs +// (atoi, etc.). Greedy is currently disabled — basic regalloc avoids +// the bad path. +let mayStore = 1, hasSideEffects = 1, mayLoad = 0, Defs = [A] in { def STAfi : W65816Pseudo<(outs), (ins Wide16:$src, memfi:$addr), "# STAfi $src, $addr", []>; @@ -1604,6 +1610,23 @@ def EOR_StackRel : InstStackRel<0x43, "eor">; def LDA_StackRelIndY : InstStackRelIndY<0xB3, "lda">; def STA_StackRelIndY : InstStackRelIndY<0x93, "sta">; +// Pseudo: conditional-increment of the hi half of an i32 spilled to a +// pair of stack-rel slots. Emitted by W65816I32IncFold when the +// preceding LDA-INA-STA on the lo half established Z based on the +// post-INA value (Z=1 means the lo wrapped to 0, i.e. a carry into hi). +// AsmPrinter expands to: +// bne L_skip +// lda $imm, s +// inc a +// sta $imm, s +// L_skip: +let mayLoad = 1, mayStore = 1, hasSideEffects = 0, + Defs = [A] in { +def INC_HI_IF_CARRY_StackRel : W65816Pseudo<(outs), (ins i16imm:$off), + "# INC_HI_IF_CARRY_StackRel $off", + []>; +} + //===----------------------------------------------------------------------===// // Branch patterns (placed after the Bxx defs). // diff --git a/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp b/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp new file mode 100644 index 0000000..0394d6d --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp @@ -0,0 +1,150 @@ +//===-- W65816NarrowI32Mul.cpp - Narrow i32 multiplies -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// IR-level peephole. Detects `mul i32 X, Y` where both X and Y have +// their top 16 bits provably zero (via LLVM's IR-level computeKnownBits) +// and rewrites to a call to `__umulhisi3` — a 16x16 -> 32 unsigned +// multiply (~30% faster than __mulsi3 for `(u32)i * i` patterns). +// +// Why an IR pass instead of a Custom SDAG lowering: LLVM's IndVarSimplify +// loop pass widens narrow induction variables (e.g. an i16 loop counter +// later zext'd to i32) into i32 PHIs. By SDAG-build time the zext is +// gone — the MUL's operand is just `CopyFromReg %2:i32`, an opaque value. +// SDAG's computeKnownBits can't trace back across BB boundaries through +// CopyFromReg. IR-level computeKnownBits, by contrast, walks the use-def +// graph (including PHIs) and can prove the high bits zero. +// +// Runs in addISelPrepare (right before SDAG-ISel) so it sees the +// final-shape IR. The libcall declaration is auto-added if missing. +// +//===---------------------------------------------------------------------===// + +#include "W65816.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/InitializePasses.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/KnownBits.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-narrow-i32-mul" + + +namespace { + + +class W65816NarrowI32Mul : public FunctionPass { +public: + static char ID; + W65816NarrowI32Mul() : FunctionPass(ID) {} + + StringRef getPassName() const override { + return "W65816 narrow i32 multiplies to __umulhisi3"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + + bool runOnFunction(Function &F) override; +}; + + +} // namespace + + +char W65816NarrowI32Mul::ID = 0; + +INITIALIZE_PASS_BEGIN(W65816NarrowI32Mul, DEBUG_TYPE, + "W65816 narrow i32 multiplies", false, false) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(W65816NarrowI32Mul, DEBUG_TYPE, + "W65816 narrow i32 multiplies", false, false) + + +// Get-or-declare `__umulhisi3(i16, i16) -> i32` in the module. +static FunctionCallee getUmulhisi3(Module &M) { + LLVMContext &Ctx = M.getContext(); + Type *I16 = Type::getInt16Ty(Ctx); + Type *I32 = Type::getInt32Ty(Ctx); + FunctionType *FT = FunctionType::get(I32, {I16, I16}, false); + return M.getOrInsertFunction("__umulhisi3", FT); +} + + +// True iff the top 16 bits of V are known zero. Tries IR-level +// computeKnownBits first; if that doesn't prove enough, falls back +// to ScalarEvolution's unsigned-range analysis (which handles +// loop-bounded induction variables that KnownBits can't). +static bool top16Zero(Value *V, const DataLayout &DL, ScalarEvolution &SE) { + KnownBits K = computeKnownBits(V, DL); + if (K.countMinLeadingZeros() >= 16) { + return true; + } + if (!SE.isSCEVable(V->getType())) { + return false; + } + const SCEV *S = SE.getSCEV(V); + ConstantRange R = SE.getUnsignedRange(S); + return R.getActiveBits() <= 16; +} + + +bool W65816NarrowI32Mul::runOnFunction(Function &F) { + Module *M = F.getParent(); + const DataLayout &DL = M->getDataLayout(); + Type *I16 = Type::getInt16Ty(F.getContext()); + ScalarEvolution &SE = getAnalysis().getSE(); + + SmallVector Worklist; + for (Instruction &I : instructions(F)) { + auto *BO = dyn_cast(&I); + if (!BO || BO->getOpcode() != Instruction::Mul) { + continue; + } + if (!BO->getType()->isIntegerTy(32)) { + continue; + } + if (!top16Zero(BO->getOperand(0), DL, SE)) { + continue; + } + if (!top16Zero(BO->getOperand(1), DL, SE)) { + continue; + } + Worklist.push_back(BO); + } + + if (Worklist.empty()) { + return false; + } + + FunctionCallee Callee = getUmulhisi3(*M); + for (BinaryOperator *BO : Worklist) { + IRBuilder<> B(BO); + Value *A = B.CreateTrunc(BO->getOperand(0), I16); + Value *Bv = B.CreateTrunc(BO->getOperand(1), I16); + Value *Call = B.CreateCall(Callee, {A, Bv}); + BO->replaceAllUsesWith(Call); + BO->eraseFromParent(); + } + return true; +} + + +FunctionPass *llvm::createW65816NarrowI32Mul() { + return new W65816NarrowI32Mul(); +} diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp index c4f0af7..870766d 100644 --- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp @@ -69,16 +69,21 @@ static bool expandFarFI(MachineInstr &MI, int FPOff, switch (Opc) { case W65816::LDAfi: { Register Dst = MI.getOperand(0).getReg(); - BuildMI(MBB, II, DL, TII.get(W65816::PHY)) - .addReg(W65816::Y, RegState::Implicit); + // Mark Y use as Undef: if Y is dead at this insertion point, the + // value we save is "don't care" — we restore the same garbage byte + // later. Without Undef, the verifier rejects when no def reaches + // (cause of the sha256_transform crash: STY_DP $FA emitted in the + // round-loop preheader before any LDY definition was reachable). + BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA) + .addReg(W65816::Y, RegState::Implicit | RegState::Undef); BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)) .addImm(FPOff) .addReg(W65816::Y, RegState::ImplicitDefine); - BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndY)) + BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndLongY)) .addImm(0xF6) .addReg(W65816::A, RegState::ImplicitDefine) .addReg(W65816::Y, RegState::Implicit); - BuildMI(MBB, II, DL, TII.get(W65816::PLY)) + BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA) .addReg(W65816::Y, RegState::ImplicitDefine); if (Dst == W65816::X) BuildMI(MBB, II, DL, TII.get(W65816::TAX)); @@ -91,26 +96,26 @@ static bool expandFarFI(MachineInstr &MI, int FPOff, int srcDP = imgRegToDP(Src); if (srcDP >= 0) BuildMI(MBB, II, DL, TII.get(W65816::LDA_DP)).addImm(srcDP); - BuildMI(MBB, II, DL, TII.get(W65816::PHY)) - .addReg(W65816::Y, RegState::Implicit); + BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA) + .addReg(W65816::Y, RegState::Implicit | RegState::Undef); BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff); - BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndY)) + BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndLongY)) .addImm(0xF6) .addReg(W65816::A, RegState::Implicit) .addReg(W65816::Y, RegState::Implicit); - BuildMI(MBB, II, DL, TII.get(W65816::PLY)); + BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA); return true; } case W65816::STA8fi: { BuildMI(MBB, II, DL, TII.get(W65816::SEP)).addImm(0x20) .addReg(W65816::P, RegState::ImplicitDefine); - BuildMI(MBB, II, DL, TII.get(W65816::PHY)) - .addReg(W65816::Y, RegState::Implicit); + BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA) + .addReg(W65816::Y, RegState::Implicit | RegState::Undef); BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff); - BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndY)) + BuildMI(MBB, II, DL, TII.get(W65816::STA_DPIndLongY)) .addImm(0xF6) .addReg(W65816::A, RegState::Implicit); - BuildMI(MBB, II, DL, TII.get(W65816::PLY)); + BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA); BuildMI(MBB, II, DL, TII.get(W65816::REP)).addImm(0x20) .addReg(W65816::P, RegState::ImplicitDefine); return true; @@ -126,13 +131,13 @@ static bool expandFarFI(MachineInstr &MI, int FPOff, // op's flags from a downstream consumer. BuildMI(MBB, II, DL, TII.get(W65816::STA_DP)).addImm(0xE2) .addReg(W65816::A, RegState::Implicit); - BuildMI(MBB, II, DL, TII.get(W65816::PHY)) - .addReg(W65816::Y, RegState::Implicit); + BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA) + .addReg(W65816::Y, RegState::Implicit | RegState::Undef); BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff); - BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndY)).addImm(0xF6) + BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xF6) .addReg(W65816::A, RegState::ImplicitDefine) .addReg(W65816::Y, RegState::Implicit); - BuildMI(MBB, II, DL, TII.get(W65816::PLY)) + BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA) .addReg(W65816::Y, RegState::ImplicitDefine); unsigned OpDPOpc = 0; switch (Opc) { @@ -167,17 +172,17 @@ static bool expandFarFI(MachineInstr &MI, int FPOff, // SBC/CMP $E2 BuildMI(MBB, II, DL, TII.get(W65816::STA_DP)).addImm(0xE0) .addReg(W65816::A, RegState::Implicit); - BuildMI(MBB, II, DL, TII.get(W65816::PHY)) - .addReg(W65816::Y, RegState::Implicit); + BuildMI(MBB, II, DL, TII.get(W65816::STY_DP)).addImm(0xFA) + .addReg(W65816::Y, RegState::Implicit | RegState::Undef); BuildMI(MBB, II, DL, TII.get(W65816::LDY_Imm16)).addImm(FPOff); - BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndY)).addImm(0xF6) + BuildMI(MBB, II, DL, TII.get(W65816::LDA_DPIndLongY)).addImm(0xF6) .addReg(W65816::A, RegState::ImplicitDefine) .addReg(W65816::Y, RegState::Implicit); BuildMI(MBB, II, DL, TII.get(W65816::STA_DP)).addImm(0xE2) .addReg(W65816::A, RegState::Implicit); BuildMI(MBB, II, DL, TII.get(W65816::LDA_DP)).addImm(0xE0) .addReg(W65816::A, RegState::ImplicitDefine); - BuildMI(MBB, II, DL, TII.get(W65816::PLY)) + BuildMI(MBB, II, DL, TII.get(W65816::LDY_DP)).addImm(0xFA) .addReg(W65816::Y, RegState::ImplicitDefine); if (Opc == W65816::CMPfi) { BuildMI(MBB, II, DL, TII.get(W65816::CMP_DP)).addImm(0xE2) @@ -268,7 +273,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm(); int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj; if (FrameOffset < 0) Offset += 1; - if (Offset < 0 || Offset > 0xFF) { + if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) { // Far slot. Use FP if reserved. FP-relative offset excludes // SPAdj because $F6 captures S after prologue, before any // intermediate PUSH16 inside a call sequence. @@ -342,7 +347,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // in callee), so they don't need the skew. int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj; if (FrameOffset < 0) Offset += 1; - if (Offset < 0 || Offset > 0xFF) { + if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) { if (MF.getInfo()->getUsesDpFP()) { int FPOff = FrameOffset + ImmOffset + (int)MFI.getStackSize(); if (FrameOffset < 0) FPOff += 1; @@ -434,7 +439,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm(); int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj; if (FrameOffset < 0) Offset += 1; // empty-descending SP skew (see STAfi) - if (Offset < 0 || Offset > 0xFF) { + if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) { if (MF.getInfo()->getUsesDpFP()) { int FPOff = FrameOffset + ImmOffset + (int)MFI.getStackSize(); if (FrameOffset < 0) FPOff += 1; @@ -516,7 +521,7 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj; if (FrameOffset < 0) Offset += 1; - if (Offset < 0 || Offset > 0xFF) { + if (Offset < 0 || Offset > 0xFF || MFI.hasVarSizedObjects()) { if (MF.getInfo()->getUsesDpFP()) { int FPOff = FrameOffset + ImmOffset + (int)MFI.getStackSize(); if (FrameOffset < 0) FPOff += 1; diff --git a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp index 8c542c2..eac1e48 100644 --- a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp @@ -40,6 +40,7 @@ #include "W65816.h" #include "W65816InstrInfo.h" #include "W65816Subtarget.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -422,6 +423,397 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { // generic post-RA pseudo expander), so it's still in the MIR here. Changed |= foldImmAdcToInaDea(MBB, TII); + // PHI-copy hoist. + { + auto isStaLike = [](const MachineInstr &MI) { + unsigned O = MI.getOpcode(); + return O == W65816::STA_StackRel || O == W65816::STZ_DP || + O == W65816::STZ_Abs; + }; + auto isLdaSR = [](const MachineInstr &MI) { + return MI.getOpcode() == W65816::LDA_StackRel; + }; + auto isFlagPreservingMem = [&](const MachineInstr &MI) { + return isStaLike(MI) || isLdaSR(MI); + }; + auto It = MBB.begin(); + while (It != MBB.end()) { + if (It->getOpcode() != W65816::PHP) { ++It; continue; } + auto Php = It; + // Walk forward: collect LDA/STA pairs, stop at PLP. + auto Walker = std::next(Php); + SmallVector Block; + SmallSet ReadSlots; + SmallSet WriteSlots; + bool ok = true; + while (Walker != MBB.end()) { + if (Walker->isDebugInstr()) { ++Walker; continue; } + if (Walker->getOpcode() == W65816::PLP) break; + if (!isFlagPreservingMem(*Walker)) { ok = false; break; } + // Track slots so we can check the gap below. + if (Walker->getNumOperands() >= 1 && Walker->getOperand(0).isImm()) { + int64_t off = Walker->getOperand(0).getImm(); + if (isLdaSR(*Walker)) ReadSlots.insert(off); + else WriteSlots.insert(off); + } + Block.push_back(&*Walker); + ++Walker; + } + if (!ok || Walker == MBB.end()) { ++It; continue; } + auto Plp = Walker; + // Trailing flag-preservers after PLP (STA/STZ only). + auto Tail = std::next(Plp); + SmallVector Trailing; + while (Tail != MBB.end()) { + if (Tail->isDebugInstr()) { ++Tail; continue; } + if (!isStaLike(*Tail)) break; + if (Tail->getNumOperands() >= 1 && Tail->getOperand(0).isImm()) { + WriteSlots.insert(Tail->getOperand(0).getImm()); + } + Trailing.push_back(&*Tail); + ++Tail; + } + // Pair check: the wrap structure is a sequence of LDA-STA + // memory-to-memory PHI copies, where the FINAL STA may live + // outside the wrap (as Trailing) because STA doesn't clobber + // flags. Count LDAs in Block vs total STAs (Block + Trailing). + // If they're not equal, some LDA's $a-output is a register- + // live-out PHI value (consumed by a back-edge successor's + // first STA, e.g. the vararg `sta 0x5, s` pattern). Hoisting + // it earlier would lose the value. + unsigned NLda = 0, NSta = 0; + for (MachineInstr *MI : Block) { + if (isLdaSR(*MI)) ++NLda; + else if (isStaLike(*MI)) ++NSta; + } + NSta += Trailing.size(); + if (NLda != NSta) { ++It; continue; } + // Walk backward from PHP to find the hoist insertion point. + // The hoisted block clobbers $a and $p (LDA writes both). + // Skip insts that USE $a (consumer of an earlier $a producer) + // or that DEFINE $p (flag-setter — its $p output will be + // re-established by the same flag-setter). Stop at a pure A + // producer (defines $a, doesn't use $a). + // + // Also bail if any in-gap inst writes a slot we read or reads + // a slot we write (in-gap reads of our writes would observe + // a stale value after hoist; in-gap writes to our reads would + // produce a different value if hoisted before). + auto Back = Php; + if (Back == MBB.begin()) { ++It; continue; } + --Back; + bool gapOK = true; + while (true) { + while (Back != MBB.begin() && Back->isDebugInstr()) --Back; + if (Back->isDebugInstr()) { gapOK = false; break; } + // Slot conflict check. + unsigned BO = Back->getOpcode(); + if ((BO == W65816::STA_StackRel || BO == W65816::STZ_DP || + BO == W65816::STZ_Abs) && + Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { + int64_t off = Back->getOperand(0).getImm(); + if (ReadSlots.count(off)) { gapOK = false; break; } + } + if (BO == W65816::LDA_StackRel && + Back->getNumOperands() >= 1 && Back->getOperand(0).isImm()) { + int64_t off = Back->getOperand(0).getImm(); + if (WriteSlots.count(off)) { gapOK = false; break; } + } + // Bail on call / branch / asm. + if (Back->isCall() || Back->isBranch() || + Back->isReturn() || Back->isInlineAsm()) { + gapOK = false; break; + } + bool usesA = false; + bool defsA = false; + for (const MachineOperand &MO : Back->operands()) { + if (MO.isReg() && MO.getReg() == W65816::A) { + if (MO.isUse()) usesA = true; + if (MO.isDef()) defsA = true; + } + } + if (defsA && !usesA) break; // Pure A producer found. + if (Back == MBB.begin()) { gapOK = false; break; } + --Back; + } + if (!gapOK) { ++It; continue; } + // Hoist: move Block and Trailing to before Back. Undo the + // +1 stack-rel bump on Block's in-wrap memory ops; Trailing + // stays AS-IS (it was already outside the wrap and never + // bumped). + for (MachineInstr *MI : Block) { + // All ops in Block matched isFlagPreservingMem, so they're + // LDA_StackRel/STA_StackRel/STZ_DP/STZ_Abs. LDA_StackRel + // and STA_StackRel use operand 0 as the disp; that's the + // bumped one. STZ_DP/STZ_Abs aren't stack-rel — no bump. + unsigned MOpc = MI->getOpcode(); + if (MOpc == W65816::LDA_StackRel || MOpc == W65816::STA_StackRel) { + if (MI->getNumOperands() >= 1 && MI->getOperand(0).isImm()) { + int64_t v = MI->getOperand(0).getImm(); + MI->getOperand(0).setImm(v - 1); + } + } + MI->removeFromParent(); + MBB.insert(Back, MI); + } + for (MachineInstr *MI : Trailing) { + MI->removeFromParent(); + MBB.insert(Back, MI); + } + Php->eraseFromParent(); + Plp->eraseFromParent(); + Changed = true; + // Restart iteration from the beginning since we mutated. + It = MBB.begin(); + } + } + + // i32 += i32 store-bypass. Regalloc materializes the call result + // (A=lo, X=hi) into Wide32 spill slots before the add, then reads + // them back — emitting 4 instructions of redundant store/reload: + // + // STA_StackRel slotA ; A (mul.lo) -> slotA + // TXA ; A = X = mul.hi + // STA_StackRel slotB ; mul.hi -> slotB + // LDA_StackRel slotA ; reload mul.lo <-- redundant + // CLC + // ADC_StackRel slotC ; mul.lo + total.lo + // STA_StackRel slotA ; sum-lo + // LDA_StackRel slotB ; reload mul.hi <-- redundant + // ADC_StackRel slotD ; mul.hi + total.hi + C + // STA_StackRel slotB ; sum-hi + // + // Reorder to do the lo-add directly off A and the hi-add directly + // off X (via TXA preserving carry): + // + // CLC + // ADC_StackRel slotC ; A = mul.lo + total.lo + // STA_StackRel slotA ; sum-lo + // TXA ; A = X = mul.hi (C preserved) + // ADC_StackRel slotD ; A = mul.hi + total.hi + C + // STA_StackRel slotB ; sum-hi + // + // 10 -> 6 inst. Saves 4 inst / ~13 cyc per i32-add-of-call-result + // site. Hits the sumOfSquares loop and any total += __umulhisi3 + // pattern. + { + auto isStaSR = [](MachineInstr &MI, int64_t *off) { + if (MI.getOpcode() != W65816::STA_StackRel) return false; + if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; + if (off) *off = MI.getOperand(0).getImm(); + return true; + }; + auto isLdaSR = [](MachineInstr &MI, int64_t *off) { + if (MI.getOpcode() != W65816::LDA_StackRel) return false; + if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; + if (off) *off = MI.getOperand(0).getImm(); + return true; + }; + auto isAdcSR = [](MachineInstr &MI, int64_t *off) { + if (MI.getOpcode() != W65816::ADC_StackRel) return false; + if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; + if (off) *off = MI.getOperand(0).getImm(); + return true; + }; + auto It = MBB.begin(); + while (It != MBB.end()) { + auto Cur = It; + int64_t slotA = 0, slotB = 0, slotC = 0, slotD = 0; + // Step 1: STA_StackRel slotA + if (!isStaSR(*Cur, &slotA)) { ++It; continue; } + auto P2 = std::next(Cur); + while (P2 != MBB.end() && P2->isDebugInstr()) ++P2; + if (P2 == MBB.end() || P2->getOpcode() != W65816::TXA) { ++It; continue; } + auto P3 = std::next(P2); + while (P3 != MBB.end() && P3->isDebugInstr()) ++P3; + if (P3 == MBB.end() || !isStaSR(*P3, &slotB)) { ++It; continue; } + if (slotA == slotB) { ++It; continue; } + auto P4 = std::next(P3); + while (P4 != MBB.end() && P4->isDebugInstr()) ++P4; + int64_t lreloadA = 0; + if (P4 == MBB.end() || !isLdaSR(*P4, &lreloadA) || lreloadA != slotA) { + ++It; continue; + } + auto P5 = std::next(P4); + while (P5 != MBB.end() && P5->isDebugInstr()) ++P5; + if (P5 == MBB.end() || P5->getOpcode() != W65816::CLC) { + ++It; continue; + } + auto P6 = std::next(P5); + while (P6 != MBB.end() && P6->isDebugInstr()) ++P6; + if (P6 == MBB.end() || !isAdcSR(*P6, &slotC)) { ++It; continue; } + auto P7 = std::next(P6); + while (P7 != MBB.end() && P7->isDebugInstr()) ++P7; + int64_t outA = 0; + if (P7 == MBB.end() || !isStaSR(*P7, &outA) || outA != slotA) { + ++It; continue; + } + auto P8 = std::next(P7); + while (P8 != MBB.end() && P8->isDebugInstr()) ++P8; + int64_t lreloadB = 0; + if (P8 == MBB.end() || !isLdaSR(*P8, &lreloadB) || lreloadB != slotB) { + ++It; continue; + } + auto P9 = std::next(P8); + while (P9 != MBB.end() && P9->isDebugInstr()) ++P9; + if (P9 == MBB.end() || !isAdcSR(*P9, &slotD)) { ++It; continue; } + auto P10 = std::next(P9); + while (P10 != MBB.end() && P10->isDebugInstr()) ++P10; + int64_t outB = 0; + if (P10 == MBB.end() || !isStaSR(*P10, &outB) || outB != slotB) { + ++It; continue; + } + // All 10 matched. slotA != slotB already. Also require all + // four slots distinct. (slotC/slotD are the total.lo/hi read + // addresses; in the canonical case slotC != slotA and slotD != + // slotB; without this the rewrite would re-read its own output.) + if (slotC == slotA || slotD == slotB || + slotC == slotD) { + ++It; continue; + } + // Rewrite: emit CLC ; ADC slotC ; STA slotA ; TXA ; ADC slotD ; + // STA slotB before P1, then erase steps 1-10. + DebugLoc DL = Cur->getDebugLoc(); + BuildMI(MBB, Cur, DL, TII.get(W65816::CLC)); + BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel)) + .addImm(slotC); + BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel)) + .addImm(slotA); + BuildMI(MBB, Cur, DL, TII.get(W65816::TXA)); + BuildMI(MBB, Cur, DL, TII.get(W65816::ADC_StackRel)) + .addImm(slotD); + BuildMI(MBB, Cur, DL, TII.get(W65816::STA_StackRel)) + .addImm(slotB); + // Advance It past the matched pattern before erasing (so we + // don't iterate through deleted insts). + It = std::next(P10); + // Erase the 10 originals. + Cur->eraseFromParent(); P2->eraseFromParent(); + P3->eraseFromParent(); P4->eraseFromParent(); + P5->eraseFromParent(); P6->eraseFromParent(); + P7->eraseFromParent(); P8->eraseFromParent(); + P9->eraseFromParent(); P10->eraseFromParent(); + Changed = true; + } + } + + // Dead TAX / TXA elimination. STAfi declares `Defs = [A]` as a + // safe over-approximation (eliminateFrameIndex emits a PHA-bracketed + // sequence when the source is IMG-class). Regalloc honors that by + // inserting `TAX ; ...STAfi... ; TXA` brackets around STAfi that + // SOURCES from A — but in the A-source path A is preserved. The + // TXA's output gets clobbered immediately by the next LDA*, so the + // TXA is dead; once TXA is gone, the TAX's X-value has no consumer + // and is dead too. This pattern recurs once per i32-spill site. + // + // Conservative: only elide TXA if the IMMEDIATE next non-debug + // instruction defines $a (and doesn't read $a or N/Z first). No + // intervening flag-readers between TXA and the A-define is then + // guaranteed. Same logic for TYA. + // + // For TAX: elide if no instruction between TAX and the next $x def + // reads $x (and we can prove the original X had no live consumer). + // Done as a fixed-point: keep iterating until no change. + auto definesReg = [](const MachineInstr &MI, unsigned Reg) -> bool { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() == Reg && MO.isDef()) + return true; + } + return false; + }; + auto readsReg = [](const MachineInstr &MI, unsigned Reg) -> bool { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() == Reg && MO.isUse()) + return true; + } + return false; + }; + bool again2 = true; + while (again2) { + again2 = false; + // Pass A: dead TXA / TYA + for (auto It = MBB.begin(); It != MBB.end(); ) { + unsigned O = It->getOpcode(); + if (O != W65816::TXA && O != W65816::TYA) { ++It; continue; } + auto Next = std::next(It); + while (Next != MBB.end() && Next->isDebugInstr()) ++Next; + if (Next == MBB.end()) { ++It; continue; } + // Next must define $a unconditionally, and must not read $a + // (since we're about to discard the TXA-defined A) and must + // not be a call / branch / inline asm (which conservatively + // read $a). + if (Next->isCall() || Next->isBranch() || + Next->isReturn() || Next->isInlineAsm()) { + ++It; continue; + } + if (!definesReg(*Next, W65816::A)) { ++It; continue; } + if (readsReg(*Next, W65816::A)) { ++It; continue; } + // P (flags) liveness: TXA/TYA set N/Z. If Next reads P, we'd + // be discarding the flags it expects. Bxx and friends read P. + // Conservative: also require Next does not read $p. + if (readsReg(*Next, W65816::P)) { ++It; continue; } + auto Dead = It++; + Dead->eraseFromParent(); + Changed = true; + again2 = true; + } + // Pass B: dead TAX / TAY + for (auto It = MBB.begin(); It != MBB.end(); ) { + unsigned O = It->getOpcode(); + unsigned Target; + if (O == W65816::TAX) Target = W65816::X; + else if (O == W65816::TAY) Target = W65816::Y; + else { ++It; continue; } + // Walk forward. TAX/TAY is dead if every use of Target is + // preceded by a redefinition of Target (and the in-MBB region + // between has no flag-reader that consumes TAX's N/Z). At MBB + // end, check successor live-ins: if none has Target as live-in + // it's also dead. + // + // Flag liveness: TAX defines $p (N/Z). A later $p-reader only + // consumes TAX's flags if no intervening instruction REDEFINES + // $p in the gap. Track `pRedef` to allow common patterns like + // `TAX ; CLC ; ADC ; ...` where ADC reads $p but the $p it + // reads is the freshly-CLC'd carry, not TAX's N/Z. + auto Walker = std::next(It); + bool deadIt = false; + bool bailed = false; + bool pRedef = false; + while (Walker != MBB.end()) { + if (Walker->isDebugInstr()) { ++Walker; continue; } + if (Walker->isCall() || Walker->isInlineAsm()) { + bailed = true; break; + } + // Branch / return: stop walking; rely on successor live-ins. + if (Walker->isBranch() || Walker->isReturn()) break; + if (readsReg(*Walker, Target)) { bailed = true; break; } + if (readsReg(*Walker, W65816::P) && !pRedef) { + bailed = true; break; + } + if (definesReg(*Walker, W65816::P)) pRedef = true; + if (definesReg(*Walker, Target)) { deadIt = true; break; } + ++Walker; + } + if (bailed) { ++It; continue; } + if (!deadIt) { + // Fell through to MBB end / branch. Check successor live-ins. + bool liveOut = false; + for (MachineBasicBlock *Succ : MBB.successors()) { + if (Succ->isLiveIn(Target)) { liveOut = true; break; } + } + // Return blocks: $a and $x are the i32 return-value convention. + // RTL doesn't model these as Uses, but they ARE live at the + // return. Be conservative — don't elide TAX/TAY before a return. + if (!MBB.empty() && MBB.back().isReturn()) liveOut = true; + if (liveOut) { ++It; continue; } + } + auto Dead = It++; + Dead->eraseFromParent(); + Changed = true; + again2 = true; + } + } + // Third peephole: drop `LDY_Imm16 K` when Y already holds K from // an earlier LDY in the same MBB and no intervening MI clobbered // Y. Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY, diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp index 4918ca1..470179b 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -283,15 +283,18 @@ static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB, case W65816::LDAfi: case W65816::LDAi16imm: case W65816::LDAabs: - case W65816::ANDi16imm: case W65816::ANDabs: - case W65816::ORAi16imm: case W65816::ORAabs: - case W65816::EORi16imm: case W65816::EORabs: + case W65816::ANDi16imm: case W65816::ANDabs: case W65816::ANDfi: + case W65816::ORAi16imm: case W65816::ORAabs: case W65816::ORAfi: + case W65816::EORi16imm: case W65816::EORabs: case W65816::EORfi: case W65816::ADCi16imm: case W65816::ADCabs: case W65816::ADCfi: case W65816::SBCi16imm: case W65816::SBCabs: case W65816::SBCfi: case W65816::ADCEi16imm: case W65816::ADCEabs: case W65816::ADCEfi: case W65816::SBCEi16imm: case W65816::SBCEabs: case W65816::SBCEfi: case W65816::ASLA16: case W65816::LSRA16: case W65816::ASLA8: case W65816::LSRA8: + case W65816::INA: case W65816::DEA: + case W65816::INA_PSEUDO: case W65816::DEA_PSEUDO: + case W65816::INA_PSEUDO8: case W65816::DEA_PSEUDO8: return true; default: return false; @@ -756,6 +759,24 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { // pass that might still violate the dep, and to wrap the rare cases // where the IR-level test is a load (LDA flag side-effect) rather // than an explicit CMP. + // In VLA functions, FI store pseudos (STAfi, STA8fi, STAfi_indY) + // expand at PEI to a 4-MC sequence ending in `LDY $F8` (Y-restore), + // which clobbers N/Z. The PHP/PLP wrap pass runs pre-PEI; treating + // those pseudos as flag-preserving leaves the trailing LDY outside + // the wrap, so a downstream BEQ/BNE reads the LDY's flags instead of + // the test's. Treat them as corrupting in VLA functions so the wrap + // covers the whole expansion. + // VLAFunc: narrow predicate used by the flag-preserving / lda-like + // helpers (broadening it to UsesFPRel broke dadd's i64-ABI libcall + // flow — the STAfi pseudos in non-VLA large-frame functions don't + // need to be marked corrupting for the wrap-detection walk). + // UsesFPRel: broader FrameLowering-matching predicate used by the + // pseudo-bump's offset-routing check (FP-rel ops must NOT be bumped, + // SP-rel ops MUST be bumped; we replicate eliminateFrameIndex's + // routing decision below to choose). + bool VLAFunc = MF.getFrameInfo().hasVarSizedObjects(); + bool UsesFPRel = MF.getFrameInfo().hasVarSizedObjects() || + MF.getFrameInfo().estimateStackSize(MF) > 200; for (MachineBasicBlock &MBB : MF) { SmallVector Branches; for (MachineInstr &MI : MBB) { @@ -764,7 +785,13 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { Opc == W65816::BMI || Opc == W65816::BPL) Branches.push_back(&MI); } - auto isFlagPreserving = [](unsigned Opc) { + auto isFlagPreserving = [VLAFunc](unsigned Opc) { + if (VLAFunc) { + // FI store pseudos are flag-corrupting under VLA expansion. + if (Opc == W65816::STAfi || Opc == W65816::STAfi_indY || + Opc == W65816::STA8fi) + return false; + } return Opc == W65816::STA_StackRel || Opc == W65816::STA_StackRelIndY || Opc == W65816::STAfi || @@ -805,7 +832,14 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { return !MI.isBranch() && !MI.isReturn(); } }; - auto isLdaLike = [](unsigned Opc) { + auto isLdaLike = [VLAFunc](unsigned Opc) { + if (VLAFunc) { + // STAfi-family: see isFlagPreserving comment. They expand to a + // sequence whose final LDY $F8 corrupts N/Z; treat as corrupting. + if (Opc == W65816::STAfi || Opc == W65816::STAfi_indY || + Opc == W65816::STA8fi) + return true; + } // COPY between physregs: lowers in AsmPrinter to one of TXA/TYA/ // LDA $D? (for IMG↔A bridges) etc. — all of which set N/Z based // on the loaded value. Treating COPY as flag-defining caused the @@ -926,6 +960,14 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { // eliminateFrameIndex; bumping ImmOffset by 1 produces the // right post-lowered disp. For already-lowered MC ops // (LDA_StackRel etc), bump the disp operand directly. + // + // CAVEAT for FP-relative functions (see UsesFPRel declaration above): + // FI accesses go through FP-relative addressing (eliminateFrameIndex + // routes through expandFarFI when FrameLowering captured FP). FP + // was captured BEFORE PHP, so (FP),Y reads aren't affected by PHP's + // S decrement. Don't bump pseudo *fi ImmOffsets in that case + // (already-lowered MC StackRel ops still need the bump — those are + // SP-rel). const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); DebugLoc DL = Test->getDebugLoc(); BuildMI(MBB, FirstCorrupt->getIterator(), DL, TII->get(W65816::PHP)); @@ -944,6 +986,33 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { Opc == W65816::ANDfi || Opc == W65816::ORAfi || Opc == W65816::EORfi || Opc == W65816::CMPfi || Opc == W65816::ADDframe; + // For pseudo *fi ops in FP-rel functions: only SOME will end up + // SP-rel after PEI (offsets in [0,255]); the rest go through + // expandFarFI → `[$F6],Y`. FP-rel access is unaffected by PHP's + // S decrement and must NOT be bumped; SP-rel access IS affected + // and MUST be bumped. Replicate eliminateFrameIndex's offset + // calculation here to decide. Without this, large-frame + // functions that mix both addressing modes (e.g. sha256-style + // i32-libcall loops) get their FP-rel pseudos bumped, which + // shifts reads/writes by one byte and corrupts state at + // iteration N proportional to the i32-libcall count. + if (IsPseudo && UsesFPRel) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (It->getOperand(1).isFI()) { + int FI = It->getOperand(1).getIndex(); + int FrameOffset = MFI.getObjectOffset(FI); + int ImmOffset = It->getOperand(2).isImm() + ? (int)It->getOperand(2).getImm() : 0; + int LoweredOff = FrameOffset + ImmOffset + + (int)MFI.getStackSize(); + if (FrameOffset < 0) LoweredOff += 1; + // Out-of-range or VLA → FP-rel → no bump. + if (LoweredOff < 0 || LoweredOff > 0xFF || + MFI.hasVarSizedObjects()) + continue; + // Else SP-rel: fall through and bump ImmOffset. + } + } unsigned ImmIdx = IsPseudo ? 2 : 0; if (ImmIdx < It->getNumOperands() && It->getOperand(ImmIdx).isImm()) { int64_t v = It->getOperand(ImmIdx).getImm(); diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp index eeae746..031a699 100644 --- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp +++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp @@ -27,13 +27,17 @@ using namespace llvm; // Data layout for the 65816 lives in Triple::computeDataLayout via // patches/0005-target-data-layout-w65816.patch. The string is: -// e - little endian -// m:e - ELF-style symbol mangling -// p:16:8 - 16-bit pointers, 8-bit stack alignment -// i16:16 - 16-bit integers aligned to 16 bits -// i32:16 - 32-bit integers aligned to 16 bits -// n8:16 - native integer widths -// S16 - 16-bit natural stack alignment +// e - little endian +// m:e - ELF-style symbol mangling +// p:32:16 - 32-bit pointers (lo16 + hi-bank), 16-bit alignment +// i16:16 - 16-bit integers aligned to 16 bits +// i32:16 - 32-bit integers aligned to 16 bits +// a:8 - alloca defaults to 1-byte alignment +// n8:16 - native integer widths +// S8 - 1-byte natural stack alignment. JSL's 3-byte ret-addr +// push means SP is never reliably 2-aligned inside a +// callee; the older S16 caused SDAG to fold &buf[1] to +// buf | 1, which breaks for odd-aligned stack locals. extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeW65816Target() { @@ -49,6 +53,9 @@ LLVMInitializeW65816Target() { initializeW65816PreSpillCrossCallPass(PR); initializeW65816SjLjFinalizePass(PR); initializeW65816LowerWide32Pass(PR); + initializeW65816I32IncFoldPass(PR); + initializeW65816ImgCalleeSavePass(PR); + initializeW65816NarrowI32MulPass(PR); // Default IndVarSimplify's exit-value rewriter to "never". The // closed-form replacement frequently widens an i16 induction var @@ -104,22 +111,21 @@ public: void addMachineSSAOptimization() override; void addISelPrepare() override; - // W65816's only 16-bit ALU register is A. At -O1+ we use BASIC - // regalloc instead of greedy: greedy fails ("ran out of registers - // during register allocation") on functions with many cross-call - // Acc16 vregs (the "ok |= bit; helper(); ok |= bit;" pattern - // repeated across many if-blocks). Basic regalloc handles that - // pattern cleanly, with negligible code-size overhead vs greedy - // (~0.7% on the bench suite). + // Greedy at -O1+; fast at -O0/optnone. Greedy used to abort with + // "Def isn't really dead" inside LiveRangeEdit::eliminateDeadDef + // when InlineSpiller converted a redundant STAfi (Defs = [A]) into + // a KILL pseudo while only marking explicit defs dead — leaving the + // implicit-def $a live, then later trying to delete it. Patched in + // tools/llvm-mos/llvm/lib/CodeGen/InlineSpiller.cpp to mark all defs + // (explicit + implicit) dead. Bench wins after the switch: + // popcount −19.4%, strcpy −18.9%, memcmp −8.6%, bsearch −9.2%, + // fib(10) −2.6%. // - // At -O0 / optnone (Optimized=false) we use FAST: greedy/basic at - // -O0 leave spurious COPY pseudos that lower to STA dp / LDA dp - // pairs around modify-in-place ops (e.g. INA), miscompiling a + 1. - // - // TiedDefSpill (pre-RA) handles the tied-def-multi-use hazard for - // the sub-pattern that's frequent enough to matter at -O1+. + // At -O0 / optnone (Optimized=false) we use FAST: greedy at -O0 + // left spurious COPY pseudos that lowered to STA dp / LDA dp pairs + // around modify-in-place ops (e.g. INA), miscompiling a + 1. FunctionPass *createTargetRegisterAllocator(bool Optimized) override { - return Optimized ? createBasicRegisterAllocator() + return Optimized ? createGreedyRegisterAllocator() : createFastRegisterAllocator(); } }; @@ -137,6 +143,11 @@ void W65816PassConfig::addISelPrepare() { // intrinsics our backend doesn't natively lower. Must run BEFORE // the base ISelPrepare passes so isel sees the cleaned IR. addPass(createW65816SjLjFinalize()); + // IR-level peephole: narrow `mul i32 X, Y` to a __umulhisi3 call + // when IR-level computeKnownBits proves the top 16 bits of both + // operands are zero. Catches the sumSquares-style `(u32)i * i` + // pattern that SDAG-level analysis can't see across BB boundaries. + addPass(createW65816NarrowI32Mul()); TargetPassConfig::addISelPrepare(); } @@ -148,6 +159,15 @@ void W65816PassConfig::addMachineSSAOptimization() { // Uses=[P] on Bxx (so MachineCSE sees the dep) and let the // pass run normally — that landed in W65816InstrInfo.td. TargetPassConfig::addMachineSSAOptimization(); + + // MachineBlockPlacement is now re-enabled. Previously disabled + // because W65816InstrInfo::analyzeBranch returned unanalyzable + // unconditionally; we now decode the BRA / BRL / JMP_Abs uncond + // direct-branch case (see W65816InstrInfo::analyzeBranch) which is + // enough to satisfy MBP's fall-through assertion. Conditional + // branches stay opaque on purpose: their condition is encoded in + // the OPCODE and the P-flag input must stay adjacent to a preceding + // CMP, which BranchFolder doesn't know to preserve. } void W65816PassConfig::addPreRegAlloc() { @@ -175,6 +195,15 @@ void W65816PassConfig::addPreRegAlloc() { } void W65816PassConfig::addPostRegAlloc() { + // ImgCalleeSave runs FIRST so its STAfi/LDAfi pseudos go through the + // rest of the post-RA pipeline (SpillToX, StackSlotCleanup) normally. + // It detects IMG8..IMG15 usage post-regalloc and inserts prologue + // save + epilogue restore so those slots act as callee-saved at the + // asm level. Fixes picol's `expr 1+2 == 4` bug: high-pressure + // recursive double fns use IMG8..IMG15 as scratch but, without this + // pass, expected them preserved across calls — and callees were + // happy to clobber them. See W65816ImgCalleeSave.cpp. + addPass(createW65816ImgCalleeSave()); // SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup // then deletes still-adjacent redundant spills. A second SpillToX // invocation collapses any TAX/TXA pair left adjacent by cleanup @@ -223,6 +252,16 @@ void W65816PassConfig::addPreEmitPass() { // Distance estimation now uses TII::getInstSizeInBytes so it's // byte-accurate; the 110-byte threshold leaves margin without // expanding short branches that would otherwise survive as Bxx. + // Detect i32 += 1 patterns (LDA/ADC #1/STA/LDA/ADCE #0/STA) and + // rewrite to a tighter LDA/INA/STA + INC_HI_IF_CARRY form that + // skips the hi half on the no-carry path. Must run BEFORE + // BranchExpand so the inserted conditional skip's distances are + // covered by the branch-distance estimator. Also before + // SepRepCleanup (which has the existing ADC #±1 → INA peephole) + // because we deliberately KEEP ADCi16imm 1 so this pass can match + // it; the subsequent SepRepCleanup will see only the residual + // (non-fold-eligible) ADCi16imm cases. + addPass(createW65816I32IncFold()); addPass(createW65816BranchExpand()); addPass(createW65816SepRepCleanup()); } diff --git a/ui.ini b/ui.ini new file mode 100644 index 0000000..9c2cbf1 --- /dev/null +++ b/ui.ini @@ -0,0 +1,71 @@ +# +# UI SEARCH PATH OPTIONS +# +historypath history;dats;. +categorypath folders +cabinets_directory cabinets;cabdevs +cpanels_directory cpanel +pcbs_directory pcb +flyers_directory flyers +titles_directory titles +ends_directory ends +marquees_directory marquees +artwork_preview_directory "artwork preview;artpreview" +bosses_directory bosses +logos_directory logo +scores_directory scores +versus_directory versus +gameover_directory gameover +howto_directory howto +select_directory select +icons_directory icons +covers_directory covers +ui_path ui + +# +# UI MISC OPTIONS +# +system_names +skip_warnings 0 +unthrottle_mute 0 + +# +# UI OPTIONS +# +infos_text_size 0.75 +font_rows 30 +ui_border_color ffffffff +ui_bg_color ef101030 +ui_clone_color ff808080 +ui_dipsw_color ffffff00 +ui_gfxviewer_color ef101030 +ui_mousedown_bg_color b0606000 +ui_mousedown_color ffffff80 +ui_mouseover_bg_color 70404000 +ui_mouseover_color ffffff80 +ui_selected_bg_color ef808000 +ui_selected_color ffffff00 +ui_slider_color ffffffff +ui_subitem_color ffffffff +ui_text_bg_color ef000000 +ui_text_color ffffffff +ui_unavail_color ff404040 + +# +# SYSTEM/SOFTWARE SELECTION MENU OPTIONS +# +hide_main_panel 0 +use_background 1 +skip_biosmenu 0 +skip_partsmenu 0 +remember_last 1 +last_used_machine +last_used_filter +system_right_panel image +software_right_panel image +system_right_image snap +software_right_image snap +enlarge_snaps 1 +forced4x3 1 +info_audit_enabled 0 +hide_romless 1