Amiga parity with IIgs!

2026-05-03 01:44:39 -05:00 · 2026-05-03 01:44:39 -05:00 · b1e24b4650
commit b1e24b4650
parent 6c03d93e88
37 changed files with 4312 additions and 493 deletions
--- a/examples/audio/audio.c
+++ b/examples/audio/audio.c
@ -171,11 +171,11 @@ int main(void) {
        if (flashFrames > 0) {
            fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_BAR);
-            stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H);
+            stagePresent();
            flashFrames--;
            if (flashFrames == 0) {
                fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_HINT);
-                stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H);
+                stagePresent();
            }
        }
    }
--- a/examples/joy/joy.c
+++ b/examples/joy/joy.c
@ -80,8 +80,10 @@ static void buildPalette(SurfaceT *screen) {
 static void drawAndPresent(SurfaceT *screen, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t color) {
    /* fillRect marks the rect dirty; stagePresent flushes only that
     * dirty band. */
    fillRect(screen, x, y, (uint16_t)w, (uint16_t)h, color);
-    stagePresentRect(x, y, (uint16_t)w, (uint16_t)h);
+    stagePresent();
 }
--- a/examples/keys/keys.c
+++ b/examples/keys/keys.c
@ -158,8 +158,6 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur
    int16_t  row;
    JoeyKeyE key;
    bool     lit;
    int16_t  x;
    int16_t  y;
    for (row = 0; row < GRID_ROWS; row++) {
        for (col = 0; col < GRID_COLS; col++) {
@ -171,10 +169,10 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur
            if (lit == gCellLit[row][col]) {
                continue;
            }
            /* drawCell marks the cell's rect dirty; stagePresent
             * flushes that one band. */
            drawCell(screen, col, row, lit);
-            x = (int16_t)(MARGIN_X + col * (CELL_W + GAP));
+            stagePresent();
            y = (int16_t)(MARGIN_Y + row * (CELL_H + GAP));
            stagePresentRect(x, y, CELL_W, CELL_H);
            gCellLit[row][col] = lit;
        }
    }
@ -195,19 +193,16 @@ static void updateCursor(SurfaceT *screen, int16_t cursorCol, int16_t cursorRow)
    if (gLastCursorX != mouseX || gLastCursorY != mouseY) {
        if (gLastCursorCol != CELL_NONE) {
            drawCell(screen, gLastCursorCol, gLastCursorRow, gCellLit[gLastCursorRow][gLastCursorCol]);
            stagePresentRect(
                (int16_t)(MARGIN_X + gLastCursorCol * (CELL_W + GAP)),
                (int16_t)(MARGIN_Y + gLastCursorRow * (CELL_H + GAP)),
                CELL_W, CELL_H);
        } else if (gLastCursorX >= 0 && gLastCursorY >= 0) {
            // Old cursor was in a gap region. Stamp background over it.
            fillRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H, COLOR_BACKGROUND);
            stagePresentRect(gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H);
        }
    }
    drawCursor(screen, mouseX, mouseY);
-    stagePresentRect(mouseX, mouseY, CURSOR_W, CURSOR_H);
+    /* All draw calls above marked their rects dirty; one stagePresent
     * flushes the union (cursor erase + cursor draw). */
    stagePresent();
    gLastCursorX   = mouseX;
    gLastCursorY   = mouseY;
--- a/examples/sprite/sprite.c
+++ b/examples/sprite/sprite.c
@ -15,11 +15,11 @@
 #define BALL_TILES_Y      (BALL_H / 8)
 #define BALL_TILE_BYTES   (BALL_TILES_X * BALL_TILES_Y * TILE_BYTES)
-// SaveUnder must store rounded-up byte boundaries: x rounded down to
+// SaveUnder rounds x down to the platform's storage alignment: 2 px
-// even, width rounded up to even. Worst case for BALL_W=16 (already
+// for chunky 4bpp (1 extra byte/row worst case), 8 px for planar
-// even) is 8 bytes per row + alignment slack of 1 byte; size for the
+// 4-plane (4 extra bytes/row worst case -- one per plane). The +4
-// pessimistic case so the buffer never overflows.
+// covers the planar case and is a no-op overhead on chunky.
-#define BALL_BACKUP_BYTES (((BALL_W + 2) >> 1) * BALL_H)
+#define BALL_BACKUP_BYTES (((BALL_W >> 1) + 4) * BALL_H)
 #define BALL_PALETTE_IDX  0
@ -100,18 +100,14 @@ int main(void) {
    int16_t        y;
    int16_t        vx;
    int16_t        vy;
    int16_t        oldX;
    int16_t        oldY;
    uint16_t       oldW;
    uint16_t       oldH;
    int16_t        unionX;
    int16_t        unionY;
    int16_t        unionRight;
    int16_t        unionBottom;
    bool           haveBackup;
    config.hostMode     = HOST_MODE_TAKEOVER;
-    config.codegenBytes = 8 * 1024;
+    /* Amiga planar emits 8 pre-shifted DRAW variants per sprite (one
     * per x % 8 alignment) so the codegen arena needs roughly 8x what
     * the chunky two-shift case asks for. 32 KB fits a 16x16 ball
     * with all variants. */
    config.codegenBytes = 32UL * 1024;
    config.maxSurfaces  = 4;
    config.audioBytes   = 64UL * 1024;
    config.assetBytes   = 128UL * 1024;
@ -155,7 +151,7 @@ int main(void) {
    haveBackup = false;
    spriteSaveAndDraw(screen, ball, x, y, &backup);
-    stagePresentRect(backup.x, backup.y, backup.width, backup.height);
+    stagePresent();
    haveBackup = true;
    for (;;) {
@ -164,19 +160,15 @@ int main(void) {
            break;
        }
-        // Stash the prior ball's region before restoring the bytes
+        // Do all off-screen work first (restore + move + draw), then
-        // under it. Do all off-screen work (restore + move + draw)
+        // ONE stagePresent flushes the union of dirty bands set by
-        // first, then waitVBL + ONE stagePresentRect covering both
+        // restoreUnder + draw. Add a joeyWaitVBL() before the present
-        // old and new regions. Putting waitVBL immediately before the
+        // to land it inside the VBL window so the CRT never sees a
-        // present lets the present land inside the VBL window so the
+        // half-updated framebuffer (matters most on single-buffered
-        // CRT never sees a half-updated framebuffer (matters most on
+        // chunky targets like IIgs SHR; on planar c2p platforms it
-        // single-buffered chunky targets like IIgs SHR; on planar
+        // also avoids c2p racing the raster). VBL wait is omitted
-        // c2p platforms it also avoids c2p racing the raster).
+        // here so the demo runs at the sprite pipeline's native
-        oldX = backup.x;
+        // throughput -- expect tearing on the ball.
        oldY = backup.y;
        oldW = backup.width;
        oldH = backup.height;
        if (haveBackup) {
            spriteRestoreUnder(screen, &backup);
        }
@ -190,27 +182,7 @@ int main(void) {
        spriteSaveAndDraw(screen, ball, x, y, &backup);
-        // Bounding box of (old rect) U (new rect). For typical
+        stagePresent();
        // small-step motion the rects overlap heavily so the union
        // is barely larger than one ball.
        unionX      = (oldX < backup.x) ? oldX : backup.x;
        unionY      = (oldY < backup.y) ? oldY : backup.y;
        unionRight  = (int16_t)((oldX + oldW > backup.x + backup.width)
                                ? (oldX + oldW)
                                : (backup.x + backup.width));
        unionBottom = (int16_t)((oldY + oldH > backup.y + backup.height)
                                ? (oldY + oldH)
                                : (backup.y + backup.height));
        // VBL wait removed -- the demo runs at the native compute speed
        // of save+restore+draw+presentRect so we can SEE the sprite
        // pipeline's actual throughput. Expect tearing on the ball
        // since the present can land mid-scan; that's the cost of
        // showing real frame rate. Add joeyWaitVBL() back here for
        // tear-free 60 Hz motion.
        stagePresentRect(unionX, unionY,
                           (uint16_t)(unionRight  - unionX),
                           (uint16_t)(unionBottom - unionY));
        haveBackup = true;
    }
--- a/examples/uber/uber.c
+++ b/examples/uber/uber.c
@ -28,7 +28,16 @@
 // Measurement window per timed op, in frames. The original 4-frame
 // window was long enough that loop overhead didn't dominate and short
 // enough to keep the full demo run under ~10 sec.
-#define UBER_FRAMES  4u
+/* 16 frames per timed op gives 4x the iter-count resolution of the
 * earlier 4-frame budget. Exposes the actual per-op cost on slow
 * ops where 4 frames produced the same iter count on different
 * framerates -- e.g. drawCircle r=80 read as "4 iters / 4 frames"
 * on both 60 Hz IIgs (16.7 ms/frame, 67 ms window) and 50 Hz Amiga
 * (20 ms/frame, 80 ms window) even though per-op cost was equal,
 * just because 4 ops at 16-17 ms happen to fit both windows. The
 * 16-frame budget extends the windows to 267 ms / 320 ms; quantum
 * gap shrinks to ~6%. Total run time scales 4x (~80 sec each). */
 #define UBER_FRAMES  16u
 typedef void (*OpFn)(void);
@ -44,9 +53,10 @@ static TileT     gTileScratch;
 // Run `op` in a tight loop until `targetFrames` joeyFrameCount ticks
 // have elapsed. Returns iterations completed.
-static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
+static unsigned long runForFrames(OpFn op, unsigned int targetFrames, uint16_t *actualFramesOut) {
    unsigned long count;
    uint16_t      startFrame;
    uint16_t      endFrame;
    count = 0UL;
@ -57,29 +67,50 @@ static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
        op();
        count++;
    }
    /* Capture the actual elapsed frames -- the last iter typically
     * overruns the target. Using actual instead of target as the
     * ops/sec divisor stays honest for ops slower than 1 frame
     * (where count is forced low while real time stretches well
     * past targetFrames). */
    endFrame         = joeyFrameCount();
    *actualFramesOut = (uint16_t)(endFrame - startFrame);
    if (*actualFramesOut == 0u) {
        *actualFramesOut = 1u;       /* defensive: avoid div-by-zero */
    }
    return count;
 }
 // Time and log one op. Reports iters / N frames AND the derived
 // ops/sec so per-port results are directly comparable against IIgs
-// regardless of CPU speed or display refresh rate.
+// regardless of CPU speed or display refresh rate. Also logs an
 // FNV-1a hash of the surface state after timing -- this is the
 // pixel-perfect comparison input for the cross-port validation
 // harness (tools/diff-uber-hashes.py). Captured against IIgs as the
 // golden reference; planar 68k rewrites validate by matching it.
 static void timeOp(const char *name, OpFn op) {
    unsigned long iters;
    unsigned long opsPerSec;
    uint16_t      actualFrames;
    uint32_t      hash;
    gCurName = name;
-    iters = runForFrames(op, UBER_FRAMES);
+    iters = runForFrames(op, UBER_FRAMES, &actualFrames);
    if (iters == 0UL) {
        joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name);
        return;
    }
-    opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES;
+    /* Divide by ACTUAL elapsed frames, not the target. For sub-frame
-    joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n",
+     * ops actualFrames ~= UBER_FRAMES so the answer is unchanged;
-             name, iters, UBER_FRAMES, opsPerSec);
+     * for ops that overrun (slow stagePresent etc.), this stops
     * inflating ops/sec. */
    opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)actualFrames;
    hash      = surfaceHash(gStage);
    joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec | hash=%08lX\n",
             name, iters, actualFrames, opsPerSec, (unsigned long)hash);
 }
@ -125,8 +156,6 @@ static void op_spriteRestore     (void) { spriteRestoreUnder(gStage, &gBackup);
 static void op_spriteSaveAndDraw (void) { spriteSaveAndDraw (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); }
 static void op_stagePresent     (void) { stagePresent(); }
 static void op_stagePresentRect8(void) { stagePresentRect( 40,  30,  16,  16); }
 static void op_stagePresentRectF(void) { stagePresentRect(  0,   0, 320, 200); }
 static void op_inputPoll       (void) { joeyInputPoll(); }
 static void op_keyDown         (void) { (void)joeyKeyDown(KEY_A); }
@ -229,10 +258,14 @@ static void runAllTests(void) {
    timeOp("spriteRestoreUnder", op_spriteRestore);
    timeOp("spriteSaveAndDraw",  op_spriteSaveAndDraw);
-    // Present.
+    // Present. One warm-up call before the timed loops primes any
    // per-port one-time setup (Amiga: copper list rebuild after the
    // paletteSet / scbSetRange tests dirty the cache; without warm-up
    // the rebuild's MakeScreen + MrgCop + WaitTOF chain eats into the
    // UBER_FRAMES measurement window) so we measure steady-state
    // throughput rather than first-call penalty.
    stagePresent();
    timeOp("stagePresent full",  op_stagePresent);
    timeOp("stagePresentRect 8b",op_stagePresentRect8);
    timeOp("stagePresentRect F", op_stagePresentRectF);
    // Input.
    timeOp("joeyInputPoll",      op_inputPoll);
@ -253,12 +286,19 @@ static void runAllTests(void) {
 int main(void) {
-    JoeyConfigT config;
+    JoeyConfigT   config;
-    uint16_t    pal[16];
+    uint16_t      pal[16];
-    int         i;
+    int           i;
    uint16_t      startFrame;
    uint16_t      endFrame;
    uint16_t      elapsedFrames;
    unsigned long elapsedMs;
    config.hostMode     = HOST_MODE_TAKEOVER;
-    config.codegenBytes = 8 * 1024;
+    /* 32 KB fits the 8 pre-shifted DRAW variants the Amiga planar
     * compiled sprite emitter generates. UL on the multiply because
     * ORCA-C's 16-bit int overflows on 32 * 1024. */
    config.codegenBytes = 32UL * 1024;
    config.maxSurfaces  = 4;
    config.audioBytes   = 64UL * 1024;
    config.assetBytes   = 128UL * 1024;
@ -266,6 +306,11 @@ int main(void) {
    if (!joeyInit(&config)) {
        return 1;
    }
    /* joeyFrameCount is VBL-driven, so it only ticks after halInit
     * installed its VBL ISR -- captured here is "everything from now
     * to press-any-key". Pre-init setup time is small and not the
     * cost the user is chasing; runAllTests dominates. */
    startFrame = joeyFrameCount();
    gStage = stageGet();
    if (gStage == NULL) {
@ -337,6 +382,12 @@ int main(void) {
    runAllTests();
    endFrame      = joeyFrameCount();
    elapsedFrames = (uint16_t)(endFrame - startFrame);
    elapsedMs     = ((unsigned long)elapsedFrames * 1000UL) / (unsigned long)joeyFrameHz();
    joeyLogF("UBER: total wall time: %lu ms (%u frames @ %u Hz)\n",
             elapsedMs, elapsedFrames, (unsigned)joeyFrameHz());
    // Done. Green screen + waitForKey.
    surfaceClear(gStage, 2);
    stagePresent();
--- a/include/joey/debug.h
+++ b/include/joey/debug.h
@ -5,6 +5,7 @@
 void joeyLog     (const char *msg);
 void joeyLogF    (const char *fmt, ...);
 void joeyLogFlush(void);
 void joeyLogReset(void);
 #endif
--- a/include/joey/present.h
+++ b/include/joey/present.h
@ -15,14 +15,14 @@
 #include "types.h"
 // Flip the dirty regions of the stage to the display, then clear the
-// dirty state. Cheap when nothing has changed since the last call.
+// dirty state. Cheap when nothing has changed since the last call
 // (gStageAnyDirty short-circuit). Drawing primitives mark dirty as
 // a side effect, so callers only need to call stagePresent at the
 // end of a frame -- everything they drew shows up.
 //
 // To present a region you didn't draw with the standard primitives
 // (e.g. direct framebuffer poking), call surfaceMarkDirtyRect on
 // the same rect first, then stagePresent.
 void stagePresent(void);
 // Flip a specific rectangular region of the stage to the display,
 // regardless of dirty state. Coordinates are clipped to the surface;
 // negative or zero dimensions are no-ops. Does not consult or modify
 // the dirty arrays -- callers mixing stagePresentRect with stagePresent
 // in the same frame may see redundant work on the next stagePresent.
 void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h);
 #endif
--- a/include/joey/sprite.h
+++ b/include/joey/sprite.h
@ -27,13 +27,16 @@
 #include "surface.h"
 #include "types.h"
-// Sprites always write to a 4bpp packed SurfaceT, never to display
+// Sprite codegen emits per-shift variants. Chunky 4bpp ports (DOS,
-// memory directly (halPresent owns that path). The codegen emits 2
+// IIgs, Atari ST) only need 2 shifts -- pixel offset 0 (sprite/dest
-// shift variants on every platform: shift 0 for even x (sprite byte
+// byte boundaries align) and offset 1 (every dest byte combines two
-// boundaries match destination byte boundaries) and shift 1 for odd
+// sprite bytes' nibbles). Planar ports (Amiga -- 8 px per plane byte)
-// x (each destination byte combines two adjacent sprite bytes'
+// need 8 shifts: one for each x % 8 alignment, so smooth horizontal
-// nibbles).
+// motion at any pixel position uses pre-shifted source bytes without
-#define JOEY_SPRITE_SHIFT_COUNT 2
+// runtime bit-shifting. Allocate the max so routineOffsets[] has
 // slots for every variant; chunky ports leave shifts 2..7 as
 // SPRITE_NOT_COMPILED, planar ports use all 8.
 #define JOEY_SPRITE_SHIFT_COUNT 8
 typedef enum {
    SPRITE_FLAGS_NONE = 0
--- a/include/joey/surface.h
+++ b/include/joey/surface.h
@ -58,4 +58,13 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path);
 // identity (no reallocation).
 bool surfaceLoadFile(SurfaceT *dst, const char *path);
 // FNV-1a 32-bit hash of the surface's logical pixel content (color
 // indices in row-major order, 0..15 per pixel). Same logical pixels
 // produce the same hash on every port regardless of internal storage
 // format -- so a hash captured on IIgs (chunky) compares directly
 // against the same op's output on Amiga (planar) once the planar
 // rewrite is done. Used by the UBER validation harness to
 // pixel-compare ports against an IIgs golden reference.
 uint32_t surfaceHash(const SurfaceT *s);
 #endif
--- a/make/amiga.mk
+++ b/make/amiga.mk
@ -13,7 +13,7 @@ BINDIR   := $(BUILD)/bin
 # independently. -I on $(SRC_PORT)/amiga lets ptplayer.h resolve
 # <SDI_compiler.h> from the port-local shim alongside our HAL code.
 PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer
-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR)
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) -MMD -MP $(CFLAGS_EXTRA)
 # OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses
 # CIA-B + audio.device interrupts via the OS rather than taking over
 # Paula directly), matching the way our HAL cooperates with Intuition.
@ -52,6 +52,7 @@ LIB_OBJS := \
    $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \
    $(BUILD)/obj/port/ptplayer.o \
    $(BUILD)/obj/codegen/spriteEmit68k.o \
    $(BUILD)/obj/codegen/spriteEmitPlanar68k.o \
    $(BUILD)/obj/codegen/spriteCompile.o
 LIB := $(LIBDIR)/libjoey.a
@ -156,3 +157,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx
 clean-amiga:
 	rm -rf $(BUILD)
 # Pull in per-object header-dependency files generated by gcc -MMD/-MP.
 # Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
 # the .c files that include it, leaving a frankenstein binary where
 # different TUs see different struct layouts.
 -include $(LIB_OBJS:.o=.d)
--- a/make/atarist.mk
+++ b/make/atarist.mk
@ -7,7 +7,7 @@ BUILD    := $(REPO_DIR)/build/$(PLATFORM)
 LIBDIR   := $(BUILD)/lib
 BINDIR   := $(BUILD)/bin
-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K)
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) -MMD -MP
 LDFLAGS :=
 # libxmp-lite shared with the DOS port. Built as a static archive that
@ -148,3 +148,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx
 clean-atarist:
 	rm -rf $(BUILD)
 # Pull in per-object header-dependency files generated by gcc -MMD/-MP.
 # Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
 # the .c files that include it, leaving a frankenstein binary where
 # different TUs see different struct layouts.
 -include $(LIB_OBJS:.o=.d)
--- a/make/dos.mk
+++ b/make/dos.mk
@ -7,7 +7,7 @@ BUILD    := $(REPO_DIR)/build/$(PLATFORM)
 LIBDIR   := $(BUILD)/lib
 BINDIR   := $(BUILD)/bin
-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -MMD -MP
 ASFLAGS := -f coff
 LDFLAGS :=
@ -138,3 +138,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx
 clean-dos:
 	rm -rf $(BUILD)
 # Pull in per-object header-dependency files generated by gcc -MMD/-MP.
 # Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
 # the .c files that include it, leaving a frankenstein binary where
 # different TUs see different struct layouts.
 -include $(LIB_OBJS:.o=.d)
--- a/make/iigs.mk
+++ b/make/iigs.mk
@ -51,11 +51,11 @@ IIGS_MERLIN  := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32
 LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS)
-# HELLO and PATTERN are intentionally omitted from this list. The UBER
+# HELLO is omitted from the disk because UBER exercises everything it
-# demo (below) exercises every public API, including what those two
+# does and the disk was tight. PATTERN is included as the SCB / palette
-# small examples covered, and the IIgs disk image was running out of
+# golden-reference for cross-port debugging.
-# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/
+PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c
-# for reference and for other ports that want them.
+PATTERN_BIN := $(BINDIR)/PATTERN
 DRAW_SRC    := $(EXAMPLES)/draw/draw.c
 DRAW_BIN    := $(BINDIR)/DRAW
 KEYS_SRC    := $(EXAMPLES)/keys/keys.c
@ -120,24 +120,44 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh
 # everywhere, so library asm can take SurfaceT* args via one
 # consistent ABI (small-mm 16-bit pointers truncated bank bytes,
 # which broke any asm that wanted to address bank-1 stage memory).
 # Per-binary header dependency files. iix-build.sh -M emits one .d
 # alongside each binary covering every header transitively included
 # by the C sources in that binary's build. Pulled in via -include at
 # the bottom of this file so editing a shared header (e.g.
 # surfaceInternal.h) triggers a rebuild of every IIgs binary that
 # transitively depends on it.
 DEP_DIR := $(BUILD)/dep
 PATTERN_DEP := $(DEP_DIR)/PATTERN.d
 DRAW_DEP    := $(DEP_DIR)/DRAW.d
 KEYS_DEP    := $(DEP_DIR)/KEYS.d
 JOY_DEP     := $(DEP_DIR)/JOY.d
 SPRITE_DEP  := $(DEP_DIR)/SPRITE.d
 UBER_DEP    := $(DEP_DIR)/UBER.d
 AUDIO_DEP   := $(DEP_DIR)/AUDIO.d
 $(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
 	@mkdir -p $(dir $@) $(DEP_DIR)
 	$(IIGS_BUILD) -b -M $(PATTERN_DEP) $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 $(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $@) $(DEP_DIR)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
+	$(IIGS_BUILD) -b -M $(DRAW_DEP) $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 $(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $@) $(DEP_DIR)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS)
+	$(IIGS_BUILD) -b -M $(KEYS_DEP) $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 $(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $@) $(DEP_DIR)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS)
+	$(IIGS_BUILD) -b -M $(JOY_DEP) $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $@) $(DEP_DIR)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
+	$(IIGS_BUILD) -b -M $(SPRITE_DEP) $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 # UBER bumps user stack to 16 KB. ORCA-C's default user stack is small
@ -147,8 +167,8 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
 # decimal formatter in uber.c also uses larger stack-local buffers
 # (line[96], num[16]) than typical demos. 16 KB is plenty of headroom.
 $(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $@) $(DEP_DIR)
-	$(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
+	$(IIGS_BUILD) -b -s 16384 -M $(UBER_DEP) $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 # Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime
@ -170,17 +190,23 @@ AUDIO_DATA_FILES := $(AUDIO_SFX)
 endif
 $(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $@) $(DEP_DIR)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
+	$(IIGS_BUILD) -b -M $(AUDIO_DEP) $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@
 # Assemble a ProDOS 2img containing the examples, ready to mount in
 # GSplus alongside a GS/OS boot volume.
 iigs-disk: $(DISK_IMG)
-$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
+$(DISK_IMG): $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
 	@mkdir -p $(dir $@)
-	$(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)
+	$(IIGS_PACKAGE) $@ $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)
 clean-iigs:
 	rm -rf $(BUILD)
 # Pull in per-binary header-dependency files generated by iix-build.sh -M.
 # Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
 # IIgs binaries that include it -- the IIgs's iix toolchain has no native
 # -MMD analog, so iix-build.sh shells out to host gcc for the scan.
 -include $(PATTERN_DEP) $(DRAW_DEP) $(KEYS_DEP) $(JOY_DEP) $(SPRITE_DEP) $(UBER_DEP) $(AUDIO_DEP)
--- a/src/codegen/spriteCompile.c
+++ b/src/codegen/spriteCompile.c
@ -14,6 +14,7 @@
 #include "joey/sprite.h"
 #include "joey/surface.h"
 #include "codegenArenaInternal.h"
 #include "hal.h"
 #include "spriteEmitter.h"
 #include "spriteInternal.h"
 #include "surfaceInternal.h"
@ -33,7 +34,9 @@
 static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 #if defined(JOEYLIB_PLATFORM_DOS)
    return spriteEmitDrawX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+#elif defined(JOEYLIB_PLATFORM_AMIGA)
    return spriteEmitDrawPlanar68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_ATARIST)
    return spriteEmitDraw68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitDrawIigs(out, sp, shift);
@ -51,7 +54,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 #if defined(JOEYLIB_PLATFORM_DOS)
    return spriteEmitSaveX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+#elif defined(JOEYLIB_PLATFORM_AMIGA)
    return spriteEmitSavePlanar68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_ATARIST)
    return spriteEmitSave68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitSaveIigs(out, sp, shift);
@ -65,7 +70,9 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 #if defined(JOEYLIB_PLATFORM_DOS)
    return spriteEmitRestoreX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+#elif defined(JOEYLIB_PLATFORM_AMIGA)
    return spriteEmitRestorePlanar68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_ATARIST)
    return spriteEmitRestore68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitRestoreIigs(out, sp, shift);
@ -114,6 +121,13 @@ bool spriteCompile(SpriteT *sp) {
    if (sp->tileData == NULL) {
        return false;
    }
    /* Amiga (post-Phase 9) uses spriteEmitPlanar68k.c which writes
     * directly to bitplanes. DRAW emits a unique pre-shifted variant
     * per shift in 0..7 (smooth horizontal motion at any pixel x);
     * SAVE/RESTORE emit only shift 0 and shift 1 since shifted variants
     * 1..7 share identical bytes (plain memcpy of widthTiles+1 plane
     * bytes per row). The post-emit pass below aliases slots 2..7
     * for save/restore to slot 1's bytes. */
    scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES);
    if (scratch == NULL) {
@ -150,6 +164,16 @@ bool spriteCompile(SpriteT *sp) {
            }
        }
    }
 #if defined(JOEYLIB_PLATFORM_AMIGA)
    /* Save/restore bytes for any non-zero shift are identical (plain
     * memcpy of widthTiles+1 plane bytes per row). The emitter emits
     * them once at slot 1; alias slots 2..7 here so the dispatcher
     * gate (sprite.c) sees them as compiled. */
    for (shift = 2; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
        sp->routineOffsets[shift][SPRITE_OP_SAVE]    = sp->routineOffsets[1][SPRITE_OP_SAVE];
        sp->routineOffsets[shift][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE];
    }
 #endif
    sp->slot = slot;
    free(scratch);
    return true;
@ -554,6 +578,112 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    }
 }
 #elif defined(JOEYLIB_PLATFORM_AMIGA)
 /* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with
 * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to
 * bitplanes. Compute byteOff = y*40 + x/8 and pass plane[i]+byteOff
 * as the 4 plane args. shift = x % 8 selects the variant: DRAW has a
 * distinct pre-shifted routine for every shift 0..7, while SAVE and
 * RESTORE are emitted for shifts 0 and 1 only and spriteCompile
 * aliases slots 2..7 to the shift-1 routine.
 *
 * Callers in src/core/sprite.c (spriteDraw / spriteSaveUnder /
 * spriteRestoreUnder) should already have gated on
 * routineOffsets[shift][op] != SPRITE_NOT_COMPILED; when a routine is
 * not compiled they fall back to the interpreter, which handles
 * arbitrary x via halSpriteDrawPlanes / halSpriteSavePlanes /
 * halSpriteRestorePlanes. */
 #define AMIGA_BYTES_PER_ROW_LOCAL 40
 void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    /* Shape of an emitted planar draw routine: one pointer per
     * bitplane, each already advanced to the sprite's first byte. */
    typedef void (*PlanarDrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
    uint8_t      *plane[4];
    uint8_t       variant;
    uint16_t      planeOffset;
    PlanarDrawFn  routine;
    /* The low three bits of x choose the pre-shifted routine; the
     * remaining bits give the byte column inside the plane row. */
    variant     = (uint8_t)(x & 7);
    planeOffset = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)x >> 3));
    /* Only plane 0 is NULL-checked; a NULL there abandons the draw. */
    plane[0] = halSurfacePlanePtr(dst, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(dst, 1);
    plane[2] = halSurfacePlanePtr(dst, 2);
    plane[3] = halSurfacePlanePtr(dst, 3);
    /* Routine address = arena base + this sprite's slot offset + the
     * DRAW offset stored in routineOffsets for the chosen shift. */
    routine = (PlanarDrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_DRAW]);
    routine(plane[0] + planeOffset, plane[1] + planeOffset, plane[2] + planeOffset, plane[3] + planeOffset);
 }
 void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    /* Shape of an emitted planar save routine: four plane pointers
     * followed by the destination backup buffer. */
    typedef void (*PlanarSaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
    uint8_t      *plane[4];
    uint8_t       variant;
    int16_t       alignedX;
    uint16_t      spanPx;
    uint16_t      rowsPx;
    uint16_t      planeOffset;
    PlanarSaveFn  routine;
    variant  = (uint8_t)(x & 7);
    /* The saved region always starts on an 8-pixel (one plane byte)
     * boundary at or to the left of x. */
    alignedX = (int16_t)(x & ~7);
    rowsPx   = (uint16_t)(sp->heightTiles * 8);
    spanPx   = (uint16_t)(sp->widthTiles  * 8);
    /* A non-zero shift spills into one more plane byte per row, so
     * the saved region is 8 pixels wider. */
    spanPx   = (uint16_t)(spanPx + (variant != 0u ? 8u : 0u));
    planeOffset = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)alignedX >> 3));
    /* Record the geometry first; it is filled in even when the plane
     * lookup below fails and nothing is copied. */
    backup->sprite    = sp;
    backup->x         = alignedX;
    backup->y         = y;
    backup->width     = spanPx;
    backup->height    = rowsPx;
    /* 4 planes * rows * (spanPx / 8) bytes == rows * spanPx / 2. */
    backup->sizeBytes = (uint16_t)((uint16_t)rowsPx * (spanPx >> 1));
    plane[0] = halSurfacePlanePtr(src, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(src, 1);
    plane[2] = halSurfacePlanePtr(src, 2);
    plane[3] = halSurfacePlanePtr(src, 3);
    routine = (PlanarSaveFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_SAVE]);
    routine(plane[0] + planeOffset, plane[1] + planeOffset, plane[2] + planeOffset, plane[3] + planeOffset, backup->bytes);
 }
 void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    /* Shape of an emitted planar restore routine: four plane pointers
     * followed by the source backup buffer. */
    typedef void (*PlanarRestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
    SpriteT         *owner;
    uint8_t         *plane[4];
    uint8_t          variant;
    uint16_t         planeOffset;
    PlanarRestoreFn  routine;
    owner = backup->sprite;
    /* backup->x was aligned down to 8 pixels when saving, so the
     * original shift cannot be read back from it. The saved width
     * carries that information instead: wider than widthTiles*8 means
     * the save used a shifted variant. Slot 1 is used for every
     * non-zero shift. */
    variant = 0u;
    if (backup->width > (uint16_t)(owner->widthTiles * 8)) {
        variant = 1u;
    }
    planeOffset = (uint16_t)((uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)backup->x >> 3));
    plane[0] = halSurfacePlanePtr(dst, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(dst, 1);
    plane[2] = halSurfacePlanePtr(dst, 2);
    plane[3] = halSurfacePlanePtr(dst, 3);
    routine = (PlanarRestoreFn)(codegenArenaBase() + owner->slot->offset + owner->routineOffsets[variant][SPRITE_OP_RESTORE]);
    routine(plane[0] + planeOffset, plane[1] + planeOffset, plane[2] + planeOffset, plane[3] + planeOffset, backup->bytes);
 }
 #else
 void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
--- a/src/codegen/spriteEmit68k.c
+++ b/src/codegen/spriteEmit68k.c
@ -166,6 +166,13 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint8_t  value;
    uint8_t  opaqueMask;
    // Chunky 4bpp has only two nibble-alignment positions; the
    // dispatcher uses x & 1 so shifts 2..7 are unreachable. Bail
    // early so the arena slot stays SPRITE_NOT_COMPILED.
    if (shift > 1u) {
        return 0u;
    }
    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
@ -225,6 +232,10 @@ uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;
    if (shift > 1u) {
        return 0u;
    }
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
@ -248,6 +259,10 @@ uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;
    if (shift > 1u) {
        return 0u;
    }
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
--- a/src/codegen/spriteEmitIigs.c
+++ b/src/codegen/spriteEmitIigs.c
@ -189,6 +189,10 @@ uint16_t spriteEmitSaveIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t spriteBytesPerRow;
    uint16_t copyBytes;
    if (shift > 1u) {
        return 0u;
    }
    heightPx          = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
    copyBytes         = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
@ -205,6 +209,10 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t spriteBytesPerRow;
    uint16_t copyBytes;
    if (shift > 1u) {
        return 0u;
    }
    heightPx          = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
    copyBytes         = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
@ -258,6 +266,10 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint8_t  nextOpaqueMask;
    bool     wide;
    if (shift > 1u) {
        return 0u;
    }
    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
--- a/src/codegen/spriteEmitPlanar68k.c
+++ b/src/codegen/spriteEmitPlanar68k.c
@ -0,0 +1,505 @@
 // Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow).
 //
 // Emits PIC routines that write directly to the four bitplanes via 4
 // address-register pointers (a0..a3 = plane[0..3] base + byteOff,
 // where byteOff = y*40 + x/8 -- the dispatcher pre-computes this).
 //
 // Calling convention (cdecl on m68k-amigaos-gcc):
 //   draw(p0, p1, p2, p3):
 //     args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane.
 //     loaded into a0..a3 by the prologue.
 //   save(p0, p1, p2, p3, backup):
 //     5 args; backup at 20(sp), loaded into a4.
 //   restore(p0, p1, p2, p3, backup):
 //     same as save but reads backup, writes planes.
 //
 // Per-byte plane write encoding decisions:
 //   - all-transparent (mask=0):  skip the byte entirely
 //   - all-opaque (mask=0xFF):    move.b #imm, d16(an)        (6 bytes)
 //   - mixed (0<mask<0xFF):       move.b d16(an), d0;
 //                                andi.b #~mask, d0;
 //                                ori.b  #imm, d0;
 //                                move.b d0, d16(an)          (4+4+4+4 = 16 bytes)
 //
 // Per row advance: 4 plane pointers each get adda.w #SURFACE_WIDTH/8
 // = adda.w #40, an  (4 bytes encoded each, 16 bytes total per row).
 // We omit the advance after the last row.
 //
 // Shift handling: shifts 0..7 are pre-baked. The dispatcher selects
 // the variant via x % 8 and pre-computes byteOff = y*40 + (x & ~7)/8
 // (i.e. round x DOWN to 8-pixel boundary). The variant for shift s
 // then emits to (widthTiles + 1) plane bytes per row when s != 0
 // (the rightmost shift bits spill into one extra plane byte) and to
 // widthTiles plane bytes per row when s == 0.
 //
 // The emitter assumes sprite width is a multiple of 8 (= a multiple
 // of one tile = a multiple of 8 pixels) so plane bytes per row are
 // integer. JoeyLib sprites are always tile-multiple by API contract.
 #include "joey/sprite.h"
 #include "joey/surface.h"
 #include "spriteEmitter.h"
 #include "spriteInternal.h"
 // ----- Constants -----
 #define TILE_PIXELS              8
 #define TILE_BYTES               32
 #define TILE_BYTES_PER_ROW       4
 #define TRANSPARENT_NIBBLE       0
 #define AMIGA_BITPLANES          4
 #define AMIGA_BYTES_PER_ROW      40
 // ----- Instruction encoding helpers -----
 // Stores `value` into out[0..1] most-significant byte first and
 // returns the byte count (always 2) so callers can advance a cursor.
 static uint16_t writeBE16(uint8_t *out, uint16_t value) {
    const uint8_t lowByte  = (uint8_t)(value & 0x00FFu);
    const uint8_t highByte = (uint8_t)(value >> 8);
    out[1] = lowByte;
    out[0] = highByte;
    return (uint16_t)2;
 }
 // movea.l d16(SP), an  -- load arg at SP+disp into An.
 // Encoding: 0010 nnn 001 101 111  + disp16
 //           = 0x206F + (n << 9), where n is dst An.
 //   a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F.
 static const uint16_t kMoveaSpToAn[] = {
    0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu
 };
 // adda.w #imm, an  -- adds 16-bit signed imm to An (sign-extended).
 // Encoding: 1101 nnn 011 111 100  + imm
 //           = 0xD0FC + (n << 9).
 static const uint16_t kAddaWImmToAn[] = {
    0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu
 };
 // ANDI.B #imm, D0  -- 4 bytes (opcode word + imm word, byte in low half).
 // Opcode: 0000 0010 00 000 000  (size=byte, mode=Dn, reg=D0)
 #define ANDI_B_IMM_D0   0x0200u
 // ORI.B  #imm, D0  -- 4 bytes (opcode word + imm word, byte in low half).
 // Opcode: 0000 0000 00 000 000
 #define ORI_B_IMM_D0    0x0000u
 // MOVE.B d16(An), D0  -- 4 bytes (opcode + disp).
 // Encoding: 0001 000 000 mode reg
 //   = size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn),
 //     src mode=101 (d16,An), src reg=An.
 //   = 0001000 000 101 nnn = 0x1028 + An.
 static const uint16_t kMoveBD16AnToD0[] = {
    0x1028u, 0x1029u, 0x102Au, 0x102Bu
 };
 // MOVE.B D0, d16(An)  -- 4 bytes (opcode + disp).
 // Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9).
 static const uint16_t kMoveBD0ToD16An[] = {
    0x1140u, 0x1340u, 0x1540u, 0x1740u
 };
 // MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp).
 // Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9).
 //   (Was 0x113C earlier -- that's mode=100=predec; mode=101=d16(An)
 //    is the bit difference. Predec emits a 4-byte instruction with no
 //    disp word, so the byte stream went out of sync and every
 //    subsequent instruction decoded into garbage.)
 static const uint16_t kMoveBImmToD16An[] = {
    0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu
 };
 // MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp).  -- used by restore (backup -> planes)
 // Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9).
 static const uint16_t kMoveBA4PostincToD16An[] = {
    0x115Cu, 0x135Cu, 0x155Cu, 0x175Cu
 };
 // MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp).  -- used by save (planes -> backup)
 // Encoding: 0001 100 011 101 nnn = 0x18E8 + An.
 //   size=01 (byte), dst reg=100 (A4), dst mode=011 ((An)+),
 //   src mode=101 (d16,An), src reg=An.
 static const uint16_t kMoveBD16AnToA4Postinc[] = {
    0x18E8u, 0x18E9u, 0x18EAu, 0x18EBu
 };
 // MOVEM.L reglist, -(SP)  -- 4 bytes (opcode + reglist mask).
 //   Opcode 0x48E7. Predec mask is REVERSED vs all other modes:
 //   bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2,
 //   bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7.
 #define MOVEM_L_PUSH_OPCODE   0x48E7u
 #define MOVEM_L_MASK_A2_A3    0x0030u  /* bits 5,4 = A2,A3 (predec order) */
 #define MOVEM_L_MASK_A2_A3_A4 0x0038u  /* bits 5,4,3 = A2,A3,A4 */
 // MOVEM.L (SP)+, reglist  -- 4 bytes (opcode + reglist mask).
 //   Opcode 0x4CDF. Postinc mask follows the standard layout:
 //   bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7.
 #define MOVEM_L_POP_OPCODE    0x4CDFu
 #define MOVEM_L_MASK_POP_A2_A3    0x0C00u  /* bits 11,10 = A3,A2 */
 #define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u  /* bits 12,11,10 = A4,A3,A2 */
 // RTS opcode.
 #define OPCODE_RTS            0x4E75u
 // ----- Emit helpers -----
 // For shift 0 (byte-aligned x), the sprite's chunky tile data converts
 // directly to plane bytes without any sub-byte shifting. For each
 // (row, col-byte, plane) we extract the 8 plane bits from 4 chunky
 // bytes (= 8 pixels) and produce one plane byte; we also produce a
 // mask byte indicating which pixel positions are non-transparent
 // (any plane bit != 0 in the source means non-transparent if
 // transparent index is 0, the JoeyLib convention).
 //
 // Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows
 // x 4 chunky bytes (32 bytes). Tiles laid out row-major within the
 // sprite. For plane-byte column `c` of row `r`:
 //   tileX = c (since each plane byte covers exactly one tile column)
 //   tileY = r / 8
 //   inTileY = r % 8
 //   chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3
 //
 // `col` must be in [0, widthTiles); callers handle out-of-range cols
 // (used when computing shifted variants that span widthTiles+1 output
 // bytes per row) by passing a sentinel and checking against widthTiles
 // before invoking this helper.
 static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col,
                                uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
 {
    const uint8_t *rowBytes;   /* the 4 chunky bytes (8 pixels) for this row */
    uint8_t        acc[4];     /* plane byte being built, one per bitplane   */
    uint8_t        opaque;     /* bit set for every non-transparent pixel    */
    uint8_t        packed;
    uint8_t        colour;
    uint8_t        bit;
    uint16_t       px;
    uint16_t       k;
    /* Tile (row / 8, col) in the row-major tile grid, 32 bytes per
     * tile, then the in-tile row at 4 bytes per row. */
    rowBytes = sp->tileData + (uint32_t)(((row >> 3) * sp->widthTiles + col) * 32u);
    rowBytes = rowBytes + (row & 7u) * 4u;
    acc[0] = 0u;
    acc[1] = 0u;
    acc[2] = 0u;
    acc[3] = 0u;
    opaque = 0u;
    for (px = 0; px < 8u; px++) {
        /* Two pixels per chunky byte: even pixel in the high nibble,
         * odd pixel in the low nibble. */
        packed = rowBytes[px >> 1];
        if ((px & 1u) != 0u) {
            colour = (uint8_t)(packed & 0x0Fu);
        } else {
            colour = (uint8_t)(packed >> 4);
        }
        if (colour == TRANSPARENT_NIBBLE) {
            continue;
        }
        /* Leftmost pixel maps to the most significant plane bit. */
        bit    = (uint8_t)(0x80u >> px);
        opaque = (uint8_t)(opaque | bit);
        /* Bit k of the colour index goes to bitplane k. */
        for (k = 0; k < 4u; k++) {
            if (((colour >> k) & 1u) != 0u) {
                acc[k] = (uint8_t)(acc[k] | bit);
            }
        }
    }
    *maskByte = opaque;
    for (k = 0; k < 4u; k++) {
        planeBytes[k] = acc[k];
    }
 }
 // Shifted variant: produces 4 plane bytes and 1 mask byte for output
 // column `outCol` (0..widthTiles inclusive) of row `row` when the
 // sprite is shifted right by `shift` pixels (1..7). For shift 0,
 // callers should use planeByteAndMaskAt directly (faster, no spill).
 //
 // Each output byte is composed of bits drawn from up to two source
 // plane bytes:
 //   leftPart  = src[outCol-1] << (8 - shift)   (high (shift) bits)
 //   rightPart = src[outCol]   >> shift          (low (8-shift) bits)
 // with src[-1] and src[widthTiles] treated as 0/transparent. The
 // resulting plane byte is leftPart | rightPart; the mask byte is the
 // shifted union of the per-byte source masks.
 static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol,
                                     uint8_t shift, uint16_t widthTiles,
                                     uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
 {
    uint8_t  prevPlanes[AMIGA_BITPLANES] = { 0u };  /* source column outCol-1 */
    uint8_t  currPlanes[AMIGA_BITPLANES] = { 0u };  /* source column outCol   */
    uint8_t  prevMask = 0u;
    uint8_t  currMask = 0u;
    uint16_t window;
    uint8_t  plane;
    /* Source columns outside [0, widthTiles) stay all-zero, i.e.
     * fully transparent. */
    if (outCol >= 1u && outCol <= widthTiles) {
        planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), prevPlanes, &prevMask);
    }
    if (outCol < widthTiles) {
        planeByteAndMaskAt(sp, row, outCol, currPlanes, &currMask);
    }
    /* Lay the two source bytes side by side (previous column in the
     * high byte) and slide the 16-bit window right by `shift`: the
     * low byte is then prev's low bits followed by curr's high bits. */
    window    = (uint16_t)(((uint16_t)prevMask << 8) | currMask);
    *maskByte = (uint8_t)((window >> shift) & 0xFFu);
    for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
        window            = (uint16_t)(((uint16_t)prevPlanes[plane] << 8) | currPlanes[plane]);
        planeBytes[plane] = (uint8_t)((window >> shift) & 0xFFu);
    }
 }
 // Emit code that merges one plane byte into d16(an), where d16 is the
 // byte column within the current row (the An pointers are re-based to
 // the start of each row by adda.w). Nothing is emitted for a fully
 // transparent byte. The all-opaque encoding is shorter than the mixed
 // one, which cuts code size when many pixels are opaque (typical for
 // sprite interiors).
 static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor,
                                      uint8_t an, uint8_t disp,
                                      uint8_t maskByte, uint8_t srcByte)
 {
    uint16_t words[8];    /* instruction words to append, in order */
    uint8_t  wordCount;
    uint8_t  k;
    /* Fully transparent byte: leave the destination untouched. */
    if (maskByte == 0u) {
        return cursor;
    }
    if (maskByte == 0xFFu) {
        /* Fully opaque: move.b #src, d16(an). */
        words[0]  = kMoveBImmToD16An[an];
        words[1]  = (uint16_t)srcByte;
        words[2]  = (uint16_t)disp;
        wordCount = 3u;
    } else {
        /* Mixed: move.b d16(an),d0 / andi.b #~mask,d0 /
         * ori.b #src,d0 / move.b d0,d16(an). */
        words[0]  = kMoveBD16AnToD0[an];
        words[1]  = (uint16_t)disp;
        words[2]  = ANDI_B_IMM_D0;
        words[3]  = (uint16_t)((~maskByte) & 0xFFu);
        words[4]  = ORI_B_IMM_D0;
        words[5]  = (uint16_t)srcByte;
        words[6]  = kMoveBD0ToD16An[an];
        words[7]  = (uint16_t)disp;
        wordCount = 8u;
    }
    for (k = 0; k < wordCount; k++) {
        cursor += writeBE16(out + cursor, words[k]);
    }
    return cursor;
 }
 // ----- Public API -----
 // Emits the DRAW routine for `sp` pre-shifted right by `shift` pixels
 // (0..7) into `out` and returns the number of bytes written. Returns
 // 0 without writing anything when shift > 7.
 uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;
    uint16_t y;
    uint16_t byteCol;
    uint16_t rows;
    uint16_t tilesWide;
    uint16_t rowBytes;          /* plane bytes touched per row */
    uint8_t  bits[AMIGA_BITPLANES];
    uint8_t  opaque;
    uint8_t  plane;
    if (shift > 7u) {
        return 0u;
    }
    emitted   = 0;
    rows      = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    tilesWide = (uint16_t)sp->widthTiles;
    /* A shifted sprite spills into one extra plane byte per row. */
    rowBytes  = tilesWide;
    if (shift != 0u) {
        rowBytes = (uint16_t)(tilesWide + 1u);
    }
    /* Prologue: a2 and a3 are callee-saved and get overwritten with
     * plane pointers, so push them. The push moves the stack args up
     * by 8 bytes: plane i is then found at 12 + 4*i from SP. */
    emitted += writeBE16(out + emitted, MOVEM_L_PUSH_OPCODE);
    emitted += writeBE16(out + emitted, MOVEM_L_MASK_A2_A3);
    for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
        emitted += writeBE16(out + emitted, kMoveaSpToAn[plane]);
        emitted += writeBE16(out + emitted, (uint16_t)(12u + 4u * plane));
    }
    for (y = 0; y < rows; y++) {
        /* Step all four plane pointers down one scanline before every
         * row except the first, so no advance trails the last row. */
        if (y != 0u) {
            for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
                emitted += writeBE16(out + emitted, kAddaWImmToAn[plane]);
                emitted += writeBE16(out + emitted, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
        for (byteCol = 0; byteCol < rowBytes; byteCol++) {
            if (shift != 0u) {
                planeByteAndMaskShifted(sp, y, byteCol, shift, tilesWide, bits, &opaque);
            } else {
                planeByteAndMaskAt(sp, y, byteCol, bits, &opaque);
            }
            for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
                emitted = emitMergeByteToD16An(out, emitted, plane, (uint8_t)byteCol,
                                               opaque, bits[plane]);
            }
        }
    }
    /* Epilogue: pop a2/a3 and return. */
    emitted += writeBE16(out + emitted, MOVEM_L_POP_OPCODE);
    emitted += writeBE16(out + emitted, MOVEM_L_MASK_POP_A2_A3);
    emitted += writeBE16(out + emitted, OPCODE_RTS);
    return emitted;
 }
 // SAVE: planes -> backup. backup is one contiguous 4*H*W/8 byte buffer
 // laid out as 4 plane stripes, matching halSpriteSavePlanes format
 // (so cross-platform save buffer is interchangeable).
 //
 // Per row: for each plane, copy bytesPerRow bytes from d16(an) to
 // (a4)+. After the row's reads, the planes need to advance by 40,
 // while a4 advances naturally via post-increment.
 //
 // Plane stripes are sequential in backup. We could either (a) do all
 // rows of plane 0, then plane 1, etc. (matches halSpriteSavePlanes
 // layout), or (b) interleave rows of all 4 planes (different layout).
 // halSpriteSavePlanes does (a) -- 4 separate plane stripes. The
 // emitted code below matches that layout for compat.
 // Parameters:
 //   out   - buffer receiving the emitted machine code.
 //   sp    - sprite whose widthTiles/heightTiles size the copy.
 //   shift - 0 copies widthTiles plane bytes per row; 1 copies
 //           widthTiles+1 (the shifted sprite spills one byte).
 // Returns the number of bytes written to `out`, or 0 (nothing
 // written) for shift > 1.
 uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t heightPx;
    uint16_t bytesPerRow;
    uint8_t  i;
    /* Shifts 2..7 reuse shift 1's bytes (identical memcpy). The
     * spriteCompile post-emit pass aliases their routineOffsets to
     * slot 1 so this routine is emitted once. */
    if (shift > 1u) {
        return 0u;
    }
    cursor      = 0;
    heightPx    = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));
    /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane
     * pointers + backup pointer. After the push, all stack arg disps
     * shift by +12 (three longs). */
    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
    }
    /* a4 = backup. */
    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));
    /* Plane-major copy: every row of plane 0, then every row of
     * plane 1, and so on, so the backup holds four consecutive plane
     * stripes. Within a plane, each row copies bytesPerRow bytes from
     * d16(an) to (a4)+ and then steps an down one scanline (skipped
     * after the last row).
     *
     * No rewind of an is emitted between planes: plane i is addressed
     * only through its own register a<i>, which is never read again
     * once its stripe is done, so undoing the row advances would be
     * dead code in the emitted routine. */
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        for (row = 0; row < heightPx; row++) {
            for (col = 0; col < bytesPerRow; col++) {
                cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]);
                cursor += writeBE16(out + cursor, (uint16_t)col);
            }
            if (row + 1u < heightPx) {
                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }
    /* Epilogue: restore a2/a3/a4 and return. */
    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
    cursor += writeBE16(out + cursor, OPCODE_RTS);
    return cursor;
 }
 // RESTORE: backup -> planes. Mirror of save. Uses MOVE.B (a4)+, d16(an).
 // Parameters:
 //   out   - buffer receiving the emitted machine code.
 //   sp    - sprite whose widthTiles/heightTiles size the copy.
 //   shift - 0 copies widthTiles plane bytes per row; 1 copies
 //           widthTiles+1 (the shifted sprite spills one byte).
 // Returns the number of bytes written to `out`, or 0 (nothing
 // written) for shift > 1. The copy order mirrors
 // spriteEmitSavePlanar68k so a buffer filled by the save routine for
 // the same shift is replayed byte-for-byte.
 uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t heightPx;
    uint16_t bytesPerRow;
    uint8_t  i;
    /* Only shifts 0 and 1 are emitted here. */
    if (shift > 1u) {
        return 0u;
    }
    cursor      = 0;
    heightPx    = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));
    /* Callee-save a2/a3/a4; arg disps shift by +12. */
    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
    }
    /* a4 = backup (fifth argument). */
    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));
    /* Plane-major replay: for each plane, each row copies bytesPerRow
     * bytes from (a4)+ to d16(an), then steps an down one scanline
     * (skipped after the last row).
     *
     * No rewind of an is emitted between planes: plane i is addressed
     * only through its own register a<i>, which is never read again
     * once its stripe is done, so undoing the row advances would be
     * dead code in the emitted routine. */
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        for (row = 0; row < heightPx; row++) {
            for (col = 0; col < bytesPerRow; col++) {
                cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]);
                cursor += writeBE16(out + cursor, (uint16_t)col);
            }
            if (row + 1u < heightPx) {
                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }
    /* Epilogue: restore a2/a3/a4 and return. */
    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
    cursor += writeBE16(out + cursor, OPCODE_RTS);
    return cursor;
 }
--- a/src/codegen/spriteEmitX86.c
+++ b/src/codegen/spriteEmitX86.c
@ -200,6 +200,10 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint8_t   v3;
    uint8_t   m;
    if (shift > 1u) {
        return 0u;
    }
    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
@ -313,6 +317,10 @@ uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;
    if (shift > 1u) {
        return 0u;
    }
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
@ -339,6 +347,10 @@ uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;
    if (shift > 1u) {
        return 0u;
    }
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
--- a/src/codegen/spriteEmitter.h
+++ b/src/codegen/spriteEmitter.h
@ -42,4 +42,19 @@ uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitSave68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
 // Planar 68k emitters (Amiga). Distinct from the chunky 68k emitters
 // above because the destination addressing is across 4 separate
 // bitplane buffers, not a single packed-pixel surface. Calling
 // convention for the emitted bytes (cdecl):
 //   void draw    (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
 //   void save    (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
 //   void restore (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
 // Each pi is plane_base + byteOff (= y*40 + x/8 already added by the
 // dispatcher). Draw is emitted for every shift 0..7. Save and restore
 // are emitted for shifts 0 and 1 only and return 0 for shifts 2..7;
 // spriteCompile aliases those slots to the shift-1 routine on Amiga.
 uint16_t spriteEmitDrawPlanar68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitSavePlanar68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
 #endif
--- a/src/core/debug.c
+++ b/src/core/debug.c
@ -1,11 +1,18 @@
-// Cross-platform "where did it hang?" logger. Each call opens
+// Cross-platform "where did it hang?" logger. Holds joeylog.txt open
-// joeylog.txt, appends a line, fflushes, closes. Slow but durable
+// across calls; libc's stdio buffer absorbs writes (~4 KB) and the
-// -- the last line in the file is guaranteed to be on disk before
+// final fclose at program exit (via atexit) gets the buffer to disk.
 // any subsequent operation that might hang.
 //
-// Build only as needed for diagnostics; remove the calls when the
+// Earlier rev opened+closed per call for crash durability ("last line
-// bug is fixed. The hang on ST kept us looking at the wrong layer
+// guaranteed on disk if we hang"); that cost ~1 second per call
-// without this kind of trace.
+// through GoldenGate's ProDOS FST emulation -- a 50-line UBER run
 // burned ~5 minutes in IO. Even per-line fflush is too expensive
 // because every fflush forces an FST WRITE, and host-OS file IO time
 // isn't tracked by the IIgs VBL counter so wall-time logs underreport.
 //
 // Tradeoff: if the program crashes mid-run, buffered log lines may
 // not reach disk. For UBER and similar batch demos that's acceptable;
 // for hang-debugging where durability matters, call joeyLogFlush()
 // at the suspected hang points.
 #include <stdio.h>
 #include <stdarg.h>
@ -15,6 +22,27 @@
 static const char *kLogPath = "joeylog.txt";
 static FILE       *gLogFp   = NULL;
 /* 16 KB is enough for UBER's full log (~5 KB) plus generous headroom,
 * so the file never auto-flushes mid-run. ORCA-C / libnix default
 * buffers are only ~512 bytes; with that, a 50-line log triggers ~10
 * ProDOS / AmigaDOS WRITEs through the host FST, each of which is
 * untracked-host-time (seconds). Buffer the whole thing in memory and
 * let the atexit fclose flush once. */
 #define JOEY_LOG_BUF_BYTES 16384
 static char        gLogBuf[JOEY_LOG_BUF_BYTES];
 /* Lazy-open. Returns NULL if the open failed (silently disable). */
 static FILE *logFile(void) {
    /* Already open from an earlier call: reuse the cached stream. */
    if (gLogFp != NULL) {
        return gLogFp;
    }
    gLogFp = fopen(kLogPath, "a");
    if (gLogFp == NULL) {
        /* Open failed; callers treat NULL as "logging disabled". */
        return NULL;
    }
    /* Switch the stream to full buffering backed by the static
     * gLogBuf. The setvbuf result is deliberately ignored. */
    (void)setvbuf(gLogFp, gLogBuf, _IOFBF, sizeof(gLogBuf));
    return gLogFp;
 }
 void joeyLog(const char *msg) {
@ -22,13 +50,12 @@ void joeyLog(const char *msg) {
    if (msg == NULL) {
        return;
    }
-    fp = fopen(kLogPath, "a");
+    fp = logFile();
    if (fp == NULL) {
        return;
    }
    fputs(msg, fp);
    fputc('\n', fp);
-    fclose(fp);
 }
@ -38,7 +65,7 @@ void joeyLogF(const char *fmt, ...) {
    if (fmt == NULL) {
        return;
    }
-    fp = fopen(kLogPath, "a");
+    fp = logFile();
    if (fp == NULL) {
        return;
    }
@ -46,14 +73,27 @@ void joeyLogF(const char *fmt, ...) {
    vfprintf(fp, fmt, args);
    va_end(args);
    fputc('\n', fp);
-    fclose(fp);
+}
 /* Force any buffered log text out to the file now. Does nothing when
 * the log stream has not been opened yet. */
 void joeyLogFlush(void) {
    if (gLogFp == NULL) {
        return;
    }
    (void)fflush(gLogFp);
 }
 void joeyLogReset(void) {
-    FILE *fp;
+    if (gLogFp != NULL) {
-    fp = fopen(kLogPath, "w");
+        fclose(gLogFp);
-    if (fp != NULL) {
+        gLogFp = NULL;
-        fclose(fp);
+    }
    /* Truncate by opening for write then closing; subsequent
     * joeyLog* will reopen for append. */
    {
        FILE *fp = fopen(kLogPath, "w");
        if (fp != NULL) {
            fclose(fp);
        }
    }
 }
--- a/src/core/draw.c
+++ b/src/core/draw.c
@ -186,13 +186,17 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
            continue;
        }
-        // Highest-tier asm fast path: seed-test + walk-left + walk-right
+        /* Phase 9: planar ports have NULL s->pixels and the asm fast
-        // + 1-row fill + scan-above + scan-below + push, all in one
+         * paths take a chunky-row pointer. Skip them on planar; the C
-        // cross-segment call. The asm caches row addr / match decoder
+         * fallback below uses halSamplePixel which works on both
-        // across every sub-operation. C just pops and dispatches; this
+         * storage layouts. */
-        // path completes the entire per-seed work and computes the row
+        if (s->pixels != NULL) {
-        // address itself, so we don't pay y*160 in C unless we fall back.
+            // Highest-tier asm fast path: seed-test + walk-left + walk-right
-        {
+            // + 1-row fill + scan-above + scan-below + push, all in one
            // cross-segment call. The asm caches row addr / match decoder
            // across every sub-operation. C just pops and dispatches; this
            // path completes the entire per-seed work and computes the row
            // address itself, so we don't pay y*160 in C unless we fall back.
            bool seedMatched;
            if (halFastFloodWalkAndScans(s->pixels, x, y,
                                         matchColor, newNibble, matchEqual,
@ -203,22 +207,27 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
            }
        }
-        // Fallback path needs row; compute it here so the asm path
+        /* Fallback path: compute row only if chunky; halFastFloodWalk
-        // above doesn't pay for an unused y*160 multiply on every iter.
+         * needs it but isn't implemented on Amiga. */
-        row = &s->pixels[SURFACE_ROW_OFFSET(y)];
+        row = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(y)] : NULL;
        // Tier-2 asm fast path: combined seed test + walk-left +
        // walk-right in one cross-segment call. Falls back to the
        // pure-C walks below on ports without an asm implementation.
        {
            bool seedMatched;
-            if (halFastFloodWalk(row, x, matchColor, newNibble, matchEqual,
+            if (row != NULL && halFastFloodWalk(row, x, matchColor, newNibble, matchEqual,
-                                 &seedMatched, &leftX, &rightX)) {
+                                                &seedMatched, &leftX, &rightX)) {
                if (!seedMatched) {
                    continue;
                }
            } else if (halFloodWalkPlanes(s, x, y, matchColor, newNibble, matchEqual,
                                          &seedMatched, &leftX, &rightX)) {
                if (!seedMatched) {
                    continue;
                }
            } else {
-                pix = srcPixel(row, x);
+                pix = halSamplePixel(s, x, y);
                pixMatch = (pix == matchColor);
                if (matchEqual) {
                    if (!pixMatch) {
@ -233,7 +242,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                // Walk left to find the start of the matching run.
                leftX = x;
                while (leftX > 0) {
-                    pix = srcPixel(row, (int16_t)(leftX - 1));
+                    pix = halSamplePixel(s, (int16_t)(leftX - 1), y);
                    pixMatch = (pix == matchColor);
                    if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) {
                        break;
@ -244,7 +253,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                // Walk right to find the end.
                rightX = x;
                while (rightX < SURFACE_WIDTH - 1) {
-                    pix = srcPixel(row, (int16_t)(rightX + 1));
+                    pix = halSamplePixel(s, (int16_t)(rightX + 1), y);
                    pixMatch = (pix == matchColor);
                    if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) {
                        break;
@ -256,12 +265,18 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
        // Fill the span. Bypass fillRect's clipping wrapper: walk-out
        // already guaranteed leftX/rightX are in [0..SURFACE_WIDTH-1]
-        // and the seed-pop bounds check did the same for y.
+        // and the seed-pop bounds check did the same for y. We DO
        // need the planar dual-write (which fillRect's wrapper would
        // call), so invoke halFillRectPlanes explicitly after the
        // chunky span fill -- otherwise PLANAR_PRESENT builds (and,
        // post-Phase-9, every build) display flood-filled regions
        // as the unfilled background.
        {
            int16_t spanW = (int16_t)(rightX - leftX + 1);
            if (!halFastFillRect(s, leftX, y, (uint16_t)spanW, 1, newNibble)) {
                fillRectClipped(s, leftX, y, spanW, 1, newNibble);
            }
            halFillRectPlanes(s, leftX, y, (uint16_t)spanW, 1, newNibble);
        }
        // Scan rows above and below for run boundaries. The hot
@ -291,19 +306,26 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                    }
                    scanY  = (int16_t)(y + 1);
                }
-                scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)];
+                scanRow = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(scanY)] : NULL;
                // Prefer the combined scan+push asm path (one call per
-                // scan, no markBuf and no per-pixel C edge walk).
+                // scan, no markBuf and no per-pixel C edge walk). Skip
-                if (!halFastFloodScanAndPush(scanRow, leftX, rightX,
+                // the asm tiers if we don't have a chunky row pointer
                // (Phase 9 planar ports).
                if (scanRow == NULL ||
                    !halFastFloodScanAndPush(scanRow, leftX, rightX,
                                             matchColor, newNibble, matchEqual,
                                             scanY, stackX, stackY,
                                             &sp, FLOOD_STACK_SIZE)) {
-                    if (!halFastFloodScanRow(scanRow, leftX, rightX,
+                    if ((scanRow == NULL ||
-                                             matchColor, newNibble, matchEqual,
+                         !halFastFloodScanRow(scanRow, leftX, rightX,
-                                             floodMarkBuf)) {
+                                              matchColor, newNibble, matchEqual,
                                              floodMarkBuf)) &&
                        !halFloodScanRowPlanes(s, leftX, rightX, scanY,
                                               matchColor, newNibble, matchEqual,
                                               floodMarkBuf)) {
                        // C fallback: fill markBuf the slow way.
                        for (i = 0; i < spanLen; i++) {
-                            pix = srcPixel(scanRow, (int16_t)(leftX + i));
+                            pix = halSamplePixel(s, (int16_t)(leftX + i), scanY);
                            pixMatch = (pix == matchColor);
                            floodMarkBuf[i] = (uint8_t)(matchEqual
                                ? (pixMatch ? 1 : 0)
@ -621,12 +643,12 @@ void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t
    if (!halFastFillRect(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex)) {
        fillRectClipped(s, sx, sy, sw, sh, colorIndex);
    }
    halFillRectPlanes(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex);
    surfaceMarkDirtyRect(s, sx, sy, sw, sh);
 }
 void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
    uint8_t *row;
    uint8_t  seedColor;
    if (s == NULL) {
@ -635,8 +657,9 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return;
    }
-    row       = &s->pixels[SURFACE_ROW_OFFSET(y)];
+    /* halSamplePixel reads from whichever storage the port uses --
-    seedColor = srcPixel(row, x);
+     * works on both chunky (s->pixels) and planar (s->portData) ports. */
    seedColor = halSamplePixel(s, x, y);
    if ((seedColor & 0x0F) == (newColor & 0x0F)) {
        return;
    }
@ -645,7 +668,6 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
 void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor) {
    uint8_t *row;
    uint8_t  pix;
    if (s == NULL) {
@ -654,8 +676,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return;
    }
-    row = &s->pixels[SURFACE_ROW_OFFSET(y)];
+    pix = halSamplePixel(s, x, y);
-    pix = srcPixel(row, x);
    // Starting on a boundary pixel or already-filled pixel: nothing
    // to do.
    if ((pix & 0x0F) == (boundaryColor & 0x0F)) {
@ -669,25 +690,16 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
 uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    uint8_t byte;
    if (s == NULL) {
        return 0;
    }
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return 0;
    }
-
+    /* halSamplePixel reads from whichever storage the port uses --
-    /* Cast to uint16_t before shift -- already validated x >= 0,
+     * chunky ports return a nibble extracted from s->pixels; planar
-     * unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */
+     * ports read 4 plane bits and assemble the nibble. */
-    byte = s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
+    return halSamplePixel(s, x, y);
-    if (x & 1) {
-        return (uint8_t)(byte & 0x0F);
-    }
-    /* `byte >> 4` is uint8_t but ORCA-C promotes to int (signed 16-bit)
-     * for the shift, then narrows -- triggers ~SSHIFTRIGHT. The
-     * mask-then-shift sidesteps the promotion path. */
-    return (uint8_t)((byte & 0xF0u) >> 4);
 }
@ -725,6 +737,8 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) {
            }
        }
    }
    halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0,
                      copyW, copyH, srcRowBytes, 0xFFFFu);
    surfaceMarkDirtyRect(dst, x, y, copyW, copyH);
 }
@ -768,6 +782,8 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t
            }
        }
    }
    halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0,
                      copyW, copyH, srcRowBytes, (uint16_t)transparent);
    surfaceMarkDirtyRect(dst, x, y, copyW, copyH);
 }
--- a/src/core/hal.h
+++ b/src/core/hal.h
@ -9,8 +9,11 @@
 #ifndef JOEYLIB_HAL_H
 #define JOEYLIB_HAL_H
 #include <stdio.h>
 #include "joey/core.h"
 #include "joey/input.h"
 #include "joey/sprite.h"
 #include "joey/surface.h"
 // Per-port one-shot initialization. Called from joeyInit after config
@ -27,17 +30,131 @@ void halShutdown(void);
 // backs the library-owned stage surface. Ports that have a
 // hardware-friendly pin location for the back buffer (IIgs $01/2000
 // with SHR shadow inhibited) return that address here; ports with no
-// such constraint just malloc/free.
+// such constraint just malloc/free. Planar 68k ports may return NULL
 // if the surface is planar-only and has no chunky shadow.
 uint8_t *halStageAllocPixels(void);
 void     halStageFreePixels(uint8_t *pixels);
-// Present the entire source surface to the display.
+// Allocate / release the per-surface portData blob (see SurfaceT in
-void halPresent(const SurfaceT *src);
+// surfaceInternal.h). Chunky ports return NULL from Init -- they keep
 // portData unused and operate on the chunky `pixels` buffer. Planar
 // 68k ports allocate a per-surface struct here describing the
 // bitplane storage (Amiga: 4 separate plane buffers + stride; ST: one
 // interleaved buffer + stride). Called by surfaceCreate / stageAlloc
 // after pixels is allocated; freed by surfaceDestroy / stageFree
 // before pixels is freed. `isStage` lets the port short-circuit for
 // the stage if its planes are display-owned (e.g. Amiga's BitMap
 // planes from OpenScreen) rather than allocated per surface.
 void *halSurfaceAllocPortData(SurfaceT *s, bool isStage);
 void  halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData);
-// Present a rectangular region of the source surface. The caller has
+// Phase 3 planar dual-write: called from cross-platform fillRect AFTER
-// already validated and clipped the rect to be fully inside the
+// the chunky shadow has been written, with the same already-clipped
-// surface bounds and to have positive extents.
+// (x, y, w, h) and the raw color index 0..15. Planar ports update
-void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h);
+// the bitplanes with the rect's bit pattern (per-plane bit value =
 // (color >> plane) & 1). Chunky ports (DOS, IIgs) provide a no-op
 // stub. Called unconditionally so cross-platform code doesn't have
 // to know the port is planar; the per-port stub is the cheapest
 // possible thing on chunky ports.
 void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex);
 // Phase 3 planar dual-write for surfaceCopy: called from cross-platform
 // surfaceCopy AFTER the chunky pixel buffer is memcpy'd. Planar ports
 // also memcpy the bitplanes from src to dst so JOEYLIB_PLANAR_PRESENT
 // builds see correct planes. dst and src are non-NULL and distinct
 // (caller's no-op guards already passed).
 void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src);
 // Phase 5 planar dual-write for tile ops. Called from cross-platform
 // tile.c AFTER the chunky path completes. (bx, by) are tile-grid
 // coords (0..39 horiz, 0..24 vert; surface is 40x25 tiles).
 // transparentIndex for tileCopyMasked: pixel value to skip. tilePaste
 // reads from a packed 32-byte chunky TileT (4 bytes/row x 8 rows).
 // All Amiga impls operate on the off-screen shadow planes via
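 // AmigaPlanarT; chunky-port stubs are no-ops. tileSnap only reads the
 // surface, so it has no dual-write hook; its planar read-back hook is
 // halTileSnapPlanes, declared after the four prototypes below.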
 void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex);
 void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
 void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex);
 void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile);
 // tileSnap: cross-platform code reads s->pixels chunky bytes into a
 // 32-byte TileT. On planar ports (s->pixels NULL) the chunky read
 // crashes -- this hook is the planar derivation: reads bitplane bits
 // for the tile rect and assembles 32 chunky bytes (4 bytes/row x 8
 // rows) into chunkyTileOut. Chunky ports (s->pixels valid) implement
 // this as a no-op since the cross-platform fallback already filled
 // chunkyTileOut from s->pixels.
 void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut);
 // Phase 6 planar dual-write for spriteDraw. Called from cross-platform
 // sprite.c AFTER spriteCompiledDraw or spriteDrawInterpreted has
 // updated the chunky shadow. (x, y) is the destination top-left in
 // surface pixels (may be partially off-surface; the hook does its own
 // clipping). Walks the sprite's chunky tile data and updates dst
 // surface planes for every non-transparent pixel (nibble != 0).
 // Save/restore are not covered by this hook. An earlier revision had
 // no planar save/restore at all, so after spriteSaveUnder + spriteDraw
 // + spriteRestoreUnder under JOEYLIB_PLANAR_PRESENT the planes still
 // showed the sprite (chunky restored, planes unchanged). That gap is
 // now handled by halSpriteSavePlanes / halSpriteRestorePlanes, declared
 // later in this header, which keep the plane backup in backup->bytes.
 void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y);
 // Phase 8 planar dual-write for asset blits and full surface loads.
 // halBlitRectPlanes is called from surfaceBlit / surfaceBlitMasked
 // AFTER the chunky path. transparent == 0xFFFF means opaque blit; any
 // other value is a nibble (0..15) to skip. srcBytes is the asset's
 // raw chunky pixel buffer; srcRowBytes is its stride. (x, y) is the
 // already-clipped destination top-left in dst surface pixels;
 // srcX0/srcY0 is where in the asset the visible region starts after
 // clip; copyW/copyH is the visible region size in pixels.
 //
 void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent);
 // Phase 9 sprite save/restore plane data. Chunky ports already hold
 // pixel data in backup->bytes via the cross-platform memcpy. Planar
 // ports (Amiga) have no chunky buffer (s->pixels is NULL), so the
 // chunky path leaves backup->bytes unused -- we repurpose it to hold
 // per-plane bytes. Layout:
 // 4 plane stripes of (h * bytesPerPlaneRow) bytes each, where
 // bytesPerPlaneRow = w/8 (sprite x and w are guaranteed 2-pixel
 // aligned by spriteSaveUnder; planar requires further 8-pixel
 // rounding -- see Amiga impl notes). Total bytes:
 // 4 * h * w/8 = h * w/2 = same as chunky. backup->sizeBytes capacity
 // works on both ports. Chunky-port impls are no-ops; Amiga writes /
 // reads plane bytes via AmigaPlanarT.
 void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
 void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
 // Phase 9 reader hooks. Cross-platform code calls these instead of
 // reading from s->pixels directly so it works regardless of whether
 // the port stores chunky or planar as the source of truth. Chunky
 // ports (DOS, IIgs) implement these reading from s->pixels (cheap);
 // Amiga reads from the bitplanes in AmigaPlanarT. (x, y) bounds are
 // already validated by the caller.
 //
 // halSamplePixel: returns the 0..15 nibble at (x, y).
 // halSurfaceHash: returns the FNV-style hash of pixel + scb + palette
 //   that surfaceHash currently computes by walking s->pixels. Allows
 //   ports to use their native pixel storage instead.
 // halSurfaceCopyChunky: cross-platform surfaceCopy used to memcpy
 //   s->pixels src->dst; on planar ports there is no chunky to copy
 //   (planes already covered by halSurfaceCopyPlanes). Chunky ports
 //   do the memcpy here; Amiga is a no-op.
 // halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
 //   fwrite of the pixel data. Chunky ports stream directly to/from
 //   s->pixels; Amiga uses a scratch buffer + c2p (load) or
 //   plane->chunky derivation (save).
 uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
 uint32_t halSurfaceHash(const SurfaceT *s);
 void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
 bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
 bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
 // Present the dirty regions of the source surface to the display.
 // The cross-platform stagePresent walks the dirty arrays before
 // calling this; ports may use the dirty arrays themselves to skip
 // untouched rows.
 void halPresent(const SurfaceT *src);
 // Optional: returns a port-specific error message string for the last
 // HAL failure, or NULL if none. Ports may return NULL always.
@ -73,9 +190,23 @@ uint16_t halFrameHz(void);
 // Audio: per-port engine setup, module + SFX playback, teardown.
 // halAudioInit returns true if the platform has a working engine.
-// All entry points are safe to call when init failed -- they become
+// Per-surface chunky pixel allocation. Chunky ports (DOS, IIgs, ST
-// no-ops. See joey/audio.h for the public API contract that wraps
+// while still chunky) allocate SURFACE_PIXELS_SIZE bytes (calloc-
-// these.
+// style, zero-filled). Pure-planar Amiga returns NULL -- there's no
 // chunky shadow; cross-platform code that previously read s->pixels
 // goes through halSamplePixel / halSurfaceCopyChunky / etc. instead.
 // halSurfaceFreePixels mirrors free(); NULL is a valid input on
 // planar ports.
 uint8_t *halSurfaceAllocPixels(void);
 void     halSurfaceFreePixels(uint8_t *pixels);
 // Get a pointer to the start of bitplane `planeIdx` (0..3) for surface
 // `s`. Returns NULL on chunky ports (no planes). On Amiga returns
 // pd->planes[planeIdx] from the AmigaPlanarT struct in portData.
 // Used by the planar sprite codegen dispatcher to compute the 4
 // plane addresses to hand the emitted asm.
 uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx);
 bool halAudioInit(void);
 void halAudioShutdown(void);
 void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop);
@ -185,6 +316,21 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y,
                              bool *seedMatched,
                              int16_t *leftXOut, int16_t *rightXOut);
 // Planar variants of halFastFloodWalk / halFastFloodScanRow. Take a
 // SurfaceT* instead of a chunky-row pointer so they work on planar
 // ports (Amiga post-Phase 9) where s->pixels is NULL. Same semantics;
 // chunky ports return false (the chunky variants above are faster
 // when a chunky row is available). On planar ports they replace the
 // per-pixel halSamplePixel walk in the C fallback.
 bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y,
                        uint8_t matchColor, uint8_t newColor, bool matchEqual,
                        bool *seedMatched,
                        int16_t *leftXOut, int16_t *rightXOut);
 bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY,
                           uint8_t matchColor, uint8_t newColor, bool matchEqual,
                           uint8_t *markBuf);
 // surfaceBlit / surfaceBlitMasked rect-copy helper. Caller has done
 // the clip math: dstRow0 / srcRow0 point at row 0 of the source/dest
 // regions, dstX / srcX are intra-row pixel offsets, copyW/copyH are
@ -333,6 +479,12 @@ extern uint16_t gFloodRightX;
 #undef  halFastFloodScanAndPush
 #define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false)
 // IIgs is chunky; the planar flood hooks are never reachable.
 #undef  halFloodWalkPlanes
 #define halFloodWalkPlanes(_s, _sx, _y, _mc, _nc, _me, _sm, _lx, _rx) (false)
 #undef  halFloodScanRowPlanes
 #define halFloodScanRowPlanes(_s, _lx, _rx, _sy, _mc, _nc, _me, _mb) (false)
 // Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX /
 // gFloodRightX; macro reads those into the caller's out-ptrs.
 #undef  halFastFloodWalkAndScans
--- a/src/core/present.c
+++ b/src/core/present.c
@ -2,8 +2,7 @@
 //
 // stagePresent walks the per-row dirty bands set by drawing primitives
 // and asks the port HAL to flip just those rows to the display, then
-// resets the dirty state. stagePresentRect bypasses dirty tracking
+// resets the dirty state.
-// entirely and slams a caller-specified rectangle (after clipping).
 #include <stddef.h>
@ -25,48 +24,3 @@ void stagePresent(void) {
    halPresent(stage);
    stageDirtyClearAll();
 }
-void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h) {
-    SurfaceT *stage;
-    int16_t   sx;
-    int16_t   sy;
-    int16_t   sw;
-    int16_t   sh;
-    stage = stageGet();
-    if (stage == NULL) {
-        return;
-    }
-    sx = x;
-    sy = y;
-    sw = (int16_t)w;
-    sh = (int16_t)h;
-    if (sw <= 0 || sh <= 0) {
-        return;
-    }
-    if (sx < 0) {
-        sw += sx;
-        sx = 0;
-    }
-    if (sy < 0) {
-        sh += sy;
-        sy = 0;
-    }
-    if (sx >= SURFACE_WIDTH || sy >= SURFACE_HEIGHT) {
-        return;
-    }
-    if (sx + sw > SURFACE_WIDTH) {
-        sw = SURFACE_WIDTH - sx;
-    }
-    if (sy + sh > SURFACE_HEIGHT) {
-        sh = SURFACE_HEIGHT - sy;
-    }
-    if (sw <= 0 || sh <= 0) {
-        return;
-    }
-    halPresentRect(stage, sx, sy, (uint16_t)sw, (uint16_t)sh);
-}
--- a/src/core/sprite.c
+++ b/src/core/sprite.c
@ -10,6 +10,7 @@
 #include "joey/sprite.h"
 #include "codegenArenaInternal.h"
 #include "hal.h"
 #include "spriteInternal.h"
 #include "surfaceInternal.h"
@ -22,6 +23,20 @@
 // Color 0 is always transparent for sprites (DESIGN.md contract).
 #define TRANSPARENT_NIBBLE 0
 // COMPILED_SPRITE_WRITES_PLANES: 1 when the compiled sprite emitter
 // writes the bitplanes itself, 0 otherwise. On Amiga (post-Phase 9 /
 // Phase 6 redux) it does, so calling the halSprite*Planes hooks
 // (halSpriteDrawPlanes / halSpriteSavePlanes / halSpriteRestorePlanes)
 // after a compiled call would be pure duplicate work. On other ports
 // those hooks are either no-op stubs (chunky-only IIgs/DOS) or the
 // only thing writing planes (ST: chunky shadow + planes). The slow /
 // interpreter paths call the hooks unconditionally on every platform:
 // the chunky interpreter is a no-op on Amiga (s->pixels is NULL), so
 // there the hook is the only draw.
 #if !defined(JOEYLIB_PLATFORM_AMIGA)
 #define COMPILED_SPRITE_WRITES_PLANES 0
 #else
 #define COMPILED_SPRITE_WRITES_PLANES 1
 #endif
 // ----- Prototypes -----
@ -144,14 +159,20 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y
        return;
    }
-    for (row = 0; row < h; row++) {
+    /* Skip the chunky write loop on planar ports (s->pixels == NULL).
-        dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
+     * halSpriteDrawPlanes is called by the spriteDraw caller and does
-        for (col = 0; col < w; col++) {
+     * its own clip + plane write, so the dirty mark + planar update
-            nibble = srcNibble(sp, (int16_t)(sx + col), (int16_t)(sy + row));
+     * happen there. Phase 9 dropped the chunky shadow on Amiga. */
-            if (nibble == TRANSPARENT_NIBBLE) {
+    if (s->pixels != NULL) {
-                continue;
+        for (row = 0; row < h; row++) {
            dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
            for (col = 0; col < w; col++) {
                nibble = srcNibble(sp, (int16_t)(sx + col), (int16_t)(sy + row));
                if (nibble == TRANSPARENT_NIBBLE) {
                    continue;
                }
                writeDstNibble(dstRow, (int16_t)(dx + col), nibble);
            }
            writeDstNibble(dstRow, (int16_t)(dx + col), nibble);
        }
    }
    surfaceMarkDirtyRect(s, dx, dy, w, h);
@ -200,6 +221,13 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y,
    if (src == NULL || widthTiles == 0 || heightTiles == 0) {
        return NULL;
    }
    /* Phase 9: planar ports have NULL src->pixels. Capturing a sprite
     * from such a surface needs a planar-to-chunky derivation hook;
     * not implemented yet, so refuse the call. Apps targeting Amiga
     * should ship sprites as static tile data instead. */
    if (src->pixels == NULL) {
        return NULL;
    }
    // Source x/y must be on a tile boundary so each captured tile lands
    // on whole bytes -- mid-byte snapshots would lose half a pixel at
    // the left edge.
@ -284,10 +312,14 @@ void spriteDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y) {
    // need clip math (they walk fixed offsets).
    if (sp->slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
        spriteCompiledDraw(s, sp, x, y);
        if (!COMPILED_SPRITE_WRITES_PLANES) {
            halSpriteDrawPlanes(s, sp, x, y);
        }
        surfaceMarkDirtyRect(s, x, y, (int16_t)widthPx, (int16_t)heightPx);
        return;
    }
    spriteDrawInterpreted(s, sp, x, y);
    halSpriteDrawPlanes(s, sp, x, y);
 }
@ -332,7 +364,7 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac
        uint16_t  saveIdx;
        uint16_t  drawIdx;
        uint8_t  *offsetsBase;
-        shift       = (uint8_t)(x & 1);
+        shift       = SPRITE_SHIFT_INDEX(x);
        saveIdx     = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
        drawIdx     = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_DRAW);
        offsetsBase = (uint8_t *)sp->routineOffsets;
@ -340,6 +372,10 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac
            *(uint16_t *)(offsetsBase + (drawIdx << 1)) != SPRITE_NOT_COMPILED) {
            spriteCompiledSaveUnder(s, sp, x, y, backup);
            spriteCompiledDraw    (s, sp, x, y);
            if (!COMPILED_SPRITE_WRITES_PLANES) {
                halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes);
                halSpriteDrawPlanes(s, sp, x, y);
            }
            surfaceMarkDirtyRect  (s, x, y, (int16_t)widthPx, (int16_t)heightPx);
            return;
        }
@ -630,13 +666,18 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
        routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
        if (routeOffset != SPRITE_NOT_COMPILED) {
            spriteCompiledRestoreUnder(s, backup);
            if (!COMPILED_SPRITE_WRITES_PLANES) {
                halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes);
            }
            surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
            return;
        }
    }
-    /* Slow / interpreted memcpy fallback. */
+    /* Slow / interpreted memcpy fallback. Skip the chunky memcpy if
-    {
+     * the port has no chunky shadow (Phase 9 Amiga: s->pixels NULL);
     * halSpriteRestorePlanes below does the planar restore. */
    if (s->pixels != NULL) {
        int16_t  row;
        int16_t  byteStart;
        uint8_t *dstRow;
@ -650,6 +691,7 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
                   (size_t)copyBytes);
        }
    }
    halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes);
    surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
 }
@ -684,11 +726,14 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
    if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
        uint16_t routeIdx;
        uint16_t routeOffset;
-        shift       = (uint8_t)(x & 1);
+        shift       = SPRITE_SHIFT_INDEX(x);
        routeIdx    = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
        routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
        if (routeOffset != SPRITE_NOT_COMPILED) {
            spriteCompiledSaveUnder(s, sp, x, y, backup);
            if (!COMPILED_SPRITE_WRITES_PLANES) {
                halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes);
            }
            return;
        }
    }
@ -744,11 +789,16 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
        // backup with bytes==NULL.
        return;
    }
-    for (row = 0; row < h; row++) {
+    /* Chunky save path: skip on planar ports (s->pixels NULL).
-        srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
+     * halSpriteSavePlanes below covers the planar case. */
-        memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes],
+    if (s->pixels != NULL) {
-               &srcRow[byteStart],
+        for (row = 0; row < h; row++) {
-               (size_t)copyBytes);
+            srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
            memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes],
                   &srcRow[byteStart],
                   (size_t)copyBytes);
        }
    }
    halSpriteSavePlanes(s, clippedX, dy, (uint16_t)clippedW, (uint16_t)h, backup->bytes);
    }   /* end slow path */
 }
--- a/src/core/spriteInternal.h
+++ b/src/core/spriteInternal.h
@ -13,6 +13,16 @@
 #define SPRITE_OP_RESTORE 2
 #define SPRITE_OP_COUNT   3
 // Per-platform shift index used by the dispatcher. Chunky 4bpp ports
 // store one nibble per pixel (two pixels per byte), so the only
 // sub-byte alignment is x % 2. Amiga planar packs 8 pixels per plane
 // byte, so all 8 alignments (x % 8) matter.
 #if defined(JOEYLIB_PLATFORM_AMIGA)
 #define SPRITE_SHIFT_INDEX(x)  ((uint8_t)((x) & 7))
 #else
 #define SPRITE_SHIFT_INDEX(x)  ((uint8_t)((x) & 1))
 #endif
 // Sentinel stored in routineOffsets[shift][op] when that op's emitter
 // returned 0 bytes (i.e., the platform doesn't implement compiled
 // codegen for that op yet). Distinct from a real offset of 0, which
--- a/src/core/surface.c
+++ b/src/core/surface.c
@ -65,9 +65,10 @@ void surfaceCopy(SurfaceT *dst, const SurfaceT *src) {
    if (dst == NULL || src == NULL || dst == src) {
        return;
    }
-    memcpy(dst->pixels,  src->pixels,  SURFACE_PIXELS_SIZE);
+    halSurfaceCopyChunky(dst, src);          /* memcpy on chunky ports; no-op on planar */
    memcpy(dst->scb,     src->scb,     sizeof(src->scb));
    memcpy(dst->palette, src->palette, sizeof(src->palette));
    halSurfaceCopyPlanes(dst, src);          /* 4 plane memcpys on planar ports; no-op on chunky */
    surfaceMarkDirtyAll(dst);
 }
@ -79,11 +80,10 @@ SurfaceT *surfaceCreate(void) {
    if (s == NULL) {
        return NULL;
    }
-    s->pixels = (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
+    /* halSurfaceAllocPixels returns NULL on planar ports (Amiga); the
-    if (s->pixels == NULL) {
+     * primary storage is the port-allocated planes via portData below. */
-        free(s);
+    s->pixels = halSurfaceAllocPixels();
-        return NULL;
+    s->portData = halSurfaceAllocPortData(s, false);
    }
    paletteInitDefault(s);
    return s;
 }
@ -96,11 +96,44 @@ void surfaceDestroy(SurfaceT *s) {
    if (s == gStage) {
        return;
    }
-    free(s->pixels);
+    halSurfaceFreePortData(s, false, s->portData);
    halSurfaceFreePixels(s->pixels);
    free(s);
 }
 // Cheapest deterministic hash that still detects per-byte changes:
 // (hash << 1) ^ byte, a single 16-bit accumulator. ORCA-C / 65816
 // compiles to ASL + EOR -- about 35 cyc per byte. A 32-bit multiply
 // FNV-style hash takes ~200 cyc per byte via ~UMUL4, which adds
 // 80+ seconds to a UBER run on IIgs. Discrimination is weaker than
 // FNV but plenty for cross-port validation: we only need "did the
 // same logical-pixel sequence produce the same hash?" -- not
 // crypto-grade collision resistance over arbitrary inputs.
 //
 // Walks the chunky pixel buffer byte-by-byte, the same logical-pixel
 // ordering on every chunky-format port (IIgs, DOS, Amiga and ST
 // while still chunky). When the planar rewrite drops s->pixels on
 // Amiga/ST this function will need a HAL hook (halSurfaceHash) to
 // read planes natively while producing the same logical hash.
 /* Cross-port FNV-style hash of pixels + SCB + palette. The hash logic
 * (multiplier streams, byte ordering for palette) is identical across
 * ports, but the pixel READS go through the port HAL so chunky ports
 * walk s->pixels and planar ports walk plane bits and assemble nibble
 * pairs into chunky bytes for the hash. Both produce the same logical-
 * pixel hash because they hash the same logical pixel sequence in the
 * same chunky byte order. SCB and palette are still hashed inline
 * here because they live in the SurfaceT struct on every port (no
 * port-specific storage) and the byte/value-with-explicit-byte-order
 * walks are already endian-independent. */
 uint32_t surfaceHash(const SurfaceT *s) {
    /* A NULL surface hashes to 0; any real surface is hashed by the
     * port HAL. */
    return (s != NULL) ? halSurfaceHash(s) : 0u;
 }
 bool surfaceLoadFile(SurfaceT *dst, const char *path) {
    FILE *fp;
    long  fileSize;
@ -125,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
        fclose(fp);
        return false;
    }
-    if (fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) {
+    if (!halSurfaceLoadFileChunky(dst, fp)) {
        fclose(fp);
        return false;
    }
@ -153,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
    if (fp == NULL) {
        return false;
    }
-    if (fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) {
+    if (!halSurfaceSaveFileChunky(src, fp)) {
        fclose(fp);
        return false;
    }
@ -228,13 +261,14 @@ bool stageAlloc(void) {
    if (gStage == NULL) {
        return false;
    }
    /* halStageAllocPixels returns NULL on planar ports (Amiga) where
     * the chunky shadow doesn't exist; the planes from portData are
     * the source of truth. NULL pixels is no longer a failure. */
    gStage->pixels = halStageAllocPixels();
-    if (gStage->pixels == NULL) {
+    if (gStage->pixels != NULL) {
-        free(gStage);
+        memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE);
        gStage = NULL;
        return false;
    }
-    memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE);
+    gStage->portData = halSurfaceAllocPortData(gStage, true);
    stageDirtyClearAll();
    paletteInitDefault(gStage);
    return true;
@ -255,6 +289,7 @@ void stageFree(void) {
    if (gStage == NULL) {
        return;
    }
    halSurfaceFreePortData(gStage, true, gStage->portData);
    halStageFreePixels(gStage->pixels);
    free(gStage);
    gStage = NULL;
--- a/src/core/surfaceInternal.h
+++ b/src/core/surfaceInternal.h
@ -14,8 +14,17 @@
 // auto-mirroring to $E1). Caller-side `s->pixels[i]` syntax is
 // unchanged; only allocation/copy paths in surface.c shift to a
 // two-buffer model.
 //
 // portData is per-port opaque storage. On chunky ports (IIgs, DOS) it
 // stays NULL -- pixels is the source of truth. On planar ports
 // (Amiga, Atari ST) it points to a port-private struct describing the
 // 4 bitplanes (Amiga: 4 separate plane buffers + stride; ST: single
 // interleaved buffer + stride). Cross-platform code never touches it
 // directly -- all primitive access goes through halFast* on planar
 // ports. See project_planar_68k_plan.md for the full architecture.
 struct SurfaceT {
    uint8_t  *pixels;    // chunky pixel buffer; NULL when the port keeps no chunky copy
    void     *portData;  // opaque per-port storage (set from halSurfaceAllocPortData)
    uint8_t   scb[SURFACE_HEIGHT];  // one byte per surface row
    uint16_t  palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];  // 16-bit entries, indexed [palette][color]
 };
@ -38,6 +47,18 @@ struct SurfaceT {
 extern uint8_t gStageMinWord[SURFACE_HEIGHT];
 extern uint8_t gStageMaxWord[SURFACE_HEIGHT];
 // Mixes one byte into the two 16-bit surfaceHash accumulators:
 //   lo = lo * 31  + b   (mod 2^16)
 //   hi = hi * 251 + b   (mod 2^16)
 // The multiplies are spelled as shift/subtract chains so ORCA-C does
 // not emit a `~UMUL2` call per byte (~150 cyc each; hashing 32 KB
 // twice cost ~5 minutes per UBER run). Truncating to uint16_t makes
 // the shift form yield exactly the values of `* 31u` / `* 251u`:
 //   lo * 31  ==  (lo << 5) - lo
 //   hi * 251 ==  (hi << 8) - (hi << 2) - hi
 // Every argument is expanded more than once, so pass plain variables
 // only -- never an expression with side effects such as `*p++`.
 #define SURFACE_HASH_MIX_BYTE(lo_, hi_, b_)                              \
    do {                                                                 \
        (lo_) = (uint16_t)(((lo_) << 5) - (lo_) + (b_));                 \
        (hi_) = (uint16_t)(((hi_) << 8) - ((hi_) << 2) - (hi_) + (b_));  \
    } while (0)
 // Stage SCB / palette dirty flags. scbSet* and paletteSet set them
 // true when the stage's data is modified; the per-port present code
 // checks the flags and clears after upload. Replaces a per-frame
@ -50,6 +71,15 @@ extern bool gStagePaletteDirty;
 // bands are widened to cover the rect. If `s` is any other surface,
 // the call is a no-op -- non-stage surfaces never get presented, so
 // they don't carry dirty state.
 //
 // Planar ports rely on the chunky shadow + c2p path through Phase 8.
 // Planar-native primitives (Phases 3+) dual-write: they update both
 // the chunky pixels and the bitplanes in the same call, so c2p at
 // present time always derives correct planes from up-to-date chunky.
 // Phase 9 deletes the chunky shadow + c2p; only at that point will
 // per-row planar-vs-chunky tracking even be a possible question, and
 // the plan is to avoid it entirely there too (planes become the only
 // source of truth).
 void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h);
 // Shorthand for "every row, full width" -- used by surfaceClear and
--- a/src/core/tile.c
+++ b/src/core/tile.c
@ -147,6 +147,7 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
    if (!halFastTileCopy(dstRow0, srcRow0)) {
        copyTileOpaque(dstRow0, srcRow0);
    }
    halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy);
    surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -178,6 +179,7 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
    if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
        copyTileMasked(dstRow0, srcRow0, transparentIndex);
    }
    halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex);
    surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -209,6 +211,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
            row += SURFACE_BYTES_PER_ROW;
        }
    }
    halTileFillPlanes(s, bx, by, colorIndex);
    surfaceMarkDirtyRect(s, (int16_t)pixelX, (int16_t)pixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -241,6 +244,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
            src    += TILE_BYTES_PER_ROW;
        }
    }
    halTilePastePlanes(dst, bx, by, &in->pixels[0]);
    surfaceMarkDirtyRect(dst, (int16_t)pixelX, (int16_t)pixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -261,9 +265,12 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
    }
    pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
    pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
    srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
    dst    = &out->pixels[0];
-    if (!halFastTileSnap(dst, srcRow)) {
+    /* On planar ports (s->pixels NULL) the chunky read path is
     * skipped; halTileSnapPlanes below derives the tile bytes from
     * the bitplanes. */
    if (src->pixels != NULL && !halFastTileSnap(dst, &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)])) {
        srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
        for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
            dst[0] = srcRow[0];
            dst[1] = srcRow[1];
@ -273,4 +280,5 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
            dst    += TILE_BYTES_PER_ROW;
        }
    }
    halTileSnapPlanes(src, bx, by, &out->pixels[0]);
 }
--- a/src/port/amiga/circle.s
+++ b/src/port/amiga/circle.s
@ -0,0 +1,270 @@
 | Amiga planar circle outline V4 -- 16-way color-specialized.
 |
 | Per Bresenham iter:
 |   1. Precompute 4 xp records (xp_byte_w + bitMask_b + notMask_b) for
 |      cx +/- bx and cx +/- by, stored at sp+0..15 (4 records x 4 bytes).
 |   2. Precompute 4 yp40 words for cy +/- by and cy +/- bx, stored at
 |      sp+16..23 (4 words x 2 bytes).
 |   3. Plot 8 octant pixels with hardcoded color: each pixel does 4
 |      branchless plane RMW ops (or.b for set bits, and.b for clear
 |      bits) -- no btst, no per-plane branch.
 |   4. Bresenham step.
 |
 | At function entry the color is masked to 4 bits and used as the index
 | into a 16-entry jump table that selects the matching main loop.
 | Each main loop has the color hardcoded into the per-plane RMW ops.
 |
 | The branchless plot saves ~20-28 cyc per plane vs V3's btst+branch
 | pattern -- ~640-900 cyc per Bresenham iter.
 |
 | ABI: cdecl. d2-d7/a2-a6 callee-save.
 |
 | void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1,
 |                                   uint8_t *p2, uint8_t *p3,
 |                                   uint16_t cx, uint16_t cy,
 |                                   uint16_t r,  uint8_t  color);
 |
 | Register allocation across the iter loop:
 |   d2.w   = bx (Bresenham)
 |   d3.w   = by (Bresenham)
 |   d4.w   = err (Bresenham)
 |   d5.w   = cx (cached)
 |   a4     = cy (cached, sign-extended)
 |   a0..a3 = plane bases
 |   a5     = bitMaskLut
 |   d0,d1,d6,d7 = scratch in precompute / plot
 |
 | Scratch block (24 bytes) at sp+0..23:
 |   sp+0..3:   xp1 record [xp_byte_w, bitMask_b, notMask_b] for cx+bx
 |   sp+4..7:   xp2 record for cx-bx
 |   sp+8..11:  xp3 record for cx+by
 |   sp+12..15: xp4 record for cx-by
 |   sp+16..17: yp1 word (cy+by) * 40
 |   sp+18..19: yp2 word (cy-by) * 40
 |   sp+20..21: yp3 word (cy+bx) * 40
 |   sp+22..23: yp4 word (cy-bx) * 40
                .text
 | ---- XP_REC: build xp record at sp+slot for xp = cx <signOp> <xreg> ----
 | signOp: add or sub
 | xreg:   %d2 (bx) or %d3 (by)
 | slot:   0, 4, 8, or 12
 | Trashes: d0, d1, d6, d7
                .macro  XP_REC  slot, signOp, xreg
                move.w  %d5,%d6
                \signOp\().w \xreg,%d6        | d6 = xp
                move.w  %d6,%d7
                lsr.w   #3,%d7                | d7 = xp >> 3 (xp_byte)
                and.w   #7,%d6                | d6 = xp & 7
                move.b  (%a5,%d6.w),%d6       | d6 = bitMask
                move.b  %d6,%d1
                not.b   %d1                   | d1 = notMask
                move.w  %d7,\slot(%sp)        | xp_byte word
                move.b  %d6,\slot+2(%sp)      | bitMask byte
                move.b  %d1,\slot+3(%sp)      | notMask byte
                .endm
 | ---- YP_REC: build yp40 word at sp+slot for yp = cy <signOp> <yreg> ----
 | signOp: add or sub
 | yreg:   %d3 (by) or %d2 (bx)
 | slot:   16, 18, 20, or 22
 | yp * 40 is formed as (yp << 5) + (yp << 3). 40 is presumably the
 | plane row stride in bytes (320 px / 8) -- confirm against the HAL.
 | All arithmetic is 16-bit; only the low word of d6 is meaningful.
 | Trashes: d0, d6
                .macro  YP_REC  slot, signOp, yreg
                move.l  %a4,%d6
                \signOp\().w \yreg,%d6        | d6.w = yp
                move.w  %d6,%d0
                lsl.w   #3,%d6                | d6 = yp << 3
                lsl.w   #5,%d0                | d0 = yp << 5
                add.w   %d6,%d0               | d0 = yp * 40
                move.w  %d0,\slot(%sp)
                .endm
 | ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ----
 | slotYp: 16, 18, 20, or 22 (yp40 word slot)
 | slotXp: 0, 4, 8, or 12   (xp record slot)
 | color:  literal 0..15
 | Trashes: d0, d1, d7
                .macro  PLOT_FIXED  slotYp, slotXp, color
                move.w  \slotYp(%sp),%d0      | d0 = yp40
                add.w   \slotXp(%sp),%d0      | d0 += xp_byte
                move.b  \slotXp+2(%sp),%d1    | d1.b = bitMask
                move.b  \slotXp+3(%sp),%d7    | d7.b = notMask
                .if  ((\color) & 1)
                or.b    %d1,(%a0,%d0.w)
                .else
                and.b   %d7,(%a0,%d0.w)
                .endif
                .if  ((\color) & 2)
                or.b    %d1,(%a1,%d0.w)
                .else
                and.b   %d7,(%a1,%d0.w)
                .endif
                .if  ((\color) & 4)
                or.b    %d1,(%a2,%d0.w)
                .else
                and.b   %d7,(%a2,%d0.w)
                .endif
                .if  ((\color) & 8)
                or.b    %d1,(%a3,%d0.w)
                .else
                and.b   %d7,(%a3,%d0.w)
                .endif
                .endm
 | ---- PLOT_8: plot all 8 octant pixels for a given hardcoded color ----
                .macro  PLOT_8  color
                PLOT_FIXED  16,  0, \color    | (cx+bx, cy+by)
                PLOT_FIXED  16,  4, \color    | (cx-bx, cy+by)
                PLOT_FIXED  18,  0, \color    | (cx+bx, cy-by)
                PLOT_FIXED  18,  4, \color    | (cx-bx, cy-by)
                PLOT_FIXED  20,  8, \color    | (cx+by, cy+bx)
                PLOT_FIXED  20, 12, \color    | (cx-by, cy+bx)
                PLOT_FIXED  22,  8, \color    | (cx+by, cy-bx)
                PLOT_FIXED  22, 12, \color    | (cx-by, cy-bx)
                .endm
 | ---- CO_BODY: full Bresenham loop body for a hardcoded color ----
 | Generates the per-iter precompute, branchless plot, and Bresenham
 | step. Uses unique labels via \color suffix.
                .macro  CO_BODY  color
                XP_REC   0, add, %d2          | xp1 = cx+bx
                XP_REC   4, sub, %d2          | xp2 = cx-bx
                XP_REC   8, add, %d3          | xp3 = cx+by
                XP_REC  12, sub, %d3          | xp4 = cx-by
                YP_REC  16, add, %d3          | yp1 = cy+by
                YP_REC  18, sub, %d3          | yp2 = cy-by
                YP_REC  20, add, %d2          | yp3 = cy+bx
                YP_REC  22, sub, %d2          | yp4 = cy-bx
                PLOT_8  \color
                addq.w  #1,%d3
                tst.w   %d4
                bgt     .LcoDecX_\color
                add.w   %d3,%d4
                add.w   %d3,%d4
                addq.w  #1,%d4
                bra.w   .LcoLoop_\color
 .LcoDecX_\color:
                subq.w  #1,%d2
                add.w   %d3,%d4
                add.w   %d3,%d4
                sub.w   %d2,%d4
                sub.w   %d2,%d4
                addq.w  #1,%d4
                bra.w   .LcoLoop_\color
                .endm
 | ---- CO_LOOP_HDR: emit a labelled loop header for a color ----
 | Emits .LcoLoop_<color>, the loop test, and the CO_BODY expansion.
 | The loop runs while bx (d2) >= by (d3) and exits to .LcoDone.
 |
 | The test must be SIGNED (blt). With r == 0 the first iteration has
 | err = 1, so the Bresenham step takes the decrement branch and bx
 | becomes -1 (0xFFFF). An unsigned test (bcs) would read that as
 | bx >= by and keep iterating with wrapped coordinates, writing far
 | outside the planes. For r >= 1 bx never goes negative, and valid
 | on-surface radii are far below 0x8000, so signed and unsigned
 | compares agree for every other input.
                .macro  CO_LOOP_HDR  color
 .LcoLoop_\color:
                cmp.w   %d3,%d2
                blt.w   .LcoDone
                CO_BODY \color
                .endm
 | ---- Function entry ----
 | Stack-frame offsets, valid once the prologue has run the movem.l
 | register save and the lea -SP_LOCAL(%sp),%sp:
 |   SP_SAVED = 11 saved registers (d2-d7/a2-a6) x 4 bytes
 |   SP_LOCAL = scratch block holding the xp records and yp40 words
 |   SP_OFF   = offset of the first argument: saved registers +
 |              4-byte return address + scratch block
 | Each argument occupies a 4-byte slot. The +2 / +3 adjustments pick
 | the low word / low byte of a big-endian slot (assumes the caller
 | widens uint16_t / uint8_t arguments to 32 bits -- confirm against
 | the compiler ABI in use).
                .equ    SP_SAVED, 44
                .equ    SP_LOCAL, 24
                .equ    SP_OFF,         (SP_SAVED + 4 + SP_LOCAL)
                .equ    SP_P0,    SP_OFF + 0
                .equ    SP_P1,    SP_OFF + 4
                .equ    SP_P2,    SP_OFF + 8
                .equ    SP_P3,    SP_OFF + 12
                .equ    SP_CX,    SP_OFF + 16 + 2
                .equ    SP_CY,    SP_OFF + 20 + 2
                .equ    SP_R,     SP_OFF + 24 + 2
                .equ    SP_COLOR, SP_OFF + 28 + 3
                .globl  _surface68kAmigaCircleOutline
 _surface68kAmigaCircleOutline:
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                lea     -SP_LOCAL(%sp),%sp
                | Plane bases.
                move.l  SP_P0(%sp),%a0
                move.l  SP_P1(%sp),%a1
                move.l  SP_P2(%sp),%a2
                move.l  SP_P3(%sp),%a3
                lea     bitMaskLut(%pc),%a5
                | Cache cx in d5, cy (sign-extended) in a4.
                move.w  SP_CX(%sp),%d5
                move.w  SP_CY(%sp),%d6
                ext.l   %d6
                movea.l %d6,%a4
                | Bresenham init.
                move.w  SP_R(%sp),%d2         | bx = r
                moveq   #0,%d3                | by = 0
                moveq   #1,%d4
                sub.w   %d2,%d4               | err = 1 - bx
                | Dispatch on color (low 4 bits) -> one of 16 main loops.
                | Each table entry is a bra.w (4 bytes), so index *= 4.
                moveq   #0,%d6
                move.b  SP_COLOR(%sp),%d6
                and.w   #0x0F,%d6
                add.w   %d6,%d6
                add.w   %d6,%d6
                lea     .LcoTable(%pc),%a6
                jmp     0(%a6,%d6.w)
 .LcoTable:
                bra.w   .LcoLoop_0
                bra.w   .LcoLoop_1
                bra.w   .LcoLoop_2
                bra.w   .LcoLoop_3
                bra.w   .LcoLoop_4
                bra.w   .LcoLoop_5
                bra.w   .LcoLoop_6
                bra.w   .LcoLoop_7
                bra.w   .LcoLoop_8
                bra.w   .LcoLoop_9
                bra.w   .LcoLoop_10
                bra.w   .LcoLoop_11
                bra.w   .LcoLoop_12
                bra.w   .LcoLoop_13
                bra.w   .LcoLoop_14
                bra.w   .LcoLoop_15
                CO_LOOP_HDR  0
                CO_LOOP_HDR  1
                CO_LOOP_HDR  2
                CO_LOOP_HDR  3
                CO_LOOP_HDR  4
                CO_LOOP_HDR  5
                CO_LOOP_HDR  6
                CO_LOOP_HDR  7
                CO_LOOP_HDR  8
                CO_LOOP_HDR  9
                CO_LOOP_HDR  10
                CO_LOOP_HDR  11
                CO_LOOP_HDR  12
                CO_LOOP_HDR  13
                CO_LOOP_HDR  14
                CO_LOOP_HDR  15
 .LcoDone:
                lea     SP_LOCAL(%sp),%sp
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts
                .align  2
 bitMaskLut:
                .byte   0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
--- a/src/port/atarist/hal.c
+++ b/src/port/atarist/hal.c
@ -526,26 +526,6 @@ void halPresent(const SurfaceT *src) {
 }
 void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
    uint16_t groupStart;
    uint16_t groupEnd;
    if (src == NULL || !gModeSet) {
        return;
    }
    refreshPaletteStateIfNeeded(src);
    // Each c2p group covers 16 horizontal pixels. Round dirty pixel
    // range to the enclosing group range to keep the planar word
    // alignment without missing edge pixels.
    groupStart = (uint16_t)(x >> 4);
    groupEnd   = (uint16_t)(((uint16_t)x + w + 15) >> 4);
    if (groupEnd > ST_GROUPS_PER_ROW) {
        groupEnd = ST_GROUPS_PER_ROW;
    }
    c2pRange(src, y, y + (int16_t)h, groupStart, groupEnd);
 }
 // Vsync() is XBIOS opcode 37; mintlib exposes it directly. It blocks
 // until the next 50 Hz (PAL) or 60 Hz (NTSC) vertical blank.
 void halWaitVBL(void) {
@ -730,6 +710,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t
 }
 bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
    (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual;
    (void)seedMatched; (void)leftXOut; (void)rightXOut;
    return false;
 }
 bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual;
    (void)markBuf;
    return false;
 }
 bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    (void)row;
    (void)leftX;
@ -798,6 +792,146 @@ bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) {
 }
 // Phase-1 planar plumbing: portData hooks declared and exported, but
 // returning NULL keeps the ST port operating in the legacy
 // chunky-with-c2p model. Phase 4 replaces this with an interleaved
 // planar buffer + stride blob, and rewrites every halFast* primitive
 // to read/write planes directly.
 void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
    (void)s;
    (void)isStage;
    return NULL;
 }
 void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
    (void)s;
    (void)isStage;
    (void)portData;
 }
 // ST planar dual-write isn't implemented yet (interleaved word-planar
 // layout needs a different code path than Amiga's separate plane
 // buffers). Stub for now; chunky shadow + c2p still drives display.
 void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)colorIndex;
 }
 void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
    (void)dst;
    (void)src;
 }
 void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
    (void)s; (void)bx; (void)by; (void)colorIndex;
 }
 void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
 }
 void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
 }
 void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
    (void)dst; (void)bx; (void)by; (void)chunkyTile;
 }
 void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
    (void)src; (void)bx; (void)by; (void)chunkyTileOut;
 }
 void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    (void)s; (void)sp; (void)x; (void)y;
 }
 void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
    (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
 }
 void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
    (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
 }
 void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
    (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
 }
 /* Phase 9 chunky reader hooks -- ST is still chunky-shadow + c2p,
 * so reads come from s->pixels just like DOS / IIgs. */
 uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    /* Two 4-bit pixels share each chunky byte: even x is the high
     * nibble, odd x is the low nibble. No bounds checking is done. */
    const uint8_t *cell = &s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];

    if ((x & 1) == 0) {
        return (uint8_t)((*cell & 0xF0u) >> 4);
    }
    return (uint8_t)(*cell & 0x0Fu);
 }
 uint32_t halSurfaceHash(const SurfaceT *s) {
    uint16_t        accLo      = 0xACE1u;
    uint16_t        accHi      = 0x1357u;
    uint16_t        groupsLeft = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
    uint16_t        idx;
    uint16_t        entry;
    const uint8_t  *bytePtr    = s->pixels;
    const uint16_t *entryPtr   = &s->palette[0][0];
    uint8_t         cur;

    /* Chunky pixels, eight bytes per pass (manually unrolled). Each
     * byte goes through a plain local because the mixer macro expands
     * its arguments more than once. */
    do {
        cur = bytePtr[0];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[1];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[2];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[3];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[4];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[5];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[6];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur = bytePtr[7];  SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        bytePtr += 8;
        groupsLeft--;
    } while (groupsLeft > 0u);

    /* SCB bytes, one per surface row. */
    for (idx = 0; idx < (uint16_t)SURFACE_HEIGHT; idx++) {
        cur = s->scb[idx];
        SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
    }

    /* Palette entries: high byte first, then low byte, so the result
     * does not depend on host byte order. */
    for (idx = 0; idx < (uint16_t)SURFACE_PALETTE_ENTRIES; idx++) {
        entry = entryPtr[idx];
        cur   = (uint8_t)(entry >> 8);
        SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
        cur   = (uint8_t)(entry & 0xFFu);
        SURFACE_HASH_MIX_BYTE(accLo, accHi, cur);
    }

    /* hi stream in the upper 16 bits, lo stream in the lower 16. */
    return (uint32_t)accLo | ((uint32_t)accHi << 16);
 }
 void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE);
 }
 bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    /* Succeeds only when the whole chunky buffer was read. */
    size_t got = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    return got == SURFACE_PIXELS_SIZE;
 }
 bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    /* Succeeds only when the whole chunky buffer was written. */
    size_t put = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    return put == SURFACE_PIXELS_SIZE;
 }
 uint8_t *halSurfaceAllocPixels(void) {
    return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
 }
 void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
 }
 uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)s; (void)planeIdx;
    return NULL;
 }
 uint8_t *halStageAllocPixels(void) {
    return (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
 }
--- a/src/port/dos/hal.c
+++ b/src/port/dos/hal.c
@ -244,21 +244,6 @@ void halPresent(const SurfaceT *src) {
 }
 void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
    int16_t py;
    int16_t yEnd;
    if (src == NULL || gVgaMem == NULL) {
        return;
    }
    uploadPaletteIfNeeded(src);
    yEnd = y + (int16_t)h;
    for (py = y; py < yEnd; py++) {
        expandAndWriteLine(src, py, x, w, &gVgaMem[py * VGA_STRIDE]);
    }
 }
 // VGA mode 13h vertical refresh on a real CRT runs at ~70 Hz. We
 // detect the start of vertical retrace by polling input status
 // register 1 ($3DA) bit 3: 1 = currently in vretrace. To get a
@ -423,6 +408,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t
 }
 bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
    (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual;
    (void)seedMatched; (void)leftXOut; (void)rightXOut;
    return false;
 }
 bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual;
    (void)markBuf;
    return false;
 }
 bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    (void)row;
    (void)leftX;
@ -499,3 +498,143 @@ uint8_t *halStageAllocPixels(void) {
 // Releases the stage's pixel buffer with free(). Passing NULL is
 // harmless because free(NULL) is a no-op.
 void halStageFreePixels(uint8_t *pixels) {
    free(pixels);
 }
 // DOS / VGA mode 13h is chunky-native (8bpp linear), so there is no
 // per-surface port data: the chunky `pixels` buffer is what feeds the
 // present-time nearest-neighbor copy to VGA RAM. Both arguments are
 // ignored and NULL is always returned.
 void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
    (void)s; (void)isStage;
    return NULL;
 }
 // No-op: ignores all three arguments and frees nothing.
 void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
    (void)s; (void)isStage; (void)portData;
 }
 // DOS has no bitplanes: chunky pixels are the source of truth and
 // expandAndWriteLine derives the VGA DAC indices straight from them.
 // The cross-platform fillRect calls this hook unconditionally, so it
 // exists here as an empty stub that ignores every argument.
 void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    (void)s; (void)x; (void)y;
    (void)w; (void)h; (void)colorIndex;
 }
 // No-op planar hook on DOS: both surfaces are left untouched.
 void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
    (void)dst; (void)src;
 }
 // No-op planar hook on DOS: ignores the surface, tile coordinates and
 // color.
 void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
    (void)s;
    (void)bx;
    (void)by;
    (void)colorIndex;
 }
 // No-op planar hook on DOS: neither surface is read or written.
 void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
    (void)dst;
    (void)dstBx;
    (void)dstBy;
    (void)src;
    (void)srcBx;
    (void)srcBy;
 }
 // No-op planar hook on DOS: neither surface is read or written and
 // transparentIndex is ignored.
 void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
    (void)dst;
    (void)dstBx;
    (void)dstBy;
    (void)src;
    (void)srcBx;
    (void)srcBy;
    (void)transparentIndex;
 }
 // No-op planar hook on DOS: chunkyTile is not read and dst is not
 // modified.
 void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
    (void)dst;
    (void)bx;
    (void)by;
    (void)chunkyTile;
 }
 // No-op planar hook on DOS: nothing is written to chunkyTileOut.
 void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
    (void)src;
    (void)bx;
    (void)by;
    (void)chunkyTileOut;
 }
 // No-op planar hook on DOS: the sprite and surface are not touched.
 void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    (void)s;
    (void)sp;
    (void)x;
    (void)y;
 }
 // No-op planar hook on DOS: srcBytes is not read and dst is not
 // modified; every argument is ignored.
 void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    (void)dst;
    (void)x;
    (void)y;
    (void)srcBytes;
    (void)srcX0;
    (void)srcY0;
    (void)copyW;
    (void)copyH;
    (void)srcRowBytes;
    (void)transparent;
 }
 // No-op planar hook on DOS: nothing is written to dstPlaneBytes.
 void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)dstPlaneBytes;
 }
 // No-op planar hook on DOS: srcPlaneBytes is not read and the surface
 // is not modified.
 void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)srcPlaneBytes;
 }
 /* Phase 9 reader hook for a chunky port: reads straight from s->pixels.
  * Each byte packs two pixels -- even x in the high nibble, odd x in the
  * low nibble -- and rows are SURFACE_BYTES_PER_ROW bytes apart. No
  * bounds or NULL checks are performed. */
 uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    uint8_t packed = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
    return (x & 1) ? (uint8_t)(packed & 0x0Fu)
                   : (uint8_t)((packed & 0xF0u) >> 4);
 }
 // Computes a 32-bit fingerprint of a surface. Two 16-bit accumulators
 // (lo, hi) start from fixed seeds; every pixel byte, then
 // SURFACE_HEIGHT bytes of s->scb, then SURFACE_PALETTE_ENTRIES 16-bit
 // palette words are folded in through SURFACE_HASH_MIX_BYTE. The result
 // is (hi << 16) | lo.
 uint32_t halSurfaceHash(const SurfaceT *s) {
    uint16_t        lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
    const uint8_t  *p;
    const uint16_t *w;
    uint8_t         b;
    p      = s->pixels;
    // Pixels are consumed in manually unrolled 8-byte blocks. Any
    // SURFACE_PIXELS_SIZE % 8 tail bytes would not be hashed, and the
    // do/while assumes the block count is at least 1 and fits 16 bits.
    blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
    do {
        // Each byte is staged in `b` so the macro argument is a plain
        // variable with no side effects.
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        blocks--;
    } while (blocks > 0u);
    // One SCB byte per scanline.
    p = s->scb;
    for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
    }
    // Palette words are walked as a flat array starting at [0][0] and
    // mixed high byte first, so the hash does not depend on host byte
    // order.
    w = &s->palette[0][0];
    for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
        v  = *w++;
        b  = (uint8_t)((v >> 8) & 0xFFu);  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b  = (uint8_t)(v & 0xFFu);         SURFACE_HASH_MIX_BYTE(lo, hi, b);
    }
    return ((uint32_t)hi << 16) | (uint32_t)lo;
 }
 // Copies all SURFACE_PIXELS_SIZE bytes of chunky pixel data from src to
 // dst with memcpy, so the two buffers must not overlap. Only `pixels`
 // is copied.
 void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    void       *to   = dst->pixels;
    const void *from = src->pixels;
    memcpy(to, from, SURFACE_PIXELS_SIZE);
 }
 // Reads SURFACE_PIXELS_SIZE bytes from fp into dst->pixels. Returns
 // true only when the full amount was read; a short read returns false
 // and may leave dst->pixels partially overwritten.
 bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    size_t got = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);
    return got == SURFACE_PIXELS_SIZE;
 }
 // Writes the SURFACE_PIXELS_SIZE bytes of src->pixels to fp. Returns
 // true only when every byte was accepted by fwrite.
 bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    size_t put = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);
    return put == SURFACE_PIXELS_SIZE;
 }
 // Allocates SURFACE_PIXELS_SIZE bytes of pixel storage for a surface.
 // calloc zero-fills the buffer; the result is NULL if allocation fails.
 uint8_t *halSurfaceAllocPixels(void) {
    void *zeroed = calloc(1, SURFACE_PIXELS_SIZE);
    return (uint8_t *)zeroed;
 }
 // Releases a pixel buffer with free(). Passing NULL is harmless because
 // free(NULL) is a no-op.
 void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
 }
 // DOS has no bitplanes, so there is never a plane pointer to hand out:
 // both arguments are ignored and NULL is returned.
 uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)s;
    (void)planeIdx;
    return NULL;
 }
--- a/src/port/iigs/hal.c
+++ b/src/port/iigs/hal.c
@ -26,12 +26,25 @@
 // crowd up against the 64 KB-per-bank limit).
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include "joey/debug.h"
 #include "hal.h"
 #include "surfaceInternal.h"
 /* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick
 * ($2503) and returns the low 16 bits of the system's tick counter
 * (firmware VBL ISR-driven). Polling $C019 from C user code missed
 * transitions for any op over ~1 ms; the system's tick counter is
 * updated by the actual interrupt handler so it stays accurate
 * regardless of caller polling rate. Tick rate matches the video
 * field rate -- 60 Hz on NTSC, 50 Hz on PAL. */
 extern uint16_t iigsGetTickWord(void);
 /* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. */
 extern uint16_t iigsReadHzParam(void);
 static uint16_t gFrameHz = 60u;
 // hal.c is the single TU that calls into joeyDraw.asm. Cross-
 // platform draw.c / tile.c / etc. dispatch through halFast*
 // functions defined here; they never reference the asm symbols
@ -210,6 +223,7 @@ bool halInit(const JoeyConfigT *config) {
    // is unreliable from halInit's calling context, so we don't try
    // it here -- the first present will set up SCB to 320 mode.
    iigsInitRowLut();
    gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u;
    gModeSet = true;
    return true;
 }
@ -234,40 +248,6 @@ void halPresent(const SurfaceT *src) {
 }
 // Presents the rectangle (x, y, w, h) by handing a byte-aligned run per
 // scanline to one of the iigsBlitRectStageToShr* helpers. Does nothing
 // when src is NULL. src is used only for the NULL check and the
 // SCB/palette upload; the pixel source offset is built from the fixed
 // 0x2000 base rather than from src->pixels. The upload runs before the
 // empty-rect check, so it also happens for zero-sized rects.
 void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
    uint16_t copyBytes;
    int16_t  byteStart;
    uint16_t srcOffset;
    if (src == NULL) {
        return;
    }
    uploadScbAndPaletteIfNeeded(src);
    // Pixel copy: byte-aligned runs per scanline. x is always >= 0
    // after API-level clipping. Use unsigned shifts to avoid
    // ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t.
    // Two pixels share a byte: byteStart rounds x down, and the end
    // (x + w) is rounded up so a partially covered last byte is included.
    byteStart = (int16_t)((uint16_t)x >> 1);
    copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart);
    if (copyBytes == 0 || h == 0) {
        return;
    }
    // Pixel copy: prefer the PEI-slam variant when the rect satisfies
    // its contract (copyBytes even, 2..80). Sprite-rect presents
    // (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or
    // odd-byte rects fall back to MVN, which has no width cap.
    srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart);
    if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) {
        iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h);
    } else {
        iigsBlitRectStageToShr(srcOffset, copyBytes, h);
    }
 }
 void halShutdown(void) {
    if (gModeSet) {
        *IIGS_NEWVIDEO_REG = gPreviousNewVideo;
@ -305,6 +285,142 @@ void halStageFreePixels(uint8_t *pixels) {
 }
 // IIgs is chunky-native, so there is no per-surface port data. The
 // chunky `pixels` buffer at $01:2000 is the stage's pixel storage and
 // the source for stagePresent's PEI-slam to $E1. Both arguments are
 // ignored and NULL is always returned.
 void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
    (void)s; (void)isStage;
    return NULL;
 }
 // No-op: ignores all three arguments and frees nothing.
 void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
    (void)s; (void)isStage; (void)portData;
 }
 // IIgs SHR is chunky-native; there are no bitplanes to update, so this
 // hook is an empty stub that ignores every argument.
 void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    (void)s; (void)x; (void)y;
    (void)w; (void)h; (void)colorIndex;
 }
 // No-op planar hook on IIgs: both surfaces are left untouched.
 void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
    (void)dst; (void)src;
 }
 // No-op planar hook on IIgs: ignores the surface, tile coordinates and
 // color.
 void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
    (void)s;
    (void)bx;
    (void)by;
    (void)colorIndex;
 }
 // No-op planar hook on IIgs: neither surface is read or written.
 void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
    (void)dst;
    (void)dstBx;
    (void)dstBy;
    (void)src;
    (void)srcBx;
    (void)srcBy;
 }
 // No-op planar hook on IIgs: neither surface is read or written and
 // transparentIndex is ignored.
 void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
    (void)dst;
    (void)dstBx;
    (void)dstBy;
    (void)src;
    (void)srcBx;
    (void)srcBy;
    (void)transparentIndex;
 }
 // No-op planar hook on IIgs: chunkyTile is not read and dst is not
 // modified.
 void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
    (void)dst;
    (void)bx;
    (void)by;
    (void)chunkyTile;
 }
 // No-op planar hook on IIgs: nothing is written to chunkyTileOut.
 void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
    (void)src;
    (void)bx;
    (void)by;
    (void)chunkyTileOut;
 }
 // No-op planar hook on IIgs: the sprite and surface are not touched.
 void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    (void)s;
    (void)sp;
    (void)x;
    (void)y;
 }
 // No-op planar hook on IIgs: srcBytes is not read and dst is not
 // modified; every argument is ignored.
 void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    (void)dst;
    (void)x;
    (void)y;
    (void)srcBytes;
    (void)srcX0;
    (void)srcY0;
    (void)copyW;
    (void)copyH;
    (void)srcRowBytes;
    (void)transparent;
 }
 // No-op planar hook on IIgs: nothing is written to dstPlaneBytes.
 void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)dstPlaneBytes;
 }
 // No-op planar hook on IIgs: srcPlaneBytes is not read and the surface
 // is not modified.
 void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)srcPlaneBytes;
 }
 /* Phase 9 chunky reader hook: the IIgs reads from s->pixels just as
  * the legacy paths did. Each byte packs two pixels -- even x in the
  * high nibble, odd x in the low nibble -- and rows are
  * SURFACE_BYTES_PER_ROW bytes apart. No bounds or NULL checks are
  * performed. */
 uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    uint8_t packed = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
    return (x & 1) ? (uint8_t)(packed & 0x0Fu)
                   : (uint8_t)((packed & 0xF0u) >> 4);
 }
 // Computes a 32-bit fingerprint of a surface. Two 16-bit accumulators
 // (lo, hi) start from fixed seeds; every pixel byte, then
 // SURFACE_HEIGHT bytes of s->scb, then SURFACE_PALETTE_ENTRIES 16-bit
 // palette words are folded in through SURFACE_HASH_MIX_BYTE. The result
 // is (hi << 16) | lo.
 uint32_t halSurfaceHash(const SurfaceT *s) {
    uint16_t        lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
    const uint8_t  *p;
    const uint16_t *w;
    uint8_t         b;
    p      = s->pixels;
    // Pixels are consumed in manually unrolled 8-byte blocks. Any
    // SURFACE_PIXELS_SIZE % 8 tail bytes would not be hashed, and the
    // do/while assumes the block count is at least 1 and fits 16 bits.
    blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
    do {
        // Each byte is staged in `b` so the macro argument is a plain
        // variable with no side effects.
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        blocks--;
    } while (blocks > 0u);
    // One SCB byte per scanline.
    p = s->scb;
    for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
    }
    // Palette words are walked as a flat array starting at [0][0] and
    // mixed high byte first, so the hash does not depend on host byte
    // order.
    w = &s->palette[0][0];
    for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
        v  = *w++;
        b  = (uint8_t)((v >> 8) & 0xFFu);  SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b  = (uint8_t)(v & 0xFFu);         SURFACE_HASH_MIX_BYTE(lo, hi, b);
    }
    return ((uint32_t)hi << 16) | (uint32_t)lo;
 }
 // Copies all SURFACE_PIXELS_SIZE bytes of chunky pixel data from src to
 // dst with memcpy, so the two buffers must not overlap. Only `pixels`
 // is copied.
 void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    void       *to   = dst->pixels;
    const void *from = src->pixels;
    memcpy(to, from, SURFACE_PIXELS_SIZE);
 }
 // Reads SURFACE_PIXELS_SIZE bytes from fp into dst->pixels. Returns
 // true only when the full amount was read; a short read returns false
 // and may leave dst->pixels partially overwritten.
 bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    size_t got = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);
    return got == SURFACE_PIXELS_SIZE;
 }
 // Writes the SURFACE_PIXELS_SIZE bytes of src->pixels to fp. Returns
 // true only when every byte was accepted by fwrite.
 bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    size_t put = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);
    return put == SURFACE_PIXELS_SIZE;
 }
 // Allocates SURFACE_PIXELS_SIZE bytes of pixel storage for a surface.
 // calloc zero-fills the buffer; the result is NULL if allocation fails.
 uint8_t *halSurfaceAllocPixels(void) {
    void *zeroed = calloc(1, SURFACE_PIXELS_SIZE);
    return (uint8_t *)zeroed;
 }
 // Releases a pixel buffer with free(). Passing NULL is harmless because
 // free(NULL) is a no-op.
 void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
 }
 // The IIgs port keeps no bitplanes, so there is never a plane pointer
 // to hand out: both arguments are ignored and NULL is returned.
 uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)s;
    (void)planeIdx;
    return NULL;
 }
 // $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active
 // scan. To produce a rising-edge wait (one VBL per call), first spin
 // while VBL is currently active (bit 7 = 0), then spin until VBL
@ -333,24 +449,11 @@ void halWaitVBL(void) {
 // byte and the counter never advances. The explicit lda > / sta >
 // pattern uses long-mode addressing throughout, which is
 // DBR-independent.
 static uint16_t gFrameCount  = 0;
 static uint8_t  gPrevInVbl   = 0;
 uint16_t halFrameCount(void) {
-    uint8_t  now;
+    return iigsGetTickWord();
    uint16_t cnt;
    now = (*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0;
    if (now && !gPrevInVbl) {
        cnt = gFrameCount;
        cnt = (uint16_t)(cnt + 1u);
        gFrameCount = cnt;
    }
    gPrevInVbl = now;
    return gFrameCount;
 }
 uint16_t halFrameHz(void) {
-    return 60u;
+    return gFrameHz;
 }
--- a/src/port/iigs/peislam.asm
+++ b/src/port/iigs/peislam.asm
@ -1,15 +1,66 @@
-* peislam.asm - placeholder.
+* peislam.asm - originally a PEI-slam helper, now hosts the GetTick
-*
+* and ReadBParam trampolines. The PEI-slam logic was rolled into
-* The original PEI-slam-per-row helper was removed; its functionality
+* iigsBlitStageToShr in joeyDraw.asm.
 * was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam
 * with per-row dirty skip). This stub remains so the build's
 * PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load
 * segment and the linker keeps the same segment-bank layout it had
 * when peislam.asm was a real translation unit.
                keep    PEISLAM
                case    on
 * Stub kept so the PEISLAM load segment stays present (the build's
 * PORT_ASM_SRCS_ALL wildcard pulls in this file by name). The body is
 * a single RTL, so calling it returns immediately.
 peislamStub     start   IIGSASM
                rtl
                end
 ****************************************************************
 * uint16_t iigsGetTickWord(void)
 *
 * Calls Misc Toolset GetTick ($2503) and returns the low 16 bits of
 * the 32-bit tick counter. The system increments this counter from
 * the actual VBL hardware interrupt, so it stays accurate regardless
 * of caller polling rate -- C-side polling of $C019 missed transitions
 * for any op over ~1 ms.
 *
 * GetTick output convention: caller pushes 4 bytes of output space,
 * tool dispatcher writes the LongWord into them. We pull the low 16
 * bits into A (ORCA-C Word return convention -- A holds the result,
 * not Y; verified against jIIgs.asm asmGetVbl) and discard the high
 * 16 into X.
 *
 * ORCA-C cdecl ABI: caller has M=I=16. Word return in A.
 *
 * The two PHAs only reserve stack space; the value pushed is whatever
 * A held on entry. X is overwritten (tool number, then the discarded
 * high word). Any error status from the tool dispatcher is not
 * checked.
 ****************************************************************
 iigsGetTickWord start IIGSASM
                pha             ; output space high word
                pha             ; output space low word
                ldx     #$2503  ; _GetTick
                jsl     $E10000
                pla             ; A = low 16 bits (return value)
                plx             ; discard high 16 bits
                rtl
                end
 ****************************************************************
 * uint16_t iigsReadHzParam(void)
 *
 * Reads battery RAM parameter hrtz50or60 ($1D) via _ReadBParam ($0C03)
 * and returns the raw value: 0 = NTSC (60 Hz), 1 = PAL (50 Hz).
 *
 * GetTick fires from the hardware VBL ISR, so its rate matches the
 * video field rate -- 60 Hz on NTSC, 50 Hz on PAL. halFrameHz must
 * report whichever this machine actually runs so wall-clock math
 * (frames * 1000 / halFrameHz) is correct on both.
 *
 * Stack protocol: one Word of result space is reserved first, then
 * the parameter ID is pushed as the input. After the call only the
 * result Word is pulled, into A. X is overwritten with the tool
 * number, and any error status from the tool dispatcher is not
 * checked.
 ****************************************************************
 iigsReadHzParam start IIGSASM
                pha             ; output space (Word)
                pea     $001D   ; hrtz50or60 parameter ID
                ldx     #$0C03  ; _ReadBParam
                jsl     $E10000
                pla             ; A = result (ORCA-C Word return)
                rtl
                end
--- a/src/shared68k/surface68k.s
+++ b/src/shared68k/surface68k.s
@ -253,3 +253,253 @@ _surface68kFillRectByteAligned:
 .Lfrb_done:
                movem.l (%sp)+,%d2-%d6
                rts
 | ----------------------------------------------------------------
 | void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1,
 |                                uint8_t *p2, uint8_t *p3,
 |                                uint16_t numMid,
 |                                uint8_t  leftMask, uint8_t rightMask,
 |                                uint8_t  fb0, uint8_t fb1,
 |                                uint8_t  fb2, uint8_t fb3);
 |
 | Fill ONE planar row across 4 planes -- the per-row body of
 | halFillRectPlanes lifted into asm. Each pN points at the leading
 | byte (already advanced by planeBase + y*40 + byteFirst on the C
 | side). leftMask and rightMask are the partial-byte masks for the
 | left/right edges; numMid is the count of full bytes between them.
 | fbN is 0x00 or 0xFF, the per-plane fill byte (caller pre-classifies
 | (colorIndex >> N) & 1 -> 0xFF or 0x00).
 |
 | Used by Amiga halFastFillCircle (one call per scanline span) and
 | Amiga halFillRectPlanes (one call per row of the rect). Replaces
 | the C inner loop whose ~13 cyc/byte was the gating cost on
 | fillCircle r=40 even after C-side inlining.
 |
 | Mask convention is uniform for all planes:
 |   leading byte  := (*p & ~leftMask)  | (fbN & leftMask)
 |   middle bytes  := fbN
 |   trailing byte := (*p & ~rightMask) | (fbN & rightMask)
 | -- branchless: the same arithmetic produces "set" or "clear" based
 | on whether fbN is 0xFF or 0x00.
 |
 | ABI: m68k cdecl. d2-d7/a2-a6 callee-save (movem'd here).
 | Stack offset to first arg after MOVEM: 11 regs * 4 = 44 bytes saved
 | + 4 ret PC = 48.
 | ----------------------------------------------------------------
                .globl  _surface68kFillSpan4Planes
                .equ    SP_SAVED, 44
                .equ    SP_RPC,    4
                .equ    SP_OFF,   (SP_SAVED + SP_RPC)
                .equ    SP_P0,    SP_OFF + 0
                .equ    SP_P1,    SP_OFF + 4
                .equ    SP_P2,    SP_OFF + 8
                .equ    SP_P3,    SP_OFF + 12
                .equ    SP_NMID,  SP_OFF + 16 + 2  | int -> low word at +2
                .equ    SP_LMASK, SP_OFF + 20 + 3  | int -> low byte at +3
                .equ    SP_RMASK, SP_OFF + 24 + 3
                .equ    SP_FB0,   SP_OFF + 28 + 3
                .equ    SP_FB1,   SP_OFF + 32 + 3
                .equ    SP_FB2,   SP_OFF + 36 + 3
                .equ    SP_FB3,   SP_OFF + 40 + 3
 | Per-plane work, written out inline once per plane (no assembler
 | macro is used). Each copy differs only in:
 |   plane_an  = the address register holding this plane's pointer.
 |   fb_off    = the stack offset for this plane's fillByte.
 | Register use: d5 = this plane's fill byte, d6/d7 = scratch;
 | d1=leftMask, d2=~leftMask, d3=rightMask, d4=~rightMask;
 | d0=numMid-1 (only valid if numMid > 0). When numMid was 0 at entry,
 | a single up-front branch to .LfsNoMid selects a second unrolled copy
 | of all four planes that has no mid loop.
 |
 | Hand-unrolled 4x rather than using bsr+rts to dodge ~12 cyc per
 | return + the per-plane re-test of numMid that the previous build
 | paid. The mid-loop label suffix is the plane index so all four
 | inline copies can coexist without label collisions.
 |
 | Plain text version of the per-plane body (translate to asm 4x with
 | different a-regs and fb stack offsets):
 |
 |   move.b  (an),%d6
 |   and.b   %d2,%d6
 |   move.b  fb,%d7
 |   and.b   %d1,%d7
 |   or.b    %d7,%d6
 |   move.b  %d6,(an)+
 |   < if has-middle path: >
 |     move.w  %d0,%d7
 |   .midN:
 |     move.b  fb,(an)+
 |     dbra    %d7,.midN
 |   < trailing: >
 |   move.b  (an),%d6
 |   and.b   %d4,%d6
 |   move.b  fb,%d7
 |   and.b   %d3,%d7
 |   or.b    %d7,%d6
 |   move.b  %d6,(an)
 | Entry point. Fills one row span in each of four planes: a masked
 | leading byte, numMid whole bytes, and a masked trailing byte.
 | NOTE: both paths always write the leading AND the trailing byte, so
 | the span must cover at least two bytes per plane; a span confined to
 | a single byte is not handled here -- TODO confirm callers never pass
 | one.
 _surface68kFillSpan4Planes:
                | Save callee-saved registers. The SP_* offsets assume
                | exactly these 11 longwords are on the stack.
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                | d1/d2 = leftMask / ~leftMask, d3/d4 = rightMask / ~rightMask.
                move.b  SP_LMASK(%sp),%d1
                move.b  %d1,%d2
                not.b   %d2
                move.b  SP_RMASK(%sp),%d3
                move.b  %d3,%d4
                not.b   %d4
                | a0-a3 = leading-byte pointers for planes 0-3.
                move.l  SP_P0(%sp),%a0
                move.l  SP_P1(%sp),%a1
                move.l  SP_P2(%sp),%a2
                move.l  SP_P3(%sp),%a3
                | One-time numMid test. d0.w = numMid; if 0 jump to
                | the no-middle entry, otherwise pre-decrement for dbra
                | and fall into the with-middle entry. Both paths
                | unroll all 4 planes.
                move.w  SP_NMID(%sp),%d0
                beq     .LfsNoMid
                subq.w  #1,%d0
                | ---- WITH-MIDDLE PATH ----
                | Plane 0
                | Leading byte: keep the bits outside leftMask, take the
                | fill bits inside it. d5 holds this plane's fill byte
                | for the mid loop and the trailing byte.
                move.b  (%a0),%d6
                and.b   %d2,%d6
                move.b  SP_FB0(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a0)+
                | Middle bytes: d7 = numMid-1, so dbra stores numMid bytes.
                move.w  %d0,%d7
 .LfsMid0:       move.b  %d5,(%a0)+
                dbra    %d7,.LfsMid0
                | Trailing byte: same merge using rightMask; the pointer
                | is not advanced afterwards.
                move.b  (%a0),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a0)
                | Plane 1 (same sequence as plane 0, using a1 / SP_FB1)
                move.b  (%a1),%d6
                and.b   %d2,%d6
                move.b  SP_FB1(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a1)+
                move.w  %d0,%d7
 .LfsMid1:       move.b  %d5,(%a1)+
                dbra    %d7,.LfsMid1
                move.b  (%a1),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a1)
                | Plane 2 (same sequence, using a2 / SP_FB2)
                move.b  (%a2),%d6
                and.b   %d2,%d6
                move.b  SP_FB2(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a2)+
                move.w  %d0,%d7
 .LfsMid2:       move.b  %d5,(%a2)+
                dbra    %d7,.LfsMid2
                move.b  (%a2),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a2)
                | Plane 3 (same sequence, using a3 / SP_FB3)
                move.b  (%a3),%d6
                and.b   %d2,%d6
                move.b  SP_FB3(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a3)+
                move.w  %d0,%d7
 .LfsMid3:       move.b  %d5,(%a3)+
                dbra    %d7,.LfsMid3
                move.b  (%a3),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a3)
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts
 .LfsNoMid:
                | ---- NO-MIDDLE PATH (just leading + trailing) ----
                | The trailing byte is the byte immediately after the
                | leading one; both are always written.
                | Plane 0
                move.b  (%a0),%d6
                and.b   %d2,%d6
                move.b  SP_FB0(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a0)+
                move.b  (%a0),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a0)
                | Plane 1
                move.b  (%a1),%d6
                and.b   %d2,%d6
                move.b  SP_FB1(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a1)+
                move.b  (%a1),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a1)
                | Plane 2
                move.b  (%a2),%d6
                and.b   %d2,%d6
                move.b  SP_FB2(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a2)+
                move.b  (%a2),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a2)
                | Plane 3
                move.b  (%a3),%d6
                and.b   %d2,%d6
                move.b  SP_FB3(%sp),%d5
                move.b  %d5,%d7
                and.b   %d1,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a3)+
                move.b  (%a3),%d6
                and.b   %d4,%d6
                move.b  %d5,%d7
                and.b   %d3,%d7
                or.b    %d7,%d6
                move.b  %d6,(%a3)
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts
--- a/tools/diff-uber-hashes
+++ b/tools/diff-uber-hashes
@ -0,0 +1,93 @@
 #!/usr/bin/env python3
 """Compare two UBER joeylog.txt files by per-op surface hash.
 Used by the planar 68k rewrite (project_planar_68k_plan.md): IIgs
 captures the golden reference, each 68k port re-runs UBER after a
 primitive conversion, and this tool tells you which ops produced
 different pixels. Without this, "looks right visually" misses the
 subtle mismatches that cascade into hard-to-debug corruption.
 Usage:
    tools/diff-uber-hashes <reference-log> <test-log>
 Exit code:
    0 = all hashes match
    1 = at least one mismatch
    2 = usage error or missing file
 """
 import re
 import sys
 # Match e.g.:
 #   UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4
 # Named groups: "op" is the text between "UBER:" and the next colon,
 # "hash" is the run of hex digits after "hash=".
 LINE_RE = re.compile(
    r"UBER:\s+(?P<op>[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+\d+\s+ops/sec\s+\|\s+hash=(?P<hash>[0-9A-Fa-f]+)"
 )
 def parse_log(path):
    """Return ordered dict {op_name: hash} from a UBER log file.
    Multiple runs may be concatenated in the same log (joeyLog appends)
    -- in that case the LAST hash for each op wins, matching the most
    recent run.

    Op names are stripped of surrounding whitespace and hashes are
    upper-cased so logs that differ only in hex case compare equal.
    Lines that do not match LINE_RE are ignored. Raises OSError if the
    file cannot be opened.
    """
    hashes = {}
    # errors="replace": a stray undecodable byte in a log captured on a
    # retro target must not abort the parse with UnicodeDecodeError
    # (which is not an OSError, so main would not turn it into exit
    # code 2). The UBER lines themselves are plain ASCII.
    with open(path, errors="replace") as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                hashes[m.group("op").strip()] = m.group("hash").upper()
    return hashes
 def main(argv):
    """Compare the per-op hashes of two UBER logs and print a report.

    argv is the full argument vector (program name, reference log, test
    log). Returns the process exit code: 0 when every reference op has
    an identical hash in the test log and the test log has no extra
    ops, 1 when anything is missing, different or extra, and 2 for a
    usage error, an unreadable file, or a log with no UBER hash lines.
    """
    if len(argv) != 3:
        sys.stderr.write(
            "usage: diff-uber-hashes <reference-log> <test-log>\n"
        )
        return 2
    ref_path, test_path = argv[1], argv[2]
    try:
        ref = parse_log(ref_path)
        test = parse_log(test_path)
    except OSError as e:
        sys.stderr.write(f"error: {e}\n")
        return 2
    # Reference log is checked first, then the test log.
    for log_path, table in ((ref_path, ref), (test_path, test)):
        if not table:
            sys.stderr.write(f"error: no UBER hash lines found in {log_path}\n")
            return 2
    matches = 0
    mismatches = 0
    for op, ref_hash in ref.items():
        test_hash = test.get(op)
        if test_hash is None:
            # An op absent from the test log counts as a mismatch.
            print(f"  MISSING in test: {op}  (ref={ref_hash})")
            mismatches += 1
            continue
        if test_hash == ref_hash:
            matches += 1
            continue
        print(f"  MISMATCH {op}: ref={ref_hash}  test={test_hash}")
        mismatches += 1
    extras = [op for op in test if op not in ref]
    for op in extras:
        print(f"  EXTRA in test: {op}  (test={test[op]})")
    total = len(ref) + len(extras)
    print()
    if mismatches or extras:
        print(f"FAIL: {matches} match, {mismatches} mismatch, {len(extras)} extras")
        return 1
    print(f"OK: {matches}/{total} ops match")
    return 0
 if __name__ == "__main__":
    sys.exit(main(sys.argv))
--- a/tools/diff-uber-perf
+++ b/tools/diff-uber-perf
@ -0,0 +1,132 @@
 #!/usr/bin/env python3
 """Compare two UBER joeylog.txt files by per-op ops/sec.
 Sibling of diff-uber-hashes (which compares pixel correctness). This
 tool drives Phase 10 of project_planar_68k_plan.md: pick the
 biggest perf gaps vs the IIgs reference and target asm/algorithmic
 optimization at those.
 Usage:
    tools/diff-uber-perf <reference-log> <test-log> [--threshold 1.0]
 Output is sorted by speed ratio (test/ref) ascending, so the worst
 gaps print first. Ops missing from either log are flagged. The
 threshold flag (default 1.0) marks ops below that ratio as FAIL --
 project_perf_directive.md says "IIgs is the perf floor; every
 other target must match or beat it", so parity = 1.0x. Use
 --threshold 0.8 for the looser acceptance bar in project_planar_68k_plan.md.
 Exit code:
    0 = all common ops at >= threshold
    1 = at least one op below threshold (or missing)
    2 = usage error or missing file
 """
 import re
 import sys
 # Pattern for one UBER result line. It is unanchored, so with search()
 # it matches anywhere inside a log line. Named groups:
 #   op  -- the op label: everything between "UBER:" and the next colon
 #          (so a label cannot itself contain a colon)
 #   ops -- the integer ops/sec figure
 # The iteration and frame counts must be present but are not captured,
 # and anything after "ops/sec" (such as the hash field) is ignored.
 # Match e.g.:
 #   UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4
 LINE_RE = re.compile(
    r"UBER:\s+(?P<op>[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+(?P<ops>\d+)\s+ops/sec"
 )
 def parse_log(path):
    """Return a dict {op_name: ops_per_sec} parsed from a UBER log file.
    Each line is searched with the module-level LINE_RE; lines that do
    not match are skipped. Op names are stripped of surrounding
    whitespace and the ops/sec figure is converted to int.
    Multiple runs may be concatenated (joeyLog appends); last value
    for each op wins, matching the most recent run. Keys stay in the
    order each op was first seen.
    Raises OSError if the file cannot be opened or read.
    """
    perf = {}
    # Decode explicitly and tolerate stray bytes: with the locale default
    # and strict errors, a single undecodable byte raises
    # UnicodeDecodeError, which main()'s "except OSError" does not catch.
    # The fields matched by LINE_RE are plain ASCII, so replacing bad
    # bytes elsewhere on a line does not change what is extracted.
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                perf[m.group("op").strip()] = int(m.group("ops"))
    return perf
 def main(argv):
    """Compare per-op ops/sec between a reference log and a test log.
    argv is the full argument vector (argv[0] is the program name): two
    positional log paths plus an optional "--threshold <float>" pair
    that may appear anywhere among them. The threshold must be a finite
    number; it defaults to 1.0.
    Prints a table with MISSING ops first, then FAIL rows, then ok rows
    (each group ascending by test/ref ratio), then ops that exist only
    in the test log (EXTRA, in log order), and a summary line.
    Returns the process exit code:
        0 = every reference op is in the test log at >= threshold
        1 = at least one op is below threshold or missing from the test log
        2 = usage error, bad threshold, unreadable file, or a log with
            no UBER lines
    EXTRA ops are informational and do not affect the exit code.
    """
    import math  # local import: only needed to validate --threshold
    threshold = 1.0
    args = []
    i = 1
    while i < len(argv):
        if argv[i] == "--threshold" and i + 1 < len(argv):
            try:
                threshold = float(argv[i + 1])
                # float() happily parses "nan" and "inf". A NaN threshold
                # makes every "ratio >= threshold" test false, so every op
                # would be reported as FAIL; treat non-finite values as bad.
                if not math.isfinite(threshold):
                    raise ValueError(argv[i + 1])
            except ValueError:
                sys.stderr.write(f"error: bad threshold {argv[i+1]}\n")
                return 2
            i += 2
        else:
            # Everything else (including a trailing "--threshold" with no
            # value) is positional and is caught by the count check below.
            args.append(argv[i])
            i += 1
    if len(args) != 2:
        sys.stderr.write(
            "usage: diff-uber-perf <reference-log> <test-log> [--threshold 1.0]\n"
        )
        return 2
    try:
        ref = parse_log(args[0])
        test = parse_log(args[1])
    except OSError as e:
        sys.stderr.write(f"error: {e}\n")
        return 2
    if not ref:
        sys.stderr.write(f"error: no UBER lines found in {args[0]}\n")
        return 2
    if not test:
        sys.stderr.write(f"error: no UBER lines found in {args[1]}\n")
        return 2
    # Each row is (op, ref_ops, test_ops, ratio, status); None marks a
    # value that does not exist for that row.
    rows = []
    for op, ref_ops in ref.items():
        test_ops = test.get(op)
        if test_ops is None:
            rows.append((op, ref_ops, None, None, "MISSING"))
            continue
        if ref_ops == 0:
            # Avoid dividing by zero: any progress beats a zero reference,
            # and zero vs. zero is treated as parity.
            ratio = float("inf") if test_ops > 0 else 1.0
        else:
            ratio = test_ops / ref_ops
        status = "ok" if ratio >= threshold else "FAIL"
        rows.append((op, ref_ops, test_ops, ratio, status))
    # Extras are kept out of `rows` so they are neither sorted nor counted;
    # they are appended after the sorted rows when printing.
    extras = [(op, None, test[op], None, "EXTRA") for op in test if op not in ref]
    # Sort: missing first, then fail by worst ratio, then ok ascending by
    # ratio; the op name breaks ties.
    def sort_key(row):
        op, _refv, _testv, ratio, status = row
        if status == "MISSING":
            return (0, 0.0, op)
        return (1 if status == "FAIL" else 2, ratio, op)
    rows.sort(key=sort_key)
    # Column width: the longest op name from either log, but never
    # narrower than the "op" header. Both dicts are non-empty here.
    op_w = max(len(name) for name in (*ref, *test, "op"))
    print(f"{'op':<{op_w}}  {'ref':>10}  {'test':>10}  {'ratio':>7}  status")
    print(f"{'-'*op_w}  {'-'*10}  {'-'*10}  {'-'*7}  ------")
    fails = 0
    for op, refv, testv, ratio, status in rows + extras:
        refs = "" if refv is None else str(refv)
        tests = "" if testv is None else str(testv)
        rats = "" if ratio is None else f"{ratio:.2f}x"
        print(f"{op:<{op_w}}  {refs:>10}  {tests:>10}  {rats:>7}  {status}")
        if status in ("FAIL", "MISSING"):
            fails += 1
    print()
    print(f"threshold: {threshold:.2f}x  ({len(rows)} ops compared, {fails} below threshold)")
    return 1 if fails > 0 else 0
 if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes
    # the process exit status.
    raise SystemExit(main(sys.argv))