Amiga parity with IIgs!

2026-05-03 01:44:39 -05:00 · 2026-05-03 01:44:39 -05:00 · b1e24b4650
commit b1e24b4650
parent 6c03d93e88
37 changed files with 4312 additions and 493 deletions
--- a/examples/audio/audio.c
+++ b/examples/audio/audio.c
@ -171,11 +171,11 @@ int main(void) {

        if (flashFrames > 0) {
            fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_BAR);
-            stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H);
+            stagePresent();
            flashFrames--;
            if (flashFrames == 0) {
                fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_HINT);
-                stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H);
+                stagePresent();
            }
        }
    }
--- a/examples/joy/joy.c
+++ b/examples/joy/joy.c
@ -80,8 +80,10 @@ static void buildPalette(SurfaceT *screen) {


 static void drawAndPresent(SurfaceT *screen, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t color) {
+    /* fillRect marks the rect dirty; stagePresent flushes only that
+     * dirty band. */
    fillRect(screen, x, y, (uint16_t)w, (uint16_t)h, color);
-    stagePresentRect(x, y, (uint16_t)w, (uint16_t)h);
+    stagePresent();
 }


--- a/examples/keys/keys.c
+++ b/examples/keys/keys.c
@ -158,8 +158,6 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur
    int16_t  row;
    JoeyKeyE key;
    bool     lit;
-    int16_t  x;
-    int16_t  y;

    for (row = 0; row < GRID_ROWS; row++) {
        for (col = 0; col < GRID_COLS; col++) {
@ -171,10 +169,10 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur
            if (lit == gCellLit[row][col]) {
                continue;
            }
+            /* drawCell marks the cell's rect dirty; stagePresent
+             * flushes that one band. */
            drawCell(screen, col, row, lit);
-            x = (int16_t)(MARGIN_X + col * (CELL_W + GAP));
-            y = (int16_t)(MARGIN_Y + row * (CELL_H + GAP));
-            stagePresentRect(x, y, CELL_W, CELL_H);
+            stagePresent();
            gCellLit[row][col] = lit;
        }
    }
@ -195,19 +193,16 @@ static void updateCursor(SurfaceT *screen, int16_t cursorCol, int16_t cursorRow)
    if (gLastCursorX != mouseX || gLastCursorY != mouseY) {
        if (gLastCursorCol != CELL_NONE) {
            drawCell(screen, gLastCursorCol, gLastCursorRow, gCellLit[gLastCursorRow][gLastCursorCol]);
-            stagePresentRect(
-                (int16_t)(MARGIN_X + gLastCursorCol * (CELL_W + GAP)),
-                (int16_t)(MARGIN_Y + gLastCursorRow * (CELL_H + GAP)),
-                CELL_W, CELL_H);
        } else if (gLastCursorX >= 0 && gLastCursorY >= 0) {
            // Old cursor was in a gap region. Stamp background over it.
            fillRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H, COLOR_BACKGROUND);
-            stagePresentRect(gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H);
        }
    }

    drawCursor(screen, mouseX, mouseY);
-    stagePresentRect(mouseX, mouseY, CURSOR_W, CURSOR_H);
+    /* All draw calls above marked their rects dirty; one stagePresent
+     * flushes the union (cursor erase + cursor draw). */
+    stagePresent();

    gLastCursorX   = mouseX;
    gLastCursorY   = mouseY;
--- a/examples/sprite/sprite.c
+++ b/examples/sprite/sprite.c
@ -15,11 +15,11 @@
 #define BALL_TILES_Y      (BALL_H / 8)

 #define BALL_TILE_BYTES   (BALL_TILES_X * BALL_TILES_Y * TILE_BYTES)
-// SaveUnder must store rounded-up byte boundaries: x rounded down to
-// even, width rounded up to even. Worst case for BALL_W=16 (already
-// even) is 8 bytes per row + alignment slack of 1 byte; size for the
-// pessimistic case so the buffer never overflows.
-#define BALL_BACKUP_BYTES (((BALL_W + 2) >> 1) * BALL_H)
+// SaveUnder rounds x down to the platform's storage alignment: 2 px
+// for chunky 4bpp (1 extra byte/row worst case), 8 px for planar
+// 4-plane (4 extra bytes/row worst case -- one per plane). The +4
+// covers the planar case; on chunky it is just a few unused bytes.
+#define BALL_BACKUP_BYTES (((BALL_W >> 1) + 4) * BALL_H)

 #define BALL_PALETTE_IDX  0

@ -100,18 +100,14 @@ int main(void) {
    int16_t        y;
    int16_t        vx;
    int16_t        vy;
-    int16_t        oldX;
-    int16_t        oldY;
-    uint16_t       oldW;
-    uint16_t       oldH;
-    int16_t        unionX;
-    int16_t        unionY;
-    int16_t        unionRight;
-    int16_t        unionBottom;
    bool           haveBackup;

    config.hostMode     = HOST_MODE_TAKEOVER;
-    config.codegenBytes = 8 * 1024;
+    /* Amiga planar emits 8 pre-shifted DRAW variants per sprite (one
+     * per x % 8 alignment) so the codegen arena needs roughly 8x what
+     * the chunky two-shift case asks for. 32 KB fits a 16x16 ball
+     * with all variants. */
+    config.codegenBytes = 32UL * 1024;
    config.maxSurfaces  = 4;
    config.audioBytes   = 64UL * 1024;
    config.assetBytes   = 128UL * 1024;
@ -155,7 +151,7 @@ int main(void) {
    haveBackup = false;

    spriteSaveAndDraw(screen, ball, x, y, &backup);
-    stagePresentRect(backup.x, backup.y, backup.width, backup.height);
+    stagePresent();
    haveBackup = true;

    for (;;) {
@ -164,19 +160,15 @@ int main(void) {
            break;
        }

-        // Stash the prior ball's region before restoring the bytes
-        // under it. Do all off-screen work (restore + move + draw)
-        // first, then waitVBL + ONE stagePresentRect covering both
-        // old and new regions. Putting waitVBL immediately before the
-        // present lets the present land inside the VBL window so the
-        // CRT never sees a half-updated framebuffer (matters most on
-        // single-buffered chunky targets like IIgs SHR; on planar
-        // c2p platforms it also avoids c2p racing the raster).
-        oldX = backup.x;
-        oldY = backup.y;
-        oldW = backup.width;
-        oldH = backup.height;
-
+        // Do all off-screen work first (restore + move + draw), then
+        // ONE stagePresent flushes the union of dirty bands set by
+        // restoreUnder + draw. Add a joeyWaitVBL() before the present
+        // to land it inside the VBL window so the CRT never sees a
+        // half-updated framebuffer (matters most on single-buffered
+        // chunky targets like IIgs SHR; on planar c2p platforms it
+        // also avoids c2p racing the raster). VBL wait is omitted
+        // here so the demo runs at the sprite pipeline's native
+        // throughput -- expect tearing on the ball.
        if (haveBackup) {
            spriteRestoreUnder(screen, &backup);
        }
@ -190,27 +182,7 @@ int main(void) {

        spriteSaveAndDraw(screen, ball, x, y, &backup);

-        // Bounding box of (old rect) U (new rect). For typical
-        // small-step motion the rects overlap heavily so the union
-        // is barely larger than one ball.
-        unionX      = (oldX < backup.x) ? oldX : backup.x;
-        unionY      = (oldY < backup.y) ? oldY : backup.y;
-        unionRight  = (int16_t)((oldX + oldW > backup.x + backup.width)
-                                ? (oldX + oldW)
-                                : (backup.x + backup.width));
-        unionBottom = (int16_t)((oldY + oldH > backup.y + backup.height)
-                                ? (oldY + oldH)
-                                : (backup.y + backup.height));
-
-        // VBL wait removed -- the demo runs at the native compute speed
-        // of save+restore+draw+presentRect so we can SEE the sprite
-        // pipeline's actual throughput. Expect tearing on the ball
-        // since the present can land mid-scan; that's the cost of
-        // showing real frame rate. Add joeyWaitVBL() back here for
-        // tear-free 60 Hz motion.
-        stagePresentRect(unionX, unionY,
-                           (uint16_t)(unionRight  - unionX),
-                           (uint16_t)(unionBottom - unionY));
+        stagePresent();
        haveBackup = true;
    }

--- a/examples/uber/uber.c
+++ b/examples/uber/uber.c
@ -28,7 +28,16 @@

 // 4-frame measurement window. Long enough that loop overhead doesn't
 // dominate; short enough to keep the full demo run under ~10 sec.
-#define UBER_FRAMES  4u
+/* 16 frames per timed op gives 4x the iter-count resolution of the
+ * earlier 4-frame budget. Exposes the actual per-op cost on slow
+ * ops where 4 frames produced the same iter count on different
+ * framerates -- e.g. drawCircle r=80 read as "4 iters / 4 frames"
+ * on both 60 Hz IIgs (16.7 ms/frame, 67 ms window) and 50 Hz Amiga
+ * (20 ms/frame, 80 ms window) even though per-op cost was equal,
+ * just because 4 ops at 16-17 ms happen to fit both windows. The
+ * 16-frame budget extends the windows to 267 ms / 320 ms; quantum
+ * gap shrinks to ~6%. Total run time scales 4x (~80 sec each). */
+#define UBER_FRAMES  16u


 typedef void (*OpFn)(void);
@ -44,9 +53,10 @@ static TileT     gTileScratch;

 // Run `op` in a tight loop until `targetFrames` joeyFrameCount ticks
 // have elapsed. Returns iterations completed.
-static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
+static unsigned long runForFrames(OpFn op, unsigned int targetFrames, uint16_t *actualFramesOut) {
    unsigned long count;
    uint16_t      startFrame;
+    uint16_t      endFrame;

    count = 0UL;

@ -57,29 +67,50 @@ static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
        op();
        count++;
    }
+    /* Capture the actual elapsed frames -- the last iter typically
+     * overruns the target. Using actual instead of target as the
+     * ops/sec divisor stays honest for ops slower than 1 frame
+     * (where count is forced low while real time stretches well
+     * past targetFrames). */
+    endFrame         = joeyFrameCount();
+    *actualFramesOut = (uint16_t)(endFrame - startFrame);
+    if (*actualFramesOut == 0u) {
+        *actualFramesOut = 1u;       /* defensive: avoid div-by-zero */
+    }
    return count;
 }


 // Time and log one op. Reports iters / N frames AND the derived
 // ops/sec so per-port results are directly comparable against IIgs
-// regardless of CPU speed or display refresh rate.
+// regardless of CPU speed or display refresh rate. Also logs an
+// FNV-1a hash of the surface state after timing -- this is the
+// pixel-perfect comparison input for the cross-port validation
+// harness (tools/diff-uber-hashes.py). Captured against IIgs as the
+// golden reference; planar 68k rewrites validate by matching it.
 static void timeOp(const char *name, OpFn op) {
    unsigned long iters;
    unsigned long opsPerSec;
+    uint16_t      actualFrames;
+    uint32_t      hash;

    gCurName = name;

-    iters = runForFrames(op, UBER_FRAMES);
+    iters = runForFrames(op, UBER_FRAMES, &actualFrames);

    if (iters == 0UL) {
        joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name);
        return;
    }

-    opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES;
-    joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n",
-             name, iters, UBER_FRAMES, opsPerSec);
+    /* Divide by ACTUAL elapsed frames, not the target. For sub-frame
+     * ops actualFrames ~= UBER_FRAMES so the answer is unchanged;
+     * for ops that overrun (slow stagePresent etc.), this stops
+     * inflating ops/sec. */
+    opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)actualFrames;
+    hash      = surfaceHash(gStage);
+    joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec | hash=%08lX\n",
+             name, iters, actualFrames, opsPerSec, (unsigned long)hash);
 }


@ -125,8 +156,6 @@ static void op_spriteRestore     (void) { spriteRestoreUnder(gStage, &gBackup);
 static void op_spriteSaveAndDraw (void) { spriteSaveAndDraw (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); }

 static void op_stagePresent     (void) { stagePresent(); }
-static void op_stagePresentRect8(void) { stagePresentRect( 40,  30,  16,  16); }
-static void op_stagePresentRectF(void) { stagePresentRect(  0,   0, 320, 200); }

 static void op_inputPoll       (void) { joeyInputPoll(); }
 static void op_keyDown         (void) { (void)joeyKeyDown(KEY_A); }
@ -229,10 +258,14 @@ static void runAllTests(void) {
    timeOp("spriteRestoreUnder", op_spriteRestore);
    timeOp("spriteSaveAndDraw",  op_spriteSaveAndDraw);

-    // Present.
+    // Present. One warm-up call before the timed loop primes any
+    // per-port one-time setup (Amiga: copper list rebuild after the
+    // paletteSet / scbSetRange tests dirty the cache; without warm-up
+    // the rebuild's MakeScreen + MrgCop + WaitTOF chain eats into the
+    // UBER_FRAMES measurement window) so we measure steady-state
+    // throughput rather than first-call penalty.
+    stagePresent();
    timeOp("stagePresent full",  op_stagePresent);
-    timeOp("stagePresentRect 8b",op_stagePresentRect8);
-    timeOp("stagePresentRect F", op_stagePresentRectF);

    // Input.
    timeOp("joeyInputPoll",      op_inputPoll);
@ -256,9 +289,16 @@ int main(void) {
    JoeyConfigT   config;
    uint16_t      pal[16];
    int           i;
+    uint16_t      startFrame;
+    uint16_t      endFrame;
+    uint16_t      elapsedFrames;
+    unsigned long elapsedMs;

    config.hostMode     = HOST_MODE_TAKEOVER;
-    config.codegenBytes = 8 * 1024;
+    /* 32 KB fits the 8 pre-shifted DRAW variants the Amiga planar
+     * compiled sprite emitter generates. UL on the multiply because
+     * ORCA-C's 16-bit int overflows on 32 * 1024. */
+    config.codegenBytes = 32UL * 1024;
    config.maxSurfaces  = 4;
    config.audioBytes   = 64UL * 1024;
    config.assetBytes   = 128UL * 1024;
@ -266,6 +306,11 @@ int main(void) {
    if (!joeyInit(&config)) {
        return 1;
    }
+    /* joeyFrameCount is VBL-driven, so it only ticks once halInit has
+     * installed its VBL ISR. Timing runs from here until runAllTests
+     * returns (just before press-any-key). Pre-init setup time is small
+     * and not the cost the user is chasing; runAllTests dominates. */
+    startFrame = joeyFrameCount();

    gStage = stageGet();
    if (gStage == NULL) {
@ -337,6 +382,12 @@ int main(void) {

    runAllTests();

+    endFrame      = joeyFrameCount();
+    elapsedFrames = (uint16_t)(endFrame - startFrame);
+    elapsedMs     = ((unsigned long)elapsedFrames * 1000UL) / (unsigned long)joeyFrameHz();
+    joeyLogF("UBER: total wall time: %lu ms (%u frames @ %u Hz)\n",
+             elapsedMs, elapsedFrames, (unsigned)joeyFrameHz());
+
    // Done. Green screen + waitForKey.
    surfaceClear(gStage, 2);
    stagePresent();
--- a/include/joey/debug.h
+++ b/include/joey/debug.h
@ -5,6 +5,7 @@

 void joeyLog     (const char *msg);
 void joeyLogF    (const char *fmt, ...);
+void joeyLogFlush(void);
 void joeyLogReset(void);

 #endif
--- a/include/joey/present.h
+++ b/include/joey/present.h
@ -15,14 +15,14 @@
 #include "types.h"

 // Flip the dirty regions of the stage to the display, then clear the
-// dirty state. Cheap when nothing has changed since the last call.
+// dirty state. Cheap when nothing has changed since the last call
+// (gStageAnyDirty short-circuit). Drawing primitives mark dirty as
+// a side effect, so callers only need to call stagePresent at the
+// end of a frame -- everything they drew shows up.
+//
+// To present a region you didn't draw with the standard primitives
+// (e.g. direct framebuffer poking), call surfaceMarkDirtyRect on
+// the same rect first, then stagePresent.
 void stagePresent(void);

-// Flip a specific rectangular region of the stage to the display,
-// regardless of dirty state. Coordinates are clipped to the surface;
-// negative or zero dimensions are no-ops. Does not consult or modify
-// the dirty arrays -- callers mixing stagePresentRect with stagePresent
-// in the same frame may see redundant work on the next stagePresent.
-void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h);
-
 #endif
--- a/include/joey/sprite.h
+++ b/include/joey/sprite.h
@ -27,13 +27,16 @@
 #include "surface.h"
 #include "types.h"

-// Sprites always write to a 4bpp packed SurfaceT, never to display
-// memory directly (halPresent owns that path). The codegen emits 2
-// shift variants on every platform: shift 0 for even x (sprite byte
-// boundaries match destination byte boundaries) and shift 1 for odd
-// x (each destination byte combines two adjacent sprite bytes'
-// nibbles).
-#define JOEY_SPRITE_SHIFT_COUNT 2
+// Sprite codegen emits per-shift variants. Chunky 4bpp ports (DOS,
+// IIgs, Atari ST) only need 2 shifts -- pixel offset 0 (sprite/dest
+// byte boundaries align) and offset 1 (every dest byte combines two
+// sprite bytes' nibbles). Planar ports (Amiga -- 8 px per plane byte)
+// need 8 shifts: one for each x % 8 alignment, so smooth horizontal
+// motion at any pixel position uses pre-shifted source bytes without
+// runtime bit-shifting. Allocate the max so routineOffsets[] has
+// slots for every variant; chunky ports leave shifts 2..7 as
+// SPRITE_NOT_COMPILED, planar ports use all 8.
+#define JOEY_SPRITE_SHIFT_COUNT 8

 typedef enum {
    SPRITE_FLAGS_NONE = 0
--- a/include/joey/surface.h
+++ b/include/joey/surface.h
@ -58,4 +58,13 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path);
 // identity (no reallocation).
 bool surfaceLoadFile(SurfaceT *dst, const char *path);

+// FNV-1a 32-bit hash of the surface's logical pixel content (color
+// indices in row-major order, 0..15 per pixel). Same logical pixels
+// produce the same hash on every port regardless of internal storage
+// format -- so a hash captured on IIgs (chunky) compares directly
+// against the same op's output on Amiga (planar) once the planar
+// rewrite is done. Used by the UBER validation harness to
+// pixel-compare ports against an IIgs golden reference.
+uint32_t surfaceHash(const SurfaceT *s);
+
 #endif
--- a/make/amiga.mk
+++ b/make/amiga.mk
@ -13,7 +13,7 @@ BINDIR   := $(BUILD)/bin
 # independently. -I on $(SRC_PORT)/amiga lets ptplayer.h resolve
 # <SDI_compiler.h> from the port-local shim alongside our HAL code.
 PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer
-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR)
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) -MMD -MP $(CFLAGS_EXTRA)
 # OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses
 # CIA-B + audio.device interrupts via the OS rather than taking over
 # Paula directly), matching the way our HAL cooperates with Intuition.
@ -52,6 +52,7 @@ LIB_OBJS := \
    $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \
    $(BUILD)/obj/port/ptplayer.o \
    $(BUILD)/obj/codegen/spriteEmit68k.o \
+    $(BUILD)/obj/codegen/spriteEmitPlanar68k.o \
    $(BUILD)/obj/codegen/spriteCompile.o

 LIB := $(LIBDIR)/libjoey.a
@ -156,3 +157,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx

 clean-amiga:
 	rm -rf $(BUILD)
+
+# Pull in per-object header-dependency files generated by gcc -MMD/-MP.
+# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
+# the .c files that include it, leaving a frankenstein binary where
+# different TUs see different struct layouts.
+-include $(LIB_OBJS:.o=.d)
--- a/make/atarist.mk
+++ b/make/atarist.mk
@ -7,7 +7,7 @@ BUILD    := $(REPO_DIR)/build/$(PLATFORM)
 LIBDIR   := $(BUILD)/lib
 BINDIR   := $(BUILD)/bin

-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K)
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) -MMD -MP
 LDFLAGS :=

 # libxmp-lite shared with the DOS port. Built as a static archive that
@ -148,3 +148,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx

 clean-atarist:
 	rm -rf $(BUILD)
+
+# Pull in per-object header-dependency files generated by gcc -MMD/-MP.
+# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
+# the .c files that include it, leaving a frankenstein binary where
+# different TUs see different struct layouts.
+-include $(LIB_OBJS:.o=.d)
--- a/make/dos.mk
+++ b/make/dos.mk
@ -7,7 +7,7 @@ BUILD    := $(REPO_DIR)/build/$(PLATFORM)
 LIBDIR   := $(BUILD)/lib
 BINDIR   := $(BUILD)/bin

-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -MMD -MP
 ASFLAGS := -f coff
 LDFLAGS :=

@ -138,3 +138,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx

 clean-dos:
 	rm -rf $(BUILD)
+
+# Pull in per-object header-dependency files generated by gcc -MMD/-MP.
+# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
+# the .c files that include it, leaving a frankenstein binary where
+# different TUs see different struct layouts.
+-include $(LIB_OBJS:.o=.d)
--- a/make/iigs.mk
+++ b/make/iigs.mk
@ -51,11 +51,11 @@ IIGS_MERLIN  := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32

 LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS)

-# HELLO and PATTERN are intentionally omitted from this list. The UBER
-# demo (below) exercises every public API, including what those two
-# small examples covered, and the IIgs disk image was running out of
-# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/
-# for reference and for other ports that want them.
+# HELLO is omitted from the disk because UBER exercises everything it
+# does and the disk was tight. PATTERN is included as the SCB / palette
+# golden-reference for cross-port debugging.
+PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c
+PATTERN_BIN := $(BINDIR)/PATTERN
 DRAW_SRC    := $(EXAMPLES)/draw/draw.c
 DRAW_BIN    := $(BINDIR)/DRAW
 KEYS_SRC    := $(EXAMPLES)/keys/keys.c
@ -120,24 +120,44 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh
 # everywhere, so library asm can take SurfaceT* args via one
 # consistent ABI (small-mm 16-bit pointers truncated bank bytes,
 # which broke any asm that wanted to address bank-1 stage memory).
+# Per-binary header dependency files. iix-build.sh -M emits one .d
+# alongside each binary covering every header transitively included
+# by the C sources in that binary's build. Pulled in via -include at
+# the bottom of this file so editing a shared header (e.g.
+# surfaceInternal.h) triggers a rebuild of every IIgs binary that
+# transitively depends on it.
+DEP_DIR := $(BUILD)/dep
+PATTERN_DEP := $(DEP_DIR)/PATTERN.d
+DRAW_DEP    := $(DEP_DIR)/DRAW.d
+KEYS_DEP    := $(DEP_DIR)/KEYS.d
+JOY_DEP     := $(DEP_DIR)/JOY.d
+SPRITE_DEP  := $(DEP_DIR)/SPRITE.d
+UBER_DEP    := $(DEP_DIR)/UBER.d
+AUDIO_DEP   := $(DEP_DIR)/AUDIO.d
+
+$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -M $(PATTERN_DEP) $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS)
+	$(IIGS_IIX) chtyp -t S16 $@
+
 $(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -M $(DRAW_DEP) $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@

 $(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -M $(KEYS_DEP) $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@

 $(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -M $(JOY_DEP) $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@

 $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -M $(SPRITE_DEP) $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@

 # UBER bumps user stack to 16 KB. ORCA-C's default user stack is small
@ -147,8 +167,8 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
 # decimal formatter in uber.c also uses larger stack-local buffers
 # (line[96], num[16]) than typical demos. 16 KB is plenty of headroom.
 $(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
-	$(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -s 16384 -M $(UBER_DEP) $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@

 # Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime
@ -170,17 +190,23 @@ AUDIO_DATA_FILES := $(AUDIO_SFX)
 endif

 $(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
-	@mkdir -p $(dir $@)
-	$(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
+	@mkdir -p $(dir $@) $(DEP_DIR)
+	$(IIGS_BUILD) -b -M $(AUDIO_DEP) $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
 	$(IIGS_IIX) chtyp -t S16 $@

 # Assemble a ProDOS 2img containing the examples, ready to mount in
 # GSplus alongside a GS/OS boot volume.
 iigs-disk: $(DISK_IMG)

-$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
+$(DISK_IMG): $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
 	@mkdir -p $(dir $@)
-	$(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)
+	$(IIGS_PACKAGE) $@ $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)

 clean-iigs:
 	rm -rf $(BUILD)
+
+# Pull in per-binary header-dependency files generated by iix-build.sh -M.
+# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
+# IIgs binaries that include it -- the IIgs's iix toolchain has no native
+# -MMD analog, so iix-build.sh shells out to host gcc for the scan.
+-include $(PATTERN_DEP) $(DRAW_DEP) $(KEYS_DEP) $(JOY_DEP) $(SPRITE_DEP) $(UBER_DEP) $(AUDIO_DEP)
--- a/src/codegen/spriteCompile.c
+++ b/src/codegen/spriteCompile.c
@ -14,6 +14,7 @@
 #include "joey/sprite.h"
 #include "joey/surface.h"
 #include "codegenArenaInternal.h"
+#include "hal.h"
 #include "spriteEmitter.h"
 #include "spriteInternal.h"
 #include "surfaceInternal.h"
@ -33,7 +34,9 @@
 static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 #if defined(JOEYLIB_PLATFORM_DOS)
    return spriteEmitDrawX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+#elif defined(JOEYLIB_PLATFORM_AMIGA)
+    return spriteEmitDrawPlanar68k(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_ATARIST)
    return spriteEmitDraw68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitDrawIigs(out, sp, shift);
@ -51,7 +54,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 #if defined(JOEYLIB_PLATFORM_DOS)
    return spriteEmitSaveX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+#elif defined(JOEYLIB_PLATFORM_AMIGA)
+    return spriteEmitSavePlanar68k(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_ATARIST)
    return spriteEmitSave68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitSaveIigs(out, sp, shift);
@ -65,7 +70,9 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 #if defined(JOEYLIB_PLATFORM_DOS)
    return spriteEmitRestoreX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+#elif defined(JOEYLIB_PLATFORM_AMIGA)
+    return spriteEmitRestorePlanar68k(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_ATARIST)
    return spriteEmitRestore68k(out, sp, shift);
 #elif defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitRestoreIigs(out, sp, shift);
@ -114,6 +121,13 @@ bool spriteCompile(SpriteT *sp) {
    if (sp->tileData == NULL) {
        return false;
    }
+    /* Amiga (post-Phase 9) uses spriteEmitPlanar68k.c which writes
+     * directly to bitplanes. DRAW emits a unique pre-shifted variant
+     * per shift in 0..7 (smooth horizontal motion at any pixel x);
+     * SAVE/RESTORE emit only shift 0 and shift 1 since shifted variants
+     * 1..7 share identical bytes (plain memcpy of widthTiles+1 plane
+     * bytes per row). The post-emit pass below aliases slots 2..7
+     * for save/restore to slot 1's bytes. */

    scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES);
    if (scratch == NULL) {
@ -150,6 +164,16 @@ bool spriteCompile(SpriteT *sp) {
            }
        }
    }
+#if defined(JOEYLIB_PLATFORM_AMIGA)
+    /* Save/restore bytes for any non-zero shift are identical (plain
+     * memcpy of widthTiles+1 plane bytes per row). The emitter emits
+     * them once at slot 1; alias slots 2..7 here so the dispatcher
+     * gate (sprite.c) sees them as compiled. */
+    for (shift = 2; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
+        sp->routineOffsets[shift][SPRITE_OP_SAVE]    = sp->routineOffsets[1][SPRITE_OP_SAVE];
+        sp->routineOffsets[shift][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE];
+    }
+#endif
    sp->slot = slot;
    free(scratch);
    return true;
@ -554,6 +578,112 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    }
 }

+#elif defined(JOEYLIB_PLATFORM_AMIGA)
+
+/* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with
+ * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to
+ * bitplanes. Each dispatcher computes byteOff = y*40 + x/8 and passes
+ * plane[i]+byteOff as the 4 plane args; shift = x % 8 selects the
+ * pre-compiled variant.
+ *
+ * None of the dispatchers checks routineOffsets[shift][op] itself, so
+ * callers should already have gated on
+ * routineOffsets[shift][op] != SPRITE_NOT_COMPILED. On this platform
+ * spriteCompile aliases the SAVE/RESTORE offsets of shifts 2..7 to
+ * slot 1 so that gate sees them as compiled.
+ *
+ * NOTE(review): an earlier revision of this comment said only shift 0
+ * was emitted and that non-zero shifts fell back to the interpreter
+ * (halSpriteDrawPlanes / halSpriteSavePlanes / halSpriteRestorePlanes)
+ * through the dispatcher in src/core/sprite.c (spriteDraw /
+ * spriteSaveUnder / spriteRestoreUnder). The planar emitter now
+ * documents pre-baked variants for shifts 0..7; presumably the
+ * fallback only applies to slots that are still SPRITE_NOT_COMPILED
+ * -- confirm against sprite.c. */
+
+#define AMIGA_BYTES_PER_ROW_LOCAL 40
+
+void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
+    typedef void (*DrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
+    uint8_t   shift;
+    uint16_t  byteOff;
+    uint8_t  *p0;
+    uint8_t  *p1;
+    uint8_t  *p2;
+    uint8_t  *p3;
+    DrawFn    fn;
+
+    shift   = (uint8_t)(x & 7);   /* pixel offset inside the plane byte picks the variant */
+    byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)x >> 3));   /* assumes x, y >= 0 -- TODO confirm callers clip */
+    p0      = halSurfacePlanePtr(dst, 0); if (p0 == NULL) return;   /* only plane 0 is NULL-checked */
+    p1      = halSurfacePlanePtr(dst, 1);
+    p2      = halSurfacePlanePtr(dst, 2);
+    p3      = halSurfacePlanePtr(dst, 3);
+    fn = (DrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_DRAW]);   /* arena base + sprite slot + per-shift routine offset */
+    fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff);
+}
+
+
+void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
+    typedef void (*SaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
+    uint8_t   shift;
+    int16_t   clippedX;
+    uint16_t  widthPx;
+    uint16_t  heightPx;
+    uint16_t  byteOff;
+    uint8_t  *p0;
+    uint8_t  *p1;
+    uint8_t  *p2;
+    uint8_t  *p3;
+    SaveFn    fn;
+
+    shift    = (uint8_t)(x & 7);
+    clippedX = (int16_t)(x & ~7);   /* x rounded down to the 8-px (one plane byte) boundary */
+    widthPx  = (uint16_t)(sp->widthTiles  * 8);
+    heightPx = (uint16_t)(sp->heightTiles * 8);
+    /* Shifts 1..7 spill into one extra plane byte per row (= +8 px).
+     * The widened width is also what spriteCompiledRestoreUnder uses
+     * to tell a shifted save from an aligned one. */
+    if (shift != 0u) {
+        widthPx = (uint16_t)(widthPx + 8u);
+    }
+    byteOff  = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)clippedX >> 3));
+
+    backup->sprite    = sp;
+    backup->x         = clippedX;   /* the aligned x is recorded, not the caller's x */
+    backup->y         = y;
+    backup->width     = widthPx;
+    backup->height    = heightPx;
+    /* 4 planes * h * (widthPx/8) bytes = h * widthPx/2. */
+    backup->sizeBytes = (uint16_t)((uint16_t)heightPx * (widthPx >> 1));
+
+    p0 = halSurfacePlanePtr(src, 0); if (p0 == NULL) return;   /* early out: metadata above is already filled, backup->bytes is untouched */
+    p1 = halSurfacePlanePtr(src, 1);
+    p2 = halSurfacePlanePtr(src, 2);
+    p3 = halSurfacePlanePtr(src, 3);
+    fn = (SaveFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]);
+    fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes);
+}
+
+
+void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
+    typedef void (*RestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
+    SpriteT  *sp;
+    uint8_t   shift;
+    uint16_t  byteOff;
+    uint8_t  *p0;
+    uint8_t  *p1;
+    uint8_t  *p2;
+    uint8_t  *p3;
+    RestoreFn fn;
+
+    sp      = backup->sprite;   /* sprite recorded by spriteCompiledSaveUnder */
+    /* backup->x is 8-px aligned (clippedX from save), so x & 7 is
+     * useless for picking the original shift. The shift is instead
+     * recovered from backup->width: == widthTiles*8 means shift 0;
+     * > means shifted (save widened it by 8 px).
+     * Shifted slots 1..7 all alias to the same restore bytes, so
+     * slot 1 stands in for any non-zero shift. */
+    shift   = (uint8_t)(backup->width > (uint16_t)(sp->widthTiles * 8) ? 1u : 0u);
+    byteOff = (uint16_t)((uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)backup->x >> 3));
+
+    p0 = halSurfacePlanePtr(dst, 0); if (p0 == NULL) return;   /* only plane 0 is NULL-checked */
+    p1 = halSurfacePlanePtr(dst, 1);
+    p2 = halSurfacePlanePtr(dst, 2);
+    p3 = halSurfacePlanePtr(dst, 3);
+    fn = (RestoreFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]);
+    fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes);
+}
+
 #else

 void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
--- a/src/codegen/spriteEmit68k.c
+++ b/src/codegen/spriteEmit68k.c
@ -166,6 +166,13 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint8_t  value;
    uint8_t  opaqueMask;

+    // Chunky 4bpp has only two nibble-alignment positions; the
+    // dispatcher uses x & 1 so shifts 2..7 are unreachable. Bail
+    // early so the arena slot stays SPRITE_NOT_COMPILED.
+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
@ -225,6 +232,10 @@ uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
@ -248,6 +259,10 @@ uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
--- a/src/codegen/spriteEmitIigs.c
+++ b/src/codegen/spriteEmitIigs.c
@ -189,6 +189,10 @@ uint16_t spriteEmitSaveIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t spriteBytesPerRow;
    uint16_t copyBytes;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    heightPx          = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
    copyBytes         = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
@ -205,6 +209,10 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t spriteBytesPerRow;
    uint16_t copyBytes;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    heightPx          = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
    copyBytes         = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
@ -258,6 +266,10 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint8_t  nextOpaqueMask;
    bool     wide;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
--- a/src/codegen/spriteEmitPlanar68k.c
+++ b/src/codegen/spriteEmitPlanar68k.c
@ -0,0 +1,505 @@
+// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow).
+//
+// Emits PIC routines that write directly to the four bitplanes via 4
+// address-register pointers (a0..a3 = plane[0..3] base + byteOff,
+// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this).
+//
+// Calling convention (cdecl on m68k-amigaos-gcc):
+//   draw(p0, p1, p2, p3):
+//     args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane.
+//     loaded into a0..a3 by the prologue.
+//   save(p0, p1, p2, p3, backup):
+//     5 args; backup at 20(sp), loaded into a4.
+//   restore(p0, p1, p2, p3, backup):
+//     same as save but reads backup, writes planes.
+//
+// Per-byte plane write encoding decisions:
+//   - all-transparent (mask=0):  skip the byte entirely
+//   - all-opaque (mask=0xFF):    move.b #imm, d16(an)        (6 bytes)
+//   - mixed (0<mask<0xFF):       move.b d16(an), d0;
+//                                andi.b #~mask, d0;
+//                                ori.b  #imm, d0;
+//                                move.b d0, d16(an)          (4+4+4+4 = 16 bytes)
+//
+// Per row advance: 4 plane pointers each get adda.w #SURFACE_WIDTH/8
+// = adda.w #40, an  (4 bytes encoded each, 16 bytes total per row).
+// We omit the advance after the last row.
+//
+// Shift handling: shifts 0..7 are pre-baked. The dispatcher selects
+// the variant via x % 8 and pre-computes byteOff = y*40 + (x & ~7)/8
+// (i.e. round x DOWN to 8-pixel boundary). The variant for shift s
+// then emits to (widthTiles + 1) plane bytes per row when s != 0
+// (the rightmost shift bits spill into one extra plane byte) and to
+// widthTiles plane bytes per row when s == 0.
+//
+// The emitter assumes sprite width is a multiple of 8 (= a multiple
+// of one tile = a multiple of 8 pixels) so plane bytes per row are
+// integer. JoeyLib sprites are always tile-multiple by API contract.
+
+#include "joey/sprite.h"
+#include "joey/surface.h"
+#include "spriteEmitter.h"
+#include "spriteInternal.h"
+
+
+// ----- Constants -----
+
+#define TILE_PIXELS              8
+#define TILE_BYTES               32
+#define TILE_BYTES_PER_ROW       4
+#define TRANSPARENT_NIBBLE       0
+#define AMIGA_BITPLANES          4
+#define AMIGA_BYTES_PER_ROW      40
+
+
+// ----- Instruction encoding helpers -----
+
+static uint16_t writeBE16(uint8_t *out, uint16_t value) {
+    out[0] = (uint8_t)((value >> 8) & 0xFFu);   /* high byte first: instruction words are stored big-endian */
+    out[1] = (uint8_t)(value & 0xFFu);
+    return 2u;   /* bytes written, so callers can do cursor += writeBE16(...) */
+}
+
+
+// movea.l <d16,SP>, an  -- load arg at SP+disp into An.
+// Encoding: 0010 nnn 001 101 111  + disp16   (src mode 101 = d16(An), reg 111 = A7/SP)
+//           = 0x206F + (n << 9), where n is dst An.
+//   a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F.
+static const uint16_t kMoveaSpToAn[] = {
+    0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu
+};
+
+
+// adda.w #imm, an  -- adds 16-bit signed imm to An (sign-extended).
+// Encoding: 1101 nnn 011 111 100  + imm
+//           = 0xD0FC + (n << 9).
+static const uint16_t kAddaWImmToAn[] = {
+    0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu
+};
+
+
+// ANDI.B #imm, D0  -- 4 bytes (opcode word + imm word, byte in low half).
+// Opcode: 0000 0010 00 000 000  (size=byte, mode=Dn, reg=D0)
+#define ANDI_B_IMM_D0   0x0200u
+
+// ORI.B  #imm, D0  -- 4 bytes (opcode word + imm word, byte in low half).
+// Opcode: 0000 0000 00 000 000
+#define ORI_B_IMM_D0    0x0000u
+
+
+// MOVE.B d16(An), D0  -- 4 bytes (opcode + disp).
+// Encoding: 0001 000 000 mode reg
+//   = size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn),
+//     src mode=101 (d16,An), src reg=An.
+//   = 0001000 000 101 nnn = 0x1028 + An.
+static const uint16_t kMoveBD16AnToD0[] = {
+    0x1028u, 0x1029u, 0x102Au, 0x102Bu
+};
+
+
+// MOVE.B D0, d16(An)  -- 4 bytes (opcode + disp).
+// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9).
+static const uint16_t kMoveBD0ToD16An[] = {
+    0x1140u, 0x1340u, 0x1540u, 0x1740u
+};
+
+
+// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp).
+// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9).
+//   (Was 0x113C earlier -- that's mode=100=predec; mode=101=d16(An)
+//    is the bit difference. Predec emits a 4-byte instruction with no
+//    disp word, so the byte stream went out of sync and every
+//    subsequent instruction decoded into garbage.)
+static const uint16_t kMoveBImmToD16An[] = {
+    0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu
+};
+
+
+// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp).  -- used by restore (backup in a4 -> planes)
+// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9).
+static const uint16_t kMoveBA4PostincToD16An[] = {
+    0x115Cu, 0x135Cu, 0x155Cu, 0x175Cu
+};
+
+
+// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp).  -- used by save (planes -> backup)
+// Encoding: 0001 100 011 101 nnn = 0x18E8 + An.
+//   Destination: reg = 100 (A4), mode = 011 ((An)+).
+//   Source:      mode = 101 (d16,An), reg = nnn.
+static const uint16_t kMoveBD16AnToA4Postinc[] = {
+    0x18E8u, 0x18E9u, 0x18EAu, 0x18EBu
+};
+
+
+// MOVEM.L reglist, -(SP)  -- 4 bytes (opcode + reglist mask).
+//   Opcode 0x48E7. Predec mask is REVERSED vs all other modes:
+//   bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2,
+//   bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7.
+#define MOVEM_L_PUSH_OPCODE   0x48E7u
+#define MOVEM_L_MASK_A2_A3    0x0030u  /* bits 5,4 = A2,A3 (predec order) */
+#define MOVEM_L_MASK_A2_A3_A4 0x0038u  /* bits 5,4,3 = A2,A3,A4 */
+
+// MOVEM.L (SP)+, reglist  -- 4 bytes (opcode + reglist mask).
+//   Opcode 0x4CDF. Postinc mask follows the standard layout:
+//   bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7.
+#define MOVEM_L_POP_OPCODE    0x4CDFu
+#define MOVEM_L_MASK_POP_A2_A3    0x0C00u  /* bits 11,10 = A3,A2 */
+#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u  /* bits 12,11,10 = A4,A3,A2 */
+
+// RTS opcode.
+#define OPCODE_RTS            0x4E75u
+
+
+// ----- Emit helpers -----
+
+// For shift 0 (byte-aligned x), the sprite's chunky tile data converts
+// directly to plane bytes without any sub-byte shifting. For each
+// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky
+// bytes (= 8 pixels) and produce one plane byte; we also produce a
+// mask byte indicating which pixel positions are non-transparent
+// (any plane bit != 0 in the source means non-transparent if
+// transparent index is 0, the JoeyLib convention).
+//
+// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows
+// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the
+// sprite. For plane-byte column `c` of row `r`:
+//   tileX = c (since each plane byte covers exactly one tile column)
+//   tileY = r / 8
+//   inTileY = r % 8
+//   chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3
+//
+// `col` must be in [0, widthTiles): nothing here range-checks it. The
+// shifted variants span widthTiles+1 output bytes per row, so
+// planeByteAndMaskShifted compares against widthTiles first and leaves
+// the out-of-range side zeroed (transparent) instead of calling this.
+static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col,
+                               uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
+{
+    uint16_t       tileX;
+    uint16_t       tileY;
+    uint16_t       inTileY;
+    const uint8_t *tile;
+    const uint8_t *chunky;
+    uint8_t        nibbles[8];
+    uint8_t        b0, b1, b2, b3;
+    uint16_t       p;
+    uint8_t        bitMask;
+    uint8_t        pix;
+
+    tileX   = col;
+    tileY   = row >> 3;
+    inTileY = row & 7u;
+
+    tile   = sp->tileData + (uint32_t)((tileY * sp->widthTiles + tileX) * 32u);
+    chunky = tile + inTileY * 4u;
+
+    nibbles[0] = (uint8_t)(chunky[0] >> 4);   /* high nibble is the left pixel of each byte */
+    nibbles[1] = (uint8_t)(chunky[0] & 0x0Fu);
+    nibbles[2] = (uint8_t)(chunky[1] >> 4);
+    nibbles[3] = (uint8_t)(chunky[1] & 0x0Fu);
+    nibbles[4] = (uint8_t)(chunky[2] >> 4);
+    nibbles[5] = (uint8_t)(chunky[2] & 0x0Fu);
+    nibbles[6] = (uint8_t)(chunky[3] >> 4);
+    nibbles[7] = (uint8_t)(chunky[3] & 0x0Fu);
+
+    b0 = 0u; b1 = 0u; b2 = 0u; b3 = 0u;
+    *maskByte = 0u;
+    for (p = 0; p < 8u; p++) {
+        pix = nibbles[p];
+        if (pix == TRANSPARENT_NIBBLE) {
+            continue;   /* transparent pixel: leaves its mask bit and all plane bits at 0 */
+        }
+        bitMask = (uint8_t)(0x80u >> p);   /* pixel 0 maps to bit 7 of the plane byte */
+        *maskByte = (uint8_t)(*maskByte | bitMask);
+        if (pix & 1u) b0 = (uint8_t)(b0 | bitMask);   /* colour bit k goes to plane k */
+        if (pix & 2u) b1 = (uint8_t)(b1 | bitMask);
+        if (pix & 4u) b2 = (uint8_t)(b2 | bitMask);
+        if (pix & 8u) b3 = (uint8_t)(b3 | bitMask);
+    }
+    planeBytes[0] = b0;
+    planeBytes[1] = b1;
+    planeBytes[2] = b2;
+    planeBytes[3] = b3;
+}
+
+
+// Shifted variant: produces 4 plane bytes and 1 mask byte for output
+// column `outCol` (0..widthTiles inclusive) of row `row` when the
+// sprite is shifted right by `shift` pixels (1..7). For shift 0,
+// callers should use planeByteAndMaskAt directly (faster, no spill).
+//
+// Each output byte is composed of bits drawn from up to two source
+// plane bytes:
+//   leftPart  = src[outCol-1] << (8 - shift)   (high (shift) bits)
+//   rightPart = src[outCol]   >> shift          (low (8-shift) bits)
+// with src[-1] and src[widthTiles] treated as 0/transparent. The
+// resulting plane byte is leftPart | rightPart, truncated to 8 bits;
+// the mask byte is built the same way from the two source masks.
+static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol,
+                                    uint8_t shift, uint16_t widthTiles,
+                                    uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
+{
+    uint8_t leftPlanes[AMIGA_BITPLANES];
+    uint8_t leftMask;
+    uint8_t rightPlanes[AMIGA_BITPLANES];
+    uint8_t rightMask;
+    uint8_t i;
+
+    leftMask  = 0u;
+    rightMask = 0u;
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        leftPlanes[i]  = 0u;   /* zero = transparent; stays that way for an out-of-range side */
+        rightPlanes[i] = 0u;
+    }
+
+    if (outCol > 0u && (uint16_t)(outCol - 1u) < widthTiles) {   /* left neighbour exists (not src[-1]) */
+        planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), leftPlanes, &leftMask);
+    }
+    if (outCol < widthTiles) {   /* false only for the extra spill column outCol == widthTiles */
+        planeByteAndMaskAt(sp, row, outCol, rightPlanes, &rightMask);
+    }
+
+    *maskByte = (uint8_t)(((leftMask  << (8u - shift)) & 0xFFu) |
+                          ((rightMask >>       shift)  & 0xFFu));
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        planeBytes[i] = (uint8_t)(((leftPlanes[i]  << (8u - shift)) & 0xFFu) |
+                                  ((rightPlanes[i] >>       shift)  & 0xFFu));
+    }
+}
+
+
+// Emit code that merges one plane byte into d16(an), where d16 (`disp`)
+// is the byte column within the current row as passed by the caller.
+// `an` indexes the per-plane opcode tables (0..3). Returns the advanced
+// cursor: +0 for mask 0, +6 for an all-opaque byte, +16 for a mixed one.
+static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor,
+                                     uint8_t an, uint8_t disp,
+                                     uint8_t maskByte, uint8_t srcByte)
+{
+    if (maskByte == 0u) {
+        return cursor;  /* nothing to write */
+    }
+    if (maskByte == 0xFFu) {
+        /* All-opaque shortcut: move.b #src, d16(an). */
+        cursor += writeBE16(out + cursor, kMoveBImmToD16An[an]);
+        cursor += writeBE16(out + cursor, (uint16_t)srcByte);   /* byte immediate sits in the low half of the word */
+        cursor += writeBE16(out + cursor, (uint16_t)disp);
+        return cursor;
+    }
+    /* Mixed: load existing, clear mask bits, OR in src, write back. */
+    cursor += writeBE16(out + cursor, kMoveBD16AnToD0[an]);
+    cursor += writeBE16(out + cursor, (uint16_t)disp);
+    cursor += writeBE16(out + cursor, ANDI_B_IMM_D0);
+    cursor += writeBE16(out + cursor, (uint16_t)((~maskByte) & 0xFFu));   /* keep only the background bits under transparent pixels */
+    cursor += writeBE16(out + cursor, ORI_B_IMM_D0);
+    cursor += writeBE16(out + cursor, (uint16_t)srcByte);
+    cursor += writeBE16(out + cursor, kMoveBD0ToD16An[an]);
+    cursor += writeBE16(out + cursor, (uint16_t)disp);
+    return cursor;
+}
+
+
+// ----- Public API -----
+
+uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t row;
+    uint16_t col;
+    uint16_t heightPx;
+    uint16_t widthTiles;
+    uint16_t bytesPerRow;       /* per plane, per row */
+    uint8_t  planeBytes[AMIGA_BITPLANES];
+    uint8_t  maskByte;
+    uint8_t  i;
+
+    if (shift > 7u) {
+        return 0u;   /* no variant for this shift: nothing is written to out */
+    }
+
+    cursor      = 0;
+    heightPx    = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    widthTiles  = (uint16_t)sp->widthTiles;
+    bytesPerRow = (uint16_t)(widthTiles + (shift == 0u ? 0u : 1u));   /* non-zero shifts spill into one extra byte */
+
+    /* Prologue: m68k cdecl callee-saves a2-a6; we clobber a2 and a3
+     * loading plane pointers, so push them first. After the push, all
+     * stack arg displacements shift by +8 (two longs), so plane i is
+     * loaded from 8 + 4 + i*4 (the extra 4 skips the return address). */
+    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
+    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3);
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
+        cursor += writeBE16(out + cursor, (uint16_t)(8u + 4u + i * 4u));
+    }
+
+    for (row = 0; row < heightPx; row++) {
+        for (col = 0; col < bytesPerRow; col++) {
+            if (shift == 0u) {
+                planeByteAndMaskAt(sp, row, col, planeBytes, &maskByte);
+            } else {
+                planeByteAndMaskShifted(sp, row, col, shift, widthTiles, planeBytes, &maskByte);
+            }
+            for (i = 0; i < AMIGA_BITPLANES; i++) {
+                cursor = emitMergeByteToD16An(out, cursor, i, (uint8_t)col,
+                                              maskByte, planeBytes[i]);   /* same mask for all 4 planes of this byte */
+            }
+        }
+        if (row + 1u < heightPx) {   /* no pointer advance after the last row */
+            for (i = 0; i < AMIGA_BITPLANES; i++) {
+                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
+                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
+            }
+        }
+    }
+
+    /* Epilogue: restore a2-a3, rts. */
+    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
+    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3);
+    cursor += writeBE16(out + cursor, OPCODE_RTS);
+    return cursor;   /* total bytes emitted */
+}
+
+
+// SAVE: planes -> backup. backup is one contiguous 4*H*bytesPerRow byte
+// buffer laid out plane-major (all rows of plane 0, then plane 1, ...).
+// Per the original design notes this matches halSpriteSavePlanes, so
+// the compiled and interpreted paths can share a backup buffer. A
+// row-major (interleaved) layout would need matching changes in the
+// hal save/restore fallbacks, so it is not used.
+//
+// Emitted code, per plane i: for each row copy bytesPerRow bytes from
+// d16(ai) to (a4)+, then advance ai by 40 (skipped after the last
+// row). a4 advances on its own through the post-increment. Each plane
+// has its own address register, which is never read again once that
+// plane is done, so no rewind is emitted between planes.
+//
+// Returns the number of bytes emitted, or 0 for shift > 1: shifts 2..7
+// would be byte-identical to shift 1, and spriteCompile aliases their
+// routineOffsets to slot 1 so the routine is emitted only once.
+uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t row;
+    uint16_t col;
+    uint16_t heightPx;
+    uint16_t bytesPerRow;
+    uint8_t  i;
+
+    if (shift > 1u) {
+        return 0u;
+    }
+
+    cursor      = 0;
+    heightPx    = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));
+
+    /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane
+     * pointers + backup pointer. After the push, all stack arg disps
+     * shift by +12 (three longs). */
+    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
+    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
+        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
+    }
+    /* a4 = backup (fifth argument). */
+    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
+    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));
+
+    /* Plane-major copy. Plane i is addressed only through a<i>, so
+     * once its rows are done the register can be left wherever it
+     * ended up. */
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        for (row = 0; row < heightPx; row++) {
+            for (col = 0; col < bytesPerRow; col++) {
+                cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]);
+                cursor += writeBE16(out + cursor, (uint16_t)col);
+            }
+            if (row + 1u < heightPx) {
+                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
+                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
+            }
+        }
+    }
+
+    /* Epilogue: restore a2-a4, rts. */
+    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
+    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
+    cursor += writeBE16(out + cursor, OPCODE_RTS);
+    return cursor;
+}
+
+
+// RESTORE: backup -> planes. Mirror of save. Uses MOVE.B (a4)+, d16(an).
+// The backup is read plane-major (all rows of plane 0, then plane 1,
+// ...), bytesPerRow bytes per row. Each plane is written only through
+// its own address register, so no rewind is emitted between planes.
+//
+// Returns the number of bytes emitted, or 0 for shift > 1 (shifts 2..7
+// reuse the shift-1 routine through the aliasing in spriteCompile).
+uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t row;
+    uint16_t col;
+    uint16_t heightPx;
+    uint16_t bytesPerRow;
+    uint8_t  i;
+
+    if (shift > 1u) {
+        return 0u;
+    }
+
+    cursor      = 0;
+    heightPx    = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));
+
+    /* Callee-save a2/a3/a4; arg disps shift by +12. */
+    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
+    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
+        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
+    }
+    /* a4 = backup (fifth argument). */
+    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
+    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));
+
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        for (row = 0; row < heightPx; row++) {
+            for (col = 0; col < bytesPerRow; col++) {
+                cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]);
+                cursor += writeBE16(out + cursor, (uint16_t)col);
+            }
+            if (row + 1u < heightPx) {
+                /* Step this plane's pointer to the next scanline. */
+                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
+                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
+            }
+        }
+    }
+
+    /* Epilogue: restore a2-a4, rts. */
+    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
+    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
+    cursor += writeBE16(out + cursor, OPCODE_RTS);
+    return cursor;
+}
--- a/src/codegen/spriteEmitX86.c
+++ b/src/codegen/spriteEmitX86.c
@ -200,6 +200,10 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint8_t   v3;
    uint8_t   m;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
@ -313,6 +317,10 @@ uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
@ -339,6 +347,10 @@ uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t heightPx;
    uint16_t copyBytes;

+    if (shift > 1u) {
+        return 0u;
+    }
+
    cursor    = 0;
    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
--- a/src/codegen/spriteEmitter.h
+++ b/src/codegen/spriteEmitter.h
@ -42,4 +42,19 @@ uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitSave68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);

+// Planar 68k emitters (Amiga). Distinct from the chunky 68k emitters
+// above because the destination addressing is across 4 separate
+// bitplane buffers, not a single packed-pixel surface. Calling
+// convention for the emitted bytes (cdecl):
+//   void draw    (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
+//   void save    (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
+//   void restore (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
+// Each pi is plane_base + byteOff (= y*40 + x/8 already added by the
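+// dispatcher). Each returns the number of bytes emitted, or 0 when no
+// routine is emitted for `shift`: draw emits shifts 0..7, while
+// save/restore emit only shifts 0 and 1 (spriteCompile aliases slots
+// 2..7 to slot 1, since every non-zero shift copies the same bytes).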
+uint16_t spriteEmitDrawPlanar68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
+uint16_t spriteEmitSavePlanar68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
+uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
+
 #endif
--- a/src/core/debug.c
+++ b/src/core/debug.c
@ -1,11 +1,18 @@
-// Cross-platform "where did it hang?" logger. Each call opens
-// joeylog.txt, appends a line, fflushes, closes. Slow but durable
-// -- the last line in the file is guaranteed to be on disk before
-// any subsequent operation that might hang.
+// Cross-platform "where did it hang?" logger. Holds joeylog.txt open
+// across calls; libc's stdio buffer absorbs writes (~4 KB) and the
+// final fclose at program exit (via atexit) gets the buffer to disk.
 //
-// Build only as needed for diagnostics; remove the calls when the
-// bug is fixed. The hang on ST kept us looking at the wrong layer
-// without this kind of trace.
+// Earlier rev opened+closed per call for crash durability ("last line
+// guaranteed on disk if we hang"); that cost ~1 second per call
+// through GoldenGate's ProDOS FST emulation -- a 50-line UBER run
+// burned ~5 minutes in IO. Even per-line fflush is too expensive
+// because every fflush forces an FST WRITE, and host-OS file IO time
+// isn't tracked by the IIgs VBL counter so wall-time logs underreport.
+//
+// Tradeoff: if the program crashes mid-run, buffered log lines may
+// not reach disk. For UBER and similar batch demos that's acceptable;
+// for hang-debugging where durability matters, call joeyLogFlush()
+// at the suspected hang points.

 #include <stdio.h>
 #include <stdarg.h>
@ -15,6 +22,27 @@


 static const char *kLogPath = "joeylog.txt";
+static FILE       *gLogFp   = NULL;
+/* 16 KB is enough for UBER's full log (~5 KB) plus generous headroom,
+ * so the file never auto-flushes mid-run. ORCA-C / libnix default
+ * buffers are only ~512 bytes; with that, a 50-line log triggers ~10
+ * ProDOS / AmigaDOS WRITEs through the host FST, each of which is
+ * untracked-host-time (seconds). Buffer the whole thing in memory and
+ * let the atexit fclose flush once. */
+#define JOEY_LOG_BUF_BYTES 16384
+static char        gLogBuf[JOEY_LOG_BUF_BYTES];
+
+
+/* Lazy-open. Returns NULL if the open failed (silently disable). */
+static FILE *logFile(void) {
+    if (gLogFp == NULL) {
+        gLogFp = fopen(kLogPath, "a");
+        if (gLogFp != NULL) {
+            (void)setvbuf(gLogFp, gLogBuf, _IOFBF, sizeof(gLogBuf));
+        }
+    }
+    return gLogFp;
+}


 void joeyLog(const char *msg) {
@ -22,13 +50,12 @@ void joeyLog(const char *msg) {
    if (msg == NULL) {
        return;
    }
-    fp = fopen(kLogPath, "a");
+    fp = logFile();
    if (fp == NULL) {
        return;
    }
    fputs(msg, fp);
    fputc('\n', fp);
-    fclose(fp);
 }


@ -38,7 +65,7 @@ void joeyLogF(const char *fmt, ...) {
    if (fmt == NULL) {
        return;
    }
-    fp = fopen(kLogPath, "a");
+    fp = logFile();
    if (fp == NULL) {
        return;
    }
@ -46,14 +73,27 @@ void joeyLogF(const char *fmt, ...) {
    vfprintf(fp, fmt, args);
    va_end(args);
    fputc('\n', fp);
-    fclose(fp);
+}
+
+
+void joeyLogFlush(void) {
+    if (gLogFp != NULL) {
+        fflush(gLogFp);
+    }
 }


 void joeyLogReset(void) {
-    FILE *fp;
-    fp = fopen(kLogPath, "w");
+    if (gLogFp != NULL) {
+        fclose(gLogFp);
+        gLogFp = NULL;
+    }
+    /* Truncate by opening for write then closing; subsequent
+     * joeyLog* will reopen for append. */
+    {
+        FILE *fp = fopen(kLogPath, "w");
        if (fp != NULL) {
            fclose(fp);
        }
+    }
 }
--- a/src/core/draw.c
+++ b/src/core/draw.c
@ -186,13 +186,17 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
            continue;
        }

+        /* Phase 9: planar ports have NULL s->pixels and the asm fast
+         * paths take a chunky-row pointer. Skip them on planar; the C
+         * fallback below uses halSamplePixel which works on both
+         * storage layouts. */
+        if (s->pixels != NULL) {
            // Highest-tier asm fast path: seed-test + walk-left + walk-right
            // + 1-row fill + scan-above + scan-below + push, all in one
            // cross-segment call. The asm caches row addr / match decoder
            // across every sub-operation. C just pops and dispatches; this
            // path completes the entire per-seed work and computes the row
            // address itself, so we don't pay y*160 in C unless we fall back.
-        {
            bool seedMatched;
            if (halFastFloodWalkAndScans(s->pixels, x, y,
                                         matchColor, newNibble, matchEqual,
@ -203,22 +207,27 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
            }
        }

-        // Fallback path needs row; compute it here so the asm path
-        // above doesn't pay for an unused y*160 multiply on every iter.
-        row = &s->pixels[SURFACE_ROW_OFFSET(y)];
+        /* Fallback path: compute row only if chunky; halFastFloodWalk
+         * needs it but isn't implemented on Amiga. */
+        row = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(y)] : NULL;

        // Tier-2 asm fast path: combined seed test + walk-left +
        // walk-right in one cross-segment call. Falls back to the
        // pure-C walks below on ports without an asm implementation.
        {
            bool seedMatched;
-            if (halFastFloodWalk(row, x, matchColor, newNibble, matchEqual,
+            if (row != NULL && halFastFloodWalk(row, x, matchColor, newNibble, matchEqual,
+                                                &seedMatched, &leftX, &rightX)) {
+                if (!seedMatched) {
+                    continue;
+                }
+            } else if (halFloodWalkPlanes(s, x, y, matchColor, newNibble, matchEqual,
                                          &seedMatched, &leftX, &rightX)) {
                if (!seedMatched) {
                    continue;
                }
            } else {
-                pix = srcPixel(row, x);
+                pix = halSamplePixel(s, x, y);
                pixMatch = (pix == matchColor);
                if (matchEqual) {
                    if (!pixMatch) {
@ -233,7 +242,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                // Walk left to find the start of the matching run.
                leftX = x;
                while (leftX > 0) {
-                    pix = srcPixel(row, (int16_t)(leftX - 1));
+                    pix = halSamplePixel(s, (int16_t)(leftX - 1), y);
                    pixMatch = (pix == matchColor);
                    if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) {
                        break;
@ -244,7 +253,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                // Walk right to find the end.
                rightX = x;
                while (rightX < SURFACE_WIDTH - 1) {
-                    pix = srcPixel(row, (int16_t)(rightX + 1));
+                    pix = halSamplePixel(s, (int16_t)(rightX + 1), y);
                    pixMatch = (pix == matchColor);
                    if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) {
                        break;
@ -256,12 +265,18 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8

        // Fill the span. Bypass fillRect's clipping wrapper: walk-out
        // already guaranteed leftX/rightX are in [0..SURFACE_WIDTH-1]
-        // and the seed-pop bounds check did the same for y.
+        // and the seed-pop bounds check did the same for y. We DO
+        // need the planar dual-write (which fillRect's wrapper would
+        // call), so invoke halFillRectPlanes explicitly after the
+        // chunky span fill -- otherwise PLANAR_PRESENT builds (and,
+        // post-Phase-9, every build) display flood-filled regions
+        // as the unfilled background.
        {
            int16_t spanW = (int16_t)(rightX - leftX + 1);
            if (!halFastFillRect(s, leftX, y, (uint16_t)spanW, 1, newNibble)) {
                fillRectClipped(s, leftX, y, spanW, 1, newNibble);
            }
+            halFillRectPlanes(s, leftX, y, (uint16_t)spanW, 1, newNibble);
        }

        // Scan rows above and below for run boundaries. The hot
@ -291,19 +306,26 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                    }
                    scanY  = (int16_t)(y + 1);
                }
-                scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)];
+                scanRow = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(scanY)] : NULL;
                // Prefer the combined scan+push asm path (one call per
-                // scan, no markBuf and no per-pixel C edge walk).
-                if (!halFastFloodScanAndPush(scanRow, leftX, rightX,
+                // scan, no markBuf and no per-pixel C edge walk). Skip
+                // the asm tiers if we don't have a chunky row pointer
+                // (Phase 9 planar ports).
+                if (scanRow == NULL ||
+                    !halFastFloodScanAndPush(scanRow, leftX, rightX,
                                             matchColor, newNibble, matchEqual,
                                             scanY, stackX, stackY,
                                             &sp, FLOOD_STACK_SIZE)) {
-                    if (!halFastFloodScanRow(scanRow, leftX, rightX,
+                    if ((scanRow == NULL ||
+                         !halFastFloodScanRow(scanRow, leftX, rightX,
+                                              matchColor, newNibble, matchEqual,
+                                              floodMarkBuf)) &&
+                        !halFloodScanRowPlanes(s, leftX, rightX, scanY,
                                               matchColor, newNibble, matchEqual,
                                               floodMarkBuf)) {
                        // C fallback: fill markBuf the slow way.
                        for (i = 0; i < spanLen; i++) {
-                            pix = srcPixel(scanRow, (int16_t)(leftX + i));
+                            pix = halSamplePixel(s, (int16_t)(leftX + i), scanY);
                            pixMatch = (pix == matchColor);
                            floodMarkBuf[i] = (uint8_t)(matchEqual
                                ? (pixMatch ? 1 : 0)
@ -621,12 +643,12 @@ void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t
    if (!halFastFillRect(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex)) {
        fillRectClipped(s, sx, sy, sw, sh, colorIndex);
    }
+    halFillRectPlanes(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex);
    surfaceMarkDirtyRect(s, sx, sy, sw, sh);
 }


 void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
-    uint8_t *row;
    uint8_t  seedColor;

    if (s == NULL) {
@ -635,8 +657,9 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return;
    }
-    row       = &s->pixels[SURFACE_ROW_OFFSET(y)];
-    seedColor = srcPixel(row, x);
+    /* halSamplePixel reads from whichever storage the port uses --
+     * works on both chunky (s->pixels) and planar (s->portData) ports. */
+    seedColor = halSamplePixel(s, x, y);
    if ((seedColor & 0x0F) == (newColor & 0x0F)) {
        return;
    }
@ -645,7 +668,6 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {


 void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor) {
-    uint8_t *row;
    uint8_t  pix;

    if (s == NULL) {
@ -654,8 +676,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return;
    }
-    row = &s->pixels[SURFACE_ROW_OFFSET(y)];
-    pix = srcPixel(row, x);
+    pix = halSamplePixel(s, x, y);
    // Starting on a boundary pixel or already-filled pixel: nothing
    // to do.
    if ((pix & 0x0F) == (boundaryColor & 0x0F)) {
@ -669,25 +690,16 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8


 uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) {
-    uint8_t byte;
-
    if (s == NULL) {
        return 0;
    }
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return 0;
    }
-
-    /* Cast to uint16_t before shift -- already validated x >= 0,
-     * unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */
-    byte = s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
-    if (x & 1) {
-        return (uint8_t)(byte & 0x0F);
-    }
-    /* `byte >> 4` is uint8_t but ORCA-C promotes to int (signed 16-bit)
-     * for the shift, then narrows -- triggers ~SSHIFTRIGHT. The
-     * mask-then-shift sidesteps the promotion path. */
-    return (uint8_t)((byte & 0xF0u) >> 4);
+    /* halSamplePixel reads from whichever storage the port uses --
+     * chunky ports return a nibble extracted from s->pixels; planar
+     * ports read 4 plane bits and assemble the nibble. */
+    return halSamplePixel(s, x, y);
 }


@ -725,6 +737,8 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) {
            }
        }
    }
+    halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0,
+                      copyW, copyH, srcRowBytes, 0xFFFFu);
    surfaceMarkDirtyRect(dst, x, y, copyW, copyH);
 }

@ -768,6 +782,8 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t
            }
        }
    }
+    halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0,
+                      copyW, copyH, srcRowBytes, (uint16_t)transparent);
    surfaceMarkDirtyRect(dst, x, y, copyW, copyH);
 }

--- a/src/core/hal.h
+++ b/src/core/hal.h
@ -9,8 +9,11 @@
 #ifndef JOEYLIB_HAL_H
 #define JOEYLIB_HAL_H

+#include <stdio.h>
+
 #include "joey/core.h"
 #include "joey/input.h"
+#include "joey/sprite.h"
 #include "joey/surface.h"

 // Per-port one-shot initialization. Called from joeyInit after config
@ -27,17 +30,131 @@ void halShutdown(void);
 // backs the library-owned stage surface. Ports that have a
 // hardware-friendly pin location for the back buffer (IIgs $01/2000
 // with SHR shadow inhibited) return that address here; ports with no
-// such constraint just malloc/free.
+// such constraint just malloc/free. Planar 68k ports may return NULL
+// if the surface is planar-only and has no chunky shadow.
 uint8_t *halStageAllocPixels(void);
 void     halStageFreePixels(uint8_t *pixels);

-// Present the entire source surface to the display.
-void halPresent(const SurfaceT *src);
+// Allocate / release the per-surface portData blob (see SurfaceT in
+// surfaceInternal.h). Chunky ports return NULL from Init -- they keep
+// portData unused and operate on the chunky `pixels` buffer. Planar
+// 68k ports allocate a per-surface struct here describing the
+// bitplane storage (Amiga: 4 separate plane buffers + stride; ST: one
+// interleaved buffer + stride). Called by surfaceCreate / stageAlloc
+// after pixels is allocated; freed by surfaceDestroy / stageFree
+// before pixels is freed. `isStage` lets the port short-circuit for
+// the stage if its planes are display-owned (e.g. Amiga's BitMap
+// planes from OpenScreen) rather than allocated per surface.
+void *halSurfaceAllocPortData(SurfaceT *s, bool isStage);
+void  halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData);

-// Present a rectangular region of the source surface. The caller has
-// already validated and clipped the rect to be fully inside the
-// surface bounds and to have positive extents.
-void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h);
+// Phase 3 planar dual-write: called from cross-platform fillRect AFTER
+// the chunky shadow has been written, with the same already-clipped
+// (x, y, w, h) and the raw color index 0..15. Planar ports update
+// the bitplanes with the rect's bit pattern (per-plane bit value =
+// (color >> plane) & 1). Chunky ports (DOS, IIgs) provide a no-op
+// stub. Called unconditionally so cross-platform code doesn't have
+// to know the port is planar; the per-port stub is the cheapest
+// possible thing on chunky ports.
+void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex);
+
+// Phase 3 planar dual-write for surfaceCopy: called from cross-platform
+// surfaceCopy AFTER the chunky pixel buffer is memcpy'd. Planar ports
+// also memcpy the bitplanes from src to dst so JOEYLIB_PLANAR_PRESENT
+// builds see correct planes. dst and src are non-NULL and distinct
+// (caller's no-op guards already passed).
+void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src);
+
+// Phase 5 planar dual-write for tile ops. Called from cross-platform
+// tile.c AFTER the chunky path completes. (bx, by) are tile-grid
+// coords (0..39 horiz, 0..24 vert; surface is 40x25 tiles).
+// transparentIndex for tileCopyMasked: pixel value to skip. tilePaste
+// reads from a packed 32-byte chunky TileT (4 bytes/row x 8 rows).
+// All Amiga impls operate on the off-screen shadow planes via
+// AmigaPlanarT; chunky-port stubs are no-ops. tileSnap is read-only
+// so has no planar dual-write hook.
+void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex);
+void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
+void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex);
+void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile);
+
+// tileSnap: cross-platform code reads s->pixels chunky bytes into a
+// 32-byte TileT. On planar ports (s->pixels NULL) the chunky read
+// crashes -- this hook is the planar derivation: reads bitplane bits
+// for the tile rect and assembles 32 chunky bytes (4 bytes/row x 8
+// rows) into chunkyTileOut. Chunky ports (s->pixels valid) implement
+// this as a no-op since the cross-platform fallback already filled
+// chunkyTileOut from s->pixels.
+void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut);
+
+// Phase 6 planar dual-write for spriteDraw. Called from cross-platform
+// sprite.c AFTER spriteCompiledDraw or spriteDrawInterpreted has
+// updated the chunky shadow. (x, y) is the destination top-left in
+// surface pixels (may be partially off-surface; the hook does its own
+// clipping). Walks the sprite's chunky tile data and updates dst
+// surface planes for every non-transparent pixel (nibble != 0).
+// Save/restore of the planes is handled separately by
+// halSpriteSavePlanes / halSpriteRestorePlanes (declared below):
+// sprite.c calls them from the save-under / restore-under paths with
+// backup->bytes as the plane backup buffer, so this hook only has to
+// cover the draw itself.
+void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y);
+
+// Phase 8 planar dual-write for asset blits and full surface loads.
+// halBlitRectPlanes is called from surfaceBlit / surfaceBlitMasked
+// AFTER the chunky path. transparent == 0xFFFF means opaque blit; any
+// other value is a nibble (0..15) to skip. srcBytes is the asset's
+// raw chunky pixel buffer; srcRowBytes is its stride. (x, y) is the
+// already-clipped destination top-left in dst surface pixels;
+// srcX0/srcY0 is where in the asset the visible region starts after
+// clip; copyW/copyH is the visible region size in pixels.
+//
+void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent);
+
+// Phase 9 sprite save/restore plane data. Chunky ports already hold
+// pixel data in backup->bytes via the cross-platform memcpy. Planar
+// ports (Amiga) have s->pixels == NULL, so the chunky path never
+// touches backup->bytes -- we repurpose it to hold per-plane bytes.
+// Layout: 4 plane stripes of (h * bytesPerPlaneRow) bytes each, where
+// bytesPerPlaneRow = w/8 (sprite x and w are guaranteed 2-pixel
+// aligned by spriteSaveUnder; planar requires further 8-pixel
+// rounding -- see Amiga impl notes). Total bytes: 4 * h * w/8 =
+// h * w/2, the same as chunky, so the backup->sizeBytes capacity
+// works on both ports. Chunky-port impls are no-ops; Amiga writes /
+// reads plane bytes via AmigaPlanarT.
+void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
+void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
+
+// Phase 9 reader hooks. Cross-platform code calls these instead of
+// reading from s->pixels directly so it works regardless of whether
+// the port stores chunky or planar as the source of truth. Chunky
+// ports (DOS, IIgs) implement these reading from s->pixels (cheap);
+// Amiga reads from the bitplanes in AmigaPlanarT. (x, y) bounds are
+// already validated by the caller.
+//
+// halSamplePixel: returns the 0..15 nibble at (x, y).
+// halSurfaceHash: returns the FNV-style hash of pixel + scb + palette
+//   that surfaceHash currently computes by walking s->pixels. Allows
+//   ports to use their native pixel storage instead.
+// halSurfaceCopyChunky: cross-platform surfaceCopy used to memcpy
+//   s->pixels src->dst; on planar ports there is no chunky to copy
+//   (planes already covered by halSurfaceCopyPlanes). Chunky ports
+//   do the memcpy here; Amiga is a no-op.
+// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
+//   fwrite of the pixel data. Chunky ports stream directly to/from
+//   s->pixels; Amiga uses a scratch buffer + c2p (load) or
+//   plane->chunky derivation (save).
+uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
+uint32_t halSurfaceHash(const SurfaceT *s);
+void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
+bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
+bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
+
+// Present the dirty regions of the source surface to the display.
+// The cross-platform stagePresent walks the dirty arrays before
+// calling this; ports may use the dirty arrays themselves to skip
+// untouched rows.
+void halPresent(const SurfaceT *src);

 // Optional: returns a port-specific error message string for the last
 // HAL failure, or NULL if none. Ports may return NULL always.
@ -73,9 +190,23 @@ uint16_t halFrameHz(void);

 // Audio: per-port engine setup, module + SFX playback, teardown.
 // halAudioInit returns true if the platform has a working engine.
-// All entry points are safe to call when init failed -- they become
-// no-ops. See joey/audio.h for the public API contract that wraps
-// these.
+// Per-surface chunky pixel allocation. Chunky ports (DOS, IIgs, ST
+// while still chunky) allocate SURFACE_PIXELS_SIZE bytes (calloc-
+// style, zero-filled). Pure-planar Amiga returns NULL -- there's no
+// chunky shadow; cross-platform code that previously read s->pixels
+// goes through halSamplePixel / halSurfaceCopyChunky / etc. instead.
+// halSurfaceFreePixels mirrors free(); NULL is a valid input on
+// planar ports.
+uint8_t *halSurfaceAllocPixels(void);
+void     halSurfaceFreePixels(uint8_t *pixels);
+
+// Get a pointer to the start of bitplane `planeIdx` (0..3) for surface
+// `s`. Returns NULL on chunky ports (no planes). On Amiga returns
+// pd->planes[planeIdx] from the AmigaPlanarT struct in portData.
+// Used by the planar sprite codegen dispatcher to compute the 4
+// plane addresses to hand the emitted asm.
+uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx);
+
 bool halAudioInit(void);
 void halAudioShutdown(void);
 void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop);
@ -185,6 +316,21 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y,
                              bool *seedMatched,
                              int16_t *leftXOut, int16_t *rightXOut);

+// Planar variants of halFastFloodWalk / halFastFloodScanRow. Take a
+// SurfaceT* instead of a chunky-row pointer so they work on planar
+// ports (Amiga post-Phase 9) where s->pixels is NULL. Same semantics;
+// chunky ports return false (the chunky variants above are faster
+// when a chunky row is available). Replace the per-pixel
+// halSamplePixel walk on planar ports.
+bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y,
+                        uint8_t matchColor, uint8_t newColor, bool matchEqual,
+                        bool *seedMatched,
+                        int16_t *leftXOut, int16_t *rightXOut);
+
+bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY,
+                           uint8_t matchColor, uint8_t newColor, bool matchEqual,
+                           uint8_t *markBuf);
+
 // surfaceBlit / surfaceBlitMasked rect-copy helper. Caller has done
 // the clip math: dstRow0 / srcRow0 point at row 0 of the source/dest
 // regions, dstX / srcX are intra-row pixel offsets, copyW/copyH are
@ -333,6 +479,12 @@ extern uint16_t gFloodRightX;
 #undef  halFastFloodScanAndPush
 #define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false)

+// IIgs is chunky; the planar flood hooks are never reachable.
+#undef  halFloodWalkPlanes
+#define halFloodWalkPlanes(_s, _sx, _y, _mc, _nc, _me, _sm, _lx, _rx) (false)
+#undef  halFloodScanRowPlanes
+#define halFloodScanRowPlanes(_s, _lx, _rx, _sy, _mc, _nc, _me, _mb) (false)
+
 // Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX /
 // gFloodRightX; macro reads those into the caller's out-ptrs.
 #undef  halFastFloodWalkAndScans
--- a/src/core/present.c
+++ b/src/core/present.c
@ -2,8 +2,7 @@
 //
 // stagePresent walks the per-row dirty bands set by drawing primitives
 // and asks the port HAL to flip just those rows to the display, then
-// resets the dirty state. stagePresentRect bypasses dirty tracking
-// entirely and slams a caller-specified rectangle (after clipping).
+// resets the dirty state.

 #include <stddef.h>

@ -25,48 +24,3 @@ void stagePresent(void) {
    halPresent(stage);
    stageDirtyClearAll();
 }
-
-
-void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h) {
-    SurfaceT *stage;
-    int16_t   sx;
-    int16_t   sy;
-    int16_t   sw;
-    int16_t   sh;
-
-    stage = stageGet();
-    if (stage == NULL) {
-        return;
-    }
-
-    sx = x;
-    sy = y;
-    sw = (int16_t)w;
-    sh = (int16_t)h;
-
-    if (sw <= 0 || sh <= 0) {
-        return;
-    }
-    if (sx < 0) {
-        sw += sx;
-        sx = 0;
-    }
-    if (sy < 0) {
-        sh += sy;
-        sy = 0;
-    }
-    if (sx >= SURFACE_WIDTH || sy >= SURFACE_HEIGHT) {
-        return;
-    }
-    if (sx + sw > SURFACE_WIDTH) {
-        sw = SURFACE_WIDTH - sx;
-    }
-    if (sy + sh > SURFACE_HEIGHT) {
-        sh = SURFACE_HEIGHT - sy;
-    }
-    if (sw <= 0 || sh <= 0) {
-        return;
-    }
-
-    halPresentRect(stage, sx, sy, (uint16_t)sw, (uint16_t)sh);
-}
--- a/src/core/sprite.c
+++ b/src/core/sprite.c
@ -10,6 +10,7 @@

 #include "joey/sprite.h"
 #include "codegenArenaInternal.h"
+#include "hal.h"
 #include "spriteInternal.h"
 #include "surfaceInternal.h"

@ -22,6 +23,20 @@
 // Color 0 is always transparent for sprites (DESIGN.md contract).
 #define TRANSPARENT_NIBBLE 0

+// On Amiga (post-Phase 9 / Phase 6 redux) the compiled sprite emitter
+// writes directly to the bitplanes, so the halSprite*Planes hooks are
+// pure duplicate work after a compiled call. On other ports the
+// hooks are either no-op stubs (chunky-only IIgs/DOS) or the only
+// thing writing planes (ST: chunky-shadow + planes). Slow / interpreter
+// paths still need the hooks unconditionally on every platform -- the
+// chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook
+// is the only draw.
+#if defined(JOEYLIB_PLATFORM_AMIGA)
+#define COMPILED_SPRITE_WRITES_PLANES 1
+#else
+#define COMPILED_SPRITE_WRITES_PLANES 0
+#endif
+

 // ----- Prototypes -----

@ -144,6 +159,11 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y
        return;
    }

+    /* Skip the chunky write loop on planar ports (s->pixels == NULL).
+     * halSpriteDrawPlanes is called by the spriteDraw caller and does
+     * its own clip + plane write; the dirty mark below still runs
+     * here. Phase 9 dropped the chunky shadow on Amiga. */
+    if (s->pixels != NULL) {
        for (row = 0; row < h; row++) {
            dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
            for (col = 0; col < w; col++) {
@ -154,6 +174,7 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y
                writeDstNibble(dstRow, (int16_t)(dx + col), nibble);
            }
        }
+    }
    surfaceMarkDirtyRect(s, dx, dy, w, h);
 }

@ -200,6 +221,13 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y,
    if (src == NULL || widthTiles == 0 || heightTiles == 0) {
        return NULL;
    }
+    /* Phase 9: planar ports have NULL src->pixels. Capturing a sprite
+     * from such a surface needs a planar-to-chunky derivation hook;
+     * not implemented yet, so refuse the call. Apps targeting Amiga
+     * should ship sprites as static tile data instead. */
+    if (src->pixels == NULL) {
+        return NULL;
+    }
    // Source x/y must be on a tile boundary so each captured tile lands
    // on whole bytes -- mid-byte snapshots would lose half a pixel at
    // the left edge.
@ -284,10 +312,14 @@ void spriteDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y) {
    // need clip math (they walk fixed offsets).
    if (sp->slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
        spriteCompiledDraw(s, sp, x, y);
+        if (!COMPILED_SPRITE_WRITES_PLANES) {
+            halSpriteDrawPlanes(s, sp, x, y);
+        }
        surfaceMarkDirtyRect(s, x, y, (int16_t)widthPx, (int16_t)heightPx);
        return;
    }
    spriteDrawInterpreted(s, sp, x, y);
+    halSpriteDrawPlanes(s, sp, x, y);
 }


@ -332,7 +364,7 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac
        uint16_t  saveIdx;
        uint16_t  drawIdx;
        uint8_t  *offsetsBase;
-        shift       = (uint8_t)(x & 1);
+        shift       = SPRITE_SHIFT_INDEX(x);
        saveIdx     = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
        drawIdx     = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_DRAW);
        offsetsBase = (uint8_t *)sp->routineOffsets;
@ -340,6 +372,10 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac
            *(uint16_t *)(offsetsBase + (drawIdx << 1)) != SPRITE_NOT_COMPILED) {
            spriteCompiledSaveUnder(s, sp, x, y, backup);
            spriteCompiledDraw    (s, sp, x, y);
+            if (!COMPILED_SPRITE_WRITES_PLANES) {
+                halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes);
+                halSpriteDrawPlanes(s, sp, x, y);
+            }
            surfaceMarkDirtyRect  (s, x, y, (int16_t)widthPx, (int16_t)heightPx);
            return;
        }
@ -630,13 +666,18 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
        routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
        if (routeOffset != SPRITE_NOT_COMPILED) {
            spriteCompiledRestoreUnder(s, backup);
+            if (!COMPILED_SPRITE_WRITES_PLANES) {
+                halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes);
+            }
            surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
            return;
        }
    }

-    /* Slow / interpreted memcpy fallback. */
-    {
+    /* Slow / interpreted memcpy fallback. Skip the chunky memcpy if
+     * the port has no chunky shadow (Phase 9 Amiga: s->pixels NULL);
+     * halSpriteRestorePlanes below does the planar restore. */
+    if (s->pixels != NULL) {
        int16_t  row;
        int16_t  byteStart;
        uint8_t *dstRow;
@ -650,6 +691,7 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
                   (size_t)copyBytes);
        }
    }
+    halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes);
    surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
 }

@ -684,11 +726,14 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
    if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
        uint16_t routeIdx;
        uint16_t routeOffset;
-        shift       = (uint8_t)(x & 1);
+        shift       = SPRITE_SHIFT_INDEX(x);
        routeIdx    = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
        routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
        if (routeOffset != SPRITE_NOT_COMPILED) {
            spriteCompiledSaveUnder(s, sp, x, y, backup);
+            if (!COMPILED_SPRITE_WRITES_PLANES) {
+                halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes);
+            }
            return;
        }
    }
@ -744,11 +789,16 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
        // backup with bytes==NULL.
        return;
    }
+    /* Chunky save path: skip on planar ports (s->pixels NULL).
+     * halSpriteSavePlanes below covers the planar case. */
+    if (s->pixels != NULL) {
        for (row = 0; row < h; row++) {
            srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
            memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes],
                   &srcRow[byteStart],
                   (size_t)copyBytes);
        }
+    }
+    halSpriteSavePlanes(s, clippedX, dy, (uint16_t)clippedW, (uint16_t)h, backup->bytes);
    }   /* end slow path */
 }
--- a/src/core/spriteInternal.h
+++ b/src/core/spriteInternal.h
@ -13,6 +13,16 @@
 #define SPRITE_OP_RESTORE 2
 #define SPRITE_OP_COUNT   3

+// Per-platform shift index used by the dispatcher. Chunky 4bpp ports
+// store one nibble per pixel (two pixels per byte) so the only
+// sub-byte alignment is x & 1. Amiga planar packs 8 pixels per plane
+// byte so all 8 alignments (x & 7) matter.
+#if defined(JOEYLIB_PLATFORM_AMIGA)
+#define SPRITE_SHIFT_INDEX(x)  ((uint8_t)((x) & 7))
+#else
+#define SPRITE_SHIFT_INDEX(x)  ((uint8_t)((x) & 1))
+#endif
+
 // Sentinel stored in routineOffsets[shift][op] when that op's emitter
 // returned 0 bytes (i.e., the platform doesn't implement compiled
 // codegen for that op yet). Distinct from a real offset of 0, which
--- a/src/core/surface.c
+++ b/src/core/surface.c
@ -65,9 +65,10 @@ void surfaceCopy(SurfaceT *dst, const SurfaceT *src) {
    if (dst == NULL || src == NULL || dst == src) {
        return;
    }
-    memcpy(dst->pixels,  src->pixels,  SURFACE_PIXELS_SIZE);
+    halSurfaceCopyChunky(dst, src);          /* memcpy on chunky ports; no-op on planar */
    memcpy(dst->scb,     src->scb,     sizeof(src->scb));
    memcpy(dst->palette, src->palette, sizeof(src->palette));
+    halSurfaceCopyPlanes(dst, src);          /* 4 plane memcpys on planar ports; no-op on chunky */
    surfaceMarkDirtyAll(dst);
 }

@ -79,11 +80,18 @ SurfaceT *surfaceCreate(void) {
    if (s == NULL) {
        return NULL;
    }
-    s->pixels = (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
-    if (s->pixels == NULL) {
-        free(s);
-        return NULL;
-    }
+    /* halSurfaceAllocPixels returns NULL on planar ports (Amiga); the
+     * primary storage is the port-allocated planes via portData below. */
+    s->pixels = halSurfaceAllocPixels();
+    s->portData = halSurfaceAllocPortData(s, false);
+    /* Chunky ports leave portData NULL and planar ports leave pixels
+     * NULL, so a surface with neither has no backing store at all:
+     * the allocation failed. Fail here, as the old calloc check did,
+     * instead of handing back a surface that cannot be drawn to. */
+    if (s->pixels == NULL && s->portData == NULL) {
+        free(s);
+        return NULL;
+    }
    paletteInitDefault(s);
    return s;
 }
@ -96,11 +96,44 @@ void surfaceDestroy(SurfaceT *s) {
    if (s == gStage) {
        return;
    }
-    free(s->pixels);
+    halSurfaceFreePortData(s, false, s->portData);
+    halSurfaceFreePixels(s->pixels);
    free(s);
 }


+// Cross-port hash of a surface's pixels, SCB bytes and palette.
+//
+// The work is delegated to halSurfaceHash so each port can read
+// pixels from its native storage: chunky ports walk s->pixels, and
+// planar ports are expected to walk the plane bits and reassemble
+// the same chunky byte sequence, so equal logical pixels hash equal
+// on every port. Bytes are mixed with SURFACE_HASH_MIX_BYTE (see
+// surfaceInternal.h): two 16-bit streams built from shifts and
+// subtracts instead of a multiply, which keeps the cost low on
+// ORCA-C / 65816. The hash is meant for cross-port validation --
+// "did the same logical-pixel sequence produce the same hash?" --
+// not for collision resistance over arbitrary inputs.
+//
+// Returns 0 for a NULL surface, otherwise the port's halSurfaceHash
+// result.
+uint32_t surfaceHash(const SurfaceT *s) {
+    return (s != NULL) ? halSurfaceHash(s) : 0u;
+}
+
+
+
+
 bool surfaceLoadFile(SurfaceT *dst, const char *path) {
    FILE *fp;
    long  fileSize;
@ -125,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
        fclose(fp);
        return false;
    }
-    if (fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) {
+    if (!halSurfaceLoadFileChunky(dst, fp)) {
        fclose(fp);
        return false;
    }
@ -153,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
    if (fp == NULL) {
        return false;
    }
-    if (fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) {
+    if (!halSurfaceSaveFileChunky(src, fp)) {
        fclose(fp);
        return false;
    }
@ -228,13 +261,22 @ bool stageAlloc(void) {
    if (gStage == NULL) {
        return false;
    }
+    /* halStageAllocPixels returns NULL on planar ports (Amiga) where
+     * the chunky shadow doesn't exist; the planes from portData are
+     * the source of truth. NULL pixels alone is no longer a failure. */
    gStage->pixels = halStageAllocPixels();
-    if (gStage->pixels == NULL) {
-        free(gStage);
-        gStage = NULL;
-        return false;
-    }
+    if (gStage->pixels != NULL) {
        memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE);
+    }
+    gStage->portData = halSurfaceAllocPortData(gStage, true);
+    /* A stage with neither chunky pixels nor port planes has no
+     * storage at all, so treat that as the allocation failure the old
+     * check caught and unwind the same way. */
+    if (gStage->pixels == NULL && gStage->portData == NULL) {
+        free(gStage);
+        gStage = NULL;
+        return false;
+    }
    stageDirtyClearAll();
    paletteInitDefault(gStage);
    return true;
@ -255,6 +289,7 @@ void stageFree(void) {
    if (gStage == NULL) {
        return;
    }
+    halSurfaceFreePortData(gStage, true, gStage->portData);
    halStageFreePixels(gStage->pixels);
    free(gStage);
    gStage = NULL;
--- a/src/core/surfaceInternal.h
+++ b/src/core/surfaceInternal.h
@ -14,8 +14,17 @@
 // auto-mirroring to $E1). Caller-side `s->pixels[i]` syntax is
 // unchanged; only allocation/copy paths in surface.c shift to a
 // two-buffer model.
+//
+// portData is per-port opaque storage. On chunky ports (IIgs, DOS) it
+// stays NULL -- pixels is the source of truth. On planar ports
+// (Amiga, Atari ST) it is meant to point to a port-private struct
+// describing the 4 bitplanes (Amiga: 4 separate plane buffers +
+// stride; ST: single interleaved buffer + stride); the ST hook still
+// returns NULL until its planar rewrite lands. Cross-platform code
+// never touches it directly -- all primitive access goes through the
+// HAL (halFast* / hal*Planes hooks) on planar ports. See
+// project_planar_68k_plan.md for the full architecture.
 struct SurfaceT {
    uint8_t  *pixels;
+    void     *portData;
    uint8_t   scb[SURFACE_HEIGHT];
    uint16_t  palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
 };
@ -38,6 +47,18 @@ struct SurfaceT {
 extern uint8_t gStageMinWord[SURFACE_HEIGHT];
 extern uint8_t gStageMaxWord[SURFACE_HEIGHT];

+// Per-byte mixer for surfaceHash. Two streams, each a 16-bit
+// multiply-and-add: lo = lo * 31 + b, hi = hi * 251 + b.
+// Strength-reduced to shifts so ORCA-C doesn't emit `~UMUL2` (~150 cyc
+// per call); 32 KB hashed twice -> ~5 minutes per UBER run. The
+// shift form is 16-bit-equivalent (mod 2^16) so hash values are
+// identical to the original `* 31u` / `* 251u` form.
+//   lo * 31  ==  (lo << 5) - lo
+//   hi * 251 ==  (hi << 8) - (hi << 2) - hi
+// lo_ and hi_ must be assignable lvalues and b_ a side-effect-free
+// expression: each argument is expanded more than once.
+#define SURFACE_HASH_MIX_BYTE(lo_, hi_, b_) do { \
+    (lo_) = (uint16_t)(((((lo_) << 5) - (lo_)) + (b_))); \
+    (hi_) = (uint16_t)((((hi_) << 8) - ((hi_) << 2) - (hi_)) + (b_)); \
+} while (0)
+
 // Stage SCB / palette dirty flags. scbSet* and paletteSet set them
 // true when the stage's data is modified; the per-port present code
 // checks the flags and clears after upload. Replaces a per-frame
@ -50,6 +71,15 @@ extern bool gStagePaletteDirty;
 // bands are widened to cover the rect. If `s` is any other surface,
 // the call is a no-op -- non-stage surfaces never get presented, so
 // they don't carry dirty state.
+//
+// Planar ports rely on the chunky shadow + c2p path through Phase 8.
+// Planar-native primitives (Phases 3+) dual-write: they update both
+// the chunky pixels and the bitplanes in the same call, so c2p at
+// present time always derives correct planes from up-to-date chunky.
+// Phase 9 deletes the chunky shadow + c2p; only at that point will
+// per-row planar-vs-chunky tracking even be a possible question, and
+// the plan is to avoid it entirely there too (planes become the only
+// source of truth).
 void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h);

 // Shorthand for "every row, full width" -- used by surfaceClear and
--- a/src/core/tile.c
+++ b/src/core/tile.c
@ -147,6 +147,7 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
    if (!halFastTileCopy(dstRow0, srcRow0)) {
        copyTileOpaque(dstRow0, srcRow0);
    }
+    halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy);
    surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -178,6 +179,7 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
    if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
        copyTileMasked(dstRow0, srcRow0, transparentIndex);
    }
+    halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex);
    surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -209,6 +211,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
            row += SURFACE_BYTES_PER_ROW;
        }
    }
+    halTileFillPlanes(s, bx, by, colorIndex);
    surfaceMarkDirtyRect(s, (int16_t)pixelX, (int16_t)pixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -241,6 +244,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
            src    += TILE_BYTES_PER_ROW;
        }
    }
+    halTilePastePlanes(dst, bx, by, &in->pixels[0]);
    surfaceMarkDirtyRect(dst, (int16_t)pixelX, (int16_t)pixelY,
                         TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
 }
@ -261,9 +265,12 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
    }
    pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
    pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
-    srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
    dst    = &out->pixels[0];
-    if (!halFastTileSnap(dst, srcRow)) {
+    /* On planar ports (s->pixels NULL) the chunky read path is
+     * skipped; halTileSnapPlanes below derives the tile bytes from
+     * the bitplanes. */
+    if (src->pixels != NULL && !halFastTileSnap(dst, &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)])) {
+        srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
        for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
            dst[0] = srcRow[0];
            dst[1] = srcRow[1];
@ -273,4 +280,5 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
            dst    += TILE_BYTES_PER_ROW;
        }
    }
+    halTileSnapPlanes(src, bx, by, &out->pixels[0]);
 }
--- a/src/port/amiga/circle.s
+++ b/src/port/amiga/circle.s
@ -0,0 +1,270 @@
+| Amiga planar circle outline V4 -- 16-way color-specialized.
+|
+| Per Bresenham iter:
+|   1. Precompute 4 xp records (xp_byte_w + bitMask_b + notMask_b) for
+|      cx +/- bx and cx +/- by, stored at sp+0..15 (4 records x 4 bytes).
+|   2. Precompute 4 yp40 words for cy +/- by and cy +/- bx, stored at
+|      sp+16..23 (4 words x 2 bytes).
+|   3. Plot 8 octant pixels with hardcoded color: each pixel does 4
+|      branchless plane RMW ops (or.b for set bits, and.b for clear
+|      bits) -- no btst, no per-plane branch.
+|   4. Bresenham step.
+|
+| At function entry the color is masked to 4 bits and used as the index
+| into a 16-entry jump table that selects the matching main loop.
+| Each main loop has the color hardcoded into the per-plane RMW ops.
+|
+| The branchless plot saves ~20-28 cyc per plane vs V3's btst+branch
+| pattern -- ~640-900 cyc per Bresenham iter.
+|
+| ABI: cdecl. d2-d7/a2-a6 callee-save.
+|
+| void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1,
+|                                   uint8_t *p2, uint8_t *p3,
+|                                   uint16_t cx, uint16_t cy,
+|                                   uint16_t r,  uint8_t  color);
+|
+| Register allocation across the iter loop:
+|   d2.w   = bx (Bresenham)
+|   d3.w   = by (Bresenham)
+|   d4.w   = err (Bresenham)
+|   d5.w   = cx (cached)
+|   a4     = cy (cached, sign-extended)
+|   a0..a3 = plane bases
+|   a5     = bitMaskLut
+|   d0,d1,d6,d7 = scratch in precompute / plot
+|
+| Scratch block (24 bytes) at sp+0..23:
+|   sp+0..3:   xp1 record [xp_byte_w, bitMask_b, notMask_b] for cx+bx
+|   sp+4..7:   xp2 record for cx-bx
+|   sp+8..11:  xp3 record for cx+by
+|   sp+12..15: xp4 record for cx-by
+|   sp+16..17: yp1 word (cy+by) * 40
+|   sp+18..19: yp2 word (cy-by) * 40
+|   sp+20..21: yp3 word (cy+bx) * 40
+|   sp+22..23: yp4 word (cy-bx) * 40
+
+                .text
+
+
+| ---- XP_REC: build xp record at sp+slot for xp = cx <signOp> <xreg> ----
+| signOp: add or sub
+| xreg:   %d2 (bx) or %d3 (by)
+| slot:   0, 4, 8, or 12 (word xp>>3, byte bitMask, byte ~bitMask)
+| Trashes: d1, d6, d7 (d0 is untouched). Reads cx from d5, LUT via a5.
+
+                .macro  XP_REC  slot, signOp, xreg
+                move.w  %d5,%d6
+                \signOp\().w \xreg,%d6        | d6 = xp
+                move.w  %d6,%d7
+                lsr.w   #3,%d7                | d7 = xp >> 3 (xp_byte)
+                and.w   #7,%d6                | d6 = xp & 7
+                move.b  (%a5,%d6.w),%d6       | d6 = bitMask
+                move.b  %d6,%d1
+                not.b   %d1                   | d1 = notMask
+                move.w  %d7,\slot(%sp)        | xp_byte word
+                move.b  %d6,\slot+2(%sp)      | bitMask byte
+                move.b  %d1,\slot+3(%sp)      | notMask byte
+                .endm
+
+
+| ---- YP_REC: build yp40 word at sp+slot for yp = cy <signOp> <yreg> ----
+| Trashes: d0, d6. Reads cy from a4; callers use slot 16, 18, 20, or 22.
+                .macro  YP_REC  slot, signOp, yreg
+                move.l  %a4,%d6
+                \signOp\().w \yreg,%d6        | d6.w = yp
+                move.w  %d6,%d0
+                lsl.w   #3,%d6                | d6 = yp << 3
+                lsl.w   #5,%d0                | d0 = yp << 5
+                add.w   %d6,%d0               | d0 = yp * 40
+                move.w  %d0,\slot(%sp)
+                .endm
+
+
+| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ----
+| slotYp: 16, 18, 20, or 22 (yp40 word slot)
+| slotXp: 0, 4, 8, or 12   (xp record slot)
+| color:  literal 0..15
+| Trashes: d0, d1, d7
+
+                .macro  PLOT_FIXED  slotYp, slotXp, color
+                move.w  \slotYp(%sp),%d0      | d0 = yp40
+                add.w   \slotXp(%sp),%d0      | d0 += xp_byte
+                move.b  \slotXp+2(%sp),%d1    | d1.b = bitMask
+                move.b  \slotXp+3(%sp),%d7    | d7.b = notMask
+                .if  ((\color) & 1)
+                or.b    %d1,(%a0,%d0.w)
+                .else
+                and.b   %d7,(%a0,%d0.w)
+                .endif
+                .if  ((\color) & 2)
+                or.b    %d1,(%a1,%d0.w)
+                .else
+                and.b   %d7,(%a1,%d0.w)
+                .endif
+                .if  ((\color) & 4)
+                or.b    %d1,(%a2,%d0.w)
+                .else
+                and.b   %d7,(%a2,%d0.w)
+                .endif
+                .if  ((\color) & 8)
+                or.b    %d1,(%a3,%d0.w)
+                .else
+                and.b   %d7,(%a3,%d0.w)
+                .endif
+                .endm
+
+
+| ---- PLOT_8: plot all 8 octant pixels for a given hardcoded color ----
+
+                .macro  PLOT_8  color
+                PLOT_FIXED  16,  0, \color    | (cx+bx, cy+by)
+                PLOT_FIXED  16,  4, \color    | (cx-bx, cy+by)
+                PLOT_FIXED  18,  0, \color    | (cx+bx, cy-by)
+                PLOT_FIXED  18,  4, \color    | (cx-bx, cy-by)
+                PLOT_FIXED  20,  8, \color    | (cx+by, cy+bx)
+                PLOT_FIXED  20, 12, \color    | (cx-by, cy+bx)
+                PLOT_FIXED  22,  8, \color    | (cx+by, cy-bx)
+                PLOT_FIXED  22, 12, \color    | (cx-by, cy-bx)
+                .endm
+
+
+| ---- CO_BODY: full Bresenham loop body for a hardcoded color ----
+| Generates the per-iter precompute, branchless plot, and Bresenham
+| step. Uses unique labels via \color suffix.
+
+                .macro  CO_BODY  color
+                XP_REC   0, add, %d2          | xp1 = cx+bx
+                XP_REC   4, sub, %d2          | xp2 = cx-bx
+                XP_REC   8, add, %d3          | xp3 = cx+by
+                XP_REC  12, sub, %d3          | xp4 = cx-by
+                YP_REC  16, add, %d3          | yp1 = cy+by
+                YP_REC  18, sub, %d3          | yp2 = cy-by
+                YP_REC  20, add, %d2          | yp3 = cy+bx
+                YP_REC  22, sub, %d2          | yp4 = cy-bx
+
+                PLOT_8  \color
+
+                addq.w  #1,%d3
+                tst.w   %d4
+                bgt     .LcoDecX_\color
+                add.w   %d3,%d4
+                add.w   %d3,%d4
+                addq.w  #1,%d4
+                bra.w   .LcoLoop_\color
+.LcoDecX_\color:
+                subq.w  #1,%d2
+                add.w   %d3,%d4
+                add.w   %d3,%d4
+                sub.w   %d2,%d4
+                sub.w   %d2,%d4
+                addq.w  #1,%d4
+                bra.w   .LcoLoop_\color
+                .endm
+
+
+| ---- CO_LOOP_HDR: emit a labelled loop header for a color ----
+| Loops while bx >= by (signed, so bx = -1 after an r == 0 step exits).
+                .macro  CO_LOOP_HDR  color
+.LcoLoop_\color:
+                cmp.w   %d3,%d2               | flags from bx - by
+                blt.w   .LcoDone              | was bcs: unsigned saw -1 as 65535
+                CO_BODY \color
+                .endm
+
+
+| ---- Function entry ----
+
+                .equ    SP_SAVED, 44
+                .equ    SP_LOCAL, 24
+
+                .equ    SP_OFF,         (SP_SAVED + 4 + SP_LOCAL)
+
+                .equ    SP_P0,    SP_OFF + 0
+                .equ    SP_P1,    SP_OFF + 4
+                .equ    SP_P2,    SP_OFF + 8
+                .equ    SP_P3,    SP_OFF + 12
+                .equ    SP_CX,    SP_OFF + 16 + 2
+                .equ    SP_CY,    SP_OFF + 20 + 2
+                .equ    SP_R,     SP_OFF + 24 + 2
+                .equ    SP_COLOR, SP_OFF + 28 + 3
+
+                .globl  _surface68kAmigaCircleOutline
+
+_surface68kAmigaCircleOutline:
+                movem.l %d2-%d7/%a2-%a6,-(%sp)
+                lea     -SP_LOCAL(%sp),%sp
+
+                | Plane bases.
+                move.l  SP_P0(%sp),%a0
+                move.l  SP_P1(%sp),%a1
+                move.l  SP_P2(%sp),%a2
+                move.l  SP_P3(%sp),%a3
+                lea     bitMaskLut(%pc),%a5
+
+                | Cache cx in d5, cy (sign-extended) in a4.
+                move.w  SP_CX(%sp),%d5
+                move.w  SP_CY(%sp),%d6
+                ext.l   %d6
+                movea.l %d6,%a4
+
+                | Bresenham init.
+                move.w  SP_R(%sp),%d2         | bx = r
+                moveq   #0,%d3                | by = 0
+                moveq   #1,%d4
+                sub.w   %d2,%d4               | err = 1 - bx
+
+                | Dispatch on color (low 4 bits) -> one of 16 main loops.
+                | Each table entry is a bra.w (4 bytes), so index *= 4.
+                moveq   #0,%d6
+                move.b  SP_COLOR(%sp),%d6
+                and.w   #0x0F,%d6
+                add.w   %d6,%d6
+                add.w   %d6,%d6
+                lea     .LcoTable(%pc),%a6
+                jmp     0(%a6,%d6.w)
+
+.LcoTable:
+                bra.w   .LcoLoop_0
+                bra.w   .LcoLoop_1
+                bra.w   .LcoLoop_2
+                bra.w   .LcoLoop_3
+                bra.w   .LcoLoop_4
+                bra.w   .LcoLoop_5
+                bra.w   .LcoLoop_6
+                bra.w   .LcoLoop_7
+                bra.w   .LcoLoop_8
+                bra.w   .LcoLoop_9
+                bra.w   .LcoLoop_10
+                bra.w   .LcoLoop_11
+                bra.w   .LcoLoop_12
+                bra.w   .LcoLoop_13
+                bra.w   .LcoLoop_14
+                bra.w   .LcoLoop_15
+
+                CO_LOOP_HDR  0
+                CO_LOOP_HDR  1
+                CO_LOOP_HDR  2
+                CO_LOOP_HDR  3
+                CO_LOOP_HDR  4
+                CO_LOOP_HDR  5
+                CO_LOOP_HDR  6
+                CO_LOOP_HDR  7
+                CO_LOOP_HDR  8
+                CO_LOOP_HDR  9
+                CO_LOOP_HDR  10
+                CO_LOOP_HDR  11
+                CO_LOOP_HDR  12
+                CO_LOOP_HDR  13
+                CO_LOOP_HDR  14
+                CO_LOOP_HDR  15
+
+.LcoDone:
+                lea     SP_LOCAL(%sp),%sp
+                movem.l (%sp)+,%d2-%d7/%a2-%a6
+                rts
+
+
+                .align  2
+bitMaskLut:
+                .byte   0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
--- a/src/port/atarist/hal.c
+++ b/src/port/atarist/hal.c
@ -526,26 +526,6 @@ void halPresent(const SurfaceT *src) {
 }


-void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
-    uint16_t groupStart;
-    uint16_t groupEnd;
-
-    if (src == NULL || !gModeSet) {
-        return;
-    }
-    refreshPaletteStateIfNeeded(src);
-    // Each c2p group covers 16 horizontal pixels. Round dirty pixel
-    // range to the enclosing group range to keep the planar word
-    // alignment without missing edge pixels.
-    groupStart = (uint16_t)(x >> 4);
-    groupEnd   = (uint16_t)(((uint16_t)x + w + 15) >> 4);
-    if (groupEnd > ST_GROUPS_PER_ROW) {
-        groupEnd = ST_GROUPS_PER_ROW;
-    }
-    c2pRange(src, y, y + (int16_t)h, groupStart, groupEnd);
-}
-
-
 // Vsync() is XBIOS opcode 37; mintlib exposes it directly. It blocks
 // until the next 50 Hz (PAL) or 60 Hz (NTSC) vertical blank.
 void halWaitVBL(void) {
@ -730,6 +710,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t
 }


+bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
+    (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual;
+    (void)seedMatched; (void)leftXOut; (void)rightXOut;
+    return false;
+}
+
+
+bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
+    (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual;
+    (void)markBuf;
+    return false;
+}
+
+
 bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    (void)row;
    (void)leftX;
@ -798,6 +792,146 @@ bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) {
 }


+// Phase-1 planar plumbing: portData hooks declared and exported, but
+// returning NULL keeps the ST port operating in the legacy
+// chunky-with-c2p model. Phase 4 replaces this with an interleaved
+// planar buffer + stride blob, and rewrites every halFast* primitive
+// to read/write planes directly.
+void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
+    (void)s;
+    (void)isStage;
+    return NULL;
+}
+
+
+void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
+    (void)s;
+    (void)isStage;
+    (void)portData;
+}
+
+
+// ST planar dual-write isn't implemented yet (interleaved word-planar
+// layout needs a different code path than Amiga's separate plane
+// buffers). Stub for now; chunky shadow + c2p still drives display.
+void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
+    (void)s;
+    (void)x;
+    (void)y;
+    (void)w;
+    (void)h;
+    (void)colorIndex;
+}
+
+
+void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
+    (void)dst;
+    (void)src;
+}
+
+
+void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
+    (void)s; (void)bx; (void)by; (void)colorIndex;
+}
+void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
+    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
+}
+void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
+    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
+}
+void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
+    (void)dst; (void)bx; (void)by; (void)chunkyTile;
+}
+void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
+    (void)src; (void)bx; (void)by; (void)chunkyTileOut;
+}
+void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
+    (void)s; (void)sp; (void)x; (void)y;
+}
+void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
+    (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
+    (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
+}
+void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
+    (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
+}
+void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
+    (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
+}
+
+
+/* Reads one 4-bit pixel from the chunky buffer. The ST port still
+ * keeps the chunky shadow (c2p drives the display), so s->pixels is
+ * the source, exactly as on DOS / IIgs. Two pixels share a byte: even
+ * x is the high nibble, odd x the low nibble. No bounds or NULL
+ * checks are made here. */
+uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
+    const uint8_t *rowStart = &s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW];
+    uint8_t        pair     = rowStart[(uint16_t)x >> 1];
+
+    return (uint8_t)((x & 1) ? (pair & 0x0Fu) : (pair >> 4));
+}
+
+
+/* ST implementation of the surface hash: mixes every chunky pixel
+ * byte, then every SCB byte, then every palette word (high byte
+ * first, so the result does not depend on host endianness) through
+ * SURFACE_HASH_MIX_BYTE, and returns the two 16-bit streams packed as
+ * (hi << 16) | lo. s is dereferenced without a NULL check; surfaceHash
+ * filters NULL before calling. */
+uint32_t halSurfaceHash(const SurfaceT *s) {
+    uint16_t        lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
+    const uint8_t  *p;
+    const uint16_t *w;
+    uint8_t         b;
+    p      = s->pixels;
+    /* Pixel walk is unrolled 8 bytes per pass. The do/while runs at
+     * least once, so this assumes SURFACE_PIXELS_SIZE is a non-zero
+     * multiple of 8 -- TODO confirm against the surface constants. */
+    blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
+    do {
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        blocks--;
+    } while (blocks > 0u);
+    /* One SCB byte per scanline. */
+    p = s->scb;
+    for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+    }
+    /* Palette is walked as a flat run of 16-bit words starting at
+     * palette[0][0]; each word is fed high byte, then low byte. */
+    w = &s->palette[0][0];
+    for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
+        v  = *w++;
+        b  = (uint8_t)((v >> 8) & 0xFFu);  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b  = (uint8_t)(v & 0xFFu);         SURFACE_HASH_MIX_BYTE(lo, hi, b);
+    }
+    return ((uint32_t)hi << 16) | (uint32_t)lo;
+}
+
+
+void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
+    memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE);
+}
+
+
+bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
+}
+
+
+bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
+}
+
+
+uint8_t *halSurfaceAllocPixels(void) {
+    return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
+}
+
+
+void halSurfaceFreePixels(uint8_t *pixels) {
+    free(pixels);
+}
+
+
+uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
+    (void)s; (void)planeIdx;
+    return NULL;
+}
+
+
 uint8_t *halStageAllocPixels(void) {
    return (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
 }
--- a/src/port/dos/hal.c
+++ b/src/port/dos/hal.c
@ -244,21 +244,6 @@ void halPresent(const SurfaceT *src) {
 }


-void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
-    int16_t py;
-    int16_t yEnd;
-
-    if (src == NULL || gVgaMem == NULL) {
-        return;
-    }
-    uploadPaletteIfNeeded(src);
-    yEnd = y + (int16_t)h;
-    for (py = y; py < yEnd; py++) {
-        expandAndWriteLine(src, py, x, w, &gVgaMem[py * VGA_STRIDE]);
-    }
-}
-
-
 // VGA mode 13h vertical refresh on a real CRT runs at ~70 Hz. We
 // detect the start of vertical retrace by polling input status
 // register 1 ($3DA) bit 3: 1 = currently in vretrace. To get a
@ -423,6 +408,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t
 }


+bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
+    (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual;
+    (void)seedMatched; (void)leftXOut; (void)rightXOut;
+    return false;
+}
+
+
+bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
+    (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual;
+    (void)markBuf;
+    return false;
+}
+
+
 bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    (void)row;
    (void)leftX;
@ -499,3 +498,143 @@ uint8_t *halStageAllocPixels(void) {
 void halStageFreePixels(uint8_t *pixels) {
    free(pixels);
 }
+
+
+// DOS / VGA mode 13h is chunky-native (8bpp linear). portData is
+// unused; the chunky `pixels` buffer feeds the present-time
+// nearest-neighbor copy to VGA RAM. Both arguments are ignored and
+// the hook always returns NULL, so nothing is allocated here.
+void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
+    (void)s;
+    (void)isStage;
+    return NULL;
+}
+
+
+// Counterpart of halSurfaceAllocPortData. No-op on DOS: the alloc
+// hook never allocates, so all three arguments are ignored.
+void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
+    (void)s;
+    (void)isStage;
+    (void)portData;
+}
+
+
+// DOS has no bitplanes -- chunky pixels are the source of truth and
+// expandAndWriteLine derives the VGA DAC indices straight from them.
+// This hook is a stub here; the cross-platform fillRect calls it
+// unconditionally. Every argument is ignored and no memory is
+// touched.
+void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
+    (void)s;
+    (void)x;
+    (void)y;
+    (void)w;
+    (void)h;
+    (void)colorIndex;
+}
+
+
+// Planar surface-to-surface copy hook. No-op on DOS: both arguments
+// are ignored and neither surface is touched.
+void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
+    (void)dst;
+    (void)src;
+}
+
+
+// Remaining planar hooks (tile fill/copy/paste/snap, sprite draw and
+// save/restore, rect blit). Each one is a no-op on DOS: the body only
+// casts its arguments to void so they do not count as unused.
+// NOTE(review): halTileSnapPlanes and halSpriteSavePlanes leave their
+// output buffers unwritten -- confirm that callers on this port never
+// rely on them being filled.
+void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
+    (void)s; (void)bx; (void)by; (void)colorIndex;
+}
+void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
+    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
+}
+void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
+    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
+}
+void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
+    (void)dst; (void)bx; (void)by; (void)chunkyTile;
+}
+void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
+    (void)src; (void)bx; (void)by; (void)chunkyTileOut;
+}
+void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
+    (void)s; (void)sp; (void)x; (void)y;
+}
+void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
+    (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
+    (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
+}
+void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
+    (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
+}
+void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
+    (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
+}
+
+
+/* Phase 9 reader hooks: chunky ports use the original s->pixels-based
+ * paths. */
+
+// Returns the 4-bit value stored for pixel (x, y). Two pixels share
+// each byte of s->pixels: even x is the high nibble, odd x the low
+// nibble. No bounds checking is done; x and y are cast to uint16_t
+// before indexing, so a negative coordinate would index far out of
+// range.
+uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
+    uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
+    if (x & 1) return (uint8_t)(byte & 0x0Fu);
+    return (uint8_t)((byte & 0xF0u) >> 4);
+}
+
+
+// Folds the surface state into a 32-bit value: every byte of
+// s->pixels, then SURFACE_HEIGHT bytes of s->scb, then each palette
+// word (high byte first). The running state is two 16-bit halves
+// seeded with 0xACE1 / 0x1357 and updated by SURFACE_HASH_MIX_BYTE;
+// the result is (hi << 16) | lo.
+uint32_t halSurfaceHash(const SurfaceT *s) {
+    uint16_t        lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
+    const uint8_t  *p;
+    const uint16_t *w;
+    uint8_t         b;
+    p      = s->pixels;
+    blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
+    // Pixels, unrolled 8 bytes per iteration. The do/while body runs
+    // at least once, and any SURFACE_PIXELS_SIZE % 8 tail bytes are
+    // not mixed in.
+    // NOTE(review): assumes SURFACE_PIXELS_SIZE is a non-zero
+    // multiple of 8 -- confirm.
+    do {
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        blocks--;
+    } while (blocks > 0u);
+    // SCB bytes, SURFACE_HEIGHT of them.
+    p = s->scb;
+    for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+    }
+    // Palette words, walked as one flat run starting at palette[0][0].
+    // Each word is split explicitly (high byte, then low byte) so the
+    // result does not depend on host byte order.
+    w = &s->palette[0][0];
+    for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
+        v  = *w++;
+        b  = (uint8_t)((v >> 8) & 0xFFu);  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b  = (uint8_t)(v & 0xFFu);         SURFACE_HASH_MIX_BYTE(lo, hi, b);
+    }
+    return ((uint32_t)hi << 16) | (uint32_t)lo;
+}
+
+
+// Copies SURFACE_PIXELS_SIZE bytes of pixel data from src to dst.
+// Only the pixels buffer is copied here. memcpy is used, so the two
+// buffers must not overlap.
+void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
+    memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE);
+}
+
+
+// Reads SURFACE_PIXELS_SIZE raw bytes from fp into dst->pixels.
+// Returns true only when the full amount was read; after a short
+// read the buffer may already be partially overwritten.
+bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
+}
+
+
+// Writes SURFACE_PIXELS_SIZE raw bytes from src->pixels to fp.
+// Returns true only when fwrite reports the full amount written.
+bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
+}
+
+
+// Allocates a surface pixel buffer of SURFACE_PIXELS_SIZE bytes.
+// calloc zero-fills it; the result is NULL if the allocation fails.
+// Release the buffer with halSurfaceFreePixels.
+uint8_t *halSurfaceAllocPixels(void) {
+    return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
+}
+
+
+// Releases a pixel buffer with free(). Passing NULL is harmless
+// because free(NULL) is a no-op.
+void halSurfaceFreePixels(uint8_t *pixels) {
+    free(pixels);
+}
+
+
+// Plane-pointer hook. DOS ignores both arguments and always returns
+// NULL.
+// NOTE(review): presumably NULL means "no per-plane storage here" --
+// confirm that callers check for it before dereferencing.
+uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
+    (void)s; (void)planeIdx;
+    return NULL;
+}
--- a/src/port/iigs/hal.c
+++ b/src/port/iigs/hal.c
@ -26,12 +26,25 @@
 // crowd up against the 64 KB-per-bank limit).

 #include <stddef.h>
+#include <stdlib.h>
 #include <string.h>

 #include "joey/debug.h"
 #include "hal.h"
 #include "surfaceInternal.h"

+/* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick
+ * ($2503) and returns the low 16 bits of the system's tick counter
+ * (firmware VBL ISR-driven). Polling $C019 from C user code missed
+ * transitions for any op over ~1 ms; the system's tick counter is
+ * updated by the actual interrupt handler so it stays accurate
+ * regardless of caller polling rate. Tick rate matches the video
+ * field rate -- 60 Hz on NTSC, 50 Hz on PAL. */
+extern uint16_t iigsGetTickWord(void);
+/* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. */
+extern uint16_t iigsReadHzParam(void);
+static uint16_t gFrameHz = 60u;
+
 // hal.c is the single TU that calls into joeyDraw.asm. Cross-
 // platform draw.c / tile.c / etc. dispatch through halFast*
 // functions defined here; they never reference the asm symbols
@ -210,6 +223,7 @@ bool halInit(const JoeyConfigT *config) {
    // is unreliable from halInit's calling context, so we don't try
    // it here -- the first present will set up SCB to 320 mode.
    iigsInitRowLut();
+    gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u;
    gModeSet = true;
    return true;
 }
@ -234,40 +248,6 @@ void halPresent(const SurfaceT *src) {
 }


-void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
-    uint16_t copyBytes;
-    int16_t  byteStart;
-    uint16_t srcOffset;
-
-    if (src == NULL) {
-        return;
-    }
-
-    uploadScbAndPaletteIfNeeded(src);
-
-    // Pixel copy: byte-aligned runs per scanline. x is always >= 0
-    // after API-level clipping. Use unsigned shifts to avoid
-    // ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t.
-    byteStart = (int16_t)((uint16_t)x >> 1);
-    copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart);
-
-    if (copyBytes == 0 || h == 0) {
-        return;
-    }
-
-    // Pixel copy: prefer the PEI-slam variant when the rect satisfies
-    // its contract (copyBytes even, 2..80). Sprite-rect presents
-    // (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or
-    // odd-byte rects fall back to MVN, which has no width cap.
-    srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart);
-    if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) {
-        iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h);
-    } else {
-        iigsBlitRectStageToShr(srcOffset, copyBytes, h);
-    }
-}
-
-
 void halShutdown(void) {
    if (gModeSet) {
        *IIGS_NEWVIDEO_REG = gPreviousNewVideo;
@ -305,6 +285,142 @@ void halStageFreePixels(uint8_t *pixels) {
 }


+// IIgs is chunky-native: portData is unused. The chunky `pixels`
+// buffer at $01:2000 is the stage's pixel storage and the source for
+// stagePresent's PEI-slam to $E1. Both arguments are ignored and the
+// hook always returns NULL, so nothing is allocated here.
+void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
+    (void)s;
+    (void)isStage;
+    return NULL;
+}
+
+
+// Counterpart of halSurfaceAllocPortData. No-op on IIgs: the alloc
+// hook never allocates, so all three arguments are ignored.
+void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
+    (void)s;
+    (void)isStage;
+    (void)portData;
+}
+
+
+// IIgs SHR is chunky-native; no bitplanes to update. Every argument
+// is ignored and no memory is touched.
+void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
+    (void)s;
+    (void)x;
+    (void)y;
+    (void)w;
+    (void)h;
+    (void)colorIndex;
+}
+
+
+// Planar surface-to-surface copy hook. No-op on IIgs: both arguments
+// are ignored and neither surface is touched.
+void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
+    (void)dst;
+    (void)src;
+}
+
+
+// Remaining planar hooks (tile fill/copy/paste/snap, sprite draw and
+// save/restore, rect blit). Each one is a no-op on IIgs: the body
+// only casts its arguments to void so they do not count as unused.
+// NOTE(review): halTileSnapPlanes and halSpriteSavePlanes leave their
+// output buffers unwritten -- confirm that callers on this port never
+// rely on them being filled.
+void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
+    (void)s; (void)bx; (void)by; (void)colorIndex;
+}
+void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
+    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
+}
+void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
+    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
+}
+void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
+    (void)dst; (void)bx; (void)by; (void)chunkyTile;
+}
+void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
+    (void)src; (void)bx; (void)by; (void)chunkyTileOut;
+}
+void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
+    (void)s; (void)sp; (void)x; (void)y;
+}
+void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
+    (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
+    (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
+}
+void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
+    (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
+}
+void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
+    (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
+}
+
+
+/* Phase 9 chunky reader hooks: IIgs reads from s->pixels just like
+ * the legacy paths did. Same logic as the DOS port. */
+/* Returns the 4-bit value stored for pixel (x, y). Two pixels share
+ * each byte of s->pixels: even x is the high nibble, odd x the low
+ * nibble. No bounds checking is done; x and y are cast to uint16_t
+ * before indexing, so a negative coordinate would index far out of
+ * range. */
+uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
+    uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
+    if (x & 1) return (uint8_t)(byte & 0x0Fu);
+    return (uint8_t)((byte & 0xF0u) >> 4);
+}
+
+
+// Folds the surface state into a 32-bit value: every byte of
+// s->pixels, then SURFACE_HEIGHT bytes of s->scb, then each palette
+// word (high byte first). The running state is two 16-bit halves
+// seeded with 0xACE1 / 0x1357 and updated by SURFACE_HASH_MIX_BYTE;
+// the result is (hi << 16) | lo.
+uint32_t halSurfaceHash(const SurfaceT *s) {
+    uint16_t        lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
+    const uint8_t  *p;
+    const uint16_t *w;
+    uint8_t         b;
+    p      = s->pixels;
+    blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
+    // Pixels, unrolled 8 bytes per iteration. The do/while body runs
+    // at least once, and any SURFACE_PIXELS_SIZE % 8 tail bytes are
+    // not mixed in.
+    // NOTE(review): assumes SURFACE_PIXELS_SIZE is a non-zero
+    // multiple of 8 -- confirm.
+    do {
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        blocks--;
+    } while (blocks > 0u);
+    // SCB bytes, SURFACE_HEIGHT of them.
+    p = s->scb;
+    for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
+        b = *p++;  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+    }
+    // Palette words, walked as one flat run starting at palette[0][0].
+    // Each word is split explicitly (high byte, then low byte) so the
+    // result does not depend on host byte order.
+    w = &s->palette[0][0];
+    for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
+        v  = *w++;
+        b  = (uint8_t)((v >> 8) & 0xFFu);  SURFACE_HASH_MIX_BYTE(lo, hi, b);
+        b  = (uint8_t)(v & 0xFFu);         SURFACE_HASH_MIX_BYTE(lo, hi, b);
+    }
+    return ((uint32_t)hi << 16) | (uint32_t)lo;
+}
+
+
+// Copies SURFACE_PIXELS_SIZE bytes of pixel data from src to dst.
+// Only the pixels buffer is copied here. memcpy is used, so the two
+// buffers must not overlap.
+void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
+    memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE);
+}
+
+
+// Reads SURFACE_PIXELS_SIZE raw bytes from fp into dst->pixels.
+// Returns true only when the full amount was read; after a short
+// read the buffer may already be partially overwritten.
+bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
+}
+
+
+// Writes SURFACE_PIXELS_SIZE raw bytes from src->pixels to fp.
+// Returns true only when fwrite reports the full amount written.
+bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
+}
+
+
+// Allocates a surface pixel buffer of SURFACE_PIXELS_SIZE bytes.
+// calloc zero-fills it; the result is NULL if the allocation fails.
+// Release the buffer with halSurfaceFreePixels.
+uint8_t *halSurfaceAllocPixels(void) {
+    return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
+}
+
+
+// Releases a pixel buffer with free(). Passing NULL is harmless
+// because free(NULL) is a no-op.
+void halSurfaceFreePixels(uint8_t *pixels) {
+    free(pixels);
+}
+
+
+// Plane-pointer hook. IIgs ignores both arguments and always returns
+// NULL.
+// NOTE(review): presumably NULL means "no per-plane storage here" --
+// confirm that callers check for it before dereferencing.
+uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
+    (void)s; (void)planeIdx;
+    return NULL;
+}
+
+
 // $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active
 // scan. To produce a rising-edge wait (one VBL per call), first spin
 // while VBL is currently active (bit 7 = 0), then spin until VBL
@ -333,24 +449,11 @@ void halWaitVBL(void) {
 // byte and the counter never advances. The explicit lda > / sta >
 // pattern uses long-mode addressing throughout, which is
 // DBR-independent.
-static uint16_t gFrameCount  = 0;
-static uint8_t  gPrevInVbl   = 0;
-
 uint16_t halFrameCount(void) {
-    uint8_t  now;
-    uint16_t cnt;
-
-    now = (*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0;
-    if (now && !gPrevInVbl) {
-        cnt = gFrameCount;
-        cnt = (uint16_t)(cnt + 1u);
-        gFrameCount = cnt;
-    }
-    gPrevInVbl = now;
-    return gFrameCount;
+    // Low 16 bits of the Misc Toolset tick counter, fetched through
+    // the iigsGetTickWord asm wrapper. Being a 16-bit slice, the value
+    // wraps at 65536.
+    // NOTE(review): unlike the old software counter this is probably
+    // not zero-based at init -- confirm callers only use differences.
+    return iigsGetTickWord();
 }


 uint16_t halFrameHz(void) {
-    return 60u;
+    // gFrameHz defaults to 60 and is set once in halInit: 50 when
+    // iigsReadHzParam() returns 1, otherwise 60.
+    return gFrameHz;
 }
--- a/src/port/iigs/peislam.asm
+++ b/src/port/iigs/peislam.asm
@ -1,15 +1,66 @@
-* peislam.asm - placeholder.
-*
-* The original PEI-slam-per-row helper was removed; its functionality
-* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam
-* with per-row dirty skip). This stub remains so the build's
-* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load
-* segment and the linker keeps the same segment-bank layout it had
-* when peislam.asm was a real translation unit.
+* peislam.asm - originally a PEI-slam helper, now hosts the GetTick
+* and ReadBParam trampolines. The PEI-slam logic was rolled into
+* iigsBlitStageToShr in joeyDraw.asm.

                keep    PEISLAM
                case    on

+
+* Stub kept so the PEISLAM load segment stays present (the build's
+* PORT_ASM_SRCS_ALL wildcard pulls in this file by name).
 peislamStub     start   IIGSASM
                rtl
                end
+
+
+****************************************************************
+* uint16_t iigsGetTickWord(void)
+*
+* Calls Misc Toolset GetTick ($2503) and returns the low 16 bits of
+* the 32-bit tick counter. The system increments this counter from
+* the actual VBL hardware interrupt, so it stays accurate regardless
+* of caller polling rate -- C-side polling of $C019 missed transitions
+* for any op over ~1 ms.
+*
+* GetTick output convention: caller pushes 4 bytes of output space,
+* tool dispatcher writes the LongWord into them. We pull the low 16
+* bits into A (ORCA-C Word return convention -- A holds the result,
+* not Y; verified against jIIgs.asm asmGetVbl) and discard the high
+* 16 into X.
+*
+* ORCA-C cdecl ABI: caller has M=I=16. Word return in A.
+****************************************************************
+
+* Entry point. Takes no arguments; result is returned in A.
+* NOTE(review): whatever the dispatcher leaves in A / carry after the
+* jsl is overwritten by the pla below without being inspected --
+* confirm that ignoring it is intended for GetTick.
+iigsGetTickWord start IIGSASM
+                pha             ; output space high word
+                pha             ; output space low word
+                ldx     #$2503  ; _GetTick
+                jsl     $E10000
+
+                pla             ; A = low 16 bits (return value)
+                plx             ; discard high 16 bits
+                rtl
+                end
+
+
+****************************************************************
+* uint16_t iigsReadHzParam(void)
+*
+* Reads battery RAM parameter hrtz50or60 ($1D) via _ReadBParam ($0C03)
+* and returns the raw value: 0 = NTSC (60 Hz), 1 = PAL (50 Hz).
+*
+* GetTick fires from the hardware VBL ISR, so its rate matches the
+* video field rate -- 60 Hz on NTSC, 50 Hz on PAL. halFrameHz must
+* report whichever this machine actually runs so wall-clock math
+* (frames * 1000 / halFrameHz) is correct on both.
+****************************************************************
+
+* Entry point. Takes no arguments; result is returned in A.
+* Stack protocol: one Word of result space is pushed first, then the
+* parameter ID; after the call the single pla pulls the result Word.
+* NOTE(review): whatever the dispatcher leaves in A / carry after the
+* jsl is overwritten by the pla without being inspected -- confirm
+* that ignoring it is intended for ReadBParam.
+iigsReadHzParam start IIGSASM
+                pha             ; output space (Word)
+                pea     $001D   ; hrtz50or60 parameter ID
+                ldx     #$0C03  ; _ReadBParam
+                jsl     $E10000
+
+                pla             ; A = result (ORCA-C Word return)
+                rtl
+                end
--- a/src/shared68k/surface68k.s
+++ b/src/shared68k/surface68k.s
@ -253,3 +253,253 @@ _surface68kFillRectByteAligned:
 .Lfrb_done:
                movem.l (%sp)+,%d2-%d6
                rts
+
+
+| ----------------------------------------------------------------
+| void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1,
+|                                uint8_t *p2, uint8_t *p3,
+|                                uint16_t numMid,
+|                                uint8_t  leftMask, uint8_t rightMask,
+|                                uint8_t  fb0, uint8_t fb1,
+|                                uint8_t  fb2, uint8_t fb3);
+|
+| Fill ONE planar row across 4 planes -- the per-row body of
+| halFillRectPlanes lifted into asm. Each pN points at the leading
+| byte (already advanced by planeBase + y*40 + byteFirst on the C
+| side). leftMask and rightMask are the partial-byte masks for the
+| left/right edges; numMid is the count of full bytes between them.
+| fbN is 0x00 or 0xFF, the per-plane fill byte (caller pre-classifies
+| (colorIndex >> N) & 1 -> 0xFF or 0x00).
+|
+| Used by Amiga halFastFillCircle (one call per scanline span) and
+| Amiga halFillRectPlanes (one call per row of the rect). Replaces
+| the C inner loop whose ~13 cyc/byte was the gating cost on
+| fillCircle r=40 even after C-side inlining.
+|
+| Mask convention is uniform for all planes:
+|   leading byte  := (*p & ~leftMask)  | (fbN & leftMask)
+|   middle bytes  := fbN
+|   trailing byte := (*p & ~rightMask) | (fbN & rightMask)
+| -- branchless: the same arithmetic produces "set" or "clear" based
+| on whether fbN is 0xFF or 0x00.
+|
+| ABI: m68k cdecl. d2-d7/a2-a6 callee-save (movem'd here).
+| Stack offset to first arg after MOVEM: 11 regs * 4 = 44 bytes saved
+| + 4 ret PC = 48.
+| ----------------------------------------------------------------
+                .globl  _surface68kFillSpan4Planes
+
+                .equ    SP_SAVED, 44
+                .equ    SP_RPC,    4
+                .equ    SP_OFF,   (SP_SAVED + SP_RPC)
+
+                .equ    SP_P0,    SP_OFF + 0
+                .equ    SP_P1,    SP_OFF + 4
+                .equ    SP_P2,    SP_OFF + 8
+                .equ    SP_P3,    SP_OFF + 12
+                .equ    SP_NMID,  SP_OFF + 16 + 2  | int -> low word at +2
+                .equ    SP_LMASK, SP_OFF + 20 + 3  | int -> low byte at +3
+                .equ    SP_RMASK, SP_OFF + 24 + 3
+                .equ    SP_FB0,   SP_OFF + 28 + 3
+                .equ    SP_FB1,   SP_OFF + 32 + 3
+                .equ    SP_FB2,   SP_OFF + 36 + 3
+                .equ    SP_FB3,   SP_OFF + 40 + 3
+
+| Per-plane work, fully inlined. No assembler macro is involved: the
+| body is written out by hand once per plane. For each copy:
+|   %aN       = the address register holding this plane's pointer.
+|   SP_FBn    = the stack offset for this plane's fillByte.
+| Uses d5 (fill byte) and d6/d7 as scratch; d1=leftMask, d2=~leftMask,
+| d3=rightMask, d4=~rightMask; d0=numMid-1 (only valid if numMid > 0).
+| When numMid is 0 at entry the code branches once to .LfsNoMid, a
+| second 4-plane unroll that has no middle loop.
+|
+| Hand-unrolled 4x rather than using bsr+rts to dodge ~12 cyc per
+| return + the per-plane re-test of numMid that the previous build
+| paid. The mid-loop label suffix is the plane index so all four
+| inline copies can coexist without label collisions.
+|
+| Plain text version of the per-plane body (translate to asm 4x with
+| different a-regs and fb stack offsets):
+|
+|   move.b  (an),%d6
+|   and.b   %d2,%d6
+|   move.b  fb,%d7
+|   and.b   %d1,%d7
+|   or.b    %d7,%d6
+|   move.b  %d6,(an)+
+|   < if has-middle path: >
+|     move.w  %d0,%d7
+|   .midN:
+|     move.b  fb,(an)+
+|     dbra    %d7,.midN
+|   < trailing: >
+|   move.b  (an),%d6
+|   and.b   %d4,%d6
+|   move.b  fb,%d7
+|   and.b   %d3,%d7
+|   or.b    %d7,%d6
+|   move.b  %d6,(an)
+
+_surface68kFillSpan4Planes:
+                | Save callee-saved registers. Only d2-d7 and a2-a3 are
+                | modified below; a4-a6 are saved and restored but never
+                | touched.
+                | NOTE(review): trimming the register list would need
+                | SP_SAVED (and both restore lists) changed to match.
+                movem.l %d2-%d7/%a2-%a6,-(%sp)
+
+                | d1 = leftMask, d2 = ~leftMask, d3 = rightMask,
+                | d4 = ~rightMask (byte-wide; upper bits are don't-care).
+                move.b  SP_LMASK(%sp),%d1
+                move.b  %d1,%d2
+                not.b   %d2
+                move.b  SP_RMASK(%sp),%d3
+                move.b  %d3,%d4
+                not.b   %d4
+
+                | a0..a3 = leading-byte pointer for planes 0..3.
+                move.l  SP_P0(%sp),%a0
+                move.l  SP_P1(%sp),%a1
+                move.l  SP_P2(%sp),%a2
+                move.l  SP_P3(%sp),%a3
+
+                | One-time numMid test. d0.w = numMid; if 0 jump to
+                | the no-middle entry, otherwise pre-decrement for dbra
+                | and fall into the with-middle entry. Both paths
+                | unroll all 4 planes.
+                move.w  SP_NMID(%sp),%d0
+                beq     .LfsNoMid
+                subq.w  #1,%d0
+
+                | ---- WITH-MIDDLE PATH ----
+                | Each plane: merge the leading byte under leftMask,
+                | store numMid full fill bytes (d7 is a per-plane copy
+                | of d0 because dbra consumes it), then merge the
+                | trailing byte under rightMask. The trailing byte is
+                | always read and written back, even if rightMask is 0.
+                | Plane 0
+                move.b  (%a0),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB0(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a0)+
+                move.w  %d0,%d7
+.LfsMid0:       move.b  %d5,(%a0)+
+                dbra    %d7,.LfsMid0
+                move.b  (%a0),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a0)
+
+                | Plane 1
+                move.b  (%a1),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB1(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a1)+
+                move.w  %d0,%d7
+.LfsMid1:       move.b  %d5,(%a1)+
+                dbra    %d7,.LfsMid1
+                move.b  (%a1),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a1)
+
+                | Plane 2
+                move.b  (%a2),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB2(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a2)+
+                move.w  %d0,%d7
+.LfsMid2:       move.b  %d5,(%a2)+
+                dbra    %d7,.LfsMid2
+                move.b  (%a2),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a2)
+
+                | Plane 3
+                move.b  (%a3),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB3(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a3)+
+                move.w  %d0,%d7
+.LfsMid3:       move.b  %d5,(%a3)+
+                dbra    %d7,.LfsMid3
+                move.b  (%a3),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a3)
+
+                movem.l (%sp)+,%d2-%d7/%a2-%a6
+                rts
+
+.LfsNoMid:
+                | ---- NO-MIDDLE PATH (just leading + trailing) ----
+                | Same merges as above with the fill loop removed; the
+                | trailing byte is the one right after the leading byte.
+                | Plane 0
+                move.b  (%a0),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB0(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a0)+
+                move.b  (%a0),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a0)
+
+                | Plane 1
+                move.b  (%a1),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB1(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a1)+
+                move.b  (%a1),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a1)
+
+                | Plane 2
+                move.b  (%a2),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB2(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a2)+
+                move.b  (%a2),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a2)
+
+                | Plane 3
+                move.b  (%a3),%d6
+                and.b   %d2,%d6
+                move.b  SP_FB3(%sp),%d5
+                move.b  %d5,%d7
+                and.b   %d1,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a3)+
+                move.b  (%a3),%d6
+                and.b   %d4,%d6
+                move.b  %d5,%d7
+                and.b   %d3,%d7
+                or.b    %d7,%d6
+                move.b  %d6,(%a3)
+
+                movem.l (%sp)+,%d2-%d7/%a2-%a6
+                rts
--- a/tools/diff-uber-hashes
+++ b/tools/diff-uber-hashes
@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""Compare two UBER joeylog.txt files by per-op surface hash.
+
+Used by the planar 68k rewrite (project_planar_68k_plan.md): IIgs
+captures the golden reference, each 68k port re-runs UBER after a
+primitive conversion, and this tool tells you which ops produced
+different pixels. Without this, "looks right visually" misses the
+subtle mismatches that cascade into hard-to-debug corruption.
+
+Usage:
+    tools/diff-uber-hashes <reference-log> <test-log>
+
+Exit code:
+    0 = all hashes match
+    1 = at least one mismatch, missing op, or extra op
+    2 = usage error or missing file
+"""
+
+import re
+import sys
+
+# Match e.g.:
+#   UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4
+LINE_RE = re.compile(
+    r"UBER:\s+(?P<op>[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+\d+\s+ops/sec\s+\|\s+hash=(?P<hash>[0-9A-Fa-f]+)"
+)
+
+
+def parse_log(path):
+    """Return ordered dict {op_name: hash} from a UBER log file.
+
+    Multiple runs may be concatenated in the same log (joeyLog appends)
+    -- in that case the LAST hash for each op wins, matching the most
+    recent run.
+
+    The file is decoded as UTF-8 with undecodable bytes replaced, so a
+    log carrying stray high-bit bytes cannot raise UnicodeDecodeError
+    (a ValueError, which main()'s OSError handler would not catch).
+    Hash values are hex digits, so replacement can only ever show up
+    inside op-name text. OSError from open() still propagates.
+    """
+    hashes = {}
+    with open(path, encoding="utf-8", errors="replace") as f:
+        for line in f:
+            m = LINE_RE.search(line)
+            if m:
+                # Upper-case so logs that print hex in different case
+                # still compare equal.
+                hashes[m.group("op").strip()] = m.group("hash").upper()
+    return hashes
+
+
+def main(argv):
+    """Compare two UBER logs op-by-op and report hash differences.
+
+    argv is sys.argv-style: [prog, reference-log, test-log]. Returns
+    the process exit code: 0 when every reference op has an identical
+    hash in the test log and the test log has no extra ops, 1 when
+    anything differs, 2 on a usage error, an unreadable file, or a log
+    that contains no UBER hash lines.
+    """
+    if len(argv) != 3:
+        sys.stderr.write(
+            "usage: diff-uber-hashes <reference-log> <test-log>\n"
+        )
+        return 2
+
+    try:
+        ref = parse_log(argv[1])
+        test = parse_log(argv[2])
+    except OSError as e:
+        sys.stderr.write(f"error: {e}\n")
+        return 2
+
+    # An empty result means the file was readable but had nothing to
+    # compare; treat it as an input error rather than a pass.
+    if len(ref) == 0:
+        sys.stderr.write(f"error: no UBER hash lines found in {argv[1]}\n")
+        return 2
+    if len(test) == 0:
+        sys.stderr.write(f"error: no UBER hash lines found in {argv[2]}\n")
+        return 2
+
+    # Walk the reference ops in log order; a missing op counts as a
+    # mismatch.
+    matches = 0
+    mismatches = 0
+    for op in ref:
+        ref_hash = ref[op]
+        if op not in test:
+            print(f"  MISSING in test: {op}  (ref={ref_hash})")
+            mismatches += 1
+            continue
+        test_hash = test[op]
+        if test_hash == ref_hash:
+            matches += 1
+            continue
+        print(f"  MISMATCH {op}: ref={ref_hash}  test={test_hash}")
+        mismatches += 1
+
+    # Ops that only the test log knows about, in test-log order.
+    extras = []
+    for op in test:
+        if op in ref:
+            continue
+        extras.append(op)
+        print(f"  EXTRA in test: {op}  (test={test[op]})")
+
+    total = len(ref) + len(extras)
+    print()
+    if mismatches or extras:
+        print(f"FAIL: {matches} match, {mismatches} mismatch, {len(extras)} extras")
+        return 1
+    print(f"OK: {matches}/{total} ops match")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
--- a/tools/diff-uber-perf
+++ b/tools/diff-uber-perf
@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Compare two UBER joeylog.txt files by per-op ops/sec.
+
+Sibling of diff-uber-hashes (which compares pixel correctness). This
+tool drives Phase 10 of project_planar_68k_plan.md: pick the
+biggest perf gaps vs the IIgs reference and target asm/algorithmic
+optimization at those.
+
+Usage:
+    tools/diff-uber-perf <reference-log> <test-log> [--threshold 1.0]
+
+Output is sorted by speed ratio (test/ref) ascending, so the worst
+gaps print first. Ops missing from either log are flagged. The
+threshold flag (default 1.0) marks ops below that ratio as FAIL --
+project_perf_directive.md says "IIgs is the perf floor; every
+other target must match or beat it", so parity = 1.0x. Use
+--threshold 0.8 for the project_planar_68k_plan looser acceptance.
+
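+Exit code:
+    0 = every reference op is present in the test log at >= threshold
+    1 = at least one op below threshold, or present in the reference
+        log but missing from the test log (ops found only in the test
+        log are listed as EXTRA and do not fail the run)
+    2 = usage error, bad threshold value, unreadable file, or a log
+        with no UBER lines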
+"""
+
+import re
+import sys
+
+# Match e.g.:
+#   UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4
+#
+# Captures two named groups: "op" (everything between "UBER:" and the
+# next colon, e.g. "drawCircle r=80") and "ops" (the integer before
+# "ops/sec"). The iteration and frame counts are matched but not
+# captured, and the pattern stops at "ops/sec", so anything after it
+# (such as the hash field) is not examined.
+LINE_RE = re.compile(
+    r"UBER:\s+(?P<op>[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+(?P<ops>\d+)\s+ops/sec"
+)
+
+
+def parse_log(path):
+    """Return a dict {op_name: ops_per_sec} parsed from a UBER log file.
+
+    Lines that do not match LINE_RE are ignored. Op names are stripped
+    of surrounding whitespace and the ops/sec figure is converted to
+    int. Keys keep the order in which each op was first seen.
+
+    Multiple runs may be concatenated (joeyLog appends); last value
+    for each op wins, matching the most recent run.
+
+    Raises OSError if the file cannot be opened or read.
+    """
+    perf = {}
+    # Decode leniently: a stray undecodable byte would otherwise raise
+    # UnicodeDecodeError, which is not an OSError and so would escape
+    # main()'s error handling as a traceback.
+    with open(path, encoding="utf-8", errors="replace") as f:
+        for line in f:
+            m = LINE_RE.search(line)
+            if m:
+                perf[m.group("op").strip()] = int(m.group("ops"))
+    return perf
+
+
+def main(argv):
+    """Compare per-op ops/sec of a test log against a reference log.
+
+    argv follows the sys.argv layout: argv[0] is the program name,
+    followed by exactly two positional paths (reference log, test log).
+    An optional "--threshold VALUE" or "--threshold=VALUE" may appear
+    anywhere among them; if given more than once the last one wins.
+    The default threshold is 1.0.
+
+    Prints a table with one row per op: ops missing from the test log
+    first, then ops below the threshold, then passing ops (each group
+    ordered by ascending test/ref ratio), and finally ops that appear
+    only in the test log. A summary line follows.
+
+    Returns the process exit status:
+        0 = no reference op is missing or below the threshold
+        1 = at least one reference op is missing or below the threshold
+        2 = usage error, bad threshold, unreadable file, or a log with
+            no UBER lines
+    """
+    import math
+
+    threshold = 1.0
+    args = []
+    i = 1
+    while i < len(argv):
+        arg = argv[i]
+        if arg == "--threshold" and i + 1 < len(argv):
+            raw = argv[i + 1]
+            i += 2
+        elif arg.startswith("--threshold="):
+            raw = arg[len("--threshold="):]
+            i += 1
+        else:
+            # Positional argument. A trailing "--threshold" with no
+            # value also lands here and trips the usage check below.
+            args.append(arg)
+            i += 1
+            continue
+        try:
+            threshold = float(raw)
+            # float() accepts "nan", but every "ratio >= nan" test is
+            # False, which would mark all ops as FAIL.
+            if math.isnan(threshold):
+                raise ValueError(raw)
+        except ValueError:
+            sys.stderr.write(f"error: bad threshold {raw}\n")
+            return 2
+
+    if len(args) != 2:
+        sys.stderr.write(
+            "usage: diff-uber-perf <reference-log> <test-log> [--threshold 1.0]\n"
+        )
+        return 2
+
+    try:
+        ref = parse_log(args[0])
+        test = parse_log(args[1])
+    except OSError as e:
+        sys.stderr.write(f"error: {e}\n")
+        return 2
+
+    if not ref:
+        sys.stderr.write(f"error: no UBER lines found in {args[0]}\n")
+        return 2
+    if not test:
+        sys.stderr.write(f"error: no UBER lines found in {args[1]}\n")
+        return 2
+
+    # Each row is (op, ref ops/sec, test ops/sec, ratio, status).
+    rows = []
+    for op, ref_ops in ref.items():
+        test_ops = test.get(op)
+        if test_ops is None:
+            rows.append((op, ref_ops, None, None, "MISSING"))
+            continue
+        if ref_ops == 0:
+            # Avoid dividing by zero: any progress against a zero
+            # reference counts as infinitely faster, 0 vs 0 as parity.
+            ratio = float("inf") if test_ops > 0 else 1.0
+        else:
+            ratio = test_ops / ref_ops
+        status = "ok" if ratio >= threshold else "FAIL"
+        rows.append((op, ref_ops, test_ops, ratio, status))
+
+    # Ops only in the test log are kept apart from rows: they are
+    # printed last and never affect the exit status.
+    extras = [(op, None, test[op], None, "EXTRA") for op in test if op not in ref]
+
+    # Sort: missing first, then fail by worst ratio, then ok ascending
+    # by ratio; the op name breaks ties.
+    def sort_key(row):
+        op, _refv, _testv, ratio, status = row
+        if status == "MISSING":
+            return (0, 0.0, op)
+        return (1 if status == "FAIL" else 2, ratio, op)
+
+    rows.sort(key=sort_key)
+
+    # Both dicts are non-empty here, so the op column can be sized
+    # directly from the longest name in either log.
+    op_w = max(len(name) for name in [*ref, *test, "op"])
+
+    print(f"{'op':<{op_w}}  {'ref':>10}  {'test':>10}  {'ratio':>7}  status")
+    print(f"{'-'*op_w}  {'-'*10}  {'-'*10}  {'-'*7}  ------")
+    fails = 0
+    for op, refv, testv, ratio, status in rows + extras:
+        refs = "" if refv is None else str(refv)
+        tests = "" if testv is None else str(testv)
+        rats = "" if ratio is None else f"{ratio:.2f}x"
+        print(f"{op:<{op_w}}  {refs:>10}  {tests:>10}  {rats:>7}  {status}")
+        if status in ("FAIL", "MISSING"):
+            fails += 1
+
+    print()
+    print(f"threshold: {threshold:.2f}x  ({len(rows)} ops compared, {fails} below threshold)")
+    return 1 if fails > 0 else 0
+
+
+# Script entry point: main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv))