From b1e24b4650c8cce2787df6f1426f0a8204c6a561 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Sun, 3 May 2026 01:44:39 -0500 Subject: [PATCH] Amiga parity with IIgs! --- examples/audio/audio.c | 4 +- examples/joy/joy.c | 4 +- examples/keys/keys.c | 17 +- examples/sprite/sprite.c | 70 +- examples/uber/uber.c | 83 +- include/joey/debug.h | 1 + include/joey/present.h | 16 +- include/joey/sprite.h | 17 +- include/joey/surface.h | 9 + make/amiga.mk | 9 +- make/atarist.mk | 8 +- make/dos.mk | 8 +- make/iigs.mk | 64 +- src/codegen/spriteCompile.c | 136 ++- src/codegen/spriteEmit68k.c | 15 + src/codegen/spriteEmitIigs.c | 12 + src/codegen/spriteEmitPlanar68k.c | 505 ++++++++ src/codegen/spriteEmitX86.c | 12 + src/codegen/spriteEmitter.h | 15 + src/core/debug.c | 70 +- src/core/draw.c | 100 +- src/core/hal.h | 172 ++- src/core/present.c | 48 +- src/core/sprite.c | 82 +- src/core/spriteInternal.h | 10 + src/core/surface.c | 63 +- src/core/surfaceInternal.h | 30 + src/core/tile.c | 12 +- src/port/amiga/circle.s | 270 +++++ src/port/amiga/hal.c | 1857 ++++++++++++++++++++++++++--- src/port/atarist/hal.c | 174 ++- src/port/dos/hal.c | 169 ++- src/port/iigs/hal.c | 201 +++- src/port/iigs/peislam.asm | 67 +- src/shared68k/surface68k.s | 250 ++++ tools/diff-uber-hashes | 93 ++ tools/diff-uber-perf | 132 ++ 37 files changed, 4312 insertions(+), 493 deletions(-) create mode 100644 src/codegen/spriteEmitPlanar68k.c create mode 100644 src/port/amiga/circle.s create mode 100755 tools/diff-uber-hashes create mode 100755 tools/diff-uber-perf diff --git a/examples/audio/audio.c b/examples/audio/audio.c index 90ac866..163c14c 100644 --- a/examples/audio/audio.c +++ b/examples/audio/audio.c @@ -171,11 +171,11 @@ int main(void) { if (flashFrames > 0) { fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_BAR); - stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H); + stagePresent(); flashFrames--; if (flashFrames == 0) { fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_HINT); - 
stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H); + stagePresent(); } } } diff --git a/examples/joy/joy.c b/examples/joy/joy.c index 2683706..24de236 100644 --- a/examples/joy/joy.c +++ b/examples/joy/joy.c @@ -80,8 +80,10 @@ static void buildPalette(SurfaceT *screen) { static void drawAndPresent(SurfaceT *screen, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t color) { + /* fillRect marks the rect dirty; stagePresent flushes only that + * dirty band. */ fillRect(screen, x, y, (uint16_t)w, (uint16_t)h, color); - stagePresentRect(x, y, (uint16_t)w, (uint16_t)h); + stagePresent(); } diff --git a/examples/keys/keys.c b/examples/keys/keys.c index 31bbfef..253977b 100644 --- a/examples/keys/keys.c +++ b/examples/keys/keys.c @@ -158,8 +158,6 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur int16_t row; JoeyKeyE key; bool lit; - int16_t x; - int16_t y; for (row = 0; row < GRID_ROWS; row++) { for (col = 0; col < GRID_COLS; col++) { @@ -171,10 +169,10 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur if (lit == gCellLit[row][col]) { continue; } + /* drawCell marks the cell's rect dirty; stagePresent + * flushes that one band. */ drawCell(screen, col, row, lit); - x = (int16_t)(MARGIN_X + col * (CELL_W + GAP)); - y = (int16_t)(MARGIN_Y + row * (CELL_H + GAP)); - stagePresentRect(x, y, CELL_W, CELL_H); + stagePresent(); gCellLit[row][col] = lit; } } @@ -195,19 +193,16 @@ static void updateCursor(SurfaceT *screen, int16_t cursorCol, int16_t cursorRow) if (gLastCursorX != mouseX || gLastCursorY != mouseY) { if (gLastCursorCol != CELL_NONE) { drawCell(screen, gLastCursorCol, gLastCursorRow, gCellLit[gLastCursorRow][gLastCursorCol]); - stagePresentRect( - (int16_t)(MARGIN_X + gLastCursorCol * (CELL_W + GAP)), - (int16_t)(MARGIN_Y + gLastCursorRow * (CELL_H + GAP)), - CELL_W, CELL_H); } else if (gLastCursorX >= 0 && gLastCursorY >= 0) { // Old cursor was in a gap region. Stamp background over it. 
fillRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H, COLOR_BACKGROUND); - stagePresentRect(gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H); } } drawCursor(screen, mouseX, mouseY); - stagePresentRect(mouseX, mouseY, CURSOR_W, CURSOR_H); + /* All draw calls above marked their rects dirty; one stagePresent + * flushes the union (cursor erase + cursor draw). */ + stagePresent(); gLastCursorX = mouseX; gLastCursorY = mouseY; diff --git a/examples/sprite/sprite.c b/examples/sprite/sprite.c index 3c7509b..49e5a96 100644 --- a/examples/sprite/sprite.c +++ b/examples/sprite/sprite.c @@ -15,11 +15,11 @@ #define BALL_TILES_Y (BALL_H / 8) #define BALL_TILE_BYTES (BALL_TILES_X * BALL_TILES_Y * TILE_BYTES) -// SaveUnder must store rounded-up byte boundaries: x rounded down to -// even, width rounded up to even. Worst case for BALL_W=16 (already -// even) is 8 bytes per row + alignment slack of 1 byte; size for the -// pessimistic case so the buffer never overflows. -#define BALL_BACKUP_BYTES (((BALL_W + 2) >> 1) * BALL_H) +// SaveUnder rounds x down to the platform's storage alignment: 2 px +// for chunky 4bpp (1 extra byte/row worst case), 8 px for planar +// 4-plane (4 extra bytes/row worst case -- one per plane). The +4 +// covers the planar case and is a no-op overhead on chunky. +#define BALL_BACKUP_BYTES (((BALL_W >> 1) + 4) * BALL_H) #define BALL_PALETTE_IDX 0 @@ -100,18 +100,14 @@ int main(void) { int16_t y; int16_t vx; int16_t vy; - int16_t oldX; - int16_t oldY; - uint16_t oldW; - uint16_t oldH; - int16_t unionX; - int16_t unionY; - int16_t unionRight; - int16_t unionBottom; bool haveBackup; config.hostMode = HOST_MODE_TAKEOVER; - config.codegenBytes = 8 * 1024; + /* Amiga planar emits 8 pre-shifted DRAW variants per sprite (one + * per x % 8 alignment) so the codegen arena needs roughly 8x what + * the chunky two-shift case asks for. 32 KB fits a 16x16 ball + * with all variants. 
*/ + config.codegenBytes = 32UL * 1024; config.maxSurfaces = 4; config.audioBytes = 64UL * 1024; config.assetBytes = 128UL * 1024; @@ -155,7 +151,7 @@ int main(void) { haveBackup = false; spriteSaveAndDraw(screen, ball, x, y, &backup); - stagePresentRect(backup.x, backup.y, backup.width, backup.height); + stagePresent(); haveBackup = true; for (;;) { @@ -164,19 +160,15 @@ int main(void) { break; } - // Stash the prior ball's region before restoring the bytes - // under it. Do all off-screen work (restore + move + draw) - // first, then waitVBL + ONE stagePresentRect covering both - // old and new regions. Putting waitVBL immediately before the - // present lets the present land inside the VBL window so the - // CRT never sees a half-updated framebuffer (matters most on - // single-buffered chunky targets like IIgs SHR; on planar - // c2p platforms it also avoids c2p racing the raster). - oldX = backup.x; - oldY = backup.y; - oldW = backup.width; - oldH = backup.height; - + // Do all off-screen work first (restore + move + draw), then + // ONE stagePresent flushes the union of dirty bands set by + // restoreUnder + draw. Add a joeyWaitVBL() before the present + // to land it inside the VBL window so the CRT never sees a + // half-updated framebuffer (matters most on single-buffered + // chunky targets like IIgs SHR; on planar c2p platforms it + // also avoids c2p racing the raster). VBL wait is omitted + // here so the demo runs at the sprite pipeline's native + // throughput -- expect tearing on the ball. if (haveBackup) { spriteRestoreUnder(screen, &backup); } @@ -190,27 +182,7 @@ int main(void) { spriteSaveAndDraw(screen, ball, x, y, &backup); - // Bounding box of (old rect) U (new rect). For typical - // small-step motion the rects overlap heavily so the union - // is barely larger than one ball. - unionX = (oldX < backup.x) ? oldX : backup.x; - unionY = (oldY < backup.y) ? oldY : backup.y; - unionRight = (int16_t)((oldX + oldW > backup.x + backup.width) - ? 
(oldX + oldW) - : (backup.x + backup.width)); - unionBottom = (int16_t)((oldY + oldH > backup.y + backup.height) - ? (oldY + oldH) - : (backup.y + backup.height)); - - // VBL wait removed -- the demo runs at the native compute speed - // of save+restore+draw+presentRect so we can SEE the sprite - // pipeline's actual throughput. Expect tearing on the ball - // since the present can land mid-scan; that's the cost of - // showing real frame rate. Add joeyWaitVBL() back here for - // tear-free 60 Hz motion. - stagePresentRect(unionX, unionY, - (uint16_t)(unionRight - unionX), - (uint16_t)(unionBottom - unionY)); + stagePresent(); haveBackup = true; } diff --git a/examples/uber/uber.c b/examples/uber/uber.c index 5538b48..eb9e243 100644 --- a/examples/uber/uber.c +++ b/examples/uber/uber.c @@ -28,7 +28,16 @@ // 4-frame measurement window. Long enough that loop overhead doesn't // dominate; short enough to keep the full demo run under ~10 sec. -#define UBER_FRAMES 4u +/* 16 frames per timed op gives 4x the iter-count resolution of the + * earlier 4-frame budget. Exposes the actual per-op cost on slow + * ops where 4 frames produced the same iter count on different + * framerates -- e.g. drawCircle r=80 read as "4 iters / 4 frames" + * on both 60 Hz IIgs (16.7 ms/frame, 67 ms window) and 50 Hz Amiga + * (20 ms/frame, 80 ms window) even though per-op cost was equal, + * just because 4 ops at 16-17 ms happen to fit both windows. The + * 16-frame budget extends the windows to 267 ms / 320 ms; quantum + * gap shrinks to ~6%. Total run time scales 4x (~80 sec each). */ +#define UBER_FRAMES 16u typedef void (*OpFn)(void); @@ -44,9 +53,10 @@ static TileT gTileScratch; // Run `op` in a tight loop until `targetFrames` joeyFrameCount ticks // have elapsed. Returns iterations completed. 
-static unsigned long runForFrames(OpFn op, unsigned int targetFrames) { +static unsigned long runForFrames(OpFn op, unsigned int targetFrames, uint16_t *actualFramesOut) { unsigned long count; uint16_t startFrame; + uint16_t endFrame; count = 0UL; @@ -57,29 +67,50 @@ static unsigned long runForFrames(OpFn op, unsigned int targetFrames) { op(); count++; } + /* Capture the actual elapsed frames -- the last iter typically + * overruns the target. Using actual instead of target as the + * ops/sec divisor stays honest for ops slower than 1 frame + * (where count is forced low while real time stretches well + * past targetFrames). */ + endFrame = joeyFrameCount(); + *actualFramesOut = (uint16_t)(endFrame - startFrame); + if (*actualFramesOut == 0u) { + *actualFramesOut = 1u; /* defensive: avoid div-by-zero */ + } return count; } // Time and log one op. Reports iters / N frames AND the derived // ops/sec so per-port results are directly comparable against IIgs -// regardless of CPU speed or display refresh rate. +// regardless of CPU speed or display refresh rate. Also logs an +// FNV-1a hash of the surface state after timing -- this is the +// pixel-perfect comparison input for the cross-port validation +// harness (tools/diff-uber-hashes.py). Captured against IIgs as the +// golden reference; planar 68k rewrites validate by matching it. static void timeOp(const char *name, OpFn op) { unsigned long iters; unsigned long opsPerSec; + uint16_t actualFrames; + uint32_t hash; gCurName = name; - iters = runForFrames(op, UBER_FRAMES); + iters = runForFrames(op, UBER_FRAMES, &actualFrames); if (iters == 0UL) { joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name); return; } - opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES; - joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n", - name, iters, UBER_FRAMES, opsPerSec); + /* Divide by ACTUAL elapsed frames, not the target. 
For sub-frame + * ops actualFrames ~= UBER_FRAMES so the answer is unchanged; + * for ops that overrun (slow stagePresent etc.), this stops + * inflating ops/sec. */ + opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)actualFrames; + hash = surfaceHash(gStage); + joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec | hash=%08lX\n", + name, iters, actualFrames, opsPerSec, (unsigned long)hash); } @@ -125,8 +156,6 @@ static void op_spriteRestore (void) { spriteRestoreUnder(gStage, &gBackup); static void op_spriteSaveAndDraw (void) { spriteSaveAndDraw (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); } static void op_stagePresent (void) { stagePresent(); } -static void op_stagePresentRect8(void) { stagePresentRect( 40, 30, 16, 16); } -static void op_stagePresentRectF(void) { stagePresentRect( 0, 0, 320, 200); } static void op_inputPoll (void) { joeyInputPoll(); } static void op_keyDown (void) { (void)joeyKeyDown(KEY_A); } @@ -229,10 +258,14 @@ static void runAllTests(void) { timeOp("spriteRestoreUnder", op_spriteRestore); timeOp("spriteSaveAndDraw", op_spriteSaveAndDraw); - // Present. + // Present. One warm-up call before each timed loop primes any + // per-port one-time setup (Amiga: copper list rebuild after the + // paletteSet / scbSetRange tests dirty the cache; without warm-up + // the rebuild's MakeScreen + MrgCop + WaitTOF chain consumes the + // entire 4-frame measurement window) so we measure steady-state + // throughput rather than first-call penalty. + stagePresent(); timeOp("stagePresent full", op_stagePresent); - timeOp("stagePresentRect 8b",op_stagePresentRect8); - timeOp("stagePresentRect F", op_stagePresentRectF); // Input. 
timeOp("joeyInputPoll", op_inputPoll); @@ -253,12 +286,19 @@ static void runAllTests(void) { int main(void) { - JoeyConfigT config; - uint16_t pal[16]; - int i; + JoeyConfigT config; + uint16_t pal[16]; + int i; + uint16_t startFrame; + uint16_t endFrame; + uint16_t elapsedFrames; + unsigned long elapsedMs; config.hostMode = HOST_MODE_TAKEOVER; - config.codegenBytes = 8 * 1024; + /* 32 KB fits the 8 pre-shifted DRAW variants the Amiga planar + * compiled sprite emitter generates. UL on the multiply because + * ORCA-C's 16-bit int overflows on 32 * 1024. */ + config.codegenBytes = 32UL * 1024; config.maxSurfaces = 4; config.audioBytes = 64UL * 1024; config.assetBytes = 128UL * 1024; @@ -266,6 +306,11 @@ int main(void) { if (!joeyInit(&config)) { return 1; } + /* joeyFrameCount is VBL-driven, so it only ticks after halInit + * installed its VBL ISR -- captured here is "everything from now + * to press-any-key". Pre-init setup time is small and not the + * cost the user is chasing; runAllTests dominates. */ + startFrame = joeyFrameCount(); gStage = stageGet(); if (gStage == NULL) { @@ -337,6 +382,12 @@ int main(void) { runAllTests(); + endFrame = joeyFrameCount(); + elapsedFrames = (uint16_t)(endFrame - startFrame); + elapsedMs = ((unsigned long)elapsedFrames * 1000UL) / (unsigned long)joeyFrameHz(); + joeyLogF("UBER: total wall time: %lu ms (%u frames @ %u Hz)\n", + elapsedMs, elapsedFrames, (unsigned)joeyFrameHz()); + // Done. Green screen + waitForKey. 
surfaceClear(gStage, 2); stagePresent(); diff --git a/include/joey/debug.h b/include/joey/debug.h index 161fdef..c446530 100644 --- a/include/joey/debug.h +++ b/include/joey/debug.h @@ -5,6 +5,7 @@ void joeyLog (const char *msg); void joeyLogF (const char *fmt, ...); +void joeyLogFlush(void); void joeyLogReset(void); #endif diff --git a/include/joey/present.h b/include/joey/present.h index 521baf7..cdc4abc 100644 --- a/include/joey/present.h +++ b/include/joey/present.h @@ -15,14 +15,14 @@ #include "types.h" // Flip the dirty regions of the stage to the display, then clear the -// dirty state. Cheap when nothing has changed since the last call. +// dirty state. Cheap when nothing has changed since the last call +// (gStageAnyDirty short-circuit). Drawing primitives mark dirty as +// a side effect, so callers only need to call stagePresent at the +// end of a frame -- everything they drew shows up. +// +// To present a region you didn't draw with the standard primitives +// (e.g. direct framebuffer poking), call surfaceMarkDirtyRect on +// the same rect first, then stagePresent. void stagePresent(void); -// Flip a specific rectangular region of the stage to the display, -// regardless of dirty state. Coordinates are clipped to the surface; -// negative or zero dimensions are no-ops. Does not consult or modify -// the dirty arrays -- callers mixing stagePresentRect with stagePresent -// in the same frame may see redundant work on the next stagePresent. -void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h); - #endif diff --git a/include/joey/sprite.h b/include/joey/sprite.h index 0a34ee8..c66474c 100644 --- a/include/joey/sprite.h +++ b/include/joey/sprite.h @@ -27,13 +27,16 @@ #include "surface.h" #include "types.h" -// Sprites always write to a 4bpp packed SurfaceT, never to display -// memory directly (halPresent owns that path). 
The codegen emits 2 -// shift variants on every platform: shift 0 for even x (sprite byte -// boundaries match destination byte boundaries) and shift 1 for odd -// x (each destination byte combines two adjacent sprite bytes' -// nibbles). -#define JOEY_SPRITE_SHIFT_COUNT 2 +// Sprite codegen emits per-shift variants. Chunky 4bpp ports (DOS, +// IIgs, Atari ST) only need 2 shifts -- pixel offset 0 (sprite/dest +// byte boundaries align) and offset 1 (every dest byte combines two +// sprite bytes' nibbles). Planar ports (Amiga -- 8 px per plane byte) +// need 8 shifts: one for each x % 8 alignment, so smooth horizontal +// motion at any pixel position uses pre-shifted source bytes without +// runtime bit-shifting. Allocate the max so routineOffsets[] has +// slots for every variant; chunky ports leave shifts 2..7 as +// SPRITE_NOT_COMPILED, planar ports use all 8. +#define JOEY_SPRITE_SHIFT_COUNT 8 typedef enum { SPRITE_FLAGS_NONE = 0 diff --git a/include/joey/surface.h b/include/joey/surface.h index 6f6e9d9..dcab6b9 100644 --- a/include/joey/surface.h +++ b/include/joey/surface.h @@ -58,4 +58,13 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path); // identity (no reallocation). bool surfaceLoadFile(SurfaceT *dst, const char *path); +// FNV-1a 32-bit hash of the surface's logical pixel content (color +// indices in row-major order, 0..15 per pixel). Same logical pixels +// produce the same hash on every port regardless of internal storage +// format -- so a hash captured on IIgs (chunky) compares directly +// against the same op's output on Amiga (planar) once the planar +// rewrite is done. Used by the UBER validation harness to +// pixel-compare ports against an IIgs golden reference. +uint32_t surfaceHash(const SurfaceT *s); + #endif diff --git a/make/amiga.mk b/make/amiga.mk index 63741b4..ecd08eb 100644 --- a/make/amiga.mk +++ b/make/amiga.mk @@ -13,7 +13,7 @@ BINDIR := $(BUILD)/bin # independently. 
-I on $(SRC_PORT)/amiga lets ptplayer.h resolve # from the port-local shim alongside our HAL code. PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) -MMD -MP $(CFLAGS_EXTRA) # OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses # CIA-B + audio.device interrupts via the OS rather than taking over # Paula directly), matching the way our HAL cooperates with Intuition. @@ -52,6 +52,7 @@ LIB_OBJS := \ $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \ $(BUILD)/obj/port/ptplayer.o \ $(BUILD)/obj/codegen/spriteEmit68k.o \ + $(BUILD)/obj/codegen/spriteEmitPlanar68k.o \ $(BUILD)/obj/codegen/spriteCompile.o LIB := $(LIBDIR)/libjoey.a @@ -156,3 +157,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx clean-amiga: rm -rf $(BUILD) + +# Pull in per-object header-dependency files generated by gcc -MMD/-MP. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# the .c files that include it, leaving a frankenstein binary where +# different TUs see different struct layouts. 
+-include $(LIB_OBJS:.o=.d) diff --git a/make/atarist.mk b/make/atarist.mk index c8d7536..38bc119 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM) LIBDIR := $(BUILD)/lib BINDIR := $(BUILD)/bin -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) -MMD -MP LDFLAGS := # libxmp-lite shared with the DOS port. Built as a static archive that @@ -148,3 +148,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx clean-atarist: rm -rf $(BUILD) + +# Pull in per-object header-dependency files generated by gcc -MMD/-MP. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# the .c files that include it, leaving a frankenstein binary where +# different TUs see different struct layouts. +-include $(LIB_OBJS:.o=.d) diff --git a/make/dos.mk b/make/dos.mk index ee960ec..15ac57a 100644 --- a/make/dos.mk +++ b/make/dos.mk @@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM) LIBDIR := $(BUILD)/lib BINDIR := $(BUILD)/bin -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -MMD -MP ASFLAGS := -f coff LDFLAGS := @@ -138,3 +138,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx clean-dos: rm -rf $(BUILD) + +# Pull in per-object header-dependency files generated by gcc -MMD/-MP. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# the .c files that include it, leaving a frankenstein binary where +# different TUs see different struct layouts. 
+-include $(LIB_OBJS:.o=.d) diff --git a/make/iigs.mk b/make/iigs.mk index a049d5c..d895384 100644 --- a/make/iigs.mk +++ b/make/iigs.mk @@ -51,11 +51,11 @@ IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32 LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS) -# HELLO and PATTERN are intentionally omitted from this list. The UBER -# demo (below) exercises every public API, including what those two -# small examples covered, and the IIgs disk image was running out of -# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/ -# for reference and for other ports that want them. +# HELLO is omitted from the disk because UBER exercises everything it +# does and the disk was tight. PATTERN is included as the SCB / palette +# golden-reference for cross-port debugging. +PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c +PATTERN_BIN := $(BINDIR)/PATTERN DRAW_SRC := $(EXAMPLES)/draw/draw.c DRAW_BIN := $(BINDIR)/DRAW KEYS_SRC := $(EXAMPLES)/keys/keys.c @@ -120,24 +120,44 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh # everywhere, so library asm can take SurfaceT* args via one # consistent ABI (small-mm 16-bit pointers truncated bank bytes, # which broke any asm that wanted to address bank-1 stage memory). +# Per-binary header dependency files. iix-build.sh -M emits one .d +# alongside each binary covering every header transitively included +# by the C sources in that binary's build. Pulled in via -include at +# the bottom of this file so editing a shared header (e.g. +# surfaceInternal.h) triggers a rebuild of every IIgs binary that +# transitively depends on it. 
+DEP_DIR := $(BUILD)/dep +PATTERN_DEP := $(DEP_DIR)/PATTERN.d +DRAW_DEP := $(DEP_DIR)/DRAW.d +KEYS_DEP := $(DEP_DIR)/KEYS.d +JOY_DEP := $(DEP_DIR)/JOY.d +SPRITE_DEP := $(DEP_DIR)/SPRITE.d +UBER_DEP := $(DEP_DIR)/UBER.d +AUDIO_DEP := $(DEP_DIR)/AUDIO.d + +$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(PATTERN_DEP) $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS) + $(IIGS_IIX) chtyp -t S16 $@ + $(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(DRAW_DEP) $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ $(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(KEYS_DEP) $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ $(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(JOY_DEP) $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(SPRITE_DEP) $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # UBER bumps user stack to 16 KB. ORCA-C's default user stack is small @@ -147,8 +167,8 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) # decimal formatter in uber.c also uses larger stack-local buffers # (line[96], num[16]) than typical demos. 16 KB is plenty of headroom. 
$(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -s 16384 -M $(UBER_DEP) $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime @@ -170,17 +190,23 @@ AUDIO_DATA_FILES := $(AUDIO_SFX) endif $(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(AUDIO_DEP) $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # Assemble a ProDOS 2img containing the examples, ready to mount in # GSplus alongside a GS/OS boot volume. iigs-disk: $(DISK_IMG) -$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) +$(DISK_IMG): $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) @mkdir -p $(dir $@) - $(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES) + $(IIGS_PACKAGE) $@ $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES) clean-iigs: rm -rf $(BUILD) + +# Pull in per-binary header-dependency files generated by iix-build.sh -M. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# IIgs binaries that include it -- the IIgs's iix toolchain has no native +# -MMD analog, so iix-build.sh shells out to host gcc for the scan. 
+-include $(PATTERN_DEP) $(DRAW_DEP) $(KEYS_DEP) $(JOY_DEP) $(SPRITE_DEP) $(UBER_DEP) $(AUDIO_DEP) diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index a01c59f..750283f 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -14,6 +14,7 @@ #include "joey/sprite.h" #include "joey/surface.h" #include "codegenArenaInternal.h" +#include "hal.h" #include "spriteEmitter.h" #include "spriteInternal.h" #include "surfaceInternal.h" @@ -33,7 +34,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitDrawX86(out, sp, shift); -#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) +#elif defined(JOEYLIB_PLATFORM_AMIGA) + return spriteEmitDrawPlanar68k(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitDraw68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitDrawIigs(out, sp, shift); @@ -51,7 +54,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitSaveX86(out, sp, shift); -#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) +#elif defined(JOEYLIB_PLATFORM_AMIGA) + return spriteEmitSavePlanar68k(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitSave68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitSaveIigs(out, sp, shift); @@ -65,7 +70,9 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitRestoreX86(out, sp, shift); -#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) +#elif defined(JOEYLIB_PLATFORM_AMIGA) + return spriteEmitRestorePlanar68k(out, sp, shift); 
+#elif defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitRestore68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitRestoreIigs(out, sp, shift); @@ -114,6 +121,13 @@ bool spriteCompile(SpriteT *sp) { if (sp->tileData == NULL) { return false; } + /* Amiga (post-Phase 9) uses spriteEmitPlanar68k.c which writes + * directly to bitplanes. DRAW emits a unique pre-shifted variant + * per shift in 0..7 (smooth horizontal motion at any pixel x); + * SAVE/RESTORE emit only shift 0 and shift 1 since shifted variants + * 1..7 share identical bytes (plain memcpy of widthTiles+1 plane + * bytes per row). The post-emit pass below aliases slots 2..7 + * for save/restore to slot 1's bytes. */ scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES); if (scratch == NULL) { @@ -150,6 +164,16 @@ bool spriteCompile(SpriteT *sp) { } } } +#if defined(JOEYLIB_PLATFORM_AMIGA) + /* Save/restore bytes for any non-zero shift are identical (plain + * memcpy of widthTiles+1 plane bytes per row). The emitter emits + * them once at slot 1; alias slots 2..7 here so the dispatcher + * gate (sprite.c) sees them as compiled. */ + for (shift = 2; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { + sp->routineOffsets[shift][SPRITE_OP_SAVE] = sp->routineOffsets[1][SPRITE_OP_SAVE]; + sp->routineOffsets[shift][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE]; + } +#endif sp->slot = slot; free(scratch); return true; @@ -554,6 +578,112 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { } } +#elif defined(JOEYLIB_PLATFORM_AMIGA) + +/* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with + * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to + * bitplanes. Compute byteOff = y*40 + x/8 and pass plane[i]+byteOff + * as the 4 plane args. shift = x % 8 selects the variant; today only + * shift 0 emits non-zero bytes, so callers should already have + * gated on routineOffsets[shift][op] != SPRITE_NOT_COMPILED. 
+ * + * For non-zero shifts (x not 8-px-aligned), the dispatcher in + * src/core/sprite.c (spriteDraw / spriteSaveUnder / spriteRestoreUnder) + * sees SPRITE_NOT_COMPILED for the shift and falls back to the + * interpreter, which handles arbitrary x via halSpriteDrawPlanes / + * halSpriteSavePlanes / halSpriteRestorePlanes. */ + +#define AMIGA_BYTES_PER_ROW_LOCAL 40 + +void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { + typedef void (*DrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3); + uint8_t shift; + uint16_t byteOff; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + DrawFn fn; + + shift = (uint8_t)(x & 7); + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)x >> 3)); + p0 = halSurfacePlanePtr(dst, 0); if (p0 == NULL) return; + p1 = halSurfacePlanePtr(dst, 1); + p2 = halSurfacePlanePtr(dst, 2); + p3 = halSurfacePlanePtr(dst, 3); + fn = (DrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_DRAW]); + fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff); +} + + +void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { + typedef void (*SaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup); + uint8_t shift; + int16_t clippedX; + uint16_t widthPx; + uint16_t heightPx; + uint16_t byteOff; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + SaveFn fn; + + shift = (uint8_t)(x & 7); + clippedX = (int16_t)(x & ~7); + widthPx = (uint16_t)(sp->widthTiles * 8); + heightPx = (uint16_t)(sp->heightTiles * 8); + /* Shifts 1..7 spill into one extra plane byte per row (= +8 px). 
*/ + if (shift != 0u) { + widthPx = (uint16_t)(widthPx + 8u); + } + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)clippedX >> 3)); + + backup->sprite = sp; + backup->x = clippedX; + backup->y = y; + backup->width = widthPx; + backup->height = heightPx; + /* 4 planes * h * (widthPx/8) bytes = h * widthPx/2. */ + backup->sizeBytes = (uint16_t)((uint16_t)heightPx * (widthPx >> 1)); + + p0 = halSurfacePlanePtr(src, 0); if (p0 == NULL) return; + p1 = halSurfacePlanePtr(src, 1); + p2 = halSurfacePlanePtr(src, 2); + p3 = halSurfacePlanePtr(src, 3); + fn = (SaveFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]); + fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); +} + + +void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { + typedef void (*RestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup); + SpriteT *sp; + uint8_t shift; + uint16_t byteOff; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + RestoreFn fn; + + sp = backup->sprite; + /* backup->x is 8-px aligned (clippedX from save), so x & 7 is + * useless for picking the original shift. Encode it via + * backup->width: == widthTiles*8 means shift 0; > means shifted. + * Shifted slots 1..7 all alias to the same restore bytes, so + * slot 1 stands in for any non-zero shift. */ + shift = (uint8_t)(backup->width > (uint16_t)(sp->widthTiles * 8) ? 
1u : 0u); + byteOff = (uint16_t)((uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)backup->x >> 3)); + + p0 = halSurfacePlanePtr(dst, 0); if (p0 == NULL) return; + p1 = halSurfacePlanePtr(dst, 1); + p2 = halSurfacePlanePtr(dst, 2); + p3 = halSurfacePlanePtr(dst, 3); + fn = (RestoreFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]); + fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); +} + #else void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { diff --git a/src/codegen/spriteEmit68k.c b/src/codegen/spriteEmit68k.c index b86851c..3afb2fa 100644 --- a/src/codegen/spriteEmit68k.c +++ b/src/codegen/spriteEmit68k.c @@ -166,6 +166,13 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint8_t value; uint8_t opaqueMask; + // Chunky 4bpp has only two nibble-alignment positions; the + // dispatcher uses x & 1 so shifts 2..7 are unreachable. Bail + // early so the arena slot stays SPRITE_NOT_COMPILED. + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); @@ -225,6 +232,10 @@ uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); @@ -248,6 +259,10 @@ uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 
1u : 0u)); diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c index 4975c1c..a960297 100644 --- a/src/codegen/spriteEmitIigs.c +++ b/src/codegen/spriteEmitIigs.c @@ -189,6 +189,10 @@ uint16_t spriteEmitSaveIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t spriteBytesPerRow; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); copyBytes = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); @@ -205,6 +209,10 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t spriteBytesPerRow; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); copyBytes = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); @@ -258,6 +266,10 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint8_t nextOpaqueMask; bool wide; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); diff --git a/src/codegen/spriteEmitPlanar68k.c b/src/codegen/spriteEmitPlanar68k.c new file mode 100644 index 0000000..d201e80 --- /dev/null +++ b/src/codegen/spriteEmitPlanar68k.c @@ -0,0 +1,505 @@ +// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow). +// +// Emits PIC routines that write directly to the four bitplanes via 4 +// address-register pointers (a0..a3 = plane[0..3] base + byteOff, +// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this). +// +// Calling convention (cdecl on m68k-amigaos-gcc): +// draw(p0, p1, p2, p3): +// args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane. +// loaded into a0..a3 by the prologue. +// save(p0, p1, p2, p3, backup): +// 5 args; backup at 20(sp), loaded into a4. 
+// restore(p0, p1, p2, p3, backup): +// same as save but reads backup, writes planes. +// +// Per-byte plane write encoding decisions: +// - all-transparent (mask=0): skip the byte entirely +// - all-opaque (mask=0xFF): move.b #imm, d16(an) (6 bytes) +// - mixed (0> 8) & 0xFFu); + out[1] = (uint8_t)(value & 0xFFu); + return 2u; +} + + +// movea.l , an -- load arg at SP+disp into An. +// Encoding: 0010 nnn 001 010 111 + disp16 +// = 0x2057 + (n << 9), where n is dst An. +// a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F. +static const uint16_t kMoveaSpToAn[] = { + 0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu +}; + + +// adda.w #imm, an -- adds 16-bit signed imm to An (sign-extended). +// Encoding: 1101 nnn 011 111 100 + imm +// = 0xD0FC + (n << 9). +static const uint16_t kAddaWImmToAn[] = { + 0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu +}; + + +// ANDI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half). +// Opcode: 0000 0010 00 000 000 (size=byte, mode=Dn, reg=D0) +#define ANDI_B_IMM_D0 0x0200u + +// ORI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half). +// Opcode: 0000 0000 00 000 000 +#define ORI_B_IMM_D0 0x0000u + + +// MOVE.B d16(An), D0 -- 4 bytes (opcode + disp). +// Encoding: 0001 000 000 mode reg +// = size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn), +// src mode=101 (d16,An), src reg=An. +// = 0001000 000 101 nnn = 0x1028 + An. +static const uint16_t kMoveBD16AnToD0[] = { + 0x1028u, 0x1029u, 0x102Au, 0x102Bu +}; + + +// MOVE.B D0, d16(An) -- 4 bytes (opcode + disp). +// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9). +static const uint16_t kMoveBD0ToD16An[] = { + 0x1140u, 0x1340u, 0x1540u, 0x1740u +}; + + +// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp). +// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9). +// (Was 0x113C earlier -- that's mode=100=predec; mode=101=d16(An) +// is the bit difference. 
Predec emits a 4-byte instruction with no +// disp word, so the byte stream went out of sync and every +// subsequent instruction decoded into garbage.) +static const uint16_t kMoveBImmToD16An[] = { + 0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu +}; + + +// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp). -- used by save/restore (backup in a4) +// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9). +static const uint16_t kMoveBA4PostincToD16An[] = { + 0x115Cu, 0x135Cu, 0x155Cu, 0x175Cu +}; + + +// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp). -- used by save (planes -> backup) +// Encoding: 1001 100 011 mode reg +// Wait, MOVE.B src,(a4)+ : dst mode = 011 (an+), dst reg = 100 (A4), +// so dst reg=100, dst mode=011 -> opcode high = 0001 100 011 ... +// = 0001100011 mode reg = 0x18C0.. +// 0001 100 011 101 nnn = 0x18E8 + An. +static const uint16_t kMoveBD16AnToA4Postinc[] = { + 0x18E8u, 0x18E9u, 0x18EAu, 0x18EBu +}; + + +// MOVEM.L reglist, -(SP) -- 4 bytes (opcode + reglist mask). +// Opcode 0x48E7. Predec mask is REVERSED vs all other modes: +// bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2, +// bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7. +#define MOVEM_L_PUSH_OPCODE 0x48E7u +#define MOVEM_L_MASK_A2_A3 0x0030u /* bits 5,4 = A2,A3 (predec order) */ +#define MOVEM_L_MASK_A2_A3_A4 0x0038u /* bits 5,4,3 = A2,A3,A4 */ + +// MOVEM.L (SP)+, reglist -- 4 bytes (opcode + reglist mask). +// Opcode 0x4CDF. Postinc mask follows the standard layout: +// bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7. +#define MOVEM_L_POP_OPCODE 0x4CDFu +#define MOVEM_L_MASK_POP_A2_A3 0x0C00u /* bits 11,10 = A3,A2 */ +#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u /* bits 12,11,10 = A4,A3,A2 */ + +// RTS opcode. +#define OPCODE_RTS 0x4E75u + + +// ----- Emit helpers ----- + +// For shift 0 (byte-aligned x), the sprite's chunky tile data converts +// directly to plane bytes without any sub-byte shifting. 
For each +// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky +// bytes (= 8 pixels) and produce one plane byte; we also produce a +// mask byte indicating which pixel positions are non-transparent +// (any plane bit != 0 in the source means non-transparent if +// transparent index is 0, the JoeyLib convention). +// +// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows +// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the +// sprite. For plane-byte column `c` of row `r`: +// tileX = c (since each plane byte covers exactly one tile column) +// tileY = r / 8 +// inTileY = r % 8 +// chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3 +// +// `col` must be in [0, widthTiles); callers handle out-of-range cols +// (used when computing shifted variants that span widthTiles+1 output +// bytes per row) by passing a sentinel and checking against widthTiles +// before invoking this helper. +static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col, + uint8_t *planeBytes /*[4]*/, uint8_t *maskByte) +{ + uint16_t tileX; + uint16_t tileY; + uint16_t inTileY; + const uint8_t *tile; + const uint8_t *chunky; + uint8_t nibbles[8]; + uint8_t b0, b1, b2, b3; + uint16_t p; + uint8_t bitMask; + uint8_t pix; + + tileX = col; + tileY = row >> 3; + inTileY = row & 7u; + + tile = sp->tileData + (uint32_t)((tileY * sp->widthTiles + tileX) * 32u); + chunky = tile + inTileY * 4u; + + nibbles[0] = (uint8_t)(chunky[0] >> 4); + nibbles[1] = (uint8_t)(chunky[0] & 0x0Fu); + nibbles[2] = (uint8_t)(chunky[1] >> 4); + nibbles[3] = (uint8_t)(chunky[1] & 0x0Fu); + nibbles[4] = (uint8_t)(chunky[2] >> 4); + nibbles[5] = (uint8_t)(chunky[2] & 0x0Fu); + nibbles[6] = (uint8_t)(chunky[3] >> 4); + nibbles[7] = (uint8_t)(chunky[3] & 0x0Fu); + + b0 = 0u; b1 = 0u; b2 = 0u; b3 = 0u; + *maskByte = 0u; + for (p = 0; p < 8u; p++) { + pix = nibbles[p]; + if (pix == TRANSPARENT_NIBBLE) { + continue; + } + bitMask = (uint8_t)(0x80u >> 
p); + *maskByte = (uint8_t)(*maskByte | bitMask); + if (pix & 1u) b0 = (uint8_t)(b0 | bitMask); + if (pix & 2u) b1 = (uint8_t)(b1 | bitMask); + if (pix & 4u) b2 = (uint8_t)(b2 | bitMask); + if (pix & 8u) b3 = (uint8_t)(b3 | bitMask); + } + planeBytes[0] = b0; + planeBytes[1] = b1; + planeBytes[2] = b2; + planeBytes[3] = b3; +} + + +// Shifted variant: produces 4 plane bytes and 1 mask byte for output +// column `outCol` (0..widthTiles inclusive) of row `row` when the +// sprite is shifted right by `shift` pixels (1..7). For shift 0, +// callers should use planeByteAndMaskAt directly (faster, no spill). +// +// Each output byte is composed of bits drawn from up to two source +// plane bytes: +// leftPart = src[outCol-1] << (8 - shift) (high (shift) bits) +// rightPart = src[outCol] >> shift (low (8-shift) bits) +// with src[-1] and src[widthTiles] treated as 0/transparent. The +// resulting plane byte is leftPart | rightPart; the mask byte is the +// shifted union of the per-byte source masks. 
+static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol, + uint8_t shift, uint16_t widthTiles, + uint8_t *planeBytes /*[4]*/, uint8_t *maskByte) +{ + uint8_t leftPlanes[AMIGA_BITPLANES]; + uint8_t leftMask; + uint8_t rightPlanes[AMIGA_BITPLANES]; + uint8_t rightMask; + uint8_t i; + + leftMask = 0u; + rightMask = 0u; + for (i = 0; i < AMIGA_BITPLANES; i++) { + leftPlanes[i] = 0u; + rightPlanes[i] = 0u; + } + + if (outCol > 0u && (uint16_t)(outCol - 1u) < widthTiles) { + planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), leftPlanes, &leftMask); + } + if (outCol < widthTiles) { + planeByteAndMaskAt(sp, row, outCol, rightPlanes, &rightMask); + } + + *maskByte = (uint8_t)(((leftMask << (8u - shift)) & 0xFFu) | + ((rightMask >> shift) & 0xFFu)); + for (i = 0; i < AMIGA_BITPLANES; i++) { + planeBytes[i] = (uint8_t)(((leftPlanes[i] << (8u - shift)) & 0xFFu) | + ((rightPlanes[i] >> shift) & 0xFFu)); + } +} + + +// Emit code that merges one plane byte into d16(an) where d16 is the +// row-relative byte offset (0 since we re-base each row by adda.w). +// The choice of all-opaque vs mixed encoding cuts code size when many +// pixels are opaque (typical for sprite interiors). +static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor, + uint8_t an, uint8_t disp, + uint8_t maskByte, uint8_t srcByte) +{ + if (maskByte == 0u) { + return cursor; /* nothing to write */ + } + if (maskByte == 0xFFu) { + /* All-opaque shortcut: move.b #src, d16(an). */ + cursor += writeBE16(out + cursor, kMoveBImmToD16An[an]); + cursor += writeBE16(out + cursor, (uint16_t)srcByte); + cursor += writeBE16(out + cursor, (uint16_t)disp); + return cursor; + } + /* Mixed: load existing, clear mask bits, OR in src, write back. 
*/ + cursor += writeBE16(out + cursor, kMoveBD16AnToD0[an]); + cursor += writeBE16(out + cursor, (uint16_t)disp); + cursor += writeBE16(out + cursor, ANDI_B_IMM_D0); + cursor += writeBE16(out + cursor, (uint16_t)((~maskByte) & 0xFFu)); + cursor += writeBE16(out + cursor, ORI_B_IMM_D0); + cursor += writeBE16(out + cursor, (uint16_t)srcByte); + cursor += writeBE16(out + cursor, kMoveBD0ToD16An[an]); + cursor += writeBE16(out + cursor, (uint16_t)disp); + return cursor; +} + + +// ----- Public API ----- + +uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t heightPx; + uint16_t widthTiles; + uint16_t bytesPerRow; /* per plane, per row */ + uint8_t planeBytes[AMIGA_BITPLANES]; + uint8_t maskByte; + uint8_t i; + + if (shift > 7u) { + return 0u; + } + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + widthTiles = (uint16_t)sp->widthTiles; + bytesPerRow = (uint16_t)(widthTiles + (shift == 0u ? 0u : 1u)); + + /* Prologue: m68k cdecl callee-saves a2-a6; we clobber a2 and a3 + * loading plane pointers, so push them first. After the push, all + * stack arg displacements shift by +8 (two longs). 
*/ + cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3); + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(8u + 4u + i * 4u)); + } + + for (row = 0; row < heightPx; row++) { + for (col = 0; col < bytesPerRow; col++) { + if (shift == 0u) { + planeByteAndMaskAt(sp, row, col, planeBytes, &maskByte); + } else { + planeByteAndMaskShifted(sp, row, col, shift, widthTiles, planeBytes, &maskByte); + } + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor = emitMergeByteToD16An(out, cursor, i, (uint8_t)col, + maskByte, planeBytes[i]); + } + } + if (row + 1u < heightPx) { + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); + } + } + } + + /* Epilogue: restore a2-a3, rts. */ + cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3); + cursor += writeBE16(out + cursor, OPCODE_RTS); + return cursor; +} + + +// SAVE: planes -> backup. backup is one contiguous 4*H*W/8 byte buffer +// laid out as 4 plane stripes, matching halSpriteSavePlanes format +// (so cross-platform save buffer is interchangeable). +// +// Per row: for each plane, copy bytesPerRow bytes from d16(an) to +// (a4)+. After the row's reads, the planes need to advance by 40, +// while a4 advances naturally via post-increment. +// +// Plane stripes are sequential in backup. We could either (a) do all +// rows of plane 0, then plane 1, etc. (matches halSpriteSavePlanes +// layout), or (b) interleave rows of all 4 planes (different layout). +// halSpriteSavePlanes does (a) -- 4 separate plane stripes. The +// emitted code below matches that layout for compat. 
+uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t heightPx; + uint16_t bytesPerRow; + uint8_t i; + + /* Shifts 2..7 reuse shift 1's bytes (identical memcpy). The + * spriteCompile post-emit pass aliases their routineOffsets to + * slot 1 so this routine is emitted once. */ + if (shift > 1u) { + return 0u; + } + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); + + /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane + * pointers + backup pointer. After the push, all stack arg disps + * shift by +12 (three longs). */ + cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); + } + /* a4 = backup. */ + cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); + + /* Plane-major: for each plane, walk all rows. After this routine, + * each An has advanced by H*40 (one frame full); we don't need to + * unwind because the function returns. We DO need to reset An + * back to start before walking the NEXT plane though. + * + * Simpler alternative: row-major (interleaved). Per row, copy + * bytesPerRow bytes from each plane to (a4)+, then advance all + * 4 planes by 40. Net: a4 advances by 4*H*bytesPerRow; planes + * advance by H*40. Backup layout becomes interleaved (plane0_row0, + * plane1_row0, plane2_row0, plane3_row0, plane0_row1, ...). + * + * That doesn't match halSpriteSavePlanes' plane-major layout. Need + * to either (a) match it -- emit per-plane outer loop with a4 + * stride between planes -- or (b) change halSpriteSavePlanes to + * interleaved. 
Picking (b) is simpler in emitted code, but ALSO + * requires updating halSpriteRestorePlanes and halSpriteRestoreUnder + * fallback math. + * + * For now: use plane-major matching halSpriteSavePlanes. Per + * plane: walk rows, copy bytes from d16(an) to (a4)+, advance an + * by 40 after each row except the last; reset an back to start + * before next plane. */ + for (i = 0; i < AMIGA_BITPLANES; i++) { + for (row = 0; row < heightPx; row++) { + for (col = 0; col < bytesPerRow; col++) { + cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]); + cursor += writeBE16(out + cursor, (uint16_t)col); + } + if (row + 1u < heightPx) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); + } + } + /* Reset An back to the plane base for next iteration. The + * total advance was (heightPx - 1) * 40. Subtract that. */ + if (i + 1u < AMIGA_BITPLANES) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); + } + } + + cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); + cursor += writeBE16(out + cursor, OPCODE_RTS); + return cursor; +} + + +// RESTORE: backup -> planes. Mirror of save. Uses MOVE.B (a4)+, d16(an). +uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t heightPx; + uint16_t bytesPerRow; + uint8_t i; + + if (shift > 1u) { + return 0u; + } + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); + + /* Callee-save a2/a3/a4; arg disps shift by +12. 
*/ + cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); + } + cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); + + for (i = 0; i < AMIGA_BITPLANES; i++) { + for (row = 0; row < heightPx; row++) { + for (col = 0; col < bytesPerRow; col++) { + cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]); + cursor += writeBE16(out + cursor, (uint16_t)col); + } + if (row + 1u < heightPx) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); + } + } + if (i + 1u < AMIGA_BITPLANES) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); + } + } + + cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); + cursor += writeBE16(out + cursor, OPCODE_RTS); + return cursor; +} diff --git a/src/codegen/spriteEmitX86.c b/src/codegen/spriteEmitX86.c index b0c1bbf..226eb9c 100644 --- a/src/codegen/spriteEmitX86.c +++ b/src/codegen/spriteEmitX86.c @@ -200,6 +200,10 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint8_t v3; uint8_t m; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); @@ -313,6 +317,10 @@ uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 
1u : 0u)); @@ -339,6 +347,10 @@ uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); diff --git a/src/codegen/spriteEmitter.h b/src/codegen/spriteEmitter.h index 8fbe359..acd7169 100644 --- a/src/codegen/spriteEmitter.h +++ b/src/codegen/spriteEmitter.h @@ -42,4 +42,19 @@ uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitSave68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +// Planar 68k emitters (Amiga). Distinct from the chunky 68k emitters +// above because the destination addressing is across 4 separate +// bitplane buffers, not a single packed-pixel surface. Calling +// convention for the emitted bytes (cdecl): +// void draw (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3); +// void save (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup); +// void restore (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup); +// Each pi is plane_base + byteOff (= y*40 + x/8 already added by the +// dispatcher). Returns 0 for shifts not yet implemented (today only +// shift 0 == byte-aligned x is emitted; shifts 1..7 fall back to the +// cross-platform interpreter). +uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); + #endif diff --git a/src/core/debug.c b/src/core/debug.c index b748ac2..8dbad17 100644 --- a/src/core/debug.c +++ b/src/core/debug.c @@ -1,11 +1,18 @@ -// Cross-platform "where did it hang?" logger. 
Each call opens -// joeylog.txt, appends a line, fflushes, closes. Slow but durable -// -- the last line in the file is guaranteed to be on disk before -// any subsequent operation that might hang. +// Cross-platform "where did it hang?" logger. Holds joeylog.txt open +// across calls; libc's stdio buffer absorbs writes (~4 KB) and the +// final fclose at program exit (via atexit) gets the buffer to disk. // -// Build only as needed for diagnostics; remove the calls when the -// bug is fixed. The hang on ST kept us looking at the wrong layer -// without this kind of trace. +// Earlier rev opened+closed per call for crash durability ("last line +// guaranteed on disk if we hang"); that cost ~1 second per call +// through GoldenGate's ProDOS FST emulation -- a 50-line UBER run +// burned ~5 minutes in IO. Even per-line fflush is too expensive +// because every fflush forces an FST WRITE, and host-OS file IO time +// isn't tracked by the IIgs VBL counter so wall-time logs underreport. +// +// Tradeoff: if the program crashes mid-run, buffered log lines may +// not reach disk. For UBER and similar batch demos that's acceptable; +// for hang-debugging where durability matters, call joeyLogFlush() +// at the suspected hang points. #include #include @@ -15,6 +22,27 @@ static const char *kLogPath = "joeylog.txt"; +static FILE *gLogFp = NULL; +/* 16 KB is enough for UBER's full log (~5 KB) plus generous headroom, + * so the file never auto-flushes mid-run. ORCA-C / libnix default + * buffers are only ~512 bytes; with that, a 50-line log triggers ~10 + * ProDOS / AmigaDOS WRITEs through the host FST, each of which is + * untracked-host-time (seconds). Buffer the whole thing in memory and + * let the atexit fclose flush once. */ +#define JOEY_LOG_BUF_BYTES 16384 +static char gLogBuf[JOEY_LOG_BUF_BYTES]; + + +/* Lazy-open. Returns NULL if the open failed (silently disable). 
*/ +static FILE *logFile(void) { + if (gLogFp == NULL) { + gLogFp = fopen(kLogPath, "a"); + if (gLogFp != NULL) { + (void)setvbuf(gLogFp, gLogBuf, _IOFBF, sizeof(gLogBuf)); + } + } + return gLogFp; +} void joeyLog(const char *msg) { @@ -22,13 +50,12 @@ void joeyLog(const char *msg) { if (msg == NULL) { return; } - fp = fopen(kLogPath, "a"); + fp = logFile(); if (fp == NULL) { return; } fputs(msg, fp); fputc('\n', fp); - fclose(fp); } @@ -38,7 +65,7 @@ void joeyLogF(const char *fmt, ...) { if (fmt == NULL) { return; } - fp = fopen(kLogPath, "a"); + fp = logFile(); if (fp == NULL) { return; } @@ -46,14 +73,27 @@ void joeyLogF(const char *fmt, ...) { vfprintf(fp, fmt, args); va_end(args); fputc('\n', fp); - fclose(fp); +} + + +void joeyLogFlush(void) { + if (gLogFp != NULL) { + fflush(gLogFp); + } } void joeyLogReset(void) { - FILE *fp; - fp = fopen(kLogPath, "w"); - if (fp != NULL) { - fclose(fp); + if (gLogFp != NULL) { + fclose(gLogFp); + gLogFp = NULL; + } + /* Truncate by opening for write then closing; subsequent + * joeyLog* will reopen for append. */ + { + FILE *fp = fopen(kLogPath, "w"); + if (fp != NULL) { + fclose(fp); + } } } diff --git a/src/core/draw.c b/src/core/draw.c index c60d092..802351e 100644 --- a/src/core/draw.c +++ b/src/core/draw.c @@ -186,13 +186,17 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 continue; } - // Highest-tier asm fast path: seed-test + walk-left + walk-right - // + 1-row fill + scan-above + scan-below + push, all in one - // cross-segment call. The asm caches row addr / match decoder - // across every sub-operation. C just pops and dispatches; this - // path completes the entire per-seed work and computes the row - // address itself, so we don't pay y*160 in C unless we fall back. - { + /* Phase 9: planar ports have NULL s->pixels and the asm fast + * paths take a chunky-row pointer. Skip them on planar; the C + * fallback below uses halSamplePixel which works on both + * storage layouts. 
*/ + if (s->pixels != NULL) { + // Highest-tier asm fast path: seed-test + walk-left + walk-right + // + 1-row fill + scan-above + scan-below + push, all in one + // cross-segment call. The asm caches row addr / match decoder + // across every sub-operation. C just pops and dispatches; this + // path completes the entire per-seed work and computes the row + // address itself, so we don't pay y*160 in C unless we fall back. bool seedMatched; if (halFastFloodWalkAndScans(s->pixels, x, y, matchColor, newNibble, matchEqual, @@ -203,22 +207,27 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 } } - // Fallback path needs row; compute it here so the asm path - // above doesn't pay for an unused y*160 multiply on every iter. - row = &s->pixels[SURFACE_ROW_OFFSET(y)]; + /* Fallback path: compute row only if chunky; halFastFloodWalk + * needs it but isn't implemented on Amiga. */ + row = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(y)] : NULL; // Tier-2 asm fast path: combined seed test + walk-left + // walk-right in one cross-segment call. Falls back to the // pure-C walks below on ports without an asm implementation. { bool seedMatched; - if (halFastFloodWalk(row, x, matchColor, newNibble, matchEqual, - &seedMatched, &leftX, &rightX)) { + if (row != NULL && halFastFloodWalk(row, x, matchColor, newNibble, matchEqual, + &seedMatched, &leftX, &rightX)) { + if (!seedMatched) { + continue; + } + } else if (halFloodWalkPlanes(s, x, y, matchColor, newNibble, matchEqual, + &seedMatched, &leftX, &rightX)) { if (!seedMatched) { continue; } } else { - pix = srcPixel(row, x); + pix = halSamplePixel(s, x, y); pixMatch = (pix == matchColor); if (matchEqual) { if (!pixMatch) { @@ -233,7 +242,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Walk left to find the start of the matching run. 
leftX = x; while (leftX > 0) { - pix = srcPixel(row, (int16_t)(leftX - 1)); + pix = halSamplePixel(s, (int16_t)(leftX - 1), y); pixMatch = (pix == matchColor); if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) { break; @@ -244,7 +253,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Walk right to find the end. rightX = x; while (rightX < SURFACE_WIDTH - 1) { - pix = srcPixel(row, (int16_t)(rightX + 1)); + pix = halSamplePixel(s, (int16_t)(rightX + 1), y); pixMatch = (pix == matchColor); if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) { break; @@ -256,12 +265,18 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Fill the span. Bypass fillRect's clipping wrapper: walk-out // already guaranteed leftX/rightX are in [0..SURFACE_WIDTH-1] - // and the seed-pop bounds check did the same for y. + // and the seed-pop bounds check did the same for y. We DO + // need the planar dual-write (which fillRect's wrapper would + // call), so invoke halFillRectPlanes explicitly after the + // chunky span fill -- otherwise PLANAR_PRESENT builds (and, + // post-Phase-9, every build) display flood-filled regions + // as the unfilled background. { int16_t spanW = (int16_t)(rightX - leftX + 1); if (!halFastFillRect(s, leftX, y, (uint16_t)spanW, 1, newNibble)) { fillRectClipped(s, leftX, y, spanW, 1, newNibble); } + halFillRectPlanes(s, leftX, y, (uint16_t)spanW, 1, newNibble); } // Scan rows above and below for run boundaries. The hot @@ -291,19 +306,26 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 } scanY = (int16_t)(y + 1); } - scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)]; + scanRow = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(scanY)] : NULL; // Prefer the combined scan+push asm path (one call per - // scan, no markBuf and no per-pixel C edge walk). 
- if (!halFastFloodScanAndPush(scanRow, leftX, rightX, + // scan, no markBuf and no per-pixel C edge walk). Skip + // the asm tiers if we don't have a chunky row pointer + // (Phase 9 planar ports). + if (scanRow == NULL || + !halFastFloodScanAndPush(scanRow, leftX, rightX, matchColor, newNibble, matchEqual, scanY, stackX, stackY, &sp, FLOOD_STACK_SIZE)) { - if (!halFastFloodScanRow(scanRow, leftX, rightX, - matchColor, newNibble, matchEqual, - floodMarkBuf)) { + if ((scanRow == NULL || + !halFastFloodScanRow(scanRow, leftX, rightX, + matchColor, newNibble, matchEqual, + floodMarkBuf)) && + !halFloodScanRowPlanes(s, leftX, rightX, scanY, + matchColor, newNibble, matchEqual, + floodMarkBuf)) { // C fallback: fill markBuf the slow way. for (i = 0; i < spanLen; i++) { - pix = srcPixel(scanRow, (int16_t)(leftX + i)); + pix = halSamplePixel(s, (int16_t)(leftX + i), scanY); pixMatch = (pix == matchColor); floodMarkBuf[i] = (uint8_t)(matchEqual ? (pixMatch ? 1 : 0) @@ -621,12 +643,12 @@ void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t if (!halFastFillRect(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex)) { fillRectClipped(s, sx, sy, sw, sh, colorIndex); } + halFillRectPlanes(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex); surfaceMarkDirtyRect(s, sx, sy, sw, sh); } void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { - uint8_t *row; uint8_t seedColor; if (s == NULL) { @@ -635,8 +657,9 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return; } - row = &s->pixels[SURFACE_ROW_OFFSET(y)]; - seedColor = srcPixel(row, x); + /* halSamplePixel reads from whichever storage the port uses -- + * works on both chunky (s->pixels) and planar (s->portData) ports. 
*/ + seedColor = halSamplePixel(s, x, y); if ((seedColor & 0x0F) == (newColor & 0x0F)) { return; } @@ -645,7 +668,6 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor) { - uint8_t *row; uint8_t pix; if (s == NULL) { @@ -654,8 +676,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8 if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return; } - row = &s->pixels[SURFACE_ROW_OFFSET(y)]; - pix = srcPixel(row, x); + pix = halSamplePixel(s, x, y); // Starting on a boundary pixel or already-filled pixel: nothing // to do. if ((pix & 0x0F) == (boundaryColor & 0x0F)) { @@ -669,25 +690,16 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8 uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) { - uint8_t byte; - if (s == NULL) { return 0; } if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return 0; } - - /* Cast to uint16_t before shift -- already validated x >= 0, - * unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */ - byte = s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; - if (x & 1) { - return (uint8_t)(byte & 0x0F); - } - /* `byte >> 4` is uint8_t but ORCA-C promotes to int (signed 16-bit) - * for the shift, then narrows -- triggers ~SSHIFTRIGHT. The - * mask-then-shift sidesteps the promotion path. */ - return (uint8_t)((byte & 0xF0u) >> 4); + /* halSamplePixel reads from whichever storage the port uses -- + * chunky ports return a nibble extracted from s->pixels; planar + * ports read 4 plane bits and assemble the nibble. 
*/ + return halSamplePixel(s, x, y); } @@ -725,6 +737,8 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) { } } } + halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0, + copyW, copyH, srcRowBytes, 0xFFFFu); surfaceMarkDirtyRect(dst, x, y, copyW, copyH); } @@ -768,6 +782,8 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t } } } + halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0, + copyW, copyH, srcRowBytes, (uint16_t)transparent); surfaceMarkDirtyRect(dst, x, y, copyW, copyH); } diff --git a/src/core/hal.h b/src/core/hal.h index fec0777..05cdbd8 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -9,8 +9,11 @@ #ifndef JOEYLIB_HAL_H #define JOEYLIB_HAL_H +#include + #include "joey/core.h" #include "joey/input.h" +#include "joey/sprite.h" #include "joey/surface.h" // Per-port one-shot initialization. Called from joeyInit after config @@ -27,17 +30,131 @@ void halShutdown(void); // backs the library-owned stage surface. Ports that have a // hardware-friendly pin location for the back buffer (IIgs $01/2000 // with SHR shadow inhibited) return that address here; ports with no -// such constraint just malloc/free. +// such constraint just malloc/free. Planar 68k ports may return NULL +// if the surface is planar-only and has no chunky shadow. uint8_t *halStageAllocPixels(void); void halStageFreePixels(uint8_t *pixels); -// Present the entire source surface to the display. -void halPresent(const SurfaceT *src); +// Allocate / release the per-surface portData blob (see SurfaceT in +// surfaceInternal.h). Chunky ports return NULL from Init -- they keep +// portData unused and operate on the chunky `pixels` buffer. Planar +// 68k ports allocate a per-surface struct here describing the +// bitplane storage (Amiga: 4 separate plane buffers + stride; ST: one +// interleaved buffer + stride). 
Called by surfaceCreate / stageAlloc +// after pixels is allocated; freed by surfaceDestroy / stageFree +// before pixels is freed. `isStage` lets the port short-circuit for +// the stage if its planes are display-owned (e.g. Amiga's BitMap +// planes from OpenScreen) rather than allocated per surface. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage); +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData); -// Present a rectangular region of the source surface. The caller has -// already validated and clipped the rect to be fully inside the -// surface bounds and to have positive extents. -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h); +// Phase 3 planar dual-write: called from cross-platform fillRect AFTER +// the chunky shadow has been written, with the same already-clipped +// (x, y, w, h) and the raw color index 0..15. Planar ports update +// the bitplanes with the rect's bit pattern (per-plane bit value = +// (color >> plane) & 1). Chunky ports (DOS, IIgs) provide a no-op +// stub. Called unconditionally so cross-platform code doesn't have +// to know the port is planar; the per-port stub is the cheapest +// possible thing on chunky ports. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex); + +// Phase 3 planar dual-write for surfaceCopy: called from cross-platform +// surfaceCopy AFTER the chunky pixel buffer is memcpy'd. Planar ports +// also memcpy the bitplanes from src to dst so JOEYLIB_PLANAR_PRESENT +// builds see correct planes. dst and src are non-NULL and distinct +// (caller's no-op guards already passed). +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src); + +// Phase 5 planar dual-write for tile ops. Called from cross-platform +// tile.c AFTER the chunky path completes. (bx, by) are tile-grid +// coords (0..39 horiz, 0..24 vert; surface is 40x25 tiles). +// transparentIndex for tileCopyMasked: pixel value to skip. 
tilePaste +// reads from a packed 32-byte chunky TileT (4 bytes/row x 8 rows). +// All Amiga impls operate on the off-screen shadow planes via +// AmigaPlanarT; chunky-port stubs are no-ops. tileSnap is read-only +// so has no planar dual-write hook. +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex); +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy); +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex); +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile); + +// tileSnap: cross-platform code reads s->pixels chunky bytes into a +// 32-byte TileT. On planar ports (s->pixels NULL) the chunky read +// crashes -- this hook is the planar derivation: reads bitplane bits +// for the tile rect and assembles 32 chunky bytes (4 bytes/row x 8 +// rows) into chunkyTileOut. Chunky ports (s->pixels valid) implement +// this as a no-op since the cross-platform fallback already filled +// chunkyTileOut from s->pixels. +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut); + +// Phase 6 planar dual-write for spriteDraw. Called from cross-platform +// sprite.c AFTER spriteCompiledDraw or spriteDrawInterpreted has +// updated the chunky shadow. (x, y) is the destination top-left in +// surface pixels (may be partially off-surface; the hook does its own +// clipping). Walks the sprite's chunky tile data and updates dst +// surface planes for every non-transparent pixel (nibble != 0). +// Save/restore have NO planar dual-write yet -- after spriteSaveUnder +// + spriteDraw + spriteRestoreUnder under JOEYLIB_PLANAR_PRESENT, the +// planes still show the sprite (chunky restored, planes unchanged). 
+// Workable approach for that needs a parallel plane backup buffer; +// deferred until apps actually depend on PLANAR_PRESENT save/restore. +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y); + +// Phase 8 planar dual-write for asset blits and full surface loads. +// halBlitRectPlanes is called from surfaceBlit / surfaceBlitMasked +// AFTER the chunky path. transparent == 0xFFFF means opaque blit; any +// other value is a nibble (0..15) to skip. srcBytes is the asset's +// raw chunky pixel buffer; srcRowBytes is its stride. (x, y) is the +// already-clipped destination top-left in dst surface pixels; +// srcX0/srcY0 is where in the asset the visible region starts after +// clip; copyW/copyH is the visible region size in pixels. +// +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent); + +// Phase 9 sprite save/restore plane data. Chunky ports already hold +// pixel data in backup->bytes via the cross-platform memcpy. Planar +// ports (Amiga) DO have chunky NULL, so backup->bytes is unused by +// the chunky path -- we repurpose it to hold per-plane bytes. Layout: +// 4 plane stripes of (h * bytesPerPlaneRow) bytes each, where +// bytesPerPlaneRow = w/8 (sprite x and w are guaranteed 2-pixel +// aligned by spriteSaveUnder; planar requires further 8-pixel +// rounding -- see Amiga impl notes). Total bytes: +// 4 * h * w/8 = h * w/2 = same as chunky. backup->sizeBytes capacity +// works on both ports. Chunky-port impls are no-ops; Amiga writes / +// reads plane bytes via AmigaPlanarT. +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes); +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes); + +// Phase 9 reader hooks. 
Cross-platform code calls these instead of +// reading from s->pixels directly so it works regardless of whether +// the port stores chunky or planar as the source of truth. Chunky +// ports (DOS, IIgs) implement these reading from s->pixels (cheap); +// Amiga reads from the bitplanes in AmigaPlanarT. (x, y) bounds are +// already validated by the caller. +// +// halSamplePixel: returns the 0..15 nibble at (x, y). +// halSurfaceHash: returns the FNV-style hash of pixel + scb + palette +// that surfaceHash currently computes by walking s->pixels. Allows +// ports to use their native pixel storage instead. +// halSurfaceCopyChunky: cross-platform surfaceCopy used to memcpy +// s->pixels src->dst; on planar ports there is no chunky to copy +// (planes already covered by halSurfaceCopyPlanes). Chunky ports +// do the memcpy here; Amiga is a no-op. +// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread / +// fwrite of the pixel data. Chunky ports stream directly to/from +// s->pixels; Amiga uses a scratch buffer + c2p (load) or +// plane->chunky derivation (save). +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y); +uint32_t halSurfaceHash(const SurfaceT *s); +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src); +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp); +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp); + +// Present the dirty regions of the source surface to the display. +// The cross-platform stagePresent walks the dirty arrays before +// calling this; ports may use the dirty arrays themselves to skip +// untouched rows. +void halPresent(const SurfaceT *src); // Optional: returns a port-specific error message string for the last // HAL failure, or NULL if none. Ports may return NULL always. @@ -73,9 +190,23 @@ uint16_t halFrameHz(void); // Audio: per-port engine setup, module + SFX playback, teardown. // halAudioInit returns true if the platform has a working engine. 
-// All entry points are safe to call when init failed -- they become -// no-ops. See joey/audio.h for the public API contract that wraps -// these. +// Per-surface chunky pixel allocation. Chunky ports (DOS, IIgs, ST +// while still chunky) allocate SURFACE_PIXELS_SIZE bytes (calloc- +// style, zero-filled). Pure-planar Amiga returns NULL -- there's no +// chunky shadow; cross-platform code that previously read s->pixels +// goes through halSamplePixel / halSurfaceCopyChunky / etc. instead. +// halSurfaceFreePixels mirrors free(); NULL is a valid input on +// planar ports. +uint8_t *halSurfaceAllocPixels(void); +void halSurfaceFreePixels(uint8_t *pixels); + +// Get a pointer to the start of bitplane `planeIdx` (0..3) for surface +// `s`. Returns NULL on chunky ports (no planes). On Amiga returns +// pd->planes[planeIdx] from the AmigaPlanarT struct in portData. +// Used by the planar sprite codegen dispatcher to compute the 4 +// plane addresses to hand the emitted asm. +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx); + bool halAudioInit(void); void halAudioShutdown(void); void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop); @@ -185,6 +316,21 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut); +// Planar variants of halFastFloodWalk / halFastFloodScanRow. Take a +// SurfaceT* instead of a chunky-row pointer so they work on planar +// ports (Amiga post-Phase 9) where s->pixels is NULL. Same semantics; +// chunky ports return false (the chunky variants above are faster +// when a chunky row is available). Replace the per-pixel +// halSamplePixel walk on planar ports. 
+bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + bool *seedMatched, + int16_t *leftXOut, int16_t *rightXOut); + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + uint8_t *markBuf); + // surfaceBlit / surfaceBlitMasked rect-copy helper. Caller has done // the clip math: dstRow0 / srcRow0 point at row 0 of the source/dest // regions, dstX / srcX are intra-row pixel offsets, copyW/copyH are @@ -333,6 +479,12 @@ extern uint16_t gFloodRightX; #undef halFastFloodScanAndPush #define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false) +// IIgs is chunky; the planar flood hooks are never reachable. +#undef halFloodWalkPlanes +#define halFloodWalkPlanes(_s, _sx, _y, _mc, _nc, _me, _sm, _lx, _rx) (false) +#undef halFloodScanRowPlanes +#define halFloodScanRowPlanes(_s, _lx, _rx, _sy, _mc, _nc, _me, _mb) (false) + // Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX / // gFloodRightX; macro reads those into the caller's out-ptrs. #undef halFastFloodWalkAndScans diff --git a/src/core/present.c b/src/core/present.c index 1184f27..a550c66 100644 --- a/src/core/present.c +++ b/src/core/present.c @@ -2,8 +2,7 @@ // // stagePresent walks the per-row dirty bands set by drawing primitives // and asks the port HAL to flip just those rows to the display, then -// resets the dirty state. stagePresentRect bypasses dirty tracking -// entirely and slams a caller-specified rectangle (after clipping). +// resets the dirty state. 
#include @@ -25,48 +24,3 @@ void stagePresent(void) { halPresent(stage); stageDirtyClearAll(); } - - -void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h) { - SurfaceT *stage; - int16_t sx; - int16_t sy; - int16_t sw; - int16_t sh; - - stage = stageGet(); - if (stage == NULL) { - return; - } - - sx = x; - sy = y; - sw = (int16_t)w; - sh = (int16_t)h; - - if (sw <= 0 || sh <= 0) { - return; - } - if (sx < 0) { - sw += sx; - sx = 0; - } - if (sy < 0) { - sh += sy; - sy = 0; - } - if (sx >= SURFACE_WIDTH || sy >= SURFACE_HEIGHT) { - return; - } - if (sx + sw > SURFACE_WIDTH) { - sw = SURFACE_WIDTH - sx; - } - if (sy + sh > SURFACE_HEIGHT) { - sh = SURFACE_HEIGHT - sy; - } - if (sw <= 0 || sh <= 0) { - return; - } - - halPresentRect(stage, sx, sy, (uint16_t)sw, (uint16_t)sh); -} diff --git a/src/core/sprite.c b/src/core/sprite.c index 80bf036..177ca53 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -10,6 +10,7 @@ #include "joey/sprite.h" #include "codegenArenaInternal.h" +#include "hal.h" #include "spriteInternal.h" #include "surfaceInternal.h" @@ -22,6 +23,20 @@ // Color 0 is always transparent for sprites (DESIGN.md contract). #define TRANSPARENT_NIBBLE 0 +// On Amiga (post-Phase 9 / Phase 6 redux) the compiled sprite emitter +// writes directly to the bitplanes, so the halSpritePlanes hooks are +// pure duplicate work after a compiled call. On other ports the +// hooks are either no-op stubs (chunky-only IIgs/DOS) or the only +// thing writing planes (ST: chunky-shadow + planes). Slow / interpreter +// paths still need the hooks unconditionally on every platform -- the +// chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook +// is the only draw. 
+#if defined(JOEYLIB_PLATFORM_AMIGA) +#define COMPILED_SPRITE_WRITES_PLANES 1 +#else +#define COMPILED_SPRITE_WRITES_PLANES 0 +#endif + // ----- Prototypes ----- @@ -144,14 +159,20 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y return; } - for (row = 0; row < h; row++) { - dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; - for (col = 0; col < w; col++) { - nibble = srcNibble(sp, (int16_t)(sx + col), (int16_t)(sy + row)); - if (nibble == TRANSPARENT_NIBBLE) { - continue; + /* Skip the chunky write loop on planar ports (s->pixels == NULL). + * halSpriteDrawPlanes is called by the spriteDraw caller and does + * its own clip + plane write, so the dirty mark + planar update + * happen there. Phase 9 dropped the chunky shadow on Amiga. */ + if (s->pixels != NULL) { + for (row = 0; row < h; row++) { + dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; + for (col = 0; col < w; col++) { + nibble = srcNibble(sp, (int16_t)(sx + col), (int16_t)(sy + row)); + if (nibble == TRANSPARENT_NIBBLE) { + continue; + } + writeDstNibble(dstRow, (int16_t)(dx + col), nibble); } - writeDstNibble(dstRow, (int16_t)(dx + col), nibble); } } surfaceMarkDirtyRect(s, dx, dy, w, h); @@ -200,6 +221,13 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y, if (src == NULL || widthTiles == 0 || heightTiles == 0) { return NULL; } + /* Phase 9: planar ports have NULL src->pixels. Capturing a sprite + * from such a surface needs a planar-to-chunky derivation hook; + * not implemented yet, so refuse the call. Apps targeting Amiga + * should ship sprites as static tile data instead. */ + if (src->pixels == NULL) { + return NULL; + } // Source x/y must be on a tile boundary so each captured tile lands // on whole bytes -- mid-byte snapshots would lose half a pixel at // the left edge. @@ -284,10 +312,14 @@ void spriteDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y) { // need clip math (they walk fixed offsets). 
if (sp->slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { spriteCompiledDraw(s, sp, x, y); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteDrawPlanes(s, sp, x, y); + } surfaceMarkDirtyRect(s, x, y, (int16_t)widthPx, (int16_t)heightPx); return; } spriteDrawInterpreted(s, sp, x, y); + halSpriteDrawPlanes(s, sp, x, y); } @@ -332,7 +364,7 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac uint16_t saveIdx; uint16_t drawIdx; uint8_t *offsetsBase; - shift = (uint8_t)(x & 1); + shift = SPRITE_SHIFT_INDEX(x); saveIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); drawIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_DRAW); offsetsBase = (uint8_t *)sp->routineOffsets; @@ -340,6 +372,10 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac *(uint16_t *)(offsetsBase + (drawIdx << 1)) != SPRITE_NOT_COMPILED) { spriteCompiledSaveUnder(s, sp, x, y, backup); spriteCompiledDraw (s, sp, x, y); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes); + halSpriteDrawPlanes(s, sp, x, y); + } surfaceMarkDirtyRect (s, x, y, (int16_t)widthPx, (int16_t)heightPx); return; } @@ -630,13 +666,18 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1)); if (routeOffset != SPRITE_NOT_COMPILED) { spriteCompiledRestoreUnder(s, backup); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes); + } surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh); return; } } - /* Slow / interpreted memcpy fallback. */ - { + /* Slow / interpreted memcpy fallback. Skip the chunky memcpy if + * the port has no chunky shadow (Phase 9 Amiga: s->pixels NULL); + * halSpriteRestorePlanes below does the planar restore. 
*/ + if (s->pixels != NULL) { int16_t row; int16_t byteStart; uint8_t *dstRow; @@ -650,6 +691,7 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { (size_t)copyBytes); } } + halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes); surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh); } @@ -684,11 +726,14 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { uint16_t routeIdx; uint16_t routeOffset; - shift = (uint8_t)(x & 1); + shift = SPRITE_SHIFT_INDEX(x); routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1)); if (routeOffset != SPRITE_NOT_COMPILED) { spriteCompiledSaveUnder(s, sp, x, y, backup); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes); + } return; } } @@ -744,11 +789,16 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit // backup with bytes==NULL. return; } - for (row = 0; row < h; row++) { - srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; - memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes], - &srcRow[byteStart], - (size_t)copyBytes); + /* Chunky save path: skip on planar ports (s->pixels NULL). + * halSpriteSavePlanes below covers the planar case. 
*/ + if (s->pixels != NULL) { + for (row = 0; row < h; row++) { + srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; + memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes], + &srcRow[byteStart], + (size_t)copyBytes); + } } + halSpriteSavePlanes(s, clippedX, dy, (uint16_t)clippedW, (uint16_t)h, backup->bytes); } /* end slow path */ } diff --git a/src/core/spriteInternal.h b/src/core/spriteInternal.h index 8e4733a..99a6bd5 100644 --- a/src/core/spriteInternal.h +++ b/src/core/spriteInternal.h @@ -13,6 +13,16 @@ #define SPRITE_OP_RESTORE 2 #define SPRITE_OP_COUNT 3 +// Per-platform shift index used by the dispatcher. Chunky 4bpp ports +// store one nibble per pixel pair so the only sub-byte alignment is +// x % 2. Amiga planar packs 8 pixels per plane byte so all 8 +// alignments matter. +#if defined(JOEYLIB_PLATFORM_AMIGA) +#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7)) +#else +#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1)) +#endif + // Sentinel stored in routineOffsets[shift][op] when that op's emitter // returned 0 bytes (i.e., the platform doesn't implement compiled // codegen for that op yet). 
Distinct from a real offset of 0, which diff --git a/src/core/surface.c b/src/core/surface.c index 486620f..229b5f0 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -65,9 +65,10 @@ void surfaceCopy(SurfaceT *dst, const SurfaceT *src) { if (dst == NULL || src == NULL || dst == src) { return; } - memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); + halSurfaceCopyChunky(dst, src); /* memcpy on chunky ports; no-op on planar */ memcpy(dst->scb, src->scb, sizeof(src->scb)); memcpy(dst->palette, src->palette, sizeof(src->palette)); + halSurfaceCopyPlanes(dst, src); /* 4 plane memcpys on planar ports; no-op on chunky */ surfaceMarkDirtyAll(dst); } @@ -79,11 +80,10 @@ SurfaceT *surfaceCreate(void) { if (s == NULL) { return NULL; } - s->pixels = (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); - if (s->pixels == NULL) { - free(s); - return NULL; - } + /* halSurfaceAllocPixels returns NULL on planar ports (Amiga); the + * primary storage is the port-allocated planes via portData below. */ + s->pixels = halSurfaceAllocPixels(); + s->portData = halSurfaceAllocPortData(s, false); paletteInitDefault(s); return s; } @@ -96,11 +96,44 @@ void surfaceDestroy(SurfaceT *s) { if (s == gStage) { return; } - free(s->pixels); + halSurfaceFreePortData(s, false, s->portData); + halSurfaceFreePixels(s->pixels); free(s); } +// Cheapest deterministic hash that still detects per-byte changes: +// (hash << 1) ^ byte, a single 16-bit accumulator. ORCA-C / 65816 +// compiles to ASL + EOR -- about 35 cyc per byte. A 32-bit multiply +// FNV-style hash takes ~200 cyc per byte via ~UMUL4, which adds +// 80+ seconds to a UBER run on IIgs. Discrimination is weaker than +// FNV but plenty for cross-port validation: we only need "did the +// same logical-pixel sequence produce the same hash?" -- not +// crypto-grade collision resistance over arbitrary inputs. 
+// +// Walks the chunky pixel buffer byte-by-byte, the same logical-pixel +// ordering on every chunky-format port (IIgs, DOS, Amiga and ST +// while still chunky). When the planar rewrite drops s->pixels on +// Amiga/ST this function will need a HAL hook (halSurfaceHash) to +// read planes natively while producing the same logical hash. +/* Cross-port FNV-style hash of pixels + SCB + palette. The hash logic + * (multiplier streams, byte ordering for palette) is identical across + * ports, but the pixel READS go through the port HAL so chunky ports + * walk s->pixels and planar ports walk plane bits and assemble nibble + * pairs into chunky bytes for the hash. Both produce the same logical- + * pixel hash because they hash the same logical pixel sequence in the + * same chunky byte order. SCB and palette are still hashed inline + * here because they live in the SurfaceT struct on every port (no + * port-specific storage) and the byte/value-with-explicit-byte-order + * walks are already endian-independent. */ +uint32_t surfaceHash(const SurfaceT *s) { + if (s == NULL) { + return 0u; + } + return halSurfaceHash(s); +} + + bool surfaceLoadFile(SurfaceT *dst, const char *path) { FILE *fp; long fileSize; @@ -125,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) { fclose(fp); return false; } - if (fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) { + if (!halSurfaceLoadFileChunky(dst, fp)) { fclose(fp); return false; } @@ -153,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) { if (fp == NULL) { return false; } - if (fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) { + if (!halSurfaceSaveFileChunky(src, fp)) { fclose(fp); return false; } @@ -228,13 +261,14 @@ bool stageAlloc(void) { if (gStage == NULL) { return false; } + /* halStageAllocPixels returns NULL on planar ports (Amiga) where + * the chunky shadow doesn't exist; the planes from portData are + * the source of truth. 
NULL pixels is no longer a failure. */ gStage->pixels = halStageAllocPixels(); - if (gStage->pixels == NULL) { - free(gStage); - gStage = NULL; - return false; + if (gStage->pixels != NULL) { + memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE); } - memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE); + gStage->portData = halSurfaceAllocPortData(gStage, true); stageDirtyClearAll(); paletteInitDefault(gStage); return true; @@ -255,6 +289,7 @@ void stageFree(void) { if (gStage == NULL) { return; } + halSurfaceFreePortData(gStage, true, gStage->portData); halStageFreePixels(gStage->pixels); free(gStage); gStage = NULL; diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h index 45017a5..c7743d9 100644 --- a/src/core/surfaceInternal.h +++ b/src/core/surfaceInternal.h @@ -14,8 +14,17 @@ // auto-mirroring to $E1). Caller-side `s->pixels[i]` syntax is // unchanged; only allocation/copy paths in surface.c shift to a // two-buffer model. +// +// portData is per-port opaque storage. On chunky ports (IIgs, DOS) it +// stays NULL -- pixels is the source of truth. On planar ports +// (Amiga, Atari ST) it points to a port-private struct describing the +// 4 bitplanes (Amiga: 4 separate plane buffers + stride; ST: single +// interleaved buffer + stride). Cross-platform code never touches it +// directly -- all primitive access goes through halFast* on planar +// ports. See project_planar_68k_plan.md for the full architecture. struct SurfaceT { uint8_t *pixels; + void *portData; uint8_t scb[SURFACE_HEIGHT]; uint16_t palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; }; @@ -38,6 +47,18 @@ struct SurfaceT { extern uint8_t gStageMinWord[SURFACE_HEIGHT]; extern uint8_t gStageMaxWord[SURFACE_HEIGHT]; +// Per-byte mixer for surfaceHash. Two-stream: lo *= 31 + b, hi *= 251 + b. +// Strength-reduced to shifts so ORCA-C doesn't emit `~UMUL2` (~150 cyc +// per call); 32 KB hashed twice -> ~5 minutes per UBER run. 
The +// shift form is 16-bit-equivalent (mod 2^16) so hash values are +// identical to the original `* 31u` / `* 251u` form. +// lo *= 31 == (lo << 5) - lo +// hi *= 251 == (hi << 8) - (hi << 2) - hi +#define SURFACE_HASH_MIX_BYTE(lo_, hi_, b_) do { \ + (lo_) = (uint16_t)(((((lo_) << 5) - (lo_)) + (b_))); \ + (hi_) = (uint16_t)((((hi_) << 8) - ((hi_) << 2) - (hi_)) + (b_)); \ +} while (0) + // Stage SCB / palette dirty flags. scbSet* and paletteSet set them // true when the stage's data is modified; the per-port present code // checks the flags and clears after upload. Replaces a per-frame @@ -50,6 +71,15 @@ extern bool gStagePaletteDirty; // bands are widened to cover the rect. If `s` is any other surface, // the call is a no-op -- non-stage surfaces never get presented, so // they don't carry dirty state. +// +// Planar ports rely on the chunky shadow + c2p path through Phase 8. +// Planar-native primitives (Phases 3+) dual-write: they update both +// the chunky pixels and the bitplanes in the same call, so c2p at +// present time always derives correct planes from up-to-date chunky. +// Phase 9 deletes the chunky shadow + c2p; only at that point will +// per-row planar-vs-chunky tracking even be a possible question, and +// the plan is to avoid it entirely there too (planes become the only +// source of truth). 
void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h); // Shorthand for "every row, full width" -- used by surfaceClear and diff --git a/src/core/tile.c b/src/core/tile.c index e451425..d84b585 100644 --- a/src/core/tile.c +++ b/src/core/tile.c @@ -147,6 +147,7 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, if (!halFastTileCopy(dstRow0, srcRow0)) { copyTileOpaque(dstRow0, srcRow0); } + halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -178,6 +179,7 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { copyTileMasked(dstRow0, srcRow0, transparentIndex); } + halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -209,6 +211,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { row += SURFACE_BYTES_PER_ROW; } } + halTileFillPlanes(s, bx, by, colorIndex); surfaceMarkDirtyRect(s, (int16_t)pixelX, (int16_t)pixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -241,6 +244,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) { src += TILE_BYTES_PER_ROW; } } + halTilePastePlanes(dst, bx, by, &in->pixels[0]); surfaceMarkDirtyRect(dst, (int16_t)pixelX, (int16_t)pixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -261,9 +265,12 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) { } pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); - srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; dst = &out->pixels[0]; - if (!halFastTileSnap(dst, srcRow)) { + /* On planar ports (s->pixels NULL) the chunky 
read path is + * skipped; halTileSnapPlanes below derives the tile bytes from + * the bitplanes. */ + if (src->pixels != NULL && !halFastTileSnap(dst, &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)])) { + srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { dst[0] = srcRow[0]; dst[1] = srcRow[1]; @@ -273,4 +280,5 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) { dst += TILE_BYTES_PER_ROW; } } + halTileSnapPlanes(src, bx, by, &out->pixels[0]); } diff --git a/src/port/amiga/circle.s b/src/port/amiga/circle.s new file mode 100644 index 0000000..e36503c --- /dev/null +++ b/src/port/amiga/circle.s @@ -0,0 +1,270 @@ +| Amiga planar circle outline V4 -- 16-way color-specialized. +| +| Per Bresenham iter: +| 1. Precompute 4 xp records (xp_byte_w + bitMask_b + notMask_b) for +| cx +/- bx and cx +/- by, stored at sp+0..15 (4 records x 4 bytes). +| 2. Precompute 4 yp40 words for cy +/- by and cy +/- bx, stored at +| sp+16..23 (4 words x 2 bytes). +| 3. Plot 8 octant pixels with hardcoded color: each pixel does 4 +| branchless plane RMW ops (or.b for set bits, and.b for clear +| bits) -- no btst, no per-plane branch. +| 4. Bresenham step. +| +| At function entry the color is masked to 4 bits and used as the index +| into a 16-entry jump table that selects the matching main loop. +| Each main loop has the color hardcoded into the per-plane RMW ops. +| +| The branchless plot saves ~20-28 cyc per plane vs V3's btst+branch +| pattern -- ~640-900 cyc per Bresenham iter. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1, +| uint8_t *p2, uint8_t *p3, +| uint16_t cx, uint16_t cy, +| uint16_t r, uint8_t color); +| +| Register allocation across the iter loop: +| d2.w = bx (Bresenham) +| d3.w = by (Bresenham) +| d4.w = err (Bresenham) +| d5.w = cx (cached) +| a4 = cy (cached, sign-extended) +| a0..a3 = plane bases +| a5 = bitMaskLut +| d0,d1,d6,d7 = scratch in precompute / plot +| +| Scratch block (24 bytes) at sp+0..23: +| sp+0..3: xp1 record [xp_byte_w, bitMask_b, notMask_b] for cx+bx +| sp+4..7: xp2 record for cx-bx +| sp+8..11: xp3 record for cx+by +| sp+12..15: xp4 record for cx-by +| sp+16..17: yp1 word (cy+by) * 40 +| sp+18..19: yp2 word (cy-by) * 40 +| sp+20..21: yp3 word (cy+bx) * 40 +| sp+22..23: yp4 word (cy-bx) * 40 + + .text + + +| ---- XP_REC: build xp record at sp+slot for xp = cx ---- +| signOp: add or sub +| xreg: %d2 (bx) or %d3 (by) +| slot: 0, 4, 8, or 12 +| Trashes: d0, d1, d6, d7 + + .macro XP_REC slot, signOp, xreg + move.w %d5,%d6 + \signOp\().w \xreg,%d6 | d6 = xp + move.w %d6,%d7 + lsr.w #3,%d7 | d7 = xp >> 3 (xp_byte) + and.w #7,%d6 | d6 = xp & 7 + move.b (%a5,%d6.w),%d6 | d6 = bitMask + move.b %d6,%d1 + not.b %d1 | d1 = notMask + move.w %d7,\slot(%sp) | xp_byte word + move.b %d6,\slot+2(%sp) | bitMask byte + move.b %d1,\slot+3(%sp) | notMask byte + .endm + + +| ---- YP_REC: build yp40 word at sp+slot for yp = cy ---- + + .macro YP_REC slot, signOp, yreg + move.l %a4,%d6 + \signOp\().w \yreg,%d6 | d6.w = yp + move.w %d6,%d0 + lsl.w #3,%d6 | d6 = yp << 3 + lsl.w #5,%d0 | d0 = yp << 5 + add.w %d6,%d0 | d0 = yp * 40 + move.w %d0,\slot(%sp) + .endm + + +| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ---- +| slotYp: 16, 18, 20, or 22 (yp40 word slot) +| slotXp: 0, 4, 8, or 12 (xp record slot) +| color: literal 0..15 +| Trashes: d0, d1, d7 + + .macro PLOT_FIXED slotYp, slotXp, color + move.w \slotYp(%sp),%d0 | d0 = yp40 + add.w \slotXp(%sp),%d0 | d0 += xp_byte + move.b \slotXp+2(%sp),%d1 
| d1.b = bitMask + move.b \slotXp+3(%sp),%d7 | d7.b = notMask + .if ((\color) & 1) + or.b %d1,(%a0,%d0.w) + .else + and.b %d7,(%a0,%d0.w) + .endif + .if ((\color) & 2) + or.b %d1,(%a1,%d0.w) + .else + and.b %d7,(%a1,%d0.w) + .endif + .if ((\color) & 4) + or.b %d1,(%a2,%d0.w) + .else + and.b %d7,(%a2,%d0.w) + .endif + .if ((\color) & 8) + or.b %d1,(%a3,%d0.w) + .else + and.b %d7,(%a3,%d0.w) + .endif + .endm + + +| ---- PLOT_8: plot all 8 octant pixels for a given hardcoded color ---- + + .macro PLOT_8 color + PLOT_FIXED 16, 0, \color | (cx+bx, cy+by) + PLOT_FIXED 16, 4, \color | (cx-bx, cy+by) + PLOT_FIXED 18, 0, \color | (cx+bx, cy-by) + PLOT_FIXED 18, 4, \color | (cx-bx, cy-by) + PLOT_FIXED 20, 8, \color | (cx+by, cy+bx) + PLOT_FIXED 20, 12, \color | (cx-by, cy+bx) + PLOT_FIXED 22, 8, \color | (cx+by, cy-bx) + PLOT_FIXED 22, 12, \color | (cx-by, cy-bx) + .endm + + +| ---- CO_BODY: full Bresenham loop body for a hardcoded color ---- +| Generates the per-iter precompute, branchless plot, and Bresenham +| step. Uses unique labels via \color suffix. 
+ + .macro CO_BODY color + XP_REC 0, add, %d2 | xp1 = cx+bx + XP_REC 4, sub, %d2 | xp2 = cx-bx + XP_REC 8, add, %d3 | xp3 = cx+by + XP_REC 12, sub, %d3 | xp4 = cx-by + YP_REC 16, add, %d3 | yp1 = cy+by + YP_REC 18, sub, %d3 | yp2 = cy-by + YP_REC 20, add, %d2 | yp3 = cy+bx + YP_REC 22, sub, %d2 | yp4 = cy-bx + + PLOT_8 \color + + addq.w #1,%d3 + tst.w %d4 + bgt .LcoDecX_\color + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .LcoLoop_\color +.LcoDecX_\color: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .LcoLoop_\color + .endm + + +| ---- CO_LOOP_HDR: emit a labelled loop header for a color ---- + + .macro CO_LOOP_HDR color +.LcoLoop_\color: + cmp.w %d3,%d2 + bcs.w .LcoDone + CO_BODY \color + .endm + + +| ---- Function entry ---- + + .equ SP_SAVED, 44 + .equ SP_LOCAL, 24 + + .equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL) + + .equ SP_P0, SP_OFF + 0 + .equ SP_P1, SP_OFF + 4 + .equ SP_P2, SP_OFF + 8 + .equ SP_P3, SP_OFF + 12 + .equ SP_CX, SP_OFF + 16 + 2 + .equ SP_CY, SP_OFF + 20 + 2 + .equ SP_R, SP_OFF + 24 + 2 + .equ SP_COLOR, SP_OFF + 28 + 3 + + .globl _surface68kAmigaCircleOutline + +_surface68kAmigaCircleOutline: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_LOCAL(%sp),%sp + + | Plane bases. + move.l SP_P0(%sp),%a0 + move.l SP_P1(%sp),%a1 + move.l SP_P2(%sp),%a2 + move.l SP_P3(%sp),%a3 + lea bitMaskLut(%pc),%a5 + + | Cache cx in d5, cy (sign-extended) in a4. + move.w SP_CX(%sp),%d5 + move.w SP_CY(%sp),%d6 + ext.l %d6 + movea.l %d6,%a4 + + | Bresenham init. + move.w SP_R(%sp),%d2 | bx = r + moveq #0,%d3 | by = 0 + moveq #1,%d4 + sub.w %d2,%d4 | err = 1 - bx + + | Dispatch on color (low 4 bits) -> one of 16 main loops. + | Each table entry is a bra.w (4 bytes), so index *= 4. 
+ moveq #0,%d6 + move.b SP_COLOR(%sp),%d6 + and.w #0x0F,%d6 + add.w %d6,%d6 + add.w %d6,%d6 + lea .LcoTable(%pc),%a6 + jmp 0(%a6,%d6.w) + +.LcoTable: + bra.w .LcoLoop_0 + bra.w .LcoLoop_1 + bra.w .LcoLoop_2 + bra.w .LcoLoop_3 + bra.w .LcoLoop_4 + bra.w .LcoLoop_5 + bra.w .LcoLoop_6 + bra.w .LcoLoop_7 + bra.w .LcoLoop_8 + bra.w .LcoLoop_9 + bra.w .LcoLoop_10 + bra.w .LcoLoop_11 + bra.w .LcoLoop_12 + bra.w .LcoLoop_13 + bra.w .LcoLoop_14 + bra.w .LcoLoop_15 + + CO_LOOP_HDR 0 + CO_LOOP_HDR 1 + CO_LOOP_HDR 2 + CO_LOOP_HDR 3 + CO_LOOP_HDR 4 + CO_LOOP_HDR 5 + CO_LOOP_HDR 6 + CO_LOOP_HDR 7 + CO_LOOP_HDR 8 + CO_LOOP_HDR 9 + CO_LOOP_HDR 10 + CO_LOOP_HDR 11 + CO_LOOP_HDR 12 + CO_LOOP_HDR 13 + CO_LOOP_HDR 14 + CO_LOOP_HDR 15 + +.LcoDone: + lea SP_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +bitMaskLut: + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index fe6997b..c51b5a9 100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -23,8 +23,11 @@ #include #include +#include "joey/debug.h" + #include #include +#include #include #include #include @@ -43,6 +46,7 @@ #include #include "hal.h" +#include "spriteInternal.h" #include "surfaceInternal.h" #include "draw68k_inline.h" @@ -59,11 +63,31 @@ static void removeVblServer(void); #define AMIGA_BITPLANES 4 #define AMIGA_BYTES_PER_ROW 40 +#define AMIGA_PLANE_SIZE (AMIGA_BYTES_PER_ROW * SURFACE_HEIGHT) + + +// ----- Per-surface planar storage (project_planar_68k_plan) ----- +// +// Phase 1 carved out the SurfaceT.portData hook. This struct is what +// it points to on Amiga: the 4 plane base pointers + stride. For the +// stage, planes[i] aliases gPlanes[i] (Intuition-allocated, already +// in chip RAM, already on display). For non-stage surfaces planes[i] +// gets its own AllocMem(MEMF_CHIP) so the blitter can reach it. 
+// +// Direct fields rather than a union because the consumer is asm / +// inline C that wants minimal indirection in the inner loop. +typedef struct { + uint8_t *planes[AMIGA_BITPLANES]; + uint16_t bytesPerRow; // = AMIGA_BYTES_PER_ROW (40) + uint16_t bytesPerPlane; // = AMIGA_PLANE_SIZE (8000) + bool ownsPlanes; // true = AllocMem'd (free at destroy); + // false = aliased to gPlanes (don't free) +} AmigaPlanarT; // ----- Prototypes ----- +static void amigaPlanarSetPixel(AmigaPlanarT *pd, int16_t x, int16_t y, uint8_t color); static void buildCopperList(const SurfaceT *src); -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t byteStart, uint16_t byteEnd); static void dumpCopperList(void); static void installCopperList(void); static void uploadFirstBandPalette(const SurfaceT *src); @@ -83,8 +107,12 @@ static struct UCopList *gNewUCL = NULL; // built but not yet installed // demo after the initial paint) leave both alone. MrgCop + LoadView + // WaitTOF is hundreds of milliseconds on a 7 MHz 68000, so skipping // them on clean frames is a major win. -static uint8_t gCachedScb [SURFACE_HEIGHT]; -static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; +/* Long-aligned so memcmpLongs (uint32_t pointer compare) won't + * address-error on 68000. SurfaceT.scb/palette are at long-aligned + * offsets (8 and 208), so the source side is safe; these cached + * mirrors need the same property. */ +static uint8_t gCachedScb [SURFACE_HEIGHT] __attribute__((aligned(4))); +static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4))); static bool gCacheValid = false; // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow @@ -144,42 +172,10 @@ static void initC2pLut(void) { } -// Convert a range of chunky scanlines [y0, y1) to Amiga planar over -// planar-byte columns [byteStart, byteEnd). 
Per row the work is dropped -// into chunkyToPlanarRow (src/port/amiga/c2p.s) which is ~5x faster -// than the old per-pixel C inner loop GCC emits for m68k. -// -// Each planar byte corresponds to 8 horizontal pixels = 4 source bytes -// at 4bpp packed; partial-rect callers should round byteStart down and -// byteEnd up to keep the 8-pixel alignment. -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t byteStart, uint16_t byteEnd) { - const uint8_t *srcLine; - UBYTE *p0; - UBYTE *p1; - UBYTE *p2; - UBYTE *p3; - int16_t y; - uint16_t numBytes; - - if (byteStart >= byteEnd) { - return; - } - if (!gC2pLutReady) { - initC2pLut(); - } - numBytes = (uint16_t)(byteEnd - byteStart); - - for (y = y0; y < y1; y++) { - // 4 source bytes per planar byte: source-byte offset = - // byteStart * 4 within the chunky row. - srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW + byteStart * 4]; - p0 = &gPlanes[0][y * AMIGA_BYTES_PER_ROW + byteStart]; - p1 = &gPlanes[1][y * AMIGA_BYTES_PER_ROW + byteStart]; - p2 = &gPlanes[2][y * AMIGA_BYTES_PER_ROW + byteStart]; - p3 = &gPlanes[3][y * AMIGA_BYTES_PER_ROW + byteStart]; - chunkyToPlanarRow(srcLine, p0, p1, p2, p3, numBytes, gC2pLut); - } -} +// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own +// per-row chunkyToPlanarRow loop -- the only code path that still +// converts chunky to planar today, since asset loading is the only +// surface mutation that doesn't go through a planar-aware primitive.) // Build a user copper list for per-scanline palette (SCB emulation). @@ -360,16 +356,39 @@ static void dumpCopperList(void) { } +/* Long-aligned compare. SCB is 200 bytes, palette is 16*16*2 = 512 + * bytes; both gCached* are statically aligned and src->scb/palette + * sit at long-aligned offsets in SurfaceT. 
libnix memcmp walks + * byte-by-byte (~10 cyc/byte = ~7 ms for 712 bytes); a long-pointer + * inline compare drops that to ~2 ms, which dominates per-call + * overhead for tight stagePresentRect loops where there's nothing + * to actually present. Returns nonzero on first mismatch. */ +static bool memcmpLongs(const void *a, const void *b, uint16_t bytes) { + const uint32_t *pa = (const uint32_t *)a; + const uint32_t *pb = (const uint32_t *)b; + uint16_t n = (uint16_t)(bytes >> 2); + while (n > 0u) { + if (*pa != *pb) { + return true; + } + pa++; + pb++; + n--; + } + return false; +} + + // Returns true if the SCB table or palette RGB values differ from the // last presented frame, or if no frame has been presented yet. static bool paletteOrScbChanged(const SurfaceT *src) { if (!gCacheValid) { return true; } - if (memcmp(gCachedScb, src->scb, sizeof(gCachedScb)) != 0) { + if (memcmpLongs(gCachedScb, src->scb, (uint16_t)sizeof(gCachedScb))) { return true; } - if (memcmp(gCachedPalette, src->palette, sizeof(gCachedPalette)) != 0) { + if (memcmpLongs(gCachedPalette, src->palette, (uint16_t)sizeof(gCachedPalette))) { return true; } return false; @@ -380,10 +399,14 @@ static bool paletteOrScbChanged(const SurfaceT *src) { // state visible to the display differs from what the surface carries // now. On clean frames we skip the AllocMem + MrgCop + LoadView + // WaitTOF chain entirely. +static uint32_t gCopperRebuildCount = 0; +static uint32_t gPresentCallCount = 0; + static void updateCopperIfNeeded(const SurfaceT *src) { if (!paletteOrScbChanged(src)) { return; } + gCopperRebuildCount++; uploadFirstBandPalette(src); buildCopperList(src); installCopperList(); @@ -393,6 +416,15 @@ static void updateCopperIfNeeded(const SurfaceT *src) { } +/* Diag hook: callable from anywhere via the linker symbol. */ +void amigaDumpPresentCounters(const char *label) { + joeyLogF("amiga-perf: %s: present=%lu copperRebuild=%lu", + label != NULL ? 
label : "?", + (unsigned long)gPresentCallCount, + (unsigned long)gCopperRebuildCount); +} + + // Load the first band's palette into the screen's ColorMap so the // Intuition-generated frame-start copper writes those values on each // frame. This acts as a safety net: even if our user copper list does @@ -419,11 +451,50 @@ static void uploadFirstBandPalette(const SurfaceT *src) { bool halInit(const JoeyConfigT *config) { uint16_t i; + uint16_t j; (void)config; + // Allocate our own BitMap with explicitly-non-interleaved planes, + // then hand it to OpenScreen via SA_BitMap. Why not let Intuition + // build one for us: + // * AllocBitMap is V39+ and we target OCS / Kickstart 1.3 + // (V34), so we have to do this manually with InitBitMap + + // per-plane AllocMem. + // * Without SA_BitMap, OpenScreen on AmigaOS 2.x+ / AROS may + // return a BitMap with interleaved planes (single allocation, + // stride = depth * bytes_per_row). Our c2pRange and every + // planar primitive assume stride = bytes_per_row per plane; + // interleaved layout would silently corrupt with no + // diagnostic. + // * Forcing the layout also locks in chip-RAM placement (display + // DMA can only fetch from chip RAM) regardless of host + // defaults. + gBitMap = (struct BitMap *)AllocMem((ULONG)sizeof(struct BitMap), + (ULONG)(MEMF_PUBLIC | MEMF_CLEAR)); + if (gBitMap == NULL) { + return false; + } + InitBitMap(gBitMap, (LONG)AMIGA_BITPLANES, (LONG)SURFACE_WIDTH, (LONG)SURFACE_HEIGHT); + for (i = 0; i < AMIGA_BITPLANES; i++) { + gPlanes[i] = (UBYTE *)AllocMem((ULONG)AMIGA_PLANE_SIZE, + (ULONG)(MEMF_CHIP | MEMF_CLEAR)); + if (gPlanes[i] == NULL) { + for (j = 0; j < i; j++) { + FreeMem(gPlanes[j], (ULONG)AMIGA_PLANE_SIZE); + gPlanes[j] = NULL; + } + FreeMem(gBitMap, (ULONG)sizeof(struct BitMap)); + gBitMap = NULL; + return false; + } + gBitMap->Planes[i] = gPlanes[i]; + } + // SA_DisplayID pins us to OCS PAL low-res so Intuition opens a - // real planar screen rather than an RTG substitute. 
+ // real planar screen rather than an RTG substitute. SA_BitMap + // makes Intuition use OUR pre-allocated planes; CloseScreen will + // not free them -- our halShutdown does. gScreen = OpenScreenTags(NULL, (ULONG)SA_Width, (ULONG)SURFACE_WIDTH, (ULONG)SA_Height, (ULONG)SURFACE_HEIGHT, @@ -434,19 +505,19 @@ bool halInit(const JoeyConfigT *config) { (ULONG)SA_Title, (ULONG)"JoeyLib", (ULONG)SA_Type, (ULONG)CUSTOMSCREEN, (ULONG)SA_Quiet, (ULONG)TRUE, + (ULONG)SA_BitMap, (ULONG)gBitMap, TAG_DONE); if (gScreen == NULL) { - return false; - } - gBitMap = gScreen->RastPort.BitMap; - for (i = 0; i < AMIGA_BITPLANES; i++) { - gPlanes[i] = gBitMap->Planes[i]; - if (gPlanes[i] == NULL) { - CloseScreen(gScreen); - gScreen = NULL; - return false; + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (gPlanes[i] != NULL) { + FreeMem(gPlanes[i], (ULONG)AMIGA_PLANE_SIZE); + gPlanes[i] = NULL; + } } + FreeMem(gBitMap, (ULONG)sizeof(struct BitMap)); + gBitMap = NULL; + return false; } // Force COLOR00 to black so the overscan/border region around the // 320x200 display is black until the app's palette load takes over @@ -464,51 +535,128 @@ const char *halLastError(void) { } -void halPresent(const SurfaceT *src) { - int16_t y; - uint8_t minWord; - uint8_t maxWord; - uint16_t byteStart; - uint16_t byteEnd; +// Phase 9 switch flip: present is now always a per-row planar memcpy +// from the off-screen shadow planes (where every drawing primitive +// dual-writes today) into gPlanes[] (the displayed BitMap). c2p is +// gone; the chunky `s->pixels` shadow is still maintained by the +// halFast* primitives but no longer drives display. Phase 10 will +// either (a) BPLPTR-swap shadow <-> display planes (zero-copy) or +// (b) stop writing chunky in the fast paths to recover the dual- +// write cost. Per-row dirty tracking is reused: only dirty bands +// memcpy. +/* Helper: copy a rect (firstRow..lastRow inclusive) from each shadow + * plane into the displayed plane. 
One bounding-box copy per plane
 * covers the whole dirty band.  (NOTE(review): an earlier revision
 * used exec CopyMemQuick here; the FAT path below now calls memcpy,
 * which tolerates any alignment.)  Long alignment is still required
 * regardless: the THIN path stores raw uint32_t through casted
 * pointers, and a 68000 address-errors on a misaligned long access.
 * byteStart rounds DOWN to long boundary; copyLen rounds UP past the
 * right edge.  Over-copies up to 3 bytes per side, which stays
 * inside the plane buffer (AMIGA_PLANE_SIZE = 8000). */
+static void amigaPresentRectInner(AmigaPlanarT *pd, int16_t firstRow, int16_t lastRow,
+                                  uint16_t byteStart, uint16_t bytesPerRow) {
+    uint16_t alignedStart;
+    uint16_t alignedEnd;
+    uint16_t alignedBytesPerRow;
+    uint16_t offset;
+    uint16_t copyLen;
+    uint16_t i;
+    int16_t  row;
+
+    alignedStart       = (uint16_t)(byteStart & ~3u);
+    alignedEnd         = (uint16_t)((byteStart + bytesPerRow + 3u) & ~3u);
+    alignedBytesPerRow = (uint16_t)(alignedEnd - alignedStart);
+
+    /* THIN path: rect is narrow enough that the bounding-box memcpy
+     * would over-copy.  e.g. an 8x8 rect at x=40 has 32 bytes of
+     * actual data vs 284 bytes of bounding box (9x over-copy).  For
+     * narrow rects, per-row inline long-copy beats memcpy because
+     * memcpy has function-call dispatch per call AND we only need a
+     * couple of long stores per row.  Threshold tuned so
+     * stagePresentRect 8b lands here, stagePresentRect F doesn't. */
+    if (alignedBytesPerRow <= (AMIGA_BYTES_PER_ROW / 2u)) {
+        offset = (uint16_t)((uint16_t)firstRow * AMIGA_BYTES_PER_ROW + alignedStart);
+        for (i = 0; i < AMIGA_BITPLANES; i++) {
+            uint8_t *src = pd->planes[i] + offset;
+            uint8_t *dst = gPlanes[i] + offset;
+            for (row = firstRow; row <= lastRow; row++) {
+                uint32_t       *d32 = (uint32_t *)dst;
+                const uint32_t *s32 = (const uint32_t *)src;
+                uint16_t        n   = (uint16_t)(alignedBytesPerRow >> 2);
+                do {
+                    *d32++ = *s32++;
+                } while (--n);
+                src += AMIGA_BYTES_PER_ROW;
+                dst += AMIGA_BYTES_PER_ROW;
+            }
+        }
+        return;
+    }
+
+    /* FAT path: bounding-box memcpy.
Single libc call per plane; + * over-copies the L/R margins of inner rows but that overhead + * amortizes when the rect spans a large fraction of the row. */ + offset = (uint16_t)((uint16_t)firstRow * AMIGA_BYTES_PER_ROW + alignedStart); + copyLen = (uint16_t)((uint16_t)(lastRow - firstRow) * AMIGA_BYTES_PER_ROW + + alignedBytesPerRow); + for (i = 0; i < AMIGA_BITPLANES; i++) { + memcpy(gPlanes[i] + offset, pd->planes[i] + offset, copyLen); + } +} + + +void halPresent(const SurfaceT *src) { + AmigaPlanarT *pd; + int16_t y; + int16_t firstRow; + int16_t lastRow; + uint8_t minWord; + uint8_t maxWord; + uint8_t unionMinWord; + uint8_t unionMaxWord; + uint16_t byteStart; + uint16_t bytesPerRow; + + gPresentCallCount++; if (src == NULL || gScreen == NULL) { return; } updateCopperIfNeeded(src); - // Walk per-row dirty bands: each planar byte covers 8 px = 2 chunky - // words, so byteStart = minWord/2 and byteEnd = maxWord/2 + 1 - // converts dirty-word units to the planar-byte units c2pRange wants. + pd = (AmigaPlanarT *)src->portData; + if (pd == NULL) { + return; + } + + /* Reduce the per-row dirty bands to a bounding box. */ + firstRow = -1; + lastRow = -1; + unionMinWord = 0xFFu; + unionMaxWord = 0u; for (y = 0; y < SURFACE_HEIGHT; y++) { minWord = gStageMinWord[y]; maxWord = gStageMaxWord[y]; if (minWord > maxWord) { continue; } - byteStart = (uint16_t)(minWord >> 1); - byteEnd = (uint16_t)((maxWord >> 1) + 1); - c2pRange(src, y, (int16_t)(y + 1), byteStart, byteEnd); + if (firstRow < 0) { + firstRow = y; + } + lastRow = y; + if (minWord < unionMinWord) { + unionMinWord = minWord; + } + if (maxWord > unionMaxWord) { + unionMaxWord = maxWord; + } } -} - - -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - uint16_t byteStart; - uint16_t byteEnd; - - if (src == NULL || gScreen == NULL) { + if (firstRow < 0) { return; } - updateCopperIfNeeded(src); - // Each planar byte covers 8 horizontal pixels. 
Round dirty pixel - // range to the enclosing planar-byte range so we never miss an - // edge pixel while still honoring the rect width. - byteStart = (uint16_t)(x >> 3); - byteEnd = (uint16_t)(((uint16_t)x + w + 7) >> 3); - if (byteEnd > AMIGA_BYTES_PER_ROW) { - byteEnd = AMIGA_BYTES_PER_ROW; - } - c2pRange(src, y, y + (int16_t)h, byteStart, byteEnd); + + /* Each planar byte covers 8 px = 2 chunky words. */ + byteStart = (uint16_t)(unionMinWord >> 1); + bytesPerRow = (uint16_t)(((unionMaxWord >> 1) + 1u) - byteStart); + amigaPresentRectInner(pd, firstRow, lastRow, byteStart, bytesPerRow); } @@ -581,6 +729,9 @@ uint16_t halFrameHz(void) { void halShutdown(void) { + uint16_t i; + + amigaDumpPresentCounters("halShutdown"); // Tear down the VBL server before closing the screen so the // interrupt chain is clean if anything else is watching. removeVblServer(); @@ -594,6 +745,19 @@ void halShutdown(void) { Permit(); CloseScreen(gScreen); gScreen = NULL; + } + // We allocated the BitMap and its planes manually (see halInit) + // so SA_BitMap could pin Intuition to non-interleaved layout. + // CloseScreen with an SA_BitMap'd screen does NOT free our + // BitMap or planes -- we own them and must clean up here. 
+    if (gBitMap != NULL) {
+        for (i = 0; i < AMIGA_BITPLANES; i++) {
+            if (gPlanes[i] != NULL) {
+                FreeMem(gPlanes[i], (ULONG)AMIGA_PLANE_SIZE);
+                gPlanes[i] = NULL;
+            }
+        }
+        FreeMem(gBitMap, (ULONG)sizeof(struct BitMap));
+        gBitMap = NULL;
+    }
     if (gNewUCL != NULL) {
@@ -609,58 +773,795 @@
 extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
 extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte);
 extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte);
+extern void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
+                                      uint16_t numMid, uint8_t leftMask, uint8_t rightMask,
+                                      uint8_t fb0, uint8_t fb1, uint8_t fb2, uint8_t fb3);
+extern void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
+                                         uint16_t cx, uint16_t cy, uint16_t r, uint8_t color);
+// (Phase 3's planar dual-write -- bitplanes updated alongside the
+// chunky shadow -- is superseded by the Phase 9 model below.)
+// Phase 9: Amiga is pure planar.  Every halFast* below returns true
+// to suppress the cross-platform chunky fallback path -- there is no
+// chunky shadow on Amiga (s->pixels is NULL post-Phase 9).  The actual
+// planar work happens in dedicated halXxxPlanes hooks called by
+// cross-platform code AFTER each halFast*.
+//
+// halFastSurfaceClear inlines the planar fill (no separate hook).
+// All other halFast* are short-circuit stubs that return true.
 bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
+    AmigaPlanarT *pd;
+    uint8_t       colorIndex;
+    uint16_t      i;
+    uint8_t       planeByte;
+
     if (s != stageGet()) {
         return false;
     }
-    surface68kClearLong(s->pixels, (uint16_t)doubled);
+    colorIndex = (uint8_t)(doubled & 0x0Fu);
+    pd = (AmigaPlanarT *)s->portData;
+    if (pd != NULL) {
+        for (i = 0; i < AMIGA_BITPLANES; i++) {
+            planeByte = ((colorIndex >> i) & 1u) ?
0xFFu : 0x00u; + memset(pd->planes[i], planeByte, AMIGA_PLANE_SIZE); + } + } return true; } -// Fast path bands: -// - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per -// row via surface68kFillRectFull. No nibble fixups needed -- both -// nibbles in every byte get the same value, and rowFirst is the -// surface base which is always word-aligned by calloc. -// - x % 4 == 0 && w even (byte-aligned AND word-aligned): inner -// bytes via the asm. The (x % 4 == 0) part is the 68000 alignment -// requirement for the move.l writes inside the asm -- byte index -// = x/2, so x must be a multiple of 4 for the byte index to be -// even. -// - everything else: fall through to C's fillRectClipped, which -// does per-byte writes (no alignment needed) and handles the -// leading / trailing nibble RMW correctly. bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - uint8_t doubled; - + /* Pure short-circuit: halFillRectPlanes (called by cross-platform + * fillRect after this) does the actual planar fill with full + * partial-byte mask handling. We just claim ownership so the C + * chunky fallback never runs. */ + (void)x; (void)y; (void)w; (void)h; (void)colorIndex; if (s != stageGet()) { return false; } - if (h == 0u || w == 0u) { - return true; /* clipped-out: nothing to do, but we "handled" it */ - } - doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu)); - - if (x == 0 && w == (uint16_t)SURFACE_WIDTH) { - surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled); - return true; - } - if (((x & 3) == 0) && ((w & 1u) == 0u)) { - uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; - surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled); - return true; - } - return false; + return true; } +// Phase 3 planar dual-write for fillRect: writes the four off-screen +// shadow plane buffers alongside the chunky shadow. 
Caller (cross- +// platform fillRect) has already done the chunky write via +// halFastFillRect or fillRectClipped. The shadow planes are off- +// screen so this is invisible until stagePresent. +// +// Layout reminder (see docs/amiga_planar.md): each plane byte covers +// 8 horizontal pixels; bit 7 = leftmost pixel of that byte. So a +// rect clipped to [x, x+w) needs: +// * bytes [x/8 .. (x+w-1)/8] in each plane row +// * leading partial byte if (x % 8) != 0 (only bits [7-x%8 .. 0] +// get touched -- the upper bits stay) +// * trailing partial byte if ((x+w-1) % 8) != 7 (only bits [7 .. +// 7-(x+w-1)%8] get touched) +// * single-byte case (byteFirst == byteLast) collapses to one +// read-modify-write with a combined mask. +// For each plane, the bit value at every pixel in the rect is +// constant: (colorIndex >> plane) & 1. Set bit -> OR with mask; +// clear bit -> AND with ~mask. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + AmigaPlanarT *pd; + uint16_t byteFirst; + uint16_t byteLast; + uint16_t numMid; + uint8_t leftMask; + uint8_t rightMask; + uint16_t plane; + int16_t row; + int16_t yEnd; + uint8_t bitVal; + uint8_t fullByte; + uint8_t *p; + uint8_t *planeBase; + + if (s == NULL || w == 0u || h == 0u) { + return; + } + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + + /* Variable shifts on 68000 cost 8 cyc per bit shifted -- for shifts + * by up to 7 that's ~30 cyc per mask, ~60 cyc both. LUTs compile + * to a single byte load. fillCircle r=40 calls halFillRectPlanes + * 160 times, so this saves ~160*60=9600 cyc per fillCircle. 
*/ + static const uint8_t kLeftMaskLut[8] = { + 0xFFu, 0x7Fu, 0x3Fu, 0x1Fu, 0x0Fu, 0x07u, 0x03u, 0x01u + }; + static const uint8_t kRightMaskLut[8] = { + 0x80u, 0xC0u, 0xE0u, 0xF0u, 0xF8u, 0xFCu, 0xFEu, 0xFFu + }; + + byteFirst = (uint16_t)((uint16_t)x >> 3); + byteLast = (uint16_t)(((uint16_t)x + w - 1u) >> 3); + + leftMask = kLeftMaskLut [(uint16_t)x & 7u]; + rightMask = kRightMaskLut[((uint16_t)x + w - 1u) & 7u]; + + yEnd = y + (int16_t)h; + + /* Full-row fast path: no partial-byte RMW on either edge, so each + * plane is a pure long-fill of (h * 40) bytes. fillRect 320x200 + * is the dominant case and lands here; saves 200 rows of leading + * byte RMW + 200 of trailing byte RMW per plane = 1600 chip-bus + * read+write cycles per plane on top of the actual data write. */ + if (byteFirst == 0u && byteLast == (uint16_t)(AMIGA_BYTES_PER_ROW - 1u)) { + uint16_t totalLongs; + uint16_t groups; + uint16_t tail; + uint32_t fillLong; + uint32_t *p32; + + /* AMIGA_BYTES_PER_ROW = 40 = 10 longs / row. 8x-unrolled inner + * loop amortizes the dec+bne to ~2 cyc per store; net ~12 cyc + * per long including chip-bus contention. Tail handles the + * 0..7 longs that don't fit a full group. */ + totalLongs = (uint16_t)((uint16_t)h * (AMIGA_BYTES_PER_ROW / 4u)); + groups = (uint16_t)(totalLongs >> 3); + tail = (uint16_t)(totalLongs & 7u); + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 
0xFFu : 0x00u; + fillLong = (uint32_t)fullByte * 0x01010101UL; + p32 = (uint32_t *)(pd->planes[plane] + + (uint16_t)y * AMIGA_BYTES_PER_ROW); + if (groups > 0u) { + uint16_t n = groups; + do { + p32[0] = fillLong; + p32[1] = fillLong; + p32[2] = fillLong; + p32[3] = fillLong; + p32[4] = fillLong; + p32[5] = fillLong; + p32[6] = fillLong; + p32[7] = fillLong; + p32 += 8; + } while (--n); + } + { + uint16_t t = tail; + while (t > 0u) { + *p32++ = fillLong; + t--; + } + } + } + return; + } + + /* Byte-aligned partial-row fast path: when both edges are full + * bytes (leftMask == rightMask == 0xFF) every byte in the row is + * a full overwrite -- no RMW needed. UBER fillRect 80x80 at x=120 + * lands here (byteFirst=15, byteLast=24). Plane bases are + * MEMF_FAST-allocated long-aligned, and y*40 is also a multiple + * of 4, so rowP alignment is determined by byteFirst alone -- + * computed once, not per-row. */ + if (leftMask == 0xFFu && rightMask == 0xFFu) { + uint16_t nBytes = (uint16_t)(byteLast - byteFirst + 1u); + uint8_t alignBytes = (uint8_t)((4u - (byteFirst & 3u)) & 3u); + uint16_t midBytes; + uint16_t midLongs; + uint16_t tailBytes; + + if (alignBytes > nBytes) { + alignBytes = (uint8_t)nBytes; + } + midBytes = (uint16_t)(nBytes - alignBytes); + midLongs = (uint16_t)(midBytes >> 2); + tailBytes = (uint16_t)(midBytes & 3u); + + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 
0xFFu : 0x00u; + uint32_t fillLong = (uint32_t)fullByte * 0x01010101UL; + planeBase = pd->planes[plane]; + uint8_t *rowP = planeBase + + (uint16_t)y * AMIGA_BYTES_PER_ROW + + byteFirst; + for (row = y; row < yEnd; row++) { + uint8_t *pp = rowP; + uint8_t ab = alignBytes; + uint16_t ml = midLongs; + uint16_t tb = tailBytes; + while (ab > 0u) { + *pp++ = fullByte; + ab--; + } + if (ml > 0u) { + uint32_t *p32 = (uint32_t *)pp; + do { + *p32++ = fillLong; + } while (--ml); + pp = (uint8_t *)p32; + } + while (tb > 0u) { + *pp++ = fullByte; + tb--; + } + rowP += AMIGA_BYTES_PER_ROW; + } + } + return; + } + + /* Hoist bitVal-dependent setup outside the row loop. Two + * specialized per-plane paths (OR for set, AND-NOT for clear) + * give gcc -O2 simple branchless inner loops. Row pointer is + * advanced by += AMIGA_BYTES_PER_ROW instead of recomputed per + * row -- saves the per-iter multiply. + * + * Single-byte case (byteFirst == byteLast) uses the combined + * mask; multi-byte case uses leading + middle long-fill + + * trailing. The middle long-fill path is identical to the + * earlier code (align, long stores, drain) but lifted into the + * per-plane scope so the constants are loop-invariant. 
*/ + { + uint8_t notLeftMask = (uint8_t)~leftMask; + uint8_t notRightMask = (uint8_t)~rightMask; + + if (byteFirst == byteLast) { + uint8_t mask = (uint8_t)(leftMask & rightMask); + uint8_t notMask = (uint8_t)~mask; + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + planeBase = pd->planes[plane]; + p = planeBase + (uint16_t)y * AMIGA_BYTES_PER_ROW + byteFirst; + if ((colorIndex >> plane) & 1u) { + /* OR path */ + for (row = y; row < yEnd; row++) { + *p = (uint8_t)(*p | mask); + p += AMIGA_BYTES_PER_ROW; + } + } else { + /* AND-NOT path */ + for (row = y; row < yEnd; row++) { + *p = (uint8_t)(*p & notMask); + p += AMIGA_BYTES_PER_ROW; + } + } + } + return; + } + + numMid = (uint16_t)(byteLast - byteFirst - 1u); + + /* Hoist middle-region alignment outside both per-plane and + * per-row loops. midStart = planeBase + y*40 + byteFirst + 1. + * Plane bases are MEMF_FAST long-aligned and y*40 is a + * multiple of 4, so midStart's alignment is determined by + * (byteFirst + 1) & 3 alone -- constant across planes/rows. */ + uint8_t midAlignBytes = (uint8_t)((4u - ((byteFirst + 1u) & 3u)) & 3u); + uint16_t midRem; + uint16_t midLongs; + uint16_t midTail; + + if (midAlignBytes > numMid) { + midAlignBytes = (uint8_t)numMid; + } + midRem = (uint16_t)(numMid - midAlignBytes); + midLongs = (uint16_t)(midRem >> 2); + midTail = (uint16_t)(midRem & 3u); + + /* Small-numMid byte-only path: when there are no full longs to + * fill, the unified long-fill machinery's runtime ml/tb checks + * cost more than they save. UBER fillRect 16x16 (numMid=1) + * lands here. */ + if (midLongs == 0u) { + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 0xFFu : 0x00u; + uint8_t leadBits = (uint8_t)(bitVal ? leftMask : notLeftMask); + uint8_t trailBits = (uint8_t)(bitVal ? 
rightMask : notRightMask); + planeBase = pd->planes[plane]; + p = planeBase + (uint16_t)y * AMIGA_BYTES_PER_ROW + byteFirst; + + if (bitVal) { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint16_t m = numMid; + *pp = (uint8_t)(*pp | leadBits); pp++; + while (m > 0u) { *pp++ = fullByte; m--; } + *pp = (uint8_t)(*pp | trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } else { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint16_t m = numMid; + *pp = (uint8_t)(*pp & leadBits); pp++; + while (m > 0u) { *pp++ = fullByte; m--; } + *pp = (uint8_t)(*pp & trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } + } + return; + } + + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 0xFFu : 0x00u; + uint32_t fillLong = (uint32_t)fullByte * 0x01010101UL; + uint8_t leadBits = (uint8_t)(bitVal ? leftMask : notLeftMask); + uint8_t trailBits= (uint8_t)(bitVal ? rightMask : notRightMask); + planeBase = pd->planes[plane]; + p = planeBase + (uint16_t)y * AMIGA_BYTES_PER_ROW + byteFirst; + + if (bitVal) { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint8_t ab = midAlignBytes; + uint16_t ml = midLongs; + uint16_t tb = midTail; + *pp = (uint8_t)(*pp | leadBits); pp++; + while (ab > 0u) { *pp++ = fullByte; ab--; } + { + uint32_t *p32 = (uint32_t *)pp; + do { *p32++ = fillLong; } while (--ml); + pp = (uint8_t *)p32; + } + while (tb > 0u) { *pp++ = fullByte; tb--; } + *pp = (uint8_t)(*pp | trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } else { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint8_t ab = midAlignBytes; + uint16_t ml = midLongs; + uint16_t tb = midTail; + *pp = (uint8_t)(*pp & leadBits); pp++; + while (ab > 0u) { *pp++ = fullByte; ab--; } + { + uint32_t *p32 = (uint32_t *)pp; + do { *p32++ = fillLong; } while (--ml); + pp = (uint8_t *)p32; + } + while (tb > 0u) { *pp++ = fullByte; tb--; } + *pp = (uint8_t)(*pp & trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } + } + 
} +} + + +// Phase 5 planar dual-write for tile ops, fully planar after Phase 9 +// dropped the chunky shadow. All tiles are 8-pixel aligned (8x8 blocks +// at multiples of 8), so plane writes are byte-aligned -- one plane +// byte per row, 8 rows per tile, no edge masks. Stride between rows +// in a plane is AMIGA_BYTES_PER_ROW (40). + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + AmigaPlanarT *pd; + uint16_t plane; + uint8_t fillByte; + uint8_t *p; + uint8_t row; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + fillByte = ((colorIndex >> plane) & 1u) ? 0xFFu : 0x00u; + p = pd->planes[plane] + (uint16_t)by * 8u * AMIGA_BYTES_PER_ROW + bx; + for (row = 0; row < 8u; row++) { + *p = fillByte; + p += AMIGA_BYTES_PER_ROW; + } + } +} + + +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + AmigaPlanarT *dstPd; + AmigaPlanarT *srcPd; + uint16_t plane; + uint8_t *dp; + const uint8_t *sp; + uint8_t row; + + dstPd = (AmigaPlanarT *)dst->portData; + srcPd = (AmigaPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + dp = dstPd->planes[plane] + (uint16_t)dstBy * 8u * AMIGA_BYTES_PER_ROW + dstBx; + sp = srcPd->planes[plane] + (uint16_t)srcBy * 8u * AMIGA_BYTES_PER_ROW + srcBx; + for (row = 0; row < 8u; row++) { + *dp = *sp; + dp += AMIGA_BYTES_PER_ROW; + sp += AMIGA_BYTES_PER_ROW; + } + } +} + + +// Pure-planar masked copy. For each row of the 8x8 tile, read 4 src +// plane bytes; compute a per-pixel "non-transparent" mask via XOR +// against the transparent index's per-plane bit pattern (a pixel +// matches transparent IFF all 4 plane bits match transparent's 4 +// bits = OR of 4 XOR'd bytes is 0 in that bit). 
Then for each plane, +// dst = (dst & ~mask) | (src & mask) -- copy src bits at mask-set +// positions, preserve dst bits elsewhere. +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + AmigaPlanarT *dstPd; + AmigaPlanarT *srcPd; + uint8_t transparent; + uint8_t transBitByte[AMIGA_BITPLANES]; + uint16_t i; + uint8_t row; + uint16_t srcByteOff; + uint16_t dstByteOff; + uint8_t srcPlaneBytes[AMIGA_BITPLANES]; + uint8_t maskByte; + + dstPd = (AmigaPlanarT *)dst->portData; + srcPd = (AmigaPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + transparent = (uint8_t)(transparentIndex & 0x0Fu); + /* Per-plane "all bits set if transparent's bit at this plane is 1 + * else all 0" -- so XOR gives bit set where pixel differs from + * transparent in that plane. */ + for (i = 0; i < AMIGA_BITPLANES; i++) { + transBitByte[i] = ((transparent >> i) & 1u) ? 0xFFu : 0x00u; + } + + for (row = 0; row < 8u; row++) { + srcByteOff = (uint16_t)((uint16_t)srcBy * 8u + row) * AMIGA_BYTES_PER_ROW + srcBx; + dstByteOff = (uint16_t)((uint16_t)dstBy * 8u + row) * AMIGA_BYTES_PER_ROW + dstBx; + srcPlaneBytes[0] = srcPd->planes[0][srcByteOff]; + srcPlaneBytes[1] = srcPd->planes[1][srcByteOff]; + srcPlaneBytes[2] = srcPd->planes[2][srcByteOff]; + srcPlaneBytes[3] = srcPd->planes[3][srcByteOff]; + /* maskByte: bit set where pixel differs from transparent in + * ANY plane -- i.e., where the pixel is non-transparent. */ + maskByte = (uint8_t)((srcPlaneBytes[0] ^ transBitByte[0]) + | (srcPlaneBytes[1] ^ transBitByte[1]) + | (srcPlaneBytes[2] ^ transBitByte[2]) + | (srcPlaneBytes[3] ^ transBitByte[3])); + for (i = 0; i < AMIGA_BITPLANES; i++) { + uint8_t existing = dstPd->planes[i][dstByteOff]; + dstPd->planes[i][dstByteOff] = (uint8_t)((existing & (uint8_t)~maskByte) + | (srcPlaneBytes[i] & maskByte)); + } + } +} + + +// Phase 8 planar dual-write for asset blits. 
Walks the asset's +// chunky pixel buffer in the already-clipped (srcX0, srcY0).. +// (srcX0+copyW, srcY0+copyH) range and sets dst plane bits per +// pixel via amigaPlanarSetPixel. transparent == 0xFFFF means opaque +// (no skip); any nibble value 0..15 means skip that color. Asset +// row stride is srcRowBytes (asset width may be < SURFACE_WIDTH). +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + AmigaPlanarT *pd; + int16_t row; + int16_t col; + int16_t sx; + const uint8_t *srcRow; + uint8_t byte; + uint8_t nibble; + bool hasMask; + uint8_t transNibble; + + pd = (AmigaPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + hasMask = (transparent <= 0x0Fu); + transNibble = (uint8_t)(transparent & 0x0Fu); + + for (row = 0; row < copyH; row++) { + srcRow = &srcBytes[(uint16_t)(srcY0 + row) * (uint16_t)srcRowBytes]; + for (col = 0; col < copyW; col++) { + sx = (int16_t)(srcX0 + col); + byte = srcRow[sx >> 1]; + nibble = (sx & 1) ? (uint8_t)(byte & 0x0Fu) : (uint8_t)(byte >> 4); + if (hasMask && nibble == transNibble) { + continue; + } + amigaPlanarSetPixel(pd, (int16_t)(x + col), (int16_t)(y + row), nibble); + } + } +} + + +// Phase 9 sprite save/restore plane-backup hooks. +// Sprite save at (x, y, w, h) writes 4 plane stripes into backup +// buffer; restore reads them back. x and w are 2-pixel aligned by +// cross-platform code; we round x DOWN and w UP to 8-pixel boundaries +// here so plane writes are byte-aligned. Backup layout (matches the +// h * w/2 = 4 * h * w/8 sizing the cross-platform code allocates): +// bytes [0 .. h*bpr ): plane 0 rows +// bytes [h*bpr .. 2*h*bpr ): plane 1 rows +// bytes [2*h*bpr .. 3*h*bpr ): plane 2 rows +// bytes [3*h*bpr .. 4*h*bpr ): plane 3 rows +// where bpr = bytesPerPlaneRow = roundedW/8. 
+// +// If the rect's rounded width is wider than the chunky-sized backup +// would hold (h * (w/2) bytes), we silently truncate -- the planar +// stripes for partial-byte-aligned sprites won't fit. This case is +// rare for tile-aligned sprites; document if it bites. + +static void amigaSpriteRoundRect(int16_t *xp, uint16_t *wp, uint16_t *bprp) { + int16_t xIn = *xp; + uint16_t wIn = *wp; + int16_t xOut = (int16_t)(xIn & ~7); /* round down to 8-pixel */ + uint16_t span = (uint16_t)(((uint16_t)xIn + wIn) - (uint16_t)xOut); + uint16_t wOut = (uint16_t)((span + 7u) & ~7u); + *xp = xOut; + *wp = wOut; + *bprp = (uint16_t)(wOut >> 3); +} + + +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { + AmigaPlanarT *pd; + uint16_t bpr; + uint16_t planeStripe; + uint16_t i; + uint16_t row; + uint8_t *dst; + const uint8_t *src; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL || dstPlaneBytes == NULL) { + return; + } + amigaSpriteRoundRect(&x, &w, &bpr); + planeStripe = (uint16_t)((uint16_t)h * bpr); + for (i = 0; i < AMIGA_BITPLANES; i++) { + dst = dstPlaneBytes + i * planeStripe; + for (row = 0; row < (uint16_t)h; row++) { + src = pd->planes[i] + ((uint16_t)y + row) * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3); + memcpy(dst, src, bpr); + dst += bpr; + } + } +} + + +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + AmigaPlanarT *pd; + uint16_t bpr; + uint16_t planeStripe; + uint16_t i; + uint16_t row; + uint8_t *dst; + const uint8_t *src; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL || srcPlaneBytes == NULL) { + return; + } + amigaSpriteRoundRect(&x, &w, &bpr); + planeStripe = (uint16_t)((uint16_t)h * bpr); + for (i = 0; i < AMIGA_BITPLANES; i++) { + src = srcPlaneBytes + i * planeStripe; + for (row = 0; row < (uint16_t)h; row++) { + dst = pd->planes[i] + ((uint16_t)y + row) * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3); + memcpy(dst, src, 
bpr); + src += bpr; + } + } +} + + +/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes + * from a freshly-loaded chunky pixel buffer (s->pixels). */ +static void amigaPopulatePlanesFromChunky(SurfaceT *s) { + AmigaPlanarT *pd; + int16_t y; + const uint8_t *srcLine; + UBYTE *p0; + UBYTE *p1; + UBYTE *p2; + UBYTE *p3; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + if (!gC2pLutReady) { + initC2pLut(); + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + srcLine = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut); + } +} + + +// Phase 6 planar dual-write for sprite draw. Walks the sprite's +// chunky tile data with the same clipping the cross-platform code +// applies, calling amigaPlanarSetPixel for every non-transparent +// pixel (nibble != 0). Bypasses the compiled fast path entirely -- +// the chunky compiled draw already ran by the time we get here, so +// we just mirror its pixel set into the planes. +// +// Sprite tile data layout: tileData = wTiles * hTiles tiles, each +// tile = 8 rows x 4 chunky bytes. Tiles laid out row-major. +// For pixel (px, py) within the sprite: +// tileX = px / 8, tileY = py / 8 +// inTileX = px % 8, inTileY = py % 8 +// tileBase = tileData + (tileY * wTiles + tileX) * 32 +// byte = tileBase[inTileY * 4 + inTileX/2] +// nibble = (inTileX & 1) ? byte & 0x0F : byte >> 4 +// +// Save/restore have no equivalent planar dual-write yet (the +// SpriteBackupT only has chunky storage); workable approach needs +// a parallel plane-data buffer. Apps that depend on PLANAR_PRESENT +// save/restore semantics will see stale planes after restore. 
+void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + AmigaPlanarT *pd; + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t w; + int16_t h; + int16_t row; + int16_t col; + int16_t spritePx; + int16_t spritePy; + int16_t tileX; + int16_t tileY; + int16_t inTileX; + int16_t inTileY; + uint16_t wTiles; + const uint8_t *tile; + uint8_t byte; + uint8_t nibble; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + wTiles = sp->widthTiles; + w = (int16_t)(wTiles * 8); + h = (int16_t)(sp->heightTiles * 8); + dx = x; + dy = y; + + /* Clip dst rect against surface. sx/sy track the offset INTO the + * sprite that the clipped region starts at. Same logic as + * sprite.c:clipRect() but inlined since that helper is static. */ + sx = 0; + sy = 0; + if (dx < 0) { sx = (int16_t)(-dx); w = (int16_t)(w - sx); dx = 0; } + if (dy < 0) { sy = (int16_t)(-dy); h = (int16_t)(h - sy); dy = 0; } + if (dx >= SURFACE_WIDTH || dy >= SURFACE_HEIGHT || w <= 0 || h <= 0) { + return; + } + if (dx + w > SURFACE_WIDTH) { w = (int16_t)(SURFACE_WIDTH - dx); } + if (dy + h > SURFACE_HEIGHT) { h = (int16_t)(SURFACE_HEIGHT - dy); } + + for (row = 0; row < h; row++) { + spritePy = (int16_t)(sy + row); + tileY = (int16_t)(spritePy >> 3); /* / 8 */ + inTileY = (int16_t)(spritePy & 7); + for (col = 0; col < w; col++) { + spritePx = (int16_t)(sx + col); + tileX = (int16_t)(spritePx >> 3); + inTileX = (int16_t)(spritePx & 7); + tile = sp->tileData + (uint32_t)((tileY * wTiles + tileX) * 32); + byte = tile[inTileY * 4 + (inTileX >> 1)]; + nibble = (inTileX & 1) ? (uint8_t)(byte & 0x0Fu) : (uint8_t)(byte >> 4); + if (nibble != 0u) { + amigaPlanarSetPixel(pd, (int16_t)(dx + col), (int16_t)(dy + row), nibble); + } + } + } +} + + +// Phase 9 plane-to-chunky derivation for tileSnap. 
Reads 8 plane +// bytes (1 byte per row x 8 rows) from each of 4 planes for the +// 8-pixel-aligned tile column at bx, then assembles 32 chunky bytes +// (4 per row x 8 rows, packed 2 px/byte high-then-low nibble) into +// On Amiga, TileT.pixels is opaque port-specific storage (cross- +// platform tile.c never reads it directly when s->pixels is NULL). +// We use those 32 bytes as 4 planes x 8 rows, plane-major: +// bytes [0..7] = plane 0, rows 0..7 +// bytes [8..15] = plane 1, rows 0..7 +// bytes [16..23] = plane 2, rows 0..7 +// bytes [24..31] = plane 3, rows 0..7 +// snap/paste then become 32 plain byte loads + stores -- no chunky +// <-> planar conversion at all. The previous c2p-based path paid +// 4 KB LUT lookups + bit shuffling per pixel; this is ~50x cheaper. +#define AMIGA_TILE_PLANE_STRIDE 8 +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *tileOut) { + AmigaPlanarT *pd; + uint16_t row; + uint16_t rowBase; + uint8_t plane; + + pd = (AmigaPlanarT *)src->portData; + if (pd == NULL) { + return; + } + rowBase = (uint16_t)((uint16_t)by * 8u) * AMIGA_BYTES_PER_ROW + bx; + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + const uint8_t *p = pd->planes[plane] + rowBase; + uint8_t *q = tileOut + plane * AMIGA_TILE_PLANE_STRIDE; + for (row = 0; row < 8u; row++) { + q[row] = p[row * AMIGA_BYTES_PER_ROW]; + } + } +} + + +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *tileBytes) { + AmigaPlanarT *pd; + uint8_t row; + uint8_t plane; + uint16_t rowBase; + + pd = (AmigaPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + /* TileT.pixels[] holds plane-major planar bytes (see + * halTileSnapPlanes header for layout). Paste = 32 byte stores + * with no chunky -> planar conversion. 
*/ + rowBase = (uint16_t)((uint16_t)by * 8u) * AMIGA_BYTES_PER_ROW + bx; + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + const uint8_t *q = tileBytes + plane * AMIGA_TILE_PLANE_STRIDE; + uint8_t *p = pd->planes[plane] + rowBase; + for (row = 0; row < 8u; row++) { + p[row * AMIGA_BYTES_PER_ROW] = q[row]; + } + } +} + + +// Phase 3 planar dual-write for surfaceCopy: 4 plane memcpys after +// the cross-platform chunky pixel memcpy. Both src and dst planes +// are off-screen shadow buffers; the displayed gPlanes[] is updated +// only at stagePresent. +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + AmigaPlanarT *dstPd; + AmigaPlanarT *srcPd; + uint16_t i; + + dstPd = (AmigaPlanarT *)dst->portData; + srcPd = (AmigaPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + for (i = 0; i < AMIGA_BITPLANES; i++) { + memcpy(dstPd->planes[i], srcPd->planes[i], AMIGA_PLANE_SIZE); + } +} + + +/* Phase 9: tile halFast hooks return true to suppress chunky fallback. + * The actual planar work happens in halTileFillPlanes / halTileCopyPlanes + * / etc. (called by cross-platform tile.c after each halFast). tileSnap + * outputs a chunky TileT -- see halTileSnapPlanes-style derivation + * inside the snap planar work added below if needed. For now tileSnap + * skips its output (TileT will be all-zeros) when called on Amiga; + * apps that depend on tileSnap on Amiga need a planar-to-chunky + * derivation hook (TODO if any UBER/demo path actually exercises it). 
*/ bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { (void)dstRow0; (void)srcRow0; - return false; + return true; } @@ -668,58 +1569,304 @@ bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t tra (void)dstRow0; (void)srcRow0; (void)transparent; - return false; + return true; } bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { (void)dstRow0; (void)srcTilePixels; - return false; + return true; } bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { (void)dstTilePixels; (void)srcRow0; - return false; + return true; +} + + +// Phase 4 planar dual-write helper: set one pixel's bit in each of +// the four shadow planes. Caller (the per-primitive walker below) has +// already validated (x, y) is on-surface. byteOff is computed once +// and reused across all four planes since stride is the same in each. +/* Per-pixel plane RMW with the color-bit classification hoisted by + * the caller. set0..set3 are 0xFF if that plane's color bit is 1 + * (OR-in bitMask), 0 if 0 (AND-out bitMask). Each plane updates with + * `*pn = (*pn & ~bitMask) | (setN & bitMask)` -- branch-free, 4 + * RMWs per pixel. Inlined into shape walkers so the outer loop pays + * no function-call overhead. */ +/* Bit-mask LUT for the per-pixel macro -- replaces a runtime + * `0x80u >> (x & 7)` (8 cyc per bit shifted on 68000) with a single + * byte load. Saves ~25 cyc per pixel inside circle / line walkers. 
*/ +static const uint8_t kAmigaPlanePutPixelBitLut[8] = { + 0x80u, 0x40u, 0x20u, 0x10u, 0x08u, 0x04u, 0x02u, 0x01u +}; + +#define AMIGA_PLANE_PUT_PIXEL(pd_, x_, y_, set0_, set1_, set2_, set3_) do { \ + uint16_t byteOff_ = (uint16_t)((uint16_t)(y_) * AMIGA_BYTES_PER_ROW + ((uint16_t)(x_) >> 3)); \ + uint8_t bitMask_ = kAmigaPlanePutPixelBitLut[(uint16_t)(x_) & 7u]; \ + uint8_t notMask_ = (uint8_t)~bitMask_; \ + uint8_t *p0_ = &(pd_)->planes[0][byteOff_]; \ + uint8_t *p1_ = &(pd_)->planes[1][byteOff_]; \ + uint8_t *p2_ = &(pd_)->planes[2][byteOff_]; \ + uint8_t *p3_ = &(pd_)->planes[3][byteOff_]; \ + *p0_ = (uint8_t)((*p0_ & notMask_) | ((set0_) & bitMask_)); \ + *p1_ = (uint8_t)((*p1_ & notMask_) | ((set1_) & bitMask_)); \ + *p2_ = (uint8_t)((*p2_ & notMask_) | ((set2_) & bitMask_)); \ + *p3_ = (uint8_t)((*p3_ & notMask_) | ((set3_) & bitMask_)); \ +} while (0) + + +static void amigaPlanarSetPixel(AmigaPlanarT *pd, int16_t x, int16_t y, uint8_t color) { + uint16_t byteOff; + uint8_t bitMask; + uint8_t notMask; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3)); + bitMask = (uint8_t)(0x80u >> ((uint16_t)x & 7u)); + notMask = (uint8_t)~bitMask; + + /* Unroll the 4-plane loop. Loop counter + array indexing inside + * the hot per-pixel path is the gating cost on circle outlines: + * UBER drawCircle r=80 calls this ~640 times per call. 
*/ + p0 = &pd->planes[0][byteOff]; + p1 = &pd->planes[1][byteOff]; + p2 = &pd->planes[2][byteOff]; + p3 = &pd->planes[3][byteOff]; + + if (color & 0x01u) { *p0 = (uint8_t)(*p0 | bitMask); } else { *p0 = (uint8_t)(*p0 & notMask); } + if (color & 0x02u) { *p1 = (uint8_t)(*p1 | bitMask); } else { *p1 = (uint8_t)(*p1 & notMask); } + if (color & 0x04u) { *p2 = (uint8_t)(*p2 | bitMask); } else { *p2 = (uint8_t)(*p2 & notMask); } + if (color & 0x08u) { *p3 = (uint8_t)(*p3 | bitMask); } else { *p3 = (uint8_t)(*p3 & notMask); } } bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - uint8_t nibLo; + AmigaPlanarT *pd; + uint8_t nibLo; if (s != stageGet()) { return false; } nibLo = (uint8_t)(colorIndex & 0x0Fu); - draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4)); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + amigaPlanarSetPixel(pd, (int16_t)x, (int16_t)y, nibLo); + } return true; } +// Bresenham's diagonal line, planar-only walk. Same algorithm as +// cross-platform drawLine's fallback so the pixel set matches the +// chunky walker (draw68kLine) bit-for-bit. +static void amigaPlanarLine(AmigaPlanarT *pd, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t color) { + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t err; + int16_t e2; + uint8_t set0; + uint8_t set1; + uint8_t set2; + uint8_t set3; + + set0 = (color & 0x01u) ? 0xFFu : 0x00u; + set1 = (color & 0x02u) ? 0xFFu : 0x00u; + set2 = (color & 0x04u) ? 0xFFu : 0x00u; + set3 = (color & 0x08u) ? 0xFFu : 0x00u; + + dx = (int16_t)((x1 > x0) ? (x1 - x0) : (x0 - x1)); + dy = (int16_t)(-((y1 > y0) ? (y1 - y0) : (y0 - y1))); + sx = (int16_t)((x0 < x1) ? 1 : -1); + sy = (int16_t)((y0 < y1) ? 
1 : -1); + err = (int16_t)(dx + dy); + while (1) { + AMIGA_PLANE_PUT_PIXEL(pd, x0, y0, set0, set1, set2, set3); + if (x0 == x1 && y0 == y1) { + break; + } + e2 = (int16_t)(2 * err); + if (e2 >= dy) { + err = (int16_t)(err + dy); + x0 = (int16_t)(x0 + sx); + } + if (e2 <= dx) { + err = (int16_t)(err + dx); + y0 = (int16_t)(y0 + sy); + } + } +} + + bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + AmigaPlanarT *pd; if (s != stageGet()) { return false; } - draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + amigaPlanarLine(pd, x0, y0, x1, y1, (uint8_t)(colorIndex & 0x0Fu)); + } return true; } +// 8-octant midpoint circle outline, planar-only walk. Mirrors +// drawCircle's cross-platform fallback exactly so plane bits land at +// the same pixels as the chunky walker (draw68kCircleOutline). +static void amigaPlanarCircleOutline(AmigaPlanarT *pd, int16_t cx, int16_t cy, uint16_t r, uint8_t color) { + int16_t x; + int16_t y; + int16_t err; + uint8_t set0; + uint8_t set1; + uint8_t set2; + uint8_t set3; + + /* Classify each plane once: 0xFF if color bit is 1 (set bitMask), + * 0 if bit is 0 (clear bitMask). The per-pixel macro then folds + * this into a branch-free RMW. */ + set0 = (color & 0x01u) ? 0xFFu : 0x00u; + set1 = (color & 0x02u) ? 0xFFu : 0x00u; + set2 = (color & 0x04u) ? 0xFFu : 0x00u; + set3 = (color & 0x08u) ? 
0xFFu : 0x00u; + + x = (int16_t)r; + y = 0; + err = (int16_t)(1 - x); + while (x >= y) { + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + x), (int16_t)(cy + y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - x), (int16_t)(cy + y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + x), (int16_t)(cy - y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - x), (int16_t)(cy - y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + y), (int16_t)(cy + x), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - y), (int16_t)(cy + x), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + y), (int16_t)(cy - x), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - y), (int16_t)(cy - x), set0, set1, set2, set3); + y++; + if (err <= 0) { + err = (int16_t)(err + y + y + 1); + } else { + x--; + err = (int16_t)(err + y + y - x - x + 1); + } + } +} + + bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + AmigaPlanarT *pd; if (s != stageGet()) { return false; } - draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + surface68kAmigaCircleOutline(pd->planes[0], pd->planes[1], + pd->planes[2], pd->planes[3], + (uint16_t)cx, (uint16_t)cy, r, + (uint8_t)(colorIndex & 0x0Fu)); + } return true; } +/* Single-row 4-plane span fill via shared68k asm. Caller pre-computes + * the left/right partial-byte masks; fillByte per plane is just + * 0xFF/0x00 based on colorIndex bit. The asm body avoids per-byte + * function-call dispatch and the C compiler's per-iter overhead -- + * critical for fillCircle r=40 which pre-asm was paying ~50 ms/call + * for 80 spans. 
*/ +static inline __attribute__((always_inline)) +void amigaFillSpanInline(AmigaPlanarT *pd, int16_t spanX, int16_t spanY, + uint16_t spanW, uint8_t colorIndex) { + static const uint8_t kLM[8] = {0xFFu,0x7Fu,0x3Fu,0x1Fu,0x0Fu,0x07u,0x03u,0x01u}; + static const uint8_t kRM[8] = {0x80u,0xC0u,0xE0u,0xF0u,0xF8u,0xFCu,0xFEu,0xFFu}; + + uint16_t byteFirst = (uint16_t)((uint16_t)spanX >> 3); + uint16_t lastBit = (uint16_t)(spanX + spanW - 1); + uint16_t byteLast = (uint16_t)(lastBit >> 3); + uint8_t leftMask = kLM[(uint16_t)spanX & 7u]; + uint8_t rightMask = kRM[lastBit & 7u]; + uint16_t rowOff = (uint16_t)((uint16_t)spanY * AMIGA_BYTES_PER_ROW + byteFirst); + uint8_t fb0 = ((colorIndex >> 0) & 1u) ? 0xFFu : 0x00u; + uint8_t fb1 = ((colorIndex >> 1) & 1u) ? 0xFFu : 0x00u; + uint8_t fb2 = ((colorIndex >> 2) & 1u) ? 0xFFu : 0x00u; + uint8_t fb3 = ((colorIndex >> 3) & 1u) ? 0xFFu : 0x00u; + uint8_t *p0 = pd->planes[0] + rowOff; + uint8_t *p1 = pd->planes[1] + rowOff; + uint8_t *p2 = pd->planes[2] + rowOff; + uint8_t *p3 = pd->planes[3] + rowOff; + + if (byteFirst == byteLast) { + /* Single-byte case kept in C: the asm path post-increments + * the pointer between leading and trailing RMW, which would + * read the wrong byte if both edges land on the same byte. + * One-byte spans are rare anyway (~1 of 80 in fillCircle r=40) + * so the C overhead is fine here. 
*/ + uint8_t mask = (uint8_t)(leftMask & rightMask); + uint8_t notMask = (uint8_t)~mask; + uint8_t *pp[AMIGA_BITPLANES]; + uint8_t fb[AMIGA_BITPLANES]; + uint8_t i; + pp[0] = p0; pp[1] = p1; pp[2] = p2; pp[3] = p3; + fb[0] = fb0; fb[1] = fb1; fb[2] = fb2; fb[3] = fb3; + for (i = 0; i < AMIGA_BITPLANES; i++) { + *pp[i] = (uint8_t)((*pp[i] & notMask) | (fb[i] & mask)); + } + return; + } + + { + uint16_t numMid = (uint16_t)(byteLast - byteFirst - 1u); + surface68kFillSpan4Planes(p0, p1, p2, p3, numMid, leftMask, rightMask, + fb0, fb1, fb2, fb3); + } +} + + bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + AmigaPlanarT *pd; + int16_t x; + int16_t y; + int16_t err; + int16_t spanX; + uint16_t spanW; + if (s != stageGet()) { return false; } - draw68kCircleFill(s->pixels, cx, cy, r, colorIndex); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + /* Inline per-span plane fill -- avoids 4 halFillRectPlanes + * dispatches per midpoint iter (~320 dispatches for r=40). 
*/ + x = (int16_t)r; + y = 0; + err = (int16_t)(1 - x); + while (x >= y) { + spanX = (int16_t)(cx - x); + spanW = (uint16_t)(2 * x + 1); + amigaFillSpanInline(pd, spanX, (int16_t)(cy + y), spanW, colorIndex); + amigaFillSpanInline(pd, spanX, (int16_t)(cy - y), spanW, colorIndex); + spanX = (int16_t)(cx - y); + spanW = (uint16_t)(2 * y + 1); + amigaFillSpanInline(pd, spanX, (int16_t)(cy + x), spanW, colorIndex); + amigaFillSpanInline(pd, spanX, (int16_t)(cy - x), spanW, colorIndex); + y++; + if (err <= 0) { + err = (int16_t)(err + y + y + 1); + } else { + x--; + err = (int16_t)(err + y + y - x - x + 1); + } + } + } return true; } @@ -749,19 +1896,6 @@ bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t ma } -bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { - (void)dstRow0; - (void)dstX; - (void)srcRow0; - (void)srcX; - (void)copyW; - (void)copyH; - (void)srcRowBytes; - (void)transparent; - return false; -} - - bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { (void)row; (void)leftX; @@ -796,20 +1930,471 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t mat } +/* Plane-aware nibble at (x, y) given the 4 plane row bases. Reads one + * byte per plane and assembles the 4-bit color from a single bit + * position. Caller is responsible for valid (x, y). 
*/ +static uint8_t amigaNibbleFromPlanes(uint8_t * const planes[AMIGA_BITPLANES], int16_t x) { + uint16_t byteOff; + uint8_t bitMask; + uint8_t color; + + byteOff = (uint16_t)((uint16_t)x >> 3); + bitMask = (uint8_t)(0x80u >> ((uint16_t)x & 7u)); + color = 0u; + if (planes[0][byteOff] & bitMask) color = (uint8_t)(color | 0x01u); + if (planes[1][byteOff] & bitMask) color = (uint8_t)(color | 0x02u); + if (planes[2][byteOff] & bitMask) color = (uint8_t)(color | 0x04u); + if (planes[3][byteOff] & bitMask) color = (uint8_t)(color | 0x08u); + return color; +} + + +/* Build the 4 plane row pointers for a given y. */ +static void amigaPlaneRowPtrs(const SurfaceT *s, int16_t y, uint8_t **out /* [4] */) { + AmigaPlanarT *pd; + uint16_t yOff; + uint8_t i; + + pd = (AmigaPlanarT *)s->portData; + yOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW); + for (i = 0; i < AMIGA_BITPLANES; i++) { + out[i] = pd->planes[i] + yOff; + } +} + + +bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + AmigaPlanarT *pd; + uint8_t *rowPlanes[AMIGA_BITPLANES]; + int16_t leftX; + int16_t rightX; + uint8_t pix; + bool pixMatch; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return false; + } + matchColor = (uint8_t)(matchColor & 0x0Fu); + newColor = (uint8_t)(newColor & 0x0Fu); + amigaPlaneRowPtrs(s, y, rowPlanes); + + pix = amigaNibbleFromPlanes(rowPlanes, startX); + pixMatch = (pix == matchColor); + if (matchEqual ? !pixMatch : (pixMatch || pix == newColor)) { + *seedMatched = false; + return true; + } + *seedMatched = true; + + leftX = startX; + while (leftX > 0) { + pix = amigaNibbleFromPlanes(rowPlanes, (int16_t)(leftX - 1)); + pixMatch = (pix == matchColor); + if (matchEqual ? 
!pixMatch : (pixMatch || pix == newColor)) { + break; + } + leftX--; + } + + rightX = startX; + while (rightX < SURFACE_WIDTH - 1) { + pix = amigaNibbleFromPlanes(rowPlanes, (int16_t)(rightX + 1)); + pixMatch = (pix == matchColor); + if (matchEqual ? !pixMatch : (pixMatch || pix == newColor)) { + break; + } + rightX++; + } + + *leftXOut = leftX; + *rightXOut = rightX; + return true; +} + + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + AmigaPlanarT *pd; + uint8_t *rowPlanes[AMIGA_BITPLANES]; + int16_t byteCol; + int16_t byteColFirst; + int16_t byteColLast; + int16_t bit; + int16_t x; + int16_t markIdx; + uint8_t p0, p1, p2, p3; + uint8_t bitMask; + uint8_t pix; + bool pixMatch; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return false; + } + matchColor = (uint8_t)(matchColor & 0x0Fu); + newColor = (uint8_t)(newColor & 0x0Fu); + amigaPlaneRowPtrs(s, scanY, rowPlanes); + + byteColFirst = (int16_t)(leftX >> 3); + byteColLast = (int16_t)(rightX >> 3); + for (byteCol = byteColFirst; byteCol <= byteColLast; byteCol++) { + p0 = rowPlanes[0][byteCol]; + p1 = rowPlanes[1][byteCol]; + p2 = rowPlanes[2][byteCol]; + p3 = rowPlanes[3][byteCol]; + for (bit = 0; bit < 8; bit++) { + x = (int16_t)((byteCol << 3) + bit); + if (x < leftX || x > rightX) { + continue; + } + bitMask = (uint8_t)(0x80u >> bit); + pix = 0u; + if (p0 & bitMask) pix = (uint8_t)(pix | 0x01u); + if (p1 & bitMask) pix = (uint8_t)(pix | 0x02u); + if (p2 & bitMask) pix = (uint8_t)(pix | 0x04u); + if (p3 & bitMask) pix = (uint8_t)(pix | 0x08u); + pixMatch = (pix == matchColor); + markIdx = (int16_t)(x - leftX); + markBuf[markIdx] = (uint8_t)(matchEqual + ? (pixMatch ? 1 : 0) + : ((!pixMatch && pix != newColor) ? 
1 : 0)); + } + } + return true; +} + + bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { + /* Phase 9: chunky write skipped; halTileFillPlanes (called by + * cross-platform tile.c after this) does the planar fill. */ (void)s; (void)bx; (void)by; (void)fillWord; - return false; + return true; +} + + +bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + /* Phase 9: chunky write skipped; halBlitRectPlanes (called by + * cross-platform surfaceBlit after this) does the planar work. */ + (void)dstRow0; (void)dstX; (void)srcRow0; (void)srcX; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; + return true; +} + + +/* ===== Phase 9 reader hooks: pure-planar Amiga implementations ===== + * + * Cross-platform code that USED to read s->pixels (chunky shadow) now + * goes through these. On Amiga the chunky shadow doesn't exist; + * pixels are derived on demand by walking the plane bits. + * + * Per-pixel assembly: for pixel (x, y), read bit (7 - x%8) from plane + * byte at row*40 + x/8 in each of the 4 planes. Color index = sum + * of (bit_p << p). */ + +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + AmigaPlanarT *pd; + uint16_t byteOff; + uint8_t bitMask; + uint8_t color; + uint16_t i; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3)); + bitMask = (uint8_t)(0x80u >> ((uint16_t)x & 7u)); + color = 0u; + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (pd->planes[i][byteOff] & bitMask) { + color = (uint8_t)(color | (1u << i)); + } + } + return color; +} + + +/* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes + * (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky. + * Walks 8 pixels per planar-byte column; per pixel assembles nibble + * from 4 plane bits. 
Output: 4 chunky bytes per planar-byte column + * (since 8 pixels = 4 chunky bytes at 2px/byte). */ +static void amigaPlanesToChunkyRow(const AmigaPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) { + uint16_t col; + uint16_t byteOff; + uint8_t b0, b1, b2, b3; + uint8_t pix; + uint8_t bitMask; + uint16_t p; + + for (col = 0; col < AMIGA_BYTES_PER_ROW; col++) { + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW + col); + b0 = pd->planes[0][byteOff]; + b1 = pd->planes[1][byteOff]; + b2 = pd->planes[2][byteOff]; + b3 = pd->planes[3][byteOff]; + /* For each of 8 pixels in this planar byte (bit 7 = leftmost), + * assemble nibble from the 4 plane bits and pack into chunky + * bytes (high nibble = even pixel, low nibble = odd pixel). */ + for (p = 0; p < 8u; p++) { + bitMask = (uint8_t)(0x80u >> p); + pix = 0u; + if (b0 & bitMask) pix = (uint8_t)(pix | 1u); + if (b1 & bitMask) pix = (uint8_t)(pix | 2u); + if (b2 & bitMask) pix = (uint8_t)(pix | 4u); + if (b3 & bitMask) pix = (uint8_t)(pix | 8u); + if ((p & 1u) == 0u) { + dstChunkyRow[col * 4u + (p >> 1)] = (uint8_t)(pix << 4); + } else { + dstChunkyRow[col * 4u + (p >> 1)] = (uint8_t)(dstChunkyRow[col * 4u + (p >> 1)] | pix); + } + } + } +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + AmigaPlanarT *pd; + uint16_t lo = 0xACE1u, hi = 0x1357u; + uint16_t n, v; + int16_t row; + uint8_t b; + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + const uint16_t *w; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + /* Pixel hash: derive chunky one row at a time, fold byte-by-byte + * via the shared SURFACE_HASH_MIX_BYTE so cross-port hash + * matches. */ + for (row = 0; row < SURFACE_HEIGHT; row++) { + amigaPlanesToChunkyRow(pd, row, chunkyRow); + for (n = 0; n < SURFACE_BYTES_PER_ROW; n++) { + b = chunkyRow[n]; + SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + } + /* SCB: byte sequence, endian-independent. 
*/ + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = s->scb[n]; + SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + /* Palette: read uint16 values, fold high-then-low for endian- + * independence. */ + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return ((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + /* Amiga has no chunky shadow. Plane copy happens in + * halSurfaceCopyPlanes (called separately by surfaceCopy). */ + (void)dst; + (void)src; +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + AmigaPlanarT *pd; + uint8_t *scratch; + uint8_t *srcLine; + int16_t y; + UBYTE *p0; + UBYTE *p1; + UBYTE *p2; + UBYTE *p3; + bool ok; + + pd = (AmigaPlanarT *)dst->portData; + if (pd == NULL) { + return false; + } + /* fread the chunky file payload into a scratch buffer, then c2p + * directly into our planes. The scratch is a one-shot AllocMem + * (PUBLIC, not chip) since chunkyToPlanarRow only reads it. 
*/ + scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC); + if (scratch == NULL) { + return false; + } + ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE); + if (ok) { + if (!gC2pLutReady) { + initC2pLut(); + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + srcLine = &scratch[y * SURFACE_BYTES_PER_ROW]; + p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut); + } + } + FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE); + return ok; +} + + +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + AmigaPlanarT *pd; + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + int16_t y; + + pd = (AmigaPlanarT *)src->portData; + if (pd == NULL) { + return false; + } + /* Per row: derive chunky from planes, write 160 bytes. Less + * efficient than a single fwrite of a full buffer but avoids + * needing a 32 KB scratch allocation. */ + for (y = 0; y < SURFACE_HEIGHT; y++) { + amigaPlanesToChunkyRow(pd, y, chunkyRow); + if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) { + return false; + } + } + return true; } uint8_t *halStageAllocPixels(void) { - return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); + /* Phase 9: Amiga has no chunky shadow. The stage pixels pointer + * stays NULL; cross-platform code reads pixels via halSamplePixel + * (or other halXxxChunky hooks) which read from planes. NULL is + * a valid return -- cross-platform stageAlloc treats NULL as + * "port has no chunky storage" and skips the chunky memset. */ + return NULL; } void halStageFreePixels(uint8_t *pixels) { + /* halStageAllocPixels returned NULL on Amiga, so this is always + * NULL (free(NULL) is well-defined no-op). Symmetric for any + * future port that does allocate stage pixels. 
*/ free(pixels); } + + +uint8_t *halSurfaceAllocPixels(void) { + /* Same rationale as halStageAllocPixels: no chunky on Amiga. */ + return NULL; +} + + +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + AmigaPlanarT *pd; + if (planeIdx >= AMIGA_BITPLANES) { + return NULL; + } + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return NULL; + } + return pd->planes[planeIdx]; +} + + +// Allocate the per-surface planar storage: an AmigaPlanarT plus 4 +// off-screen chip-RAM plane buffers. The stage gets its OWN shadow +// planes (NOT aliased to gPlanes / Intuition's BitMap) so drawing +// primitives that dual-write to planes don't immediately appear on +// screen -- the display is updated atomically at stagePresent time +// (c2p chunky->gPlanes today; memcpy shadow->gPlanes under +// JOEYLIB_PLANAR_PRESENT; pointer-swap or alias in Phase 9). +// Aliasing the stage to gPlanes was tried and reverted because every +// drawing primitive showed AS IT HAPPENED, which broke the "draw +// invisibly, palette flips with content at present" semantic Pattern +// and apps depend on. See project_planar_68k_plan.md Phase 3 notes. +// +// Returns NULL on allocation failure. Cross-platform code stores the +// result in s->portData; primitives access via (AmigaPlanarT *) +// s->portData. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + AmigaPlanarT *pd; + uint16_t i; + + (void)s; + (void)isStage; + pd = (AmigaPlanarT *)AllocMem((ULONG)sizeof(AmigaPlanarT), + (ULONG)(MEMF_CHIP | MEMF_CLEAR)); + if (pd == NULL) { + return NULL; + } + pd->bytesPerRow = AMIGA_BYTES_PER_ROW; + pd->bytesPerPlane = AMIGA_PLANE_SIZE; + + /* Both stage and non-stage: AllocMem fresh planes, MEMF_CLEAR + * for the JoeyLib contract that color 0 = black at surface + * allocation. Stage uses these as off-screen back planes (display + * is gPlanes[], CHIP, updated only by halPresent). 
Non-stage uses + * these as the surface's only planes. + * + * MEMF_FAST (no MEMF_CHIP) explicitly demands fast RAM. The + * shadow planes are CPU-only (no blitter/copper access); freeing + * them from chip RAM cuts halPresent's chip-bus contention in + * half (read FAST + write CHIP instead of read CHIP + write CHIP) + * and gives RMW drawing primitives 2-3x speedup. If fast RAM is + * unavailable (bare A500, no expansion), we fall back to chip + * via the loop below. */ + for (i = 0; i < AMIGA_BITPLANES; i++) { + pd->planes[i] = (uint8_t *)AllocMem((ULONG)AMIGA_PLANE_SIZE, + (ULONG)(MEMF_FAST | MEMF_CLEAR)); + if (pd->planes[i] == NULL) { + /* No fast RAM available; fall back to chip. */ + pd->planes[i] = (uint8_t *)AllocMem((ULONG)AMIGA_PLANE_SIZE, + (ULONG)(MEMF_CHIP | MEMF_CLEAR)); + joeyLogF("amiga: shadow plane %u in CHIP (fast unavailable, addr=$%08lX)", + (unsigned)i, (unsigned long)pd->planes[i]); + } else { + joeyLogF("amiga: shadow plane %u in FAST (addr=$%08lX)", + (unsigned)i, (unsigned long)pd->planes[i]); + } + if (pd->planes[i] == NULL) { + /* Roll back any planes already allocated. 
*/ + while (i > 0u) { + i--; + FreeMem(pd->planes[i], (ULONG)AMIGA_PLANE_SIZE); + } + FreeMem(pd, (ULONG)sizeof(AmigaPlanarT)); + return NULL; + } + } + pd->ownsPlanes = true; + return pd; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + AmigaPlanarT *pd; + uint16_t i; + + (void)s; + (void)isStage; + if (portData == NULL) { + return; + } + pd = (AmigaPlanarT *)portData; + if (pd->ownsPlanes) { + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (pd->planes[i] != NULL) { + FreeMem(pd->planes[i], (ULONG)AMIGA_PLANE_SIZE); + } + } + } + FreeMem(pd, (ULONG)sizeof(AmigaPlanarT)); +} diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 2efcf32..2e77041 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -526,26 +526,6 @@ void halPresent(const SurfaceT *src) { } -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - uint16_t groupStart; - uint16_t groupEnd; - - if (src == NULL || !gModeSet) { - return; - } - refreshPaletteStateIfNeeded(src); - // Each c2p group covers 16 horizontal pixels. Round dirty pixel - // range to the enclosing group range to keep the planar word - // alignment without missing edge pixels. - groupStart = (uint16_t)(x >> 4); - groupEnd = (uint16_t)(((uint16_t)x + w + 15) >> 4); - if (groupEnd > ST_GROUPS_PER_ROW) { - groupEnd = ST_GROUPS_PER_ROW; - } - c2pRange(src, y, y + (int16_t)h, groupStart, groupEnd); -} - - // Vsync() is XBIOS opcode 37; mintlib exposes it directly. It blocks // until the next 50 Hz (PAL) or 60 Hz (NTSC) vertical blank. 
void halWaitVBL(void) { @@ -730,6 +710,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t } +bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual; + (void)seedMatched; (void)leftXOut; (void)rightXOut; + return false; +} + + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual; + (void)markBuf; + return false; +} + + bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { (void)row; (void)leftX; @@ -798,6 +792,146 @@ bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { } +// Phase-1 planar plumbing: portData hooks declared and exported, but +// returning NULL keeps the ST port operating in the legacy +// chunky-with-c2p model. Phase 4 replaces this with an interleaved +// planar buffer + stride blob, and rewrites every halFast* primitive +// to read/write planes directly. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + (void)s; + (void)isStage; + return NULL; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + (void)s; + (void)isStage; + (void)portData; +} + + +// ST planar dual-write isn't implemented yet (interleaved word-planar +// layout needs a different code path than Amiga's separate plane +// buffers). Stub for now; chunky shadow + c2p still drives display. 
+void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; +} + + +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + (void)dst; + (void)src; +} + + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + (void)s; (void)bx; (void)by; (void)colorIndex; +} +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; +} +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; +} +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + (void)dst; (void)bx; (void)by; (void)chunkyTile; +} +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + (void)src; (void)bx; (void)by; (void)chunkyTileOut; +} +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + (void)s; (void)sp; (void)x; (void)y; +} +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; +} +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; +} +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; 
(void)srcPlaneBytes; +} + + +/* Phase 9 chunky reader hooks -- ST is still chunky-shadow + c2p, + * so reads come from s->pixels just like DOS / IIgs. */ +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; + const uint8_t *p; + const uint16_t *w; + uint8_t b; + p = s->pixels; + blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); + do { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + blocks--; + } while (blocks > 0u); + p = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return ((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +uint8_t *halSurfaceAllocPixels(void) { + return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); +} + + +void halSurfaceFreePixels(uint8_t *pixels) 
{ + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + (void)s; (void)planeIdx; + return NULL; +} + + uint8_t *halStageAllocPixels(void) { return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); } diff --git a/src/port/dos/hal.c b/src/port/dos/hal.c index 7e446c2..d1ca693 100644 --- a/src/port/dos/hal.c +++ b/src/port/dos/hal.c @@ -244,21 +244,6 @@ void halPresent(const SurfaceT *src) { } -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - int16_t py; - int16_t yEnd; - - if (src == NULL || gVgaMem == NULL) { - return; - } - uploadPaletteIfNeeded(src); - yEnd = y + (int16_t)h; - for (py = y; py < yEnd; py++) { - expandAndWriteLine(src, py, x, w, &gVgaMem[py * VGA_STRIDE]); - } -} - - // VGA mode 13h vertical refresh on a real CRT runs at ~70 Hz. We // detect the start of vertical retrace by polling input status // register 1 ($3DA) bit 3: 1 = currently in vretrace. To get a @@ -423,6 +408,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t } +bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual; + (void)seedMatched; (void)leftXOut; (void)rightXOut; + return false; +} + + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual; + (void)markBuf; + return false; +} + + bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { (void)row; (void)leftX; @@ -499,3 +498,143 @@ uint8_t *halStageAllocPixels(void) { void halStageFreePixels(uint8_t *pixels) { 
free(pixels); } + + +// DOS / VGA mode 13h is chunky-native (8bpp linear). portData is +// unused; the chunky `pixels` buffer feeds the present-time +// nearest-neighbor copy to VGA RAM. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + (void)s; + (void)isStage; + return NULL; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + (void)s; + (void)isStage; + (void)portData; +} + + +// DOS has no bitplanes -- chunky pixels are the source of truth and +// expandAndWriteLine derives the VGA DAC indices straight from them. +// This hook is a stub here; the cross-platform fillRect calls it +// unconditionally. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; +} + + +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + (void)dst; + (void)src; +} + + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + (void)s; (void)bx; (void)by; (void)colorIndex; +} +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; +} +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; +} +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + (void)dst; (void)bx; (void)by; (void)chunkyTile; +} +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + (void)src; (void)bx; (void)by; (void)chunkyTileOut; +} +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + (void)s; (void)sp; (void)x; (void)y; +} +void halBlitRectPlanes(SurfaceT *dst, 
int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; +} +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; +} +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes; +} + + +/* Phase 9 reader hooks: chunky ports use the original s->pixels-based + * paths. */ + +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; + const uint8_t *p; + const uint16_t *w; + uint8_t b; + p = s->pixels; + blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); + do { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + blocks--; + } while (blocks > 0u); + p = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return 
((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +uint8_t *halSurfaceAllocPixels(void) { + return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); +} + + +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + (void)s; (void)planeIdx; + return NULL; +} diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index 5cad5b7..237fcab 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -26,12 +26,25 @@ // crowd up against the 64 KB-per-bank limit). #include +#include #include #include "joey/debug.h" #include "hal.h" #include "surfaceInternal.h" +/* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick + * ($2503) and returns the low 16 bits of the system's tick counter + * (firmware VBL ISR-driven). Polling $C019 from C user code missed + * transitions for any op over ~1 ms; the system's tick counter is + * updated by the actual interrupt handler so it stays accurate + * regardless of caller polling rate. Tick rate matches the video + * field rate -- 60 Hz on NTSC, 50 Hz on PAL. */ +extern uint16_t iigsGetTickWord(void); +/* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. */ +extern uint16_t iigsReadHzParam(void); +static uint16_t gFrameHz = 60u; + // hal.c is the single TU that calls into joeyDraw.asm. Cross- // platform draw.c / tile.c / etc. 
dispatch through halFast* // functions defined here; they never reference the asm symbols @@ -210,6 +223,7 @@ bool halInit(const JoeyConfigT *config) { // is unreliable from halInit's calling context, so we don't try // it here -- the first present will set up SCB to 320 mode. iigsInitRowLut(); + gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u; gModeSet = true; return true; } @@ -234,40 +248,6 @@ void halPresent(const SurfaceT *src) { } -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - uint16_t copyBytes; - int16_t byteStart; - uint16_t srcOffset; - - if (src == NULL) { - return; - } - - uploadScbAndPaletteIfNeeded(src); - - // Pixel copy: byte-aligned runs per scanline. x is always >= 0 - // after API-level clipping. Use unsigned shifts to avoid - // ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t. - byteStart = (int16_t)((uint16_t)x >> 1); - copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart); - - if (copyBytes == 0 || h == 0) { - return; - } - - // Pixel copy: prefer the PEI-slam variant when the rect satisfies - // its contract (copyBytes even, 2..80). Sprite-rect presents - // (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or - // odd-byte rects fall back to MVN, which has no width cap. - srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart); - if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) { - iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h); - } else { - iigsBlitRectStageToShr(srcOffset, copyBytes, h); - } -} - - void halShutdown(void) { if (gModeSet) { *IIGS_NEWVIDEO_REG = gPreviousNewVideo; @@ -305,6 +285,142 @@ void halStageFreePixels(uint8_t *pixels) { } +// IIgs is chunky-native: portData is unused. The chunky `pixels` +// buffer at $01:2000 is the stage's pixel storage and the source for +// stagePresent's PEI-slam to $E1. 
+void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + (void)s; + (void)isStage; + return NULL; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + (void)s; + (void)isStage; + (void)portData; +} + + +// IIgs SHR is chunky-native; no bitplanes to update. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; +} + + +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + (void)dst; + (void)src; +} + + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + (void)s; (void)bx; (void)by; (void)colorIndex; +} +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; +} +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; +} +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + (void)dst; (void)bx; (void)by; (void)chunkyTile; +} +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + (void)src; (void)bx; (void)by; (void)chunkyTileOut; +} +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + (void)s; (void)sp; (void)x; (void)y; +} +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; +} +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, 
uint16_t h, uint8_t *dstPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; +} +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes; +} + + +/* Phase 9 chunky reader hooks: IIgs reads from s->pixels just like + * the legacy paths did. Same logic as the DOS port. */ +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; + const uint8_t *p; + const uint16_t *w; + uint8_t b; + p = s->pixels; + blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); + do { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + blocks--; + } while (blocks > 0u); + p = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return ((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +bool 
halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +uint8_t *halSurfaceAllocPixels(void) { + return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); +} + + +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + (void)s; (void)planeIdx; + return NULL; +} + + // $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active // scan. To produce a rising-edge wait (one VBL per call), first spin // while VBL is currently active (bit 7 = 0), then spin until VBL @@ -333,24 +449,11 @@ void halWaitVBL(void) { // byte and the counter never advances. The explicit lda > / sta > // pattern uses long-mode addressing throughout, which is // DBR-independent. -static uint16_t gFrameCount = 0; -static uint8_t gPrevInVbl = 0; - uint16_t halFrameCount(void) { - uint8_t now; - uint16_t cnt; - - now = (*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0; - if (now && !gPrevInVbl) { - cnt = gFrameCount; - cnt = (uint16_t)(cnt + 1u); - gFrameCount = cnt; - } - gPrevInVbl = now; - return gFrameCount; + return iigsGetTickWord(); } uint16_t halFrameHz(void) { - return 60u; + return gFrameHz; } diff --git a/src/port/iigs/peislam.asm b/src/port/iigs/peislam.asm index 74fa672..0c1b6ff 100644 --- a/src/port/iigs/peislam.asm +++ b/src/port/iigs/peislam.asm @@ -1,15 +1,66 @@ -* peislam.asm - placeholder. -* -* The original PEI-slam-per-row helper was removed; its functionality -* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam -* with per-row dirty skip). This stub remains so the build's -* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load -* segment and the linker keeps the same segment-bank layout it had -* when peislam.asm was a real translation unit. +* peislam.asm - originally a PEI-slam helper, now hosts the GetTick +* and ReadBParam trampolines. 
The PEI-slam logic was rolled into +* iigsBlitStageToShr in joeyDraw.asm. keep PEISLAM case on + +* Stub kept so the PEISLAM load segment stays present (the build's +* PORT_ASM_SRCS_ALL wildcard pulls in this file by name). peislamStub start IIGSASM rtl end + + +**************************************************************** +* uint16_t iigsGetTickWord(void) +* +* Calls Misc Toolset GetTick ($2503) and returns the low 16 bits of +* the 32-bit tick counter. The system increments this counter from +* the actual VBL hardware interrupt, so it stays accurate regardless +* of caller polling rate -- C-side polling of $C019 missed transitions +* for any op over ~1 ms. +* +* GetTick output convention: caller pushes 4 bytes of output space, +* tool dispatcher writes the LongWord into them. We pull the low 16 +* bits into A (ORCA-C Word return convention -- A holds the result, +* not Y; verified against jIIgs.asm asmGetVbl) and discard the high +* 16 into X. +* +* ORCA-C cdecl ABI: caller has M=I=16. Word return in A. +**************************************************************** + +iigsGetTickWord start IIGSASM + pha ; output space high word + pha ; output space low word + ldx #$2503 ; _GetTick + jsl $E10000 + + pla ; A = low 16 bits (return value) + plx ; discard high 16 bits + rtl + end + + +**************************************************************** +* uint16_t iigsReadHzParam(void) +* +* Reads battery RAM parameter hrtz50or60 ($1D) via _ReadBParam ($0C03) +* and returns the raw value: 0 = NTSC (60 Hz), 1 = PAL (50 Hz). +* +* GetTick fires from the hardware VBL ISR, so its rate matches the +* video field rate -- 60 Hz on NTSC, 50 Hz on PAL. halFrameHz must +* report whichever this machine actually runs so wall-clock math +* (frames * 1000 / halFrameHz) is correct on both. 
+**************************************************************** + +iigsReadHzParam start IIGSASM + pha ; output space (Word) + pea $001D ; hrtz50or60 parameter ID + ldx #$0C03 ; _ReadBParam + jsl $E10000 + + pla ; A = result (ORCA-C Word return) + rtl + end diff --git a/src/shared68k/surface68k.s b/src/shared68k/surface68k.s index 7e3186a..4d68fb2 100644 --- a/src/shared68k/surface68k.s +++ b/src/shared68k/surface68k.s @@ -253,3 +253,253 @@ _surface68kFillRectByteAligned: .Lfrb_done: movem.l (%sp)+,%d2-%d6 rts + + +| ---------------------------------------------------------------- +| void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1, +| uint8_t *p2, uint8_t *p3, +| uint16_t numMid, +| uint8_t leftMask, uint8_t rightMask, +| uint8_t fb0, uint8_t fb1, +| uint8_t fb2, uint8_t fb3); +| +| Fill ONE planar row across 4 planes -- the per-row body of +| halFillRectPlanes lifted into asm. Each pN points at the leading +| byte (already advanced by planeBase + y*40 + byteFirst on the C +| side). leftMask and rightMask are the partial-byte masks for the +| left/right edges; numMid is the count of full bytes between them. +| fbN is 0x00 or 0xFF, the per-plane fill byte (caller pre-classifies +| (colorIndex >> N) & 1 -> 0xFF or 0x00). +| +| Used by Amiga halFastFillCircle (one call per scanline span) and +| Amiga halFillRectPlanes (one call per row of the rect). Replaces +| the C inner loop whose ~13 cyc/byte was the gating cost on +| fillCircle r=40 even after C-side inlining. +| +| Mask convention is uniform for all planes: +| leading byte := (*p & ~leftMask) | (fbN & leftMask) +| middle bytes := fbN +| trailing byte := (*p & ~rightMask) | (fbN & rightMask) +| -- branchless: the same arithmetic produces "set" or "clear" based +| on whether fbN is 0xFF or 0x00. +| +| ABI: m68k cdecl. d2-d7/a2-a6 callee-save (movem'd here). +| Stack offset to first arg after MOVEM: 11 regs * 4 = 44 bytes saved +| + 4 ret PC = 48. 
+| ---------------------------------------------------------------- + .globl _surface68kFillSpan4Planes + + .equ SP_SAVED, 44 + .equ SP_RPC, 4 + .equ SP_OFF, (SP_SAVED + SP_RPC) + + .equ SP_P0, SP_OFF + 0 + .equ SP_P1, SP_OFF + 4 + .equ SP_P2, SP_OFF + 8 + .equ SP_P3, SP_OFF + 12 + .equ SP_NMID, SP_OFF + 16 + 2 | int -> low word at +2 + .equ SP_LMASK, SP_OFF + 20 + 3 | int -> low byte at +3 + .equ SP_RMASK, SP_OFF + 24 + 3 + .equ SP_FB0, SP_OFF + 28 + 3 + .equ SP_FB1, SP_OFF + 32 + 3 + .equ SP_FB2, SP_OFF + 36 + 3 + .equ SP_FB3, SP_OFF + 40 + 3 + +| Macro: per-plane work fully inlined. Args: +| plane_an = the address register holding this plane's pointer. +| fb_off = the stack offset for this plane's fillByte. +| Uses d6/d7 as scratch; d1=leftMask, d2=~leftMask, d3=rightMask, +| d4=~rightMask; d0=numMid-1 (only valid if mid_count > 0). The mid +| loop is skipped via .LfsSkipMid_ when numMid was 0 at entry -- +| the per-plane caller branches to the right tail label. +| +| Hand-unrolled 4x rather than using bsr+rts to dodge ~12 cyc per +| return + the per-plane re-test of numMid that the previous build +| paid. The mid-loop label suffix is the plane index so all four +| inline copies can coexist without label collisions. +| +| Plain text version of the per-plane body (translate to asm 4x with +| different a-regs and fb stack offsets): +| +| move.b (an),%d6 +| and.b %d2,%d6 +| move.b fb,%d7 +| and.b %d1,%d7 +| or.b %d7,%d6 +| move.b %d6,(an)+ +| < if has-middle path: > +| move.w %d0,%d7 +| .midN: +| move.b fb,(an)+ +| dbra %d7,.midN +| < trailing: > +| move.b (an),%d6 +| and.b %d4,%d6 +| move.b fb,%d7 +| and.b %d3,%d7 +| or.b %d7,%d6 +| move.b %d6,(an) + +_surface68kFillSpan4Planes: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.b SP_LMASK(%sp),%d1 + move.b %d1,%d2 + not.b %d2 + move.b SP_RMASK(%sp),%d3 + move.b %d3,%d4 + not.b %d4 + + move.l SP_P0(%sp),%a0 + move.l SP_P1(%sp),%a1 + move.l SP_P2(%sp),%a2 + move.l SP_P3(%sp),%a3 + + | One-time numMid test. 
d0.w = numMid; if 0 jump to + | the no-middle entry, otherwise pre-decrement for dbra + | and fall into the with-middle entry. Both paths + | unroll all 4 planes. + move.w SP_NMID(%sp),%d0 + beq .LfsNoMid + subq.w #1,%d0 + + | ---- WITH-MIDDLE PATH ---- + | Plane 0 + move.b (%a0),%d6 + and.b %d2,%d6 + move.b SP_FB0(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a0)+ + move.w %d0,%d7 +.LfsMid0: move.b %d5,(%a0)+ + dbra %d7,.LfsMid0 + move.b (%a0),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a0) + + | Plane 1 + move.b (%a1),%d6 + and.b %d2,%d6 + move.b SP_FB1(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a1)+ + move.w %d0,%d7 +.LfsMid1: move.b %d5,(%a1)+ + dbra %d7,.LfsMid1 + move.b (%a1),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a1) + + | Plane 2 + move.b (%a2),%d6 + and.b %d2,%d6 + move.b SP_FB2(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a2)+ + move.w %d0,%d7 +.LfsMid2: move.b %d5,(%a2)+ + dbra %d7,.LfsMid2 + move.b (%a2),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a2) + + | Plane 3 + move.b (%a3),%d6 + and.b %d2,%d6 + move.b SP_FB3(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a3)+ + move.w %d0,%d7 +.LfsMid3: move.b %d5,(%a3)+ + dbra %d7,.LfsMid3 + move.b (%a3),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a3) + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + +.LfsNoMid: + | ---- NO-MIDDLE PATH (just leading + trailing) ---- + | Plane 0 + move.b (%a0),%d6 + and.b %d2,%d6 + move.b SP_FB0(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a0)+ + move.b (%a0),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a0) + + | Plane 1 + move.b (%a1),%d6 + and.b %d2,%d6 + move.b SP_FB1(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a1)+ + move.b 
(%a1),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a1) + + | Plane 2 + move.b (%a2),%d6 + and.b %d2,%d6 + move.b SP_FB2(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a2)+ + move.b (%a2),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a2) + + | Plane 3 + move.b (%a3),%d6 + and.b %d2,%d6 + move.b SP_FB3(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a3)+ + move.b (%a3),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a3) + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts diff --git a/tools/diff-uber-hashes b/tools/diff-uber-hashes new file mode 100755 index 0000000..6b36814 --- /dev/null +++ b/tools/diff-uber-hashes @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +"""Compare two UBER joeylog.txt files by per-op surface hash. + +Used by the planar 68k rewrite (project_planar_68k_plan.md): IIgs +captures the golden reference, each 68k port re-runs UBER after a +primitive conversion, and this tool tells you which ops produced +different pixels. Without this, "looks right visually" misses the +subtle mismatches that cascade into hard-to-debug corruption. + +Usage: + tools/diff-uber-hashes + +Exit code: + 0 = all hashes match + 1 = at least one mismatch + 2 = usage error or missing file +""" + +import re +import sys + +# Match e.g.: +# UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4 +LINE_RE = re.compile( + r"UBER:\s+(?P[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+\d+\s+ops/sec\s+\|\s+hash=(?P[0-9A-Fa-f]+)" +) + + +def parse_log(path): + """Return ordered dict {op_name: hash} from a UBER log file. + + Multiple runs may be concatenated in the same log (joeyLog appends) + -- in that case the LAST hash for each op wins, matching the most + recent run. 
+ """ + hashes = {} + with open(path) as f: + for line in f: + m = LINE_RE.search(line) + if m: + hashes[m.group("op").strip()] = m.group("hash").upper() + return hashes + + +def main(argv): + if len(argv) != 3: + sys.stderr.write( + "usage: diff-uber-hashes \n" + ) + return 2 + + try: + ref = parse_log(argv[1]) + test = parse_log(argv[2]) + except OSError as e: + sys.stderr.write(f"error: {e}\n") + return 2 + + if not ref: + sys.stderr.write(f"error: no UBER hash lines found in {argv[1]}\n") + return 2 + if not test: + sys.stderr.write(f"error: no UBER hash lines found in {argv[2]}\n") + return 2 + + mismatches = 0 + matches = 0 + for op, ref_hash in ref.items(): + test_hash = test.get(op) + if test_hash is None: + print(f" MISSING in test: {op} (ref={ref_hash})") + mismatches += 1 + elif test_hash != ref_hash: + print(f" MISMATCH {op}: ref={ref_hash} test={test_hash}") + mismatches += 1 + else: + matches += 1 + + extras = [op for op in test if op not in ref] + for op in extras: + print(f" EXTRA in test: {op} (test={test[op]})") + + total = len(ref) + len(extras) + print() + if mismatches == 0 and not extras: + print(f"OK: {matches}/{total} ops match") + return 0 + print(f"FAIL: {matches} match, {mismatches} mismatch, {len(extras)} extras") + return 1 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tools/diff-uber-perf b/tools/diff-uber-perf new file mode 100755 index 0000000..82df37d --- /dev/null +++ b/tools/diff-uber-perf @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +"""Compare two UBER joeylog.txt files by per-op ops/sec. + +Sibling of diff-uber-hashes (which compares pixel correctness). This +tool drives Phase 10 of project_planar_68k_plan.md: pick the +biggest perf gaps vs the IIgs reference and target asm/algorithmic +optimization at those. + +Usage: + tools/diff-uber-perf [--threshold 1.0] + +Output is sorted by speed ratio (test/ref) ascending, so the worst +gaps print first. Ops missing from either log are flagged. 
The +threshold flag (default 1.0) marks ops below that ratio as FAIL -- +project_perf_directive.md says "IIgs is the perf floor; every +other target must match or beat it", so parity = 1.0x. Use +--threshold 0.8 for the project_planar_68k_plan looser acceptance. + +Exit code: + 0 = all common ops at >= threshold + 1 = at least one op below threshold (or missing) + 2 = usage error or missing file +""" + +import re +import sys + +# Match e.g.: +# UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4 +LINE_RE = re.compile( + r"UBER:\s+(?P[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+(?P\d+)\s+ops/sec" +) + + +def parse_log(path): + """Return ordered dict {op_name: ops_per_sec} from a UBER log file. + + Multiple runs may be concatenated (joeyLog appends); last value + for each op wins, matching the most recent run. + """ + perf = {} + with open(path) as f: + for line in f: + m = LINE_RE.search(line) + if m: + perf[m.group("op").strip()] = int(m.group("ops")) + return perf + + +def main(argv): + threshold = 1.0 + args = [] + i = 1 + while i < len(argv): + if argv[i] == "--threshold" and i + 1 < len(argv): + try: + threshold = float(argv[i + 1]) + except ValueError: + sys.stderr.write(f"error: bad threshold {argv[i+1]}\n") + return 2 + i += 2 + else: + args.append(argv[i]) + i += 1 + + if len(args) != 2: + sys.stderr.write( + "usage: diff-uber-perf [--threshold 1.0]\n" + ) + return 2 + + try: + ref = parse_log(args[0]) + test = parse_log(args[1]) + except OSError as e: + sys.stderr.write(f"error: {e}\n") + return 2 + + if not ref: + sys.stderr.write(f"error: no UBER lines found in {args[0]}\n") + return 2 + if not test: + sys.stderr.write(f"error: no UBER lines found in {args[1]}\n") + return 2 + + rows = [] + for op, ref_ops in ref.items(): + test_ops = test.get(op) + if test_ops is None: + rows.append((op, ref_ops, None, None, "MISSING")) + continue + if ref_ops == 0: + ratio = float("inf") if test_ops > 0 else 1.0 + else: + ratio = test_ops / 
ref_ops + status = "ok" if ratio >= threshold else "FAIL" + rows.append((op, ref_ops, test_ops, ratio, status)) + + extras = [(op, None, test[op], None, "EXTRA") for op in test if op not in ref] + + # Sort: missing/fail first by worst ratio, then ok ascending by ratio. + def sort_key(row): + op, refv, testv, ratio, status = row + if status == "MISSING": + return (0, 0.0, op) + if status == "EXTRA": + return (3, 0.0, op) + return (1 if status == "FAIL" else 2, ratio, op) + + rows.sort(key=sort_key) + + op_w = max(len(op) for op in ref) if ref else 8 + op_w = max(op_w, max((len(op) for op in test), default=8), len("op")) + + print(f"{'op':<{op_w}} {'ref':>10} {'test':>10} {'ratio':>7} status") + print(f"{'-'*op_w} {'-'*10} {'-'*10} {'-'*7} ------") + fails = 0 + for op, refv, testv, ratio, status in rows + extras: + refs = "" if refv is None else str(refv) + tests = "" if testv is None else str(testv) + rats = "" if ratio is None else f"{ratio:.2f}x" + print(f"{op:<{op_w}} {refs:>10} {tests:>10} {rats:>7} {status}") + if status in ("FAIL", "MISSING"): + fails += 1 + + print() + print(f"threshold: {threshold:.2f}x ({len(rows)} ops compared, {fails} below threshold)") + return 1 if fails > 0 else 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv))