diff --git a/make/atarist.mk b/make/atarist.mk index 38bc119..b516aa2 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -37,6 +37,7 @@ LIB_OBJS := \ $(patsubst $(SRC_PORT)/atarist/%.s,$(BUILD)/obj/port/%.o,$(PORT_S_SRCS)) \ $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \ $(BUILD)/obj/codegen/spriteEmit68k.o \ + $(BUILD)/obj/codegen/spriteEmitInterleaved68k.o \ $(BUILD)/obj/codegen/spriteCompile.o LIB := $(LIBDIR)/libjoey.a diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index 750283f..e051adc 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -37,7 +37,7 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift #elif defined(JOEYLIB_PLATFORM_AMIGA) return spriteEmitDrawPlanar68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_ATARIST) - return spriteEmitDraw68k(out, sp, shift); + return spriteEmitDrawInterleaved68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitDrawIigs(out, sp, shift); #else @@ -57,7 +57,7 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift #elif defined(JOEYLIB_PLATFORM_AMIGA) return spriteEmitSavePlanar68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_ATARIST) - return spriteEmitSave68k(out, sp, shift); + return spriteEmitSaveInterleaved68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitSaveIigs(out, sp, shift); #else @@ -73,7 +73,7 @@ static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t sh #elif defined(JOEYLIB_PLATFORM_AMIGA) return spriteEmitRestorePlanar68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_ATARIST) - return spriteEmitRestore68k(out, sp, shift); + return spriteEmitRestoreInterleaved68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitRestoreIigs(out, sp, shift); #else @@ -139,6 +139,15 @@ bool spriteCompile(SpriteT *sp) { free(scratch); return false; } + if (totalSize == 0) { + /* Platforms whose emitter 
returns 0 for every (shift, op) have + * no compiled bytes -- spriteCompiledDraw / SaveUnder / + * RestoreUnder would dereference a degenerate slot or chunky + * shadow. Bail so sp->slot stays NULL and the dispatcher + * routes through the interpreted halSpriteXxxPlanes path. */ + free(scratch); + return false; + } slot = codegenArenaAlloc(totalSize); if (slot == NULL) { @@ -684,6 +693,68 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); } +#elif defined(JOEYLIB_PLATFORM_ATARIST) + +/* ST word-interleaved planar runtime dispatch. The JIT routine takes + * one arg: groupBase = pd->base + y*160 + (x>>4)*8 (the address of + * the first 16-pixel group the sprite touches). It walks rows by + * adda.w #160 at the end of each row. Per (row, tile_col, plane) it + * emits up to one move.b / clr.b / andi.b+ori.b / ori.b chain at + * d16(a0). + * + * shift selection (in spriteInternal.h SPRITE_SHIFT_INDEX): + * 0 : byte-aligned x with x mod 16 == 0 (first tile col high half) + * 1 : byte-aligned x with x mod 16 == 8 (first tile col low half) + * 2+ : non-byte-aligned x, never compiled (emitter returns 0); the + * per-shift offset is SPRITE_NOT_COMPILED so the dispatcher + * falls back to halSpriteDrawPlanes. */ +void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { + typedef void (*DrawFn)(uint8_t *groupBase); + uint8_t shift; + uint16_t routeOffset; + uint8_t *base; + uint8_t *groupBase; + DrawFn fn; + + shift = SPRITE_SHIFT_INDEX(x); + routeOffset = sp->routineOffsets[shift][SPRITE_OP_DRAW]; + if (routeOffset == SPRITE_NOT_COMPILED) { + /* Non-byte-aligned x: cross-platform spriteDraw will call + * halSpriteDrawPlanes after this returns (since the dispatcher + * already chose the compiled path based on sp->slot != NULL, + * but COMPILED_SPRITE_WRITES_PLANES is 1 on ST so it normally + * suppresses the planes hook). 
For non-aligned shifts we + * deliberately want the interpreted planes hook to run, so + * delegate via halSpriteDrawPlanes here. */ + halSpriteDrawPlanes(dst, sp, x, y); + return; + } + base = halSurfacePlanePtr(dst, 0); + if (base == NULL) { + return; + } + groupBase = base + + (uint16_t)y * 160u + + (uint16_t)((uint16_t)x >> 4) * 8u; + fn = (DrawFn)(codegenArenaBase() + sp->slot->offset + routeOffset); + fn(groupBase); +} + + +/* Save/Restore aren't compiled on ST yet (emitter returns 0). The + * dispatcher's check on sp->routineOffsets[shift][SPRITE_OP_SAVE/_RESTORE] + * == SPRITE_NOT_COMPILED already routes those through the + * interpreted halSpriteSavePlanes / halSpriteRestorePlanes. These + * stubs exist only to satisfy the linker. */ +void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { + (void)src; (void)sp; (void)x; (void)y; (void)backup; +} + + +void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { + (void)dst; (void)backup; +} + #else void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { diff --git a/src/codegen/spriteEmitInterleaved68k.c b/src/codegen/spriteEmitInterleaved68k.c new file mode 100644 index 0000000..32a7f5c --- /dev/null +++ b/src/codegen/spriteEmitInterleaved68k.c @@ -0,0 +1,220 @@ +// 68k sprite codegen for ST word-interleaved planar layout. Emits a +// cdecl-callable routine `void draw(uint8_t *groupBase)` that walks +// the sprite's tile data and writes plane bytes via `d16(a0)` chains. +// +// ST planar layout reminder (doc/atarist_planar.md): one buffer; per +// scanline 20 groups of 8 bytes; per group, 4 plane words back-to- +// back. groupBase points at the FIRST group the sprite touches: +// pd->base + y * 160 + (x >> 4) * 8 +// +// Shift index for ST is bit 3 of x (whether the sprite starts in the +// high half or low half of the first group). 
x mod 8 != 0 falls back +// to the interpreter (returns 0 from this emitter so sp->slot stays +// NULL for those alignments). +// +// Per (row, tile_col, plane) we emit one of: +// * nothing (op byte = 0, all transparent) +// * move.b #pbN, d16(a0) (op = 0xFF, full replace, 6 bytes) +// * clr.b d16(a0) (op = 0xFF AND pbN = 0, 4 bytes) +// * andi.b #~op, d16(a0) (op partial, pbN = 0, 6 bytes) +// * ori.b #pbN, d16(a0) (op partial, pbN == op, 6 bytes) +// * andi.b #~op + ori.b #pbN (mixed, 12 bytes) +// +// d16 is the byte offset from groupBase to the target plane byte. +// Layout of the byte offset: +// shift 0: byteOff = (col >> 1) * 8 + plane*2 + (col & 1) +// shift 1: byteOff = ((col + 1) >> 1) * 8 + plane*2 + (1 - (col & 1)) +// Each tile column is 8 sprite pixels = exactly half a 16-pixel +// group, alternating high (offset 0) and low (offset 1) bytes of +// each plane word. +// +// Per row we adda.w #160, a0 to advance to the next scanline. + +#include "joey/sprite.h" +#include "joey/surface.h" +#include "spriteEmitter.h" +#include "spriteInternal.h" + + +// ----- Constants ----- + +#define TILE_PIXELS 8 +#define TILE_BYTES 32 +#define TILE_BYTES_PER_ROW 4 +#define ST_BYTES_PER_ROW 160 + + +// ----- Helpers ----- + +static uint16_t writeBE16(uint8_t *out, uint16_t value) { + out[0] = (uint8_t)(value >> 8); + out[1] = (uint8_t)(value & 0xFFu); + return 2; +} + + +// Build the 4 plane bytes + opacity byte for one (row, tileCol) +// pair. pbN bit 7 is sprite pixel 0 (leftmost), bit 0 is pixel 7. +// op bit N is set iff that pixel's color != 0. 
+static void buildPlaneBytes(const SpriteT *sp, uint16_t row, uint16_t tileCol, + uint8_t *outPb0, uint8_t *outPb1, + uint8_t *outPb2, uint8_t *outPb3, + uint8_t *outOp) { + uint16_t tileY = (uint16_t)(row >> 3); + uint16_t inTileY = (uint16_t)(row & 7u); + uint16_t wTiles = sp->widthTiles; + const uint8_t *tileBytes = sp->tileData + (uint32_t)(tileY * wTiles + tileCol) * 32u; + const uint8_t *tileRow = tileBytes + (uint32_t)inTileY * TILE_BYTES_PER_ROW; + uint8_t pb0 = 0u; + uint8_t pb1 = 0u; + uint8_t pb2 = 0u; + uint8_t pb3 = 0u; + uint8_t op = 0u; + uint8_t p; + uint8_t b; + uint8_t color; + uint8_t bit; + + for (p = 0; p < 8u; p++) { + b = tileRow[p >> 1]; + color = (p & 1u) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); + bit = (uint8_t)(0x80u >> p); + if (color != 0u) { + op = (uint8_t)(op | bit); + if (color & 1u) pb0 = (uint8_t)(pb0 | bit); + if (color & 2u) pb1 = (uint8_t)(pb1 | bit); + if (color & 4u) pb2 = (uint8_t)(pb2 | bit); + if (color & 8u) pb3 = (uint8_t)(pb3 | bit); + } + } + *outPb0 = pb0; + *outPb1 = pb1; + *outPb2 = pb2; + *outPb3 = pb3; + *outOp = op; +} + + +// Emit code for one plane byte at d16(a0). Returns bytes written. +// op=opacity byte, pb=plane byte (subset of op). +static uint16_t emitPlaneByte(uint8_t *out, uint16_t cursor, uint16_t d16, uint8_t op, uint8_t pb) { + uint16_t start = cursor; + + if (op == 0u) { + return 0u; /* nothing to emit */ + } + if (op == 0xFFu) { + /* All 8 pixels opaque: replace the byte. */ + if (pb == 0u) { + /* clr.b d16(a0). Opcode 0x4228 + d16. 4 bytes. */ + cursor += writeBE16(out + cursor, 0x4228u); + cursor += writeBE16(out + cursor, d16); + } else { + /* move.b #pb, d16(a0). Opcode 0x117C + #imm word + d16. 6 bytes. */ + cursor += writeBE16(out + cursor, 0x117Cu); + cursor += writeBE16(out + cursor, (uint16_t)pb); + cursor += writeBE16(out + cursor, d16); + } + return (uint16_t)(cursor - start); + } + /* Partial opacity. pb is a subset of op. 
*/ + if (pb == 0u) { + /* All opaque pixels have plane bit 0: just clear those bits. */ + /* andi.b #~op, d16(a0). Opcode 0x0228 + #imm word + d16. 6 bytes. */ + cursor += writeBE16(out + cursor, 0x0228u); + cursor += writeBE16(out + cursor, (uint16_t)(~op & 0xFFu)); + cursor += writeBE16(out + cursor, d16); + return (uint16_t)(cursor - start); + } + if (pb == op) { + /* All opaque pixels have plane bit 1: just set those bits. */ + /* ori.b #op, d16(a0). Opcode 0x0028 + #imm word + d16. 6 bytes. */ + cursor += writeBE16(out + cursor, 0x0028u); + cursor += writeBE16(out + cursor, (uint16_t)op); + cursor += writeBE16(out + cursor, d16); + return (uint16_t)(cursor - start); + } + /* Mixed: clear opaque bits, then set the plane bits. */ + cursor += writeBE16(out + cursor, 0x0228u); + cursor += writeBE16(out + cursor, (uint16_t)(~op & 0xFFu)); + cursor += writeBE16(out + cursor, d16); + cursor += writeBE16(out + cursor, 0x0028u); + cursor += writeBE16(out + cursor, (uint16_t)pb); + cursor += writeBE16(out + cursor, d16); + return (uint16_t)(cursor - start); +} + + +// ----- Emit API ----- + +uint16_t spriteEmitDrawInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t plane; + uint16_t heightPx; + uint16_t wTiles; + uint8_t pb[4]; + uint8_t op; + + /* Only shifts 0 and 1 emit code. shift 0 = first tile col in + * high half (x mod 16 == 0). shift 1 = first tile col in low + * half (x mod 16 == 8). Other byte alignments fall through to + * the interpreter via halSpriteDrawPlanes. */ + if (shift > 1u) { + return 0u; + } + + cursor = 0u; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + wTiles = sp->widthTiles; + + /* Prologue: movea.l 4(sp), a0. Opcode 0x206F + d16=4. 4 bytes. */ + cursor += writeBE16(out + cursor, 0x206Fu); + cursor += writeBE16(out + cursor, 0x0004u); + + for (row = 0; row < heightPx; row++) { + if (row > 0u) { + /* adda.w #160, a0. Opcode 0xD0FC + imm word. 4 bytes. 
*/ + cursor += writeBE16(out + cursor, 0xD0FCu); + cursor += writeBE16(out + cursor, (uint16_t)ST_BYTES_PER_ROW); + } + for (col = 0; col < wTiles; col++) { + buildPlaneBytes(sp, row, col, &pb[0], &pb[1], &pb[2], &pb[3], &op); + if (op == 0u) { + continue; /* whole tile column row is transparent */ + } + for (plane = 0; plane < 4u; plane++) { + uint16_t d16; + if (shift == 0u) { + /* col 0 (high) -> +0, col 1 (low) -> +1, col 2 + * (high group 1) -> +8, ... */ + d16 = (uint16_t)((col >> 1) * 8 + plane * 2 + (col & 1u)); + } else { + /* col 0 (low) -> +1, col 1 (high group 1) -> +8, ... */ + d16 = (uint16_t)(((col + 1u) >> 1) * 8 + plane * 2 + (1u - (col & 1u))); + } + cursor += emitPlaneByte(out, cursor, d16, op, pb[plane]); + } + } + } + + /* Epilogue: rts. */ + cursor += writeBE16(out + cursor, 0x4E75u); + return cursor; +} + + +/* Save / restore aren't implemented yet -- returning 0 so they fall + * through to the C interpreter (halSpriteSavePlanes / halSpriteRestorePlanes + * fast paths cover the byte-aligned case). */ +uint16_t spriteEmitSaveInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + (void)out; (void)sp; (void)shift; + return 0u; +} + + +uint16_t spriteEmitRestoreInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + (void)out; (void)sp; (void)shift; + return 0u; +} diff --git a/src/codegen/spriteEmitter.h b/src/codegen/spriteEmitter.h index acd7169..694fe01 100644 --- a/src/codegen/spriteEmitter.h +++ b/src/codegen/spriteEmitter.h @@ -57,4 +57,15 @@ uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t sh uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +// Word-interleaved planar 68k emitter (ST). Calling convention for +// the emitted bytes: +// void draw(uint8_t *groupBase); +// where groupBase = pd->base + y*160 + (x>>4)*8. 
Shifts 0 and 1 emit +// real bytes (x mod 16 == 0 for shift 0, x mod 16 == 8 for shift 1); +// other shifts return 0 so the cross-platform dispatcher falls back +// to halSpriteDrawPlanes. +uint16_t spriteEmitDrawInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitSaveInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitRestoreInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift); + #endif diff --git a/src/core/sprite.c b/src/core/sprite.c index 177ca53..0fea341 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -31,7 +31,11 @@ // paths still need the hooks unconditionally on every platform -- the // chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook // is the only draw. -#if defined(JOEYLIB_PLATFORM_AMIGA) +/* ST also runs pure planar post-Phase-9 (s->pixels NULL); the JIT + * routine writes plane bytes directly, so the chunky interpreter + * is a no-op and the halSpriteDrawPlanes hook would be a redundant + * second draw. Same rationale as Amiga. */ +#if defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) #define COMPILED_SPRITE_WRITES_PLANES 1 #else #define COMPILED_SPRITE_WRITES_PLANES 0 diff --git a/src/core/spriteInternal.h b/src/core/spriteInternal.h index 99a6bd5..d2059c9 100644 --- a/src/core/spriteInternal.h +++ b/src/core/spriteInternal.h @@ -16,9 +16,15 @@ // Per-platform shift index used by the dispatcher. Chunky 4bpp ports // store one nibble per pixel pair so the only sub-byte alignment is // x % 2. Amiga planar packs 8 pixels per plane byte so all 8 -// alignments matter. +// alignments matter. ST word-interleaved planar groups 16 pixels +// per word; for byte-aligned x (x mod 8 == 0) the only meaningful +// distinction is high vs low byte of the plane word, which is bit +// 3 of x (== (x >> 3) & 1). Other shifts (x mod 8 != 0) emit 0 +// from the JIT and route to the interpreter. 
#if defined(JOEYLIB_PLATFORM_AMIGA) #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7)) +#elif defined(JOEYLIB_PLATFORM_ATARIST) +#define SPRITE_SHIFT_INDEX(x) ((uint8_t)(((x) & 7) ? 2u : (uint8_t)(((x) >> 3) & 1u))) #else #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1)) #endif diff --git a/src/core/tile.c b/src/core/tile.c index d84b585..16e36ef 100644 --- a/src/core/tile.c +++ b/src/core/tile.c @@ -141,11 +141,13 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); - dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; - srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; - - if (!halFastTileCopy(dstRow0, srcRow0)) { - copyTileOpaque(dstRow0, srcRow0); + /* Skip the chunky path on planar ports (pixels NULL). */ + if (dst->pixels != NULL && src->pixels != NULL) { + dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; + if (!halFastTileCopy(dstRow0, srcRow0)) { + copyTileOpaque(dstRow0, srcRow0); + } } halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, @@ -173,11 +175,13 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); - dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; - srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; - - if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { - copyTileMasked(dstRow0, srcRow0, transparentIndex); + /* Skip the chunky path on planar ports (pixels NULL). 
*/ + if (dst->pixels != NULL && src->pixels != NULL) { + dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; + if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { + copyTileMasked(dstRow0, srcRow0, transparentIndex); + } } halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, @@ -199,8 +203,9 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); - if (!halFastTileFill(s, bx, by, - (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { + if (s->pixels != NULL + && !halFastTileFill(s, bx, by, + (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; uint8_t i; for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) { @@ -232,16 +237,22 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) { } pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); - dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; src = &in->pixels[0]; - if (!halFastTilePaste(dstRow, src)) { - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - dstRow[0] = src[0]; - dstRow[1] = src[1]; - dstRow[2] = src[2]; - dstRow[3] = src[3]; - dstRow += SURFACE_BYTES_PER_ROW; - src += TILE_BYTES_PER_ROW; + /* Skip the chunky write path on planar ports (dst->pixels NULL) -- + * mirrors tileSnap's pixels-NULL short-circuit. Saves the dstRow + * SURFACE_ROW_OFFSET multiply + halFastTilePaste jsr/rts per call + * on ST/Amiga where the planar path below does the real work. 
*/ + if (dst->pixels != NULL) { + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; + if (!halFastTilePaste(dstRow, src)) { + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstRow[0] = src[0]; + dstRow[1] = src[1]; + dstRow[2] = src[2]; + dstRow[3] = src[3]; + dstRow += SURFACE_BYTES_PER_ROW; + src += TILE_BYTES_PER_ROW; + } } } halTilePastePlanes(dst, bx, by, &in->pixels[0]); diff --git a/src/port/atarist/audio.c b/src/port/atarist/audio.c index f6c5552..687e83b 100644 --- a/src/port/atarist/audio.c +++ b/src/port/atarist/audio.c @@ -39,6 +39,13 @@ #define ST_MFP_IMRA ((volatile uint8_t *)0xFFFFFA13L) #define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL) +// YM2149 (sound chip) supervisor-only ports. Index reg 7 (mixer) +// controls per-channel tone + noise enables; reg 8/9/A are volumes +// for channels A/B/C; regs 0/1, 2/3, 4/5 are tone period for those +// channels; reg 6 is noise period. +#define ST_YM_SELECT ((volatile uint8_t *)0xFFFF8800L) +#define ST_YM_DATA ((volatile uint8_t *)0xFFFF8802L) + #define MFP_TA_BIT 0x20 #define MFP_TACR_STOP 0x00 #define MFP_TACR_DIV200 0x07 @@ -90,6 +97,32 @@ static long installTimerA(void) { gNeedRefill[0] = 0; gNeedRefill[1] = 0; + // YM2149 setup for PWM-via-volume on channel A: + // reg 7 (mixer): set bits 0 (tone A off) and 3 (noise A off); + // preserve bits 6+7 (I/O port directions, used + // by TOS for floppy / keyboard / printer). + // reg 8 (channel A volume): start at 0 to avoid a pop at start. + // + // Without the mixer setup, whatever state TOS left noise A in + // gets gated by our 12 kHz volume writes -- if noise A was on, + // a constant volume = constant hiss. Standard PWM-DAC trick is + // to disable both tone and noise so the volume reg is a pure + // 4-bit amplitude DAC. + // + // We can't reliably read back YM regs on the ST (the data port + // returns last-write, not register contents), so we OR in the + // disable bits over an assumed-safe TOS-default mask. 
Bit 6 set + // (port A output) matches stock TOS; bit 7 set (port B output) + // matches the centronics-printer direction TOS configures. + *ST_YM_SELECT = 7; + *ST_YM_DATA = 0xFF; // all tones + noises off; I/O ports A+B output (TOS default) + *ST_YM_SELECT = 8; + *ST_YM_DATA = 0; // channel A volume = 0 to avoid a pop at start + *ST_YM_SELECT = 9; + *ST_YM_DATA = 0; // channel B volume = 0 + *ST_YM_SELECT = 10; + *ST_YM_DATA = 0; // channel C volume = 0 + // MFP Timer A: stop, install our vector, set prescaler 200 + data // 1 (= 2.4576 MHz / 200 = 12288 Hz), then start. *ST_MFP_TACR = MFP_TACR_STOP; @@ -108,6 +141,10 @@ static long uninstallTimerA(void) { (void)Setexc(VEC_MFP_TA, (long)gOldTimerAVec); gOldTimerAVec = NULL; } + /* Silence channel A volume so handoff back to TOS is clean (no + * residual DC level on the speaker). */ + *ST_YM_SELECT = 8; + *ST_YM_DATA = 0; return 0; } diff --git a/src/port/atarist/circle.s b/src/port/atarist/circle.s new file mode 100644 index 0000000..b7c65c9 --- /dev/null +++ b/src/port/atarist/circle.s @@ -0,0 +1,282 @@ +| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled. +| +| Mirrors src/port/amiga/circle.s in spirit but for ST's single +| word-interleaved planar buffer: +| * Per scanline: 20 groups of 8 bytes; each group is 4 plane +| words back-to-back (p0_word, p1_word, p2_word, p3_word). +| * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15). +| * Plane N's word at row y, group g: base + y*160 + g*8 + N*2. +| +| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp40 +| words) gives a branchless 4-plane RMW per pixel. 8 octants are +| inlined per Bresenham iter; no bsr. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kStCircleOutline(uint8_t *base, +| uint16_t cx, uint16_t cy, +| uint16_t r, uint8_t color); +| +| Register allocation: +| d2.w = bx (Bresenham) +| d3.w = by (Bresenham) +| d4.w = err (Bresenham) +| d5.w = cx (cached) +| a4 = cy (cached, sign-extended) +| a3 = base +| a5 = bitMaskWordLut +| d0,d1,d6,d7 = scratch +| +| Scratch block (40 bytes) at sp+0..39. Unlike the Amiga version, +| which uses byte masks, ST plane words need a WORD bit mask, so +| each xp record is 8 bytes: +| groupOff_word (2 bytes), bitMask_word (2 bytes), pad (4 bytes) +| where groupOff = (x >> 4) * 8 (byte offset of group within row) +| and bitMask = 1 << (15 - (x & 15)). notMask is not stored; +| PLOT_FIXED recomputes it from bitMask with not.w at plot time. +| +| sp+0..7: xp1 record (cx + bx) +| sp+8..15: xp2 record (cx - bx) +| sp+16..23: xp3 record (cx + by) +| sp+24..31: xp4 record (cx - by) +| sp+32..33: yp1_off (cy + by) * 160 +| sp+34..35: yp2_off (cy - by) * 160 +| sp+36..37: yp3_off (cy + bx) * 160 +| sp+38..39: yp4_off (cy - bx) * 160 +| Total: 40 bytes. + + .text + + +| ---- bit mask lookup: build 1 << (15 - (x & 15)) --------- +| Done inline in XP_REC via a 16-entry word table (a5 holds the +| base). Cheaper than a variable shift on 68000 (which is 8 + 2n +| cyc). Table is 32 bytes (16 words). 
+ +| ---- XP_REC: build xp record at sp+slot for xp = cx +| signOp: add or sub +| xreg: %d2 (bx) or %d3 (by) +| slot: 0, 8, 16, or 24 +| Trashes: d0, d1, d6, d7 + + .macro XP_REC slot, signOp, xreg + move.w %d5,%d6 | d6 = cx + \signOp\().w \xreg,%d6 | d6 = xp + move.w %d6,%d7 + lsr.w #4,%d7 | d7 = group + lsl.w #3,%d7 | d7 = group * 8 (byte offset) + and.w #15,%d6 | d6 = xp & 15 (0..15) + add.w %d6,%d6 | d6 *= 2 (word index) + move.w (%a5,%d6.w),%d6 | d6 = bitMask word + move.w %d7,\slot(%sp) | groupOff word + move.w %d6,\slot+2(%sp) | bitMask word + .endm + + +| ---- YP_REC: store (yp * 160) at sp+slot --------- +| yp = cy ; trashes d0, d6. + + .macro YP_REC slot, signOp, yreg + move.l %a4,%d6 + \signOp\().w \yreg,%d6 | d6.w = yp + move.w %d6,%d0 + lsl.w #5,%d6 | d6 = yp << 5 + lsl.w #7,%d0 | d0 = yp << 7 + add.w %d6,%d0 | d0 = yp * 160 + move.w %d0,\slot(%sp) + .endm + + +| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ---- +| slotYp: 32, 34, 36, or 38 (yp_off word slot) +| slotXp: 0, 8, 16, or 24 (xp record slot) +| color: literal 0..15 +| Trashes: d0, d1, d7 + + .macro PLOT_FIXED slotYp, slotXp, color + move.w \slotYp(%sp),%d0 | d0 = yp_off + add.w \slotXp(%sp),%d0 | d0 += groupOff + move.w \slotXp+2(%sp),%d1 | d1 = bitMask word + move.w %d1,%d7 + not.w %d7 | d7 = notMask + lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff (group ptr) + | 4 plane word RMWs at (a2)+, postinc walks p0->p1->p2->p3 + .if ((\color) & 1) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .if ((\color) & 2) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .if ((\color) & 4) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .if ((\color) & 8) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .endm + + +| ---- PLOT_8: 8 octant pixels for hardcoded color ---- + + .macro PLOT_8 color + PLOT_FIXED 32, 0, \color | (cx+bx, cy+by) + PLOT_FIXED 32, 8, \color | (cx-bx, cy+by) + PLOT_FIXED 34, 0, \color | (cx+bx, cy-by) + PLOT_FIXED 34, 8, \color | (cx-bx, cy-by) + 
PLOT_FIXED 36, 16, \color | (cx+by, cy+bx) + PLOT_FIXED 36, 24, \color | (cx-by, cy+bx) + PLOT_FIXED 38, 16, \color | (cx+by, cy-bx) + PLOT_FIXED 38, 24, \color | (cx-by, cy-bx) + .endm + + +| ---- CO_BODY: full Bresenham loop body for hardcoded color ---- + + .macro CO_BODY color + XP_REC 0, add, %d2 | xp1 = cx+bx + XP_REC 8, sub, %d2 | xp2 = cx-bx + XP_REC 16, add, %d3 | xp3 = cx+by + XP_REC 24, sub, %d3 | xp4 = cx-by + YP_REC 32, add, %d3 | yp1 = (cy+by)*160 + YP_REC 34, sub, %d3 | yp2 = (cy-by)*160 + YP_REC 36, add, %d2 | yp3 = (cy+bx)*160 + YP_REC 38, sub, %d2 | yp4 = (cy-bx)*160 + + PLOT_8 \color + + addq.w #1,%d3 + tst.w %d4 + bgt .LcoStDecX_\color + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .LcoStLoop_\color +.LcoStDecX_\color: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .LcoStLoop_\color + .endm + + + .macro CO_LOOP_HDR color +.LcoStLoop_\color: + cmp.w %d3,%d2 + bcs.w .LcoStDone + CO_BODY \color + .endm + + +| ---- Function entry ---- +| Stack on entry (after movem.l of 11 regs + lea): +| sp+0..39: scratch (40 bytes) +| sp+40..83: movem (44 bytes) +| sp+84..87: return PC +| sp+88+0: base (uint8_t *) +| sp+88+4: cx (int promoted, .w at +88+4+2) +| sp+88+8: cy (int promoted, .w at +88+8+2) +| sp+88+12: r (int promoted, .w at +88+12+2) +| sp+88+16: color (int promoted, byte at +88+16+3) + + .equ SP_SAVED, 44 + .equ SP_LOCAL, 40 + .equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL) + .equ SP_BASE, SP_OFF + 0 + .equ SP_CX, SP_OFF + 4 + 2 + .equ SP_CY, SP_OFF + 8 + 2 + .equ SP_R, SP_OFF + 12 + 2 + .equ SP_COLOR, SP_OFF + 16 + 3 + + .globl _surface68kStCircleOutline + +_surface68kStCircleOutline: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_LOCAL(%sp),%sp + + | Load base (a3) and bitMaskLut (a5). + move.l SP_BASE(%sp),%a3 + lea bitMaskWordLut(%pc),%a5 + + | Cache cx in d5, cy (sign-extended) in a4. + move.w SP_CX(%sp),%d5 + move.w SP_CY(%sp),%d6 + ext.l %d6 + movea.l %d6,%a4 + + | Bresenham init. 
+ move.w SP_R(%sp),%d2 | bx = r + moveq #0,%d3 | by = 0 + moveq #1,%d4 + sub.w %d2,%d4 | err = 1 - bx + + | Dispatch on color (low 4 bits) -> one of 16 main loops. + moveq #0,%d6 + move.b SP_COLOR(%sp),%d6 + and.w #0x0F,%d6 + add.w %d6,%d6 + add.w %d6,%d6 | * 4 for bra.w table + lea .LcoStTable(%pc),%a6 + jmp 0(%a6,%d6.w) + +.LcoStTable: + bra.w .LcoStLoop_0 + bra.w .LcoStLoop_1 + bra.w .LcoStLoop_2 + bra.w .LcoStLoop_3 + bra.w .LcoStLoop_4 + bra.w .LcoStLoop_5 + bra.w .LcoStLoop_6 + bra.w .LcoStLoop_7 + bra.w .LcoStLoop_8 + bra.w .LcoStLoop_9 + bra.w .LcoStLoop_10 + bra.w .LcoStLoop_11 + bra.w .LcoStLoop_12 + bra.w .LcoStLoop_13 + bra.w .LcoStLoop_14 + bra.w .LcoStLoop_15 + + CO_LOOP_HDR 0 + CO_LOOP_HDR 1 + CO_LOOP_HDR 2 + CO_LOOP_HDR 3 + CO_LOOP_HDR 4 + CO_LOOP_HDR 5 + CO_LOOP_HDR 6 + CO_LOOP_HDR 7 + CO_LOOP_HDR 8 + CO_LOOP_HDR 9 + CO_LOOP_HDR 10 + CO_LOOP_HDR 11 + CO_LOOP_HDR 12 + CO_LOOP_HDR 13 + CO_LOOP_HDR 14 + CO_LOOP_HDR 15 + +.LcoStDone: + lea SP_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15. +bitMaskWordLut: + .word 0x8000, 0x4000, 0x2000, 0x1000 + .word 0x0800, 0x0400, 0x0200, 0x0100 + .word 0x0080, 0x0040, 0x0020, 0x0010 + .word 0x0008, 0x0004, 0x0002, 0x0001 diff --git a/src/port/atarist/fillCircle.s b/src/port/atarist/fillCircle.s new file mode 100644 index 0000000..ba508df --- /dev/null +++ b/src/port/atarist/fillCircle.s @@ -0,0 +1,292 @@ +| Atari ST word-interleaved planar fillCircle -- 68000 hand-rolled. +| +| Bresenham midpoint circle, 4 horizontal spans per Bresenham iter, +| paired by shared x-range so leftMask/rightMask are computed once +| per pair: +| Pair A: x in [cx-bx, cx+bx], rows y = cy+by, cy-by +| Pair B: x in [cx-by, cx+by], rows y = cy+bx, cy-bx +| +| Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r) +| is fully on-surface. Off-surface circles fall back to the C walker. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kStFillCircle(uint8_t *base, +| uint16_t cx, uint16_t cy, +| uint16_t r, uint8_t color); +| +| Register allocation across the loop: +| d2.w = bx (Bresenham, starts at r) +| d3.w = by (Bresenham, starts at 0) +| d4.w = err +| d5.l = loLong (planes 0+1 long template) +| d6.l = hiLong (planes 2+3 long template) +| d7.b = color (low nibble; tested via btst) +| a3 = base +| a4 = scratch / current group pointer +| d0,d1 = scratch +| +| Stack scratch (8 bytes at 0(sp)..7(sp)): +| 0..1 leftMask (word; per pair) +| 2..3 rightMask (word; per pair) +| 4..5 numGroups (word; per pair) +| 6..7 groupFirstByteOff (word; per pair) + + .text + + + .equ SP_FC_SAVED, 44 + .equ SP_FC_LOCAL, 8 + .equ SP_FC_OFF, (SP_FC_SAVED + 4 + SP_FC_LOCAL) + .equ SP_FC_BASE, SP_FC_OFF + 0 + .equ SP_FC_CX, SP_FC_OFF + 4 + 2 + .equ SP_FC_CY, SP_FC_OFF + 8 + 2 + .equ SP_FC_R, SP_FC_OFF + 12 + 2 + .equ SP_FC_COLOR, SP_FC_OFF + 16 + 3 + + +| ---- COMPUTE_PAIR_MASKS macro ----------------------------------- +| Input: d0.w = left, d1.w = right +| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups, +| 6(sp) groupFirstByteOff +| Trashes: d0, d1 +| (No labels: straightline.) 
+ + .macro COMPUTE_PAIR_MASKS + move.w %d0,0(%sp) | stash left + move.w %d1,2(%sp) | stash right + | groupFirst & groupFirstByteOff + move.w %d0,%d1 + lsr.w #4,%d1 | groupFirst + move.w %d1,%d0 + lsl.w #3,%d0 | groupFirstByteOff + move.w %d0,6(%sp) + | numGroups = (right >> 4) - groupFirst + move.w 2(%sp),%d0 + lsr.w #4,%d0 | groupLast + sub.w %d1,%d0 | numGroups + move.w %d0,4(%sp) + | leftMask via LUT[bitFirst]; a5 = leftMaskLut base + move.w 0(%sp),%d0 + and.w #15,%d0 + add.w %d0,%d0 + move.w (%a5,%d0.w),%d1 + move.w %d1,0(%sp) + | rightMask via LUT[bitLast]; a6 = rightMaskLut base + move.w 2(%sp),%d0 + and.w #15,%d0 + add.w %d0,%d0 + move.w (%a6,%d0.w),%d1 + move.w %d1,2(%sp) + .endm + + +| ---- SPAN_BODY macro -------------------------------------------- +| Render one row span using the pair masks at 0(sp)..7(sp). +| Input: d0.w = y (signed) +| a3 = base, d5 = loLong, d6 = hiLong, d7 = color +| Trashes: d0, d1, a4 +| Macro takes an idx parameter for unique labels. + + .macro SPAN_BODY + | a4 = base + y*160 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 | y*160 + lea 0(%a3,%d0.l),%a4 + | a4 += groupFirstByteOff + moveq #0,%d0 + move.w 6(%sp),%d0 + add.l %d0,%a4 + | numGroups in d1 + move.w 4(%sp),%d1 + tst.w %d1 + bne.s .Lsb_multi\@ + | single-group: combinedMask = leftMask & rightMask + move.w 0(%sp),%d0 + and.w 2(%sp),%d0 + bsr .Lfc_applyMask + bra.w .Lsb_done\@ +.Lsb_multi\@: + | leading mask. applyMask postinc-advances a4 by 8 + | (the 4 plane RMWs each advance by 2 via (a4)+). + | applyMask trashes d1, so reload numGroups after bsr. 
+ move.w 0(%sp),%d0 + bsr .Lfc_applyMask + move.w 4(%sp),%d1 | reload numGroups + subq.w #1,%d1 | d1 = numMid + beq.s .Lsb_skipMid\@ +.Lsb_midLoop\@: + move.l %d5,(%a4)+ + move.l %d6,(%a4)+ + subq.w #1,%d1 + bne.s .Lsb_midLoop\@ +.Lsb_skipMid\@: + | trailing mask + move.w 2(%sp),%d0 + bsr .Lfc_applyMask +.Lsb_done\@: + .endm + + + .globl _surface68kStFillCircle + +_surface68kStFillCircle: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_FC_LOCAL(%sp),%sp + + | base, color + move.l SP_FC_BASE(%sp),%a3 + moveq #0,%d7 + move.b SP_FC_COLOR(%sp),%d7 + + | LUT bases (PC-relative indexed has only 8-bit + | displacement, so cache full pointers in a-regs). + lea leftMaskLut(%pc),%a5 + lea rightMaskLut(%pc),%a6 + + | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0) + moveq #0,%d5 + btst #1,%d7 + beq.s .Lfc_lo1 + move.w #-1,%d5 +.Lfc_lo1: + btst #0,%d7 + beq.s .Lfc_lo0 + ori.l #0xFFFF0000,%d5 +.Lfc_lo0: + | hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0) + moveq #0,%d6 + btst #3,%d7 + beq.s .Lfc_hi3 + move.w #-1,%d6 +.Lfc_hi3: + btst #2,%d7 + beq.s .Lfc_hi2 + ori.l #0xFFFF0000,%d6 +.Lfc_hi2: + + | Bresenham init: bx=r, by=0, err=1-bx + move.w SP_FC_R(%sp),%d2 + moveq #0,%d3 + moveq #1,%d4 + sub.w %d2,%d4 + +.Lfc_loop: + cmp.w %d3,%d2 + bcs.w .Lfc_done + + | --- Pair A: x range = (cx - bx, cx + bx) + move.w SP_FC_CX(%sp),%d0 + move.w %d0,%d1 + sub.w %d2,%d0 | left = cx - bx + add.w %d2,%d1 | right = cx + bx + COMPUTE_PAIR_MASKS + + | Span A1: y = cy + by + move.w SP_FC_CY(%sp),%d0 + add.w %d3,%d0 + SPAN_BODY + + | Span A2: y = cy - by + move.w SP_FC_CY(%sp),%d0 + sub.w %d3,%d0 + SPAN_BODY + + | --- Pair B: x range = (cx - by, cx + by) + move.w SP_FC_CX(%sp),%d0 + move.w %d0,%d1 + sub.w %d3,%d0 | left = cx - by + add.w %d3,%d1 | right = cx + by + COMPUTE_PAIR_MASKS + + | Span B1: y = cy + bx + move.w SP_FC_CY(%sp),%d0 + add.w %d2,%d0 + SPAN_BODY + + | Span B2: y = cy - bx + move.w SP_FC_CY(%sp),%d0 + sub.w %d2,%d0 + SPAN_BODY + + | --- Bresenham step + addq.w #1,%d3 + tst.w 
%d4 + bgt.s .Lfc_decBx + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .Lfc_loop +.Lfc_decBx: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .Lfc_loop + + +.Lfc_done: + lea SP_FC_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + +| ---- Apply 4-plane mask at (a4) ------------------------------- +| Input: d0.w = mask, d7.b = color, a4 = group ptr +| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8. +| Trashes: d0, d1 +| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane +| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc). + +.Lfc_applyMask: + move.w %d0,%d1 + not.w %d1 | d1 = notMask + btst #0,%d7 + beq.s .Lfc_am0a + or.w %d0,(%a4)+ + bra.s .Lfc_am1 +.Lfc_am0a: + and.w %d1,(%a4)+ +.Lfc_am1: + btst #1,%d7 + beq.s .Lfc_am1a + or.w %d0,(%a4)+ + bra.s .Lfc_am2 +.Lfc_am1a: + and.w %d1,(%a4)+ +.Lfc_am2: + btst #2,%d7 + beq.s .Lfc_am2a + or.w %d0,(%a4)+ + bra.s .Lfc_am3 +.Lfc_am2a: + and.w %d1,(%a4)+ +.Lfc_am3: + btst #3,%d7 + beq.s .Lfc_am3a + or.w %d0,(%a4)+ + rts +.Lfc_am3a: + and.w %d1,(%a4)+ + rts + + + .align 2 +| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15) +leftMaskLut: + .word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF + .word 0x0FFF, 0x07FF, 0x03FF, 0x01FF + .word 0x00FF, 0x007F, 0x003F, 0x001F + .word 0x000F, 0x0007, 0x0003, 0x0001 + +| rightMaskLut[i] = ~((1 << (15 - i)) - 1), indexed by bitLast (0..15) +rightMaskLut: + .word 0x8000, 0xC000, 0xE000, 0xF000 + .word 0xF800, 0xFC00, 0xFE00, 0xFF00 + .word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0 + .word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 2e77041..77a5c5c 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -36,6 +36,8 @@ #include "hal.h" #include "surfaceInternal.h" +#include "spriteInternal.h" +#include "joey/tile.h" #include "draw68k_inline.h" // ----- Constants ----- @@ -45,8 +47,32 @@ // with each word holding the 16 one-bit 
samples for one bitplane. #define ST_BYTES_PER_ROW 160 #define ST_GROUPS_PER_ROW 20 +#define ST_BYTES_PER_GROUP 8 // 4 plane words back-to-back +#define ST_PLANE_OFF_BYTES 2 // step between adjacent plane words within a group +#define ST_BITPLANES 4 +#define ST_PLANAR_SIZE (ST_BYTES_PER_ROW * SURFACE_HEIGHT) #define ST_SCREEN_ALIGN 256 + +// ----- Per-surface planar storage (project_planar_68k_plan, ST Phase 2) ----- +// +// ST has word-interleaved planar: ONE 32000-byte buffer per surface +// holds all 4 planes packed 8 bytes per 16-pixel group. Per-scanline +// layout (160 bytes) is 20 groups of 8 bytes, where each group holds +// p0_word, p1_word, p2_word, p3_word back-to-back. Compare with +// Amiga's 4 separate plane buffers -- same total bytes, very different +// access pattern. +// +// The stage gets its own SHADOW planar buffer (NOT aliased to +// gScreenBase) so drawing primitives don't appear until stagePresent +// memcpy's shadow -> screen. Same rationale as Amiga's per-stage +// shadow planes. +typedef struct { + uint8_t *base; // 32000-byte interleaved planar buffer + uint8_t *raw; // unaligned malloc result for free() + bool ownsBuffer; // true = we malloc'd, false = aliased +} StPlanarT; + // Shifter palette registers: 16 words at $FFFF8240..$FFFF825F. #define ST_PALETTE_REGS ((volatile uint16_t *)0xFFFF8240L) @@ -65,8 +91,50 @@ // ----- Prototypes ----- +// Phase 10: planar primitive helpers must be visible everywhere they +// could inline. Defined up here (between StPlanarT and the rest of +// the prototype block) so every halFast* / fillSpan / circle walker +// can fold the 4-plane RMW directly into its body. always_inline +// hammers the point home for gcc-mint's conservative inliner. 
+ +static inline __attribute__((always_inline)) void stApplyMaskToGroup(uint8_t *groupBase, uint16_t mask, uint8_t color) { + uint16_t notMask = (uint16_t)~mask; + uint16_t *pw = (uint16_t *)groupBase; + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | mask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | mask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | mask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = (uint16_t)(pw[3] | mask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } +} + + +static inline __attribute__((always_inline)) void stPlanarSetPixel(StPlanarT *pd, int16_t x, int16_t y, uint8_t color) { + uint16_t group = (uint16_t)((uint16_t)x >> 4); + uint16_t bitMask = (uint16_t)(1u << (15u - ((uint16_t)x & 15u))); + uint16_t notMask = (uint16_t)~bitMask; + uint16_t *pw = (uint16_t *)(pd->base + + (uint16_t)y * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP); + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | bitMask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | bitMask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | bitMask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = (uint16_t)(pw[3] | bitMask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } +} + + +static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPlanarT *pd, int16_t x, int16_t y) { + uint16_t group = (uint16_t)((uint16_t)x >> 4); + uint16_t bitMask = (uint16_t)(1u << (15u - ((uint16_t)x & 15u))); + const uint16_t *pw = (const uint16_t *)(pd->base + + (uint16_t)y * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP); + uint8_t c = 0u; + if (pw[0] & bitMask) { c = (uint8_t)(c | 1u); } + if (pw[1] & bitMask) { c = (uint8_t)(c | 2u); } + if (pw[2] & bitMask) { c = (uint8_t)(c | 4u); } + if (pw[3] & bitMask) { c = (uint8_t)(c | 8u); 
} + return c; +} static uint16_t quantizeColorToSt(uint16_t orgb); -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd); static void flattenScbPalettes(const SurfaceT *src); static void initC2pLut(void); static void writeDiagnostics(void); @@ -138,6 +206,55 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]; static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; static bool gCacheValid = false; +// 256-long plane-spread LUT for the asm sprite SAVE path (defined in +// spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each +// of b's 8 bits is placed at the bit-0 position of the corresponding +// pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT +// entry left by N to get plane N's contribution; OR'd across 4 planes +// gives the full chunky long. Initialized lazily. +// +// LUT used by surface68kStSpriteSaveByteAligned. The asm reads via +// `move.l (a_ptr, d0.l), d4` which requires the LUT to be long- +// aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse, +// the cascading offsets from the odd-sized gC2pLut put even +// `uint32_t` BSS slots at addr mod 4 == 2. +// +// Fix: malloc the LUT. mintlib's malloc returns long-aligned memory. +// The pointer is passed to the asm via the C-side wrapper (so the +// asm reads it from the stack, where it's guaranteed long-aligned +// regardless of where the static pointer slot lives). 
+static uint32_t *gStPlaneSpreadLutPtr = NULL; +static bool gStPlaneSpreadLutReady = false; + +static bool initStPlaneSpreadLut(void) { + int b; + int i; + + if (gStPlaneSpreadLutReady) { + return true; + } + gStPlaneSpreadLutPtr = (uint32_t *)malloc(256 * sizeof(uint32_t)); + if (gStPlaneSpreadLutPtr == NULL) { + return false; + } + + for (b = 0; b < 256; b++) { + uint32_t v = 0u; + for (i = 0; i < 8; i++) { + if (b & (0x80 >> i)) { + int byteIdx = i >> 1; + int isHigh = ((i & 1) == 0); + int bitInLong = (3 - byteIdx) * 8 + (isHigh ? 4 : 0); + v |= (uint32_t)1u << bitInLong; + } + } + gStPlaneSpreadLutPtr[b] = v; + } + gStPlaneSpreadLutReady = true; + return true; +} + + // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt // (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] // = the 2-bit plane-byte contribution for source byte `src` at @@ -146,27 +263,12 @@ static bool gCacheValid = false; // the same table feeds both halves of an ST plane word: positions // 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low // byte. Built once by initC2pLut on the first halPresent call. -static uint8_t gC2pLut[4 * 1024]; +/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */ +uint8_t gC2pLut[4 * 1024]; static bool gC2pLutReady = false; // ----- Internal helpers (alphabetical) ----- -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd) { - int16_t y; - const uint8_t *srcLine; - uint16_t *dstLine; - - if (!gC2pLutReady) { - initC2pLut(); - } - for (y = y0; y < y1; y++) { - srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW]; - dstLine = (uint16_t *)&gScreenBase[y * ST_BYTES_PER_ROW]; - chunkyToPlanarRowSt(srcLine, dstLine, groupStart, groupEnd, gC2pLut); - } -} - - // Scan the surface's SCB and record one transition entry for each // run of the same palette index. 
gBandCount is the number of // distinct bands; gBandStart[i] is the display line where band i @@ -499,20 +601,31 @@ const char *halLastError(void) { void halPresent(const SurfaceT *src) { - int16_t y; - uint8_t minWord; - uint8_t maxWord; - uint16_t groupStart; - uint16_t groupEnd; + StPlanarT *pd; + int16_t y; + uint8_t minWord; + uint8_t maxWord; + uint16_t groupStart; + uint16_t groupEnd; + uint16_t byteStart; + uint16_t byteLen; if (src == NULL || !gModeSet) { return; } + pd = (StPlanarT *)src->portData; + if (pd == NULL) { + return; + } refreshPaletteStateIfNeeded(src); - // Walk per-row dirty bands: each c2p group covers 16 px = 4 chunky - // words, so groupStart = minWord/4 and groupEnd = maxWord/4 + 1 - // converts dirty-word units to c2pRange's group units. + // Phase 9: planar shadow -> screen RAM. Same dirty-word band + // tracking the c2p path used; just memcpy the planar bytes for + // each band instead of running c2p on the chunky shadow. Each + // dirty word covers 4 pixels = ?of one group = quarter of an + // 8-byte group. We round to whole groups (8 bytes each) for a + // simple aligned memcpy, since planar groups are the natural + // copy unit. for (y = 0; y < SURFACE_HEIGHT; y++) { minWord = gStageMinWord[y]; maxWord = gStageMaxWord[y]; @@ -521,7 +634,11 @@ void halPresent(const SurfaceT *src) { } groupStart = (uint16_t)(minWord >> 2); groupEnd = (uint16_t)((maxWord >> 2) + 1); - c2pRange(src, y, (int16_t)(y + 1), groupStart, groupEnd); + byteStart = (uint16_t)(groupStart * ST_BYTES_PER_GROUP); + byteLen = (uint16_t)((groupEnd - groupStart) * ST_BYTES_PER_GROUP); + memcpy(&gScreenBase[(uint16_t)y * ST_BYTES_PER_ROW + byteStart], + &pd->base [(uint16_t)y * ST_BYTES_PER_ROW + byteStart], + byteLen); } } @@ -563,6 +680,15 @@ void halShutdown(void) { return; } + // Stop the audio Timer A first. 
The audio HAL has its own + // halAudioShutdown that disables Timer A and restores the vector, + // but cross-platform joeyShutdown doesn't call it -- if a sketch + // forgets joeyAudioShutdown(), Timer A keeps firing after our + // code unloads and TOS panics on the first dangling vector hit. + // Calling halAudioShutdown here is idempotent (gReady guard), + // so explicit-shutdown sketches still work. + halAudioShutdown(); + // Disable MFP Timer B and restore the exception vectors before // changing the screen -- a late ISR firing mid-Setscreen would // write palette into whatever buffer TOS remapped. @@ -587,112 +713,279 @@ void halShutdown(void) { extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte); extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte); extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte); +extern void surface68kStCircleOutline(uint8_t *base, uint16_t cx, uint16_t cy, uint16_t r, uint8_t color); +extern void surface68kStDrawLine(uint8_t *base, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t color); +extern void surface68kStFillSpan(uint8_t *base, int16_t left, int16_t right, int16_t y, uint8_t color); +extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint16_t r, uint8_t color); +extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color); +extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color); +extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong); +extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut); +extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const 
uint8_t *c2pLut); +// Phase 9: clear the entire planar buffer to a 4-bit color. Build an +// 8-byte group template (4 plane words: 0xFFFF or 0x0000 each by +// color bit) then stream it across all 4000 groups via long stores. bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { + StPlanarT *pd; + uint8_t color; + uint32_t loLong; + uint32_t hiLong; + uint32_t *p32; + uint16_t groups; + if (s != stageGet()) { return false; } - surface68kClearLong(s->pixels, (uint16_t)doubled); + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return true; + } + color = (uint8_t)(doubled & 0x0Fu); + /* Per-group: [p0_word][p1_word][p2_word][p3_word] = 8 bytes = 2 longs. + * loLong = (p0_word << 16) | p1_word; hiLong = (p2_word << 16) | p3_word. */ + loLong = ((color & 1u) ? 0xFFFF0000ul : 0ul) + | ((color & 2u) ? 0x0000FFFFul : 0ul); + hiLong = ((color & 4u) ? 0xFFFF0000ul : 0ul) + | ((color & 8u) ? 0x0000FFFFul : 0ul); + (void)p32; + (void)groups; + surface68kStLongFill(pd->base, + (uint16_t)(ST_PLANAR_SIZE / ST_BYTES_PER_GROUP), + loLong, hiLong); return true; } -// Fast path bands: -// - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per -// row via surface68kFillRectFull. Always word-aligned. -// - x % 4 == 0 && w even (word-aligned): byte index = x/2, so x must -// be a multiple of 4 for the move.l writes inside the asm to land -// on even addresses (68000 address-error rule). -// - everything else: fall through to C's fillRectClipped, which is -// per-byte and tolerates any alignment. +// Phase 9: pure short-circuit. halFillRectPlanes (called by cross- +// platform fillRect right after this) does the actual planar fill; +// we just claim ownership so the chunky fillRectClipped fallback +// never runs. 
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - uint8_t doubled; - + (void)x; (void)y; (void)w; (void)h; (void)colorIndex; if (s != stageGet()) { return false; } - if (h == 0u || w == 0u) { - return true; - } - doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu)); - - if (x == 0 && w == (uint16_t)SURFACE_WIDTH) { - surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled); - return true; - } - if (((x & 3) == 0) && ((w & 1u) == 0u)) { - uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; - surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled); - return true; - } - return false; + return true; } +// Phase 9: claim every halFastTile* call so the cross-platform chunky +// fallback (which would dereference NULL s->pixels) never fires. The +// halTileXxxPlanes hook called separately by tile.c does the planar +// work. bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { - (void)dstRow0; - (void)srcRow0; - return false; + (void)dstRow0; (void)srcRow0; + return true; } bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { - (void)dstRow0; - (void)srcRow0; - (void)transparent; - return false; -} - - -bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { - (void)dstRow0; - (void)srcTilePixels; - return false; -} - - -bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { - (void)dstTilePixels; - (void)srcRow0; - return false; -} - - -bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - uint8_t nibLo; - if (s != stageGet()) { - return false; - } - nibLo = (uint8_t)(colorIndex & 0x0Fu); - draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4)); + (void)dstRow0; (void)srcRow0; (void)transparent; return true; } -bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t 
colorIndex) { +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { + (void)dstRow0; (void)srcTilePixels; + return true; +} + + +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { + (void)dstTilePixels; (void)srcRow0; + return true; +} + + +// Phase 9: planar-only. Chunky shadow is gone; only the planar buffer +// gets the pixel. +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { + StPlanarT *pd; + if (s != stageGet()) { return false; } - draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex); + pd = (StPlanarT *)s->portData; + if (pd != NULL) { + stPlanarSetPixel(pd, (int16_t)x, (int16_t)y, (uint8_t)(colorIndex & 0x0Fu)); + } + return true; +} + + +// Phase 9 planar walkers. Same Bresenham as the cross-platform +// fallback, but writing to the planar buffer via stPlanarSetPixel. +// Mirror the Amiga amigaPlanarLine / amigaPlanarCircleOutline / +// amigaPlanarCircleFill structure. Phase 10 hand-rolled asm replaces +// these (drawCircle.s already exists for Amiga; ST will get its own). +static void stPlanarLine(StPlanarT *pd, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t color) { + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t err; + int16_t e2; + + dx = (int16_t)((x1 > x0) ? (x1 - x0) : (x0 - x1)); + dy = (int16_t)(-((y1 > y0) ? (y1 - y0) : (y0 - y1))); + sx = (int16_t)((x0 < x1) ? 1 : -1); + sy = (int16_t)((y0 < y1) ? 
1 : -1); + err = (int16_t)(dx + dy); + while (1) { + if (x0 >= 0 && x0 < SURFACE_WIDTH && y0 >= 0 && y0 < SURFACE_HEIGHT) { + stPlanarSetPixel(pd, x0, y0, color); + } + if (x0 == x1 && y0 == y1) { + break; + } + e2 = (int16_t)(2 * err); + if (e2 >= dy) { err = (int16_t)(err + dy); x0 = (int16_t)(x0 + sx); } + if (e2 <= dx) { err = (int16_t)(err + dx); y0 = (int16_t)(y0 + sy); } + } +} + + +static void stPlanarCircleOutline(StPlanarT *pd, int16_t cx, int16_t cy, uint16_t r, uint8_t color) { + int16_t bx = (int16_t)r; + int16_t by = 0; + int16_t err = (int16_t)(1 - bx); + int16_t px; + int16_t py; + + while (bx >= by) { + /* 8 octants. Per-pixel clip since Bresenham can leave the + * surface for circles touching the edge. */ + #define ST_PLOT(X, Y) do { px = (X); py = (Y); if (px >= 0 && px < SURFACE_WIDTH && py >= 0 && py < SURFACE_HEIGHT) { stPlanarSetPixel(pd, px, py, color); } } while (0) + ST_PLOT((int16_t)(cx + bx), (int16_t)(cy + by)); + ST_PLOT((int16_t)(cx - bx), (int16_t)(cy + by)); + ST_PLOT((int16_t)(cx + bx), (int16_t)(cy - by)); + ST_PLOT((int16_t)(cx - bx), (int16_t)(cy - by)); + ST_PLOT((int16_t)(cx + by), (int16_t)(cy + bx)); + ST_PLOT((int16_t)(cx - by), (int16_t)(cy + bx)); + ST_PLOT((int16_t)(cx + by), (int16_t)(cy - bx)); + ST_PLOT((int16_t)(cx - by), (int16_t)(cy - bx)); + #undef ST_PLOT + by++; + if (err > 0) { + bx--; + err = (int16_t)(err + 2 * (by - bx) + 1); + } else { + err = (int16_t)(err + 2 * by + 1); + } + } +} + + +// Phase 10: group-aware span fill -- the same leading-mask / +// full-group / trailing-mask decomposition halFillRectPlanes uses, +// but for one row. Replaces the per-pixel walk that gave fillCircle +// r=40 ~1 ops/sec. +static void stPlanarFillSpan(StPlanarT *pd, int16_t x0, int16_t x1, int16_t y, uint8_t color) { + int16_t left; + int16_t right; + + if (y < 0 || y >= SURFACE_HEIGHT) { + return; + } + left = (x0 < x1) ? x0 : x1; + right = (x0 > x1) ? 
x0 : x1; + if (left < 0) { left = 0; } + if (right >= SURFACE_WIDTH) { right = SURFACE_WIDTH - 1; } + if (left > right) { + return; + } + surface68kStFillSpan(pd->base, left, right, y, color); +} + + +static void stPlanarCircleFill(StPlanarT *pd, int16_t cx, int16_t cy, uint16_t r, uint8_t color) { + int16_t bx = (int16_t)r; + int16_t by = 0; + int16_t err = (int16_t)(1 - bx); + + while (bx >= by) { + stPlanarFillSpan(pd, (int16_t)(cx - bx), (int16_t)(cx + bx), (int16_t)(cy + by), color); + stPlanarFillSpan(pd, (int16_t)(cx - bx), (int16_t)(cx + bx), (int16_t)(cy - by), color); + stPlanarFillSpan(pd, (int16_t)(cx - by), (int16_t)(cx + by), (int16_t)(cy + bx), color); + stPlanarFillSpan(pd, (int16_t)(cx - by), (int16_t)(cx + by), (int16_t)(cy - bx), color); + by++; + if (err > 0) { + bx--; + err = (int16_t)(err + 2 * (by - bx) + 1); + } else { + err = (int16_t)(err + 2 * by + 1); + } + } +} + + +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + StPlanarT *pd; + if (s != stageGet()) { + return false; + } + pd = (StPlanarT *)s->portData; + if (pd != NULL) { + // Asm walker assumes fully on-surface; partial-clip lines fall + // back to the C walker which clips per-pixel. 
+ if (x0 >= 0 && x0 < SURFACE_WIDTH && y0 >= 0 && y0 < SURFACE_HEIGHT + && x1 >= 0 && x1 < SURFACE_WIDTH && y1 >= 0 && y1 < SURFACE_HEIGHT) { + surface68kStDrawLine(pd->base, x0, y0, x1, y1, (uint8_t)(colorIndex & 0x0Fu)); + } else { + stPlanarLine(pd, x0, y0, x1, y1, (uint8_t)(colorIndex & 0x0Fu)); + } + } return true; } bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + StPlanarT *pd; if (s != stageGet()) { return false; } - draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex); + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return true; + } + /* Off-surface circles fall back to the per-pixel C walker which + * does the clip per plot; the asm assumes fully-on-surface so it + * can drop the clip check from the inner loop. */ + if ((int32_t)cx - (int32_t)r < 0 + || (int32_t)cx + (int32_t)r >= SURFACE_WIDTH + || (int32_t)cy - (int32_t)r < 0 + || (int32_t)cy + (int32_t)r >= SURFACE_HEIGHT) { + stPlanarCircleOutline(pd, cx, cy, r, (uint8_t)(colorIndex & 0x0Fu)); + } else { + surface68kStCircleOutline(pd->base, (uint16_t)cx, (uint16_t)cy, r, + (uint8_t)(colorIndex & 0x0Fu)); + } return true; } bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + StPlanarT *pd; if (s != stageGet()) { return false; } - draw68kCircleFill(s->pixels, cx, cy, r, colorIndex); + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return true; + } + // Off-surface bounding box falls back to the C span walker, which + // clips each span; the asm assumes the whole circle is on-surface. 
+ if ((int32_t)cx - (int32_t)r < 0 + || (int32_t)cx + (int32_t)r >= SURFACE_WIDTH + || (int32_t)cy - (int32_t)r < 0 + || (int32_t)cy + (int32_t)r >= SURFACE_HEIGHT) { + stPlanarCircleFill(pd, cx, cy, r, (uint8_t)(colorIndex & 0x0Fu)); + } else { + surface68kStFillCircle(pd->base, (uint16_t)cx, (uint16_t)cy, r, + (uint8_t)(colorIndex & 0x0Fu)); + } return true; } @@ -736,16 +1029,14 @@ bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t ma } +// Phase 9: short-circuit. Cross-platform blitRect calls +// halBlitRectPlanes after halFastBlitRect; the planar work happens +// there, so we just claim ownership to skip the chunky copy that +// would dereference NULL dstRow0. bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { - (void)dstRow0; - (void)dstX; - (void)srcRow0; - (void)srcX; - (void)copyW; - (void)copyH; - (void)srcRowBytes; - (void)transparent; - return false; + (void)dstRow0; (void)dstX; (void)srcRow0; (void)srcX; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; + return true; } @@ -783,113 +1074,1045 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t mat } +// Phase 9: short-circuit. halTileFillPlanes does the planar work +// after this returns true; the chunky fallback that would run on +// false would dereference NULL s->pixels. bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { - (void)s; - (void)bx; - (void)by; - (void)fillWord; - return false; + (void)bx; (void)by; (void)fillWord; + if (s != stageGet()) { + return false; + } + return true; } -// Phase-1 planar plumbing: portData hooks declared and exported, but -// returning NULL keeps the ST port operating in the legacy -// chunky-with-c2p model. Phase 4 replaces this with an interleaved -// planar buffer + stride blob, and rewrites every halFast* primitive -// to read/write planes directly. 
+// Phase 2: allocate a shadow word-interleaved planar buffer per +// surface. Both stage and non-stage get their own buffer (gScreenBase +// remains the single display target). +// +// LONG alignment is required, not just word: the full-row long-fill +// path and circle.s both do `move.l` writes on this buffer, and +// 68000 address-errors on long access to a word-aligned-but-not- +// long-aligned destination. mintlib's malloc usually returns long- +// aligned blocks, but TOS heaps can land at odd offsets after a +// few allocations -- over-allocate by 4 bytes and align up here. +// Symptom of getting this wrong: intermittent return-to-desktop +// after the red startup paint as the first long write hits an +// odd-by-2 base. void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + StPlanarT *pd; + uint8_t *raw; + uintptr_t addr; + (void)s; (void)isStage; - return NULL; + + pd = (StPlanarT *)calloc(1, sizeof(StPlanarT)); + if (pd == NULL) { + return NULL; + } + + raw = (uint8_t *)malloc(ST_PLANAR_SIZE + 4u); + if (raw == NULL) { + free(pd); + return NULL; + } + addr = (uintptr_t)raw; + addr = (addr + 3u) & ~(uintptr_t)3u; /* round up to long-aligned */ + pd->raw = raw; + pd->base = (uint8_t *)addr; + pd->ownsBuffer = true; + memset(pd->base, 0, ST_PLANAR_SIZE); + return pd; } void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + StPlanarT *pd; + (void)s; (void)isStage; - (void)portData; + if (portData == NULL) { + return; + } + pd = (StPlanarT *)portData; + if (pd->ownsBuffer && pd->raw != NULL) { + free(pd->raw); + } + free(pd); } -// ST planar dual-write isn't implemented yet (interleaved word-planar -// layout needs a different code path than Amiga's separate plane -// buffers). Stub for now; chunky shadow + c2p still drives display. +// Phase 3: dual-write to the word-interleaved planar shadow buffer. 
+// Chunky shadow (s->pixels) is still the source-of-truth for display +// (c2p at present); the planar buffer becomes authoritative at +// Phase 9 switch flip. +// +// Per row: split [x, x+w) into a leading partial group (bits +// 15..15-bitFirst within the leading word -> mask = (1<<(16-bitFirst)) +// - 1), zero or more full groups, and a trailing partial group +// (bits 15..15-bitLast -> mask = ~((1<<(15-bitLast)) - 1)). For each +// of the 4 plane words within a group, the bit value of the color +// index controls OR-with-mask (set) vs AND-with-not-mask (clear). +// Single-group case (groupFirst == groupLast) collapses to one word +// RMW per plane with the combined mask. +// (stApplyMaskToGroup is defined inline near the top of the file.) + + void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - (void)s; - (void)x; - (void)y; - (void)w; - (void)h; - (void)colorIndex; + StPlanarT *pd; + uint16_t groupFirst; + uint16_t groupLast; + uint8_t *rowBase; + + if (s == NULL || w == 0u || h == 0u) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + + /* Phase 10 fast path: x == 0 AND w == SURFACE_WIDTH means the rect + * spans every group on every row with no edge masks. movem.l-based + * asm long-fill batches 24 bytes per call. UBER fillRect 320x200 + * lands here. */ + if (x == 0 && w == SURFACE_WIDTH) { + uint32_t loLong = ((colorIndex & 1u) ? 0xFFFF0000ul : 0ul) + | ((colorIndex & 2u) ? 0x0000FFFFul : 0ul); + uint32_t hiLong = ((colorIndex & 4u) ? 0xFFFF0000ul : 0ul) + | ((colorIndex & 8u) ? 
0x0000FFFFul : 0ul); + surface68kStLongFill(pd->base + (uint16_t)y * ST_BYTES_PER_ROW, + (uint16_t)((uint16_t)h * ST_GROUPS_PER_ROW), + loLong, hiLong); + return; + } + + groupFirst = (uint16_t)((uint16_t)x >> 4); + groupLast = (uint16_t)(((uint16_t)x + w - 1u) >> 4); + + if (groupFirst == groupLast) { + uint16_t bitFirst = (uint16_t)((uint16_t)x & 15u); + uint16_t bitLast = (uint16_t)(((uint16_t)x + w - 1u) & 15u); + uint16_t leftMask = (uint16_t)((1ul << (16u - bitFirst)) - 1ul); + uint16_t rightMask = (uint16_t)~((1ul << (15u - bitLast)) - 1ul); + uint16_t mask = (uint16_t)(leftMask & rightMask); + rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + surface68kStFillRectSingleGroup(rowBase + groupFirst * ST_BYTES_PER_GROUP, + mask, h, colorIndex); + return; + } + + /* Phase 10.5: multi-group case (groupFirst != groupLast) handled + * by 16-way-color-dispatched asm with hoisted mask state. ~3-5x + * faster than the C loop with inlined stApplyMaskToGroup. */ + surface68kStFillRectMulti(pd->base, x, y, w, h, colorIndex); } void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { - (void)dst; - (void)src; + StPlanarT *dstPd; + StPlanarT *srcPd; + + if (dst == NULL || src == NULL) { + return; + } + dstPd = (StPlanarT *)dst->portData; + srcPd = (StPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + memcpy(dstPd->base, srcPd->base, ST_PLANAR_SIZE); } +// ----- Phases 4-7: per-pixel / tile / sprite / blit planar primitives ----- +// +// These implementations dual-write the planar shadow alongside the +// chunky shadow that cross-platform code maintains. They use simple +// per-pixel walks for clarity and correctness; Phase 10 will replace +// the hot ones (fillRect, drawPixel, sprite codegen) with hand-rolled +// asm. 
The emphasis here is "correct first, fast later" -- Phase 9 +// flips the read source from chunky to planar and we'll see immediately +// (DRAW hash vs IIgs reference) whether each primitive landed bits +// in the right place. + +// stPlanarSetPixel and stPlanarGetPixel are defined inline near the +// top of the file (between StPlanarT and the prototype block) so +// every callsite folds the 4-plane RMW into its body. + + +// Phase 5 tile ops. 8x8 tiles at byte position (bx, by) start at +// pixel (bx*8, by*8). 8 pixels wide always covers exactly half a +// 16-pixel group: high half (bits 15..8) when bx is even, low half +// (bits 7..0) when bx is odd. Per-row work is 4 plane half-word RMWs. void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { - (void)s; (void)bx; (void)by; (void)colorIndex; + StPlanarT *pd; + uint16_t group; + uint16_t halfMask; + uint8_t *gp; + + if (s == NULL) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + group = (uint16_t)((uint16_t)bx >> 1); + halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu; + gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP; + surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex); } + + +// Phase 10: group-aware tile paste. Per row: extract 8 pixels from +// 4 chunky bytes, build 4 plane bytes (one per plane), drop them +// into the high or low half of the 4 plane words at this group -- +// 4 word RMWs per row instead of 64 per-pixel calls. +static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; + + +// Phase 10: tile paste/snap reuse the asm sprite save/restore +// helpers -- identical per-row work patterns at byte-aligned +// positions. Width 8 = single tile column = single half-group +// write per plane. The asm walker handles 8 rows just as well +// as a sprite's variable height. 
+void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *tileBytes) { + StPlanarT *pd; + uint16_t group; + uint8_t *dstAddr; + int16_t row; + + if (dst == NULL || tileBytes == NULL) { + return; + } + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5: TileT.pixels holds plane-major bytes (4 plane bytes + * per row * 8 rows = 32 bytes). Direct byte copy to the planar + * buffer; no chunky <-> planar conversion. Mirrors the sibling + * halTileCopyPlanes pattern but reads from the contiguous tile + * buffer. Drops the asm-walker entry/exit overhead. */ + group = (uint16_t)((uint16_t)bx >> 1); + dstAddr = pd->base + + (uint16_t)by * 8u * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP + + (uint16_t)(bx & 1u); + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstAddr[0] = tileBytes[0]; + dstAddr[2] = tileBytes[1]; + dstAddr[4] = tileBytes[2]; + dstAddr[6] = tileBytes[3]; + dstAddr += ST_BYTES_PER_ROW; + tileBytes += TILE_BYTES_PER_ROW; + } +} + + +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *tileOut) { + const StPlanarT *pd; + uint16_t group; + const uint8_t *srcAddr; + int16_t row; + + if (src == NULL || tileOut == NULL) { + return; + } + pd = (const StPlanarT *)src->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5: write plane-major bytes to TileT (4 per row * 8 rows). */ + group = (uint16_t)((uint16_t)bx >> 1); + srcAddr = pd->base + + (uint16_t)by * 8u * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP + + (uint16_t)(bx & 1u); + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + tileOut[0] = srcAddr[0]; + tileOut[1] = srcAddr[2]; + tileOut[2] = srcAddr[4]; + tileOut[3] = srcAddr[6]; + srcAddr += ST_BYTES_PER_ROW; + tileOut += TILE_BYTES_PER_ROW; + } +} + + +/* Slow-path C versions kept (renamed) for reference; not in the + * active call chain. 
*/ +static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + StPlanarT *pd; + uint16_t group; + uint16_t halfMask; + uint16_t notHalfMask; + bool isHigh; + uint8_t *rowBase; + int16_t row; + int16_t pix; + uint16_t *pw; + uint8_t b; + uint8_t color; + uint8_t pb0; + uint8_t pb1; + uint8_t pb2; + uint8_t pb3; + uint8_t bit; + + if (dst == NULL || chunkyTile == NULL) { + return; + } + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + group = (uint16_t)((uint16_t)bx >> 1); + isHigh = ((bx & 1u) == 0u); + halfMask = isHigh ? 0xFF00u : 0x00FFu; + notHalfMask = (uint16_t)~halfMask; + rowBase = pd->base + + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP; + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + pb0 = pb1 = pb2 = pb3 = 0u; + for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) { + b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)]; + color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); + bit = kStTileBitLut[pix]; + if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); } + if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); } + if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); } + if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); } + } + pw = (uint16_t *)rowBase; + if (isHigh) { + pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8)); + pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8)); + pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8)); + pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8)); + } else { + pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0); + pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1); + pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2); + pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3); + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +// Phase 10: group-aware tile snap. 
Read 4 plane half-words for the +// row's group, distribute the 8 plane bits per plane into chunky +// nibbles. 4 word reads per row + 4 chunky bytes per row, no +// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes +// above; kept for reference as the C-only fallback. +static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + const StPlanarT *pd; + uint16_t group; + uint16_t halfShift; + const uint8_t *rowBase; + int16_t row; + int16_t pair; + const uint16_t *pw; + uint8_t pb0; + uint8_t pb1; + uint8_t pb2; + uint8_t pb3; + uint8_t bitHi; + uint8_t bitLo; + uint8_t hi; + uint8_t lo; + + if (src == NULL || chunkyTileOut == NULL) { + return; + } + pd = (const StPlanarT *)src->portData; + if (pd == NULL) { + return; + } + group = (uint16_t)((uint16_t)bx >> 1); + halfShift = ((bx & 1u) == 0u) ? 8u : 0u; + rowBase = pd->base + + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP; + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + pw = (const uint16_t *)rowBase; + pb0 = (uint8_t)(pw[0] >> halfShift); + pb1 = (uint8_t)(pw[1] >> halfShift); + pb2 = (uint8_t)(pw[2] >> halfShift); + pb3 = (uint8_t)(pw[3] >> halfShift); + for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) { + bitHi = kStTileBitLut[pair * 2]; + bitLo = kStTileBitLut[pair * 2 + 1]; + hi = 0u; + lo = 0u; + if (pb0 & bitHi) hi = (uint8_t)(hi | 1u); + if (pb1 & bitHi) hi = (uint8_t)(hi | 2u); + if (pb2 & bitHi) hi = (uint8_t)(hi | 4u); + if (pb3 & bitHi) hi = (uint8_t)(hi | 8u); + if (pb0 & bitLo) lo = (uint8_t)(lo | 1u); + if (pb1 & bitLo) lo = (uint8_t)(lo | 2u); + if (pb2 & bitLo) lo = (uint8_t)(lo | 4u); + if (pb3 & bitLo) lo = (uint8_t)(lo | 8u); + chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo); + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +// Phase 10: direct planar->planar tile copy. Each tile occupies one +// half-byte of one plane word per plane per row (8 rows total). 
+// We just byte-copy 4 plane bytes per row -- no chunky scratch, no +// bit transpose, no LUT. ~640 cyc per tile vs ~5000 cyc for the +// snap+paste path. void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { - (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; + StPlanarT *dstPd; + const StPlanarT *srcPd; + uint8_t *dstAddr; + const uint8_t *srcAddr; + uint16_t srcGroup; + uint16_t dstGroup; + int16_t row; + + if (dst == NULL || src == NULL) { + return; + } + dstPd = (StPlanarT *)dst->portData; + srcPd = (const StPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + srcGroup = (uint16_t)((uint16_t)srcBx >> 1); + dstGroup = (uint16_t)((uint16_t)dstBx >> 1); + srcAddr = srcPd->base + + (uint16_t)srcBy * 8u * ST_BYTES_PER_ROW + + srcGroup * ST_BYTES_PER_GROUP + + (uint16_t)(srcBx & 1u); + dstAddr = dstPd->base + + (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW + + dstGroup * ST_BYTES_PER_GROUP + + (uint16_t)(dstBx & 1u); + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstAddr[0] = srcAddr[0]; /* plane 0 byte (high or low half) */ + dstAddr[2] = srcAddr[2]; /* plane 1 */ + dstAddr[4] = srcAddr[4]; /* plane 2 */ + dstAddr[6] = srcAddr[6]; /* plane 3 */ + srcAddr += ST_BYTES_PER_ROW; + dstAddr += ST_BYTES_PER_ROW; + } } + + void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { - (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; + StPlanarT *dstPd; + uint8_t scratch[TILE_BYTES]; + int16_t row; + uint16_t dstX0; + uint16_t dstY0; + uint16_t group; + uint16_t halfOff; + uint8_t *dstByte; + uint8_t p0; + uint8_t p1; + uint8_t p2; + uint8_t p3; + uint8_t xK0; + uint8_t xK1; + uint8_t xK2; + uint8_t xK3; + uint8_t mask; + uint8_t notMask; + + if (dst == NULL || src == NULL) { + return; + } + dstPd = (StPlanarT 
*)dst->portData; + if (dstPd == NULL) { + return; + } + /* Phase 10.5: bulk-plane fast path. scratch holds plane-major bytes + * (4 plane bytes per row * 8 rows). For each row, build a "non- + * transparent" mask = OR of (plane_byte XOR replicated transparent + * bit) -- 1s where the source pixel != transparent. Then 4 byte + * RMWs (one per plane) write the row at byte-aligned dst. + * + * For transparent=0 this collapses to mask = p0|p1|p2|p3. + * Replaces the prior 64-iteration per-pixel SetPixel walker. */ + halTileSnapPlanes(src, srcBx, srcBy, scratch); + dstX0 = (uint16_t)((uint16_t)dstBx * TILE_PIXELS_PER_SIDE); + dstY0 = (uint16_t)((uint16_t)dstBy * TILE_PIXELS_PER_SIDE); + group = (uint16_t)(dstX0 >> 4); + halfOff = (uint16_t)((dstX0 & 8u) >> 3u); + dstByte = dstPd->base + dstY0 * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP + halfOff; + + xK0 = (transparentIndex & 1u) ? 0xFFu : 0u; + xK1 = (transparentIndex & 2u) ? 0xFFu : 0u; + xK2 = (transparentIndex & 4u) ? 0xFFu : 0u; + xK3 = (transparentIndex & 8u) ? 
0xFFu : 0u; + + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + p0 = scratch[row * 4 + 0]; + p1 = scratch[row * 4 + 1]; + p2 = scratch[row * 4 + 2]; + p3 = scratch[row * 4 + 3]; + mask = (uint8_t)((p0 ^ xK0) | (p1 ^ xK1) | (p2 ^ xK2) | (p3 ^ xK3)); + if (mask != 0u) { + notMask = (uint8_t)~mask; + dstByte[0] = (uint8_t)((dstByte[0] & notMask) | (p0 & mask)); + dstByte[2] = (uint8_t)((dstByte[2] & notMask) | (p1 & mask)); + dstByte[4] = (uint8_t)((dstByte[4] & notMask) | (p2 & mask)); + dstByte[6] = (uint8_t)((dstByte[6] & notMask) | (p3 & mask)); + } + dstByte += ST_BYTES_PER_ROW; + } } -void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { - (void)dst; (void)bx; (void)by; (void)chunkyTile; -} -void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { - (void)src; (void)bx; (void)by; (void)chunkyTileOut; + + +// Phase 10 fast path: byte-aligned, fully-on-surface sprite draw. +// Builds 4 plane bytes + 1 opacity byte from each tile-column row +// in one pass, then does 4 word RMWs per group half. ~7x faster +// than the per-pixel walker for the typical (byte-aligned) case. +// +// Per row of a tile column: 4 chunky bytes -> 8 nibbles -> {plane0 +// byte, plane1 byte, plane2 byte, plane3 byte, opacity byte}. The +// opacity byte has bits set where the sprite pixel is non-zero; +// transparent pixels (color 0) leave the destination plane bits +// alone via the (word AND ~opMask) | (planeBits AND opMask) RMW. +// +// 8 pixels at byte-aligned x always cover exactly one half of one +// group: high half if (x mod 16) == 0, low half if (x mod 16) == 8. +// We branch once per tile column on (dstX & 8). 
+static void stSpriteDrawByteAligned(StPlanarT *pd, const SpriteT *sp, int16_t x, int16_t y) { + uint16_t wTiles = sp->widthTiles; + int16_t srcH = (int16_t)(sp->heightTiles * 8); + uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + int16_t row; + + for (row = 0; row < srcH; row++) { + int16_t tileY = (int16_t)(row >> 3); + int16_t inTileY = (int16_t)(row & 7); + const uint8_t *tileRowBase = sp->tileData + (uint32_t)tileY * wTiles * 32u + (uint32_t)inTileY * 4u; + int16_t tileCol; + + for (tileCol = 0; tileCol < (int16_t)wTiles; tileCol++) { + const uint8_t *trp = tileRowBase + (uint32_t)tileCol * 32u; + uint8_t b0 = trp[0]; + uint8_t b1 = trp[1]; + uint8_t b2 = trp[2]; + uint8_t b3 = trp[3]; + uint8_t pb0 = 0u; + uint8_t pb1 = 0u; + uint8_t pb2 = 0u; + uint8_t pb3 = 0u; + uint8_t pop = 0u; + uint8_t c; + + /* 8 pixels per tile column: hi(b0),lo(b0),hi(b1),lo(b1), + * hi(b2),lo(b2),hi(b3),lo(b3) at bit positions 7..0 + * within the eventual plane byte. Walk inline -- no LUT + * loop overhead. 
*/ + c = (uint8_t)(b0 >> 4); if (c) { pop = (uint8_t)(pop | 0x80u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u); } + c = (uint8_t)(b0 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x40u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u); } + c = (uint8_t)(b1 >> 4); if (c) { pop = (uint8_t)(pop | 0x20u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u); } + c = (uint8_t)(b1 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x10u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u); } + c = (uint8_t)(b2 >> 4); if (c) { pop = (uint8_t)(pop | 0x08u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u); } + c = (uint8_t)(b2 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x04u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u); } + c = (uint8_t)(b3 >> 4); if (c) { pop = (uint8_t)(pop | 0x02u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u); } + c = (uint8_t)(b3 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x01u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u); } + + if (pop != 0u) { + int16_t dstX = (int16_t)(x + tileCol * 8); + uint16_t group = 
(uint16_t)((uint16_t)dstX >> 4); + uint16_t *pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + uint16_t opMask; + uint16_t notOpMask; + uint16_t pv0; + uint16_t pv1; + uint16_t pv2; + uint16_t pv3; + + if ((dstX & 8) == 0) { + opMask = (uint16_t)((uint16_t)pop << 8); + pv0 = (uint16_t)((uint16_t)pb0 << 8); + pv1 = (uint16_t)((uint16_t)pb1 << 8); + pv2 = (uint16_t)((uint16_t)pb2 << 8); + pv3 = (uint16_t)((uint16_t)pb3 << 8); + } else { + opMask = (uint16_t)pop; + pv0 = (uint16_t)pb0; + pv1 = (uint16_t)pb1; + pv2 = (uint16_t)pb2; + pv3 = (uint16_t)pb3; + } + notOpMask = (uint16_t)~opMask; + pw[0] = (uint16_t)((pw[0] & notOpMask) | pv0); + pw[1] = (uint16_t)((pw[1] & notOpMask) | pv1); + pw[2] = (uint16_t)((pw[2] & notOpMask) | pv2); + pw[3] = (uint16_t)((pw[3] & notOpMask) | pv3); + } + } + rowBase += ST_BYTES_PER_ROW; + } } + + +// Phase 10: sprite walker with hoisted state. rowBase advances by +// 160 per row instead of recomputing y*160 per pixel; tile-row +// pointer is advanced once per tile column (8 cols) instead of +// recomputed per pixel; the per-pixel inner block is the inlined +// stPlanarSetPixel body so there's no nested function entry / y*160 +// re-derivation. Major rewrite of the dispatcher path that drove +// the 0.06x sprite gap before this commit. 
/* Phase 10 sprite walker with hoisted state: rowBase advances by one
 * surface row per iteration instead of recomputing y*160 per pixel,
 * and the tile-row pointer advances once per tile column (8 pixels)
 * rather than per pixel. Byte-aligned, fully-on-surface draws take
 * the bulk stSpriteDrawByteAligned fast path. */
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    StPlanarT *planar;
    uint16_t wTiles;
    int16_t srcW;
    int16_t srcH;
    int16_t dstXStart;
    int16_t dstYStart;
    int16_t clipL = 0;
    int16_t clipT = 0;
    int16_t r;
    uint8_t *rowBase;

    if (s == NULL || sp == NULL || sp->tileData == NULL) {
        return;
    }
    planar = (StPlanarT *)s->portData;
    if (planar == NULL) {
        return;
    }
    wTiles = sp->widthTiles;
    srcW = (int16_t)(wTiles * 8);
    srcH = (int16_t)(sp->heightTiles * 8);

    /* Fast path: byte-aligned x and fully on-surface -> bulk write
     * of 8 pixels per tile column into one group half. */
    if ((x & 7) == 0
        && x >= 0 && (x + srcW) <= SURFACE_WIDTH
        && y >= 0 && (y + srcH) <= SURFACE_HEIGHT) {
        stSpriteDrawByteAligned(planar, sp, x, y);
        return;
    }

    /* Clip against all four surface edges. */
    dstXStart = x;
    dstYStart = y;
    if (dstXStart < 0) { clipL = (int16_t)(-dstXStart); srcW = (int16_t)(srcW - clipL); dstXStart = 0; }
    if (dstYStart < 0) { clipT = (int16_t)(-dstYStart); srcH = (int16_t)(srcH - clipT); dstYStart = 0; }
    if (dstXStart >= SURFACE_WIDTH || dstYStart >= SURFACE_HEIGHT || srcW <= 0 || srcH <= 0) {
        return;
    }
    if (dstXStart + srcW > SURFACE_WIDTH) { srcW = (int16_t)(SURFACE_WIDTH - dstXStart); }
    if (dstYStart + srcH > SURFACE_HEIGHT) { srcH = (int16_t)(SURFACE_HEIGHT - dstYStart); }

    rowBase = planar->base + (uint16_t)dstYStart * ST_BYTES_PER_ROW;
    for (r = 0; r < srcH; r++) {
        int16_t srcY = (int16_t)(clipT + r);
        const uint8_t *tileRowBase = sp->tileData
            + (uint32_t)(srcY >> 3) * wTiles * 32u
            + (uint32_t)(srcY & 7) * TILE_BYTES_PER_ROW;
        int16_t srcX = clipL;
        int16_t srcXEnd = (int16_t)(clipL + srcW);
        int16_t outCol = 0;

        /* Walk in tile-column chunks; each chunk advances the tile
         * pointer once and yields up to 8 contiguous source pixels. */
        while (srcX < srcXEnd) {
            const uint8_t *tileRow = tileRowBase + (uint32_t)(srcX >> 3) * 32u;
            int16_t inTile = (int16_t)(srcX & 7);
            int16_t stop = (int16_t)(srcX + (8 - inTile));
            if (stop > srcXEnd) {
                stop = srcXEnd;
            }
            for (; srcX < stop; srcX++) {
                uint8_t packed = tileRow[inTile >> 1];
                uint8_t nib = (inTile & 1) ? (uint8_t)(packed & 0x0Fu)
                                           : (uint8_t)(packed >> 4);
                if (nib != 0u) {
                    /* Inlined stPlanarSetPixel body: one bit RMW per plane. */
                    int16_t dstX = (int16_t)(dstXStart + outCol);
                    uint16_t bitMask = (uint16_t)(1u << (15u - ((uint16_t)dstX & 15u)));
                    uint16_t notMask = (uint16_t)~bitMask;
                    uint16_t *pw = (uint16_t *)(rowBase
                                 + (uint16_t)((uint16_t)dstX >> 4) * ST_BYTES_PER_GROUP);
                    pw[0] = (nib & 1u) ? (uint16_t)(pw[0] | bitMask) : (uint16_t)(pw[0] & notMask);
                    pw[1] = (nib & 2u) ? (uint16_t)(pw[1] | bitMask) : (uint16_t)(pw[1] & notMask);
                    pw[2] = (nib & 4u) ? (uint16_t)(pw[2] | bitMask) : (uint16_t)(pw[2] & notMask);
                    pw[3] = (nib & 8u) ? (uint16_t)(pw[3] | bitMask) : (uint16_t)(pw[3] & notMask);
                }
                inTile++;
                outCol++;
            }
        }
        rowBase += ST_BYTES_PER_ROW;
    }
}


/* Per-pixel chunky -> planar blit with optional transparency.
 * transparent >= 16 means "no transparent index" (copy everything). */
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    StPlanarT *planar;
    int16_t r;
    int16_t c;

    if (dst == NULL || srcBytes == NULL) {
        return;
    }
    planar = (StPlanarT *)dst->portData;
    if (planar == NULL) {
        return;
    }
    for (r = 0; r < copyH; r++) {
        const uint8_t *line = &srcBytes[(srcY0 + r) * srcRowBytes];
        for (c = 0; c < copyW; c++) {
            int16_t sx = (int16_t)(srcX0 + c);
            uint8_t packed = line[sx >> 1];
            uint8_t color = (sx & 1) ? (uint8_t)(packed & 0x0Fu)
                                     : (uint8_t)(packed >> 4);
            if (transparent < 16u && color == (uint8_t)transparent) {
                continue;
            }
            stPlanarSetPixel(planar, (int16_t)(x + c), (int16_t)(y + r), color);
        }
    }
}
return; + } + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + for (row = 0; row < copyH; row++) { + srcRow = &srcBytes[(srcY0 + row) * srcRowBytes]; + for (col = 0; col < copyW; col++) { + srcXCol = (int16_t)(srcX0 + col); + b = srcRow[srcXCol >> 1]; + color = (srcXCol & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); + if (transparent < 16u && color == (uint8_t)transparent) { + continue; + } + stPlanarSetPixel(pd, (int16_t)(x + col), (int16_t)(y + row), color); + } + } } + + +// Phase 10 fast paths for save/restore. Hand-rolled asm +// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <-> +// plane bit transpose via ASL+ROXL and walks rows/tile columns. The +// C wrappers below are kept as a fallback / reference; they're not +// in the critical path now that the asm versions are wired in. +static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) { + int16_t bytesPerRow = (int16_t)(w >> 1); + int16_t tileCols = (int16_t)(w >> 3); + const uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + int16_t row; + int16_t tileCol; + + for (row = 0; row < (int16_t)h; row++) { + uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow]; + for (tileCol = 0; tileCol < tileCols; tileCol++) { + int16_t srcX = (int16_t)(x + tileCol * 8); + uint16_t group = (uint16_t)((uint16_t)srcX >> 4); + uint16_t shift = ((srcX & 8) == 0) ? 
8u : 0u; + const uint16_t *pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + uint8_t pb0 = (uint8_t)(pw[0] >> shift); + uint8_t pb1 = (uint8_t)(pw[1] >> shift); + uint8_t pb2 = (uint8_t)(pw[2] >> shift); + uint8_t pb3 = (uint8_t)(pw[3] >> shift); + int16_t pair; + for (pair = 0; pair < 4; pair++) { + uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2)); + uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1)); + uint8_t hi = 0u; + uint8_t lo = 0u; + if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); } + if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); } + if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); } + if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); } + if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); } + if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); } + if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); } + if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); } + dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo); + } + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) { + int16_t bytesPerRow = (int16_t)(w >> 1); + int16_t tileCols = (int16_t)(w >> 3); + uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + int16_t row; + int16_t tileCol; + + for (row = 0; row < (int16_t)h; row++) { + const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow]; + for (tileCol = 0; tileCol < tileCols; tileCol++) { + uint8_t b0 = srcRow[tileCol * 4 + 0]; + uint8_t b1 = srcRow[tileCol * 4 + 1]; + uint8_t b2 = srcRow[tileCol * 4 + 2]; + uint8_t b3 = srcRow[tileCol * 4 + 3]; + uint8_t pb0 = 0u; + uint8_t pb1 = 0u; + uint8_t pb2 = 0u; + uint8_t pb3 = 0u; + uint8_t c; + int16_t dstX; + uint16_t group; + uint16_t *pw; + uint16_t halfMask; + uint16_t notHalfMask; + + c = (uint8_t)(b0 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u); + c = 
(uint8_t)(b0 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u); + c = (uint8_t)(b1 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u); + c = (uint8_t)(b1 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u); + c = (uint8_t)(b2 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u); + c = (uint8_t)(b2 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u); + c = (uint8_t)(b3 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u); + c = (uint8_t)(b3 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u); + + dstX = (int16_t)(x + tileCol * 8); + group = (uint16_t)((uint16_t)dstX >> 4); + pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + if ((dstX & 8) == 0) { + halfMask = 0xFF00u; + pw[0] = (uint16_t)((pw[0] & 0x00FFu) | ((uint16_t)pb0 << 8)); + pw[1] = (uint16_t)((pw[1] & 0x00FFu) | ((uint16_t)pb1 << 8)); + pw[2] = (uint16_t)((pw[2] & 0x00FFu) | ((uint16_t)pb2 << 8)); + pw[3] = (uint16_t)((pw[3] & 0x00FFu) | ((uint16_t)pb3 << 8)); + } else { + halfMask = 0x00FFu; + pw[0] = (uint16_t)((pw[0] & 0xFF00u) | (uint16_t)pb0); + pw[1] = (uint16_t)((pw[1] & 0xFF00u) | (uint16_t)pb1); + pw[2] = (uint16_t)((pw[2] & 0xFF00u) | 
(uint16_t)pb2); + pw[3] = (uint16_t)((pw[3] & 0xFF00u) | (uint16_t)pb3); + } + (void)halfMask; + (void)notHalfMask; + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +// Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies +// inline. Each pixel's group address differs only in (x), so we +// can compute base+row*160 once per row and just do per-pixel +// (group, bitMask, 4 plane RMW). 2x speedup over the per-pixel +// stPlanarSetPixel form. void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { - (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; + StPlanarT *pd; + int16_t row; + int16_t pair; + int16_t pairs; + uint8_t *pp; + const uint8_t *rowBase; + + if (s == NULL || dstPlaneBytes == NULL || w == 0u || h == 0u) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5 fast path: byte-aligned, fully on-surface. + * Asm walker does direct planar byte copy (LUT pointer unused). 
*/ + if ((x & 7) == 0 && (w & 7) == 0 + && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH + && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) { + surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL); + return; + } + + pairs = (int16_t)(w >> 1); + rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + for (row = 0; row < (int16_t)h; row++) { + pp = &dstPlaneBytes[(uint16_t)row * (uint16_t)pairs]; + for (pair = 0; pair < pairs; pair++) { + int16_t px; + uint16_t group; + uint16_t bitMask; + const uint16_t *pw; + uint8_t hi; + uint8_t lo; + + px = (int16_t)(x + pair * 2); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + hi = 0u; + if (pw[0] & bitMask) { hi = (uint8_t)(hi | 1u); } + if (pw[1] & bitMask) { hi = (uint8_t)(hi | 2u); } + if (pw[2] & bitMask) { hi = (uint8_t)(hi | 4u); } + if (pw[3] & bitMask) { hi = (uint8_t)(hi | 8u); } + + px = (int16_t)(x + pair * 2 + 1); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + lo = 0u; + if (pw[0] & bitMask) { lo = (uint8_t)(lo | 1u); } + if (pw[1] & bitMask) { lo = (uint8_t)(lo | 2u); } + if (pw[2] & bitMask) { lo = (uint8_t)(lo | 4u); } + if (pw[3] & bitMask) { lo = (uint8_t)(lo | 8u); } + + pp[pair] = (uint8_t)((hi << 4) | lo); + } + rowBase += ST_BYTES_PER_ROW; + } } + + void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { - (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes; + StPlanarT *pd; + int16_t row; + int16_t pair; + int16_t pairs; + uint8_t b; + const uint8_t *pp; + uint8_t *rowBase; + + if (s == NULL || srcPlaneBytes == NULL || w == 0u || h == 0u) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5 fast path: 
byte-aligned, fully on-surface. + * Asm walker does direct planar byte copy (LUT pointer unused). */ + if ((x & 7) == 0 && (w & 7) == 0 + && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH + && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) { + surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL); + return; + } + + pairs = (int16_t)(w >> 1); + rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + for (row = 0; row < (int16_t)h; row++) { + pp = &srcPlaneBytes[(uint16_t)row * (uint16_t)pairs]; + for (pair = 0; pair < pairs; pair++) { + int16_t px; + uint16_t group; + uint16_t bitMask; + uint16_t notMask; + uint16_t *pw; + uint8_t color; + + b = pp[pair]; + + px = (int16_t)(x + pair * 2); + color = (uint8_t)(b >> 4); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + notMask = (uint16_t)~bitMask; + pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | bitMask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | bitMask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | bitMask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = (uint16_t)(pw[3] | bitMask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } + + px = (int16_t)(x + pair * 2 + 1); + color = (uint8_t)(b & 0x0Fu); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + notMask = (uint16_t)~bitMask; + pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | bitMask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | bitMask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | bitMask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = 
(uint16_t)(pw[3] | bitMask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } + } + rowBase += ST_BYTES_PER_ROW; + } } -/* Phase 9 chunky reader hooks -- ST is still chunky-shadow + c2p, - * so reads come from s->pixels just like DOS / IIgs. */ +// Phase 7: pixel reader. Pre-Phase-9 reads from the chunky shadow +// (s->pixels) since that's the source-of-truth during transition. +// Once Phase 9 sets s->pixels = NULL the planar shadow becomes +// authoritative and we walk the 4 plane bits at (x, y). uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { - uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; - if (x & 1) return (uint8_t)(byte & 0x0Fu); - return (uint8_t)((byte & 0xF0u) >> 4); + if (s->pixels != NULL) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); + } + { + StPlanarT *pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + return stPlanarGetPixel(pd, x, y); + } +} + + +// Phase 9: derive 160 chunky bytes per row from the word-interleaved +// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's +// amigaPlanesToChunkyRow but per-group instead of per-byte. Used by +// halSurfaceHash and halSurfaceSaveFileChunky. 
+static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) { + uint16_t group; + uint16_t p; + uint16_t bitMask; + uint8_t pix; + const uint16_t *gp; + + for (group = 0; group < ST_GROUPS_PER_ROW; group++) { + gp = (const uint16_t *)(pd->base + + (uint16_t)y * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP); + for (p = 0; p < 16u; p++) { + bitMask = (uint16_t)(1u << (15u - p)); + pix = 0u; + if (gp[0] & bitMask) { pix = (uint8_t)(pix | 1u); } + if (gp[1] & bitMask) { pix = (uint8_t)(pix | 2u); } + if (gp[2] & bitMask) { pix = (uint8_t)(pix | 4u); } + if (gp[3] & bitMask) { pix = (uint8_t)(pix | 8u); } + if ((p & 1u) == 0u) { + dstChunkyRow[group * 8u + (p >> 1)] = (uint8_t)(pix << 4); + } else { + dstChunkyRow[group * 8u + (p >> 1)] = (uint8_t)(dstChunkyRow[group * 8u + (p >> 1)] | pix); + } + } + } } uint32_t halSurfaceHash(const SurfaceT *s) { - uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; - const uint8_t *p; - const uint16_t *w; + StPlanarT *pd; + uint16_t lo = 0xACE1u; + uint16_t hi = 0x1357u; + uint16_t n; + uint16_t v; + int16_t row; + uint16_t col; uint8_t b; - p = s->pixels; - blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); - do { - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - blocks--; - } while (blocks > 0u); - p = s->scb; - for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + const uint16_t *w; + + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + /* Pixel hash: derive equivalent chunky bytes from the planar + * shadow row by row, fold them into the same SURFACE_HASH_MIX_BYTE + * the chunky ports 
use so cross-port hash comparisons stay valid. */ + for (row = 0; row < SURFACE_HEIGHT; row++) { + stPlanarToChunkyRow(pd, row, chunkyRow); + for (col = 0; col < SURFACE_BYTES_PER_ROW; col++) { + b = chunkyRow[col]; + SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + } + /* SCB + palette mix unchanged from chunky days. */ + { + const uint8_t *sp = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *sp++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } } w = &s->palette[0][0]; for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { @@ -901,39 +2124,108 @@ uint32_t halSurfaceHash(const SurfaceT *s) { } +// Phase 9: planar-only. The chunky shadow is gone; surface copy is +// 32000 bytes of planar data. halSurfaceCopyPlanes already handles +// the planar copy via memcpy of pd->base. This stub only guards the +// pre-Phase-9 contract; cross-platform surfaceCopy still calls both +// halSurfaceCopyChunky and halSurfaceCopyPlanes. void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { - memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); + (void)dst; (void)src; } +// Phase 9: read chunky from file into a temporary scratch buffer, +// then c2p once into the planar shadow. The .joeysurface file format +// is still chunky 4bpp on disk (cross-port asset interchange); the +// in-memory representation is what changes. 
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { - return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; + StPlanarT *pd; + uint8_t *scratch; + int16_t y; + bool ok; + + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return false; + } + scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE); + if (scratch == NULL) { + return false; + } + ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE); + if (ok) { + if (!gC2pLutReady) { + initC2pLut(); + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW]; + uint16_t *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW]; + chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut); + } + } + free(scratch); + return ok; } +// Phase 9: derive chunky bytes from the planar shadow row by row, +// stream to file. Avoids needing a full 32 KB scratch buffer. bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { - return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; + StPlanarT *pd; + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + int16_t y; + + pd = (StPlanarT *)src->portData; + if (pd == NULL) { + return false; + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + stPlanarToChunkyRow(pd, y, chunkyRow); + if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) { + return false; + } + } + return true; } +// Phase 9: no chunky storage on the ST. Cross-platform code treats +// NULL as "port has no chunky shadow" (same contract Amiga uses). uint8_t *halSurfaceAllocPixels(void) { - return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); -} - - -void halSurfaceFreePixels(uint8_t *pixels) { - free(pixels); -} - - -uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { - (void)s; (void)planeIdx; return NULL; } +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); /* free(NULL) is a no-op; symmetric for non-planar ports. 
*/ +} + + +// ST is word-interleaved: one buffer holds all 4 planes per group +// back-to-back. There's no per-plane base, but we overload planeIdx +// 0 to return the single buffer base so the cross-platform sprite +// dispatcher (spriteCompiledDraw) can hand it to the ST JIT +// routine, which computes plane offsets internally via d16(a0) +// chains. planeIdx >= 1 returns NULL since they don't make sense +// in interleaved layout. +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + StPlanarT *pd; + + if (s == NULL || planeIdx != 0u) { + return NULL; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return NULL; + } + return pd->base; +} + + +// Phase 9: stage has no chunky shadow either. Cross-platform stageAlloc +// stores NULL in s->pixels and skips the chunky memset. uint8_t *halStageAllocPixels(void) { - return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); + return NULL; } diff --git a/src/port/atarist/lineSpan.s b/src/port/atarist/lineSpan.s new file mode 100644 index 0000000..242b7b4 --- /dev/null +++ b/src/port/atarist/lineSpan.s @@ -0,0 +1,853 @@ +| Atari ST word-interleaved planar drawLine -- 68000 hand-rolled. +| +| Bresenham line walker with 16-way color dispatch. Per pixel: +| * 4-plane word RMW with branchless OR/AND chosen at compile time +| * bit mask via 16-entry word table; group offset via (x>>4)<<3 +| * y*160 = (y<<5)+(y<<7) +| +| Caller MUST guarantee the entire line lies on-surface (full clip +| precheck). Partial-clip lines fall back to the C walker. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kStDrawLine(uint8_t *base, +| int16_t x0, int16_t y0, +| int16_t x1, int16_t y1, +| uint8_t color); +| +| Register allocation in the inner loop: +| d2.w = x (current pixel) +| d3.w = y (current pixel) +| d4.w = err +| d5.w = dx (>= 0) +| d6.w = -dy_abs (<= 0; "Bresenham uses -dy") +| d7 = sx (long; moveq #1 or #-1, low word used for .w add) +| a4 = sy (long; sign-extended) +| a3 = base +| a5 = bitMaskWordLut +| a2 = scratch (per-pixel: base + byteOff) +| d0,d1 = scratch +| +| Stack scratch: +| sp+0..1 iter counter (max(dx, dy_abs) + 1) + + .text + + + .equ SP_SAVED, 44 + .equ SP_LOCAL, 4 + .equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL) + .equ SP_BASE, SP_OFF + 0 + .equ SP_X0, SP_OFF + 4 + 2 + .equ SP_Y0, SP_OFF + 8 + 2 + .equ SP_X1, SP_OFF + 12 + 2 + .equ SP_Y1, SP_OFF + 16 + 2 + .equ SP_COLOR, SP_OFF + 20 + 3 + + +| ---- DL_PLOT: 4-plane word RMW for hardcoded color ---- +| Inputs: d2.w = x, d3.w = y, a3 = base, a5 = bitMaskWordLut +| Trashes: d0, d1, a2 + + .macro DL_PLOT color + | byteOff = y*160 + (x>>4)*8 + move.w %d3,%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 | y << 5 + lsl.l #7,%d1 | y << 7 + add.l %d1,%d0 | d0 = y * 160 + move.w %d2,%d1 + lsr.w #4,%d1 + lsl.w #3,%d1 | (x>>4) * 8 + ext.l %d1 + add.l %d1,%d0 | d0 = byteOff + lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff + | d1 = bitMask, d0 = notMask + move.w %d2,%d1 + and.w #15,%d1 + add.w %d1,%d1 + move.w (%a5,%d1.w),%d1 + move.w %d1,%d0 + not.w %d0 + | per-plane RMW with postinc (drops 4 cyc per RMW vs + | displacement (d8,An) = 16 cyc, plain (An)+ = 12 cyc). 
+ .if ((\color) & 1) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .if ((\color) & 2) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .if ((\color) & 4) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .if ((\color) & 8) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .endm + + +| ---- DL_BODY: full Bresenham loop body for hardcoded color ---- + + .macro DL_BODY color +.LdlStLoop_\color: + DL_PLOT \color + | e2 = 2 * err + move.w %d4,%d0 + add.w %d0,%d0 | d0 = e2 + | if (e2 >= dy) { err += dy; x += sx; } + cmp.w %d6,%d0 + blt.s .LdlStNoX_\color + add.w %d6,%d4 + add.w %d7,%d2 +.LdlStNoX_\color: + | if (e2 <= dx) { err += dx; y += sy; } + cmp.w %d5,%d0 + bgt.s .LdlStNoY_\color + add.w %d5,%d4 + add.w %a4,%d3 | sy.w from a4 +.LdlStNoY_\color: + subq.w #1,0(%sp) + bne.w .LdlStLoop_\color + bra.w .LdlStDone + .endm + + + .globl _surface68kStDrawLine + +_surface68kStDrawLine: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_LOCAL(%sp),%sp + + | Load base & lut. + move.l SP_BASE(%sp),%a3 + lea bitMaskWordLut(%pc),%a5 + + | x = x0, y = y0 + move.w SP_X0(%sp),%d2 + move.w SP_Y0(%sp),%d3 + + | dx = abs(x1 - x0), sx = sign(x1 - x0) + move.w SP_X1(%sp),%d5 + sub.w %d2,%d5 | d5 = x1 - x0 + bge.s .LdlSxPos + neg.w %d5 + moveq #-1,%d7 + bra.s .LdlSxDone +.LdlSxPos: + moveq #1,%d7 +.LdlSxDone: + + | dy_abs in d6, sy in d0 (-> a4) + move.w SP_Y1(%sp),%d6 + sub.w %d3,%d6 | d6 = y1 - y0 + bge.s .LdlSyPos + neg.w %d6 + moveq #-1,%d0 + bra.s .LdlSyDone +.LdlSyPos: + moveq #1,%d0 +.LdlSyDone: + ext.l %d0 + movea.l %d0,%a4 | a4 = sy + + | iter counter = max(dx, dy_abs) + 1 + move.w %d5,%d0 + cmp.w %d6,%d0 + bge.s .LdlNitDone + move.w %d6,%d0 +.LdlNitDone: + addq.w #1,%d0 + move.w %d0,0(%sp) + + | err = dx - dy_abs (== dx + dy where dy negative) + move.w %d5,%d4 + sub.w %d6,%d4 | d4 = err + neg.w %d6 | d6 = -dy_abs (negative) + + | Dispatch on color (low 4 bits) -> 16 specialized loops. 
+ moveq #0,%d0 + move.b SP_COLOR(%sp),%d0 + and.w #0x0F,%d0 + add.w %d0,%d0 + add.w %d0,%d0 | * 4 for bra.w table + lea .LdlStTable(%pc),%a6 + jmp 0(%a6,%d0.w) + +.LdlStTable: + bra.w .LdlStLoop_0 + bra.w .LdlStLoop_1 + bra.w .LdlStLoop_2 + bra.w .LdlStLoop_3 + bra.w .LdlStLoop_4 + bra.w .LdlStLoop_5 + bra.w .LdlStLoop_6 + bra.w .LdlStLoop_7 + bra.w .LdlStLoop_8 + bra.w .LdlStLoop_9 + bra.w .LdlStLoop_10 + bra.w .LdlStLoop_11 + bra.w .LdlStLoop_12 + bra.w .LdlStLoop_13 + bra.w .LdlStLoop_14 + bra.w .LdlStLoop_15 + + DL_BODY 0 + DL_BODY 1 + DL_BODY 2 + DL_BODY 3 + DL_BODY 4 + DL_BODY 5 + DL_BODY 6 + DL_BODY 7 + DL_BODY 8 + DL_BODY 9 + DL_BODY 10 + DL_BODY 11 + DL_BODY 12 + DL_BODY 13 + DL_BODY 14 + DL_BODY 15 + +.LdlStDone: + lea SP_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15. +bitMaskWordLut: + .word 0x8000, 0x4000, 0x2000, 0x1000 + .word 0x0800, 0x0400, 0x0200, 0x0100 + .word 0x0080, 0x0040, 0x0020, 0x0010 + .word 0x0008, 0x0004, 0x0002, 0x0001 + + +| ---- surface68kStFillSpan --------------------------------------- +| +| Single-row span fill: leading-mask group + middle long-fills + +| trailing-mask group, all in one frame. Caller pre-clips so the +| span is fully on-surface. +| +| void surface68kStFillSpan(uint8_t *base, +| int16_t left, int16_t right, +| int16_t y, uint8_t color); +| +| Caller guarantees: 0 <= left <= right < 320, 0 <= y < 200. 
+| +| Register layout: +| a3 = base +| a4 = current group pointer +| d2.w = leftMask (then trailing trampoline target) +| d3.w = rightMask +| d4.w = numGroups - 1 (middle iter count when > 0) +| d5.l = loLong (planes 0+1 long template) +| d6.l = hiLong (planes 2+3 long template) +| d7.b = color (low nibble; tested via btst) +| d0,d1 = scratch + + .equ SP_FS_SAVED, 44 + .equ SP_FS_OFF, (SP_FS_SAVED + 4) + .equ SP_FS_BASE, SP_FS_OFF + 0 + .equ SP_FS_LEFT, SP_FS_OFF + 4 + 2 + .equ SP_FS_RIGHT, SP_FS_OFF + 8 + 2 + .equ SP_FS_Y, SP_FS_OFF + 12 + 2 + .equ SP_FS_COLOR, SP_FS_OFF + 16 + 3 + + + .globl _surface68kStFillSpan + +_surface68kStFillSpan: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.l SP_FS_BASE(%sp),%a3 + moveq #0,%d7 + move.b SP_FS_COLOR(%sp),%d7 | d7 = color + + | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0) + moveq #0,%d5 + btst #1,%d7 + beq.s .LfsLoBit1 + move.w #-1,%d5 +.LfsLoBit1: + btst #0,%d7 + beq.s .LfsLoBit0 + ori.l #0xFFFF0000,%d5 +.LfsLoBit0: + | hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0) + moveq #0,%d6 + btst #3,%d7 + beq.s .LfsHiBit3 + move.w #-1,%d6 +.LfsHiBit3: + btst #2,%d7 + beq.s .LfsHiBit2 + ori.l #0xFFFF0000,%d6 +.LfsHiBit2: + + | rowBase = base + y*160 -> a4 + move.w SP_FS_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 | d0 = y*160 + lea 0(%a3,%d0.l),%a4 + + | left in d0, right in d1 + move.w SP_FS_LEFT(%sp),%d0 + move.w SP_FS_RIGHT(%sp),%d1 + + | bitFirst in d2, bitLast in d3 + move.w %d0,%d2 + and.w #15,%d2 + move.w %d1,%d3 + and.w #15,%d3 + + | a4 += groupFirst * 8 + | numGroups = groupLast - groupFirst (in d4) + move.w %d0,%d4 + lsr.w #4,%d4 | d4 = groupFirst + move.w %d4,%d0 | save groupFirst into d0 + lsl.w #3,%d0 | d0 = groupFirst*8 + ext.l %d0 + add.l %d0,%a4 + move.w %d1,%d0 + lsr.w #4,%d0 | d0 = groupLast + sub.w %d4,%d0 | d0 = groupLast - groupFirst + move.w %d0,%d4 | d4 = numGroups + + | leftMask = (1 << (16 - bitFirst)) - 1 + moveq #16,%d0 + sub.w %d2,%d0 | d0 = 16 - 
bitFirst (1..16) + moveq #1,%d2 + lsl.l %d0,%d2 | 1 << (16 - bitFirst) + subq.l #1,%d2 | d2.w = leftMask + + | rightMask = ~((1 << (15 - bitLast)) - 1) + moveq #15,%d0 + sub.w %d3,%d0 | d0 = 15 - bitLast (0..15) + moveq #1,%d3 + lsl.l %d0,%d3 | 1 << (15 - bitLast) + subq.l #1,%d3 | inverse mask + not.w %d3 | d3.w = rightMask + + | If numGroups == 0, single-group: mask = leftMask & rightMask + tst.w %d4 + bne.s .LfsMulti + + and.w %d2,%d3 | d3 = combinedMask + move.w %d3,%d2 + bsr.s .LfsApplyMask + bra.w .LfsDone + +.LfsMulti: + | Leading mask (d2 already = leftMask) + bsr.s .LfsApplyMask + addq.l #8,%a4 | next group + + | numMid = numGroups - 1 + subq.w #1,%d4 + beq.s .LfsTrailing + +.LfsMidLoop: + move.l %d5,(%a4)+ + move.l %d6,(%a4)+ + subq.w #1,%d4 + bne.s .LfsMidLoop + +.LfsTrailing: + move.w %d3,%d2 | d2 = rightMask + bsr.s .LfsApplyMask + +.LfsDone: + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + +| Apply 4-plane word RMW at (a4) using mask in d2 (or notMask in d0). +| Plane N: if (color bit N) OR mask else AND notMask. +| Inputs: a4, d2.w = mask, d7.b = color +| Trashes: d0 +| Returns via rts. + +.LfsApplyMask: + move.w %d2,%d0 + not.w %d0 | d0 = notMask + btst #0,%d7 + beq.s .LfsAm0a + or.w %d2,(%a4) + bra.s .LfsAm1 +.LfsAm0a: + and.w %d0,(%a4) +.LfsAm1: + btst #1,%d7 + beq.s .LfsAm1a + or.w %d2,2(%a4) + bra.s .LfsAm2 +.LfsAm1a: + and.w %d0,2(%a4) +.LfsAm2: + btst #2,%d7 + beq.s .LfsAm2a + or.w %d2,4(%a4) + bra.s .LfsAm3 +.LfsAm2a: + and.w %d0,4(%a4) +.LfsAm3: + btst #3,%d7 + beq.s .LfsAm3a + or.w %d2,6(%a4) + rts +.LfsAm3a: + and.w %d0,6(%a4) + rts + + +| ---- surface68kStFillRectSingleGroup ----------------------------- +| +| Fill rect when groupFirst == groupLast (thin/single-column rect). +| Caller pre-computes firstGroupPtr = base + y*160 + groupFirst*8 +| and the mask = leftMask & rightMask. 
+| +| void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, +| uint16_t mask, +| uint16_t h, +| uint8_t color); +| +| Dispatched on color (low nibble) -> 16 specialized loops with +| hardcoded OR/AND per plane. Inner loop is 4 plane word RMWs + +| advance row + branch. +| +| drawLine V routes to fillRect 1xH which lands here. + + .equ SP_FRG_SAVED, 24 | d2-d5/a2-a3 = 6 longs + .equ SP_FRG_OFF, (SP_FRG_SAVED + 4) + .equ SP_FRG_PTR, SP_FRG_OFF + 0 + .equ SP_FRG_MASK, SP_FRG_OFF + 4 + 2 + .equ SP_FRG_H, SP_FRG_OFF + 8 + 2 + .equ SP_FRG_COLOR, SP_FRG_OFF + 12 + 3 + + + .macro FRG_LOOP color +.Lfrg_loop_\color: + .if ((\color) & 1) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 2) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 4) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 8) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + lea 152(%a3),%a3 | a3 now at row start; advance to next row (160-8) + subq.w #1,%d5 + bne.w .Lfrg_loop_\color + bra.w .Lfrg_done + .endm + + + .globl _surface68kStFillRectSingleGroup + +_surface68kStFillRectSingleGroup: + movem.l %d2-%d5/%a2-%a3,-(%sp) + + move.l SP_FRG_PTR(%sp),%a3 + move.w SP_FRG_MASK(%sp),%d3 + move.w SP_FRG_H(%sp),%d5 + tst.w %d5 + beq.w .Lfrg_done + move.w %d3,%d4 + not.w %d4 | d4 = notMask + + | Color dispatch + moveq #0,%d2 + move.b SP_FRG_COLOR(%sp),%d2 + and.w #0x0F,%d2 + add.w %d2,%d2 + add.w %d2,%d2 | * 4 for bra.w table + lea .Lfrg_table(%pc),%a2 + jmp 0(%a2,%d2.w) + +.Lfrg_table: + bra.w .Lfrg_loop_0 + bra.w .Lfrg_loop_1 + bra.w .Lfrg_loop_2 + bra.w .Lfrg_loop_3 + bra.w .Lfrg_loop_4 + bra.w .Lfrg_loop_5 + bra.w .Lfrg_loop_6 + bra.w .Lfrg_loop_7 + bra.w .Lfrg_loop_8 + bra.w .Lfrg_loop_9 + bra.w .Lfrg_loop_10 + bra.w .Lfrg_loop_11 + bra.w .Lfrg_loop_12 + bra.w .Lfrg_loop_13 + bra.w .Lfrg_loop_14 + bra.w .Lfrg_loop_15 + + FRG_LOOP 0 + FRG_LOOP 1 + FRG_LOOP 2 + FRG_LOOP 3 + FRG_LOOP 4 + FRG_LOOP 5 + FRG_LOOP 6 + FRG_LOOP 7 + FRG_LOOP 
8 + FRG_LOOP 9 + FRG_LOOP 10 + FRG_LOOP 11 + FRG_LOOP 12 + FRG_LOOP 13 + FRG_LOOP 14 + FRG_LOOP 15 + +.Lfrg_done: + movem.l (%sp)+,%d2-%d5/%a2-%a3 + rts + + +| ---- surface68kStFillRectMulti ------------------------------------- +| +| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips. +| Dispatched on color (low nibble) -> 16 specialized H-row loops. +| +| void surface68kStFillRectMulti(uint8_t *base, +| int16_t x, int16_t y, +| uint16_t w, uint16_t h, +| uint8_t color); +| +| Per row body (per color C): +| 1. Leading mask: 4 hardcoded plane RMW with leftMask +| 2. Middle: numMid groups of 2 long-writes (loLong, hiLong) +| 3. Trailing mask: 4 hardcoded plane RMW with rightMask +| 4. Advance rowBase by 160; decrement h; loop. +| +| Register layout in inner loop: +| d2.w = leftMask d3.w = rightMask +| d4.w = ~leftMask d5.w = ~rightMask +| d6.l = loLong d7.l = hiLong +| a3 = rowBase (advances by 160 each iter) +| a4 = a_grp (per-row scratch) +| d0,d1 = scratch +| +| Stack scratch (4 bytes at sp+0): +| 0..1 numMid (word, reload per row for mid loop) +| 2..3 h (word, decrement per row) + + .equ SP_FRM_SAVED, 44 + .equ SP_FRM_LOCAL, 4 + .equ SP_FRM_OFF, (SP_FRM_SAVED + 4 + SP_FRM_LOCAL) + .equ SP_FRM_BASE, SP_FRM_OFF + 0 + .equ SP_FRM_X, SP_FRM_OFF + 4 + 2 + .equ SP_FRM_Y, SP_FRM_OFF + 8 + 2 + .equ SP_FRM_W, SP_FRM_OFF + 12 + 2 + .equ SP_FRM_H, SP_FRM_OFF + 16 + 2 + .equ SP_FRM_COLOR, SP_FRM_OFF + 20 + 3 + + + .macro FRM_LOOP color +.LfrM_loop_\color: + | Leading mask at (a4)+, walking from row start + move.l %a3,%a4 | a4 = current row's groupFirst byte + .if ((\color) & 1) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + .if ((\color) & 2) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + .if ((\color) & 4) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + .if ((\color) & 8) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + | a4 now points to next group (8 bytes past row start). 
+ | Middle long-fill + move.w 0(%sp),%d0 + tst.w %d0 + beq.s .LfrM_skipMid_\color +.LfrM_midLoop_\color: + move.l %d6,(%a4)+ + move.l %d7,(%a4)+ + subq.w #1,%d0 + bne.s .LfrM_midLoop_\color +.LfrM_skipMid_\color: + | Trailing mask at (a4)+ + .if ((\color) & 1) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + .if ((\color) & 2) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + .if ((\color) & 4) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + .if ((\color) & 8) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + | Advance row (a3 unchanged through the body) + lea 160(%a3),%a3 + subq.w #1,2(%sp) + bne.w .LfrM_loop_\color + bra.w .LfrM_done + .endm + + + .globl _surface68kStFillRectMulti + +_surface68kStFillRectMulti: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_FRM_LOCAL(%sp),%sp + + | Load color, build loLong (d6) and hiLong (d7) + moveq #0,%d0 + move.b SP_FRM_COLOR(%sp),%d0 + moveq #0,%d6 + btst #1,%d0 + beq.s .LfrM_lo1 + move.w #-1,%d6 +.LfrM_lo1: + btst #0,%d0 + beq.s .LfrM_lo0 + ori.l #0xFFFF0000,%d6 +.LfrM_lo0: + moveq #0,%d7 + btst #3,%d0 + beq.s .LfrM_hi3 + move.w #-1,%d7 +.LfrM_hi3: + btst #2,%d0 + beq.s .LfrM_hi2 + ori.l #0xFFFF0000,%d7 +.LfrM_hi2: + + | Compute group ptrs and masks + | groupFirst = x >> 4; groupFirstByteOff = groupFirst * 8 + | bitFirst = x & 15 + move.w SP_FRM_X(%sp),%d0 + move.w SP_FRM_W(%sp),%d1 + add.w %d0,%d1 + subq.w #1,%d1 | d1 = x + w - 1 (last pixel) + + | leftMask via LUT[bitFirst] + move.w %d0,%d2 + and.w #15,%d2 + add.w %d2,%d2 + lea frmLeftMaskLut(%pc),%a2 + move.w (%a2,%d2.w),%d2 | d2 = leftMask + move.w %d2,%d4 + not.w %d4 | d4 = notLeftMask + + | rightMask via LUT[bitLast] + move.w %d1,%d3 + and.w #15,%d3 + add.w %d3,%d3 + lea frmRightMaskLut(%pc),%a2 + move.w (%a2,%d3.w),%d3 | d3 = rightMask + move.w %d3,%d5 + not.w %d5 | d5 = notRightMask + + | numMid = (last >> 4) - (x >> 4) - 1 + move.w %d1,%a2 | a2.w = lastPixel (temp) + move.l %a2,%d1 + lsr.w #4,%d1 | groupLast (low word) + move.w %d0,%a2 + 
move.l %a2,%d0 + lsr.w #4,%d0 | groupFirst + move.w %d0,%a4 | a4.w = groupFirst (save for byteOff calc) + sub.w %d0,%d1 | d1 = groupLast - groupFirst + subq.w #1,%d1 | d1 = numMid (>= 0 since multi-group caller) + move.w %d1,0(%sp) | numMid -> stack + + | h -> stack + move.w SP_FRM_H(%sp),%d1 + move.w %d1,2(%sp) + + | a3 = base + y*160 + groupFirst*8 + move.w SP_FRM_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 | y*160 + move.l SP_FRM_BASE(%sp),%a3 + add.l %d0,%a3 | rowBase = base + y*160 + move.l %a4,%d0 | groupFirst + lsl.w #3,%d0 | * 8 + ext.l %d0 + add.l %d0,%a3 | + groupFirst*8 + + | Dispatch on color + moveq #0,%d0 + move.b SP_FRM_COLOR(%sp),%d0 + and.w #0x0F,%d0 + add.w %d0,%d0 + add.w %d0,%d0 + lea .LfrM_table(%pc),%a2 + jmp 0(%a2,%d0.w) + +.LfrM_table: + bra.w .LfrM_loop_0 + bra.w .LfrM_loop_1 + bra.w .LfrM_loop_2 + bra.w .LfrM_loop_3 + bra.w .LfrM_loop_4 + bra.w .LfrM_loop_5 + bra.w .LfrM_loop_6 + bra.w .LfrM_loop_7 + bra.w .LfrM_loop_8 + bra.w .LfrM_loop_9 + bra.w .LfrM_loop_10 + bra.w .LfrM_loop_11 + bra.w .LfrM_loop_12 + bra.w .LfrM_loop_13 + bra.w .LfrM_loop_14 + bra.w .LfrM_loop_15 + + FRM_LOOP 0 + FRM_LOOP 1 + FRM_LOOP 2 + FRM_LOOP 3 + FRM_LOOP 4 + FRM_LOOP 5 + FRM_LOOP 6 + FRM_LOOP 7 + FRM_LOOP 8 + FRM_LOOP 9 + FRM_LOOP 10 + FRM_LOOP 11 + FRM_LOOP 12 + FRM_LOOP 13 + FRM_LOOP 14 + FRM_LOOP 15 + +.LfrM_done: + lea SP_FRM_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +| Same LUTs as in fillCircle.s; duplicated locally so each .o file's +| PC-rel lea can reach them within its own .text segment. 
+frmLeftMaskLut: + .word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF + .word 0x0FFF, 0x07FF, 0x03FF, 0x01FF + .word 0x00FF, 0x007F, 0x003F, 0x001F + .word 0x000F, 0x0007, 0x0003, 0x0001 + +frmRightMaskLut: + .word 0x8000, 0xC000, 0xE000, 0xF000 + .word 0xF800, 0xFC00, 0xFE00, 0xFF00 + .word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0 + .word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF + + +| ---- surface68kStLongFill ---------------------------------------- +| +| Bulk long-fill helper for full-row fills (surfaceClear, fillRect +| 320x200). Writes numGroups groups of 8 bytes (loLong, hiLong) +| starting at dst. Uses movem.l d2-d7 (3 groups = 24 bytes per +| batch) plus a tail pair to amortize loop overhead. +| +| void surface68kStLongFill(uint8_t *dst, +| uint16_t numGroups, +| uint32_t loLong, +| uint32_t hiLong); +| +| Per-batch cost: movem.l (56 cyc) + subq (8) + bne (10) = 74 cyc +| for 24 bytes -- ~3 cyc/byte vs ~5 cyc/byte for the straight C +| do-while of two move.l writes. + + .equ SP_LF_SAVED, 24 | d2-d7 = 6 longs + .equ SP_LF_OFF, (SP_LF_SAVED + 4) + .equ SP_LF_DST, SP_LF_OFF + 0 + .equ SP_LF_NGROUPS, SP_LF_OFF + 4 + 2 + .equ SP_LF_LO, SP_LF_OFF + 8 + .equ SP_LF_HI, SP_LF_OFF + 12 + + + .globl _surface68kStLongFill + +_surface68kStLongFill: + movem.l %d2-%d7,-(%sp) + + move.l SP_LF_DST(%sp),%a0 + move.l SP_LF_LO(%sp),%d2 + move.l SP_LF_HI(%sp),%d3 + move.w SP_LF_NGROUPS(%sp),%d0 + + | Set up d2-d7 = lo, hi, lo, hi, lo, hi (movem writes + | in d-reg order, so this gives the right alternation + | for 3 consecutive 8-byte groups). 
+ move.l %d2,%d4 + move.l %d2,%d6 + move.l %d3,%d5 + move.l %d3,%d7 + + | numBatches = numGroups / 3 (quotient), tail = remainder + ext.l %d0 + divu.w #3,%d0 + move.l %d0,%d1 + swap %d1 | d1.w = remainder + tst.w %d0 | quotient + beq.s .Llf_tail +.Llf_loop: + movem.l %d2-%d7,(%a0) + lea 24(%a0),%a0 + subq.w #1,%d0 + bne.s .Llf_loop + +.Llf_tail: + | Remainder: 0, 1, or 2 groups of 8 bytes + tst.w %d1 + beq.s .Llf_done + move.l %d2,(%a0)+ + move.l %d3,(%a0)+ + subq.w #1,%d1 + beq.s .Llf_done + move.l %d2,(%a0)+ + move.l %d3,(%a0)+ + +.Llf_done: + movem.l (%sp)+,%d2-%d7 + rts diff --git a/src/port/atarist/spriteAsm.s b/src/port/atarist/spriteAsm.s new file mode 100644 index 0000000..b1b233c --- /dev/null +++ b/src/port/atarist/spriteAsm.s @@ -0,0 +1,202 @@ +| ST byte-aligned sprite save / restore via 256-entry plane-spread +| LUT. The LUT entry for each plane byte value is a 32-bit "spread" +| where each plane byte bit lands at the corresponding plane-0 bit +| position of the 4-byte chunky output. For plane N, we shift the +| LUT entry left by N to put bits at the plane-N positions, then OR +| the 4 plane contributions together to get the chunky long. +| +| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut +| in hal.c: +| +| gStPlaneSpreadLut[b] for plane byte b: +| bit i of b (i = 0 = MSB = leftmost pixel) maps to bit +| bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4) +| of the long. Plane 0's bits land at nibble bit 0 of each +| chunky byte; left-shift the LUT entry by N for plane N. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
C signatures: +| +| void surface68kStSpriteSaveByteAligned(uint8_t *base, +| uint16_t x, uint16_t y, +| uint16_t w, uint16_t h, +| uint8_t *dstChunky); +| +| void surface68kStSpriteRestoreByteAligned(uint8_t *base, +| uint16_t x, uint16_t y, +| uint16_t w, uint16_t h, +| const uint8_t *srcChunky); + + .text + + + .equ SP_SAVED, 44 + .equ SP_OFF, (SP_SAVED + 4) + .equ SP_BASE, SP_OFF + 0 + .equ SP_X, SP_OFF + 4 + 2 + .equ SP_Y, SP_OFF + 8 + 2 + .equ SP_W, SP_OFF + 12 + 2 + .equ SP_H, SP_OFF + 16 + 2 + .equ SP_CHUNKY, SP_OFF + 20 + .equ SP_LUT, SP_OFF + 24 + + +| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer. +| a0 -> plane 0 byte (high or low half), strides 2 to next plane +| a1 -> output planar bytes (advanced by 4) +| a2 -> unused (LUT no longer needed) +| +| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds +| plane-major bytes (per row: plane0, plane1, plane2, plane3 per +| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT +| lookups + shifts + ORs. + + .macro SAVE_TILECOL + move.b (%a0),(%a1)+ | plane 0 + move.b 2(%a0),(%a1)+ | plane 1 + move.b 4(%a0),(%a1)+ | plane 2 + move.b 6(%a0),(%a1)+ | plane 3 + .endm + + + .globl _surface68kStSpriteSaveByteAligned + +_surface68kStSpriteSaveByteAligned: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.l SP_BASE(%sp),%a3 + move.l SP_CHUNKY(%sp),%a1 + | LUT pointer comes in via stack arg -- guaranteed + | long-aligned because gcc passes ptr args via + | move.l on a long-aligned sp slot. Avoids the BSS + | misalignment problem on TOS .PRG (BSS pads only to + | 2 bytes, even uint32_t slots can land at mod-4 = 2). 
+ move.l SP_LUT(%sp),%a2 + + move.w SP_W(%sp),%d5 + lsr.w #3,%d5 | d5 = tileCols + move.w SP_H(%sp),%d6 | d6 = h + move.w SP_X(%sp),%d7 + + | a4 = base + y*160 + (x>>4)*8 + move.w SP_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 | y << 5 + lsl.l #7,%d1 | y << 7 + add.l %d1,%d0 | y * 160 + lea 0(%a3,%d0.l),%a4 + moveq #0,%d0 + move.w %d7,%d0 + lsr.w #4,%d0 + lsl.w #3,%d0 + ext.l %d0 + add.l %d0,%a4 + + | Initial half offset: (x & 8) >> 3 = 0 or 1 + and.w #8,%d7 + lsr.w #3,%d7 + +.LsaveRow: + move.w %d5,%d3 | d3 = tileCols + moveq #0,%d2 + move.w %d7,%d2 + lea 0(%a4,%d2.l),%a0 | a0 = first plane-0 byte + +.LsaveCol: + SAVE_TILECOL + | Advance a0: bit 0 = 0 -> high, advance to low (+1). + | bit 0 = 1 -> low, advance to next group's high (+7). + move.l %a0,%d4 + btst #0,%d4 + bne.s .LsaveColWasLo + addq.l #1,%a0 + bra.s .LsaveColNext +.LsaveColWasLo: + lea 7(%a0),%a0 +.LsaveColNext: + subq.w #1,%d3 + bne.w .LsaveCol + + lea 160(%a4),%a4 + subq.w #1,%d6 + bne.w .LsaveRow + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + +| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes. +| a0 -> plane 0 byte (high or low half) +| a1 -> input planar bytes (advanced by 4) +| a2 -> unused (LUT no longer needed) +| +| Phase 10.5: dropped chunky -> planar conversion. Buffer layout +| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col. + + .macro RESTORE_TILECOL + move.b (%a1)+,(%a0) | plane 0 + move.b (%a1)+,2(%a0) | plane 1 + move.b (%a1)+,4(%a0) | plane 2 + move.b (%a1)+,6(%a0) | plane 3 + .endm + + + .globl _surface68kStSpriteRestoreByteAligned + +_surface68kStSpriteRestoreByteAligned: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.l SP_BASE(%sp),%a3 + move.l SP_CHUNKY(%sp),%a1 + move.l SP_LUT(%sp),%a2 | gC2pLut passed in + + | tileCols is held in a5 (not d5) because the macro + | trashes d5 (uses it for pb3). 
+ move.w SP_W(%sp),%d0 + lsr.w #3,%d0 + movea.w %d0,%a5 + move.w SP_H(%sp),%d6 + move.w SP_X(%sp),%d7 + + move.w SP_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 + lea 0(%a3,%d0.l),%a4 + moveq #0,%d0 + move.w %d7,%d0 + lsr.w #4,%d0 + lsl.w #3,%d0 + ext.l %d0 + add.l %d0,%a4 + + and.w #8,%d7 + lsr.w #3,%d7 + +.LrestoreRow: + move.w %a5,%d3 | d3 = tileCols (from a5) + moveq #0,%d2 + move.w %d7,%d2 + lea 0(%a4,%d2.l),%a0 + +.LrestoreCol: + RESTORE_TILECOL + move.l %a0,%d4 + btst #0,%d4 + bne.s .LrestoreColWasLo + addq.l #1,%a0 + bra.s .LrestoreColNext +.LrestoreColWasLo: + lea 7(%a0),%a0 +.LrestoreColNext: + subq.w #1,%d3 + bne.w .LrestoreCol + + lea 160(%a4),%a4 + subq.w #1,%d6 + bne.w .LrestoreRow + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts