10 changed files with 199 additions and 1140 deletions
--- a/src/codegen/spriteCompile.c
+++ b/src/codegen/spriteCompile.c
@ -42,17 +42,11 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 }


-// Save-under and restore-under emit dispatch. Each per-CPU pair
-// produces row-by-row copy bytes; the runtime dispatch in
-// src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_SAVE]
-// != SPRITE_NOT_COMPILED and falls back to the interpreted memcpy
-// path otherwise.
+// Save-under and restore-under emitters are IIgs-only at the moment.
+// On other CPUs these return 0, which the runtime treats as "not compiled",
+// falling back to spriteSaveUnderInterpreted / spriteRestoreUnderInterpreted.
 static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-#if defined(JOEYLIB_PLATFORM_DOS)
-    return spriteEmitSaveX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
-    return spriteEmitSave68k(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_IIGS)
+#if defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitSaveIigs(out, sp, shift);
 #else
    (void)out; (void)sp; (void)shift;
@ -62,11 +56,7 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift


 static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-#if defined(JOEYLIB_PLATFORM_DOS)
-    return spriteEmitRestoreX86(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
-    return spriteEmitRestore68k(out, sp, shift);
-#elif defined(JOEYLIB_PLATFORM_IIGS)
+#if defined(JOEYLIB_PLATFORM_IIGS)
    return spriteEmitRestoreIigs(out, sp, shift);
 #else
    (void)out; (void)sp; (void)shift;
@ -426,62 +416,18 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
 }


-// x86 / 68k compiled save: bytes are a cdecl
-//   void copy(const uint8_t *src, uint8_t *dst)
-// that walks heightPx rows of copyBytes from screen (stride
-// SURFACE_BYTES_PER_ROW) into the contiguous backup buffer.
+// Non-IIgs platforms have no compiled save/restore yet. The dispatch
+// in src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_*] !=
+// SPRITE_NOT_COMPILED, so these stubs should never actually run on
+// those platforms; they exist only so the functions declared in
+// spriteInternal.h still resolve at link time.
 void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
-    typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst);
-    uint8_t  shift;
-    int16_t  clippedX;
-    uint16_t widthPx;
-    uint16_t heightPx;
-    uint16_t copyBytes;
-    uint8_t *screenPtr;
-    CopyFn   fn;
-
-    shift     = (uint8_t)(x & 1);
-    clippedX  = (int16_t)(x & ~1);
-    widthPx   = (uint16_t)(sp->widthTiles  * 8);
-    heightPx  = (uint16_t)(sp->heightTiles * 8);
-    copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
-
-    screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)];
-
-    backup->sprite    = sp;
-    backup->x         = clippedX;
-    backup->y         = y;
-    backup->width     = (uint16_t)(copyBytes << 1);
-    backup->height    = heightPx;
-    backup->sizeBytes = (uint16_t)(copyBytes * heightPx);
-
-    fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]);
-    fn(screenPtr, backup->bytes);
+    (void)src; (void)sp; (void)x; (void)y; (void)backup;
 }


-// Mirror of save: caller swaps arg order so the same emitted shape
-// drives backup -> screen. The screen-side stride lives inside the
-// emitted bytes, so RESTORE has its own routine bytes (stride is
-// applied to dst instead of src).
 void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
-    typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst);
-    SpriteT  *sp;
-    uint8_t   shift;
-    uint16_t  copyBytes;
-    uint16_t  spriteBytesPerRow;
-    uint8_t  *screenPtr;
-    CopyFn    fn;
-
-    sp                = backup->sprite;
-    copyBytes         = (uint16_t)(backup->width >> 1);
-    spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
-    shift             = (copyBytes == spriteBytesPerRow) ? 0 : 1;
-
-    screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];
-
-    fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]);
-    fn(backup->bytes, screenPtr);
+    (void)dst; (void)backup;
 }

 #endif
--- a/src/codegen/spriteEmit68k.c
+++ b/src/codegen/spriteEmit68k.c
@ -1,12 +1,10 @@
 // 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl-
-// callable PIC draw / save / restore routines that read or write
-// 4bpp packed surface bytes via d16(a0) chains. Same shape as the
-// x86 emitter; only the instruction encoding differs.
+// callable PIC draw routines that write 4bpp packed surface bytes
+// via d16(a0) chains. Same shape as the x86 emitter; only the
+// instruction encoding differs.
 //
 // Calling convention (m68k gcc / mintlib):
-//   void draw(uint8_t *dst);                              -- arg in 4(sp)
-//   void save/restore(const uint8_t *src, uint8_t *dst);  -- args in 4(sp)/8(sp)
-//   a0/a1/d0/d1 are caller-saved.
+//   void draw(uint8_t *dst);    -- arg in 4(sp); a0/a1/d0/d1 caller-saved.
 //
 // Per-byte emit (no run coalescing yet):
 //   - all-transparent: skip
@ -40,46 +38,13 @@

 // ----- Prototypes -----

-static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
-static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
 static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
+static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
 static uint16_t writeBE16(uint8_t *out, uint16_t value);


 // ----- Emit helpers (alphabetical) -----

-// Shared body for save/restore. Walks heightPx rows of copyBytes
-// using `move.b (a0)+, (a1)+` byte-wise (safe regardless of pointer
-// alignment, since the screen-side x can land on an odd byte). After
-// each row except the last, advances either a0 (SAVE: src=screen) or
-// a1 (RESTORE: dst=screen) by (SURFACE_BYTES_PER_ROW - copyBytes) so
-// the strided side lines up with the next scanline; the contiguous
-// side advances naturally via the post-increment.
-//
-// strideOnSrc=true   -> source has the screen stride (SAVE)
-// strideOnSrc=false  -> destination has the screen stride (RESTORE)
-static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
-    uint16_t row;
-    uint16_t col;
-    uint16_t advance;
-
-    advance = (uint16_t)(SURFACE_BYTES_PER_ROW - copyBytes);
-
-    for (row = 0; row < heightPx; row++) {
-        // Unrolled: move.b (a0)+, (a1)+ -- 0x12D8.
-        for (col = 0; col < copyBytes; col++) {
-            cursor += writeBE16(out + cursor, 0x12D8u);
-        }
-        if (row + 1u < heightPx) {
-            // adda.w #advance, a0 (0xD0FC) for SAVE
-            // adda.w #advance, a1 (0xD2FC) for RESTORE
-            cursor += writeBE16(out + cursor, strideOnSrc ? 0xD0FCu : 0xD2FCu);
-            cursor += writeBE16(out + cursor, advance);
-        }
-    }
-    return cursor;
-}
-
 // Same logic as the x86 shiftedByteAt -- per-byte transparency
 // decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if
 // dest high nibble is opaque, 0x0F if low is opaque.
@ -219,46 +184,3 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 }


-// RESTORE: copy backup -> screen. Destination has the screen stride.
-uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-    uint16_t cursor;
-    uint16_t heightPx;
-    uint16_t copyBytes;
-
-    cursor    = 0;
-    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
-    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
-
-    // Prologue: movea.l 4(sp), a0 (src); movea.l 8(sp), a1 (dst).
-    cursor += writeBE16(out + cursor, 0x206Fu);
-    cursor += writeBE16(out + cursor, 0x0004u);
-    cursor += writeBE16(out + cursor, 0x226Fu);
-    cursor += writeBE16(out + cursor, 0x0008u);
-
-    cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, false);
-
-    cursor += writeBE16(out + cursor, 0x4E75u);
-    return cursor;
-}
-
-
-// SAVE: copy screen -> backup. Source has the screen stride.
-uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-    uint16_t cursor;
-    uint16_t heightPx;
-    uint16_t copyBytes;
-
-    cursor    = 0;
-    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
-    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
-
-    cursor += writeBE16(out + cursor, 0x206Fu);
-    cursor += writeBE16(out + cursor, 0x0004u);
-    cursor += writeBE16(out + cursor, 0x226Fu);
-    cursor += writeBE16(out + cursor, 0x0008u);
-
-    cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, true);
-
-    cursor += writeBE16(out + cursor, 0x4E75u);
-    return cursor;
-}
--- a/src/codegen/spriteEmitIigs.c
+++ b/src/codegen/spriteEmitIigs.c
@ -217,36 +217,6 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {


 // 65816 draw emit. Returns bytes written.
-//
-// Two emission paths share the body:
-//
-//   * M=8 byte path (default; matches the stub-set entry mode):
-//       opaque:  A9 vv         LDA #vv             ;  2c
-//                99 lo hi      STA abs,Y           ;  5c
-//                                                  ; 7c / 5 bytes per byte
-//       mixed:   B9 lo hi      LDA abs,Y           ;  5c
-//                29 mm         AND #~mask          ;  2c
-//                09 vv         ORA #val            ;  2c
-//                99 lo hi      STA abs,Y           ;  5c
-//                                                  ;14c / 9 bytes per byte
-//
-//   * M=16 word path (entered around runs of >= 2 consecutive
-//     fully-opaque bytes). Each word write covers 2 dest bytes:
-//       prologue: C2 20         REP #$20            ;  3c
-//       per pair: A9 lo hi      LDA #imm16          ;  3c
-//                 99 lo hi      STA abs,Y           ;  6c
-//                                                   ;  9c / 6 bytes per pair
-//       epilogue: E2 20         SEP #$20            ;  3c
-//
-//     vs. M=8 path doing the same 2 bytes: 14c / 10 bytes. Per-pair
-//     savings are 5c / 4 bytes; the 6c/4-byte REP+SEP transition is
-//     amortized once per opaque run, so the path is profitable for
-//     runs of 2 pairs (4 consecutive opaque bytes) or longer. For
-//     isolated pairs we still take the M=16 path -- the 1-cycle loss
-//     vs. M=8 is dwarfed by the typical-sprite opaque-run length.
-//
-// Mixed bytes always run on the M=8 path because the AND/ORA in
-// M=16 would clobber the adjacent byte.
 uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
@ -257,15 +227,11 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t absOffset;
    uint8_t  value;
    uint8_t  opaqueMask;
-    uint8_t  nextValue;
-    uint8_t  nextOpaqueMask;
-    bool     wide;

    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
    destBytesPerRow    = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
-    wide               = false;

    // No prologue: caller (the inline-asm stub in spriteCompile.c)
    // sets M=8/X=16/Y=destRow before JSL'ing here.
@ -277,42 +243,18 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
                continue;
            }
            absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col);
-
-            if (opaqueMask == 0xFFu && (col + 1) < destBytesPerRow) {
-                // Look ahead: if (col, col+1) are both fully opaque
-                // we can pair them as a single M=16 word write.
-                shiftedByteAt(sp, row, (uint16_t)(col + 1), shift,
-                              spriteBytesPerRow, &nextValue, &nextOpaqueMask);
-                if (nextOpaqueMask == 0xFFu) {
-                    if (!wide) {
-                        out[cursor++] = 0xC2;            // REP #$20 -- M=16
-                        out[cursor++] = 0x20;
-                        wide = true;
-                    }
-                    out[cursor++] = 0xA9;                // LDA #imm16
-                    cursor += writeLE16(out + cursor,
-                                        (uint16_t)(((uint16_t)nextValue << 8) | value));
-                    out[cursor++] = 0x99;                // STA abs,Y
-                    cursor += writeLE16(out + cursor, absOffset);
-                    col++;                               // consumed col+1
-                    continue;
-                }
-            }
-
-            // Falls through here for: isolated opaque (no pair), mixed,
-            // or the trailing odd byte at the right edge. All on M=8.
-            if (wide) {
-                out[cursor++] = 0xE2;                    // SEP #$20 -- back to M=8
-                out[cursor++] = 0x20;
-                wide = false;
-            }
-
            if (opaqueMask == 0xFFu) {
+                // lda #imm    A9 ii
+                // sta abs,Y   99 lo hi
                out[cursor++] = 0xA9;
                out[cursor++] = value;
                out[cursor++] = 0x99;
                cursor += writeLE16(out + cursor, absOffset);
            } else {
+                // lda abs,Y   B9 lo hi
+                // and #mask   29 mm
+                // ora #val    09 vv
+                // sta abs,Y   99 lo hi
                out[cursor++] = 0xB9;
                cursor += writeLE16(out + cursor, absOffset);
                out[cursor++] = 0x29;
@ -325,16 +267,6 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
        }
    }

-    // Routine exits in M=8: the JSL stub assumes M=8 throughout (the
-    // stub itself only ever ran with M=8 and doesn't restore M). The
-    // asm wrapper after the JSL forces M=16 again, but be defensive
-    // and ensure we leave M=8 here so the stub's PLB/RTL run as
-    // expected even if the wrapper convention changes.
-    if (wide) {
-        out[cursor++] = 0xE2;
-        out[cursor++] = 0x20;
-    }
-
    // Epilogue: rtl (large memory model -b uses JSL/RTL).
    out[cursor++] = 0x6B;
    return cursor;
--- a/src/codegen/spriteEmitX86.c
+++ b/src/codegen/spriteEmitX86.c
@ -1,23 +1,20 @@
 // x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
-// draw / save / restore routines that read or write 4bpp packed
-// surface bytes via [esi+disp8] chains. The C side calls them
-// through a function pointer cast.
+// draw routines that write 4bpp packed surface bytes via
+// [esi+disp8] chains. The C side calls them through a function
+// pointer cast.
 //
 // Calling convention:
-//   draw(uint8_t *dst)                              -- arg in [esp+8] after prologue saves esi
-//   save/restore(const uint8_t *src, uint8_t *dst)  -- args in [esp+12]/[esp+16] after esi+edi save
+//   draw(uint8_t *dst)             -- esi advances row by row
 //
-// Per-byte emit, with opaque-run coalescing for the draw path:
+// Save and restore are not compiled -- they're uniform memcpy-
+// shaped operations, and the interpreted C path handles them at
+// memcpy speed via the standard library.
+//
+// Per-byte emit (no run coalescing yet):
 //   - all-transparent (both nibbles 0): skip, no instruction
+//   - all-opaque: mov byte [esi+col], imm8       (4 bytes encoded)
 //   - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
 //                                                 (3 + 2 + 2 + 3 = 10 bytes)
-//   - run of N consecutive fully-opaque bytes: emit largest chunks
-//     while N >= 4: mov dword [esi+col], imm32   (7 bytes,  1 store)
-//     if   N >= 2: mov word  [esi+col], imm16   (6 bytes,  1 store)
-//     if   N == 1: mov byte  [esi+col], imm8    (4 bytes,  1 store)
-//     A run of 4 opaque bytes is therefore one 7-byte store instead of
-//     four 4-byte stores (16 bytes / 4 stores). Unaligned access is
-//     fine on 386+.
 // Per row:
 //   add esi, SURFACE_BYTES_PER_ROW                (6 bytes encoded)
 // Prologue:
@ -48,69 +45,12 @@

 // ----- Prototypes -----

-static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
-static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
 static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
+static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);


 // ----- Emit helpers (alphabetical) -----

-// Shared body for save/restore. Walks heightPx rows of copyBytes
-// using rep movsd for the dword-aligned bulk and rep movsb for the
-// byte tail. After each row except the last, advances either esi
-// or edi by (SURFACE_BYTES_PER_ROW - copyBytes) so the strided side
-// (screen) lines up with the next scanline; the contiguous side
-// (backup) advances naturally because rep movs* leaves the index
-// register one past the last byte copied.
-//
-// strideOnSrc=true   -> source has the screen stride (SAVE)
-// strideOnSrc=false  -> destination has the screen stride (RESTORE)
-static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
-    uint16_t row;
-    uint16_t dwords;
-    uint16_t tail;
-    int32_t  advance;
-
-    dwords  = (uint16_t)(copyBytes >> 2);
-    tail    = (uint16_t)(copyBytes & 0x3u);
-    advance = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes;
-
-    for (row = 0; row < heightPx; row++) {
-        if (dwords > 0) {
-            // mov ecx, dwords (B9 imm32); rep movsd (F3 A5)
-            out[cursor++] = 0xB9;
-            out[cursor++] = (uint8_t)(dwords & 0xFFu);
-            out[cursor++] = (uint8_t)((dwords >> 8) & 0xFFu);
-            out[cursor++] = 0;
-            out[cursor++] = 0;
-            out[cursor++] = 0xF3;
-            out[cursor++] = 0xA5;
-        }
-        if (tail > 0) {
-            // mov ecx, tail (B9 imm32); rep movsb (F3 A4)
-            out[cursor++] = 0xB9;
-            out[cursor++] = (uint8_t)(tail & 0xFFu);
-            out[cursor++] = 0;
-            out[cursor++] = 0;
-            out[cursor++] = 0;
-            out[cursor++] = 0xF3;
-            out[cursor++] = 0xA4;
-        }
-        if (row + 1u < heightPx) {
-            // SAVE: add esi, advance (81 C6 imm32)
-            // RESTORE: add edi, advance (81 C7 imm32)
-            out[cursor++] = 0x81;
-            out[cursor++] = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u);
-            out[cursor++] = (uint8_t)(advance & 0xFFu);
-            out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu);
-            out[cursor++] = (uint8_t)((advance >> 16) & 0xFFu);
-            out[cursor++] = (uint8_t)((advance >> 24) & 0xFFu);
-        }
-    }
-    return cursor;
-}
-
-
 // Decompose a destination byte's contribution from the sprite into
 // (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble
 // 0xF0 means high dest nibble is opaque; 0x0F means low is opaque;
@ -188,17 +128,11 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t  cursor;
    uint16_t  row;
    uint16_t  col;
-    uint16_t  runEnd;
-    uint16_t  runLen;
    uint16_t  heightPx;
    uint16_t  spriteBytesPerRow;
    uint16_t  destBytesPerRow;
    uint8_t   value;
    uint8_t   opaqueMask;
-    uint8_t   v1;
-    uint8_t   v2;
-    uint8_t   v3;
-    uint8_t   m;

    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
@ -210,7 +144,7 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    out[cursor++] = 0x8B; out[cursor++] = 0x74;
    out[cursor++] = 0x24; out[cursor++] = 0x08;

-    // Body: per row, scan dest bytes coalescing fully-opaque runs.
+    // Body: per row, per dest byte.
    for (row = 0; row < heightPx; row++) {
        if (row > 0) {
            // add esi, SURFACE_BYTES_PER_ROW (32-bit imm)
@ -220,14 +154,17 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
            out[cursor++] = 0x00;
            out[cursor++] = 0x00;
        }
-        col = 0;
-        while (col < destBytesPerRow) {
+        for (col = 0; col < destBytesPerRow; col++) {
            shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
            if (opaqueMask == 0x00) {
-                col++;
-                continue;
+                continue;       // both nibbles transparent
            }
-            if (opaqueMask != 0xFFu) {
+            if (opaqueMask == 0xFFu) {
+                // mov byte [esi+col], imm8       (C6 46 cc ii)
+                out[cursor++] = 0xC6; out[cursor++] = 0x46;
+                out[cursor++] = (uint8_t)(col & 0xFFu);
+                out[cursor++] = value;
+            } else {
                // Mixed: read-modify-write.
                //   mov al, [esi+col]            (8A 46 cc)
                //   and al, ~opaqueMask          (24 mm)
@ -241,61 +178,6 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
                out[cursor++] = value;
                out[cursor++] = 0x88; out[cursor++] = 0x46;
                out[cursor++] = (uint8_t)(col & 0xFFu);
-                col++;
-                continue;
-            }
-            // Fully opaque at col -- find the end of the run.
-            runEnd = (uint16_t)(col + 1);
-            while (runEnd < destBytesPerRow) {
-                shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m);
-                if (m != 0xFFu) {
-                    break;
-                }
-                runEnd++;
-            }
-            runLen = (uint16_t)(runEnd - col);
-
-            // Emit dword stores while >= 4 bytes remain, then a word
-            // store if >= 2, then a single byte. shiftedByteAt is cheap
-            // enough that re-reading per chunk beats threading a
-            // fixed-size buffer through.
-            while (runLen >= 4) {
-                shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
-                shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m);
-                shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m);
-                // mov dword [esi+col], imm32   (C7 46 cc ii ii ii ii)
-                out[cursor++] = 0xC7; out[cursor++] = 0x46;
-                out[cursor++] = (uint8_t)(col & 0xFFu);
-                out[cursor++] = value;
-                out[cursor++] = v1;
-                out[cursor++] = v2;
-                out[cursor++] = v3;
-                col    = (uint16_t)(col + 4);
-                runLen = (uint16_t)(runLen - 4);
-                if (runLen > 0) {
-                    shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
-                }
-            }
-            if (runLen >= 2) {
-                shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
-                // mov word [esi+col], imm16    (66 C7 46 cc ii ii)
-                out[cursor++] = 0x66;
-                out[cursor++] = 0xC7; out[cursor++] = 0x46;
-                out[cursor++] = (uint8_t)(col & 0xFFu);
-                out[cursor++] = value;
-                out[cursor++] = v1;
-                col    = (uint16_t)(col + 2);
-                runLen = (uint16_t)(runLen - 2);
-                if (runLen > 0) {
-                    shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
-                }
-            }
-            if (runLen == 1) {
-                // mov byte [esi+col], imm8     (C6 46 cc ii)
-                out[cursor++] = 0xC6; out[cursor++] = 0x46;
-                out[cursor++] = (uint8_t)(col & 0xFFu);
-                out[cursor++] = value;
-                col++;
            }
        }
    }
@ -307,51 +189,3 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 }


-// RESTORE: copy backup -> screen. Destination has the screen stride.
-uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-    uint16_t cursor;
-    uint16_t heightPx;
-    uint16_t copyBytes;
-
-    cursor    = 0;
-    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
-    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
-
-    // Prologue: push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16]
-    out[cursor++] = 0x56;
-    out[cursor++] = 0x57;
-    out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
-    out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
-
-    cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, false);
-
-    // Epilogue: pop edi; pop esi; ret
-    out[cursor++] = 0x5F;
-    out[cursor++] = 0x5E;
-    out[cursor++] = 0xC3;
-    return cursor;
-}
-
-
-// SAVE: copy screen -> backup. Source has the screen stride.
-uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-    uint16_t cursor;
-    uint16_t heightPx;
-    uint16_t copyBytes;
-
-    cursor    = 0;
-    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
-    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
-
-    out[cursor++] = 0x56;
-    out[cursor++] = 0x57;
-    out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
-    out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
-
-    cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, true);
-
-    out[cursor++] = 0x5F;
-    out[cursor++] = 0x5E;
-    out[cursor++] = 0xC3;
-    return cursor;
-}
--- a/src/codegen/spriteEmitter.h
+++ b/src/codegen/spriteEmitter.h
@ -27,19 +27,11 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
 // rectangle between the destination surface and a backup buffer. The
 // rectangle's width and start position depend on the shift: for
 // shift=0 (even x) it covers exactly the sprite's bytes per row;
-// for shift=1 (odd x) it covers one extra byte (left edge nibble).
-// Per-CPU emitters return 0 to mean "not implemented" -- the runtime
-// dispatch falls back to the interpreted path in that case.
-//
-// IIgs uses a self-modifying MVN-stub on top of these bytes; x86 and
-// 68k use a plain cdecl `void copy(const uint8_t *src, uint8_t *dst)`
-// where the caller swaps args between SAVE (screen->backup) and
-// RESTORE (backup->screen).
+// for shift=1 (odd x) it covers one extra byte on each side, rounded
+// up to even. Per-CPU emitters return 0 to mean "not implemented" --
+// the runtime dispatch falls back to the interpreted path in that
+// case.
 uint16_t spriteEmitSaveIigs   (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
-uint16_t spriteEmitSaveX86    (uint8_t *out, const SpriteT *sp, uint8_t shift);
-uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
-uint16_t spriteEmitSave68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
-uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);

 #endif
--- a/src/port/amiga/c2p.s
+++ b/src/port/amiga/c2p.s
@ -1,10 +1,13 @@
 | Amiga chunky-to-planar conversion -- 68000 hand-rolled.
 |
-| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
-| 4 KB lookup table built once at HAL init: each (sourceByte, position,
-| plane) tuple maps to the plane-byte bit contribution that source
-| byte makes when it sits at that position within a 4-byte (8-pixel)
-| planar group going to that plane.
+| Drop-in replacement for hal.c's old c2pRange C inner loop. The C
+| version walked every pixel and OR'd individual bits into 4 plane
+| accumulators -- ~1.5 s for a full 320x200 frame on a 7 MHz 68000
+| (the GCC m68k codegen is poor for tight bit-twiddling). This rewrite
+| uses a 4 KB lookup table built once at HAL init: each (sourceByte,
+| bytePosition, plane) tuple maps to the plane-byte-bit contribution
+| that source byte makes when it sits at that position within a
+| 4-byte (= 8-pixel) planar group.
 |
 | Calling convention: m68k-amigaos-gcc cdecl.
 |   Args on stack at 4(sp), 8(sp), ...
@ -19,17 +22,12 @@
 |                        uint16_t       n,      ; 24(sp) - planar byte count (low word)
 |                        const uint8_t *lut);   ; 28(sp) - 4 KB LUT base
 |
-| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
-| for source byte `src` sitting at byte-position `pos` (0..3) within
-| its 4-byte planar group, going to plane `plane` (0..3). All 16
-| (pos, plane) entries for one src byte are contiguous, so the inner
-| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
-| (0..15) and never has to advance an index register.
-|
-| Per planar byte we consume 4 source bytes (positions 0..3 of the
-| 8-pixel group). For each we compute d4 = src*16 with four add.w's
-| (faster than asl.w on 68000) and OR the four plane contributions
-| into d0..d3 with byte-displaced (a5,d4.w) reads.
+| LUT layout: lut[pos*1024 + plane*256 + src] = 1-byte plane contribution
+| for source byte `src` sitting at byte-position `pos` within its
+| 4-byte planar group, going to plane `plane`. Byte-position 0 is the
+| leftmost (its two pixels land in plane-byte bits 7 and 6); position
+| 3 is the rightmost (bits 1 and 0). Built once by chunkyToPlanarInit
+| (in hal.c) at HAL boot.
 |
 | GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
 | gcc driver.
@ -67,52 +65,54 @@ _chunkyToPlanarRow:
                moveq   #0,%d3                          | plane 3 acc

                | ----- Source byte position 0 -----
+                | a5 points to start of LUT. Plane 0/1/2/3 sub-tables
+                | for position 0 are at offsets 0/256/512/768.
                moveq   #0,%d4
                move.b  (%a0)+,%d4                      | src[0]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4                         | d4 = src * 16
-                or.b      0(%a5,%d4.w),%d0              | pos0 plane0
-                or.b      1(%a5,%d4.w),%d1              | pos0 plane1
-                or.b      2(%a5,%d4.w),%d2              | pos0 plane2
-                or.b      3(%a5,%d4.w),%d3              | pos0 plane3
+                move.l  %a5,%a6
+                or.b    (%a6,%d4.w),%d0                 | +0   = pos0 plane 0
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d1                 | +256 = pos0 plane 1
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d2                 | +512 = pos0 plane 2
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d3                 | +768 = pos0 plane 3

                | ----- Source byte position 1 -----
+                lea     256(%a6),%a6                    | advance to pos1 plane 0
                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[1]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      4(%a5,%d4.w),%d0              | pos1 plane0
-                or.b      5(%a5,%d4.w),%d1              | pos1 plane1
-                or.b      6(%a5,%d4.w),%d2              | pos1 plane2
-                or.b      7(%a5,%d4.w),%d3              | pos1 plane3
+                move.b  (%a0)+,%d4
+                or.b    (%a6,%d4.w),%d0
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d1
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d2
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d3

                | ----- Source byte position 2 -----
+                lea     256(%a6),%a6
                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[2]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      8(%a5,%d4.w),%d0              | pos2 plane0
-                or.b      9(%a5,%d4.w),%d1              | pos2 plane1
-                or.b     10(%a5,%d4.w),%d2              | pos2 plane2
-                or.b     11(%a5,%d4.w),%d3              | pos2 plane3
+                move.b  (%a0)+,%d4
+                or.b    (%a6,%d4.w),%d0
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d1
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d2
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d3

                | ----- Source byte position 3 -----
+                lea     256(%a6),%a6
                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[3]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b     12(%a5,%d4.w),%d0              | pos3 plane0
-                or.b     13(%a5,%d4.w),%d1              | pos3 plane1
-                or.b     14(%a5,%d4.w),%d2              | pos3 plane2
-                or.b     15(%a5,%d4.w),%d3              | pos3 plane3
+                move.b  (%a0)+,%d4
+                or.b    (%a6,%d4.w),%d0
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d1
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d2
+                lea     256(%a6),%a6
+                or.b    (%a6,%d4.w),%d3

                | ----- Store plane bytes -----
                move.b  %d0,(%a1)+
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
@ -77,12 +77,11 @@ static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE
 static bool     gCacheValid = false;

 // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
-// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
-// the plane-byte bit contribution that source byte `src` makes to
-// plane `plane` when it sits at byte-position `pos` within a 4-byte
-// (8-pixel) planar group. The src-major layout lets the asm inner
-// loop reach all 16 (pos, plane) entries for a single src byte via
-// 8-bit displacements off (a5, d4.w) without any LEA between reads.
+// (src/port/amiga/c2p.s). Layout: gC2pLut[pos*1024 + plane*256 + src]
+// = the plane-byte bit contribution that source byte `src` makes when
+// it sits at byte-position `pos` within a 4-byte (8-pixel) planar
+// group, going to plane `plane`. Built once by initC2pLut on the
+// first halPresent call.
 static uint8_t  gC2pLut[4 * 1024];
 static bool     gC2pLutReady = false;

@ -117,14 +116,14 @@ static void initC2pLut(void) {
    if (gC2pLutReady) {
        return;
    }
-    for (src = 0; src < 256; src++) {
    for (pos = 0; pos < 4; pos++) {
        highShift = (uint8_t)(7 - 2 * pos);
        lowShift  = (uint8_t)(6 - 2 * pos);
        for (plane = 0; plane < 4; plane++) {
+            for (src = 0; src < 256; src++) {
                highBit = (uint8_t)(((src >> 4) >> plane) & 1);
                lowBit  = (uint8_t)(((src & 0x0F) >> plane) & 1);
-                gC2pLut[src * 16 + pos * 4 + plane] =
+                gC2pLut[pos * 1024 + plane * 256 + src] =
                    (uint8_t)((highBit << highShift) | (lowBit << lowShift));
            }
        }
--- a/src/port/atarist/c2p.s
+++ b/src/port/atarist/c2p.s
@ -1,188 +0,0 @@
-| Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
-|
-| Drop-in replacement for hal.c's old c2pRow C inner loop. The C
-| version walked every pixel and built each plane word with a
-| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
-| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
-| codegen overhead. This rewrite uses a 4 KB lookup table built once
-| at HAL init: same layout as the Amiga c2p LUT, so the
-| (sourceByte, position, plane) -> 2-bit contribution mapping is
-| identical, but the routine packs results into ST word-interleaved
-| planar (4 plane words per 16-pixel group) instead of 4 separate
-| plane bytes.
-|
-| Each ST group is 8 source bytes -> 4 plane words. Source byte
-| positions 0..3 contribute to the HIGH byte of each plane word
-| (bits 15..8); positions 4..7 contribute to the LOW byte (bits
-| 7..0). Within a byte, the LUT for (src, bp%4, plane) already
-| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
-| entries for both halves -- we just shift d0..d3 left by 8 between
-| the halves to move the high-half bits up before the low half ORs
-| into the now-empty low byte.
-|
-| Calling convention: m68k-atari-mint-gcc cdecl.
-|   Args on stack at 4(sp), 8(sp), ...
-|   d2-d7, a2-a6 are callee-save.
-|   No return value.
-|
-| void chunkyToPlanarRowSt(const uint8_t *src,    ;  4(sp) - 4bpp packed source row
-|                          uint16_t      *dst,    ;  8(sp) - planar dest row (uint16_t*)
-|                          uint16_t       groupStart, ; 12(sp) - first group index (low word)
-|                          uint16_t       groupEnd,   ; 16(sp) - one-past-last group index (low word)
-|                          const uint8_t *lut);   ; 20(sp) - 4 KB LUT base
-|
-| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
-| contribution for source byte `src` at byte-position `pos` (0..3
-| within a 4-byte chunk) going to plane `plane` (0..3). All 16
-| (pos, plane) entries for one src byte are contiguous, so the inner
-| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
-| (0..15) without LEA between reads.
-|
-| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
-| the gcc driver.
-
-                .text
-                .globl  _chunkyToPlanarRowSt
-
-| MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes.
-                .equ    SAVED_REGS_SIZE, 44
-
-
-_chunkyToPlanarRowSt:
-                movem.l %d2-%d7/%a2-%a6,-(%sp)
-
-                move.l   4+SAVED_REGS_SIZE(%sp),%a0     | src row base
-                move.l   8+SAVED_REGS_SIZE(%sp),%a1     | dst (uint16_t*)
-                | Both groupStart and groupEnd are uint16_t but GCC
-                | promotes them to int and pushes 4 bytes each; the
-                | low word lives at +2 in big-endian layout.
-                move.w  12+SAVED_REGS_SIZE+2(%sp),%d6   | groupStart
-                move.w  16+SAVED_REGS_SIZE+2(%sp),%d7   | groupEnd
-                move.l  20+SAVED_REGS_SIZE(%sp),%a5     | LUT base
-
-                | Advance src and dst to the first group's data.
-                | Each group consumes 8 source bytes and produces 4
-                | dest words (8 bytes), so both pointers advance by
-                | groupStart * 8.
-                move.w  %d6,%d4
-                lsl.w   #3,%d4
-                add.w   %d4,%a0
-                add.w   %d4,%a1
-
-                sub.w   %d6,%d7                         | groupCount = end - start
-                subq.w  #1,%d7                          | DBRA bias
-                bmi     .Ldone
-
-.LgroupLoop:
-                moveq   #0,%d0                          | plane 0 acc
-                moveq   #0,%d1                          | plane 1 acc
-                moveq   #0,%d2                          | plane 2 acc
-                moveq   #0,%d3                          | plane 3 acc
-
-                | ===== Source bytes 0..3 -> high byte of each plane word =====
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4                         | d4 = src * 16
-                or.b      0(%a5,%d4.w),%d0
-                or.b      1(%a5,%d4.w),%d1
-                or.b      2(%a5,%d4.w),%d2
-                or.b      3(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      4(%a5,%d4.w),%d0
-                or.b      5(%a5,%d4.w),%d1
-                or.b      6(%a5,%d4.w),%d2
-                or.b      7(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      8(%a5,%d4.w),%d0
-                or.b      9(%a5,%d4.w),%d1
-                or.b     10(%a5,%d4.w),%d2
-                or.b     11(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b     12(%a5,%d4.w),%d0
-                or.b     13(%a5,%d4.w),%d1
-                or.b     14(%a5,%d4.w),%d2
-                or.b     15(%a5,%d4.w),%d3
-
-                | Move accumulated bits into the HIGH byte of each word.
-                lsl.w   #8,%d0
-                lsl.w   #8,%d1
-                lsl.w   #8,%d2
-                lsl.w   #8,%d3
-
-                | ===== Source bytes 4..7 -> low byte of each plane word =====
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      0(%a5,%d4.w),%d0
-                or.b      1(%a5,%d4.w),%d1
-                or.b      2(%a5,%d4.w),%d2
-                or.b      3(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      4(%a5,%d4.w),%d0
-                or.b      5(%a5,%d4.w),%d1
-                or.b      6(%a5,%d4.w),%d2
-                or.b      7(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      8(%a5,%d4.w),%d0
-                or.b      9(%a5,%d4.w),%d1
-                or.b     10(%a5,%d4.w),%d2
-                or.b     11(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b     12(%a5,%d4.w),%d0
-                or.b     13(%a5,%d4.w),%d1
-                or.b     14(%a5,%d4.w),%d2
-                or.b     15(%a5,%d4.w),%d3
-
-                | Store 4 plane words.
-                move.w  %d0,(%a1)+
-                move.w  %d1,(%a1)+
-                move.w  %d2,(%a1)+
-                move.w  %d3,(%a1)+
-
-                dbra    %d7,.LgroupLoop
-
-.Ldone:
-                movem.l (%sp)+,%d2-%d7/%a2-%a6
-                rts
--- a/src/port/atarist/hal.c
+++ b/src/port/atarist/hal.c
@ -64,19 +64,12 @@
 // ----- Prototypes -----

 static uint16_t quantizeColorToSt(uint16_t orgb);
+static void     c2pRow(const uint8_t *src, uint16_t *dst, uint16_t groupStart, uint16_t groupEnd);
 static void     c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd);
 static void     flattenScbPalettes(const SurfaceT *src);
-static void     initC2pLut(void);
 static void     writeDiagnostics(void);
 static long     writePrevPaletteRegs(void);

-// Provided by src/port/atarist/c2p.s.
-extern void chunkyToPlanarRowSt(const uint8_t *src,
-                                uint16_t *dst,
-                                uint16_t groupStart,
-                                uint16_t groupEnd,
-                                const uint8_t *lut);
-
 static __attribute__((interrupt_handler)) void timerBIsr(void);
 static __attribute__((interrupt_handler)) void vblIsr(void);
 static void                                    buildTransitions(const SurfaceT *src);
@ -136,31 +129,55 @@ static uint8_t  gCachedScb    [SURFACE_HEIGHT];
 static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
 static bool     gCacheValid = false;

-// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
-// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
-// = the 2-bit plane-byte contribution for source byte `src` at
-// byte-position `pos` (0..3 within a 4-byte chunk) going to plane
-// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
-// the same table feeds both halves of an ST plane word: positions
-// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
-// byte. Built once by initC2pLut on the first halPresent call.
-static uint8_t  gC2pLut[4 * 1024];
-static bool     gC2pLutReady = false;
-
 // ----- Internal helpers (alphabetical) -----

+// Convert chunky 4bpp groups [groupStart, groupEnd) of one row to ST
+// planar: each group is 8 src bytes (16 pixels) -> 4 plane words in
+// dst. The sub-range lets halPresentRect skip unchanged groups.
+static void c2pRow(const uint8_t *src, uint16_t *dst, uint16_t groupStart, uint16_t groupEnd) {
+    uint16_t group;
+    uint16_t px;
+    uint16_t plane0;
+    uint16_t plane1;
+    uint16_t plane2;
+    uint16_t plane3;
+    uint8_t  byte;
+    uint8_t  nibble;
+    uint16_t bit;
+
+    for (group = groupStart; group < groupEnd; group++) {
+        plane0 = 0;
+        plane1 = 0;
+        plane2 = 0;
+        plane3 = 0;
+
+        for (px = 0; px < 16; px++) {
+            byte   = src[(group * 8) + (px >> 1)];  // two pixels per source byte
+            nibble = (uint8_t)((px & 1) ? (byte & 0x0F) : (byte >> 4));  // even px: high nibble; odd px: low
+            bit    = (uint16_t)(15 - px);  // px 0 -> bit 15, px 15 -> bit 0
+            plane0 = (uint16_t)(plane0 | (((nibble >> 0) & 1) << bit));  // nibble bit N feeds plane N
+            plane1 = (uint16_t)(plane1 | (((nibble >> 1) & 1) << bit));
+            plane2 = (uint16_t)(plane2 | (((nibble >> 2) & 1) << bit));
+            plane3 = (uint16_t)(plane3 | (((nibble >> 3) & 1) << bit));
+        }
+
+        dst[(group * 4) + 0] = plane0;  // plane words stored consecutively, plane 0 first
+        dst[(group * 4) + 1] = plane1;
+        dst[(group * 4) + 2] = plane2;
+        dst[(group * 4) + 3] = plane3;
+    }
+}
+
+
 static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd) {
    int16_t        y;
    const uint8_t *srcLine;
    uint16_t      *dstLine;

-    if (!gC2pLutReady) {
-        initC2pLut();
-    }
    for (y = y0; y < y1; y++) {
        srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW];
        dstLine = (uint16_t *)&gScreenBase[y * ST_BYTES_PER_ROW];
-        chunkyToPlanarRowSt(srcLine, dstLine, groupStart, groupEnd, gC2pLut);
+        c2pRow(srcLine, dstLine, groupStart, groupEnd);
    }
 }

@ -246,37 +263,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
 }


-// Build the 4 KB chunky-to-planar lookup table consumed by
-// chunkyToPlanarRowSt. Same layout/contents as the Amiga c2p LUT;
-// see src/port/atarist/c2p.s for the addressing math.
-static void initC2pLut(void) {
-    uint16_t pos;
-    uint16_t plane;
-    uint16_t src;
-    uint8_t  highShift;
-    uint8_t  lowShift;
-    uint8_t  highBit;
-    uint8_t  lowBit;
-
-    if (gC2pLutReady) {
-        return;
-    }
-    for (src = 0; src < 256; src++) {
-        for (pos = 0; pos < 4; pos++) {
-            highShift = (uint8_t)(7 - 2 * pos);
-            lowShift  = (uint8_t)(6 - 2 * pos);
-            for (plane = 0; plane < 4; plane++) {
-                highBit = (uint8_t)(((src >> 4) >> plane) & 1);
-                lowBit  = (uint8_t)(((src & 0x0F) >> plane) & 1);
-                gC2pLut[src * 16 + pos * 4 + plane] =
-                    (uint8_t)((highBit << highShift) | (lowBit << lowShift));
-            }
-        }
-    }
-    gC2pLutReady = true;
-}
-
-
 // 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
 // each 4-bit channel).
 static uint16_t quantizeColorToSt(uint16_t orgb) {
--- a/tools/joeysprite/joeysprite.c
+++ b/tools/joeysprite/joeysprite.c
@ -1,36 +1,17 @@
-// joeysprite: host-side compiler that turns sprite art into a `.spr`
-// file ready to be loaded at runtime by spriteLoadFile.
+// joeysprite: host-side compiler that turns raw tile data into a
+// `.spr` file ready to be loaded at runtime by spriteLoadFile.
 //
 // Usage:
 //   joeysprite --target {iigs,amiga,atarist,dos}
-//              [--width-tiles N --height-tiles M]
-//              INPUT OUTPUT.spr
+//              --width-tiles N --height-tiles M
+//              input.tiles output.spr
 //
-// Two input formats are accepted; the first 2 bytes select the path:
-//
-//   PPM (P6) -- 8-bit-per-channel raster from any pixel-art tool that
-//   exports PPM (GIMP, ImageMagick `convert`, paint.net, etc.). Image
-//   dimensions must be multiples of 8 in both axes; widthTiles /
-//   heightTiles are auto-derived as W/8 and H/8 (CLI overrides are
-//   optional and must match). Each input RGB is reduced to a 12-bit
-//   $0RGB color (high nibble of each channel); the input must use
-//   no more than 16 distinct $0RGB colors after that reduction. The
-//   FIRST color encountered (typically the top-left pixel) is bound
-//   to palette index 0, which the runtime treats as transparent --
-//   so paint your sprite background with that pixel's color.
-//
-//   Raw `.tiles` -- widthTiles * heightTiles * 32 bytes, laid out
+// `input.tiles` is widthTiles * heightTiles * 32 bytes, laid out
 // tile-major as the runtime SpriteT.tileData expects: tile (0,0)
 // first 32 bytes, tile (1,0) next 32, ... tile (widthTiles-1, 0),
 // then tile (0,1), and so on. Inside each tile, rows are stored
 // top-to-bottom and each row is 4 bytes (8 pixels at 4bpp packed,
-//   high nibble = left pixel). --width-tiles / --height-tiles are
-//   required for this path since the file carries no header.
-//
-// The .spr output carries indices only -- the palette mapping is the
-// application's responsibility (typical pattern: ship a separate
-// .jas built from the same PPM via joeyasset, or hand-author the
-// palette in code).
+// high nibble = left pixel).
 //
 // Output `.spr` format (target-native byte order for code; see
 // DESIGN.md §12). Mirrors src/core/sprite.c's reader:
@ -40,19 +21,14 @@
 //   bytes 4-5   tileBytes (LE16) = widthTiles*heightTiles*32
 //   ...         offsets (JOEY_SPRITE_SHIFT_COUNT * SPRITE_OP_COUNT *
 //               uint16_t LE): [draw_s0, save_s0, restore_s0,
-//               draw_s1, save_s1, restore_s1]. Each entry is the
-//               byte offset of that routine within the compiled-code
-//               region, or 0xFFFF (SPRITE_NOT_COMPILED) if the per-CPU
-//               emitter returned 0 bytes for that op -- the runtime
-//               then falls back to the interpreted memcpy/RMW path.
+//               draw_s1, save_s1, restore_s1]. Save/restore offsets
+//               are 0 here -- the runtime keeps the memcpy-based
+//               interpreter for those ops.
 //   ...         compiled code (codeSize bytes)
 //   ...         raw tile data (tileBytes bytes; same layout as the
 //               input file, lets the runtime interpreter handle
 //               clipped draws without decoding the compiled bytes).

-#include <ctype.h>
-#include <errno.h>
-#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -75,26 +51,18 @@ typedef enum {
 // ----- Constants -----

 #define MAX_SCRATCH_BYTES   (16u * 1024u)
-// Pixel art conventions for sprite work.
-#define TILE_PIXELS         8
-#define TILE_BYTES          32
-#define TILE_BYTES_PER_ROW  4
-#define MAX_PALETTE_ENTRIES 16
-
-#define PPM_TOKEN_MAX       64
+#define SPR_HEADER_SIZE     6
+// Save/restore offsets are reserved (0) for now -- the runtime
+// memcpy interpreter handles them.
+#define SHIFT_OPS           3
+#define OFFSET_TABLE_BYTES  (JOEY_SPRITE_SHIFT_COUNT * SHIFT_OPS * 2u)


 // ----- Prototypes -----

-static int       buildPalette(const uint8_t *rgb, int width, int height, uint8_t *outIndices);
 static int       compileToSpr(const SpriteT *sp, TargetE target, const char *outPath);
-static uint16_t  emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, uint8_t op, TargetE target);
-static bool      fileIsPpm(const char *path);
-static int       loadPpm(const char *path, int *outWidth, int *outHeight, uint8_t **outPixels);
-static int       loadPpmAsTiles(const char *path, long *widthTiles, long *heightTiles, uint8_t **outTiles, uint32_t *outSize);
+static uint16_t  emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, TargetE target);
 static int       loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize);
-static void      packIndicesToTiles(const uint8_t *indices, int width, int height, uint8_t *outTiles);
-static int       parsePpmToken(FILE *fp, char *out, int outLen);
 static TargetE   parseTarget(const char *name);
 static int       usage(const char *prog);
 static int       writeLE16(FILE *fp, uint16_t v);
@ -102,68 +70,16 @@ static int       writeLE16(FILE *fp, uint16_t v);

 // ----- Internal helpers (alphabetical) -----

-// Reduce every input RGB triple to a 12-bit $0RGB color and assign
-// palette indices in encounter order: top-left pixel = index 0,
-// next-encountered = index 1, etc. The runtime treats index 0 as
-// transparent, so the top-left pixel must be the sprite's background
-// color. Returns the number of distinct colors found, or -1 if the
-// image needs more than 16 entries after $0RGB quantization.
-//
-// Mirrors joeyasset's buildPalette but only emits the index array;
-// joeysprite drops the $0RGB palette since the .spr format carries
-// indices alone.
-static int buildPalette(const uint8_t *rgb, int width, int height, uint8_t *outIndices) {
-    uint16_t  palette[MAX_PALETTE_ENTRIES];
-    int       paletteCount;
-    int       total;
-    int       i;
-    int       j;
-    uint8_t   r;
-    uint8_t   g;
-    uint8_t   b;
-    uint16_t  color;
-
-    total        = width * height;
-    paletteCount = 0;
-    for (i = 0; i < total; i++) {
-        r = (uint8_t)(rgb[i * 3 + 0] >> 4);
-        g = (uint8_t)(rgb[i * 3 + 1] >> 4);
-        b = (uint8_t)(rgb[i * 3 + 2] >> 4);
-        color = (uint16_t)((r << 8) | (g << 4) | b);
-        for (j = 0; j < paletteCount; j++) {
-            if (palette[j] == color) {
-                break;
-            }
-        }
-        if (j == paletteCount) {
-            if (paletteCount >= MAX_PALETTE_ENTRIES) {
-                return -1;
-            }
-            palette[paletteCount] = color;
-            paletteCount++;
-        }
-        outIndices[i] = (uint8_t)j;
-    }
-    return paletteCount;
-}
-
-
-// Two-pass: pass 1 sizes every (shift, op) routine into shiftOpSizes;
-// pass 2 stamps them into the code buffer at their cumulative offsets.
-// Routines that return 0 bytes (the per-CPU emitter doesn't implement
-// that op) get SPRITE_NOT_COMPILED in their offset slot so the runtime
-// dispatch falls back to the interpreted path.
 static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath) {
    uint8_t  *scratch;
    uint8_t  *codeBuf;
-    uint16_t  routineSizes[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
-    uint16_t  routineOffsets[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
+    uint16_t  shiftLengths[JOEY_SPRITE_SHIFT_COUNT];
    uint32_t  totalCodeSize;
    uint8_t   shift;
    uint8_t   op;
    uint16_t  written;
    uint16_t  cursor;
-    uint16_t  value;
+    uint16_t  offset;
    FILE     *fp;
    int       rc;

@ -175,17 +91,10 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)

    totalCodeSize = 0;
    for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
-        for (op = 0; op < SPRITE_OP_COUNT; op++) {
-            written                    = emitForTarget(scratch, sp, shift, op, target);
-            routineSizes[shift][op]    = written;
-            if (written == 0) {
-                routineOffsets[shift][op] = SPRITE_NOT_COMPILED;
-            } else {
-                routineOffsets[shift][op] = (uint16_t)totalCodeSize;
+        written              = emitForTarget(scratch, sp, shift, target);
+        shiftLengths[shift]  = written;
        totalCodeSize       += written;
    }
-        }
-    }
    if (totalCodeSize > 0xFFFFu) {
        fprintf(stderr, "joeysprite: emitted %u code bytes; max is 65535\n",
                (unsigned)totalCodeSize);
@ -193,7 +102,7 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
        return 2;
    }

-    codeBuf = (uint8_t *)malloc(totalCodeSize > 0 ? totalCodeSize : 1);
+    codeBuf = (uint8_t *)malloc(totalCodeSize);
    if (codeBuf == NULL) {
        fprintf(stderr, "joeysprite: out of memory for code buffer\n");
        free(scratch);
@ -202,14 +111,9 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)

    cursor = 0;
    for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
-        for (op = 0; op < SPRITE_OP_COUNT; op++) {
-            if (routineSizes[shift][op] == 0) {
-                continue;
-            }
-            written = emitForTarget(codeBuf + cursor, sp, shift, op, target);
+        written = emitForTarget(codeBuf + cursor, sp, shift, target);
        cursor  = (uint16_t)(cursor + written);
    }
-    }

    fp = fopen(outPath, "wb");
    if (fp == NULL) {
@ -225,17 +129,25 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
    if (rc == 0 && writeLE16(fp, (uint16_t)totalCodeSize) != 0) rc = 2;
    if (rc == 0 && writeLE16(fp, (uint16_t)(sp->widthTiles * sp->heightTiles * 32u)) != 0) rc = 2;

+    // Offset table: cumulative draw offsets + zeros for save/restore.
+    offset = 0;
    for (shift = 0; rc == 0 && shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
-        for (op = 0; op < SPRITE_OP_COUNT; op++) {
-            value = routineOffsets[shift][op];
+        for (op = 0; op < SHIFT_OPS; op++) {
+            uint16_t value;
+            if (op == SPRITE_OP_DRAW) {
+                value = offset;
+            } else {
+                value = 0;
+            }
            if (writeLE16(fp, value) != 0) {
                rc = 2;
                break;
            }
        }
+        offset = (uint16_t)(offset + shiftLengths[shift]);
    }

-    if (rc == 0 && totalCodeSize > 0) {
+    if (rc == 0) {
        if (fwrite(codeBuf, 1, totalCodeSize, fp) != totalCodeSize) {
            rc = 2;
        }
@ -267,207 +179,21 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
 }


-static uint16_t emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, uint8_t op, TargetE target) {
+static uint16_t emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, TargetE target) {
    switch (target) {
        case TARGET_DOS:
-            switch (op) {
-                case SPRITE_OP_DRAW:    return spriteEmitDrawX86   (out, sp, shift);
-                case SPRITE_OP_SAVE:    return spriteEmitSaveX86   (out, sp, shift);
-                case SPRITE_OP_RESTORE: return spriteEmitRestoreX86(out, sp, shift);
-                default:                return 0;
-            }
+            return spriteEmitDrawX86(out, sp, shift);
        case TARGET_AMIGA:
        case TARGET_ATARIST:
-            switch (op) {
-                case SPRITE_OP_DRAW:    return spriteEmitDraw68k   (out, sp, shift);
-                case SPRITE_OP_SAVE:    return spriteEmitSave68k   (out, sp, shift);
-                case SPRITE_OP_RESTORE: return spriteEmitRestore68k(out, sp, shift);
-                default:                return 0;
-            }
+            return spriteEmitDraw68k(out, sp, shift);
        case TARGET_IIGS:
-            switch (op) {
-                case SPRITE_OP_DRAW:    return spriteEmitDrawIigs   (out, sp, shift);
-                case SPRITE_OP_SAVE:    return spriteEmitSaveIigs   (out, sp, shift);
-                case SPRITE_OP_RESTORE: return spriteEmitRestoreIigs(out, sp, shift);
-                default:                return 0;
-            }
+            return spriteEmitDrawIigs(out, sp, shift);
        default:
            return 0;
    }
 }


-// Sniff the first 2 bytes for the PPM magic. Errors return false (the
-// caller will fall through to the .tiles loader, which surfaces a
-// clear error if the bytes aren't valid tile data either).
-static bool fileIsPpm(const char *path) {
-    FILE *fp;
-    int   c0;
-    int   c1;
-
-    fp = fopen(path, "rb");
-    if (fp == NULL) {
-        return false;
-    }
-    c0 = fgetc(fp);
-    c1 = fgetc(fp);
-    fclose(fp);
-    return (c0 == 'P' && c1 == '6');
-}
-
-
-// Read a PPM (P6) raster into a freshly allocated 8-bit RGB buffer.
-// Mirrors joeyasset's loadPpm. Caller frees *outPixels.
-static int loadPpm(const char *path, int *outWidth, int *outHeight, uint8_t **outPixels) {
-    FILE    *fp;
-    char     tok[PPM_TOKEN_MAX];
-    int      width;
-    int      height;
-    int      maxval;
-    size_t   pixelBytes;
-    uint8_t *buf;
-    size_t   read;
-
-    fp = fopen(path, "rb");
-    if (fp == NULL) {
-        fprintf(stderr, "joeysprite: cannot open %s: %s\n", path, strerror(errno));
-        return 2;
-    }
-    if (parsePpmToken(fp, tok, sizeof(tok)) != 0 || strcmp(tok, "P6") != 0) {
-        fprintf(stderr, "joeysprite: %s is not a PPM (P6) file\n", path);
-        fclose(fp);
-        return 2;
-    }
-    if (parsePpmToken(fp, tok, sizeof(tok)) != 0) {
-        fclose(fp);
-        return 2;
-    }
-    width = atoi(tok);
-    if (parsePpmToken(fp, tok, sizeof(tok)) != 0) {
-        fclose(fp);
-        return 2;
-    }
-    height = atoi(tok);
-    if (parsePpmToken(fp, tok, sizeof(tok)) != 0) {
-        fclose(fp);
-        return 2;
-    }
-    maxval = atoi(tok);
-    if (width <= 0 || height <= 0) {
-        fprintf(stderr, "joeysprite: %s has non-positive dimensions\n", path);
-        fclose(fp);
-        return 2;
-    }
-    if (maxval != 255) {
-        fprintf(stderr, "joeysprite: %s maxval %d unsupported (must be 255)\n", path, maxval);
-        fclose(fp);
-        return 2;
-    }
-    pixelBytes = (size_t)width * (size_t)height * 3u;
-    buf = (uint8_t *)malloc(pixelBytes);
-    if (buf == NULL) {
-        fprintf(stderr, "joeysprite: out of memory (%zu bytes)\n", pixelBytes);
-        fclose(fp);
-        return 2;
-    }
-    read = fread(buf, 1, pixelBytes, fp);
-    fclose(fp);
-    if (read != pixelBytes) {
-        fprintf(stderr, "joeysprite: short raster in %s (got %zu, need %zu)\n",
-                path, read, pixelBytes);
-        free(buf);
-        return 2;
-    }
-    *outWidth  = width;
-    *outHeight = height;
-    *outPixels = buf;
-    return 0;
-}
-
-
-// End-to-end PPM -> tile-major 4bpp packed. On entry, *widthTiles /
-// *heightTiles are 0 if the user didn't pass --width-tiles /
-// --height-tiles, or the user-provided values otherwise; we fill in
-// the auto-derived values when the user left them at 0, and validate
-// against the image when they didn't.
-static int loadPpmAsTiles(const char *path, long *widthTiles, long *heightTiles, uint8_t **outTiles, uint32_t *outSize) {
-    uint8_t *rgb;
-    uint8_t *indices;
-    uint8_t *tiles;
-    int      width;
-    int      height;
-    long     wTiles;
-    long     hTiles;
-    uint32_t tileBytes;
-    int      paletteCount;
-    int      rc;
-
-    rc = loadPpm(path, &width, &height, &rgb);
-    if (rc != 0) {
-        return rc;
-    }
-    if ((width % TILE_PIXELS) != 0 || (height % TILE_PIXELS) != 0) {
-        fprintf(stderr,
-            "joeysprite: %s is %dx%d -- both dimensions must be multiples of %d\n",
-            path, width, height, TILE_PIXELS);
-        free(rgb);
-        return 2;
-    }
-    wTiles = width  / TILE_PIXELS;
-    hTiles = height / TILE_PIXELS;
-    if (*widthTiles == 0) {
-        *widthTiles = wTiles;
-    } else if (*widthTiles != wTiles) {
-        fprintf(stderr,
-            "joeysprite: --width-tiles %ld disagrees with image width %d (%ld tiles)\n",
-            *widthTiles, width, wTiles);
-        free(rgb);
-        return 2;
-    }
-    if (*heightTiles == 0) {
-        *heightTiles = hTiles;
-    } else if (*heightTiles != hTiles) {
-        fprintf(stderr,
-            "joeysprite: --height-tiles %ld disagrees with image height %d (%ld tiles)\n",
-            *heightTiles, height, hTiles);
-        free(rgb);
-        return 2;
-    }
-
-    indices = (uint8_t *)malloc((size_t)width * (size_t)height);
-    if (indices == NULL) {
-        fprintf(stderr, "joeysprite: out of memory for index buffer\n");
-        free(rgb);
-        return 2;
-    }
-    paletteCount = buildPalette(rgb, width, height, indices);
-    free(rgb);
-    if (paletteCount < 0) {
-        fprintf(stderr,
-            "joeysprite: %s has more than 16 distinct $0RGB colors after\n"
-            "  4-bit-per-channel quantization. Reduce the input palette and\n"
-            "  retry (e.g. pngquant --nofs 16, or GIMP -> Image -> Mode ->\n"
-            "  Indexed... with 16 colors and no dithering).\n", path);
-        free(indices);
-        return 2;
-    }
-
-    tileBytes = (uint32_t)wTiles * (uint32_t)hTiles * TILE_BYTES;
-    tiles     = (uint8_t *)malloc(tileBytes);
-    if (tiles == NULL) {
-        fprintf(stderr, "joeysprite: out of memory for tile buffer\n");
-        free(indices);
-        return 2;
-    }
-    packIndicesToTiles(indices, width, height, tiles);
-    free(indices);
-
-    *outTiles = tiles;
-    *outSize  = tileBytes;
-    return 0;
-}
-
-
 static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize) {
    FILE    *fp;
    long     fileSize;
@ -510,76 +236,6 @@ static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize)
 }


-// Reshuffle row-major palette indices into the tile-major 4bpp packed
-// layout the runtime SpriteT.tileData expects: tile (tx,ty)'s 32 bytes
-// land contiguously at outTiles[(ty*widthTiles + tx) * 32], with each
-// row inside the tile as 4 packed bytes (high nibble = left pixel).
-static void packIndicesToTiles(const uint8_t *indices, int width, int height, uint8_t *outTiles) {
-    int      widthTiles;
-    int      heightTiles;
-    int      tx;
-    int      ty;
-    int      row;
-    int      col;
-    int      pxX;
-    int      pxY;
-    uint8_t  hi;
-    uint8_t  lo;
-    uint8_t *tile;
-
-    widthTiles  = width  / TILE_PIXELS;
-    heightTiles = height / TILE_PIXELS;
-    for (ty = 0; ty < heightTiles; ty++) {
-        for (tx = 0; tx < widthTiles; tx++) {
-            tile = &outTiles[(ty * widthTiles + tx) * TILE_BYTES];
-            for (row = 0; row < TILE_PIXELS; row++) {
-                pxY = ty * TILE_PIXELS + row;
-                for (col = 0; col < TILE_BYTES_PER_ROW; col++) {
-                    pxX = tx * TILE_PIXELS + col * 2;
-                    hi  = (uint8_t)(indices[pxY * width + pxX]     & 0x0Fu);
-                    lo  = (uint8_t)(indices[pxY * width + pxX + 1] & 0x0Fu);
-                    tile[row * TILE_BYTES_PER_ROW + col] = (uint8_t)((hi << 4) | lo);
-                }
-            }
-        }
-    }
-}
-
-
-// Reads a single whitespace-separated token from a PPM header,
-// skipping `#` comments to end-of-line. Mirrors joeyasset.
-static int parsePpmToken(FILE *fp, char *out, int outLen) {
-    int c;
-    int pos;
-
-    pos = 0;
-    for (;;) {
-        c = fgetc(fp);
-        if (c == EOF) {
-            return -1;
-        }
-        if (isspace(c)) {
-            continue;
-        }
-        if (c == '#') {
-            while ((c = fgetc(fp)) != EOF && c != '\n') {
-                /* skip */;
-            }
-            continue;
-        }
-        break;
-    }
-    while (c != EOF && !isspace(c) && c != '#') {
-        if (pos < outLen - 1) {
-            out[pos++] = (char)c;
-        }
-        c = fgetc(fp);
-    }
-    out[pos] = 0;
-    return 0;
-}
-
-
 static TargetE parseTarget(const char *name) {
    if (strcmp(name, "iigs")    == 0) return TARGET_IIGS;
    if (strcmp(name, "amiga")   == 0) return TARGET_AMIGA;
@ -592,11 +248,8 @@ static TargetE parseTarget(const char *name) {
 static int usage(const char *prog) {
    fprintf(stderr,
        "usage: %s --target {iigs,amiga,atarist,dos} \\\n"
-        "         [--width-tiles N --height-tiles M] \\\n"
-        "         INPUT OUTPUT.spr\n"
-        "  INPUT is a PPM (P6) file (auto-derives tile dims from W/8, H/8)\n"
-        "  or a raw .tiles byte stream (requires --width-tiles/--height-tiles).\n",
-        prog);
+        "         --width-tiles N --height-tiles M \\\n"
+        "         input.tiles output.spr\n", prog);
    return 2;
 }

@ -648,11 +301,9 @@ int main(int argc, char **argv) {
            return usage(argv[0]);
        }
    }
-    if (targetName == NULL || inPath == NULL || outPath == NULL) {
-        return usage(argv[0]);
-    }
-    if (widthTiles < 0 || widthTiles > 255 ||
-        heightTiles < 0 || heightTiles > 255) {
+    if (targetName == NULL || widthTiles <= 0 || widthTiles > 255 ||
+        heightTiles <= 0 || heightTiles > 255 ||
+        inPath == NULL || outPath == NULL) {
        return usage(argv[0]);
    }

@ -662,25 +313,10 @@ int main(int argc, char **argv) {
        return usage(argv[0]);
    }

-    if (fileIsPpm(inPath)) {
-        // PPM path: tile dims auto-derive (or validate against CLI).
-        rc = loadPpmAsTiles(inPath, &widthTiles, &heightTiles, &tileBytes, &tileSize);
-        if (rc != 0) {
-            return rc;
-        }
-    } else {
-        // Raw .tiles path: tile dims required.
-        if (widthTiles <= 0 || heightTiles <= 0) {
-            fprintf(stderr,
-                "joeysprite: %s is not a PPM; --width-tiles and --height-tiles are required\n",
-                inPath);
-            return usage(argv[0]);
-        }
    rc = loadTileData(inPath, &tileBytes, &tileSize);
    if (rc != 0) {
        return rc;
    }
-    }

    expectedTileSize = (uint32_t)(widthTiles * heightTiles * 32);
    if (tileSize != expectedTileSize) {