From c4ee37941a8f91552b44870e65a201cbcdf2ca89 Mon Sep 17 00:00:00 2001
From: Scott Duensing <scott@duensing.com>
Date: Sun, 26 Apr 2026 21:42:43 -0500
Subject: [PATCH] Sprite save/restore for x86 and 68k.

---
 src/codegen/spriteCompile.c  |  78 +++++++++++++++++++----
 src/codegen/spriteEmit68k.c  |  88 +++++++++++++++++++++++--
 src/codegen/spriteEmitIigs.c |  80 +++++++++++++++++++++--
 src/codegen/spriteEmitX86.c  | 120 ++++++++++++++++++++++++++++++++---
 src/codegen/spriteEmitter.h  |  16 +++--
 src/port/amiga/c2p.s         | 102 ++++++++++++++---------------
 src/port/amiga/hal.c         |  23 +++----
 7 files changed, 409 insertions(+), 98 deletions(-)

diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c
index d87bf2a..ce7b196 100644
--- a/src/codegen/spriteCompile.c
+++ b/src/codegen/spriteCompile.c
@@ -42,11 +42,17 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 }
 
 
-// Save-under and restore-under emitters are IIgs-only at the moment;
-// other CPUs return 0, the runtime treats that as "not compiled" and
-// falls back to spriteSaveUnderInterpreted / spriteRestoreUnderInterpreted.
+// Save-under and restore-under emit dispatch. Each per-CPU pair
+// produces row-by-row copy bytes; the runtime dispatch in
+// src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_SAVE]
+// != SPRITE_NOT_COMPILED and falls back to the interpreted memcpy
+// path otherwise.
 static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-#if defined(JOEYLIB_PLATFORM_IIGS)
+#if defined(JOEYLIB_PLATFORM_DOS)
+    return spriteEmitSaveX86(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+    return spriteEmitSave68k(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_IIGS)
     return spriteEmitSaveIigs(out, sp, shift);
 #else
     (void)out; (void)sp; (void)shift;
@@ -56,7 +62,11 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
 
 
 static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
-#if defined(JOEYLIB_PLATFORM_IIGS)
+#if defined(JOEYLIB_PLATFORM_DOS)
+    return spriteEmitRestoreX86(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
+    return spriteEmitRestore68k(out, sp, shift);
+#elif defined(JOEYLIB_PLATFORM_IIGS)
     return spriteEmitRestoreIigs(out, sp, shift);
 #else
     (void)out; (void)sp; (void)shift;
@@ -416,18 +426,62 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
 }
 
 
-// Non-IIgs platforms have no compiled save/restore yet. The dispatch
-// in src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_*] !=
-// SPRITE_NOT_COMPILED, so these stubs should never actually run on
-// those platforms; they exist so spriteInternal.h's prototypes stay
-// resolved at link time.
+// x86 / 68k compiled save: bytes are a cdecl
+//   void copy(const uint8_t *src, uint8_t *dst)
+// that walks heightPx rows of copyBytes from screen (stride
+// SURFACE_BYTES_PER_ROW) into the contiguous backup buffer.
 void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
-    (void)src; (void)sp; (void)x; (void)y; (void)backup;
+    typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst);
+    uint8_t        shift;
+    int16_t        clippedX;
+    uint16_t       widthPx;
+    uint16_t       heightPx;
+    uint16_t       copyBytes;
+    const uint8_t *screenPtr;   // read-only: save never writes the surface
+    CopyFn         fn;
+
+    shift     = (uint8_t)(x & 1);       // odd x selects the shift=1 routine
+    clippedX  = (int16_t)(x & ~1);      // x rounded down to a byte boundary
+    widthPx   = (uint16_t)(sp->widthTiles  * 8);
+    heightPx  = (uint16_t)(sp->heightTiles * 8);
+    copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
+
+    screenPtr = &src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)];
+
+    backup->sprite    = sp;
+    backup->x         = clippedX;
+    backup->y         = y;
+    backup->width     = (uint16_t)(copyBytes << 1);   // pixels; restore derives shift from this
+    backup->height    = heightPx;
+    backup->sizeBytes = (uint16_t)(copyBytes * heightPx);
+
+    fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]);
+    fn(screenPtr, backup->bytes);
 }
 
 
+// Mirror of save: caller swaps arg order so the same emitted shape
+// drives backup -> screen. The screen-side stride lives inside the
+// emitted bytes, so RESTORE has its own routine bytes (stride is
+// applied to dst instead of src).
 void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
-    (void)dst; (void)backup;
+    typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst);
+    SpriteT  *sp;
+    uint8_t   shift;
+    uint16_t  copyBytes;
+    uint16_t  spriteBytesPerRow;
+    uint8_t  *screenPtr;
+    CopyFn    fn;
+
+    sp                = backup->sprite;
+    copyBytes         = (uint16_t)(backup->width >> 1);
+    spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
+    shift             = (copyBytes == spriteBytesPerRow) ? 0 : 1;
+
+    screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];
+
+    fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]);
+    fn(backup->bytes, screenPtr);
 }
 
 #endif
diff --git a/src/codegen/spriteEmit68k.c b/src/codegen/spriteEmit68k.c
index 5147cec..b86851c 100644
--- a/src/codegen/spriteEmit68k.c
+++ b/src/codegen/spriteEmit68k.c
@@ -1,10 +1,12 @@
 // 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl-
-// callable PIC draw routines that write 4bpp packed surface bytes
-// via d16(a0) chains. Same shape as the x86 emitter; only the
-// instruction encoding differs.
+// callable PIC routines for 4bpp packed surface bytes. Draw writes
+// via d16(a0) chains (same shape as the x86 emitter, different
+// encoding); save/restore copy rows with move.b (a0)+,(a1)+.
 //
 // Calling convention (m68k gcc / mintlib):
-//   void draw(uint8_t *dst);    -- arg in 4(sp); a0/a1/d0/d1 caller-saved.
+//   void draw(uint8_t *dst);                              -- arg in 4(sp)
+//   void save/restore(const uint8_t *src, uint8_t *dst);  -- args in 4(sp)/8(sp)
+//   a0/a1/d0/d1 are caller-saved.
 //
 // Per-byte emit (no run coalescing yet):
 //   - all-transparent: skip
@@ -38,13 +40,46 @@
 
 // ----- Prototypes -----
 
-static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
+static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
 static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
+static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
 static uint16_t writeBE16(uint8_t *out, uint16_t value);
 
 
 // ----- Emit helpers (alphabetical) -----
 
+// Shared body for save/restore. Walks heightPx rows of copyBytes
+// using `move.b (a0)+, (a1)+` byte-wise (safe regardless of pointer
+// alignment, since the screen-side x can land on an odd byte). After
+// each row except the last, advances either a0 (SAVE: src=screen) or
+// a1 (RESTORE: dst=screen) by (SURFACE_BYTES_PER_ROW - copyBytes) so
+// the strided side lines up with the next scanline; the contiguous
+// side advances naturally via the post-increment.
+//
+// strideOnSrc=true   -> source has the screen stride (SAVE)
+// strideOnSrc=false  -> destination has the screen stride (RESTORE)
+static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
+    uint16_t row;
+    uint16_t col;
+    uint16_t advance;
+
+    advance = (uint16_t)(SURFACE_BYTES_PER_ROW - copyBytes);
+
+    for (row = 0; row < heightPx; row++) {
+        // Unrolled: move.b (a0)+, (a1)+ -- 0x12D8.
+        for (col = 0; col < copyBytes; col++) {
+            cursor += writeBE16(out + cursor, 0x12D8u);
+        }
+        if (row + 1u < heightPx) {
+            // adda.w #advance, a0 (0xD0FC) for SAVE
+            // adda.w #advance, a1 (0xD2FC) for RESTORE
+            cursor += writeBE16(out + cursor, strideOnSrc ? 0xD0FCu : 0xD2FCu);
+            cursor += writeBE16(out + cursor, advance);
+        }
+    }
+    return cursor;
+}
+
 // Same logic as the x86 shiftedByteAt -- per-byte transparency
 // decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if
 // dest high nibble is opaque, 0x0F if low is opaque.
@@ -184,3 +219,46 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 }
 
 
+// RESTORE: copy backup -> screen. Destination has the screen stride.
+uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t heightPx;
+    uint16_t copyBytes;
+
+    cursor    = 0;
+    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
+
+    // Prologue: movea.l 4(sp), a0 (src); movea.l 8(sp), a1 (dst).
+    cursor += writeBE16(out + cursor, 0x206Fu);
+    cursor += writeBE16(out + cursor, 0x0004u);
+    cursor += writeBE16(out + cursor, 0x226Fu);
+    cursor += writeBE16(out + cursor, 0x0008u);
+
+    cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, false);
+
+    cursor += writeBE16(out + cursor, 0x4E75u);
+    return cursor;
+}
+
+
+// SAVE: copy screen -> backup. Source has the screen stride.
+uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t heightPx;
+    uint16_t copyBytes;
+
+    cursor    = 0;
+    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
+
+    cursor += writeBE16(out + cursor, 0x206Fu);
+    cursor += writeBE16(out + cursor, 0x0004u);
+    cursor += writeBE16(out + cursor, 0x226Fu);
+    cursor += writeBE16(out + cursor, 0x0008u);
+
+    cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, true);
+
+    cursor += writeBE16(out + cursor, 0x4E75u);
+    return cursor;
+}
diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c
index 6b1d93f..0829ef6 100644
--- a/src/codegen/spriteEmitIigs.c
+++ b/src/codegen/spriteEmitIigs.c
@@ -217,6 +217,36 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 
 
 // 65816 draw emit. Returns bytes written.
+//
+// Two emission paths share the body:
+//
+//   * M=8 byte path (default; matches the stub-set entry mode):
+//       opaque:  A9 vv         LDA #vv             ;  2c
+//                99 lo hi      STA abs,Y           ;  5c
+//                                                  ; 7c / 5 bytes per byte
+//       mixed:   B9 lo hi      LDA abs,Y           ;  5c
+//                29 mm         AND #~mask          ;  2c
+//                09 vv         ORA #val            ;  2c
+//                99 lo hi      STA abs,Y           ;  5c
+//                                                  ;14c / 9 bytes per byte
+//
+//   * M=16 word path (entered around runs of >= 2 consecutive
+//     fully-opaque bytes). Each word write covers 2 dest bytes:
+//       prologue: C2 20         REP #$20            ;  3c
+//       per pair: A9 lo hi      LDA #imm16          ;  3c
+//                 99 lo hi      STA abs,Y           ;  6c
+//                                                   ;  9c / 6 bytes per pair
+//       epilogue: E2 20         SEP #$20            ;  3c
+//
+//     vs. M=8 path doing the same 2 bytes: 14c / 10 bytes. Per-pair
+//     savings are 5c / 4 bytes; the 6c/4-byte REP+SEP transition is
+//     amortized once per opaque run, so the path is profitable for
+//     runs of 2 pairs (4 consecutive opaque bytes) or longer. For
+//     isolated pairs we still take the M=16 path -- the 1-cycle loss
+//     vs. M=8 is dwarfed by the typical-sprite opaque-run length.
+//
+// Mixed bytes always run on the M=8 path because the AND/ORA in
+// M=16 would clobber the adjacent byte.
 uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
     uint16_t cursor;
     uint16_t row;
@@ -227,11 +257,15 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
     uint16_t absOffset;
     uint8_t  value;
     uint8_t  opaqueMask;
+    uint8_t  nextValue;
+    uint8_t  nextOpaqueMask;
+    bool     wide;
 
     cursor             = 0;
     heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
     spriteBytesPerRow  = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
     destBytesPerRow    = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
+    wide               = false;
 
     // No prologue: caller (the inline-asm stub in spriteCompile.c)
     // sets M=8/X=16/Y=destRow before JSL'ing here.
@@ -243,18 +277,42 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
                 continue;
             }
             absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col);
+
+            if (opaqueMask == 0xFFu && (col + 1) < destBytesPerRow) {
+                // Look ahead: if (col, col+1) are both fully opaque
+                // we can pair them as a single M=16 word write.
+                shiftedByteAt(sp, row, (uint16_t)(col + 1), shift,
+                              spriteBytesPerRow, &nextValue, &nextOpaqueMask);
+                if (nextOpaqueMask == 0xFFu) {
+                    if (!wide) {
+                        out[cursor++] = 0xC2;            // REP #$20 -- M=16
+                        out[cursor++] = 0x20;
+                        wide = true;
+                    }
+                    out[cursor++] = 0xA9;                // LDA #imm16
+                    cursor += writeLE16(out + cursor,
+                                        (uint16_t)(((uint16_t)nextValue << 8) | value));
+                    out[cursor++] = 0x99;                // STA abs,Y
+                    cursor += writeLE16(out + cursor, absOffset);
+                    col++;                               // consumed col+1
+                    continue;
+                }
+            }
+
+            // Falls through here for: isolated opaque (no pair), mixed,
+            // or the trailing odd byte at the right edge. All on M=8.
+            if (wide) {
+                out[cursor++] = 0xE2;                    // SEP #$20 -- back to M=8
+                out[cursor++] = 0x20;
+                wide = false;
+            }
+
             if (opaqueMask == 0xFFu) {
-                // lda #imm    A9 ii
-                // sta abs,Y   99 lo hi
                 out[cursor++] = 0xA9;
                 out[cursor++] = value;
                 out[cursor++] = 0x99;
                 cursor += writeLE16(out + cursor, absOffset);
             } else {
-                // lda abs,Y   B9 lo hi
-                // and #mask   29 mm
-                // ora #val    09 vv
-                // sta abs,Y   99 lo hi
                 out[cursor++] = 0xB9;
                 cursor += writeLE16(out + cursor, absOffset);
                 out[cursor++] = 0x29;
@@ -267,6 +325,16 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
         }
     }
 
+    // Routine exits in M=8: the JSL stub assumes M=8 throughout (the
+    // stub itself only ever ran with M=8 and doesn't restore M). The
+    // asm wrapper after the JSL forces M=16 again, but be defensive
+    // and ensure we leave M=8 here so the stub's PLB/RTL run as
+    // expected even if the wrapper convention changes.
+    if (wide) {
+        out[cursor++] = 0xE2;
+        out[cursor++] = 0x20;
+    }
+
     // Epilogue: rtl (large memory model -b uses JSL/RTL).
     out[cursor++] = 0x6B;
     return cursor;
diff --git a/src/codegen/spriteEmitX86.c b/src/codegen/spriteEmitX86.c
index 8247811..8d66228 100644
--- a/src/codegen/spriteEmitX86.c
+++ b/src/codegen/spriteEmitX86.c
@@ -1,14 +1,11 @@
 // x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
-// draw routines that write 4bpp packed surface bytes via
-// [esi+disp8] chains. The C side calls them through a function
-// pointer cast.
+// draw / save / restore routines for 4bpp packed surface bytes.
+// Draw writes via [esi+disp8] chains; save/restore copy rows with
+// rep movsd/movsb. The C side calls them through a function pointer cast.
 //
 // Calling convention:
-//   draw(uint8_t *dst)             -- esi advances row by row
-//
-// Save and restore are not compiled -- they're uniform memcpy-
-// shaped operations and the C interpreter handles them at memcpy
-// speed via the standard library.
+//   draw(uint8_t *dst)                              -- arg in [esp+8] after prologue saves esi
+//   save/restore(const uint8_t *src, uint8_t *dst)  -- args in [esp+12]/[esp+16] after esi+edi save
 //
 // Per-byte emit (no run coalescing yet):
 //   - all-transparent (both nibbles 0): skip, no instruction
@@ -45,12 +42,69 @@
 
 // ----- Prototypes -----
 
-static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
+static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
 static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
+static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
 
 
 // ----- Emit helpers (alphabetical) -----
 
+// Shared body for save/restore. Walks heightPx rows of copyBytes
+// using rep movsd for the dword-aligned bulk and rep movsb for the
+// byte tail. After each row except the last, advances either esi
+// or edi by (SURFACE_BYTES_PER_ROW - copyBytes) so the strided side
+// (screen) lines up with the next scanline; the contiguous side
+// (backup) advances naturally because rep movs* leaves the index
+// register one past the last byte copied.
+//
+// strideOnSrc=true   -> source has the screen stride (SAVE)
+// strideOnSrc=false  -> destination has the screen stride (RESTORE)
+static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
+    uint16_t row;
+    uint16_t dwords;
+    uint16_t tail;
+    int32_t  advance;
+
+    dwords  = (uint16_t)(copyBytes >> 2);
+    tail    = (uint16_t)(copyBytes & 0x3u);
+    advance = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes;
+
+    for (row = 0; row < heightPx; row++) {
+        if (dwords > 0) {
+            // mov ecx, dwords (B9 imm32); rep movsd (F3 A5)
+            out[cursor++] = 0xB9;
+            out[cursor++] = (uint8_t)(dwords & 0xFFu);
+            out[cursor++] = (uint8_t)((dwords >> 8) & 0xFFu);
+            out[cursor++] = 0;
+            out[cursor++] = 0;
+            out[cursor++] = 0xF3;
+            out[cursor++] = 0xA5;
+        }
+        if (tail > 0) {
+            // mov ecx, tail (B9 imm32); rep movsb (F3 A4)
+            out[cursor++] = 0xB9;
+            out[cursor++] = (uint8_t)(tail & 0xFFu);
+            out[cursor++] = 0;
+            out[cursor++] = 0;
+            out[cursor++] = 0;
+            out[cursor++] = 0xF3;
+            out[cursor++] = 0xA4;
+        }
+        if (row + 1u < heightPx) {
+            // SAVE: add esi, advance (81 C6 imm32)
+            // RESTORE: add edi, advance (81 C7 imm32)
+            out[cursor++] = 0x81;
+            out[cursor++] = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u);
+            out[cursor++] = (uint8_t)(advance & 0xFFu);
+            out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu);
+            out[cursor++] = (uint8_t)((advance >> 16) & 0xFFu);
+            out[cursor++] = (uint8_t)((advance >> 24) & 0xFFu);
+        }
+    }
+    return cursor;
+}
+
+
 // Decompose a destination byte's contribution from the sprite into
 // (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble
 // 0xF0 means high dest nibble is opaque; 0x0F means low is opaque;
@@ -189,3 +243,51 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
 }
 
 
+// RESTORE: copy backup -> screen. Destination has the screen stride.
+uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t heightPx;
+    uint16_t copyBytes;
+
+    cursor    = 0;
+    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
+
+    // Prologue: push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16]
+    out[cursor++] = 0x56;
+    out[cursor++] = 0x57;
+    out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
+    out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
+
+    cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, false);
+
+    // Epilogue: pop edi; pop esi; ret
+    out[cursor++] = 0x5F;
+    out[cursor++] = 0x5E;
+    out[cursor++] = 0xC3;
+    return cursor;
+}
+
+
+// SAVE: copy screen -> backup. Source has the screen stride.
+uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
+    uint16_t cursor;
+    uint16_t heightPx;
+    uint16_t copyBytes;
+
+    cursor    = 0;
+    heightPx  = (uint16_t)(sp->heightTiles * TILE_PIXELS);
+    copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
+
+    out[cursor++] = 0x56;
+    out[cursor++] = 0x57;
+    out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
+    out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
+
+    cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, true);
+
+    out[cursor++] = 0x5F;
+    out[cursor++] = 0x5E;
+    out[cursor++] = 0xC3;
+    return cursor;
+}
diff --git a/src/codegen/spriteEmitter.h b/src/codegen/spriteEmitter.h
index 8a8955f..8fbe359 100644
--- a/src/codegen/spriteEmitter.h
+++ b/src/codegen/spriteEmitter.h
@@ -27,11 +27,19 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
 // rectangle between the destination surface and a backup buffer. The
 // rectangle's width and start position depend on the shift: for
 // shift=0 (even x) it covers exactly the sprite's bytes per row;
-// for shift=1 (odd x) it covers one extra byte on each side, rounded
-// up to even. Per-CPU emitters return 0 to mean "not implemented" --
-// the runtime dispatch falls back to the interpreted path in that
-// case.
+// for shift=1 (odd x) it covers one extra byte (a non-sprite nibble at each edge).
+// Per-CPU emitters return 0 to mean "not implemented" -- the runtime
+// dispatch falls back to the interpreted path in that case.
+//
+// IIgs uses a self-modifying MVN-stub on top of these bytes; x86 and
+// 68k use a plain cdecl `void copy(const uint8_t *src, uint8_t *dst)`
+// where the caller swaps args between SAVE (screen->backup) and
+// RESTORE (backup->screen).
 uint16_t spriteEmitSaveIigs   (uint8_t *out, const SpriteT *sp, uint8_t shift);
 uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
+uint16_t spriteEmitSaveX86    (uint8_t *out, const SpriteT *sp, uint8_t shift);
+uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
+uint16_t spriteEmitSave68k    (uint8_t *out, const SpriteT *sp, uint8_t shift);
+uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
 
 #endif
diff --git a/src/port/amiga/c2p.s b/src/port/amiga/c2p.s
index 9a7c1f3..25554fa 100644
--- a/src/port/amiga/c2p.s
+++ b/src/port/amiga/c2p.s
@@ -1,13 +1,10 @@
 | Amiga chunky-to-planar conversion -- 68000 hand-rolled.
 |
-| Drop-in replacement for hal.c's old c2pRange C inner loop. The C
-| version walked every pixel and OR'd individual bits into 4 plane
-| accumulators -- ~1.5 s for a full 320x200 frame on a 7 MHz 68000
-| (the GCC m68k codegen is poor for tight bit-twiddling). This rewrite
-| uses a 4 KB lookup table built once at HAL init: each (sourceByte,
-| bytePosition, plane) tuple maps to the plane-byte-bit contribution
-| that source byte makes when it sits at that position within a
-| 4-byte (= 8-pixel) planar group.
+| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
+| 4 KB lookup table built once at HAL init: each (sourceByte, position,
+| plane) tuple maps to the plane-byte bit contribution that source
+| byte makes when it sits at that position within a 4-byte (8-pixel)
+| planar group going to that plane.
 |
 | Calling convention: m68k-amigaos-gcc cdecl.
 |   Args on stack at 4(sp), 8(sp), ...
@@ -22,12 +19,17 @@
 |                        uint16_t       n,      ; 24(sp) - planar byte count (low word)
 |                        const uint8_t *lut);   ; 28(sp) - 4 KB LUT base
 |
-| LUT layout: lut[pos*1024 + plane*256 + src] = 1-byte plane contribution
-| for source byte `src` sitting at byte-position `pos` within its
-| 4-byte planar group, going to plane `plane`. Byte-position 0 is the
-| leftmost (its two pixels land in plane-byte bits 7 and 6); position
-| 3 is the rightmost (bits 1 and 0). Built once by chunkyToPlanarInit
-| (in hal.c) at HAL boot.
+| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
+| for source byte `src` sitting at byte-position `pos` (0..3) within
+| its 4-byte planar group, going to plane `plane` (0..3). All 16
+| (pos, plane) entries for one src byte are contiguous, so the inner
+| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
+| (0..15) and never has to advance an index register.
+|
+| Per planar byte we consume 4 source bytes (positions 0..3 of the
+| 8-pixel group). For each we compute d4 = src*16 with one lsl.w #4
+| (14 cycles on a 68000, vs. 16 for four add.w's) and OR the four
+| plane contributions into d0..d3 with byte-displaced (a5,d4.w) reads.
 |
 | GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
 | gcc driver.
@@ -65,54 +67,40 @@ _chunkyToPlanarRow:
                 moveq   #0,%d3                          | plane 3 acc
 
                 | ----- Source byte position 0 -----
-                | a5 points to start of LUT. Plane 0/1/2/3 sub-tables
-                | for position 0 are at offsets 0/256/512/768.
                 moveq   #0,%d4
                 move.b  (%a0)+,%d4                      | src[0]
-                move.l  %a5,%a6
-                or.b    (%a6,%d4.w),%d0                 | +0   = pos0 plane 0
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d1                 | +256 = pos0 plane 1
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d2                 | +512 = pos0 plane 2
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d3                 | +768 = pos0 plane 3
+                lsl.w   #4,%d4                          | d4 = src * 16
+                or.b      0(%a5,%d4.w),%d0              | pos0 plane0
+                or.b      1(%a5,%d4.w),%d1              | pos0 plane1
+                or.b      2(%a5,%d4.w),%d2              | pos0 plane2
+                or.b      3(%a5,%d4.w),%d3              | pos0 plane3
 
                 | ----- Source byte position 1 -----
-                lea     256(%a6),%a6                    | advance to pos1 plane 0
                 moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                or.b    (%a6,%d4.w),%d0
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d1
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d2
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d3
+                move.b  (%a0)+,%d4                      | src[1]
+                lsl.w   #4,%d4                          | d4 = src * 16
+                or.b      4(%a5,%d4.w),%d0              | pos1 plane0
+                or.b      5(%a5,%d4.w),%d1              | pos1 plane1
+                or.b      6(%a5,%d4.w),%d2              | pos1 plane2
+                or.b      7(%a5,%d4.w),%d3              | pos1 plane3
 
                 | ----- Source byte position 2 -----
-                lea     256(%a6),%a6
                 moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                or.b    (%a6,%d4.w),%d0
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d1
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d2
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d3
+                move.b  (%a0)+,%d4                      | src[2]
+                lsl.w   #4,%d4                          | d4 = src * 16
+                or.b      8(%a5,%d4.w),%d0              | pos2 plane0
+                or.b      9(%a5,%d4.w),%d1              | pos2 plane1
+                or.b     10(%a5,%d4.w),%d2              | pos2 plane2
+                or.b     11(%a5,%d4.w),%d3              | pos2 plane3
 
                 | ----- Source byte position 3 -----
-                lea     256(%a6),%a6
                 moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                or.b    (%a6,%d4.w),%d0
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d1
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d2
-                lea     256(%a6),%a6
-                or.b    (%a6,%d4.w),%d3
+                move.b  (%a0)+,%d4                      | src[3]
+                lsl.w   #4,%d4                          | d4 = src * 16
+                or.b     12(%a5,%d4.w),%d0              | pos3 plane0
+                or.b     13(%a5,%d4.w),%d1              | pos3 plane1
+                or.b     14(%a5,%d4.w),%d2              | pos3 plane2
+                or.b     15(%a5,%d4.w),%d3              | pos3 plane3
 
                 | ----- Store plane bytes -----
                 move.b  %d0,(%a1)+
diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c
index 174cdee..9c8bb20 100644
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
@@ -77,11 +77,12 @@ static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE
 static bool     gCacheValid = false;
 
 // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
-// (src/port/amiga/c2p.s). Layout: gC2pLut[pos*1024 + plane*256 + src]
-// = the plane-byte bit contribution that source byte `src` makes when
-// it sits at byte-position `pos` within a 4-byte (8-pixel) planar
-// group, going to plane `plane`. Built once by initC2pLut on the
-// first halPresent call.
+// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
+// the plane-byte bit contribution that source byte `src` makes to
+// plane `plane` when it sits at byte-position `pos` within a 4-byte
+// (8-pixel) planar group. The src-major layout lets the asm inner
+// loop reach all 16 (pos, plane) entries for a single src byte via
+// 8-bit displacements off (a5, d4.w) without any LEA between reads.
 static uint8_t  gC2pLut[4 * 1024];
 static bool     gC2pLutReady = false;
 
@@ -116,14 +117,14 @@ static void initC2pLut(void) {
     if (gC2pLutReady) {
         return;
     }
-    for (pos = 0; pos < 4; pos++) {
-        highShift = (uint8_t)(7 - 2 * pos);
-        lowShift  = (uint8_t)(6 - 2 * pos);
-        for (plane = 0; plane < 4; plane++) {
-            for (src = 0; src < 256; src++) {
+    for (src = 0; src < 256; src++) {
+        for (pos = 0; pos < 4; pos++) {
+            highShift = (uint8_t)(7 - 2 * pos);
+            lowShift  = (uint8_t)(6 - 2 * pos);
+            for (plane = 0; plane < 4; plane++) {
                 highBit = (uint8_t)(((src >> 4) >> plane) & 1);
                 lowBit  = (uint8_t)(((src & 0x0F) >> plane) & 1);
-                gC2pLut[pos * 1024 + plane * 256 + src] =
+                gC2pLut[src * 16 + pos * 4 + plane] =
                     (uint8_t)((highBit << highShift) | (lowBit << lowShift));
             }
         }