From cf6ae093d3985f487b1376875a9448cf977f661b Mon Sep 17 00:00:00 2001
From: Scott Duensing <scott@duensing.com>
Date: Mon, 4 May 2026 11:06:41 -0500
Subject: [PATCH] ST is more or less parity.

---
 README.md                     | 326 ++++++++++++++++++++++
 scripts/dosbox-386sx16.conf   |  28 ++
 scripts/run-dos.sh            |   6 +
 src/core/hal.h                |  13 +-
 src/core/surface.c            |   4 +-
 src/port/amiga/c2p.s          | 127 ---------
 src/port/amiga/hal.c          | 143 +---------
 src/port/atarist/c2p.s        | 188 -------------
 src/port/atarist/circle.s     |  20 +-
 src/port/atarist/fillCircle.s | 303 ++++++++++----------
 src/port/atarist/hal.c        | 505 +++++++---------------------------
 src/port/atarist/lineSpan.s   | 162 ++++++++++-
 src/port/atarist/spriteAsm.s  | 195 ++++++++++---
 src/port/dos/hal.c            |   4 +-
 src/port/iigs/hal.c           |   4 +-
 15 files changed, 966 insertions(+), 1062 deletions(-)
 create mode 100644 scripts/dosbox-386sx16.conf
 delete mode 100644 src/port/amiga/c2p.s
 delete mode 100644 src/port/atarist/c2p.s
diff --git a/README.md b/README.md
index cc5dde9..810e8db 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,332 @@ build/<plat>/         per-target build outputs
 ```
 
 
+## Public API
+
+Game code includes a single umbrella header:
+
+```c
+#include <joey/joey.h>
+```
+
+That pulls in every public API listed below. Full documentation
+lives in the per-feature headers under `include/joey/`; what follows
+is a quick reference. Every entry point is plain C, no C++ extensions.
+
+
+### Lifecycle (`joey/core.h`)
+
+```c
+typedef struct {
+    HostModeE hostMode;       // HOST_MODE_TAKEOVER or HOST_MODE_OS
+    uint32_t  codegenBytes;   // runtime compiled-sprite cache size
+    uint16_t  maxSurfaces;    // maximum concurrent surfaces
+    uint32_t  audioBytes;     // audio sample / module RAM pool
+    uint32_t  assetBytes;     // tileset / sprite / map RAM pool
+} JoeyConfigT;
+
+bool        joeyInit         (const JoeyConfigT *config);
+void        joeyShutdown     (void);
+const char *joeyLastError    (void);
+const char *joeyPlatformName (void);
+const char *joeyVersionString(void);
+
+void        joeyWaitVBL      (void);     // block until next VBL
+uint16_t    joeyFrameCount   (void);     // 16-bit frame counter; wraps after 65535
+uint16_t    joeyFrameHz      (void);     // 50 / 60 / 70 depending on port
+```
+
+
+### Surfaces (`joey/surface.h`)
+
+All surfaces are logically 320x200 4bpp packed (high nibble = left pixel;
+the stored layout is port-native), with a 200-entry SCB table and 16 palettes of 16 `$0RGB` colors.
+
+```c
+#define SURFACE_WIDTH               320
+#define SURFACE_HEIGHT              200
+#define SURFACE_BYTES_PER_ROW       160
+#define SURFACE_PIXELS_SIZE         (SURFACE_BYTES_PER_ROW * SURFACE_HEIGHT)
+#define SURFACE_PALETTE_COUNT       16
+#define SURFACE_COLORS_PER_PALETTE  16
+
+typedef struct SurfaceT SurfaceT;     // opaque
+
+SurfaceT *surfaceCreate (void);
+void      surfaceDestroy(SurfaceT *s);
+SurfaceT *stageGet      (void);                              // library back-buffer
+void      surfaceCopy   (SurfaceT *dst, const SurfaceT *src);
+
+bool      surfaceSaveFile(const SurfaceT *src, const char *path);
+bool      surfaceLoadFile(SurfaceT       *dst, const char *path);
+uint32_t  surfaceHash    (const SurfaceT *s);                // FNV-1a of logical pixels
+```
+
+`surfaceSaveFile` writes the surface in **target-native** form. Files
+are NOT cross-port portable; the asset pipeline handles conversion.
+
+
+### Drawing (`joey/draw.h`)
+
+All primitives clip to the surface; off-surface coords are silent
+no-ops. Color 0 is plotted normally (use the masked variants if you
+need transparency).
+
+```c
+void surfaceClear      (SurfaceT *s, uint8_t color);
+void drawPixel         (SurfaceT *s, int16_t x, int16_t y, uint8_t color);
+uint8_t samplePixel    (const SurfaceT *s, int16_t x, int16_t y);
+
+void drawLine          (SurfaceT *s, int16_t x0, int16_t y0,
+                        int16_t x1, int16_t y1, uint8_t color);
+void drawRect          (SurfaceT *s, int16_t x, int16_t y,
+                        uint16_t w, uint16_t h, uint8_t color);
+void fillRect          (SurfaceT *s, int16_t x, int16_t y,
+                        uint16_t w, uint16_t h, uint8_t color);
+void drawCircle        (SurfaceT *s, int16_t cx, int16_t cy,
+                        uint16_t r, uint8_t color);
+void fillCircle        (SurfaceT *s, int16_t cx, int16_t cy,
+                        uint16_t r, uint8_t color);
+
+void floodFill         (SurfaceT *s, int16_t x, int16_t y, uint8_t newColor);
+void floodFillBounded  (SurfaceT *s, int16_t x, int16_t y,
+                        uint8_t newColor, uint8_t boundaryColor);
+
+void surfaceBlit       (SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y);
+void surfaceBlitMasked (SurfaceT *dst, const JoeyAssetT *src,
+                        int16_t x, int16_t y, uint8_t transparentIndex);
+```
+
+
+### Palette and SCB (`joey/palette.h`)
+
+Colors are 12-bit `$0RGB`. Color 0 of every palette is forced to
+black on `paletteSet`. Each scanline picks one of the 16 palettes
+via the SCB.
+
+```c
+void    paletteSet  (SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16);
+void    paletteGet  (const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16);
+void    scbSet      (SurfaceT *s, uint16_t line, uint8_t paletteIndex);
+void    scbSetRange (SurfaceT *s, uint16_t firstLine, uint16_t lastLine,
+                     uint8_t paletteIndex);
+uint8_t scbGet      (const SurfaceT *s, uint16_t line);
+```
+
+
+### Tiles (`joey/tile.h`)
+
+A "tile" is just an 8x8-aligned region of any surface. The API moves
+32-byte chunks between surfaces and provides a small `TileT` value
+type so callers can stash a copy without allocating a scratch surface.
+
+```c
+#define TILE_PIXELS_PER_SIDE  8
+#define TILE_BYTES_PER_ROW    4
+#define TILE_BYTES            (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE)
+#define TILE_BLOCKS_PER_ROW   (SURFACE_WIDTH  / TILE_PIXELS_PER_SIDE)  // 40
+#define TILE_BLOCKS_PER_COL   (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE)  // 25
+#define TILE_NO_GLYPH         ((uint16_t)0xFFFFu)
+
+typedef struct TileT { uint8_t pixels[TILE_BYTES]; } TileT;
+
+void tileCopy       (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
+                     const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
+void tileCopyMasked (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
+                     const SurfaceT *src, uint8_t srcBx, uint8_t srcBy,
+                     uint8_t transparentIndex);
+void tileFill       (SurfaceT *s,   uint8_t bx,    uint8_t by,    uint8_t color);
+void tileSnap       (const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out);
+void tilePaste      (SurfaceT *dst, uint8_t bx, uint8_t by,       const TileT *in);
+
+void drawText       (SurfaceT *dst, uint8_t bx, uint8_t by,
+                     const SurfaceT *fontSurface, const uint16_t *asciiMap,
+                     const char *str);
+```
+
+
+### Sprites (`joey/sprite.h`)
+
+Rectangles of 8x8 tiles drawn at arbitrary pixel positions with
+color-0 transparency. Tile data is `widthTiles * heightTiles * 32`
+bytes, tile-major 4bpp packed. Sprites can be runtime-compiled
+into per-shift code variants for fast draws.
+
+```c
+typedef enum { SPRITE_FLAGS_NONE = 0 } SpriteFlagsE;
+typedef struct SpriteT SpriteT;            // opaque
+
+typedef struct {
+    SpriteT  *sprite;
+    int16_t   x, y;
+    uint16_t  width, height;               // pixels
+    uint8_t  *bytes;                       // caller-owned save-under buffer
+    uint16_t  sizeBytes;
+} SpriteBackupT;
+
+SpriteT *spriteCreate            (const uint8_t *tileData,
+                                  uint8_t widthTiles, uint8_t heightTiles,
+                                  SpriteFlagsE flags);
+SpriteT *spriteCreateFromSurface (const SurfaceT *src, int16_t x, int16_t y,
+                                  uint8_t widthTiles, uint8_t heightTiles,
+                                  SpriteFlagsE flags);
+SpriteT *spriteLoadFile          (const char *path, SpriteFlagsE flags);
+SpriteT *spriteFromCompiledMem   (const uint8_t *data, uint32_t length,
+                                  SpriteFlagsE flags);
+bool     spriteSaveFile          (SpriteT *sp, const char *path);
+void     spriteDestroy           (SpriteT *sp);
+
+bool     spriteCompile           (SpriteT *sp);   // build per-shift fast path
+void     spritePrewarm           (SpriteT *sp);   // hint: compile if not already
+
+void     spriteDraw              (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y);
+void     spriteSaveUnder         (const SurfaceT *s, SpriteT *sp,
+                                  int16_t x, int16_t y, SpriteBackupT *backup);
+void     spriteRestoreUnder      (SurfaceT *s, const SpriteBackupT *backup);
+void     spriteSaveAndDraw       (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y,
+                                  SpriteBackupT *backup);
+
+void     spriteCompact           (void);          // defrag the codegen arena
+uint32_t spriteCodegenBytesUsed  (void);
+uint32_t spriteCodegenBytesTotal (void);
+```
+
+
+### Assets (`joey/asset.h`)
+
+Small bitmaps for blitting, with an optional embedded palette, stored in
+`.jas` format. Embed a `const JoeyAssetT` for art that ships inside the
+binary; use the loaders for on-disk assets.
+
+```c
+typedef struct {
+    uint16_t       width;
+    uint16_t       height;
+    bool           hasPalette;
+    uint16_t       palette[16];        // valid only if hasPalette
+    const uint8_t *pixels;             // 4bpp packed, rowBytes = (width+1)/2
+} JoeyAssetT;
+
+JoeyAssetT *joeyAssetLoadFile     (const char *path);
+JoeyAssetT *joeyAssetFromMem      (const uint8_t *data, uint32_t length);
+void        joeyAssetFree         (JoeyAssetT *asset);
+void        joeyAssetApplyPalette (SurfaceT *dst, uint8_t paletteIndex,
+                                   const JoeyAssetT *asset);
+```
+
+
+### Present (`joey/present.h`)
+
+```c
+void stagePresent(void);
+```
+
+Flips the dirty rows of the stage to the display, then clears dirty
+state. Drawing primitives mark dirty as a side effect, so calling
+`stagePresent` once at end-of-frame is enough.
+
+
+### Input (`joey/input.h`)
+
+Call `joeyInputPoll` once per frame, then query the state predicates.
+Edge predicates (`*Pressed`, `*Released`) fire only in the frame the
+transition happened.
+
+```c
+typedef enum { /* KEY_NONE, KEY_A..KEY_Z, KEY_0..KEY_9, KEY_SPACE,
+                  KEY_ESCAPE, KEY_RETURN, KEY_TAB, KEY_BACKSPACE,
+                  KEY_UP/DOWN/LEFT/RIGHT, KEY_LSHIFT/RSHIFT/LCTRL/LALT,
+                  KEY_F1..KEY_F10, KEY_COUNT */ } JoeyKeyE;
+typedef enum { MOUSE_BUTTON_NONE, MOUSE_BUTTON_LEFT, MOUSE_BUTTON_RIGHT,
+               MOUSE_BUTTON_MIDDLE, MOUSE_BUTTON_COUNT } JoeyMouseButtonE;
+typedef enum { JOYSTICK_0, JOYSTICK_1, JOYSTICK_COUNT } JoeyJoystickE;
+typedef enum { JOY_BUTTON_0, JOY_BUTTON_1, JOY_BUTTON_COUNT } JoeyJoyButtonE;
+
+#define JOYSTICK_AXIS_MAX  127
+#define JOYSTICK_AXIS_MIN  (-127)
+
+void    joeyInputPoll        (void);
+void    joeyWaitForAnyKey    (void);
+
+bool    joeyKeyDown          (JoeyKeyE key);
+bool    joeyKeyPressed       (JoeyKeyE key);
+bool    joeyKeyReleased      (JoeyKeyE key);
+
+int16_t joeyMouseX           (void);
+int16_t joeyMouseY           (void);
+bool    joeyMouseDown        (JoeyMouseButtonE b);
+bool    joeyMousePressed     (JoeyMouseButtonE b);
+bool    joeyMouseReleased    (JoeyMouseButtonE b);
+
+bool    joeyJoystickConnected(JoeyJoystickE js);
+int8_t  joeyJoystickX        (JoeyJoystickE js);
+int8_t  joeyJoystickY        (JoeyJoystickE js);
+bool    joeyJoyDown          (JoeyJoystickE js, JoeyJoyButtonE b);
+bool    joeyJoyPressed       (JoeyJoystickE js, JoeyJoyButtonE b);
+bool    joeyJoyReleased      (JoeyJoystickE js, JoeyJoyButtonE b);
+void    joeyJoystickReset    (JoeyJoystickE js, uint8_t deadZone);
+```
+
+
+### Audio (`joey/audio.h`)
+
+4-channel ProTracker-style music plus four one-shot SFX slots. Module
+data must be the platform-native form produced by `tools/joeymod`
+(`.mod` for Amiga/DOS/ST; `.ntp` for IIgs; `.amod` if you want
+loop=false on Amiga). A failed `joeyAudioInit` is non-fatal; the rest
+of the API stays callable as no-ops.
+
+```c
+#define JOEY_AUDIO_SFX_SLOTS  4
+
+bool joeyAudioInit          (void);
+void joeyAudioShutdown      (void);
+
+void joeyAudioPlayMod       (const uint8_t *data, uint32_t length, bool loop);
+void joeyAudioStopMod       (void);
+bool joeyAudioIsPlayingMod  (void);
+
+void joeyAudioPlaySfx       (uint8_t slot, const uint8_t *sample,
+                             uint32_t length, uint16_t rateHz);
+void joeyAudioStopSfx       (uint8_t slot);
+
+void joeyAudioFrameTick     (void);
+```
+
+
+### Debug logging (`joey/debug.h`)
+
+Crash-tracing logger. Writes are buffered and durable across normal
+exit; call `joeyLogFlush` ahead of suspected hang points if you want
+a guaranteed last-line-on-disk.
+
+```c
+void joeyLog     (const char *msg);
+void joeyLogF    (const char *fmt, ...);
+void joeyLogFlush(void);
+void joeyLogReset(void);
+```
+
+Output goes to `joeylog.txt` in the program's working directory.
+
+
+### Platform macros (`joey/platform.h`)
+
+The build system normally sets the platform via `-D`; auto-detection
+from compiler-predefined macros is a fallback. Game code can
+conditionally compile on these:
+
+```
+JOEYLIB_PLATFORM_IIGS / _AMIGA / _ATARIST / _DOS   // exactly one defined
+JOEYLIB_CPU_65816 / _68000 / _X86
+JOEYLIB_ENDIAN_LITTLE / _BIG
+JOEYLIB_NATIVE_CHUNKY / _NATIVE_PLANAR
+JOEYLIB_HAS_BLITTER / _HAS_COPPER                  // Amiga only
+JOEYLIB_PLATFORM_NAME                              // human-readable string
+JOEYLIB_VERSION_MAJOR / _MINOR / _PATCH / _STRING
+```
+
+
 ## License
 
 TBD.
diff --git a/scripts/dosbox-386sx16.conf b/scripts/dosbox-386sx16.conf
new file mode 100644
index 0000000..95bbf27
--- /dev/null
+++ b/scripts/dosbox-386sx16.conf
@@ -0,0 +1,28 @@
+# DOSBox config: simulate an Intel 386SX-16 (1988), the slowest 386
+# desktop CPU JoeyLib could realistically be run on. Use this floor
+# to verify the DOS port still hits its frame budget on the bottom of
+# the 386 stack rather than coasting on host CPU.
+#
+# The 386SX is identical to the 386DX in instruction set; the difference
+# that matters here is its 16-bit external data bus (vs 32-bit on DX),
+# which slows memory-bound code. DOSBox does not model bus width at all --
+# the cycles count below approximates overall 386SX-16 throughput.
+#
+# Notes:
+#   core    = normal           accurate per-instruction cycles, not
+#                              recompiled-to-host (auto / dynamic would
+#                              defeat slow-CPU simulation).
+#   cputype = 386              386 instruction set (no 486 BSWAP /
+#                              CMPXCHG, no Pentium MMX).
+#   cycles  = fixed 2200       community-standard approximation for
+#                              386SX-16 throughput in DOSBox.
+#                              DOSBox-Staging deprecates this in favor
+#                              of cpu_cycles, but still accepts it.
+#                              Vanilla DOSBox and DOSBox-X only know
+#                              the old key, so 'cycles' stays for
+#                              cross-fork portability.
+
+[cpu]
+core    = normal
+cputype = 386
+cycles  = fixed 2200
diff --git a/scripts/run-dos.sh b/scripts/run-dos.sh
index 607d37c..62fdc99 100755
--- a/scripts/run-dos.sh
+++ b/scripts/run-dos.sh
@@ -18,6 +18,7 @@ fi
 prog=${1:-pattern}
 repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 bin_dir=$repo/build/dos/bin
+conf=$repo/scripts/dosbox-386sx16.conf
 file=${prog^^}.EXE
 
 if [[ ! -f "$bin_dir/$file" ]]; then
@@ -34,7 +35,12 @@ fi
 # default capture-on-click behavior fights the VM's grab and mouse
 # input is unusable. On plain DOSBox this -set flag is unknown and is
 # logged once as a warning, then ignored -- harmless either way.
+#
+# -conf $conf locks the CPU to a simulated 386SX-16 (the slowest
+# realistic 386 desktop). Settings not in our file: DOSBox Staging layers
+# them from its primary config; vanilla DOSBox uses built-in defaults (no -userconf).
 exec dosbox \
+    -conf "$conf" \
     -set "mouse_capture=seamless" \
     -c "C:" \
     -c "$file" \
diff --git a/src/core/hal.h b/src/core/hal.h
index 05cdbd8..f5eba77 100644
--- a/src/core/hal.h
+++ b/src/core/hal.h
@@ -140,15 +140,16 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
 //   s->pixels src->dst; on planar ports there is no chunky to copy
 //   (planes already covered by halSurfaceCopyPlanes). Chunky ports
 //   do the memcpy here; Amiga is a no-op.
-// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
-//   fwrite of the pixel data. Chunky ports stream directly to/from
-//   s->pixels; Amiga uses a scratch buffer + c2p (load) or
-//   plane->chunky derivation (save).
+// halSurfaceLoadFile / halSurfaceSaveFile wrap fread / fwrite of the
+//   pixel data using each port's native pixel format (chunky on
+//   IIgs/DOS, interleaved planar on ST, plane-major on Amiga). Files
+//   written by one port are NOT loadable by another -- conversion is
+//   the asset pipeline's job.
 uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
 uint32_t halSurfaceHash(const SurfaceT *s);
 void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp);
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp);
 
 // Present the dirty regions of the source surface to the display.
 // The cross-platform stagePresent walks the dirty arrays before
diff --git a/src/core/surface.c b/src/core/surface.c
index 229b5f0..d2c5c62 100644
--- a/src/core/surface.c
+++ b/src/core/surface.c
@@ -158,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
         fclose(fp);
         return false;
     }
-    if (!halSurfaceLoadFileChunky(dst, fp)) {
+    if (!halSurfaceLoadFile(dst, fp)) {
         fclose(fp);
         return false;
     }
@@ -186,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
     if (fp == NULL) {
         return false;
     }
-    if (!halSurfaceSaveFileChunky(src, fp)) {
+    if (!halSurfaceSaveFile(src, fp)) {
         fclose(fp);
         return false;
     }
diff --git a/src/port/amiga/c2p.s b/src/port/amiga/c2p.s
deleted file mode 100644
index 25554fa..0000000
--- a/src/port/amiga/c2p.s
+++ /dev/null
@@ -1,127 +0,0 @@
-| Amiga chunky-to-planar conversion -- 68000 hand-rolled.
-|
-| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
-| 4 KB lookup table built once at HAL init: each (sourceByte, position,
-| plane) tuple maps to the plane-byte bit contribution that source
-| byte makes when it sits at that position within a 4-byte (8-pixel)
-| planar group going to that plane.
-|
-| Calling convention: m68k-amigaos-gcc cdecl.
-|   Args on stack at 4(sp), 8(sp), ...
-|   d2-d7, a2-a6 are callee-save.
-|   No return value.
-|
-| void chunkyToPlanarRow(const uint8_t *src,    ;  4(sp) - 4bpp packed source row
-|                        uint8_t       *p0,     ;  8(sp) - plane 0 dest row
-|                        uint8_t       *p1,     ; 12(sp) - plane 1 dest row
-|                        uint8_t       *p2,     ; 16(sp) - plane 2 dest row
-|                        uint8_t       *p3,     ; 20(sp) - plane 3 dest row
-|                        uint16_t       n,      ; 24(sp) - planar byte count (low word)
-|                        const uint8_t *lut);   ; 28(sp) - 4 KB LUT base
-|
-| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
-| for source byte `src` sitting at byte-position `pos` (0..3) within
-| its 4-byte planar group, going to plane `plane` (0..3). All 16
-| (pos, plane) entries for one src byte are contiguous, so the inner
-| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
-| (0..15) and never has to advance an index register.
-|
-| Per planar byte we consume 4 source bytes (positions 0..3 of the
-| 8-pixel group). For each we compute d4 = src*16 with four add.w's
-| (faster than asl.w on 68000) and OR the four plane contributions
-| into d0..d3 with byte-displaced (a5,d4.w) reads.
-|
-| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
-| gcc driver.
-
-                .text
-                .globl  _chunkyToPlanarRow
-
-| Stack frame size of MOVEM.L block: d2-d7 (6) + a2-a6 (5) = 11 regs
-| * 4 bytes = 44 bytes. Args therefore start at the original sp+4
-| offset PLUS 44.
-                .equ    SAVED_REGS_SIZE, 44
-
-
-_chunkyToPlanarRow:
-                movem.l %d2-%d7/%a2-%a6,-(%sp)
-
-                move.l   4+SAVED_REGS_SIZE(%sp),%a0     | src
-                move.l   8+SAVED_REGS_SIZE(%sp),%a1     | p0
-                move.l  12+SAVED_REGS_SIZE(%sp),%a2     | p1
-                move.l  16+SAVED_REGS_SIZE(%sp),%a3     | p2
-                move.l  20+SAVED_REGS_SIZE(%sp),%a4     | p3
-                | n is a uint16_t but GCC promotes to int and pushes a
-                | full 4 bytes -- the low word lives at +2 in big-endian
-                | layout.
-                move.w  24+SAVED_REGS_SIZE+2(%sp),%d7   | planar byte count
-                move.l  28+SAVED_REGS_SIZE(%sp),%a5     | LUT base
-
-                subq.w  #1,%d7                          | DBRA: count-1
-                bmi     .Ldone                          | nothing to do
-
-.LbyteLoop:
-                moveq   #0,%d0                          | plane 0 acc
-                moveq   #0,%d1                          | plane 1 acc
-                moveq   #0,%d2                          | plane 2 acc
-                moveq   #0,%d3                          | plane 3 acc
-
-                | ----- Source byte position 0 -----
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[0]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4                         | d4 = src * 16
-                or.b      0(%a5,%d4.w),%d0              | pos0 plane0
-                or.b      1(%a5,%d4.w),%d1              | pos0 plane1
-                or.b      2(%a5,%d4.w),%d2              | pos0 plane2
-                or.b      3(%a5,%d4.w),%d3              | pos0 plane3
-
-                | ----- Source byte position 1 -----
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[1]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      4(%a5,%d4.w),%d0              | pos1 plane0
-                or.b      5(%a5,%d4.w),%d1              | pos1 plane1
-                or.b      6(%a5,%d4.w),%d2              | pos1 plane2
-                or.b      7(%a5,%d4.w),%d3              | pos1 plane3
-
-                | ----- Source byte position 2 -----
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[2]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      8(%a5,%d4.w),%d0              | pos2 plane0
-                or.b      9(%a5,%d4.w),%d1              | pos2 plane1
-                or.b     10(%a5,%d4.w),%d2              | pos2 plane2
-                or.b     11(%a5,%d4.w),%d3              | pos2 plane3
-
-                | ----- Source byte position 3 -----
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4                      | src[3]
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b     12(%a5,%d4.w),%d0              | pos3 plane0
-                or.b     13(%a5,%d4.w),%d1              | pos3 plane1
-                or.b     14(%a5,%d4.w),%d2              | pos3 plane2
-                or.b     15(%a5,%d4.w),%d3              | pos3 plane3
-
-                | ----- Store plane bytes -----
-                move.b  %d0,(%a1)+
-                move.b  %d1,(%a2)+
-                move.b  %d2,(%a3)+
-                move.b  %d3,(%a4)+
-
-                dbra    %d7,.LbyteLoop
-
-.Ldone:
-                movem.l (%sp)+,%d2-%d7/%a2-%a6
-                rts
diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c
index c51b5a9..7e87f2d 100644
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
@@ -115,69 +115,10 @@ static uint8_t  gCachedScb    [SURFACE_HEIGHT]
 static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4)));
 static bool     gCacheValid = false;
 
-// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
-// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
-// the plane-byte bit contribution that source byte `src` makes to
-// plane `plane` when it sits at byte-position `pos` within a 4-byte
-// (8-pixel) planar group. The src-major layout lets the asm inner
-// loop reach all 16 (pos, plane) entries for a single src byte via
-// 8-bit displacements off (a5, d4.w) without any LEA between reads.
-static uint8_t  gC2pLut[4 * 1024];
-static bool     gC2pLutReady = false;
-
 static bool paletteOrScbChanged(const SurfaceT *src);
-static void initC2pLut(void);
-
-// Provided by src/port/amiga/c2p.s.
-extern void chunkyToPlanarRow(const uint8_t *src,
-                              uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
-                              uint16_t numPlanarBytes,
-                              const uint8_t *lut);
 
 // ----- Internal helpers (alphabetical) -----
 
-// Build the 4 KB chunky-to-planar lookup table consumed by
-// chunkyToPlanarRow. For each (pos, plane, src) tuple, store the
-// bit contribution that source byte `src` makes to plane `plane`
-// when it sits at byte-position `pos` (0..3) within a 4-byte
-// (8-pixel) planar group:
-//
-//   - src high nibble = leftmost pixel  -> plane bit (7 - 2*pos)
-//   - src low  nibble = rightmost pixel -> plane bit (6 - 2*pos)
-static void initC2pLut(void) {
-    uint16_t pos;
-    uint16_t plane;
-    uint16_t src;
-    uint8_t  highShift;
-    uint8_t  lowShift;
-    uint8_t  highBit;
-    uint8_t  lowBit;
-
-    if (gC2pLutReady) {
-        return;
-    }
-    for (src = 0; src < 256; src++) {
-        for (pos = 0; pos < 4; pos++) {
-            highShift = (uint8_t)(7 - 2 * pos);
-            lowShift  = (uint8_t)(6 - 2 * pos);
-            for (plane = 0; plane < 4; plane++) {
-                highBit = (uint8_t)(((src >> 4) >> plane) & 1);
-                lowBit  = (uint8_t)(((src & 0x0F) >> plane) & 1);
-                gC2pLut[src * 16 + pos * 4 + plane] =
-                    (uint8_t)((highBit << highShift) | (lowBit << lowShift));
-            }
-        }
-    }
-    gC2pLutReady = true;
-}
-
-
-// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own
-// per-row chunkyToPlanarRow loop -- the only code path that still
-// converts chunky to planar today, since asset loading is the only
-// surface mutation that doesn't go through a planar-aware primitive.)
-
-
 // Build a user copper list for per-scanline palette (SCB emulation).
 // One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is
 // stored in gNewUCL until installCopperList swaps it onto the screen.
@@ -1358,35 +1299,6 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
 }
 
 
-/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes
- * from a freshly-loaded chunky pixel buffer (s->pixels). */
-static void amigaPopulatePlanesFromChunky(SurfaceT *s) {
-    AmigaPlanarT  *pd;
-    int16_t        y;
-    const uint8_t *srcLine;
-    UBYTE         *p0;
-    UBYTE         *p1;
-    UBYTE         *p2;
-    UBYTE         *p3;
-
-    pd = (AmigaPlanarT *)s->portData;
-    if (pd == NULL) {
-        return;
-    }
-    if (!gC2pLutReady) {
-        initC2pLut();
-    }
-    for (y = 0; y < SURFACE_HEIGHT; y++) {
-        srcLine = &s->pixels[y * SURFACE_BYTES_PER_ROW];
-        p0      = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-        p1      = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-        p2      = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-        p3      = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-        chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
-    }
-}
-
-
 // Phase 6 planar dual-write for sprite draw. Walks the sprite's
 // chunky tile data with the same clipping the cross-platform code
 // applies, calling amigaPlanarSetPixel for every non-transparent
@@ -2118,7 +2030,9 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
 
 
 /* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes
- * (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky.
+ * (per plane, 4 planes). Used by halSurfaceHash to fold the planar
+ * surface into the same byte-stream the chunky ports hash, so cross-
+ * port hash comparisons stay valid.
  * Walks 8 pixels per planar-byte column; per pixel assembles nibble
  * from 4 plane bits. Output: 4 chunky bytes per planar-byte column
  * (since 8 pixels = 4 chunky bytes at 2px/byte). */
@@ -2204,62 +2118,35 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
 
 
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+// On-disk format is the Amiga's native plane-major buffer: planes
+// 0..3 written sequentially, AMIGA_PLANE_SIZE bytes each.
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
     AmigaPlanarT *pd;
-    uint8_t      *scratch;
-    uint8_t      *srcLine;
-    int16_t       y;
-    UBYTE        *p0;
-    UBYTE        *p1;
-    UBYTE        *p2;
-    UBYTE        *p3;
-    bool          ok;
+    uint8_t       i;
 
     pd = (AmigaPlanarT *)dst->portData;
     if (pd == NULL) {
         return false;
     }
-    /* fread the chunky file payload into a scratch buffer, then c2p
-     * directly into our planes. The scratch is a one-shot AllocMem
-     * (PUBLIC, not chip) since chunkyToPlanarRow only reads it. */
-    scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC);
-    if (scratch == NULL) {
-        return false;
-    }
-    ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
-    if (ok) {
-        if (!gC2pLutReady) {
-            initC2pLut();
-        }
-        for (y = 0; y < SURFACE_HEIGHT; y++) {
-            srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
-            p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-            p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-            p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-            p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
-            chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        if (fread(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
+            return false;
         }
     }
-    FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE);
-    return ok;
+    return true;
 }
 
 
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
     AmigaPlanarT *pd;
-    uint8_t       chunkyRow[SURFACE_BYTES_PER_ROW];
-    int16_t       y;
+    uint8_t       i;
 
     pd = (AmigaPlanarT *)src->portData;
     if (pd == NULL) {
         return false;
     }
-    /* Per row: derive chunky from planes, write 160 bytes. Less
-     * efficient than a single fwrite of a full buffer but avoids
-     * needing a 32 KB scratch allocation. */
-    for (y = 0; y < SURFACE_HEIGHT; y++) {
-        amigaPlanesToChunkyRow(pd, y, chunkyRow);
-        if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
+        if (fwrite(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
             return false;
         }
     }
diff --git a/src/port/atarist/c2p.s b/src/port/atarist/c2p.s
deleted file mode 100644
index c4a2df6..0000000
--- a/src/port/atarist/c2p.s
+++ /dev/null
@@ -1,188 +0,0 @@
-| Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
-|
-| Drop-in replacement for hal.c's old c2pRow C inner loop. The C
-| version walked every pixel and built each plane word with a
-| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
-| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
-| codegen overhead. This rewrite uses a 4 KB lookup table built once
-| at HAL init: same layout as the Amiga c2p LUT, so the
-| (sourceByte, position, plane) -> 2-bit contribution mapping is
-| identical, but the routine packs results into ST word-interleaved
-| planar (4 plane words per 16-pixel group) instead of 4 separate
-| plane bytes.
-|
-| Each ST group is 8 source bytes -> 4 plane words. Source byte
-| positions 0..3 contribute to the HIGH byte of each plane word
-| (bits 15..8); positions 4..7 contribute to the LOW byte (bits
-| 7..0). Within a byte, the LUT for (src, bp%4, plane) already
-| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
-| entries for both halves -- we just shift d0..d3 left by 8 between
-| the halves to move the high-half bits up before the low half ORs
-| into the now-empty low byte.
-|
-| Calling convention: m68k-atari-mint-gcc cdecl.
-|   Args on stack at 4(sp), 8(sp), ...
-|   d2-d7, a2-a6 are callee-save.
-|   No return value.
-|
-| void chunkyToPlanarRowSt(const uint8_t *src,    ;  4(sp) - 4bpp packed source row
-|                          uint16_t      *dst,    ;  8(sp) - planar dest row (uint16_t*)
-|                          uint16_t       groupStart, ; 12(sp) - first group index (low word)
-|                          uint16_t       groupEnd,   ; 16(sp) - one-past-last group index (low word)
-|                          const uint8_t *lut);   ; 20(sp) - 4 KB LUT base
-|
-| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
-| contribution for source byte `src` at byte-position `pos` (0..3
-| within a 4-byte chunk) going to plane `plane` (0..3). All 16
-| (pos, plane) entries for one src byte are contiguous, so the inner
-| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
-| (0..15) without LEA between reads.
-|
-| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
-| the gcc driver.
-
-                .text
-                .globl  _chunkyToPlanarRowSt
-
-| MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes.
-                .equ    SAVED_REGS_SIZE, 44
-
-
-_chunkyToPlanarRowSt:
-                movem.l %d2-%d7/%a2-%a6,-(%sp)
-
-                move.l   4+SAVED_REGS_SIZE(%sp),%a0     | src row base
-                move.l   8+SAVED_REGS_SIZE(%sp),%a1     | dst (uint16_t*)
-                | Both groupStart and groupEnd are uint16_t but GCC
-                | promotes them to int and pushes 4 bytes each; the
-                | low word lives at +2 in big-endian layout.
-                move.w  12+SAVED_REGS_SIZE+2(%sp),%d6   | groupStart
-                move.w  16+SAVED_REGS_SIZE+2(%sp),%d7   | groupEnd
-                move.l  20+SAVED_REGS_SIZE(%sp),%a5     | LUT base
-
-                | Advance src and dst to the first group's data.
-                | Each group consumes 8 source bytes and produces 4
-                | dest words (8 bytes), so both pointers advance by
-                | groupStart * 8.
-                move.w  %d6,%d4
-                lsl.w   #3,%d4
-                add.w   %d4,%a0
-                add.w   %d4,%a1
-
-                sub.w   %d6,%d7                         | groupCount = end - start
-                subq.w  #1,%d7                          | DBRA bias
-                bmi     .Ldone
-
-.LgroupLoop:
-                moveq   #0,%d0                          | plane 0 acc
-                moveq   #0,%d1                          | plane 1 acc
-                moveq   #0,%d2                          | plane 2 acc
-                moveq   #0,%d3                          | plane 3 acc
-
-                | ===== Source bytes 0..3 -> high byte of each plane word =====
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4                         | d4 = src * 16
-                or.b      0(%a5,%d4.w),%d0
-                or.b      1(%a5,%d4.w),%d1
-                or.b      2(%a5,%d4.w),%d2
-                or.b      3(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      4(%a5,%d4.w),%d0
-                or.b      5(%a5,%d4.w),%d1
-                or.b      6(%a5,%d4.w),%d2
-                or.b      7(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      8(%a5,%d4.w),%d0
-                or.b      9(%a5,%d4.w),%d1
-                or.b     10(%a5,%d4.w),%d2
-                or.b     11(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b     12(%a5,%d4.w),%d0
-                or.b     13(%a5,%d4.w),%d1
-                or.b     14(%a5,%d4.w),%d2
-                or.b     15(%a5,%d4.w),%d3
-
-                | Move accumulated bits into the HIGH byte of each word.
-                lsl.w   #8,%d0
-                lsl.w   #8,%d1
-                lsl.w   #8,%d2
-                lsl.w   #8,%d3
-
-                | ===== Source bytes 4..7 -> low byte of each plane word =====
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      0(%a5,%d4.w),%d0
-                or.b      1(%a5,%d4.w),%d1
-                or.b      2(%a5,%d4.w),%d2
-                or.b      3(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      4(%a5,%d4.w),%d0
-                or.b      5(%a5,%d4.w),%d1
-                or.b      6(%a5,%d4.w),%d2
-                or.b      7(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b      8(%a5,%d4.w),%d0
-                or.b      9(%a5,%d4.w),%d1
-                or.b     10(%a5,%d4.w),%d2
-                or.b     11(%a5,%d4.w),%d3
-
-                moveq   #0,%d4
-                move.b  (%a0)+,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                add.w   %d4,%d4
-                or.b     12(%a5,%d4.w),%d0
-                or.b     13(%a5,%d4.w),%d1
-                or.b     14(%a5,%d4.w),%d2
-                or.b     15(%a5,%d4.w),%d3
-
-                | Store 4 plane words.
-                move.w  %d0,(%a1)+
-                move.w  %d1,(%a1)+
-                move.w  %d2,(%a1)+
-                move.w  %d3,(%a1)+
-
-                dbra    %d7,.LgroupLoop
-
-.Ldone:
-                movem.l (%sp)+,%d2-%d7/%a2-%a6
-                rts
diff --git a/src/port/atarist/circle.s b/src/port/atarist/circle.s
index b7c65c9..d7e21cd 100644
--- a/src/port/atarist/circle.s
+++ b/src/port/atarist/circle.s
@@ -82,11 +82,9 @@
                 .macro  YP_REC  slot, signOp, yreg
                 move.l  %a4,%d6
                 \signOp\().w \yreg,%d6         | d6.w = yp
-                move.w  %d6,%d0
-                lsl.w   #5,%d6                 | d6 = yp << 5
-                lsl.w   #7,%d0                 | d0 = yp << 7
-                add.w   %d6,%d0                | d0 = yp * 160
-                move.w  %d0,\slot(%sp)
+                add.w   %d6,%d6                | * 2 for word index
+                move.w  (%a6,%d6.w),%d6        | yLut[yp] = yp * 160
+                move.w  %d6,\slot(%sp)
                 .endm
 
 
@@ -223,14 +221,21 @@ _surface68kStCircleOutline:
                 moveq   #1,%d4
                 sub.w   %d2,%d4                | err = 1 - bx
 
+                | a6 = yLut base (yp -> yp*160). Lookup is faster than
+                | the 4 cyc + 4 cyc + 18 cyc + 22 cyc + 4 cyc shift+add
+                | chain we used to do per YP_REC; the saving applies to all 4
+                | YP_RECs per Bresenham iter (~120 cyc/iter).
+                | Shared LUT lives in lineSpan.s; referenced by absolute address.
+                lea     _gStRowOffsetLut,%a6
+
                 | Dispatch on color (low 4 bits) -> one of 16 main loops.
                 moveq   #0,%d6
                 move.b  SP_COLOR(%sp),%d6
                 and.w   #0x0F,%d6
                 add.w   %d6,%d6
                 add.w   %d6,%d6                | * 4 for bra.w table
-                lea     .LcoStTable(%pc),%a6
-                jmp     0(%a6,%d6.w)
+                lea     .LcoStTable(%pc),%a2
+                jmp     0(%a2,%d6.w)
 
 .LcoStTable:
                 bra.w   .LcoStLoop_0
@@ -280,3 +285,4 @@ bitMaskWordLut:
                 .word   0x0800, 0x0400, 0x0200, 0x0100
                 .word   0x0080, 0x0040, 0x0020, 0x0010
                 .word   0x0008, 0x0004, 0x0002, 0x0001
+| (yLut now lives in lineSpan.s as the shared _gStRowOffsetLut)
diff --git a/src/port/atarist/fillCircle.s b/src/port/atarist/fillCircle.s
index ba508df..7ed25a8 100644
--- a/src/port/atarist/fillCircle.s
+++ b/src/port/atarist/fillCircle.s
@@ -9,28 +9,16 @@
 | Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r)
 | is fully on-surface. Off-surface circles fall back to the C walker.
 |
+| Phase 10 final: 16-way color dispatch at the OUTER loop. Each color
+| variant has its own Bresenham body where SPAN_BODY inlines a hard-
+| coded 4-plane mask RMW (no btst, no bsr/rts). Saves ~120 cyc per
+| applyMask call (was ~180 via bsr applyMask with runtime btst on d7).
+|
 | ABI: cdecl. d2-d7/a2-a6 callee-save.
 |
 |   void surface68kStFillCircle(uint8_t *base,
 |                               uint16_t cx, uint16_t cy,
 |                               uint16_t r,  uint8_t  color);
-|
-| Register allocation across the loop:
-|   d2.w = bx (Bresenham, starts at r)
-|   d3.w = by (Bresenham, starts at 0)
-|   d4.w = err
-|   d5.l = loLong (planes 0+1 long template)
-|   d6.l = hiLong (planes 2+3 long template)
-|   d7.b = color (low nibble; tested via btst)
-|   a3   = base
-|   a4   = scratch / current group pointer
-|   d0,d1 = scratch
-|
-| Stack scratch (8 bytes at 0(sp)..7(sp)):
-|   0..1  leftMask  (word; per pair)
-|   2..3  rightMask (word; per pair)
-|   4..5  numGroups (word; per pair)
-|   6..7  groupFirstByteOff (word; per pair)
 
                 .text
 
@@ -42,7 +30,7 @@
                 .equ    SP_FC_CX,      SP_FC_OFF + 4 + 2
                 .equ    SP_FC_CY,      SP_FC_OFF + 8 + 2
                 .equ    SP_FC_R,       SP_FC_OFF + 12 + 2
-                .equ    SP_FC_COLOR,   SP_FC_OFF + 16 + 3
+                .equ    SP_FC_COLOR,   SP_FC_OFF + 20 + 3
 
 
 | ---- COMPUTE_PAIR_MASKS macro -----------------------------------
@@ -50,18 +38,15 @@
 | Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
 |         6(sp) groupFirstByteOff
 | Trashes: d0, d1
-| (No labels: straightline.)
 
                 .macro  COMPUTE_PAIR_MASKS
                 move.w  %d0,0(%sp)             | stash left
                 move.w  %d1,2(%sp)             | stash right
-                | groupFirst & groupFirstByteOff
                 move.w  %d0,%d1
                 lsr.w   #4,%d1                 | groupFirst
                 move.w  %d1,%d0
                 lsl.w   #3,%d0                 | groupFirstByteOff
                 move.w  %d0,6(%sp)
-                | numGroups = (right >> 4) - groupFirst
                 move.w  2(%sp),%d0
                 lsr.w   #4,%d0                 | groupLast
                 sub.w   %d1,%d0                | numGroups
@@ -81,25 +66,53 @@
                 .endm
 
 
-| ---- SPAN_BODY macro --------------------------------------------
-| Render one row span using the pair masks at 0(sp)..7(sp).
-| Input:  d0.w = y (signed)
-|         a3 = base, d5 = loLong, d6 = hiLong, d7 = color
-| Trashes: d0, d1, a4
-| Macro takes an idx parameter for unique labels.
+| ---- APPLY_MASK_INLINE macro ------------------------------------
+| 4-plane mask RMW with HARDCODED color. a4 advances by 8 (postinc).
+| Inputs:  d0.w = mask, a4 = group ptr
+| Trashes: d1 (notMask scratch)
 
-                .macro  SPAN_BODY
-                | a4 = base + y*160
-                ext.l   %d0
-                move.l  %d0,%d1
-                lsl.l   #5,%d0
-                lsl.l   #7,%d1
-                add.l   %d1,%d0                | y*160
-                lea     0(%a3,%d0.l),%a4
-                | a4 += groupFirstByteOff
-                moveq   #0,%d0
-                move.w  6(%sp),%d0
-                add.l   %d0,%a4
+                .macro  APPLY_MASK_INLINE  color
+                move.w  %d0,%d1
+                not.w   %d1
+                .if  ((\color) & 1)
+                or.w    %d0,(%a4)+
+                .else
+                and.w   %d1,(%a4)+
+                .endif
+                .if  ((\color) & 2)
+                or.w    %d0,(%a4)+
+                .else
+                and.w   %d1,(%a4)+
+                .endif
+                .if  ((\color) & 4)
+                or.w    %d0,(%a4)+
+                .else
+                and.w   %d1,(%a4)+
+                .endif
+                .if  ((\color) & 8)
+                or.w    %d0,(%a4)+
+                .else
+                and.w   %d1,(%a4)+
+                .endif
+                .endm
+
+
+| ---- SPAN_BODY macro --------------------------------------------
+| Render one row span. Color hardcoded.
+| Input:  d0.w = y (on-surface row, non-negative: it indexes the row LUT)
+|         a2 = _gStRowOffsetLut, a3 = base, d5 = loLong, d6 = hiLong
+|         masks at 0..7(sp): leftMask, rightMask, numGroups, groupFirstByteOff
+| Trashes: d0, d1, a4
+
+                .macro  SPAN_BODY  color
+                | a4 = base + y*160 + groupFirstByteOff
+                | y*160 via shared _gStRowOffsetLut (a2 holds lut base).
+                | byteOff (y*160 + groupFirstByteOff) fits in 16 bits
+                | (max 31992), so word-only ops + .w-indexed lea.
+                add.w   %d0,%d0                | y * 2 (word index)
+                move.w  (%a2,%d0.w),%d0        | d0 = y * 160
+                add.w   6(%sp),%d0             | + groupFirstByteOff
+                lea     0(%a3,%d0.w),%a4
                 | numGroups in d1
                 move.w  4(%sp),%d1
                 tst.w   %d1
@@ -107,15 +120,14 @@
                 | single-group: combinedMask = leftMask & rightMask
                 move.w  0(%sp),%d0
                 and.w   2(%sp),%d0
-                bsr     .Lfc_applyMask
+                APPLY_MASK_INLINE \color
                 bra.w   .Lsb_done\@
 .Lsb_multi\@:
-                | leading mask. applyMask postinc-advances a4 by 8
-                | (the 4 plane RMWs each advance by 2 via (a4)+).
-                | applyMask trashes d1, so reload numGroups after bsr.
+                | leading mask. APPLY_MASK_INLINE postinc-advances a4 by 8.
+                | APPLY_MASK_INLINE trashes d1, so reload numGroups after.
                 move.w  0(%sp),%d0
-                bsr     .Lfc_applyMask
-                move.w  4(%sp),%d1             | reload numGroups
+                APPLY_MASK_INLINE \color
+                move.w  4(%sp),%d1
                 subq.w  #1,%d1                 | d1 = numMid
                 beq.s   .Lsb_skipMid\@
 .Lsb_midLoop\@:
@@ -126,11 +138,71 @@
 .Lsb_skipMid\@:
                 | trailing mask
                 move.w  2(%sp),%d0
-                bsr     .Lfc_applyMask
+                APPLY_MASK_INLINE \color
 .Lsb_done\@:
                 .endm
 
 
+| ---- CO_BODY macro: per-color full Bresenham loop body ----------
+
+                .macro  CO_BODY  color
+.Lfc_loop_\color:
+                cmp.w   %d3,%d2
+                bcs.w   .Lfc_done
+
+                | --- Pair A: x range = (cx - bx, cx + bx)
+                move.w  SP_FC_CX(%sp),%d0
+                move.w  %d0,%d1
+                sub.w   %d2,%d0
+                add.w   %d2,%d1
+                COMPUTE_PAIR_MASKS
+
+                | Span A1: y = cy + by
+                move.w  SP_FC_CY(%sp),%d0
+                add.w   %d3,%d0
+                SPAN_BODY  \color
+
+                | Span A2: y = cy - by
+                move.w  SP_FC_CY(%sp),%d0
+                sub.w   %d3,%d0
+                SPAN_BODY  \color
+
+                | --- Pair B: x range = (cx - by, cx + by)
+                move.w  SP_FC_CX(%sp),%d0
+                move.w  %d0,%d1
+                sub.w   %d3,%d0
+                add.w   %d3,%d1
+                COMPUTE_PAIR_MASKS
+
+                | Span B1: y = cy + bx
+                move.w  SP_FC_CY(%sp),%d0
+                add.w   %d2,%d0
+                SPAN_BODY  \color
+
+                | Span B2: y = cy - bx
+                move.w  SP_FC_CY(%sp),%d0
+                sub.w   %d2,%d0
+                SPAN_BODY  \color
+
+                | --- Bresenham step
+                addq.w  #1,%d3
+                tst.w   %d4
+                bgt.s   .Lfc_decBx_\color
+                add.w   %d3,%d4
+                add.w   %d3,%d4
+                addq.w  #1,%d4
+                bra.w   .Lfc_loop_\color
+.Lfc_decBx_\color:
+                subq.w  #1,%d2
+                add.w   %d3,%d4
+                add.w   %d3,%d4
+                sub.w   %d2,%d4
+                sub.w   %d2,%d4
+                addq.w  #1,%d4
+                bra.w   .Lfc_loop_\color
+                .endm
+
+
                 .globl  _surface68kStFillCircle
 
 _surface68kStFillCircle:
@@ -142,10 +214,11 @@ _surface68kStFillCircle:
                 moveq   #0,%d7
                 move.b  SP_FC_COLOR(%sp),%d7
 
-                | LUT bases (PC-relative indexed has only 8-bit
-                | displacement, so cache full pointers in a-regs).
+                | LUT bases. a5/a6 = mask LUTs (used by COMPUTE_PAIR_MASKS).
+                | a2 = shared _gStRowOffsetLut (used by SPAN_BODY for y*160).
                 lea     leftMaskLut(%pc),%a5
                 lea     rightMaskLut(%pc),%a6
+                lea     _gStRowOffsetLut,%a2
 
                 | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
                 moveq   #0,%d5
@@ -174,60 +247,50 @@ _surface68kStFillCircle:
                 moveq   #1,%d4
                 sub.w   %d2,%d4
 
-.Lfc_loop:
-                cmp.w   %d3,%d2
-                bcs.w   .Lfc_done
+                | Dispatch on color (low 4 bits) -> 16 specialized loops.
+                | Use a4 (gets overwritten in SPAN_BODY's first lea) as
+                | dispatch scratch since a2 now holds yLut for the body.
+                and.w   #0x0F,%d7
+                move.w  %d7,%d0
+                add.w   %d0,%d0
+                add.w   %d0,%d0                | * 4 for bra.w table
+                lea     .Lfc_table(%pc),%a4
+                jmp     0(%a4,%d0.w)
 
-                | --- Pair A: x range = (cx - bx, cx + bx)
-                move.w  SP_FC_CX(%sp),%d0
-                move.w  %d0,%d1
-                sub.w   %d2,%d0                | left  = cx - bx
-                add.w   %d2,%d1                | right = cx + bx
-                COMPUTE_PAIR_MASKS
+.Lfc_table:
+                bra.w   .Lfc_loop_0
+                bra.w   .Lfc_loop_1
+                bra.w   .Lfc_loop_2
+                bra.w   .Lfc_loop_3
+                bra.w   .Lfc_loop_4
+                bra.w   .Lfc_loop_5
+                bra.w   .Lfc_loop_6
+                bra.w   .Lfc_loop_7
+                bra.w   .Lfc_loop_8
+                bra.w   .Lfc_loop_9
+                bra.w   .Lfc_loop_10
+                bra.w   .Lfc_loop_11
+                bra.w   .Lfc_loop_12
+                bra.w   .Lfc_loop_13
+                bra.w   .Lfc_loop_14
+                bra.w   .Lfc_loop_15
 
-                | Span A1: y = cy + by
-                move.w  SP_FC_CY(%sp),%d0
-                add.w   %d3,%d0
-                SPAN_BODY
-
-                | Span A2: y = cy - by
-                move.w  SP_FC_CY(%sp),%d0
-                sub.w   %d3,%d0
-                SPAN_BODY
-
-                | --- Pair B: x range = (cx - by, cx + by)
-                move.w  SP_FC_CX(%sp),%d0
-                move.w  %d0,%d1
-                sub.w   %d3,%d0                | left  = cx - by
-                add.w   %d3,%d1                | right = cx + by
-                COMPUTE_PAIR_MASKS
-
-                | Span B1: y = cy + bx
-                move.w  SP_FC_CY(%sp),%d0
-                add.w   %d2,%d0
-                SPAN_BODY
-
-                | Span B2: y = cy - bx
-                move.w  SP_FC_CY(%sp),%d0
-                sub.w   %d2,%d0
-                SPAN_BODY
-
-                | --- Bresenham step
-                addq.w  #1,%d3
-                tst.w   %d4
-                bgt.s   .Lfc_decBx
-                add.w   %d3,%d4
-                add.w   %d3,%d4
-                addq.w  #1,%d4
-                bra.w   .Lfc_loop
-.Lfc_decBx:
-                subq.w  #1,%d2
-                add.w   %d3,%d4
-                add.w   %d3,%d4
-                sub.w   %d2,%d4
-                sub.w   %d2,%d4
-                addq.w  #1,%d4
-                bra.w   .Lfc_loop
+                CO_BODY  0
+                CO_BODY  1
+                CO_BODY  2
+                CO_BODY  3
+                CO_BODY  4
+                CO_BODY  5
+                CO_BODY  6
+                CO_BODY  7
+                CO_BODY  8
+                CO_BODY  9
+                CO_BODY  10
+                CO_BODY  11
+                CO_BODY  12
+                CO_BODY  13
+                CO_BODY  14
+                CO_BODY  15
 
 
 .Lfc_done:
@@ -236,46 +299,6 @@ _surface68kStFillCircle:
                 rts
 
 
-| ---- Apply 4-plane mask at (a4) -------------------------------
-| Input:  d0.w = mask, d7.b = color, a4 = group ptr
-| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
-| Trashes: d0, d1
-| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
-| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
-
-.Lfc_applyMask:
-                move.w  %d0,%d1
-                not.w   %d1                    | d1 = notMask
-                btst    #0,%d7
-                beq.s   .Lfc_am0a
-                or.w    %d0,(%a4)+
-                bra.s   .Lfc_am1
-.Lfc_am0a:
-                and.w   %d1,(%a4)+
-.Lfc_am1:
-                btst    #1,%d7
-                beq.s   .Lfc_am1a
-                or.w    %d0,(%a4)+
-                bra.s   .Lfc_am2
-.Lfc_am1a:
-                and.w   %d1,(%a4)+
-.Lfc_am2:
-                btst    #2,%d7
-                beq.s   .Lfc_am2a
-                or.w    %d0,(%a4)+
-                bra.s   .Lfc_am3
-.Lfc_am2a:
-                and.w   %d1,(%a4)+
-.Lfc_am3:
-                btst    #3,%d7
-                beq.s   .Lfc_am3a
-                or.w    %d0,(%a4)+
-                rts
-.Lfc_am3a:
-                and.w   %d1,(%a4)+
-                rts
-
-
                 .align  2
 | leftMaskLut[i]  = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
 leftMaskLut:
diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c
index 77a5c5c..bf68308 100644
--- a/src/port/atarist/hal.c
+++ b/src/port/atarist/hal.c
@@ -2,7 +2,7 @@
 //
 // M2 scope:
 //   * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
-//   * Chunky 4bpp to word-interleaved ST planar c2p at present time.
+//   * Word-interleaved ST planar buffer copied to the screen by halPresent.
 //
 // M2.5 scope (per-band palette / SCB emulation):
 //   * halPresent scans the SurfaceT's SCB array and builds a compact
@@ -136,17 +136,9 @@ static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPl
 }
 static uint16_t quantizeColorToSt(uint16_t orgb);
 static void     flattenScbPalettes(const SurfaceT *src);
-static void     initC2pLut(void);
 static void     writeDiagnostics(void);
 static long     writePrevPaletteRegs(void);
 
-// Provided by src/port/atarist/c2p.s.
-extern void chunkyToPlanarRowSt(const uint8_t *src,
-                                uint16_t *dst,
-                                uint16_t groupStart,
-                                uint16_t groupEnd,
-                                const uint8_t *lut);
-
 static __attribute__((interrupt_handler)) void timerBIsr(void);
 static __attribute__((interrupt_handler)) void vblIsr(void);
 static void                                    buildTransitions(const SurfaceT *src);
@@ -201,72 +193,11 @@ static void (*gOldTimerBVec)(void) = NULL;
 // SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
 // (and every frame of the keys demo after the initial paint) SCB and
 // palette never change, so caching and skipping those passes keeps
-// rect presents down to just the c2p work.
+// rect presents down to just the screen blit.
 static uint8_t  gCachedScb    [SURFACE_HEIGHT];
 static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
 static bool     gCacheValid = false;
 
-// 256-long plane-spread LUT for the asm sprite SAVE path (defined in
-// spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each
-// of b's 8 bits is placed at the bit-0 position of the corresponding
-// pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT
-// entry left by N to get plane N's contribution; OR'd across 4 planes
-// gives the full chunky long. Initialized lazily.
-//
-// LUT used by surface68kStSpriteSaveByteAligned. The asm reads via
-// `move.l (a_ptr, d0.l), d4` which requires the LUT to be long-
-// aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse,
-// the cascading offsets from the odd-sized gC2pLut put even
-// `uint32_t` BSS slots at addr mod 4 == 2.
-//
-// Fix: malloc the LUT. mintlib's malloc returns long-aligned memory.
-// The pointer is passed to the asm via the C-side wrapper (so the
-// asm reads it from the stack, where it's guaranteed long-aligned
-// regardless of where the static pointer slot lives).
-static uint32_t *gStPlaneSpreadLutPtr = NULL;
-static bool      gStPlaneSpreadLutReady = false;
-
-static bool initStPlaneSpreadLut(void) {
-    int b;
-    int i;
-
-    if (gStPlaneSpreadLutReady) {
-        return true;
-    }
-    gStPlaneSpreadLutPtr = (uint32_t *)malloc(256 * sizeof(uint32_t));
-    if (gStPlaneSpreadLutPtr == NULL) {
-        return false;
-    }
-
-    for (b = 0; b < 256; b++) {
-        uint32_t v = 0u;
-        for (i = 0; i < 8; i++) {
-            if (b & (0x80 >> i)) {
-                int byteIdx = i >> 1;
-                int isHigh  = ((i & 1) == 0);
-                int bitInLong = (3 - byteIdx) * 8 + (isHigh ? 4 : 0);
-                v |= (uint32_t)1u << bitInLong;
-            }
-        }
-        gStPlaneSpreadLutPtr[b] = v;
-    }
-    gStPlaneSpreadLutReady = true;
-    return true;
-}
-
-
-// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
-// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
-// = the 2-bit plane-byte contribution for source byte `src` at
-// byte-position `pos` (0..3 within a 4-byte chunk) going to plane
-// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
-// the same table feeds both halves of an ST plane word: positions
-// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
-// byte. Built once by initC2pLut on the first halPresent call.
-/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */
-uint8_t         gC2pLut[4 * 1024];
-static bool     gC2pLutReady = false;
-
 // ----- Internal helpers (alphabetical) -----
 
 // Scan the surface's SCB and record one transition entry for each
@@ -350,37 +281,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
 }
 
 
-// Build the 4 KB chunky-to-planar lookup table consumed by
-// chunkyToPlanarRowSt. Same layout/contents as the Amiga c2p LUT;
-// see src/port/atarist/c2p.s for the addressing math.
-static void initC2pLut(void) {
-    uint16_t pos;
-    uint16_t plane;
-    uint16_t src;
-    uint8_t  highShift;
-    uint8_t  lowShift;
-    uint8_t  highBit;
-    uint8_t  lowBit;
-
-    if (gC2pLutReady) {
-        return;
-    }
-    for (src = 0; src < 256; src++) {
-        for (pos = 0; pos < 4; pos++) {
-            highShift = (uint8_t)(7 - 2 * pos);
-            lowShift  = (uint8_t)(6 - 2 * pos);
-            for (plane = 0; plane < 4; plane++) {
-                highBit = (uint8_t)(((src >> 4) >> plane) & 1);
-                lowBit  = (uint8_t)(((src & 0x0F) >> plane) & 1);
-                gC2pLut[src * 16 + pos * 4 + plane] =
-                    (uint8_t)((highBit << highShift) | (lowBit << lowShift));
-            }
-        }
-    }
-    gC2pLutReady = true;
-}
-
-
 // 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
 // each 4-bit channel).
 static uint16_t quantizeColorToSt(uint16_t orgb) {
@@ -619,11 +519,8 @@ void halPresent(const SurfaceT *src) {
     }
     refreshPaletteStateIfNeeded(src);
 
-    // Phase 9: planar shadow -> screen RAM. Same dirty-word band
-    // tracking the c2p path used; just memcpy the planar bytes for
-    // each band instead of running c2p on the chunky shadow. Each
-    // dirty word covers 4 pixels = ?of one group = quarter of an
-    // 8-byte group. We round to whole groups (8 bytes each) for a
+    // Planar buffer -> screen RAM. Each dirty word covers 4 pixels
+    // (a quarter of an 8-byte group). Round to whole groups for a
     // simple aligned memcpy, since planar groups are the natural
     // copy unit.
     for (y = 0; y < SURFACE_HEIGHT; y++) {
@@ -720,8 +617,11 @@ extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint
 extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color);
 extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color);
 extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong);
-extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut);
-extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const uint8_t *c2pLut);
+extern void surface68kStTileFill8x8(uint8_t *firstGroupPtr, uint16_t mask, uint8_t color);
+extern void surface68kStSprite16x16Save(uint8_t *base, uint16_t x, uint16_t y, uint8_t *dstBuf);
+extern void surface68kStSprite16x16Restore(uint8_t *base, uint16_t x, uint16_t y, const uint8_t *srcBuf);
+extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
+extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
 
 
 // Phase 9: clear the entire planar buffer to a 4-bit color. Build an
@@ -1262,17 +1162,12 @@ void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex)
     group    = (uint16_t)((uint16_t)bx >> 1);
     halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu;
     gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP;
-    surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex);
+    /* Phase 10 final: specialized 8x8 unrolled tile-fill skips the
+     * generic FRG_LOOP's per-row subq+bne overhead. */
+    surface68kStTileFill8x8(gp, halfMask, colorIndex);
 }
 
 
-// Phase 10: group-aware tile paste. Per row: extract 8 pixels from
-// 4 chunky bytes, build 4 plane bytes (one per plane), drop them
-// into the high or low half of the 4 plane words at this group --
-// 4 word RMWs per row instead of 64 per-pixel calls.
-static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
-
-
 // Phase 10: tile paste/snap reuse the asm sprite save/restore
 // helpers -- identical per-row work patterns at byte-aligned
 // positions. Width 8 = single tile column = single half-group
@@ -1301,14 +1196,25 @@ void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *ti
             + (uint16_t)by * 8u * ST_BYTES_PER_ROW
             + group * ST_BYTES_PER_GROUP
             + (uint16_t)(bx & 1u);
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
-        dstAddr[0] = tileBytes[0];
-        dstAddr[2] = tileBytes[1];
-        dstAddr[4] = tileBytes[2];
-        dstAddr[6] = tileBytes[3];
-        dstAddr   += ST_BYTES_PER_ROW;
-        tileBytes += TILE_BYTES_PER_ROW;
-    }
+    (void)row;
+#define ST_TILE_PASTE_ROW                                                   \
+    do {                                                                    \
+        dstAddr[0] = tileBytes[0];                                          \
+        dstAddr[2] = tileBytes[1];                                          \
+        dstAddr[4] = tileBytes[2];                                          \
+        dstAddr[6] = tileBytes[3];                                          \
+        dstAddr   += ST_BYTES_PER_ROW;                                      \
+        tileBytes += TILE_BYTES_PER_ROW;                                    \
+    } while (0)
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+    ST_TILE_PASTE_ROW;
+#undef ST_TILE_PASTE_ROW
 }
 
 
@@ -1331,136 +1237,25 @@ void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *til
             + (uint16_t)by * 8u * ST_BYTES_PER_ROW
             + group * ST_BYTES_PER_GROUP
             + (uint16_t)(bx & 1u);
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
-        tileOut[0] = srcAddr[0];
-        tileOut[1] = srcAddr[2];
-        tileOut[2] = srcAddr[4];
-        tileOut[3] = srcAddr[6];
-        srcAddr   += ST_BYTES_PER_ROW;
-        tileOut   += TILE_BYTES_PER_ROW;
-    }
-}
-
-
-/* Slow-path C versions kept (renamed) for reference; not in the
- * active call chain. */
-static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
-    StPlanarT *pd;
-    uint16_t   group;
-    uint16_t   halfMask;
-    uint16_t   notHalfMask;
-    bool       isHigh;
-    uint8_t   *rowBase;
-    int16_t    row;
-    int16_t    pix;
-    uint16_t  *pw;
-    uint8_t    b;
-    uint8_t    color;
-    uint8_t    pb0;
-    uint8_t    pb1;
-    uint8_t    pb2;
-    uint8_t    pb3;
-    uint8_t    bit;
-
-    if (dst == NULL || chunkyTile == NULL) {
-        return;
-    }
-    pd = (StPlanarT *)dst->portData;
-    if (pd == NULL) {
-        return;
-    }
-    group       = (uint16_t)((uint16_t)bx >> 1);
-    isHigh      = ((bx & 1u) == 0u);
-    halfMask    = isHigh ? 0xFF00u : 0x00FFu;
-    notHalfMask = (uint16_t)~halfMask;
-    rowBase = pd->base
-            + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
-            + group * ST_BYTES_PER_GROUP;
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
-        pb0 = pb1 = pb2 = pb3 = 0u;
-        for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) {
-            b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)];
-            color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4);
-            bit = kStTileBitLut[pix];
-            if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); }
-            if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); }
-            if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); }
-            if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); }
-        }
-        pw = (uint16_t *)rowBase;
-        if (isHigh) {
-            pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8));
-            pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8));
-            pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8));
-            pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8));
-        } else {
-            pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0);
-            pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1);
-            pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2);
-            pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3);
-        }
-        rowBase += ST_BYTES_PER_ROW;
-    }
-}
-
-
-// Phase 10: group-aware tile snap. Read 4 plane half-words for the
-// row's group, distribute the 8 plane bits per plane into chunky
-// nibbles. 4 word reads per row + 4 chunky bytes per row, no
-// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes
-// above; kept for reference as the C-only fallback.
-static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
-    const StPlanarT *pd;
-    uint16_t         group;
-    uint16_t         halfShift;
-    const uint8_t   *rowBase;
-    int16_t          row;
-    int16_t          pair;
-    const uint16_t  *pw;
-    uint8_t          pb0;
-    uint8_t          pb1;
-    uint8_t          pb2;
-    uint8_t          pb3;
-    uint8_t          bitHi;
-    uint8_t          bitLo;
-    uint8_t          hi;
-    uint8_t          lo;
-
-    if (src == NULL || chunkyTileOut == NULL) {
-        return;
-    }
-    pd = (const StPlanarT *)src->portData;
-    if (pd == NULL) {
-        return;
-    }
-    group     = (uint16_t)((uint16_t)bx >> 1);
-    halfShift = ((bx & 1u) == 0u) ? 8u : 0u;
-    rowBase = pd->base
-            + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
-            + group * ST_BYTES_PER_GROUP;
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
-        pw  = (const uint16_t *)rowBase;
-        pb0 = (uint8_t)(pw[0] >> halfShift);
-        pb1 = (uint8_t)(pw[1] >> halfShift);
-        pb2 = (uint8_t)(pw[2] >> halfShift);
-        pb3 = (uint8_t)(pw[3] >> halfShift);
-        for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) {
-            bitHi = kStTileBitLut[pair * 2];
-            bitLo = kStTileBitLut[pair * 2 + 1];
-            hi = 0u;
-            lo = 0u;
-            if (pb0 & bitHi) hi = (uint8_t)(hi | 1u);
-            if (pb1 & bitHi) hi = (uint8_t)(hi | 2u);
-            if (pb2 & bitHi) hi = (uint8_t)(hi | 4u);
-            if (pb3 & bitHi) hi = (uint8_t)(hi | 8u);
-            if (pb0 & bitLo) lo = (uint8_t)(lo | 1u);
-            if (pb1 & bitLo) lo = (uint8_t)(lo | 2u);
-            if (pb2 & bitLo) lo = (uint8_t)(lo | 4u);
-            if (pb3 & bitLo) lo = (uint8_t)(lo | 8u);
-            chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo);
-        }
-        rowBase += ST_BYTES_PER_ROW;
-    }
+    (void)row;
+#define ST_TILE_SNAP_ROW                                                    \
+    do {                                                                    \
+        tileOut[0] = srcAddr[0];                                            \
+        tileOut[1] = srcAddr[2];                                            \
+        tileOut[2] = srcAddr[4];                                            \
+        tileOut[3] = srcAddr[6];                                            \
+        srcAddr   += ST_BYTES_PER_ROW;                                      \
+        tileOut   += TILE_BYTES_PER_ROW;                                    \
+    } while (0)
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+    ST_TILE_SNAP_ROW;
+#undef ST_TILE_SNAP_ROW
 }
 
 
@@ -1496,14 +1291,28 @@ void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const Surfac
             + (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW
             + dstGroup * ST_BYTES_PER_GROUP
             + (uint16_t)(dstBx & 1u);
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
-        dstAddr[0] = srcAddr[0];   /* plane 0 byte (high or low half) */
-        dstAddr[2] = srcAddr[2];   /* plane 1 */
-        dstAddr[4] = srcAddr[4];   /* plane 2 */
-        dstAddr[6] = srcAddr[6];   /* plane 3 */
-        srcAddr += ST_BYTES_PER_ROW;
-        dstAddr += ST_BYTES_PER_ROW;
-    }
+    /* gcc-mint -O2 does NOT unroll the 8-iter byte-copy loop,
+     * leaving cmpl + bnes loop overhead per row. Manual unroll
+     * drops ~150 cyc/call. (void)row keeps the unused decl quiet. */
+    (void)row;
+#define ST_TILE_COPY_ROW                                                    \
+    do {                                                                    \
+        dstAddr[0] = srcAddr[0];                                            \
+        dstAddr[2] = srcAddr[2];                                            \
+        dstAddr[4] = srcAddr[4];                                            \
+        dstAddr[6] = srcAddr[6];                                            \
+        srcAddr += ST_BYTES_PER_ROW;                                        \
+        dstAddr += ST_BYTES_PER_ROW;                                        \
+    } while (0)
+    ST_TILE_COPY_ROW;          /* row 0 */
+    ST_TILE_COPY_ROW;          /* row 1 */
+    ST_TILE_COPY_ROW;          /* row 2 */
+    ST_TILE_COPY_ROW;          /* row 3 */
+    ST_TILE_COPY_ROW;          /* row 4 */
+    ST_TILE_COPY_ROW;          /* row 5 */
+    ST_TILE_COPY_ROW;          /* row 6 */
+    ST_TILE_COPY_ROW;          /* row 7 */
+#undef ST_TILE_COPY_ROW
 }
 
 
@@ -1792,109 +1601,6 @@ void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBy
 }
 
 
-// Phase 10 fast paths for save/restore. Hand-rolled asm
-// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <->
-// plane bit transpose via ASL+ROXL and walks rows/tile columns. The
-// C wrappers below are kept as a fallback / reference; they're not
-// in the critical path now that the asm versions are wired in.
-static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) {
-    int16_t        bytesPerRow = (int16_t)(w >> 1);
-    int16_t        tileCols    = (int16_t)(w >> 3);
-    const uint8_t *rowBase     = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
-    int16_t        row;
-    int16_t        tileCol;
-
-    for (row = 0; row < (int16_t)h; row++) {
-        uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
-        for (tileCol = 0; tileCol < tileCols; tileCol++) {
-            int16_t         srcX  = (int16_t)(x + tileCol * 8);
-            uint16_t        group = (uint16_t)((uint16_t)srcX >> 4);
-            uint16_t        shift = ((srcX & 8) == 0) ? 8u : 0u;
-            const uint16_t *pw    = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
-            uint8_t         pb0   = (uint8_t)(pw[0] >> shift);
-            uint8_t         pb1   = (uint8_t)(pw[1] >> shift);
-            uint8_t         pb2   = (uint8_t)(pw[2] >> shift);
-            uint8_t         pb3   = (uint8_t)(pw[3] >> shift);
-            int16_t         pair;
-            for (pair = 0; pair < 4; pair++) {
-                uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2));
-                uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1));
-                uint8_t hi = 0u;
-                uint8_t lo = 0u;
-                if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); }
-                if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); }
-                if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); }
-                if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); }
-                if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); }
-                if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); }
-                if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); }
-                if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); }
-                dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo);
-            }
-        }
-        rowBase += ST_BYTES_PER_ROW;
-    }
-}
-
-
-static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) {
-    int16_t   bytesPerRow = (int16_t)(w >> 1);
-    int16_t   tileCols    = (int16_t)(w >> 3);
-    uint8_t  *rowBase     = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
-    int16_t   row;
-    int16_t   tileCol;
-
-    for (row = 0; row < (int16_t)h; row++) {
-        const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
-        for (tileCol = 0; tileCol < tileCols; tileCol++) {
-            uint8_t   b0  = srcRow[tileCol * 4 + 0];
-            uint8_t   b1  = srcRow[tileCol * 4 + 1];
-            uint8_t   b2  = srcRow[tileCol * 4 + 2];
-            uint8_t   b3  = srcRow[tileCol * 4 + 3];
-            uint8_t   pb0 = 0u;
-            uint8_t   pb1 = 0u;
-            uint8_t   pb2 = 0u;
-            uint8_t   pb3 = 0u;
-            uint8_t   c;
-            int16_t   dstX;
-            uint16_t  group;
-            uint16_t *pw;
-            uint16_t  halfMask;
-            uint16_t  notHalfMask;
-
-            c = (uint8_t)(b0 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u);
-            c = (uint8_t)(b0 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u);
-            c = (uint8_t)(b1 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u);
-            c = (uint8_t)(b1 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u);
-            c = (uint8_t)(b2 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u);
-            c = (uint8_t)(b2 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u);
-            c = (uint8_t)(b3 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u);
-            c = (uint8_t)(b3 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u);
-
-            dstX  = (int16_t)(x + tileCol * 8);
-            group = (uint16_t)((uint16_t)dstX >> 4);
-            pw    = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
-            if ((dstX & 8) == 0) {
-                halfMask = 0xFF00u;
-                pw[0] = (uint16_t)((pw[0] & 0x00FFu) | ((uint16_t)pb0 << 8));
-                pw[1] = (uint16_t)((pw[1] & 0x00FFu) | ((uint16_t)pb1 << 8));
-                pw[2] = (uint16_t)((pw[2] & 0x00FFu) | ((uint16_t)pb2 << 8));
-                pw[3] = (uint16_t)((pw[3] & 0x00FFu) | ((uint16_t)pb3 << 8));
-            } else {
-                halfMask = 0x00FFu;
-                pw[0] = (uint16_t)((pw[0] & 0xFF00u) | (uint16_t)pb0);
-                pw[1] = (uint16_t)((pw[1] & 0xFF00u) | (uint16_t)pb1);
-                pw[2] = (uint16_t)((pw[2] & 0xFF00u) | (uint16_t)pb2);
-                pw[3] = (uint16_t)((pw[3] & 0xFF00u) | (uint16_t)pb3);
-            }
-            (void)halfMask;
-            (void)notHalfMask;
-        }
-        rowBase += ST_BYTES_PER_ROW;
-    }
-}
-
-
 // Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies
 // inline. Each pixel's group address differs only in (x), so we
 // can compute base+row*160 once per row and just do per-pixel
@@ -1916,11 +1622,16 @@ void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t
         return;
     }
     /* Phase 10.5 fast path: byte-aligned, fully on-surface.
-     * Asm walker does direct planar byte copy (LUT pointer unused). */
+     * Specialized 16x16 (the UBER ball-sprite size) skips the asm
+     * walker's per-row col-init + col-loop-check overhead. */
     if ((x & 7) == 0 && (w & 7) == 0
             && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
             && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
-        surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL);
+        if (w == 16u && h == 16u) {
+            surface68kStSprite16x16Save(pd->base, (uint16_t)x, (uint16_t)y, dstPlaneBytes);
+        } else {
+            surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes);
+        }
         return;
     }
 
@@ -1980,11 +1691,15 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
         return;
     }
     /* Phase 10.5 fast path: byte-aligned, fully on-surface.
-     * Asm walker does direct planar byte copy (LUT pointer unused). */
+     * Specialized 16x16 (UBER ball-sprite) skips walker overhead. */
     if ((x & 7) == 0 && (w & 7) == 0
             && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
             && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
-        surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL);
+        if (w == 16u && h == 16u) {
+            surface68kStSprite16x16Restore(pd->base, (uint16_t)x, (uint16_t)y, srcPlaneBytes);
+        } else {
+            surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes);
+        }
         return;
     }
 
@@ -2049,10 +1764,11 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
 }
 
 
-// Phase 9: derive 160 chunky bytes per row from the word-interleaved
-// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's
+// Derive 160 chunky bytes per row from the word-interleaved planar
+// buffer (20 groups x 4 plane words). Same shape as the Amiga's
 // amigaPlanesToChunkyRow but per-group instead of per-byte. Used by
-// halSurfaceHash and halSurfaceSaveFileChunky.
+// halSurfaceHash to fold the planar surface into the same byte stream
+// the chunky ports hash, so cross-port hash comparisons stay valid.
 static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) {
     uint16_t        group;
     uint16_t        p;
@@ -2134,58 +1850,27 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
 
 
-// Phase 9: read chunky from file into a temporary scratch buffer,
-// then c2p once into the planar shadow. The .joeysurface file format
-// is still chunky 4bpp on disk (cross-port asset interchange); the
-// in-memory representation is what changes.
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+// On-disk format is the ST's native interleaved planar buffer; one
+// fread fills it directly, no chunky scratch or c2p step.
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
     StPlanarT *pd;
-    uint8_t   *scratch;
-    int16_t    y;
-    bool       ok;
 
     pd = (StPlanarT *)dst->portData;
     if (pd == NULL) {
         return false;
     }
-    scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
-    if (scratch == NULL) {
-        return false;
-    }
-    ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
-    if (ok) {
-        if (!gC2pLutReady) {
-            initC2pLut();
-        }
-        for (y = 0; y < SURFACE_HEIGHT; y++) {
-            const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
-            uint16_t      *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW];
-            chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut);
-        }
-    }
-    free(scratch);
-    return ok;
+    return fread(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
 }
 
 
-// Phase 9: derive chunky bytes from the planar shadow row by row,
-// stream to file. Avoids needing a full 32 KB scratch buffer.
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
     StPlanarT *pd;
-    uint8_t    chunkyRow[SURFACE_BYTES_PER_ROW];
-    int16_t    y;
 
     pd = (StPlanarT *)src->portData;
     if (pd == NULL) {
         return false;
     }
-    for (y = 0; y < SURFACE_HEIGHT; y++) {
-        stPlanarToChunkyRow(pd, y, chunkyRow);
-        if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
-            return false;
-        }
-    }
-    return true;
+    return fwrite(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
 }
 
 
diff --git a/src/port/atarist/lineSpan.s b/src/port/atarist/lineSpan.s
index 242b7b4..ce189d8 100644
--- a/src/port/atarist/lineSpan.s
+++ b/src/port/atarist/lineSpan.s
@@ -50,19 +50,17 @@
 | Trashes: d0, d1, a2
 
                 .macro  DL_PLOT  color
-                | byteOff = y*160 + (x>>4)*8
+                | byteOff = y*160 + (x>>4)*8. The surface is 32000 bytes,
+                | so it fits the sign-extended .w index (< 32768). Skip ext.l
+                | + .l add + .l indexed lea -- word-sized ops save 14 cyc/pixel.
                 move.w  %d3,%d0
-                ext.l   %d0
-                move.l  %d0,%d1
-                lsl.l   #5,%d0                 | y << 5
-                lsl.l   #7,%d1                 | y << 7
-                add.l   %d1,%d0                | d0 = y * 160
+                add.w   %d0,%d0                | y * 2 (word index)
+                move.w  (%a6,%d0.w),%d0        | d0 = y * 160
                 move.w  %d2,%d1
                 lsr.w   #4,%d1
                 lsl.w   #3,%d1                 | (x>>4) * 8
-                ext.l   %d1
-                add.l   %d1,%d0                | d0 = byteOff
-                lea     0(%a3,%d0.l),%a2       | a2 = base + byteOff
+                add.w   %d1,%d0                | d0 = byteOff (fits in 16 bits)
+                lea     0(%a3,%d0.w),%a2       | a2 = base + byteOff
                 | d1 = bitMask, d0 = notMask
                 move.w  %d2,%d1
                 and.w   #15,%d1
@@ -127,9 +125,11 @@ _surface68kStDrawLine:
                 movem.l %d2-%d7/%a2-%a6,-(%sp)
                 lea     -SP_LOCAL(%sp),%sp
 
-                | Load base & lut.
+                | Load base & luts.
                 move.l  SP_BASE(%sp),%a3
                 lea     bitMaskWordLut(%pc),%a5
+                | a6 = yLut base (yp -> yp*160) for use in DL_PLOT.
+                lea     _gStRowOffsetLut(%pc),%a6
 
                 | x = x0, y = y0
                 move.w  SP_X0(%sp),%d2
@@ -179,8 +179,8 @@ _surface68kStDrawLine:
                 and.w   #0x0F,%d0
                 add.w   %d0,%d0
                 add.w   %d0,%d0                | * 4 for bra.w table
-                lea     .LdlStTable(%pc),%a6
-                jmp     0(%a6,%d0.w)
+                lea     .LdlStTable(%pc),%a2   | a2 scratch (a6 holds yLut)
+                jmp     0(%a2,%d0.w)
 
 .LdlStTable:
                 bra.w   .LdlStLoop_0
@@ -529,6 +529,129 @@ _surface68kStFillRectSingleGroup:
                 rts
 
 
+| ---- surface68kStTileFill8x8 ---------------------------------------
+|
+| Specialized 8x8 single-group fill: 16-way color dispatch + 8 rows
+| fully unrolled. Drops the per-row subq+bne overhead that the
+| generic FRG_LOOP pays. Used by halTileFillPlanes.
+|
+|   void surface68kStTileFill8x8(uint8_t *firstGroupPtr,
+|                                uint16_t mask,
+|                                uint8_t color);
+|
+| Per row body: 4 plane-word RMWs with postinc (8 bytes) + lea
+| 152(a3),a3 to the next row. Row 7 skips the lea (a3 not used after).
+
+                .equ    SP_TF_SAVED, 16        | d3-d4/a2-a3 = 4 longs
+                .equ    SP_TF_OFF,         (SP_TF_SAVED + 4)
+                .equ    SP_TF_PTR,    SP_TF_OFF + 0
+                .equ    SP_TF_MASK,   SP_TF_OFF + 4 + 2
+                .equ    SP_TF_COLOR,  SP_TF_OFF + 8 + 3
+
+
+                .macro  TF8_ROW_BARE  color
+                .if  ((\color) & 1)
+                or.w    %d3,(%a3)+
+                .else
+                and.w   %d4,(%a3)+
+                .endif
+                .if  ((\color) & 2)
+                or.w    %d3,(%a3)+
+                .else
+                and.w   %d4,(%a3)+
+                .endif
+                .if  ((\color) & 4)
+                or.w    %d3,(%a3)+
+                .else
+                and.w   %d4,(%a3)+
+                .endif
+                .if  ((\color) & 8)
+                or.w    %d3,(%a3)+
+                .else
+                and.w   %d4,(%a3)+
+                .endif
+                .endm
+
+
+                .macro  TF8_ROW  color
+                TF8_ROW_BARE  \color
+                lea     152(%a3),%a3
+                .endm
+
+
+                .macro  TF8_BODY  color
+.Ltf8_body_\color:
+                TF8_ROW       \color           | row 0
+                TF8_ROW       \color           | row 1
+                TF8_ROW       \color           | row 2
+                TF8_ROW       \color           | row 3
+                TF8_ROW       \color           | row 4
+                TF8_ROW       \color           | row 5
+                TF8_ROW       \color           | row 6
+                TF8_ROW_BARE  \color           | row 7 (no trailing lea)
+                bra.w         .Ltf8_done
+                .endm
+
+
+                .globl  _surface68kStTileFill8x8
+
+_surface68kStTileFill8x8:
+                movem.l %d3-%d4/%a2-%a3,-(%sp)
+
+                move.l  SP_TF_PTR(%sp),%a3
+                move.w  SP_TF_MASK(%sp),%d3
+                move.w  %d3,%d4
+                not.w   %d4
+
+                | Color dispatch
+                moveq   #0,%d0
+                move.b  SP_TF_COLOR(%sp),%d0
+                and.w   #0x0F,%d0
+                add.w   %d0,%d0
+                add.w   %d0,%d0                | * 4 for bra.w table
+                lea     .Ltf8_table(%pc),%a2
+                jmp     0(%a2,%d0.w)
+
+.Ltf8_table:
+                bra.w   .Ltf8_body_0
+                bra.w   .Ltf8_body_1
+                bra.w   .Ltf8_body_2
+                bra.w   .Ltf8_body_3
+                bra.w   .Ltf8_body_4
+                bra.w   .Ltf8_body_5
+                bra.w   .Ltf8_body_6
+                bra.w   .Ltf8_body_7
+                bra.w   .Ltf8_body_8
+                bra.w   .Ltf8_body_9
+                bra.w   .Ltf8_body_10
+                bra.w   .Ltf8_body_11
+                bra.w   .Ltf8_body_12
+                bra.w   .Ltf8_body_13
+                bra.w   .Ltf8_body_14
+                bra.w   .Ltf8_body_15
+
+                TF8_BODY  0
+                TF8_BODY  1
+                TF8_BODY  2
+                TF8_BODY  3
+                TF8_BODY  4
+                TF8_BODY  5
+                TF8_BODY  6
+                TF8_BODY  7
+                TF8_BODY  8
+                TF8_BODY  9
+                TF8_BODY  10
+                TF8_BODY  11
+                TF8_BODY  12
+                TF8_BODY  13
+                TF8_BODY  14
+                TF8_BODY  15
+
+.Ltf8_done:
+                movem.l (%sp)+,%d3-%d4/%a2-%a3
+                rts
+
+
 | ---- surface68kStFillRectMulti -------------------------------------
 |
 | Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
@@ -782,6 +905,21 @@ frmRightMaskLut:
                 .word   0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
 
 
+                .align  2
+| Shared y -> y*160 LUT, 200 words = 400 bytes (y = 0..199 only; index
+| with y*2). Used by drawLine (DL_PLOT), drawCircle (YP_REC), fillCircle
+| (SPAN_BODY). Replaces a 44-cyc lsl.w #5 + lsl.w #7 + add.w shift chain
+| with a 14-cyc indexed-word load. Exported so circle.s and fillCircle.s
+| can reference it via absolute addressing without duplication.
+                .globl  _gStRowOffsetLut
+_gStRowOffsetLut:
+                .set    li_y, 0
+                .rept   200
+                .word   li_y * 160
+                .set    li_y, li_y + 1
+                .endr
+
+
 | ---- surface68kStLongFill ----------------------------------------
 |
 | Bulk long-fill helper for full-row fills (surfaceClear, fillRect
diff --git a/src/port/atarist/spriteAsm.s b/src/port/atarist/spriteAsm.s
index b1b233c..97969c8 100644
--- a/src/port/atarist/spriteAsm.s
+++ b/src/port/atarist/spriteAsm.s
@@ -1,30 +1,19 @@
-| ST byte-aligned sprite save / restore via 256-entry plane-spread
-| LUT. The LUT entry for each plane byte value is a 32-bit "spread"
-| where each plane byte bit lands at the corresponding plane-0 bit
-| position of the 4-byte chunky output. For plane N, we shift the
-| LUT entry left by N to put bits at the plane-N positions, then OR
-| the 4 plane contributions together to get the chunky long.
-|
-| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut
-| in hal.c:
-|
-|   gStPlaneSpreadLut[b] for plane byte b:
-|     bit i of b (i = 0 = MSB = leftmost pixel) maps to bit
-|     bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4)
-|     of the long. Plane 0's bits land at nibble bit 0 of each
-|     chunky byte; left-shift the LUT entry by N for plane N.
+| ST byte-aligned sprite save / restore. Buffer holds raw plane
+| bytes: per row, for each of the w/8 tile cols, the plane 0/1/2/3
+| bytes back to back. The inner per-tile-col macro is 4 byte copies
+| (no chunky <-> planar conversion; the buffer is already planar).
 |
 | ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
 |
 |   void surface68kStSpriteSaveByteAligned(uint8_t *base,
 |                                          uint16_t x, uint16_t y,
 |                                          uint16_t w, uint16_t h,
-|                                          uint8_t *dstChunky);
+|                                          uint8_t *dstPlaneBytes);
 |
 |   void surface68kStSpriteRestoreByteAligned(uint8_t *base,
 |                                             uint16_t x, uint16_t y,
 |                                             uint16_t w, uint16_t h,
-|                                             const uint8_t *srcChunky);
+|                                             const uint8_t *srcPlaneBytes);
 
                 .text
 
@@ -36,19 +25,12 @@
                 .equ    SP_Y,       SP_OFF + 8 + 2
                 .equ    SP_W,       SP_OFF + 12 + 2
                 .equ    SP_H,       SP_OFF + 16 + 2
-                .equ    SP_CHUNKY,  SP_OFF + 20
-                .equ    SP_LUT,     SP_OFF + 24
+                .equ    SP_BUF,     SP_OFF + 20
 
 
 | Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
 | a0 -> plane 0 byte (high or low half), strides 2 to next plane
 | a1 -> output planar bytes (advanced by 4)
-| a2 -> unused (LUT no longer needed)
-|
-| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
-| plane-major bytes (per row: plane0, plane1, plane2, plane3 per
-| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
-| lookups + shifts + ORs.
 
                 .macro  SAVE_TILECOL
                 move.b  (%a0),(%a1)+           | plane 0
@@ -64,13 +46,7 @@ _surface68kStSpriteSaveByteAligned:
                 movem.l %d2-%d7/%a2-%a6,-(%sp)
 
                 move.l  SP_BASE(%sp),%a3
-                move.l  SP_CHUNKY(%sp),%a1
-                | LUT pointer comes in via stack arg -- guaranteed
-                | long-aligned because gcc passes ptr args via
-                | move.l on a long-aligned sp slot. Avoids the BSS
-                | misalignment problem on TOS .PRG (BSS pads only to
-                | 2 bytes, even uint32_t slots can land at mod-4 = 2).
-                move.l  SP_LUT(%sp),%a2
+                move.l  SP_BUF(%sp),%a1
 
                 move.w  SP_W(%sp),%d5
                 lsr.w   #3,%d5                 | d5 = tileCols
@@ -128,10 +104,6 @@ _surface68kStSpriteSaveByteAligned:
 | Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
 | a0 -> plane 0 byte (high or low half)
 | a1 -> input planar bytes (advanced by 4)
-| a2 -> unused (LUT no longer needed)
-|
-| Phase 10.5: dropped chunky -> planar conversion. Buffer layout
-| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col.
 
                 .macro  RESTORE_TILECOL
                 move.b  (%a1)+,(%a0)           | plane 0
@@ -147,8 +119,7 @@ _surface68kStSpriteRestoreByteAligned:
                 movem.l %d2-%d7/%a2-%a6,-(%sp)
 
                 move.l  SP_BASE(%sp),%a3
-                move.l  SP_CHUNKY(%sp),%a1
-                move.l  SP_LUT(%sp),%a2        | gC2pLut passed in
+                move.l  SP_BUF(%sp),%a1
 
                 | tileCols is held in a5 (not d5) because the macro
                 | trashes d5 (uses it for pb3).
@@ -200,3 +171,151 @@ _surface68kStSpriteRestoreByteAligned:
 
                 movem.l (%sp)+,%d2-%d7/%a2-%a6
                 rts
+
+
+| ---- surface68kStSprite16x16Save / Restore -----------------------
+|
+| Specialized 16x16 sprite save/restore: 16 rows fully unrolled,
+| 8 byte copies per row (2 tile cols), no col loop. Drops the asm
+| walker's per-row col-init + col-loop-check overhead.
+|
+|   void surface68kStSprite16x16Save(uint8_t *base,
+|                                    uint16_t x, uint16_t y,
+|                                    uint8_t *dstBuf);
+|
+|   void surface68kStSprite16x16Restore(uint8_t *base,
+|                                       uint16_t x, uint16_t y,
+|                                       const uint8_t *srcBuf);
+|
+| Caller guarantees x is byte-aligned (x mod 8 == 0). Two halfOff
+| variants dispatch on (x & 8): halfOff=0 reads/writes within one
+| group (offsets 0/2/4/6 high half + 1/3/5/7 low half). halfOff=1
+| spans two groups (low half of group N + high half of group N+1).
+
+                .equ    SP16_SAVED, 12         | d2/a2-a3 = 3 longs
+                .equ    SP16_OFF,         (SP16_SAVED + 4)
+                .equ    SP16_BASE,    SP16_OFF + 0
+                .equ    SP16_X,       SP16_OFF + 4 + 2
+                .equ    SP16_Y,       SP16_OFF + 8 + 2
+                .equ    SP16_BUF,     SP16_OFF + 12
+
+
+| Macro: setup a0 = base + y*160 + group*8 + halfOff (row start).
+| Out: d0 = halfOff. Trashes d1, d2, a3. Args are read via SP16_*(sp).
+
+                .macro  SP16_SETUP_A0
+                move.l  SP16_BASE(%sp),%a3     | a3 = base
+                move.w  SP16_X(%sp),%d0        | d0 = x
+                move.w  SP16_Y(%sp),%d1        | d1 = y
+
+                | a0 = base + y*160, built as y*32 + y*128
+                ext.l   %d1
+                move.l  %d1,%d2
+                lsl.l   #5,%d1                 | d1 = y*32
+                lsl.l   #7,%d2                 | d2 = y*128
+                add.l   %d2,%d1                | d1 = y*160
+                lea     0(%a3,%d1.l),%a0
+
+                | a0 += (x>>4) * 8   (group = x>>4)
+                move.w  %d0,%d1
+                lsr.w   #4,%d1
+                lsl.w   #3,%d1
+                ext.l   %d1
+                add.l   %d1,%a0
+
+                | a0 += halfOff (= (x & 8) >> 3)
+                and.w   #8,%d0
+                lsr.w   #3,%d0
+                ext.l   %d0
+                add.l   %d0,%a0
+                | d0 = halfOff (0 or 1) for downstream dispatch
+                .endm
+
+
+                .globl  _surface68kStSprite16x16Save
+
+_surface68kStSprite16x16Save:
+                movem.l %d2/%a2-%a3,-(%sp)     | 12 bytes (SP16_SAVED); a2 unused
+                SP16_SETUP_A0
+                move.l  SP16_BUF(%sp),%a1      | a1 = dstBuf (16 rows x 8 bytes)
+
+                tst.w   %d0                    | d0 = halfOff from the macro
+                bne.w   .Lsp16s_low            | halfOff=1: spans two groups
+
+                | halfOff=0: a0 at high half. Col 0 = high (offsets
+                | 0,2,4,6); col 1 = low (offsets 1,3,5,7).
+                .rept   16
+                move.b  (%a0),(%a1)+           | col 0, plane 0
+                move.b  2(%a0),(%a1)+          | col 0, plane 1
+                move.b  4(%a0),(%a1)+          | col 0, plane 2
+                move.b  6(%a0),(%a1)+          | col 0, plane 3
+                move.b  1(%a0),(%a1)+          | col 1, plane 0
+                move.b  3(%a0),(%a1)+          | col 1, plane 1
+                move.b  5(%a0),(%a1)+          | col 1, plane 2
+                move.b  7(%a0),(%a1)+          | col 1, plane 3
+                lea     160(%a0),%a0           | next row (160-byte stride)
+                .endr
+                bra.w   .Lsp16s_done
+
+.Lsp16s_low:
+                | halfOff=1: a0 at low half (group base + 1). Col 0 =
+                | low of this group, offsets 0,2,4,6 from a0. Col 1 =
+                | high of next group, at offsets 7,9,11,13 from a0.
+                .rept   16
+                move.b  (%a0),(%a1)+           | col 0, plane 0
+                move.b  2(%a0),(%a1)+          | col 0, plane 1
+                move.b  4(%a0),(%a1)+          | col 0, plane 2
+                move.b  6(%a0),(%a1)+          | col 0, plane 3
+                move.b  7(%a0),(%a1)+          | col 1, plane 0
+                move.b  9(%a0),(%a1)+          | col 1, plane 1
+                move.b  11(%a0),(%a1)+         | col 1, plane 2
+                move.b  13(%a0),(%a1)+         | col 1, plane 3
+                lea     160(%a0),%a0           | next row (160-byte stride)
+                .endr
+
+.Lsp16s_done:
+                movem.l (%sp)+,%d2/%a2-%a3     | must mirror the entry movem
+                rts
+
+
+                .globl  _surface68kStSprite16x16Restore
+
+_surface68kStSprite16x16Restore:
+                movem.l %d2/%a2-%a3,-(%sp)     | 12 bytes (SP16_SAVED); a2 unused
+                SP16_SETUP_A0
+                move.l  SP16_BUF(%sp),%a1      | a1 = srcBuf (16 rows x 8 bytes)
+
+                tst.w   %d0                    | d0 = halfOff from the macro
+                bne.w   .Lsp16r_low            | halfOff=1: spans two groups
+
+                | halfOff=0: write high half (col 0) + low half (col 1).
+                .rept   16
+                move.b  (%a1)+,(%a0)           | col 0, plane 0
+                move.b  (%a1)+,2(%a0)          | col 0, plane 1
+                move.b  (%a1)+,4(%a0)          | col 0, plane 2
+                move.b  (%a1)+,6(%a0)          | col 0, plane 3
+                move.b  (%a1)+,1(%a0)          | col 1, plane 0
+                move.b  (%a1)+,3(%a0)          | col 1, plane 1
+                move.b  (%a1)+,5(%a0)          | col 1, plane 2
+                move.b  (%a1)+,7(%a0)          | col 1, plane 3
+                lea     160(%a0),%a0           | next row (160-byte stride)
+                .endr
+                bra.w   .Lsp16r_done
+
+.Lsp16r_low:
+                | halfOff=1: col 0 = low half, col 1 = next group's high.
+                .rept   16
+                move.b  (%a1)+,(%a0)           | col 0, plane 0
+                move.b  (%a1)+,2(%a0)          | col 0, plane 1
+                move.b  (%a1)+,4(%a0)          | col 0, plane 2
+                move.b  (%a1)+,6(%a0)          | col 0, plane 3
+                move.b  (%a1)+,7(%a0)          | col 1, plane 0
+                move.b  (%a1)+,9(%a0)          | col 1, plane 1
+                move.b  (%a1)+,11(%a0)         | col 1, plane 2
+                move.b  (%a1)+,13(%a0)         | col 1, plane 3
+                lea     160(%a0),%a0           | next row (160-byte stride)
+                .endr
+
+.Lsp16r_done:
+                movem.l (%sp)+,%d2/%a2-%a3     | must mirror the entry movem
+                rts
diff --git a/src/port/dos/hal.c b/src/port/dos/hal.c
index d1ca693..7b91f58 100644
--- a/src/port/dos/hal.c
+++ b/src/port/dos/hal.c
@@ -614,12 +614,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
 
 
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
     return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }
 
 
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
     return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }
 
diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c
index 237fcab..a41a151 100644
--- a/src/port/iigs/hal.c
+++ b/src/port/iigs/hal.c
@@ -395,12 +395,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
 
 
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
     return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }
 
 
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
     return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }