ST is more or less parity.

2026-05-04 11:06:41 -05:00 · 2026-05-04 11:06:41 -05:00 · cf6ae093d3
commit cf6ae093d3
parent 818dc801db
15 changed files with 966 additions and 1062 deletions
--- a/README.md
+++ b/README.md
@ -59,6 +59,332 @@ build/<plat>/         per-target build outputs
 ```
 ## Public API
 Game code includes a single umbrella header:
 ```c
 #include <joey/joey.h>
 ```
 That pulls in every public surface listed below. Full documentation
 lives in the per-feature headers under `include/joey/`; what follows
 is a quick reference. Every entry point is plain C, no C++ extensions.
 ### Lifecycle (`joey/core.h`)
 ```c
 typedef struct {
    HostModeE hostMode;       // HOST_MODE_TAKEOVER or HOST_MODE_OS
    uint32_t  codegenBytes;   // runtime compiled-sprite cache size
    uint16_t  maxSurfaces;    // maximum concurrent surfaces
    uint32_t  audioBytes;     // audio sample / module RAM pool
    uint32_t  assetBytes;     // tileset / sprite / map RAM pool
 } JoeyConfigT;
 bool        joeyInit         (const JoeyConfigT *config);
 void        joeyShutdown     (void);
 const char *joeyLastError    (void);
 const char *joeyPlatformName (void);
 const char *joeyVersionString(void);
 void        joeyWaitVBL      (void);     // block until next VBL
 uint16_t    joeyFrameCount   (void);     // monotonic 16-bit frame counter
 uint16_t    joeyFrameHz      (void);     // 50 / 60 / 70 depending on port
 ```
 ### Surfaces (`joey/surface.h`)
 All surfaces are 320x200 4bpp packed (high nibble = left pixel) with
 a 200-entry SCB table and 16 palettes of 16 `$0RGB` colors.
 ```c
 #define SURFACE_WIDTH               320
 #define SURFACE_HEIGHT              200
 #define SURFACE_BYTES_PER_ROW       160
 #define SURFACE_PIXELS_SIZE         (SURFACE_BYTES_PER_ROW * SURFACE_HEIGHT)
 #define SURFACE_PALETTE_COUNT       16
 #define SURFACE_COLORS_PER_PALETTE  16
 typedef struct SurfaceT SurfaceT;     // opaque
 SurfaceT *surfaceCreate (void);
 void      surfaceDestroy(SurfaceT *s);
 SurfaceT *stageGet      (void);                              // library back-buffer
 void      surfaceCopy   (SurfaceT *dst, const SurfaceT *src);
 bool      surfaceSaveFile(const SurfaceT *src, const char *path);
 bool      surfaceLoadFile(SurfaceT       *dst, const char *path);
 uint32_t  surfaceHash    (const SurfaceT *s);                // FNV-1a of logical pixels
 ```
 `surfaceSaveFile` writes the surface in **target-native** form. Files
 are NOT cross-port portable; the asset pipeline handles conversion.
 ### Drawing (`joey/draw.h`)
 All primitives clip to the surface; off-surface coords are silent
 no-ops. Color 0 is plotted normally (use the masked variants if you
 need transparency).
 ```c
 void surfaceClear      (SurfaceT *s, uint8_t color);
 void drawPixel         (SurfaceT *s, int16_t x, int16_t y, uint8_t color);
 uint8_t samplePixel    (const SurfaceT *s, int16_t x, int16_t y);
 void drawLine          (SurfaceT *s, int16_t x0, int16_t y0,
                        int16_t x1, int16_t y1, uint8_t color);
 void drawRect          (SurfaceT *s, int16_t x, int16_t y,
                        uint16_t w, uint16_t h, uint8_t color);
 void fillRect          (SurfaceT *s, int16_t x, int16_t y,
                        uint16_t w, uint16_t h, uint8_t color);
 void drawCircle        (SurfaceT *s, int16_t cx, int16_t cy,
                        uint16_t r, uint8_t color);
 void fillCircle        (SurfaceT *s, int16_t cx, int16_t cy,
                        uint16_t r, uint8_t color);
 void floodFill         (SurfaceT *s, int16_t x, int16_t y, uint8_t newColor);
 void floodFillBounded  (SurfaceT *s, int16_t x, int16_t y,
                        uint8_t newColor, uint8_t boundaryColor);
 void surfaceBlit       (SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y);
 void surfaceBlitMasked (SurfaceT *dst, const JoeyAssetT *src,
                        int16_t x, int16_t y, uint8_t transparentIndex);
 ```
 ### Palette and SCB (`joey/palette.h`)
 Colors are 12-bit `$0RGB`. Color 0 of every palette is forced to
 black on `paletteSet`. Each scanline picks one of the 16 palettes
 via the SCB.
 ```c
 void    paletteSet  (SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16);
 void    paletteGet  (const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16);
 void    scbSet      (SurfaceT *s, uint16_t line, uint8_t paletteIndex);
 void    scbSetRange (SurfaceT *s, uint16_t firstLine, uint16_t lastLine,
                     uint8_t paletteIndex);
 uint8_t scbGet      (const SurfaceT *s, uint16_t line);
 ```
 ### Tiles (`joey/tile.h`)
 A "tile" is just an 8x8-aligned region of any surface. The API moves
 32-byte chunks between surfaces and provides a small `TileT` value
 type so callers can stash a copy without allocating a scratch surface.
 ```c
 #define TILE_PIXELS_PER_SIDE  8
 #define TILE_BYTES_PER_ROW    4
 #define TILE_BYTES            (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE)
 #define TILE_BLOCKS_PER_ROW   (SURFACE_WIDTH  / TILE_PIXELS_PER_SIDE)  // 40
 #define TILE_BLOCKS_PER_COL   (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE)  // 25
 #define TILE_NO_GLYPH         ((uint16_t)0xFFFFu)
 typedef struct TileT { uint8_t pixels[TILE_BYTES]; } TileT;
 void tileCopy       (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
                     const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
 void tileCopyMasked (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
                     const SurfaceT *src, uint8_t srcBx, uint8_t srcBy,
                     uint8_t transparentIndex);
 void tileFill       (SurfaceT *s,   uint8_t bx,    uint8_t by,    uint8_t color);
 void tileSnap       (const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out);
 void tilePaste      (SurfaceT *dst, uint8_t bx, uint8_t by,       const TileT *in);
 void drawText       (SurfaceT *dst, uint8_t bx, uint8_t by,
                     const SurfaceT *fontSurface, const uint16_t *asciiMap,
                     const char *str);
 ```
 ### Sprites (`joey/sprite.h`)
 Rectangles of 8x8 tiles drawn at arbitrary pixel positions with
 color-0 transparency. Tile data is `widthTiles * heightTiles * 32`
 bytes, tile-major 4bpp packed. Sprites can be runtime-compiled
 into per-shift code variants for fast draws.
 ```c
 typedef enum { SPRITE_FLAGS_NONE = 0 } SpriteFlagsE;
 typedef struct SpriteT SpriteT;            // opaque
 typedef struct {
    SpriteT  *sprite;
    int16_t   x, y;
    uint16_t  width, height;               // pixels
    uint8_t  *bytes;                       // caller-owned save-under buffer
    uint16_t  sizeBytes;
 } SpriteBackupT;
 SpriteT *spriteCreate            (const uint8_t *tileData,
                                  uint8_t widthTiles, uint8_t heightTiles,
                                  SpriteFlagsE flags);
 SpriteT *spriteCreateFromSurface (const SurfaceT *src, int16_t x, int16_t y,
                                  uint8_t widthTiles, uint8_t heightTiles,
                                  SpriteFlagsE flags);
 SpriteT *spriteLoadFile          (const char *path, SpriteFlagsE flags);
 SpriteT *spriteFromCompiledMem   (const uint8_t *data, uint32_t length,
                                  SpriteFlagsE flags);
 bool     spriteSaveFile          (SpriteT *sp, const char *path);
 void     spriteDestroy           (SpriteT *sp);
 bool     spriteCompile           (SpriteT *sp);   // build per-shift fast path
 void     spritePrewarm           (SpriteT *sp);   // hint: compile if not already
 void     spriteDraw              (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y);
 void     spriteSaveUnder         (const SurfaceT *s, SpriteT *sp,
                                  int16_t x, int16_t y, SpriteBackupT *backup);
 void     spriteRestoreUnder      (SurfaceT *s, const SpriteBackupT *backup);
 void     spriteSaveAndDraw       (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y,
                                  SpriteBackupT *backup);
 void     spriteCompact           (void);          // defrag the codegen arena
 uint32_t spriteCodegenBytesUsed  (void);
 uint32_t spriteCodegenBytesTotal (void);
 ```
 ### Assets (`joey/asset.h`)
 Small bitmap blits with optional embedded palette, in `.jas` format.
 Use embedded `const JoeyAssetT` for ship-with-binary art; use the
 loaders for on-disk assets.
 ```c
 typedef struct {
    uint16_t       width;
    uint16_t       height;
    bool           hasPalette;
    uint16_t       palette[16];        // valid only if hasPalette
    const uint8_t *pixels;             // 4bpp packed, rowBytes = (width+1)/2
 } JoeyAssetT;
 JoeyAssetT *joeyAssetLoadFile     (const char *path);
 JoeyAssetT *joeyAssetFromMem      (const uint8_t *data, uint32_t length);
 void        joeyAssetFree         (JoeyAssetT *asset);
 void        joeyAssetApplyPalette (SurfaceT *dst, uint8_t paletteIndex,
                                   const JoeyAssetT *asset);
 ```
 ### Present (`joey/present.h`)
 ```c
 void stagePresent(void);
 ```
 Flips the dirty rows of the stage to the display, then clears the
 dirty state. Drawing primitives mark rows dirty as a side effect, so
 calling `stagePresent` once at end-of-frame is enough.
 ### Input (`joey/input.h`)
 Call `joeyInputPoll` once per frame, then query the state predicates.
 Edge predicates (`*Pressed`, `*Released`) fire only in the frame the
 transition happened.
 ```c
 typedef enum { /* KEY_NONE, KEY_A..KEY_Z, KEY_0..KEY_9, KEY_SPACE,
                  KEY_ESCAPE, KEY_RETURN, KEY_TAB, KEY_BACKSPACE,
                  KEY_UP/DOWN/LEFT/RIGHT, KEY_LSHIFT/RSHIFT/LCTRL/LALT,
                  KEY_F1..KEY_F10, KEY_COUNT */ } JoeyKeyE;
 typedef enum { MOUSE_BUTTON_NONE, MOUSE_BUTTON_LEFT, MOUSE_BUTTON_RIGHT,
               MOUSE_BUTTON_MIDDLE, MOUSE_BUTTON_COUNT } JoeyMouseButtonE;
 typedef enum { JOYSTICK_0, JOYSTICK_1, JOYSTICK_COUNT } JoeyJoystickE;
 typedef enum { JOY_BUTTON_0, JOY_BUTTON_1, JOY_BUTTON_COUNT } JoeyJoyButtonE;
 #define JOYSTICK_AXIS_MAX  127
 #define JOYSTICK_AXIS_MIN  (-127)
 void    joeyInputPoll        (void);
 void    joeyWaitForAnyKey    (void);
 bool    joeyKeyDown          (JoeyKeyE key);
 bool    joeyKeyPressed       (JoeyKeyE key);
 bool    joeyKeyReleased      (JoeyKeyE key);
 int16_t joeyMouseX           (void);
 int16_t joeyMouseY           (void);
 bool    joeyMouseDown        (JoeyMouseButtonE b);
 bool    joeyMousePressed     (JoeyMouseButtonE b);
 bool    joeyMouseReleased    (JoeyMouseButtonE b);
 bool    joeyJoystickConnected(JoeyJoystickE js);
 int8_t  joeyJoystickX        (JoeyJoystickE js);
 int8_t  joeyJoystickY        (JoeyJoystickE js);
 bool    joeyJoyDown          (JoeyJoystickE js, JoeyJoyButtonE b);
 bool    joeyJoyPressed       (JoeyJoystickE js, JoeyJoyButtonE b);
 bool    joeyJoyReleased      (JoeyJoystickE js, JoeyJoyButtonE b);
 void    joeyJoystickReset    (JoeyJoystickE js, uint8_t deadZone);
 ```
 ### Audio (`joey/audio.h`)
 4-channel ProTracker-style music plus four one-shot SFX slots. Module
 data must be the platform-native form produced by `tools/joeymod`
 (`.mod` for Amiga/DOS/ST; `.ntp` for IIgs; `.amod` if you want
 loop=false on Amiga). A failed `joeyAudioInit` is non-fatal; the rest
 of the API stays callable as no-ops.
 ```c
 #define JOEY_AUDIO_SFX_SLOTS  4
 bool joeyAudioInit          (void);
 void joeyAudioShutdown      (void);
 void joeyAudioPlayMod       (const uint8_t *data, uint32_t length, bool loop);
 void joeyAudioStopMod       (void);
 bool joeyAudioIsPlayingMod  (void);
 void joeyAudioPlaySfx       (uint8_t slot, const uint8_t *sample,
                             uint32_t length, uint16_t rateHz);
 void joeyAudioStopSfx       (uint8_t slot);
 void joeyAudioFrameTick     (void);
 ```
 ### Debug logging (`joey/debug.h`)
 Crash-tracing logger. Writes are buffered and durable across normal
 exit; call `joeyLogFlush` ahead of suspected hang points if you want
 a guaranteed last-line-on-disk.
 ```c
 void joeyLog     (const char *msg);
 void joeyLogF    (const char *fmt, ...);
 void joeyLogFlush(void);
 void joeyLogReset(void);
 ```
 Output goes to `joeylog.txt` in the program's working directory.
 ### Platform macros (`joey/platform.h`)
 The build system normally sets the platform via `-D`; auto-detection
 from compiler-predefined macros is a fallback. Game code can
 conditionally compile on these:
 ```
 JOEYLIB_PLATFORM_IIGS / _AMIGA / _ATARIST / _DOS   // exactly one defined
 JOEYLIB_CPU_65816 / _68000 / _X86
 JOEYLIB_ENDIAN_LITTLE / _BIG
 JOEYLIB_NATIVE_CHUNKY / _NATIVE_PLANAR
 JOEYLIB_HAS_BLITTER / _HAS_COPPER                  // Amiga only
 JOEYLIB_PLATFORM_NAME                              // human-readable string
 JOEYLIB_VERSION_MAJOR / _MINOR / _PATCH / _STRING
 ```
 ## License
 TBD.
--- a/scripts/dosbox-386sx16.conf
+++ b/scripts/dosbox-386sx16.conf
@ -0,0 +1,28 @@
 # DOSBox config: simulate an Intel 386SX-16 (1988), the slowest 386
 # desktop CPU JoeyLib could realistically be run on. Use this floor
 # to verify the DOS port still hits its frame budget on the bottom of
 # the 386 stack rather than coasting on host CPU.
 #
 # The 386SX is identical to the 386DX in instruction set; the only
 # difference is the 16-bit external bus (vs 32-bit on DX), which slows
 # memory-bound code. DOSBox does not model the bus split directly --
 # the cycles count below approximates the combined 386SX-16 throughput.
 #
 # Notes:
 #   core    = normal           accurate per-instruction cycles, not
 #                              recompiled-to-host (auto / dynamic would
 #                              defeat slow-CPU simulation).
 #   cputype = 386              386 instruction set (no 486 BSWAP /
 #                              CMPXCHG, no Pentium MMX).
 #   cycles  = fixed 2200       community-standard approximation for
 #                              386SX-16 throughput in DOSBox.
 #                              DOSBox-Staging deprecates this in favor
 #                              of cpu_cycles, but still accepts it.
 #                              Vanilla DOSBox and DOSBox-X only know
 #                              the old key, so 'cycles' stays for
 #                              cross-fork portability.
 [cpu]
 core    = normal
 cputype = 386
 cycles  = fixed 2200
--- a/scripts/run-dos.sh
+++ b/scripts/run-dos.sh
@ -18,6 +18,7 @@ fi
 prog=${1:-pattern}
 repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 bin_dir=$repo/build/dos/bin
 conf=$repo/scripts/dosbox-386sx16.conf
 file=${prog^^}.EXE
 if [[ ! -f "$bin_dir/$file" ]]; then
@ -34,7 +35,12 @@ fi
 # default capture-on-click behavior fights the VM's grab and mouse
 # input is unusable. On plain DOSBox this -set flag is unknown and is
 # logged once as a warning, then ignored -- harmless either way.
 #
 # -conf $conf locks the CPU to a simulated 386SX-16 (the slowest
 # realistic 386 desktop). DOSBox layers configs: anything not set in
 # our file falls back to the user's main dosbox.conf.
 exec dosbox \
    -conf "$conf" \
    -set "mouse_capture=seamless" \
    -c "C:" \
    -c "$file" \
--- a/src/core/hal.h
+++ b/src/core/hal.h
@ -140,15 +140,16 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
 //   s->pixels src->dst; on planar ports there is no chunky to copy
 //   (planes already covered by halSurfaceCopyPlanes). Chunky ports
 //   do the memcpy here; Amiga is a no-op.
-// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
+// halSurfaceLoadFile / halSurfaceSaveFile wrap fread / fwrite of the
-//   fwrite of the pixel data. Chunky ports stream directly to/from
+//   pixel data using each port's native pixel format (chunky on
-//   s->pixels; Amiga uses a scratch buffer + c2p (load) or
+//   IIgs/DOS, interleaved planar on ST, plane-major on Amiga). Files
-//   plane->chunky derivation (save).
+//   written by one port are NOT loadable by another -- conversion is
 //   the asset pipeline's job.
 uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
 uint32_t halSurfaceHash(const SurfaceT *s);
 void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp);
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp);
 // Present the dirty regions of the source surface to the display.
 // The cross-platform stagePresent walks the dirty arrays before
--- a/src/core/surface.c
+++ b/src/core/surface.c
@ -158,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
        fclose(fp);
        return false;
    }
-    if (!halSurfaceLoadFileChunky(dst, fp)) {
+    if (!halSurfaceLoadFile(dst, fp)) {
        fclose(fp);
        return false;
    }
@ -186,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
    if (fp == NULL) {
        return false;
    }
-    if (!halSurfaceSaveFileChunky(src, fp)) {
+    if (!halSurfaceSaveFile(src, fp)) {
        fclose(fp);
        return false;
    }
--- a/src/port/amiga/c2p.s
+++ b/src/port/amiga/c2p.s
@ -1,127 +0,0 @@
 | Amiga chunky-to-planar conversion -- 68000 hand-rolled.
 |
 | Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
 | 4 KB lookup table built once at HAL init: each (sourceByte, position,
 | plane) tuple maps to the plane-byte bit contribution that source
 | byte makes when it sits at that position within a 4-byte (8-pixel)
 | planar group going to that plane.
 |
 | Calling convention: m68k-amigaos-gcc cdecl.
 |   Args on stack at 4(sp), 8(sp), ...
 |   d2-d7, a2-a6 are callee-save.
 |   No return value.
 |
 | void chunkyToPlanarRow(const uint8_t *src,    ;  4(sp) - 4bpp packed source row
 |                        uint8_t       *p0,     ;  8(sp) - plane 0 dest row
 |                        uint8_t       *p1,     ; 12(sp) - plane 1 dest row
 |                        uint8_t       *p2,     ; 16(sp) - plane 2 dest row
 |                        uint8_t       *p3,     ; 20(sp) - plane 3 dest row
 |                        uint16_t       n,      ; 24(sp) - planar byte count (low word)
 |                        const uint8_t *lut);   ; 28(sp) - 4 KB LUT base
 |
 | LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
 | for source byte `src` sitting at byte-position `pos` (0..3) within
 | its 4-byte planar group, going to plane `plane` (0..3). All 16
 | (pos, plane) entries for one src byte are contiguous, so the inner
 | loop reaches every entry off (a5, d4.w) with an 8-bit displacement
 | (0..15) and never has to advance an index register.
 |
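 | Per planar byte we consume 4 source bytes (positions 0..3 of the
 | 8-pixel group). For each we compute d4 = src*16 with four add.w's
 | and OR the four plane contributions into d0..d3 with byte-displaced
 | (a5,d4.w) reads. NOTE(review): by the 68000 timing tables four
 | add.w Dn,Dn cost 16 cycles while a single lsl.w #4,Dn costs
 | 6+2*4 = 14, so the adds are not the faster form -- confirm.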
 |
 | GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
 | gcc driver.
                .text
                .globl  _chunkyToPlanarRow
 | Stack frame size of MOVEM.L block: d2-d7 (6) + a2-a6 (5) = 11 regs
 | * 4 bytes = 44 bytes. Args therefore start at the original sp+4
 | offset PLUS 44.
                .equ    SAVED_REGS_SIZE, 44
 | Convert one row of 4bpp packed pixels into four separate plane rows.
 | Each loop iteration reads 4 source bytes (8 pixels) and writes one
 | byte to each of the four plane destinations, using the LUT at a5 to
 | turn every source byte into its per-plane bit contribution.
 _chunkyToPlanarRow:
                | Save all callee-saved registers; this pushes
                | SAVED_REGS_SIZE bytes, so every stack argument below
                | is addressed at its cdecl offset + SAVED_REGS_SIZE.
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                move.l   4+SAVED_REGS_SIZE(%sp),%a0     | src
                move.l   8+SAVED_REGS_SIZE(%sp),%a1     | p0
                move.l  12+SAVED_REGS_SIZE(%sp),%a2     | p1
                move.l  16+SAVED_REGS_SIZE(%sp),%a3     | p2
                move.l  20+SAVED_REGS_SIZE(%sp),%a4     | p3
                | n is a uint16_t but GCC promotes to int and pushes a
                | full 4 bytes -- the low word lives at +2 in big-endian
                | layout.
                move.w  24+SAVED_REGS_SIZE+2(%sp),%d7   | planar byte count
                move.l  28+SAVED_REGS_SIZE(%sp),%a5     | LUT base
                subq.w  #1,%d7                          | DBRA: count-1
                bmi     .Ldone                          | nothing to do
                | One iteration = 4 source bytes in, 1 byte out per plane.
 .LbyteLoop:
                moveq   #0,%d0                          | plane 0 acc
                moveq   #0,%d1                          | plane 1 acc
                moveq   #0,%d2                          | plane 2 acc
                moveq   #0,%d3                          | plane 3 acc
                | ----- Source byte position 0 -----
                | moveq clears all of d4 so the byte load below leaves
                | a zero-extended value for the word-sized index math.
                moveq   #0,%d4
                move.b  (%a0)+,%d4                      | src[0]
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4                         | d4 = src * 16
                or.b      0(%a5,%d4.w),%d0              | pos0 plane0
                or.b      1(%a5,%d4.w),%d1              | pos0 plane1
                or.b      2(%a5,%d4.w),%d2              | pos0 plane2
                or.b      3(%a5,%d4.w),%d3              | pos0 plane3
                | ----- Source byte position 1 -----
                moveq   #0,%d4
                move.b  (%a0)+,%d4                      | src[1]
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4                         | d4 = src * 16
                or.b      4(%a5,%d4.w),%d0              | pos1 plane0
                or.b      5(%a5,%d4.w),%d1              | pos1 plane1
                or.b      6(%a5,%d4.w),%d2              | pos1 plane2
                or.b      7(%a5,%d4.w),%d3              | pos1 plane3
                | ----- Source byte position 2 -----
                moveq   #0,%d4
                move.b  (%a0)+,%d4                      | src[2]
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4                         | d4 = src * 16
                or.b      8(%a5,%d4.w),%d0              | pos2 plane0
                or.b      9(%a5,%d4.w),%d1              | pos2 plane1
                or.b     10(%a5,%d4.w),%d2              | pos2 plane2
                or.b     11(%a5,%d4.w),%d3              | pos2 plane3
                | ----- Source byte position 3 -----
                moveq   #0,%d4
                move.b  (%a0)+,%d4                      | src[3]
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4                         | d4 = src * 16
                or.b     12(%a5,%d4.w),%d0              | pos3 plane0
                or.b     13(%a5,%d4.w),%d1              | pos3 plane1
                or.b     14(%a5,%d4.w),%d2              | pos3 plane2
                or.b     15(%a5,%d4.w),%d3              | pos3 plane3
                | ----- Store plane bytes -----
                | Each accumulator now holds the 8 plane bits for this
                | 8-pixel group; post-increment advances every plane row.
                move.b  %d0,(%a1)+
                move.b  %d1,(%a2)+
                move.b  %d2,(%a3)+
                move.b  %d3,(%a4)+
                dbra    %d7,.LbyteLoop
                | Restore the registers saved on entry and return.
 .Ldone:
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
@ -115,69 +115,10 @@ static uint8_t  gCachedScb    [SURFACE_HEIGHT]
 static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4)));
 static bool     gCacheValid = false;
 // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
 // (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
 // the plane-byte bit contribution that source byte `src` makes to
 // plane `plane` when it sits at byte-position `pos` within a 4-byte
 // (8-pixel) planar group. The src-major layout lets the asm inner
 // loop reach all 16 (pos, plane) entries for a single src byte via
 // 8-bit displacements off (a5, d4.w) without any LEA between reads.
 static uint8_t  gC2pLut[4 * 1024];
 static bool     gC2pLutReady = false;
 static bool paletteOrScbChanged(const SurfaceT *src);
 static void initC2pLut(void);
 // Provided by src/port/amiga/c2p.s.
 extern void chunkyToPlanarRow(const uint8_t *src,
                              uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
                              uint16_t numPlanarBytes,
                              const uint8_t *lut);
 // ----- Internal helpers (alphabetical) -----
 // Fill gC2pLut, the 4 KB table read by chunkyToPlanarRow, exactly
 // once. Entries are emitted in table order (source byte, then byte
 // position 0..3 inside the 4-byte / 8-pixel group, then plane 0..3),
 // so entry [src*16 + pos*4 + plane] holds the bits that source byte
 // contributes to that plane's output byte:
 //
 //   - left pixel  (high nibble) lands on plane bit (7 - 2*pos)
 //   - right pixel (low  nibble) lands on plane bit (6 - 2*pos)
 static void initC2pLut(void) {
    uint16_t  value;
    uint16_t  slot;
    uint16_t  bitplane;
    uint8_t   leftNibble;
    uint8_t   rightNibble;
    uint8_t   leftMask;
    uint8_t   rightMask;
    uint8_t   contribution;
    uint8_t  *entry;
    if (gC2pLutReady) {
        return;
    }
    // Nested loop order matches the table layout, so a single running
    // write pointer visits every entry in index order.
    entry = gC2pLut;
    for (value = 0; value < 256; value++) {
        leftNibble  = (uint8_t)(value >> 4);
        rightNibble = (uint8_t)(value & 0x0F);
        for (slot = 0; slot < 4; slot++) {
            leftMask  = (uint8_t)(0x80u >> (2 * slot));
            rightMask = (uint8_t)(0x40u >> (2 * slot));
            for (bitplane = 0; bitplane < 4; bitplane++) {
                contribution = 0;
                if ((leftNibble >> bitplane) & 1) {
                    contribution |= leftMask;
                }
                if ((rightNibble >> bitplane) & 1) {
                    contribution |= rightMask;
                }
                *entry++ = contribution;
            }
        }
    }
    gC2pLutReady = true;
 }
 // (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own
 // per-row chunkyToPlanarRow loop -- the only code path that still
 // converts chunky to planar today, since asset loading is the only
 // surface mutation that doesn't go through a planar-aware primitive.)
 // Build a user copper list for per-scanline palette (SCB emulation).
 // One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is
 // stored in gNewUCL until installCopperList swaps it onto the screen.
@ -1358,35 +1299,6 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
 }
 /* Rebuild the four bitplanes of `s` from its chunky pixel buffer
 * (s->pixels), converting one scanline per chunkyToPlanarRow call.
 * Returns without touching anything when the surface has no port
 * data attached. */
 static void amigaPopulatePlanesFromChunky(SurfaceT *s) {
    AmigaPlanarT  *planar;
    const uint8_t *chunkyRow;
    UBYTE         *planeRow[4];
    int16_t        row;
    uint8_t        k;
    planar = (AmigaPlanarT *)s->portData;
    if (planar == NULL) {
        return;
    }
    /* The conversion table is built lazily on first use. */
    if (!gC2pLutReady) {
        initC2pLut();
    }
    row = 0;
    while (row < SURFACE_HEIGHT) {
        chunkyRow = s->pixels + row * SURFACE_BYTES_PER_ROW;
        /* Locate this scanline inside each of the four planes. */
        for (k = 0; k < 4; k++) {
            planeRow[k] = planar->planes[k] + (uint16_t)row * AMIGA_BYTES_PER_ROW;
        }
        chunkyToPlanarRow(chunkyRow,
                          planeRow[0], planeRow[1], planeRow[2], planeRow[3],
                          AMIGA_BYTES_PER_ROW, gC2pLut);
        row++;
    }
 }
 // Phase 6 planar dual-write for sprite draw. Walks the sprite's
 // chunky tile data with the same clipping the cross-platform code
 // applies, calling amigaPlanarSetPixel for every non-transparent
@ -2118,7 +2030,9 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
 /* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes
- * (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky.
+ * (per plane, 4 planes). Used by halSurfaceHash to fold the planar
 * surface into the same byte-stream the chunky ports hash, so cross-
 * port hash comparisons stay valid.
 * Walks 8 pixels per planar-byte column; per pixel assembles nibble
 * from 4 plane bits. Output: 4 chunky bytes per planar-byte column
 * (since 8 pixels = 4 chunky bytes at 2px/byte). */
@ -2204,62 +2118,35 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+// On-disk format is the Amiga's native plane-major buffer: planes
 // 0..3 written sequentially, AMIGA_PLANE_SIZE bytes each.
 bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    AmigaPlanarT *pd;
-    uint8_t      *scratch;
+    uint8_t       i;
    uint8_t      *srcLine;
    int16_t       y;
    UBYTE        *p0;
    UBYTE        *p1;
    UBYTE        *p2;
    UBYTE        *p3;
    bool          ok;
    pd = (AmigaPlanarT *)dst->portData;
    if (pd == NULL) {
        return false;
    }
-    /* fread the chunky file payload into a scratch buffer, then c2p
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
-     * directly into our planes. The scratch is a one-shot AllocMem
+        if (fread(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
-     * (PUBLIC, not chip) since chunkyToPlanarRow only reads it. */
+            return false;
    scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC);
    if (scratch == NULL) {
        return false;
    }
    ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
    if (ok) {
        if (!gC2pLutReady) {
            initC2pLut();
        }
        for (y = 0; y < SURFACE_HEIGHT; y++) {
            srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
            p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
            p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
            p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
            p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
            chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
        }
    }
-    FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE);
+    return true;
    return ok;
 }
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    AmigaPlanarT *pd;
-    uint8_t       chunkyRow[SURFACE_BYTES_PER_ROW];
+    uint8_t       i;
    int16_t       y;
    pd = (AmigaPlanarT *)src->portData;
    if (pd == NULL) {
        return false;
    }
-    /* Per row: derive chunky from planes, write 160 bytes. Less
+    for (i = 0; i < AMIGA_BITPLANES; i++) {
-     * efficient than a single fwrite of a full buffer but avoids
+        if (fwrite(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
     * needing a 32 KB scratch allocation. */
    for (y = 0; y < SURFACE_HEIGHT; y++) {
        amigaPlanesToChunkyRow(pd, y, chunkyRow);
        if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
            return false;
        }
    }
--- a/src/port/atarist/c2p.s
+++ b/src/port/atarist/c2p.s
@ -1,188 +0,0 @@
 | Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
 |
 | Drop-in replacement for hal.c's old c2pRow C inner loop. The C
 | version walked every pixel and built each plane word with a
 | run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
 | cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
 | codegen overhead. This rewrite uses a 4 KB lookup table built once
 | at HAL init: same layout as the Amiga c2p LUT, so the
 | (sourceByte, position, plane) -> 2-bit contribution mapping is
 | identical, but the routine packs results into ST word-interleaved
 | planar (4 plane words per 16-pixel group) instead of 4 separate
 | plane bytes.
 |
 | Each ST group is 8 source bytes -> 4 plane words. Source byte
 | positions 0..3 contribute to the HIGH byte of each plane word
 | (bits 15..8); positions 4..7 contribute to the LOW byte (bits
 | 7..0). Within a byte, the LUT for (src, bp%4, plane) already
 | places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
 | entries for both halves -- we just shift d0..d3 left by 8 between
 | the halves to move the high-half bits up before the low half ORs
 | into the now-empty low byte.
 |
 | Calling convention: m68k-atari-mint-gcc cdecl.
 |   Args on stack at 4(sp), 8(sp), ...
 |   d2-d7, a2-a6 are callee-save.
 |   No return value.
 |
 | void chunkyToPlanarRowSt(const uint8_t *src,    ;  4(sp) - 4bpp packed source row
 |                          uint16_t      *dst,    ;  8(sp) - planar dest row (uint16_t*)
 |                          uint16_t       groupStart, ; 12(sp) - first group index (low word)
 |                          uint16_t       groupEnd,   ; 16(sp) - one-past-last group index (low word)
 |                          const uint8_t *lut);   ; 20(sp) - 4 KB LUT base
 |
 | LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
 | contribution for source byte `src` at byte-position `pos` (0..3
 | within a 4-byte chunk) going to plane `plane` (0..3). All 16
 | (pos, plane) entries for one src byte are contiguous, so the inner
 | loop reaches every entry off (a5, d4.w) with an 8-bit displacement
 | (0..15) without LEA between reads.
 |
 | GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
 | the gcc driver.
                .text
                .globl  _chunkyToPlanarRowSt
 | MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes.
                .equ    SAVED_REGS_SIZE, 44
 _chunkyToPlanarRowSt:
                | Save all callee-save registers; every stack-argument
                | offset below is biased by SAVED_REGS_SIZE to skip them.
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                move.l   4+SAVED_REGS_SIZE(%sp),%a0     | src row base
                move.l   8+SAVED_REGS_SIZE(%sp),%a1     | dst (uint16_t*)
                | Both groupStart and groupEnd are uint16_t but GCC
                | promotes them to int and pushes 4 bytes each; the
                | low word lives at +2 in big-endian layout.
                move.w  12+SAVED_REGS_SIZE+2(%sp),%d6   | groupStart
                move.w  16+SAVED_REGS_SIZE+2(%sp),%d7   | groupEnd
                move.l  20+SAVED_REGS_SIZE(%sp),%a5     | LUT base
                | Advance src and dst to the first group's data.
                | Each group consumes 8 source bytes and produces 4
                | dest words (8 bytes), so both pointers advance by
                | groupStart * 8.
                move.w  %d6,%d4
                lsl.w   #3,%d4
                add.w   %d4,%a0
                add.w   %d4,%a1
                sub.w   %d6,%d7                         | groupCount = end - start
                subq.w  #1,%d7                          | DBRA bias
                | Negative after the bias means the range is empty:
                | skip the loop and fall through to the register restore.
                bmi     .Ldone
 .LgroupLoop:
                | One iteration = one 16-pixel group: 8 source bytes in,
                | 4 plane words out. d0..d3 accumulate planes 0..3.
                moveq   #0,%d0                          | plane 0 acc
                moveq   #0,%d1                          | plane 1 acc
                moveq   #0,%d2                          | plane 2 acc
                moveq   #0,%d3                          | plane 3 acc
                | ===== Source bytes 0..3 -> high byte of each plane word =====
                | Byte 0: LUT entries 0..3 (pos 0, planes 0..3).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4                         | d4 = src * 16
                or.b      0(%a5,%d4.w),%d0
                or.b      1(%a5,%d4.w),%d1
                or.b      2(%a5,%d4.w),%d2
                or.b      3(%a5,%d4.w),%d3
                | Byte 1: LUT entries 4..7 (pos 1, planes 0..3).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b      4(%a5,%d4.w),%d0
                or.b      5(%a5,%d4.w),%d1
                or.b      6(%a5,%d4.w),%d2
                or.b      7(%a5,%d4.w),%d3
                | Byte 2: LUT entries 8..11 (pos 2, planes 0..3).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b      8(%a5,%d4.w),%d0
                or.b      9(%a5,%d4.w),%d1
                or.b     10(%a5,%d4.w),%d2
                or.b     11(%a5,%d4.w),%d3
                | Byte 3: LUT entries 12..15 (pos 3, planes 0..3).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b     12(%a5,%d4.w),%d0
                or.b     13(%a5,%d4.w),%d1
                or.b     14(%a5,%d4.w),%d2
                or.b     15(%a5,%d4.w),%d3
                | Move accumulated bits into the HIGH byte of each word.
                | The shift also zeroes the low byte, so the or.b's below
                | start from an empty low half.
                lsl.w   #8,%d0
                lsl.w   #8,%d1
                lsl.w   #8,%d2
                lsl.w   #8,%d3
                | ===== Source bytes 4..7 -> low byte of each plane word =====
                | Byte 4: reuses the pos 0 LUT entries (0..3).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b      0(%a5,%d4.w),%d0
                or.b      1(%a5,%d4.w),%d1
                or.b      2(%a5,%d4.w),%d2
                or.b      3(%a5,%d4.w),%d3
                | Byte 5: reuses the pos 1 LUT entries (4..7).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b      4(%a5,%d4.w),%d0
                or.b      5(%a5,%d4.w),%d1
                or.b      6(%a5,%d4.w),%d2
                or.b      7(%a5,%d4.w),%d3
                | Byte 6: reuses the pos 2 LUT entries (8..11).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b      8(%a5,%d4.w),%d0
                or.b      9(%a5,%d4.w),%d1
                or.b     10(%a5,%d4.w),%d2
                or.b     11(%a5,%d4.w),%d3
                | Byte 7: reuses the pos 3 LUT entries (12..15).
                moveq   #0,%d4
                move.b  (%a0)+,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                add.w   %d4,%d4
                or.b     12(%a5,%d4.w),%d0
                or.b     13(%a5,%d4.w),%d1
                or.b     14(%a5,%d4.w),%d2
                or.b     15(%a5,%d4.w),%d3
                | Store 4 plane words.
                | Written in plane order 0,1,2,3 -- the ST word-interleaved
                | group layout described in the file header.
                move.w  %d0,(%a1)+
                move.w  %d1,(%a1)+
                move.w  %d2,(%a1)+
                move.w  %d3,(%a1)+
                | d7 holds groupCount - 1; dbra loops until it wraps to -1.
                dbra    %d7,.LgroupLoop
 .Ldone:
                | Restore the registers saved on entry and return (no result).
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts
--- a/src/port/atarist/circle.s
+++ b/src/port/atarist/circle.s
@ -82,11 +82,9 @@
                .macro  YP_REC  slot, signOp, yreg
                move.l  %a4,%d6
                \signOp\().w \yreg,%d6         | d6.w = yp
-                move.w  %d6,%d0
+                add.w   %d6,%d6                | * 2 for word index
-                lsl.w   #5,%d6                 | d6 = yp << 5
+                move.w  (%a6,%d6.w),%d6        | yLut[yp] = yp * 160
-                lsl.w   #7,%d0                 | d0 = yp << 7
+                move.w  %d6,\slot(%sp)
                add.w   %d6,%d0                | d0 = yp * 160
                move.w  %d0,\slot(%sp)
                .endm
@ -223,14 +221,21 @@ _surface68kStCircleOutline:
                moveq   #1,%d4
                sub.w   %d2,%d4                | err = 1 - bx
                | a6 = yLut base (yp -> yp*160). Lookup is faster than
                | the 4 cyc + 4 cyc + 18 cyc + 22 cyc + 4 cyc shift+add
                | chain we used to do per YP_REC. Saved across all 4
                | YP_RECs per Bresenham iter (~120 cyc/iter).
                | Shared LUT lives in lineSpan.s; reference absolute.
                lea     _gStRowOffsetLut,%a6
                | Dispatch on color (low 4 bits) -> one of 16 main loops.
                moveq   #0,%d6
                move.b  SP_COLOR(%sp),%d6
                and.w   #0x0F,%d6
                add.w   %d6,%d6
                add.w   %d6,%d6                | * 4 for bra.w table
-                lea     .LcoStTable(%pc),%a6
+                lea     .LcoStTable(%pc),%a2
-                jmp     0(%a6,%d6.w)
+                jmp     0(%a2,%d6.w)
 .LcoStTable:
                bra.w   .LcoStLoop_0
@ -280,3 +285,4 @@ bitMaskWordLut:
                .word   0x0800, 0x0400, 0x0200, 0x0100
                .word   0x0080, 0x0040, 0x0020, 0x0010
                .word   0x0008, 0x0004, 0x0002, 0x0001
 | (yLut now lives in lineSpan.s as the shared _gStRowOffsetLut)
--- a/src/port/atarist/fillCircle.s
+++ b/src/port/atarist/fillCircle.s
@ -9,28 +9,16 @@
 | Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r)
 | is fully on-surface. Off-surface circles fall back to the C walker.
 |
 | Phase 10 final: 16-way color dispatch at the OUTER loop. Each color
 | variant has its own Bresenham body where SPAN_BODY inlines a hard-
 | coded 4-plane mask RMW (no btst, no bsr/rts). Saves ~120 cyc per
 | applyMask call (was ~180 via bsr applyMask with runtime btst on d7).
 |
 | ABI: cdecl. d2-d7/a2-a6 callee-save.
 |
 |   void surface68kStFillCircle(uint8_t *base,
 |                               uint16_t cx, uint16_t cy,
 |                               uint16_t r,  uint8_t  color);
 |
 | Register allocation across the loop:
 |   d2.w = bx (Bresenham, starts at r)
 |   d3.w = by (Bresenham, starts at 0)
 |   d4.w = err
 |   d5.l = loLong (planes 0+1 long template)
 |   d6.l = hiLong (planes 2+3 long template)
 |   d7.b = color (low nibble; tested via btst)
 |   a3   = base
 |   a4   = scratch / current group pointer
 |   d0,d1 = scratch
 |
 | Stack scratch (8 bytes at 0(sp)..7(sp)):
 |   0..1  leftMask  (word; per pair)
 |   2..3  rightMask (word; per pair)
 |   4..5  numGroups (word; per pair)
 |   6..7  groupFirstByteOff (word; per pair)
                .text
@ -42,7 +30,7 @@
                .equ    SP_FC_CX,      SP_FC_OFF + 4 + 2
                .equ    SP_FC_CY,      SP_FC_OFF + 8 + 2
                .equ    SP_FC_R,       SP_FC_OFF + 12 + 2
-                .equ    SP_FC_COLOR,   SP_FC_OFF + 16 + 3
+                .equ    SP_FC_COLOR,   SP_FC_OFF + 20 + 3
 | ---- COMPUTE_PAIR_MASKS macro -----------------------------------
@ -50,18 +38,15 @@
 | Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
 |         6(sp) groupFirstByteOff
 | Trashes: d0, d1
 | (No labels: straightline.)
                .macro  COMPUTE_PAIR_MASKS
                move.w  %d0,0(%sp)             | stash left
                move.w  %d1,2(%sp)             | stash right
                | groupFirst & groupFirstByteOff
                move.w  %d0,%d1
                lsr.w   #4,%d1                 | groupFirst
                move.w  %d1,%d0
                lsl.w   #3,%d0                 | groupFirstByteOff
                move.w  %d0,6(%sp)
                | numGroups = (right >> 4) - groupFirst
                move.w  2(%sp),%d0
                lsr.w   #4,%d0                 | groupLast
                sub.w   %d1,%d0                | numGroups
@ -81,25 +66,53 @@
                .endm
-| ---- SPAN_BODY macro --------------------------------------------
+| ---- APPLY_MASK_INLINE macro ------------------------------------
-| Render one row span using the pair masks at 0(sp)..7(sp).
+| 4-plane mask RMW with HARDCODED color. a4 advances by 8 (postinc).
-| Input:  d0.w = y (signed)
+| Inputs:  d0.w = mask, a4 = group ptr
-|         a3 = base, d5 = loLong, d6 = hiLong, d7 = color
+| Trashes: d1 (notMask scratch)
 | Trashes: d0, d1, a4
 | Macro takes an idx parameter for unique labels.
-                .macro  SPAN_BODY
+                .macro  APPLY_MASK_INLINE  color
-                | a4 = base + y*160
+                move.w  %d0,%d1
-                ext.l   %d0
+                not.w   %d1
-                move.l  %d0,%d1
+                .if  ((\color) & 1)
-                lsl.l   #5,%d0
+                or.w    %d0,(%a4)+
-                lsl.l   #7,%d1
+                .else
-                add.l   %d1,%d0                | y*160
+                and.w   %d1,(%a4)+
-                lea     0(%a3,%d0.l),%a4
+                .endif
-                | a4 += groupFirstByteOff
+                .if  ((\color) & 2)
-                moveq   #0,%d0
+                or.w    %d0,(%a4)+
-                move.w  6(%sp),%d0
+                .else
-                add.l   %d0,%a4
+                and.w   %d1,(%a4)+
                .endif
                .if  ((\color) & 4)
                or.w    %d0,(%a4)+
                .else
                and.w   %d1,(%a4)+
                .endif
                .if  ((\color) & 8)
                or.w    %d0,(%a4)+
                .else
                and.w   %d1,(%a4)+
                .endif
                .endm
 | ---- SPAN_BODY macro --------------------------------------------
 | Render one row span. Color hardcoded.
 | Input:  d0.w = y (signed)
 |         a3 = base, d5 = loLong, d6 = hiLong
 |         masks at 0..7(sp): leftMask, rightMask, numGroups, groupFirstByteOff
 | Trashes: d0, d1, a4
                .macro  SPAN_BODY  color
                | a4 = base + y*160 + groupFirstByteOff
                | y*160 via shared _gStRowOffsetLut (a2 holds lut base).
                | byteOff (y*160 + groupFirstByteOff) fits in 16 bits
                | (max 31992), so word-only ops + .w-indexed lea.
                add.w   %d0,%d0                | y * 2 (word index)
                move.w  (%a2,%d0.w),%d0        | d0 = y * 160
                add.w   6(%sp),%d0             | + groupFirstByteOff
                lea     0(%a3,%d0.w),%a4
                | numGroups in d1
                move.w  4(%sp),%d1
                tst.w   %d1
@ -107,15 +120,14 @@
                | single-group: combinedMask = leftMask & rightMask
                move.w  0(%sp),%d0
                and.w   2(%sp),%d0
-                bsr     .Lfc_applyMask
+                APPLY_MASK_INLINE \color
                bra.w   .Lsb_done\@
 .Lsb_multi\@:
-                | leading mask. applyMask postinc-advances a4 by 8
+                | leading mask. APPLY_MASK_INLINE postinc-advances a4 by 8.
-                | (the 4 plane RMWs each advance by 2 via (a4)+).
+                | APPLY trashes d1, so reload numGroups after.
                | applyMask trashes d1, so reload numGroups after bsr.
                move.w  0(%sp),%d0
-                bsr     .Lfc_applyMask
+                APPLY_MASK_INLINE \color
-                move.w  4(%sp),%d1             | reload numGroups
+                move.w  4(%sp),%d1
                subq.w  #1,%d1                 | d1 = numMid
                beq.s   .Lsb_skipMid\@
 .Lsb_midLoop\@:
@ -126,11 +138,71 @@
 .Lsb_skipMid\@:
                | trailing mask
                move.w  2(%sp),%d0
-                bsr     .Lfc_applyMask
+                APPLY_MASK_INLINE \color
 .Lsb_done\@:
                .endm
 | ---- CO_BODY macro: per-color full Bresenham loop body ----------
 | Expands to one complete circle-fill loop for a single color value.
 | \color is pasted into the loop labels (so each expansion gets its
 | own .Lfc_loop_N / .Lfc_decBx_N) and forwarded to SPAN_BODY.
 | Registers (per the file header): d2.w = bx, d3.w = by, d4.w = err.
 | Each iteration draws four spans -- two rows for the (cx +/- bx)
 | pair and two rows for the (cx +/- by) pair -- then steps the
 | Bresenham state. Every expansion exits to the shared .Lfc_done.
                .macro  CO_BODY  color
 .Lfc_loop_\color:
                | Terminate once bx < by (unsigned compare of d2 against d3).
                cmp.w   %d3,%d2
                bcs.w   .Lfc_done
                | --- Pair A: x range = (cx - bx, cx + bx)
                | COMPUTE_PAIR_MASKS takes left in d0 and right in d1.
                move.w  SP_FC_CX(%sp),%d0
                move.w  %d0,%d1
                sub.w   %d2,%d0
                add.w   %d2,%d1
                COMPUTE_PAIR_MASKS
                | Span A1: y = cy + by
                move.w  SP_FC_CY(%sp),%d0
                add.w   %d3,%d0
                SPAN_BODY  \color
                | Span A2: y = cy - by
                move.w  SP_FC_CY(%sp),%d0
                sub.w   %d3,%d0
                SPAN_BODY  \color
                | --- Pair B: x range = (cx - by, cx + by)
                move.w  SP_FC_CX(%sp),%d0
                move.w  %d0,%d1
                sub.w   %d3,%d0
                add.w   %d3,%d1
                COMPUTE_PAIR_MASKS
                | Span B1: y = cy + bx
                move.w  SP_FC_CY(%sp),%d0
                add.w   %d2,%d0
                SPAN_BODY  \color
                | Span B2: y = cy - bx
                move.w  SP_FC_CY(%sp),%d0
                sub.w   %d2,%d0
                SPAN_BODY  \color
                | --- Bresenham step
                | by always advances; bx only shrinks when err > 0.
                addq.w  #1,%d3
                tst.w   %d4
                bgt.s   .Lfc_decBx_\color
                | err <= 0: err += 2*by + 1 (by already incremented).
                add.w   %d3,%d4
                add.w   %d3,%d4
                addq.w  #1,%d4
                bra.w   .Lfc_loop_\color
 .Lfc_decBx_\color:
                | err > 0: bx -= 1, then err += 2*by - 2*bx + 1.
                subq.w  #1,%d2
                add.w   %d3,%d4
                add.w   %d3,%d4
                sub.w   %d2,%d4
                sub.w   %d2,%d4
                addq.w  #1,%d4
                bra.w   .Lfc_loop_\color
                .endm
                .globl  _surface68kStFillCircle
 _surface68kStFillCircle:
@ -142,10 +214,11 @@ _surface68kStFillCircle:
                moveq   #0,%d7
                move.b  SP_FC_COLOR(%sp),%d7
-                | LUT bases (PC-relative indexed has only 8-bit
+                | LUT bases. a5/a6 = mask LUTs (used by COMPUTE_PAIR_MASKS).
-                | displacement, so cache full pointers in a-regs).
+                | a2 = shared _gStRowOffsetLut (used by SPAN_BODY for y*160).
                lea     leftMaskLut(%pc),%a5
                lea     rightMaskLut(%pc),%a6
                lea     _gStRowOffsetLut,%a2
                | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
                moveq   #0,%d5
@ -174,60 +247,50 @@ _surface68kStFillCircle:
                moveq   #1,%d4
                sub.w   %d2,%d4
-.Lfc_loop:
+                | Dispatch on color (low 4 bits) -> 16 specialized loops.
-                cmp.w   %d3,%d2
+                | Use a4 (gets overwritten in SPAN_BODY's first lea) as
-                bcs.w   .Lfc_done
+                | dispatch scratch since a2 now holds yLut for the body.
                and.w   #0x0F,%d7
                move.w  %d7,%d0
                add.w   %d0,%d0
                add.w   %d0,%d0                | * 4 for bra.w table
                lea     .Lfc_table(%pc),%a4
                jmp     0(%a4,%d0.w)
-                | --- Pair A: x range = (cx - bx, cx + bx)
+.Lfc_table:
-                move.w  SP_FC_CX(%sp),%d0
+                bra.w   .Lfc_loop_0
-                move.w  %d0,%d1
+                bra.w   .Lfc_loop_1
-                sub.w   %d2,%d0                | left  = cx - bx
+                bra.w   .Lfc_loop_2
-                add.w   %d2,%d1                | right = cx + bx
+                bra.w   .Lfc_loop_3
-                COMPUTE_PAIR_MASKS
+                bra.w   .Lfc_loop_4
                bra.w   .Lfc_loop_5
                bra.w   .Lfc_loop_6
                bra.w   .Lfc_loop_7
                bra.w   .Lfc_loop_8
                bra.w   .Lfc_loop_9
                bra.w   .Lfc_loop_10
                bra.w   .Lfc_loop_11
                bra.w   .Lfc_loop_12
                bra.w   .Lfc_loop_13
                bra.w   .Lfc_loop_14
                bra.w   .Lfc_loop_15
-                | Span A1: y = cy + by
+                CO_BODY  0
-                move.w  SP_FC_CY(%sp),%d0
+                CO_BODY  1
-                add.w   %d3,%d0
+                CO_BODY  2
-                SPAN_BODY
+                CO_BODY  3
-
+                CO_BODY  4
-                | Span A2: y = cy - by
+                CO_BODY  5
-                move.w  SP_FC_CY(%sp),%d0
+                CO_BODY  6
-                sub.w   %d3,%d0
+                CO_BODY  7
-                SPAN_BODY
+                CO_BODY  8
-
+                CO_BODY  9
-                | --- Pair B: x range = (cx - by, cx + by)
+                CO_BODY  10
-                move.w  SP_FC_CX(%sp),%d0
+                CO_BODY  11
-                move.w  %d0,%d1
+                CO_BODY  12
-                sub.w   %d3,%d0                | left  = cx - by
+                CO_BODY  13
-                add.w   %d3,%d1                | right = cx + by
+                CO_BODY  14
-                COMPUTE_PAIR_MASKS
+                CO_BODY  15
                | Span B1: y = cy + bx
                move.w  SP_FC_CY(%sp),%d0
                add.w   %d2,%d0
                SPAN_BODY
                | Span B2: y = cy - bx
                move.w  SP_FC_CY(%sp),%d0
                sub.w   %d2,%d0
                SPAN_BODY
                | --- Bresenham step
                addq.w  #1,%d3
                tst.w   %d4
                bgt.s   .Lfc_decBx
                add.w   %d3,%d4
                add.w   %d3,%d4
                addq.w  #1,%d4
                bra.w   .Lfc_loop
 .Lfc_decBx:
                subq.w  #1,%d2
                add.w   %d3,%d4
                add.w   %d3,%d4
                sub.w   %d2,%d4
                sub.w   %d2,%d4
                addq.w  #1,%d4
                bra.w   .Lfc_loop
 .Lfc_done:
@ -236,46 +299,6 @@ _surface68kStFillCircle:
                rts
 | ---- Apply 4-plane mask at (a4) -------------------------------
 | Input:  d0.w = mask, d7.b = color, a4 = group ptr
 | Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
 | Trashes: d0, d1
 | Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
 | RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
 | For each plane N (0..3): if bit N of the color is set, the masked
 | bits are OR'd into that plane word; otherwise they are cleared by
 | AND-ing with the inverted mask. Bits outside the mask are untouched.
 .Lfc_applyMask:
                move.w  %d0,%d1
                not.w   %d1                    | d1 = notMask
                | Plane 0 <- color bit 0.
                btst    #0,%d7
                beq.s   .Lfc_am0a
                or.w    %d0,(%a4)+
                bra.s   .Lfc_am1
 .Lfc_am0a:
                and.w   %d1,(%a4)+
 .Lfc_am1:
                | Plane 1 <- color bit 1.
                btst    #1,%d7
                beq.s   .Lfc_am1a
                or.w    %d0,(%a4)+
                bra.s   .Lfc_am2
 .Lfc_am1a:
                and.w   %d1,(%a4)+
 .Lfc_am2:
                | Plane 2 <- color bit 2.
                btst    #2,%d7
                beq.s   .Lfc_am2a
                or.w    %d0,(%a4)+
                bra.s   .Lfc_am3
 .Lfc_am2a:
                and.w   %d1,(%a4)+
 .Lfc_am3:
                | Plane 3 <- color bit 3. Both arms return directly
                | instead of branching to a shared exit.
                btst    #3,%d7
                beq.s   .Lfc_am3a
                or.w    %d0,(%a4)+
                rts
 .Lfc_am3a:
                and.w   %d1,(%a4)+
                rts
                .align  2
 | leftMaskLut[i]  = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
 leftMaskLut:
--- a/src/port/atarist/hal.c
+++ b/src/port/atarist/hal.c
@ -2,7 +2,7 @@
 //
 // M2 scope:
 //   * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
-//   * Chunky 4bpp to word-interleaved ST planar c2p at present time.
+//   * Word-interleaved ST planar buffer copied to the screen at present.
 //
 // M2.5 scope (per-band palette / SCB emulation):
 //   * halPresent scans the SurfaceT's SCB array and builds a compact
@ -136,17 +136,9 @@ static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPl
 }
 static uint16_t quantizeColorToSt(uint16_t orgb);
 static void     flattenScbPalettes(const SurfaceT *src);
 static void     initC2pLut(void);
 static void     writeDiagnostics(void);
 static long     writePrevPaletteRegs(void);
 // Provided by src/port/atarist/c2p.s.
 extern void chunkyToPlanarRowSt(const uint8_t *src,
                                uint16_t *dst,
                                uint16_t groupStart,
                                uint16_t groupEnd,
                                const uint8_t *lut);
 static __attribute__((interrupt_handler)) void timerBIsr(void);
 static __attribute__((interrupt_handler)) void vblIsr(void);
 static void                                    buildTransitions(const SurfaceT *src);
@ -201,72 +193,11 @@ static void (*gOldTimerBVec)(void) = NULL;
 // SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
 // (and every frame of the keys demo after the initial paint) SCB and
 // palette never change, so caching and skipping those passes keeps
-// rect presents down to just the c2p work.
+// rect presents down to just the screen blit.
 static uint8_t  gCachedScb    [SURFACE_HEIGHT];
 static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
 static bool     gCacheValid = false;
 // 256-long plane-spread LUT for the asm sprite SAVE path (defined in
 // spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each
 // of b's 8 bits is placed at the bit-0 position of the corresponding
 // pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT
 // entry left by N to get plane N's contribution; OR'd across 4 planes
 // gives the full chunky long. Initialized lazily.
 //
 // LUT used by surface68kStSpriteSaveByteAligned. The asm reads via
 // `move.l (a_ptr, d0.l), d4` which requires the LUT to be long-
 // aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse,
 // the cascading offsets from the odd-sized gC2pLut put even
 // `uint32_t` BSS slots at addr mod 4 == 2.
 //
 // Fix: malloc the LUT. mintlib's malloc returns long-aligned memory.
 // The pointer is passed to the asm via the C-side wrapper (so the
 // asm reads it from the stack, where it's guaranteed long-aligned
 // regardless of where the static pointer slot lives).
 static uint32_t *gStPlaneSpreadLutPtr = NULL;
 static bool      gStPlaneSpreadLutReady = false;
 // Lazily allocate and fill the 256-entry plane-spread table.
 // Entry [b] has one bit set per set bit of b: plane bit i (counted
 // from the MSB, i = 0..7) lands on bit 0 of chunky nibble i, i.e.
 // long bit 28 - 4*i. Returns true when the table is usable (already
 // built or built now), false if the allocation failed.
 static bool initStPlaneSpreadLut(void) {
    uint32_t *table;
    uint32_t  spread;
    unsigned  value;
    unsigned  pixel;
    if (gStPlaneSpreadLutReady) {
        return true;
    }
    table = (uint32_t *)malloc(256 * sizeof(uint32_t));
    gStPlaneSpreadLutPtr = table;
    if (table == NULL) {
        return false;
    }
    for (value = 0u; value < 256u; value++) {
        spread = 0u;
        for (pixel = 0u; pixel < 8u; pixel++) {
            // Pixel 0 is the leftmost (MSB of the plane byte) and maps
            // to the top nibble of the long; each later pixel sits one
            // nibble lower.
            if ((value & (0x80u >> pixel)) != 0u) {
                spread |= (uint32_t)1u << (28u - 4u * pixel);
            }
        }
        table[value] = spread;
    }
    gStPlaneSpreadLutReady = true;
    return true;
 }
 // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
 // (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
 // = the 2-bit plane-byte contribution for source byte `src` at
 // byte-position `pos` (0..3 within a 4-byte chunk) going to plane
 // `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
 // the same table feeds both halves of an ST plane word: positions
 // 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
 // byte. Built once by initC2pLut on the first halPresent call.
 /* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */
 uint8_t         gC2pLut[4 * 1024];
 static bool     gC2pLutReady = false;
 // ----- Internal helpers (alphabetical) -----
 // Scan the surface's SCB and record one transition entry for each
@ -350,37 +281,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
 }
 // Populate gC2pLut (4 KB) for chunkyToPlanarRowSt; no-op once built.
 // Entry [src*16 + pos*4 + plane] holds two bits of plane `plane`
 // taken from packed byte `src`: the high-nibble (left) pixel's bit at
 // position 7-2*pos and the low-nibble (right) pixel's bit at 6-2*pos.
 // See src/port/atarist/c2p.s for how the routine addresses the table.
 static void initC2pLut(void) {
    uint16_t value;
    uint16_t slot;
    uint16_t column;
    uint16_t bitPlane;
    uint8_t  leftBit;
    uint8_t  rightBit;
    if (gC2pLutReady) {
        return;
    }
    for (value = 0; value < 256; value++) {
        // The 16 (pos, plane) entries of one source byte are contiguous,
        // so walk them as a single slot index: pos = slot / 4,
        // plane = slot % 4.
        for (slot = 0; slot < 16; slot++) {
            column   = (uint16_t)(slot >> 2);
            bitPlane = (uint16_t)(slot & 3u);
            leftBit  = (uint8_t)((value >> (4 + bitPlane)) & 1);
            rightBit = (uint8_t)((value >> bitPlane) & 1);
            gC2pLut[value * 16 + slot] =
                (uint8_t)((leftBit << (7 - 2 * column)) | (rightBit << (6 - 2 * column)));
        }
    }
    gC2pLutReady = true;
 }
 // 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
 // each 4-bit channel).
 static uint16_t quantizeColorToSt(uint16_t orgb) {
@ -619,11 +519,8 @@ void halPresent(const SurfaceT *src) {
    }
    refreshPaletteStateIfNeeded(src);
-    // Phase 9: planar shadow -> screen RAM. Same dirty-word band
+    // Planar buffer -> screen RAM. Each dirty word covers 4 pixels
-    // tracking the c2p path used; just memcpy the planar bytes for
+    // (a quarter of an 8-byte group). Round to whole groups for a
    // each band instead of running c2p on the chunky shadow. Each
    // dirty word covers 4 pixels = 1/4 of one group = quarter of an
    // 8-byte group. We round to whole groups (8 bytes each) for a
    // simple aligned memcpy, since planar groups are the natural
    // copy unit.
    for (y = 0; y < SURFACE_HEIGHT; y++) {
@ -720,8 +617,11 @@ extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint
 extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color);
 extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color);
 extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong);
-extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut);
+extern void surface68kStTileFill8x8(uint8_t *firstGroupPtr, uint16_t mask, uint8_t color);
-extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const uint8_t *c2pLut);
+extern void surface68kStSprite16x16Save(uint8_t *base, uint16_t x, uint16_t y, uint8_t *dstBuf);
 extern void surface68kStSprite16x16Restore(uint8_t *base, uint16_t x, uint16_t y, const uint8_t *srcBuf);
 extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
 extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
 // Phase 9: clear the entire planar buffer to a 4-bit color. Build an
@ -1262,17 +1162,12 @@ void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex)
    group    = (uint16_t)((uint16_t)bx >> 1);
    halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu;
    gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP;
-    surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex);
+    /* Phase 10 final: specialized 8x8 unrolled tile-fill skips the
     * generic FRG_LOOP's per-row subq+bne overhead. */
    surface68kStTileFill8x8(gp, halfMask, colorIndex);
 }
 // Phase 10: group-aware tile paste. Per row: extract 8 pixels from
 // 4 chunky bytes, build 4 plane bytes (one per plane), drop them
 // into the high or low half of the 4 plane words at this group --
 // 4 word RMWs per row instead of 64 per-pixel calls.
 static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
 // Phase 10: tile paste/snap reuse the asm sprite save/restore
 // helpers -- identical per-row work patterns at byte-aligned
 // positions. Width 8 = single tile column = single half-group
@ -1301,14 +1196,25 @@ void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *ti
            + (uint16_t)by * 8u * ST_BYTES_PER_ROW
            + group * ST_BYTES_PER_GROUP
            + (uint16_t)(bx & 1u);
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
+    (void)row;
-        dstAddr[0] = tileBytes[0];
+#define ST_TILE_PASTE_ROW                                                   \
-        dstAddr[2] = tileBytes[1];
+    do {                                                                    \
-        dstAddr[4] = tileBytes[2];
+        dstAddr[0] = tileBytes[0];                                          \
-        dstAddr[6] = tileBytes[3];
+        dstAddr[2] = tileBytes[1];                                          \
-        dstAddr   += ST_BYTES_PER_ROW;
+        dstAddr[4] = tileBytes[2];                                          \
-        tileBytes += TILE_BYTES_PER_ROW;
+        dstAddr[6] = tileBytes[3];                                          \
-    }
+        dstAddr   += ST_BYTES_PER_ROW;                                      \
        tileBytes += TILE_BYTES_PER_ROW;                                    \
    } while (0)
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
    ST_TILE_PASTE_ROW;
 #undef ST_TILE_PASTE_ROW
 }
@ -1331,136 +1237,25 @@ void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *til
            + (uint16_t)by * 8u * ST_BYTES_PER_ROW
            + group * ST_BYTES_PER_GROUP
            + (uint16_t)(bx & 1u);
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
+    (void)row;
-        tileOut[0] = srcAddr[0];
+#define ST_TILE_SNAP_ROW                                                    \
-        tileOut[1] = srcAddr[2];
+    do {                                                                    \
-        tileOut[2] = srcAddr[4];
+        tileOut[0] = srcAddr[0];                                            \
-        tileOut[3] = srcAddr[6];
+        tileOut[1] = srcAddr[2];                                            \
-        srcAddr   += ST_BYTES_PER_ROW;
+        tileOut[2] = srcAddr[4];                                            \
-        tileOut   += TILE_BYTES_PER_ROW;
+        tileOut[3] = srcAddr[6];                                            \
-    }
+        srcAddr   += ST_BYTES_PER_ROW;                                      \
-}
+        tileOut   += TILE_BYTES_PER_ROW;                                    \
-
+    } while (0)
-
+    ST_TILE_SNAP_ROW;
-/* Slow-path C versions kept (renamed) for reference; not in the
+    ST_TILE_SNAP_ROW;
- * active call chain. */
+    ST_TILE_SNAP_ROW;
-static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
+    ST_TILE_SNAP_ROW;
-    StPlanarT *pd;
+    ST_TILE_SNAP_ROW;
-    uint16_t   group;
+    ST_TILE_SNAP_ROW;
-    uint16_t   halfMask;
+    ST_TILE_SNAP_ROW;
-    uint16_t   notHalfMask;
+    ST_TILE_SNAP_ROW;
-    bool       isHigh;
+#undef ST_TILE_SNAP_ROW
    uint8_t   *rowBase;
    int16_t    row;
    int16_t    pix;
    uint16_t  *pw;
    uint8_t    b;
    uint8_t    color;
    uint8_t    pb0;
    uint8_t    pb1;
    uint8_t    pb2;
    uint8_t    pb3;
    uint8_t    bit;
    if (dst == NULL || chunkyTile == NULL) {
        return;
    }
    pd = (StPlanarT *)dst->portData;
    if (pd == NULL) {
        return;
    }
    group       = (uint16_t)((uint16_t)bx >> 1);
    isHigh      = ((bx & 1u) == 0u);
    halfMask    = isHigh ? 0xFF00u : 0x00FFu;
    notHalfMask = (uint16_t)~halfMask;
    rowBase = pd->base
            + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
            + group * ST_BYTES_PER_GROUP;
    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
        pb0 = pb1 = pb2 = pb3 = 0u;
        for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) {
            b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)];
            color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4);
            bit = kStTileBitLut[pix];
            if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); }
            if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); }
            if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); }
            if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); }
        }
        pw = (uint16_t *)rowBase;
        if (isHigh) {
            pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8));
            pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8));
            pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8));
            pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8));
        } else {
            pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0);
            pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1);
            pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2);
            pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3);
        }
        rowBase += ST_BYTES_PER_ROW;
    }
 }
 // Reference C implementation of the group-aware tile snap. It has been
 // superseded by the asm-routed halTileSnapPlanes above and is retained
 // only as a C-only fallback. For each tile row it reads the four plane
 // words of the group holding the tile, takes the high byte (even bx)
 // or low byte (odd bx) of each word, and repacks those plane bits into
 // chunky bytes: two pixels per byte, the pixel from the more
 // significant plane bit in the high nibble. Returns without writing
 // anything when src, chunkyTileOut or src->portData is NULL.
 static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
    const StPlanarT *planar;
    const uint8_t   *line;
    const uint16_t  *words;
    uint8_t         *out;
    uint8_t          plane[4];
    uint8_t          leftBit;
    uint8_t          rightBit;
    uint8_t          left;
    uint8_t          right;
    uint16_t         shift;
    int16_t          y;
    int16_t          n;
    int16_t          p;
    if (src == NULL || chunkyTileOut == NULL) {
        return;
    }
    planar = (const StPlanarT *)src->portData;
    if (planar == NULL) {
        return;
    }
    // Even tile columns sit in the high byte of each plane word.
    shift = (bx & 1u) ? 0u : 8u;
    line  = planar->base
          + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
          + (uint16_t)((uint16_t)bx >> 1) * ST_BYTES_PER_GROUP;
    // Output bytes are produced strictly in order, so walk a pointer.
    out = chunkyTileOut;
    for (y = 0; y < TILE_PIXELS_PER_SIDE; y++, line += ST_BYTES_PER_ROW) {
        words = (const uint16_t *)line;
        for (p = 0; p < 4; p++) {
            plane[p] = (uint8_t)(words[p] >> shift);
        }
        for (n = 0; n < TILE_BYTES_PER_ROW; n++) {
            leftBit  = kStTileBitLut[n * 2];
            rightBit = kStTileBitLut[n * 2 + 1];
            left     = 0u;
            right    = 0u;
            // Plane p contributes bit p of each pixel's color index.
            for (p = 0; p < 4; p++) {
                if (plane[p] & leftBit) {
                    left = (uint8_t)(left | (1u << p));
                }
                if (plane[p] & rightBit) {
                    right = (uint8_t)(right | (1u << p));
                }
            }
            *out++ = (uint8_t)((left << 4) | right);
        }
    }
 }
@ -1496,14 +1291,28 @@ void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const Surfac
            + (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW
            + dstGroup * ST_BYTES_PER_GROUP
            + (uint16_t)(dstBx & 1u);
-    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
+    /* gcc-mint -O2 does NOT unroll the 8-iter byte-copy loop,
-        dstAddr[0] = srcAddr[0];   /* plane 0 byte (high or low half) */
+     * leaving cmpl + bnes loop overhead per row. Manual unroll
-        dstAddr[2] = srcAddr[2];   /* plane 1 */
+     * drops ~150 cyc/call. (void)row keeps the unused decl quiet. */
-        dstAddr[4] = srcAddr[4];   /* plane 2 */
+    (void)row;
-        dstAddr[6] = srcAddr[6];   /* plane 3 */
+#define ST_TILE_COPY_ROW                                                    \
-        srcAddr += ST_BYTES_PER_ROW;
+    do {                                                                    \
-        dstAddr += ST_BYTES_PER_ROW;
+        dstAddr[0] = srcAddr[0];                                            \
-    }
+        dstAddr[2] = srcAddr[2];                                            \
        dstAddr[4] = srcAddr[4];                                            \
        dstAddr[6] = srcAddr[6];                                            \
        srcAddr += ST_BYTES_PER_ROW;                                        \
        dstAddr += ST_BYTES_PER_ROW;                                        \
    } while (0)
    ST_TILE_COPY_ROW;          /* row 0 */
    ST_TILE_COPY_ROW;          /* row 1 */
    ST_TILE_COPY_ROW;          /* row 2 */
    ST_TILE_COPY_ROW;          /* row 3 */
    ST_TILE_COPY_ROW;          /* row 4 */
    ST_TILE_COPY_ROW;          /* row 5 */
    ST_TILE_COPY_ROW;          /* row 6 */
    ST_TILE_COPY_ROW;          /* row 7 */
 #undef ST_TILE_COPY_ROW
 }
@ -1792,109 +1601,6 @@ void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBy
 }
 // Phase 10 fast paths for save/restore. Hand-rolled asm
 // (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <->
 // plane bit transpose via ASL+ROXL and walks rows/tile columns. The
 // C wrappers below are kept as a fallback / reference; they're not
 // in the critical path now that the asm versions are wired in.
 // C reference for the byte-aligned sprite save. For every 8-pixel
 // column of every row it reads the four plane words of the column's
 // group from pd->base, selects the high or low byte according to
 // bit 3 of the pixel x, and writes 4 chunky bytes (two pixels each,
 // more significant plane bit in the high nibble) to dstChunkyBytes.
 // Output rows are w >> 1 bytes apart. No clipping or NULL checks are
 // performed here.
 static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) {
    const int16_t  outStride = (int16_t)(w >> 1);
    const int16_t  colCount  = (int16_t)(w >> 3);
    const uint8_t *line      = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
    int16_t        r;
    int16_t        c;
    for (r = 0; r < (int16_t)h; r++, line += ST_BYTES_PER_ROW) {
        uint8_t *out = &dstChunkyBytes[(uint16_t)r * (uint16_t)outStride];
        for (c = 0; c < colCount; c++) {
            int16_t         px    = (int16_t)(x + c * 8);
            uint16_t        shift = (px & 8) ? 0u : 8u;
            const uint16_t *words = (const uint16_t *)(line + (uint16_t)((uint16_t)px >> 4) * ST_BYTES_PER_GROUP);
            uint8_t         plane[4];
            int16_t         n;
            int16_t         p;
            for (p = 0; p < 4; p++) {
                plane[p] = (uint8_t)(words[p] >> shift);
            }
            for (n = 0; n < 4; n++) {
                uint8_t leftBit  = (uint8_t)(0x80u >> (n * 2));
                uint8_t rightBit = (uint8_t)(0x80u >> (n * 2 + 1));
                uint8_t left     = 0u;
                uint8_t right    = 0u;
                // Plane p supplies bit p of each pixel's color index.
                for (p = 0; p < 4; p++) {
                    if (plane[p] & leftBit) {
                        left = (uint8_t)(left | (1u << p));
                    }
                    if (plane[p] & rightBit) {
                        right = (uint8_t)(right | (1u << p));
                    }
                }
                out[c * 4 + n] = (uint8_t)((left << 4) | right);
            }
        }
    }
 }
 // C reference for the byte-aligned sprite restore: the inverse of
 // stSpriteSaveByteAligned. Reads chunky 4bpp bytes from srcChunkyBytes
 // and rewrites the matching plane bytes in pd->base.
 //
 //   pd              planar surface; pd->base is dereferenced unchecked
 //   x, y            top-left pixel. Bit 3 of each column's x selects
 //                   the high or low byte of the plane words, so x is
 //                   treated as a multiple of 8
 //   w, h            size in pixels: w >> 3 tile columns by h rows
 //   srcChunkyBytes  packed source rows, w >> 1 bytes apart
 //
 // No clipping or NULL checks are performed here.
 static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) {
    int16_t   bytesPerRow = (int16_t)(w >> 1);
    int16_t   tileCols    = (int16_t)(w >> 3);
    uint8_t  *rowBase     = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
    int16_t   row;
    int16_t   tileCol;
    for (row = 0; row < (int16_t)h; row++) {
        const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
        for (tileCol = 0; tileCol < tileCols; tileCol++) {
            const uint8_t *src   = &srcRow[tileCol * 4];
            uint8_t        pb[4] = { 0u, 0u, 0u, 0u };
            uint8_t        bit   = 0x80u;
            int16_t        dstX  = (int16_t)(x + tileCol * 8);
            uint16_t       group = (uint16_t)((uint16_t)dstX >> 4);
            uint16_t      *pw    = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
            int16_t        i;
            int16_t        p;
            // Transpose 4 chunky bytes (8 pixels) into one byte per
            // plane. Pixel k lands on bit (0x80 >> k), and bit p of its
            // color index goes to plane p.
            for (i = 0; i < 4; i++) {
                uint8_t left  = (uint8_t)(src[i] >> 4);
                uint8_t right = (uint8_t)(src[i] & 0x0Fu);
                for (p = 0; p < 4; p++) {
                    if (left & (1u << p)) {
                        pb[p] = (uint8_t)(pb[p] | bit);
                    }
                    if (right & (1u << p)) {
                        pb[p] = (uint8_t)(pb[p] | (bit >> 1));
                    }
                }
                bit = (uint8_t)(bit >> 2);
            }
            // Replace only this tile column's half of each plane word
            // and keep the neighbouring column's byte intact.
            if ((dstX & 8) == 0) {
                for (p = 0; p < 4; p++) {
                    pw[p] = (uint16_t)((pw[p] & 0x00FFu) | ((uint16_t)pb[p] << 8));
                }
            } else {
                for (p = 0; p < 4; p++) {
                    pw[p] = (uint16_t)((pw[p] & 0xFF00u) | (uint16_t)pb[p]);
                }
            }
        }
        rowBase += ST_BYTES_PER_ROW;
    }
 }
 // Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies
 // inline. Each pixel's group address differs only in (x), so we
 // can compute base+row*160 once per row and just do per-pixel
@ -1916,11 +1622,16 @@ void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t
        return;
    }
    /* Phase 10.5 fast path: byte-aligned, fully on-surface.
-     * Asm walker does direct planar byte copy (LUT pointer unused). */
+     * Specialized 16x16 (the UBER ball-sprite size) skips the asm
     * walker's per-row col-init + col-loop-check overhead. */
    if ((x & 7) == 0 && (w & 7) == 0
            && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
            && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
-        surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL);
+        if (w == 16u && h == 16u) {
            surface68kStSprite16x16Save(pd->base, (uint16_t)x, (uint16_t)y, dstPlaneBytes);
        } else {
            surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes);
        }
        return;
    }
@ -1980,11 +1691,15 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
        return;
    }
    /* Phase 10.5 fast path: byte-aligned, fully on-surface.
-     * Asm walker does direct planar byte copy (LUT pointer unused). */
+     * Specialized 16x16 (UBER ball-sprite) skips walker overhead. */
    if ((x & 7) == 0 && (w & 7) == 0
            && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
            && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
-        surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL);
+        if (w == 16u && h == 16u) {
            surface68kStSprite16x16Restore(pd->base, (uint16_t)x, (uint16_t)y, srcPlaneBytes);
        } else {
            surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes);
        }
        return;
    }
@ -2049,10 +1764,11 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
 }
-// Phase 9: derive 160 chunky bytes per row from the word-interleaved
+// Derive 160 chunky bytes per row from the word-interleaved planar
-// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's
+// buffer (20 groups x 4 plane words). Same shape as the Amiga's
 // amigaPlanesToChunkyRow but per-group instead of per-byte. Used by
-// halSurfaceHash and halSurfaceSaveFileChunky.
+// halSurfaceHash to fold the planar surface into the same byte stream
 // the chunky ports hash, so cross-port hash comparisons stay valid.
 static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) {
    uint16_t        group;
    uint16_t        p;
@ -2134,58 +1850,27 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
-// Phase 9: read chunky from file into a temporary scratch buffer,
+// On-disk format is the ST's native interleaved planar buffer; one
-// then c2p once into the planar shadow. The .joeysurface file format
+// fread fills it directly, no chunky scratch or c2p step.
-// is still chunky 4bpp on disk (cross-port asset interchange); the
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
 // in-memory representation is what changes.
 bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    StPlanarT *pd;
    uint8_t   *scratch;
    int16_t    y;
    bool       ok;
    pd = (StPlanarT *)dst->portData;
    if (pd == NULL) {
        return false;
    }
-    scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
+    return fread(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
    if (scratch == NULL) {
        return false;
    }
    ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
    if (ok) {
        if (!gC2pLutReady) {
            initC2pLut();
        }
        for (y = 0; y < SURFACE_HEIGHT; y++) {
            const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
            uint16_t      *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW];
            chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut);
        }
    }
    free(scratch);
    return ok;
 }
-// Phase 9: derive chunky bytes from the planar shadow row by row,
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
 // stream to file. Avoids needing a full 32 KB scratch buffer.
 bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    StPlanarT *pd;
    uint8_t    chunkyRow[SURFACE_BYTES_PER_ROW];
    int16_t    y;
    pd = (StPlanarT *)src->portData;
    if (pd == NULL) {
        return false;
    }
-    for (y = 0; y < SURFACE_HEIGHT; y++) {
+    return fwrite(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
        stPlanarToChunkyRow(pd, y, chunkyRow);
        if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
            return false;
        }
    }
    return true;
 }
--- a/src/port/atarist/lineSpan.s
+++ b/src/port/atarist/lineSpan.s
@ -50,19 +50,17 @@
 | Trashes: d0, d1, a2
                .macro  DL_PLOT  color
-                | byteOff = y*160 + (x>>4)*8
+                | byteOff = y*160 + (x>>4)*8. This fits in a signed 16-bit
                | index because the surface is 32000 bytes (< 32768), so the
                | ext.l, the .l add and the .l indexed lea can be skipped;
                | the all-word-sized sequence saves 14 cyc/pixel.
                move.w  %d3,%d0
-                ext.l   %d0
+                add.w   %d0,%d0                | y * 2 (word index)
-                move.l  %d0,%d1
+                move.w  (%a6,%d0.w),%d0        | d0 = y * 160
                lsl.l   #5,%d0                 | y << 5
                lsl.l   #7,%d1                 | y << 7
                add.l   %d1,%d0                | d0 = y * 160
                move.w  %d2,%d1
                lsr.w   #4,%d1
                lsl.w   #3,%d1                 | (x>>4) * 8
-                ext.l   %d1
+                add.w   %d1,%d0                | d0 = byteOff (fits in 16 bits)
-                add.l   %d1,%d0                | d0 = byteOff
+                lea     0(%a3,%d0.w),%a2       | a2 = base + byteOff
                lea     0(%a3,%d0.l),%a2       | a2 = base + byteOff
                | d1 = bitMask, d0 = notMask
                move.w  %d2,%d1
                and.w   #15,%d1
@ -127,9 +125,11 @@ _surface68kStDrawLine:
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                lea     -SP_LOCAL(%sp),%sp
-                | Load base & lut.
+                | Load base & luts.
                move.l  SP_BASE(%sp),%a3
                lea     bitMaskWordLut(%pc),%a5
                | a6 = yLut base (yp -> yp*160) for use in DL_PLOT.
                lea     _gStRowOffsetLut(%pc),%a6
                | x = x0, y = y0
                move.w  SP_X0(%sp),%d2
@ -179,8 +179,8 @@ _surface68kStDrawLine:
                and.w   #0x0F,%d0
                add.w   %d0,%d0
                add.w   %d0,%d0                | * 4 for bra.w table
-                lea     .LdlStTable(%pc),%a6
+                lea     .LdlStTable(%pc),%a2   | a2 scratch (a6 holds yLut)
-                jmp     0(%a6,%d0.w)
+                jmp     0(%a2,%d0.w)
 .LdlStTable:
                bra.w   .LdlStLoop_0
@ -529,6 +529,129 @@ _surface68kStFillRectSingleGroup:
                rts
 | ---- surface68kStTileFill8x8 ---------------------------------------
 |
 | Specialized 8x8 single-group fill: 16-way color dispatch + 8 rows
 | fully unrolled. Drops the per-row subq+bne overhead that the
 | generic FRG_LOOP pays. Used by halTileFillPlanes.
 |
 |   void surface68kStTileFill8x8(uint8_t *firstGroupPtr,
 |                                uint16_t mask,
 |                                uint8_t color);
 |
 | Per row body: 4 plane RMW with postinc + lea 152(a3),a3 to next
 | row. Row 7 skips the trailing lea (a3 not used after).
                .equ    SP_TF_SAVED, 16        | d3-d4/a2-a3 = 4 longs
                .equ    SP_TF_OFF,         (SP_TF_SAVED + 4)
                .equ    SP_TF_PTR,    SP_TF_OFF + 0
                .equ    SP_TF_MASK,   SP_TF_OFF + 4 + 2
                .equ    SP_TF_COLOR,  SP_TF_OFF + 8 + 3
                | TF8_ROW_BARE color: one row of the fill. For each of the
                | four plane words at (a3), in order, bit N of \color picks
                | the instruction at assembly time: set -> or.w d3 (set the
                | masked bits), clear -> and.w d4 (clear them). The four
                | post-increments advance a3 by 8 bytes.
                .macro  TF8_ROW_BARE  color
                .if  ((\color) & 1)
                or.w    %d3,(%a3)+
                .else
                and.w   %d4,(%a3)+
                .endif
                .if  ((\color) & 2)
                or.w    %d3,(%a3)+
                .else
                and.w   %d4,(%a3)+
                .endif
                .if  ((\color) & 4)
                or.w    %d3,(%a3)+
                .else
                and.w   %d4,(%a3)+
                .endif
                .if  ((\color) & 8)
                or.w    %d3,(%a3)+
                .else
                and.w   %d4,(%a3)+
                .endif
                .endm
                | TF8_ROW color: TF8_ROW_BARE followed by a 152-byte step,
                | so a3 moves 8 + 152 = 160 bytes in total and ends up on
                | the same group one row down.
                .macro  TF8_ROW  color
                TF8_ROW_BARE  \color
                lea     152(%a3),%a3
                .endm
                | TF8_BODY color: the complete 8-row fill for one color,
                | labelled .Ltf8_body_<color> as a dispatch target. It ends
                | with a branch to the shared .Ltf8_done exit.
                .macro  TF8_BODY  color
 .Ltf8_body_\color:
                TF8_ROW       \color           | row 0
                TF8_ROW       \color           | row 1
                TF8_ROW       \color           | row 2
                TF8_ROW       \color           | row 3
                TF8_ROW       \color           | row 4
                TF8_ROW       \color           | row 5
                TF8_ROW       \color           | row 6
                TF8_ROW_BARE  \color           | row 7 (no trailing lea)
                bra.w         .Ltf8_done
                .endm
                | Entry point. Register use inside the routine:
                |   a3 = running pointer into the plane words (starts at
                |        firstGroupPtr)
                |   d3 = mask (bits to set), d4 = ~mask (bits to keep)
                |   d0 = (color & 0x0F) * 4, byte offset into .Ltf8_table
                |   a2 = scratch, base of .Ltf8_table
                | d3-d4/a2-a3 are saved on entry and restored at .Ltf8_done.
                .globl  _surface68kStTileFill8x8
 _surface68kStTileFill8x8:
                movem.l %d3-%d4/%a2-%a3,-(%sp)
                | Stack args sit above the 16 saved bytes + return address.
                | The +2 / +3 in SP_TF_MASK / SP_TF_COLOR select the low
                | word / low byte of a 4-byte argument slot.
                move.l  SP_TF_PTR(%sp),%a3
                move.w  SP_TF_MASK(%sp),%d3
                move.w  %d3,%d4
                not.w   %d4
                | Color dispatch: only the low nibble of color is used;
                | each table entry is a 4-byte bra.w, hence the * 4.
                moveq   #0,%d0
                move.b  SP_TF_COLOR(%sp),%d0
                and.w   #0x0F,%d0
                add.w   %d0,%d0
                add.w   %d0,%d0                | * 4 for bra.w table
                lea     .Ltf8_table(%pc),%a2
                jmp     0(%a2,%d0.w)
                | One entry per color 0..15, in order.
 .Ltf8_table:
                bra.w   .Ltf8_body_0
                bra.w   .Ltf8_body_1
                bra.w   .Ltf8_body_2
                bra.w   .Ltf8_body_3
                bra.w   .Ltf8_body_4
                bra.w   .Ltf8_body_5
                bra.w   .Ltf8_body_6
                bra.w   .Ltf8_body_7
                bra.w   .Ltf8_body_8
                bra.w   .Ltf8_body_9
                bra.w   .Ltf8_body_10
                bra.w   .Ltf8_body_11
                bra.w   .Ltf8_body_12
                bra.w   .Ltf8_body_13
                bra.w   .Ltf8_body_14
                bra.w   .Ltf8_body_15
                | Sixteen unrolled 8-row bodies, one per color; each one
                | branches to .Ltf8_done when finished.
                TF8_BODY  0
                TF8_BODY  1
                TF8_BODY  2
                TF8_BODY  3
                TF8_BODY  4
                TF8_BODY  5
                TF8_BODY  6
                TF8_BODY  7
                TF8_BODY  8
                TF8_BODY  9
                TF8_BODY  10
                TF8_BODY  11
                TF8_BODY  12
                TF8_BODY  13
                TF8_BODY  14
                TF8_BODY  15
                | Common exit: restore the saved registers and return.
 .Ltf8_done:
                movem.l (%sp)+,%d3-%d4/%a2-%a3
                rts
 | ---- surface68kStFillRectMulti -------------------------------------
 |
 | Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
@ -782,6 +905,21 @@ frmRightMaskLut:
                .word   0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
                .align  2
 | Shared y -> y*160 LUT. Used by drawLine (DL_PLOT), drawCircle
 | (YP_REC), fillCircle (SPAN_BODY). 200 words = 400 bytes.
 | Replaces a 44-cyc lsl.w #5 + lsl.w #7 + add.w shift chain with
 | a 14-cyc indexed-word load. Exported so circle.s and fillCircle.s
 | can reference it via absolute addressing without duplication.
                .globl  _gStRowOffsetLut
 _gStRowOffsetLut:
                .set    li_y, 0
                .rept   200
                .word   li_y * 160
                .set    li_y, li_y + 1
                .endr
 | ---- surface68kStLongFill ----------------------------------------
 |
 | Bulk long-fill helper for full-row fills (surfaceClear, fillRect
--- a/src/port/atarist/spriteAsm.s
+++ b/src/port/atarist/spriteAsm.s
@ -1,30 +1,19 @@
-| ST byte-aligned sprite save / restore via 256-entry plane-spread
+| ST byte-aligned sprite save / restore. Buffer holds plane-major
-| LUT. The LUT entry for each plane byte value is a 32-bit "spread"
+| bytes: per row, plane0/1/2/3 per tile col, for w/8 tile cols. The
-| where each plane byte bit lands at the corresponding plane-0 bit
+| inner per-tile-col macro is 4 byte copies (no chunky <-> planar
-| position of the 4-byte chunky output. For plane N, we shift the
+| conversion since the buffer matches the surface's plane layout).
 | LUT entry left by N to put bits at the plane-N positions, then OR
 | the 4 plane contributions together to get the chunky long.
 |
 | LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut
 | in hal.c:
 |
 |   gStPlaneSpreadLut[b] for plane byte b:
 |     bit i of b (i = 0 = MSB = leftmost pixel) maps to bit
 |     bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4)
 |     of the long. Plane 0's bits land at nibble bit 0 of each
 |     chunky byte; left-shift the LUT entry by N for plane N.
 |
 | ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
 |
 |   void surface68kStSpriteSaveByteAligned(uint8_t *base,
 |                                          uint16_t x, uint16_t y,
 |                                          uint16_t w, uint16_t h,
-|                                          uint8_t *dstChunky);
+|                                          uint8_t *dstPlaneBytes);
 |
 |   void surface68kStSpriteRestoreByteAligned(uint8_t *base,
 |                                             uint16_t x, uint16_t y,
 |                                             uint16_t w, uint16_t h,
-|                                             const uint8_t *srcChunky);
+|                                             const uint8_t *srcPlaneBytes);
                .text
@ -36,19 +25,12 @@
                .equ    SP_Y,       SP_OFF + 8 + 2
                .equ    SP_W,       SP_OFF + 12 + 2
                .equ    SP_H,       SP_OFF + 16 + 2
-                .equ    SP_CHUNKY,  SP_OFF + 20
+                .equ    SP_BUF,     SP_OFF + 20
                .equ    SP_LUT,     SP_OFF + 24
 | Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
 | a0 -> plane 0 byte (high or low half), strides 2 to next plane
 | a1 -> output planar bytes (advanced by 4)
 | a2 -> unused (LUT no longer needed)
 |
 | Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
 | plane-major bytes (per row: plane0, plane1, plane2, plane3 per
 | tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
 | lookups + shifts + ORs.
                .macro  SAVE_TILECOL
                move.b  (%a0),(%a1)+           | plane 0
@ -64,13 +46,7 @@ _surface68kStSpriteSaveByteAligned:
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                move.l  SP_BASE(%sp),%a3
-                move.l  SP_CHUNKY(%sp),%a1
+                move.l  SP_BUF(%sp),%a1
                | LUT pointer comes in via stack arg -- guaranteed
                | long-aligned because gcc passes ptr args via
                | move.l on a long-aligned sp slot. Avoids the BSS
                | misalignment problem on TOS .PRG (BSS pads only to
                | 2 bytes, even uint32_t slots can land at mod-4 = 2).
                move.l  SP_LUT(%sp),%a2
                move.w  SP_W(%sp),%d5
                lsr.w   #3,%d5                 | d5 = tileCols
@ -128,10 +104,6 @@ _surface68kStSpriteSaveByteAligned:
 | Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
 | a0 -> plane 0 byte (high or low half)
 | a1 -> input planar bytes (advanced by 4)
 | a2 -> unused (LUT no longer needed)
 |
 | Phase 10.5: dropped chunky -> planar conversion. Buffer layout
 | matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col.
                .macro  RESTORE_TILECOL
                move.b  (%a1)+,(%a0)           | plane 0
@ -147,8 +119,7 @@ _surface68kStSpriteRestoreByteAligned:
                movem.l %d2-%d7/%a2-%a6,-(%sp)
                move.l  SP_BASE(%sp),%a3
-                move.l  SP_CHUNKY(%sp),%a1
+                move.l  SP_BUF(%sp),%a1
                move.l  SP_LUT(%sp),%a2        | gC2pLut passed in
                | tileCols is held in a5 (not d5) because the macro
                | trashes d5 (uses it for pb3).
@ -200,3 +171,151 @@ _surface68kStSpriteRestoreByteAligned:
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts
 | ---- surface68kStSprite16x16Save / Restore -----------------------
 |
 | Specialized 16x16 sprite save/restore: 16 rows fully unrolled,
 | 8 byte copies per row (2 tile cols), no col loop. Drops the asm
 | walker's per-row col-init + col-loop-check overhead.
 |
 |   void surface68kStSprite16x16Save(uint8_t *base,
 |                                    uint16_t x, uint16_t y,
 |                                    uint8_t *dstBuf);
 |
 |   void surface68kStSprite16x16Restore(uint8_t *base,
 |                                       uint16_t x, uint16_t y,
 |                                       const uint8_t *srcBuf);
 |
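 | Caller guarantees x is byte-aligned (x mod 8 == 0). The buffer is
 | 128 bytes: 16 rows x 8 bytes, each row stored as the 4 bytes of
 | tile col 0 followed by the 4 bytes of tile col 1. Two halfOff
 | variants dispatch on (x & 8): halfOff=0 reads/writes within one
 | group (offsets 0/2/4/6 high half + 1/3/5/7 low half). halfOff=1
 | spans two groups (low half of group N + high half of group N+1).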
                | Stack offsets of the C arguments as seen after the
                | routine's own movem push. SP16_SAVED must equal the
                | byte size of that movem register list; the extra 4 in
                | SP16_OFF skips the return address. Each argument
                | occupies a 4-byte slot; the two uint16_t arguments are
                | read with move.w from the second word of their slot
                | (hence the "+ 2").
                .equ    SP16_SAVED, 12         | d2/a2-a3 = 3 longs
                .equ    SP16_OFF,         (SP16_SAVED + 4)
                .equ    SP16_BASE,    SP16_OFF + 0
                .equ    SP16_X,       SP16_OFF + 4 + 2
                .equ    SP16_Y,       SP16_OFF + 8 + 2
                .equ    SP16_BUF,     SP16_OFF + 12
 | Macro: setup a0 = base + y*160 + group*8 + halfOff
 |   group   = x >> 4         (8 bytes per 16-pixel group)
 |   halfOff = (x & 8) >> 3   (0 or 1)
 | Reads base / x / y through the SP16_* offsets, so %sp must be
 | exactly SP16_SAVED bytes below the return address when expanded.
 | Trashes: d0, d1, d2, a3; a0 left at row start
 | Result:  d0 = halfOff (0 or 1)
                .macro  SP16_SETUP_A0
                move.l  SP16_BASE(%sp),%a3
                move.w  SP16_X(%sp),%d0
                move.w  SP16_Y(%sp),%d1
                | a0 = base + y*160
                ext.l   %d1                    | widen y to 32 bits (sign-extending)
                move.l  %d1,%d2
                lsl.l   #5,%d1                 | d1 = y*32
                lsl.l   #7,%d2                 | d2 = y*128
                add.l   %d2,%d1                | d1 = y*160
                lea     0(%a3,%d1.l),%a0
                | a0 += (x>>4) * 8
                move.w  %d0,%d1
                lsr.w   #4,%d1
                lsl.w   #3,%d1
                ext.l   %d1
                add.l   %d1,%a0
                | a0 += halfOff (= (x & 8) >> 3)
                and.w   #8,%d0
                lsr.w   #3,%d0
                ext.l   %d0
                add.l   %d0,%a0
                | d0 = halfOff (0 or 1) for downstream dispatch
                .endm
                | surface68kStSprite16x16Save: copy 16 rows of 8 bytes
                | from the memory at base (row/column selected by x, y)
                | into the buffer passed as the 4th argument. The buffer
                | pointer advances by 8 per row, 128 bytes in total: the
                | 4 bytes of tile col 0 first, then the 4 of tile col 1.
                | NOTE(review): a2 is pushed and popped but never used
                | in this routine; the register list must stay in step
                | with SP16_SAVED if it is ever trimmed.
                .globl  _surface68kStSprite16x16Save
 _surface68kStSprite16x16Save:
                movem.l %d2/%a2-%a3,-(%sp)
                SP16_SETUP_A0
                move.l  SP16_BUF(%sp),%a1
                tst.w   %d0                    | d0 = halfOff from SP16_SETUP_A0
                bne.w   .Lsp16s_low
                | halfOff=0: a0 at high half. Col 0 = high (offsets
                | 0,2,4,6); col 1 = low (offsets 1,3,5,7).
                .rept   16
                move.b  (%a0),(%a1)+
                move.b  2(%a0),(%a1)+
                move.b  4(%a0),(%a1)+
                move.b  6(%a0),(%a1)+
                move.b  1(%a0),(%a1)+
                move.b  3(%a0),(%a1)+
                move.b  5(%a0),(%a1)+
                move.b  7(%a0),(%a1)+
                lea     160(%a0),%a0           | next row (160 bytes per row)
                .endr
                bra.w   .Lsp16s_done
 .Lsp16s_low:
                | halfOff=1: a0 = group base + 1, i.e. the low-half
                | byte of this group. Col 0 = low of this group, offsets
                | 0,2,4,6 from a0. Col 1 = high of next group, which
                | starts 8 bytes after the group base, so its bytes sit
                | at offsets 7,9,11,13 from a0.
                .rept   16
                move.b  (%a0),(%a1)+
                move.b  2(%a0),(%a1)+
                move.b  4(%a0),(%a1)+
                move.b  6(%a0),(%a1)+
                move.b  7(%a0),(%a1)+
                move.b  9(%a0),(%a1)+
                move.b  11(%a0),(%a1)+
                move.b  13(%a0),(%a1)+
                lea     160(%a0),%a0           | next row (160 bytes per row)
                .endr
 .Lsp16s_done:
                movem.l (%sp)+,%d2/%a2-%a3
                rts
                | surface68kStSprite16x16Restore: inverse of the 16x16
                | Save routine. Reads 16 rows of 8 bytes sequentially
                | from the buffer passed as the 4th argument and writes
                | them to the same byte offsets the Save routine read
                | from (row/column selected by x, y relative to base).
                | NOTE(review): a2 is pushed and popped but never used
                | in this routine; the register list must stay in step
                | with SP16_SAVED if it is ever trimmed.
                .globl  _surface68kStSprite16x16Restore
 _surface68kStSprite16x16Restore:
                movem.l %d2/%a2-%a3,-(%sp)
                SP16_SETUP_A0
                move.l  SP16_BUF(%sp),%a1
                tst.w   %d0                    | d0 = halfOff from SP16_SETUP_A0
                bne.w   .Lsp16r_low
                | halfOff=0: write high half (col 0) + low half (col 1).
                .rept   16
                move.b  (%a1)+,(%a0)
                move.b  (%a1)+,2(%a0)
                move.b  (%a1)+,4(%a0)
                move.b  (%a1)+,6(%a0)
                move.b  (%a1)+,1(%a0)
                move.b  (%a1)+,3(%a0)
                move.b  (%a1)+,5(%a0)
                move.b  (%a1)+,7(%a0)
                lea     160(%a0),%a0           | next row (160 bytes per row)
                .endr
                bra.w   .Lsp16r_done
 .Lsp16r_low:
                | halfOff=1: a0 = group base + 1. Col 0 goes to the
                | low-half bytes of this group (offsets 0,2,4,6 from a0);
                | col 1 goes to the high-half bytes of the next group
                | (offsets 7,9,11,13 from a0).
                .rept   16
                move.b  (%a1)+,(%a0)
                move.b  (%a1)+,2(%a0)
                move.b  (%a1)+,4(%a0)
                move.b  (%a1)+,6(%a0)
                move.b  (%a1)+,7(%a0)
                move.b  (%a1)+,9(%a0)
                move.b  (%a1)+,11(%a0)
                move.b  (%a1)+,13(%a0)
                lea     160(%a0),%a0           | next row (160 bytes per row)
                .endr
 .Lsp16r_done:
                movem.l (%sp)+,%d2/%a2-%a3
                rts
--- a/src/port/dos/hal.c
+++ b/src/port/dos/hal.c
@ -614,12 +614,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }
--- a/src/port/iigs/hal.c
+++ b/src/port/iigs/hal.c
@ -395,12 +395,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
 }
-bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }
-bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
+bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
 }