Major Atari ST work.

This commit is contained in:
Scott Duensing 2026-05-04 00:35:41 -05:00
parent b1e24b4650
commit 818dc801db
13 changed files with 3480 additions and 198 deletions

View file

@ -37,6 +37,7 @@ LIB_OBJS := \
$(patsubst $(SRC_PORT)/atarist/%.s,$(BUILD)/obj/port/%.o,$(PORT_S_SRCS)) \ $(patsubst $(SRC_PORT)/atarist/%.s,$(BUILD)/obj/port/%.o,$(PORT_S_SRCS)) \
$(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \ $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \
$(BUILD)/obj/codegen/spriteEmit68k.o \ $(BUILD)/obj/codegen/spriteEmit68k.o \
$(BUILD)/obj/codegen/spriteEmitInterleaved68k.o \
$(BUILD)/obj/codegen/spriteCompile.o $(BUILD)/obj/codegen/spriteCompile.o
LIB := $(LIBDIR)/libjoey.a LIB := $(LIBDIR)/libjoey.a

View file

@ -37,7 +37,7 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
#elif defined(JOEYLIB_PLATFORM_AMIGA) #elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitDrawPlanar68k(out, sp, shift); return spriteEmitDrawPlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST) #elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitDraw68k(out, sp, shift); return spriteEmitDrawInterleaved68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS) #elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitDrawIigs(out, sp, shift); return spriteEmitDrawIigs(out, sp, shift);
#else #else
@ -57,7 +57,7 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
#elif defined(JOEYLIB_PLATFORM_AMIGA) #elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitSavePlanar68k(out, sp, shift); return spriteEmitSavePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST) #elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitSave68k(out, sp, shift); return spriteEmitSaveInterleaved68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS) #elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitSaveIigs(out, sp, shift); return spriteEmitSaveIigs(out, sp, shift);
#else #else
@ -73,7 +73,7 @@ static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t sh
#elif defined(JOEYLIB_PLATFORM_AMIGA) #elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitRestorePlanar68k(out, sp, shift); return spriteEmitRestorePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST) #elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitRestore68k(out, sp, shift); return spriteEmitRestoreInterleaved68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS) #elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitRestoreIigs(out, sp, shift); return spriteEmitRestoreIigs(out, sp, shift);
#else #else
@ -139,6 +139,15 @@ bool spriteCompile(SpriteT *sp) {
free(scratch); free(scratch);
return false; return false;
} }
if (totalSize == 0) {
/* Platforms whose emitter returns 0 for every (shift, op) have
* no compiled bytes -- spriteCompiledDraw / SaveUnder /
* RestoreUnder would dereference a degenerate slot or chunky
* shadow. Bail so sp->slot stays NULL and the dispatcher
* routes through the interpreted halSpriteXxxPlanes path. */
free(scratch);
return false;
}
slot = codegenArenaAlloc(totalSize); slot = codegenArenaAlloc(totalSize);
if (slot == NULL) { if (slot == NULL) {
@ -684,6 +693,68 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes);
} }
#elif defined(JOEYLIB_PLATFORM_ATARIST)
/* ST word-interleaved planar runtime dispatch. The JIT routine takes
* one arg: groupBase = pd->base + y*160 + (x>>4)*8 (the address of
* the first 16-pixel group the sprite touches). It walks rows by
* adda.w #160 at the end of each row. Per (row, tile_col, plane) it
* emits up to one move.b / clr.b / andi.b+ori.b / ori.b chain at
* d16(a0).
*
* shift selection (in spriteInternal.h SPRITE_SHIFT_INDEX):
* 0 : byte-aligned x with x mod 16 == 0 (first tile col high half)
* 1 : byte-aligned x with x mod 16 == 8 (first tile col low half)
* 2+ : non-byte-aligned x, never compiled (emitter returns 0); the
* per-shift offset is SPRITE_NOT_COMPILED so the dispatcher
* falls back to halSpriteDrawPlanes. */
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*CompiledDrawFn)(uint8_t *firstGroup);
    uint8_t alignIdx;
    uint16_t entryOffset;
    uint8_t *planeBase;
    uint8_t *firstGroup;
    CompiledDrawFn entry;

    alignIdx = SPRITE_SHIFT_INDEX(x);
    entryOffset = sp->routineOffsets[alignIdx][SPRITE_OP_DRAW];
    if (entryOffset == SPRITE_NOT_COMPILED) {
        /* No JIT routine for this alignment (x mod 8 != 0). Because
         * COMPILED_SPRITE_WRITES_PLANES is 1 on ST, the dispatcher
         * suppresses the interpreted planes hook whenever it takes
         * the compiled path, so for uncompiled alignments we must
         * invoke the interpreted draw ourselves before returning. */
        halSpriteDrawPlanes(dst, sp, x, y);
        return;
    }
    planeBase = halSurfacePlanePtr(dst, 0);
    if (planeBase == NULL) {
        return;
    }
    /* Address of the first 16-pixel group the sprite touches:
     * base + y*160 + (x>>4)*8. The JIT routine advances rows on its
     * own (adda.w #160 per scanline), so this is its only argument. */
    firstGroup = planeBase
        + (uint16_t)y * 160u
        + (uint16_t)((uint16_t)x >> 4) * 8u;
    entry = (CompiledDrawFn)(codegenArenaBase() + sp->slot->offset + entryOffset);
    entry(firstGroup);
}
/* Save/Restore aren't compiled on ST yet (emitter returns 0). The
* dispatcher's check on sp->routineOffsets[shift][SPRITE_OP_SAVE/_RESTORE]
* == SPRITE_NOT_COMPILED already routes those through the
* interpreted halSpriteSavePlanes / halSpriteRestorePlanes. These
* stubs exist only to satisfy the linker. */
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    /* Linker stub: save-under is never compiled on ST (emitter
     * returns 0), so the dispatcher always takes the interpreted
     * halSpriteSavePlanes path and this body is never meaningful. */
    (void)backup;
    (void)y;
    (void)x;
    (void)sp;
    (void)src;
}
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    /* Linker stub: restore-under is never compiled on ST, so the
     * interpreted halSpriteRestorePlanes path always runs instead. */
    (void)backup;
    (void)dst;
}
#else #else
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {

View file

@ -0,0 +1,220 @@
// 68k sprite codegen for ST word-interleaved planar layout. Emits a
// cdecl-callable routine `void draw(uint8_t *groupBase)` that walks
// the sprite's tile data and writes plane bytes via `d16(a0)` chains.
//
// ST planar layout reminder (doc/atarist_planar.md): one buffer; per
// scanline 20 groups of 8 bytes; per group, 4 plane words back-to-
// back. groupBase points at the FIRST group the sprite touches:
// pd->base + y * 160 + (x >> 4) * 8
//
// Shift index for ST is bit 3 of x (whether the sprite starts in the
// high half or low half of the first group). x mod 8 != 0 falls back
// to the interpreter (returns 0 from this emitter so sp->slot stays
// NULL for those alignments).
//
// Per (row, tile_col, plane) we emit one of:
// * nothing (op byte = 0, all transparent)
// * move.b #pbN, d16(a0) (op = 0xFF, full replace, 6 bytes)
// * clr.b d16(a0) (op = 0xFF AND pbN = 0, 4 bytes)
// * andi.b #~op, d16(a0) (op partial, pbN = 0, 6 bytes)
// * ori.b #pbN, d16(a0) (op partial, pbN == op, 6 bytes)
// * andi.b #~op + ori.b #pbN (mixed, 12 bytes)
//
// d16 is the byte offset from groupBase to the target plane byte.
// Layout of the byte offset:
// shift 0: byteOff = (col >> 1) * 8 + plane*2 + (col & 1)
// shift 1: byteOff = ((col + 1) >> 1) * 8 + plane*2 + (1 - (col & 1))
// Each tile column is 8 sprite pixels = exactly half a 16-pixel
// group, alternating high (offset 0) and low (offset 1) bytes of
// each plane word.
//
// Per row we adda.w #160, a0 to advance to the next scanline.
#include "joey/sprite.h"
#include "joey/surface.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
// ----- Constants -----
#define TILE_PIXELS 8
#define TILE_BYTES 32
#define TILE_BYTES_PER_ROW 4
#define ST_BYTES_PER_ROW 160
// ----- Helpers -----
/* Store a 16-bit value in big-endian (68k) byte order at out.
 * Returns the number of bytes written (always 2) so callers can
 * advance a cursor with `cursor += writeBE16(...)`. */
static uint16_t writeBE16(uint8_t *out, uint16_t value) {
    *out++ = (uint8_t)(value >> 8);
    *out = (uint8_t)value;
    return 2u;
}
// Build the 4 plane bytes + opacity byte for one (row, tileCol)
// pair. pbN bit 7 is sprite pixel 0 (leftmost), bit 0 is pixel 7.
// op bit N is set iff that pixel's color != 0.
/* Convert one 8-pixel row slice of the sprite's chunky 4bpp tile
 * data into planar form: four plane bytes (bit 7 = leftmost sprite
 * pixel, bit 0 = pixel 7) plus an opacity byte whose bit N is set
 * iff pixel N's color index is non-zero.
 *
 * sp      : sprite whose tileData is read (4bpp, two pixels/byte,
 *           high nibble is the even pixel).
 * row     : sprite pixel row (0 .. heightTiles*8 - 1).
 * tileCol : tile column (0 .. widthTiles - 1).
 * outPb0..outPb3 : receive plane 0..3 bytes.
 * outOp   : receives the opacity byte. */
static void buildPlaneBytes(const SpriteT *sp, uint16_t row, uint16_t tileCol,
                            uint8_t *outPb0, uint8_t *outPb1,
                            uint8_t *outPb2, uint8_t *outPb3,
                            uint8_t *outOp) {
    uint16_t tileY = (uint16_t)(row >> 3);
    uint16_t inTileY = (uint16_t)(row & 7u);
    uint16_t wTiles = sp->widthTiles;
    /* Use the TILE_* layout constants instead of the magic literals
     * 32/4/8 so the addressing stays in sync with the layout macros
     * defined at the top of this file. */
    const uint8_t *tileBytes = sp->tileData + (uint32_t)(tileY * wTiles + tileCol) * (uint32_t)TILE_BYTES;
    const uint8_t *tileRow = tileBytes + (uint32_t)inTileY * TILE_BYTES_PER_ROW;
    uint8_t pb0 = 0u;
    uint8_t pb1 = 0u;
    uint8_t pb2 = 0u;
    uint8_t pb3 = 0u;
    uint8_t op = 0u;
    uint8_t p;
    uint8_t packed;
    uint8_t color;
    uint8_t bit;
    for (p = 0; p < TILE_PIXELS; p++) {
        /* Two pixels per byte: even pixel in the high nibble. */
        packed = tileRow[p >> 1];
        color = (p & 1u) ? (uint8_t)(packed & 0x0Fu) : (uint8_t)(packed >> 4);
        bit = (uint8_t)(0x80u >> p);
        if (color != 0u) {
            op = (uint8_t)(op | bit);
            /* Scatter the 4-bit color index across the four planes. */
            if (color & 1u) pb0 = (uint8_t)(pb0 | bit);
            if (color & 2u) pb1 = (uint8_t)(pb1 | bit);
            if (color & 4u) pb2 = (uint8_t)(pb2 | bit);
            if (color & 8u) pb3 = (uint8_t)(pb3 | bit);
        }
    }
    *outPb0 = pb0;
    *outPb1 = pb1;
    *outPb2 = pb2;
    *outPb3 = pb3;
    *outOp = op;
}
// Emit code for one plane byte at d16(a0). Returns bytes written.
// op=opacity byte, pb=plane byte (subset of op).
/* Emit 68k machine code updating one plane byte at d16(a0).
 * op = opacity byte (which pixels are drawn), pb = plane byte (a
 * subset of op). Returns the number of code bytes written at
 * out + cursor (0, 4, 6, or 12). Big-endian words are written
 * inline (high byte first) rather than through writeBE16. */
static uint16_t emitPlaneByte(uint8_t *out, uint16_t cursor, uint16_t d16, uint8_t op, uint8_t pb) {
    uint8_t *dst = out + cursor;
    uint16_t len = 0u;
    uint8_t clearBits = (uint8_t)(~op & 0xFFu);

    if (op == 0u) {
        /* Fully transparent: no code at all. */
        return 0u;
    }
    if (op == 0xFFu && pb == 0u) {
        /* All 8 pixels opaque, plane bits all zero:
         * clr.b d16(a0) -- opcode 0x4228 + d16, 4 bytes. */
        dst[len++] = 0x42u;
        dst[len++] = 0x28u;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
        return len;
    }
    if (op == 0xFFu) {
        /* All 8 pixels opaque: replace the whole byte.
         * move.b #pb, d16(a0) -- opcode 0x117C + imm word + d16, 6 bytes. */
        dst[len++] = 0x11u;
        dst[len++] = 0x7Cu;
        dst[len++] = 0x00u;
        dst[len++] = pb;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
        return len;
    }
    /* Partial opacity: clear bits where needed, then set bits where
     * needed. Each step is skipped when it would be a no-op, which
     * reproduces the andi-only / ori-only / andi+ori cases. */
    if (pb != op) {
        /* andi.b #~op, d16(a0) -- opcode 0x0228 + imm word + d16, 6 bytes. */
        dst[len++] = 0x02u;
        dst[len++] = 0x28u;
        dst[len++] = 0x00u;
        dst[len++] = clearBits;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
    }
    if (pb != 0u) {
        /* ori.b #pb, d16(a0) -- opcode 0x0028 + imm word + d16, 6 bytes. */
        dst[len++] = 0x00u;
        dst[len++] = 0x28u;
        dst[len++] = 0x00u;
        dst[len++] = pb;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
    }
    return len;
}
// ----- Emit API -----
/* Emit one complete JIT draw routine for an ST sprite at a
 * byte-aligned x position.
 *
 * out   : destination code buffer (caller sizes it via a prior pass).
 * sp    : sprite whose tile pixels are baked into immediates.
 * shift : alignment index from SPRITE_SHIFT_INDEX; only 0 and 1 are
 *         compiled (see below).
 *
 * Returns the number of code bytes written, or 0 when this shift is
 * not compiled so the dispatcher falls back to the interpreter. The
 * emitted routine is cdecl: void draw(uint8_t *groupBase). */
uint16_t spriteEmitDrawInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;   /* write position in out; also the final code size */
    uint16_t row;      /* sprite pixel row */
    uint16_t col;      /* 8-pixel tile column */
    uint16_t plane;    /* bitplane index 0..3 */
    uint16_t heightPx; /* sprite height in pixels */
    uint16_t wTiles;   /* sprite width in tile columns */
    uint8_t pb[4];     /* plane bytes for the current (row, col) */
    uint8_t op;        /* opacity byte: bit set = non-transparent pixel */
    /* Only shifts 0 and 1 emit code. shift 0 = first tile col in
     * high half (x mod 16 == 0). shift 1 = first tile col in low
     * half (x mod 16 == 8). Other byte alignments fall through to
     * the interpreter via halSpriteDrawPlanes. */
    if (shift > 1u) {
        return 0u;
    }
    cursor = 0u;
    heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    wTiles = sp->widthTiles;
    /* Prologue: movea.l 4(sp), a0. Opcode 0x206F + d16=4. 4 bytes.
     * (cdecl: the single groupBase argument sits just above the
     * return address on entry.) */
    cursor += writeBE16(out + cursor, 0x206Fu);
    cursor += writeBE16(out + cursor, 0x0004u);
    for (row = 0; row < heightPx; row++) {
        if (row > 0u) {
            /* adda.w #160, a0. Opcode 0xD0FC + imm word. 4 bytes.
             * Advances a0 one ST scanline (160 bytes). */
            cursor += writeBE16(out + cursor, 0xD0FCu);
            cursor += writeBE16(out + cursor, (uint16_t)ST_BYTES_PER_ROW);
        }
        for (col = 0; col < wTiles; col++) {
            buildPlaneBytes(sp, row, col, &pb[0], &pb[1], &pb[2], &pb[3], &op);
            if (op == 0u) {
                continue; /* whole tile column row is transparent: emit nothing */
            }
            for (plane = 0; plane < 4u; plane++) {
                uint16_t d16;
                if (shift == 0u) {
                    /* Each tile column is half a 16-pixel group:
                     * col 0 (high) -> +0, col 1 (low) -> +1, col 2
                     * (high of group 1) -> +8, ... plus plane*2 for
                     * the plane word within the group. */
                    d16 = (uint16_t)((col >> 1) * 8 + plane * 2 + (col & 1u));
                } else {
                    /* Shifted by 8px: col 0 (low) -> +1, col 1
                     * (high of group 1) -> +8, ... */
                    d16 = (uint16_t)(((col + 1u) >> 1) * 8 + plane * 2 + (1u - (col & 1u)));
                }
                cursor += emitPlaneByte(out, cursor, d16, op, pb[plane]);
            }
        }
    }
    /* Epilogue: rts. */
    cursor += writeBE16(out + cursor, 0x4E75u);
    return cursor;
}
/* Save / restore aren't implemented yet -- returning 0 so they fall
* through to the C interpreter (halSpriteSavePlanes / halSpriteRestorePlanes
* fast paths cover the byte-aligned case). */
uint16_t spriteEmitSaveInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    /* No ST save-under JIT yet: returning 0 marks every shift as
     * not-compiled so callers stay on the interpreted path. */
    (void)shift;
    (void)sp;
    (void)out;
    return 0u;
}
uint16_t spriteEmitRestoreInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    /* No ST restore-under JIT yet: always report zero code bytes so
     * the interpreted restore path handles every alignment. */
    (void)shift;
    (void)sp;
    (void)out;
    return 0u;
}

View file

@ -57,4 +57,15 @@ uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t sh
uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
// Word-interleaved planar 68k emitter (ST). Calling convention for
// the emitted bytes:
// void draw(uint8_t *groupBase);
// where groupBase = pd->base + y*160 + (x>>4)*8. Shifts 0 and 1 emit
// real bytes (x mod 16 == 0 for shift 0, x mod 16 == 8 for shift 1);
// other shifts return 0 so the cross-platform dispatcher falls back
// to halSpriteDrawPlanes.
uint16_t spriteEmitDrawInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitSaveInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestoreInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
#endif #endif

View file

@ -31,7 +31,11 @@
// paths still need the hooks unconditionally on every platform -- the // paths still need the hooks unconditionally on every platform -- the
// chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook // chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook
// is the only draw. // is the only draw.
#if defined(JOEYLIB_PLATFORM_AMIGA) /* ST also runs pure planar post-Phase-9 (s->pixels NULL); the JIT
* routine writes plane bytes directly, so the chunky interpreter
* is a no-op and the halSpriteDrawPlanes hook would be a redundant
* second draw. Same rationale as Amiga. */
#if defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
#define COMPILED_SPRITE_WRITES_PLANES 1 #define COMPILED_SPRITE_WRITES_PLANES 1
#else #else
#define COMPILED_SPRITE_WRITES_PLANES 0 #define COMPILED_SPRITE_WRITES_PLANES 0

View file

@ -16,9 +16,15 @@
// Per-platform shift index used by the dispatcher. Chunky 4bpp ports // Per-platform shift index used by the dispatcher. Chunky 4bpp ports
// store one nibble per pixel pair so the only sub-byte alignment is // store one nibble per pixel pair so the only sub-byte alignment is
// x % 2. Amiga planar packs 8 pixels per plane byte so all 8 // x % 2. Amiga planar packs 8 pixels per plane byte so all 8
// alignments matter. // alignments matter. ST word-interleaved planar groups 16 pixels
// per word; for byte-aligned x (x mod 8 == 0) the only meaningful
// distinction is high vs low byte of the plane word, which is bit
// 3 of x (== (x >> 3) & 1). Other shifts (x mod 8 != 0) emit 0
// from the JIT and route to the interpreter.
#if defined(JOEYLIB_PLATFORM_AMIGA) #if defined(JOEYLIB_PLATFORM_AMIGA)
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7)) #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7))
#elif defined(JOEYLIB_PLATFORM_ATARIST)
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)(((x) & 7) ? 2u : (uint8_t)(((x) >> 3) & 1u)))
#else #else
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1)) #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1))
#endif #endif

View file

@ -141,11 +141,13 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; /* Skip the chunky path on planar ports (pixels NULL). */
srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; if (dst->pixels != NULL && src->pixels != NULL) {
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
if (!halFastTileCopy(dstRow0, srcRow0)) { srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
copyTileOpaque(dstRow0, srcRow0); if (!halFastTileCopy(dstRow0, srcRow0)) {
copyTileOpaque(dstRow0, srcRow0);
}
} }
halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy); halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy);
surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
@ -173,11 +175,13 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; /* Skip the chunky path on planar ports (pixels NULL). */
srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; if (dst->pixels != NULL && src->pixels != NULL) {
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
copyTileMasked(dstRow0, srcRow0, transparentIndex); if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
copyTileMasked(dstRow0, srcRow0, transparentIndex);
}
} }
halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex); halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex);
surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
@ -199,8 +203,9 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F));
if (!halFastTileFill(s, bx, by, if (s->pixels != NULL
(uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { && !halFastTileFill(s, bx, by,
(uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) {
uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
uint8_t i; uint8_t i;
for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) { for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) {
@ -232,16 +237,22 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
} }
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
src = &in->pixels[0]; src = &in->pixels[0];
if (!halFastTilePaste(dstRow, src)) { /* Skip the chunky write path on planar ports (dst->pixels NULL) --
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { * mirrors tileSnap's pixels-NULL short-circuit. Saves the dstRow
dstRow[0] = src[0]; * SURFACE_ROW_OFFSET multiply + halFastTilePaste jsr/rts per call
dstRow[1] = src[1]; * on ST/Amiga where the planar path below does the real work. */
dstRow[2] = src[2]; if (dst->pixels != NULL) {
dstRow[3] = src[3]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
dstRow += SURFACE_BYTES_PER_ROW; if (!halFastTilePaste(dstRow, src)) {
src += TILE_BYTES_PER_ROW; for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
dstRow[0] = src[0];
dstRow[1] = src[1];
dstRow[2] = src[2];
dstRow[3] = src[3];
dstRow += SURFACE_BYTES_PER_ROW;
src += TILE_BYTES_PER_ROW;
}
} }
} }
halTilePastePlanes(dst, bx, by, &in->pixels[0]); halTilePastePlanes(dst, bx, by, &in->pixels[0]);

View file

@ -39,6 +39,13 @@
#define ST_MFP_IMRA ((volatile uint8_t *)0xFFFFFA13L) #define ST_MFP_IMRA ((volatile uint8_t *)0xFFFFFA13L)
#define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL) #define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL)
// YM2149 (sound chip) supervisor-only ports. Index reg 7 (mixer)
// controls per-channel tone + noise enables; reg 8/9/A are volumes
// for channels A/B/C; regs 0/1, 2/3, 4/5 are tone period for those
// channels; reg 6 is noise period.
#define ST_YM_SELECT ((volatile uint8_t *)0xFFFF8800L)
#define ST_YM_DATA ((volatile uint8_t *)0xFFFF8802L)
#define MFP_TA_BIT 0x20 #define MFP_TA_BIT 0x20
#define MFP_TACR_STOP 0x00 #define MFP_TACR_STOP 0x00
#define MFP_TACR_DIV200 0x07 #define MFP_TACR_DIV200 0x07
@ -90,6 +97,32 @@ static long installTimerA(void) {
gNeedRefill[0] = 0; gNeedRefill[0] = 0;
gNeedRefill[1] = 0; gNeedRefill[1] = 0;
// YM2149 setup for PWM-via-volume on channel A:
// reg 7 (mixer): set bits 0 (tone A off) and 3 (noise A off);
// preserve bits 6+7 (I/O port directions, used
// by TOS for floppy / keyboard / printer).
// reg 8 (channel A volume): start at 0 to avoid a pop at start.
//
// Without the mixer setup, whatever state TOS left noise A in
// gets gated by our 12 kHz volume writes -- if noise A was on,
// a constant volume = constant hiss. Standard PWM-DAC trick is
// to disable both tone and noise so the volume reg is a pure
// 4-bit amplitude DAC.
//
// We can't reliably read back YM regs on the ST (the data port
// returns last-write, not register contents), so we OR in the
// disable bits over an assumed-safe TOS-default mask. Bit 6 set
// (port A output) matches stock TOS; bit 7 set (port B output)
// matches the centronics-printer direction TOS configures.
*ST_YM_SELECT = 7;
*ST_YM_DATA = 0xFF; // all tones + noises off; I/O ports A+B output (TOS default)
*ST_YM_SELECT = 8;
*ST_YM_DATA = 0; // channel A volume = 0 to avoid a pop at start
*ST_YM_SELECT = 9;
*ST_YM_DATA = 0; // channel B volume = 0
*ST_YM_SELECT = 10;
*ST_YM_DATA = 0; // channel C volume = 0
// MFP Timer A: stop, install our vector, set prescaler 200 + data // MFP Timer A: stop, install our vector, set prescaler 200 + data
// 1 (= 2.4576 MHz / 200 = 12288 Hz), then start. // 1 (= 2.4576 MHz / 200 = 12288 Hz), then start.
*ST_MFP_TACR = MFP_TACR_STOP; *ST_MFP_TACR = MFP_TACR_STOP;
@ -108,6 +141,10 @@ static long uninstallTimerA(void) {
(void)Setexc(VEC_MFP_TA, (long)gOldTimerAVec); (void)Setexc(VEC_MFP_TA, (long)gOldTimerAVec);
gOldTimerAVec = NULL; gOldTimerAVec = NULL;
} }
/* Silence channel A volume so handoff back to TOS is clean (no
* residual DC level on the speaker). */
*ST_YM_SELECT = 8;
*ST_YM_DATA = 0;
return 0; return 0;
} }

282
src/port/atarist/circle.s Normal file
View file

@ -0,0 +1,282 @@
| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled.
|
| Mirrors src/port/amiga/circle.s in spirit but for ST's single
| word-interleaved planar buffer:
| * Per scanline: 20 groups of 8 bytes; each group is 4 plane
| words back-to-back (p0_word, p1_word, p2_word, p3_word).
| * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15).
| * Plane N's word at row y, group g: base + y*160 + g*8 + N*2.
|
| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp40
| words) gives a branchless 4-plane RMW per pixel. 8 octants are
| inlined per Bresenham iter; no bsr.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStCircleOutline(uint8_t *base,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| Register allocation:
| d2.w = bx (Bresenham)
| d3.w = by (Bresenham)
| d4.w = err (Bresenham)
| d5.w = cx (cached)
| a4 = cy (cached, sign-extended)
| a3 = base
| a5 = bitMaskWordLut
| d0,d1,d6,d7 = scratch
|
| Scratch block (40 bytes) at sp+0..39. Each xp record is 8 bytes:
|   groupOff_word (2), bitMask_word (2), 4 bytes unused (the notMask
|   is derived from bitMask with not.w at plot time, not stored).
|   groupOff = (x >> 4) * 8 (byte offset of the group within a row)
|   bitMask  = 1 << (15 - (x & 15)) -- a WORD, since ST plane data
|   is word-wide (unlike Amiga's byte masks).
|
| sp+0..7:   xp1 record (cx + bx)
| sp+8..15:  xp2 record (cx - bx)
| sp+16..23: xp3 record (cx + by)
| sp+24..31: xp4 record (cx - by)
| sp+32..33: yp1_off = (cy + by) * 160
| sp+34..35: yp2_off = (cy - by) * 160
| sp+36..37: yp3_off = (cy + bx) * 160
| sp+38..39: yp4_off = (cy - bx) * 160
| Total: 40 bytes.
.text
| ---- BIT_MASK_WORD: build 1 << (15 - (x & 15)) ---------
| Look up via 16-entry table (a5 holds base). Cheaper than variable
| shift on 68000 (which is 8 + 2n cyc). Table is 32 bytes (16 words).
| Returns word in d_out.
| ---- XP_REC: build xp record at sp+slot for xp = cx <signOp> <xreg>
|   signOp: add or sub
|   xreg:   %d2 (bx) or %d3 (by)
|   slot:   0, 8, 16, or 24
|   Stores the groupOff word at slot(sp) and the bitMask word at
|   slot+2(sp); the record's remaining 4 bytes are left untouched.
|   Trashes: d6, d7 only (reads d5 = cx and a5 = LUT base)
.macro XP_REC slot, signOp, xreg
move.w %d5,%d6 | d6 = cx
\signOp\().w \xreg,%d6 | d6 = xp
move.w %d6,%d7
lsr.w #4,%d7 | d7 = group (xp >> 4)
lsl.w #3,%d7 | d7 = group * 8 (byte offset of group within row)
and.w #15,%d6 | d6 = xp & 15 (bit index 0..15)
add.w %d6,%d6 | d6 *= 2 (word index into LUT)
move.w (%a5,%d6.w),%d6 | d6 = bitMask word = 1 << (15 - (xp & 15))
move.w %d7,\slot(%sp) | groupOff word
move.w %d6,\slot+2(%sp) | bitMask word
.endm
| ---- YP_REC: store (yp * 160) at sp+slot ---------
| yp = cy <signOp> <yreg>; trashes d0, d6.
.macro YP_REC slot, signOp, yreg
move.l %a4,%d6
\signOp\().w \yreg,%d6 | d6.w = yp
move.w %d6,%d0
lsl.w #5,%d6 | d6 = yp << 5
lsl.w #7,%d0 | d0 = yp << 7
add.w %d6,%d0 | d0 = yp * 160
move.w %d0,\slot(%sp)
.endm
| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ----
|   slotYp: 32, 34, 36, or 38 (yp_off word slot)
|   slotXp: 0, 8, 16, or 24 (xp record slot)
|   color: literal 0..15 (each .if arm picks or.w to set the plane
|   bit or and.w to clear it, so there are no runtime branches)
|   Trashes: d0, d1, d7, a2 (a2 is saved by the entry movem.l, so
|   clobbering it inside the loop is safe)
.macro PLOT_FIXED slotYp, slotXp, color
move.w \slotYp(%sp),%d0 | d0 = yp_off (row * 160)
add.w \slotXp(%sp),%d0 | d0 += groupOff
move.w \slotXp+2(%sp),%d1 | d1 = bitMask word
move.w %d1,%d7
not.w %d7 | d7 = notMask (bitMask inverted)
lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff (group ptr)
| 4 plane word RMWs at (a2)+, postinc walks p0->p1->p2->p3
.if ((\color) & 1)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.if ((\color) & 2)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.if ((\color) & 4)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.if ((\color) & 8)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.endm
| ---- PLOT_8: 8 octant pixels for hardcoded color ----
.macro PLOT_8 color
PLOT_FIXED 32, 0, \color | (cx+bx, cy+by)
PLOT_FIXED 32, 8, \color | (cx-bx, cy+by)
PLOT_FIXED 34, 0, \color | (cx+bx, cy-by)
PLOT_FIXED 34, 8, \color | (cx-bx, cy-by)
PLOT_FIXED 36, 16, \color | (cx+by, cy+bx)
PLOT_FIXED 36, 24, \color | (cx-by, cy+bx)
PLOT_FIXED 38, 16, \color | (cx+by, cy-bx)
PLOT_FIXED 38, 24, \color | (cx-by, cy-bx)
.endm
| ---- CO_BODY: full Bresenham loop body for hardcoded color ----
.macro CO_BODY color
XP_REC 0, add, %d2 | xp1 = cx+bx
XP_REC 8, sub, %d2 | xp2 = cx-bx
XP_REC 16, add, %d3 | xp3 = cx+by
XP_REC 24, sub, %d3 | xp4 = cx-by
YP_REC 32, add, %d3 | yp1 = (cy+by)*160
YP_REC 34, sub, %d3 | yp2 = (cy-by)*160
YP_REC 36, add, %d2 | yp3 = (cy+bx)*160
YP_REC 38, sub, %d2 | yp4 = (cy-bx)*160
PLOT_8 \color
addq.w #1,%d3
tst.w %d4
bgt .LcoStDecX_\color
add.w %d3,%d4
add.w %d3,%d4
addq.w #1,%d4
bra.w .LcoStLoop_\color
.LcoStDecX_\color:
subq.w #1,%d2
add.w %d3,%d4
add.w %d3,%d4
sub.w %d2,%d4
sub.w %d2,%d4
addq.w #1,%d4
bra.w .LcoStLoop_\color
.endm
.macro CO_LOOP_HDR color
.LcoStLoop_\color:
cmp.w %d3,%d2
bcs.w .LcoStDone
CO_BODY \color
.endm
| ---- Function entry ----
| Stack on entry (after movem.l of 11 regs + lea):
| sp+0..39: scratch (40 bytes)
| sp+40..83: movem (44 bytes)
| sp+84..87: return PC
| sp+88+0: base (uint8_t *)
| sp+88+4: cx (int promoted, .w at +88+4+2)
| sp+88+8: cy (int promoted, .w at +88+8+2)
| sp+88+12: r (int promoted, .w at +88+12+2)
| sp+88+16: color (int promoted, byte at +88+16+3)
.equ SP_SAVED, 44
.equ SP_LOCAL, 40
.equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL)
.equ SP_BASE, SP_OFF + 0
.equ SP_CX, SP_OFF + 4 + 2
.equ SP_CY, SP_OFF + 8 + 2
.equ SP_R, SP_OFF + 12 + 2
.equ SP_COLOR, SP_OFF + 16 + 3
.globl _surface68kStCircleOutline
_surface68kStCircleOutline:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_LOCAL(%sp),%sp
| Load base (a3) and bitMaskLut (a5).
move.l SP_BASE(%sp),%a3
lea bitMaskWordLut(%pc),%a5
| Cache cx in d5, cy (sign-extended) in a4.
move.w SP_CX(%sp),%d5
move.w SP_CY(%sp),%d6
ext.l %d6
movea.l %d6,%a4
| Bresenham init.
move.w SP_R(%sp),%d2 | bx = r
moveq #0,%d3 | by = 0
moveq #1,%d4
sub.w %d2,%d4 | err = 1 - bx
| Dispatch on color (low 4 bits) -> one of 16 main loops.
moveq #0,%d6
move.b SP_COLOR(%sp),%d6
and.w #0x0F,%d6
add.w %d6,%d6
add.w %d6,%d6 | * 4 for bra.w table
lea .LcoStTable(%pc),%a6
jmp 0(%a6,%d6.w)
.LcoStTable:
bra.w .LcoStLoop_0
bra.w .LcoStLoop_1
bra.w .LcoStLoop_2
bra.w .LcoStLoop_3
bra.w .LcoStLoop_4
bra.w .LcoStLoop_5
bra.w .LcoStLoop_6
bra.w .LcoStLoop_7
bra.w .LcoStLoop_8
bra.w .LcoStLoop_9
bra.w .LcoStLoop_10
bra.w .LcoStLoop_11
bra.w .LcoStLoop_12
bra.w .LcoStLoop_13
bra.w .LcoStLoop_14
bra.w .LcoStLoop_15
CO_LOOP_HDR 0
CO_LOOP_HDR 1
CO_LOOP_HDR 2
CO_LOOP_HDR 3
CO_LOOP_HDR 4
CO_LOOP_HDR 5
CO_LOOP_HDR 6
CO_LOOP_HDR 7
CO_LOOP_HDR 8
CO_LOOP_HDR 9
CO_LOOP_HDR 10
CO_LOOP_HDR 11
CO_LOOP_HDR 12
CO_LOOP_HDR 13
CO_LOOP_HDR 14
CO_LOOP_HDR 15
.LcoStDone:
lea SP_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
bitMaskWordLut:
.word 0x8000, 0x4000, 0x2000, 0x1000
.word 0x0800, 0x0400, 0x0200, 0x0100
.word 0x0080, 0x0040, 0x0020, 0x0010
.word 0x0008, 0x0004, 0x0002, 0x0001

View file

@ -0,0 +1,292 @@
| Atari ST word-interleaved planar fillCircle -- 68000 hand-rolled.
|
| Bresenham midpoint circle, 4 horizontal spans per Bresenham iter,
| paired by shared x-range so leftMask/rightMask are computed once
| per pair:
| Pair A: x in [cx-bx, cx+bx], rows y = cy+by, cy-by
| Pair B: x in [cx-by, cx+by], rows y = cy+bx, cy-bx
|
| Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r)
| is fully on-surface. Off-surface circles fall back to the C walker.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStFillCircle(uint8_t *base,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| (uint16_t/uint8_t args are promoted to int by the caller; hence
| the +2 / +3 byte offsets inside each 4-byte slot below.)
|
| Register allocation across the loop:
| d2.w = bx (Bresenham, starts at r)
| d3.w = by (Bresenham, starts at 0)
| d4.w = err
| d5.l = loLong (planes 0+1 long template)
| d6.l = hiLong (planes 2+3 long template)
| d7.b = color (low nibble; tested via btst)
| a3 = base
| a4 = scratch / current group pointer
| d0,d1 = scratch
|
| Stack scratch (8 bytes at 0(sp)..7(sp)):
| 0..1 leftMask (word; per pair)
| 2..3 rightMask (word; per pair)
| 4..5 numGroups (word; per pair)
| 6..7 groupFirstByteOff (word; per pair)
.text
.equ SP_FC_SAVED, 44
.equ SP_FC_LOCAL, 8
.equ SP_FC_OFF, (SP_FC_SAVED + 4 + SP_FC_LOCAL)
.equ SP_FC_BASE, SP_FC_OFF + 0
.equ SP_FC_CX, SP_FC_OFF + 4 + 2
.equ SP_FC_CY, SP_FC_OFF + 8 + 2
.equ SP_FC_R, SP_FC_OFF + 12 + 2
.equ SP_FC_COLOR, SP_FC_OFF + 16 + 3
| ---- COMPUTE_PAIR_MASKS macro -----------------------------------
| Input: d0.w = left, d1.w = right
| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
| 6(sp) groupFirstByteOff
| Trashes: d0, d1
| Requires: a5 = leftMaskLut, a6 = rightMaskLut (set up at entry).
| (No labels: straightline.)
.macro COMPUTE_PAIR_MASKS
move.w %d0,0(%sp) | stash left
move.w %d1,2(%sp) | stash right
| groupFirst & groupFirstByteOff (8 bytes per 16-pixel group)
move.w %d0,%d1
lsr.w #4,%d1 | groupFirst
move.w %d1,%d0
lsl.w #3,%d0 | groupFirstByteOff
move.w %d0,6(%sp)
| numGroups = (right >> 4) - groupFirst
move.w 2(%sp),%d0
lsr.w #4,%d0 | groupLast
sub.w %d1,%d0 | numGroups
move.w %d0,4(%sp)
| leftMask via LUT[bitFirst]; a5 = leftMaskLut base
move.w 0(%sp),%d0
and.w #15,%d0
add.w %d0,%d0
move.w (%a5,%d0.w),%d1
move.w %d1,0(%sp)
| rightMask via LUT[bitLast]; a6 = rightMaskLut base
move.w 2(%sp),%d0
and.w #15,%d0
add.w %d0,%d0
move.w (%a6,%d0.w),%d1
move.w %d1,2(%sp)
.endm
| ---- SPAN_BODY macro --------------------------------------------
| Render one row span using the pair masks at 0(sp)..7(sp).
| Input: d0.w = y (signed)
| a3 = base, d5 = loLong, d6 = hiLong, d7 = color
| Trashes: d0, d1, a4
| Labels use gas's \@ (unique per-expansion counter) so the macro
| can be expanded several times within one function.
.macro SPAN_BODY
| a4 = base + y*160 (y*160 = (y<<5)+(y<<7), no mul)
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | y*160
lea 0(%a3,%d0.l),%a4
| a4 += groupFirstByteOff
moveq #0,%d0
move.w 6(%sp),%d0
add.l %d0,%a4
| numGroups in d1; zero means span fits in one 16-pixel group
move.w 4(%sp),%d1
tst.w %d1
bne.s .Lsb_multi\@
| single-group: combinedMask = leftMask & rightMask
move.w 0(%sp),%d0
and.w 2(%sp),%d0
bsr .Lfc_applyMask
bra.w .Lsb_done\@
.Lsb_multi\@:
| leading mask. applyMask postinc-advances a4 by 8
| (the 4 plane RMWs each advance by 2 via (a4)+).
| applyMask trashes d1, so reload numGroups after bsr.
move.w 0(%sp),%d0
bsr .Lfc_applyMask
move.w 4(%sp),%d1 | reload numGroups
subq.w #1,%d1 | d1 = numMid
beq.s .Lsb_skipMid\@
.Lsb_midLoop\@:
move.l %d5,(%a4)+
move.l %d6,(%a4)+
subq.w #1,%d1
bne.s .Lsb_midLoop\@
.Lsb_skipMid\@:
| trailing mask
move.w 2(%sp),%d0
bsr .Lfc_applyMask
.Lsb_done\@:
.endm
| ---- _surface68kStFillCircle entry ------------------------------
.globl _surface68kStFillCircle
_surface68kStFillCircle:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_FC_LOCAL(%sp),%sp
| base, color
move.l SP_FC_BASE(%sp),%a3
moveq #0,%d7
move.b SP_FC_COLOR(%sp),%d7
| LUT bases (PC-relative indexed has only 8-bit
| displacement, so cache full pointers in a-regs).
lea leftMaskLut(%pc),%a5
lea rightMaskLut(%pc),%a6
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
| (high word = plane 0 word of a group, low word = plane 1)
moveq #0,%d5
btst #1,%d7
beq.s .Lfc_lo1
move.w #-1,%d5
.Lfc_lo1:
btst #0,%d7
beq.s .Lfc_lo0
ori.l #0xFFFF0000,%d5
.Lfc_lo0:
| hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0)
moveq #0,%d6
btst #3,%d7
beq.s .Lfc_hi3
move.w #-1,%d6
.Lfc_hi3:
btst #2,%d7
beq.s .Lfc_hi2
ori.l #0xFFFF0000,%d6
.Lfc_hi2:
| Bresenham init: bx=r, by=0, err=1-bx
move.w SP_FC_R(%sp),%d2
moveq #0,%d3
moveq #1,%d4
sub.w %d2,%d4
.Lfc_loop:
| loop while bx >= by (bcs = unsigned <; fine, both non-negative)
cmp.w %d3,%d2
bcs.w .Lfc_done
| --- Pair A: x range = (cx - bx, cx + bx)
move.w SP_FC_CX(%sp),%d0
move.w %d0,%d1
sub.w %d2,%d0 | left = cx - bx
add.w %d2,%d1 | right = cx + bx
COMPUTE_PAIR_MASKS
| Span A1: y = cy + by
move.w SP_FC_CY(%sp),%d0
add.w %d3,%d0
SPAN_BODY
| Span A2: y = cy - by
move.w SP_FC_CY(%sp),%d0
sub.w %d3,%d0
SPAN_BODY
| --- Pair B: x range = (cx - by, cx + by)
move.w SP_FC_CX(%sp),%d0
move.w %d0,%d1
sub.w %d3,%d0 | left = cx - by
add.w %d3,%d1 | right = cx + by
COMPUTE_PAIR_MASKS
| Span B1: y = cy + bx
move.w SP_FC_CY(%sp),%d0
add.w %d2,%d0
SPAN_BODY
| Span B2: y = cy - bx
move.w SP_FC_CY(%sp),%d0
sub.w %d2,%d0
SPAN_BODY
| --- Bresenham step
addq.w #1,%d3 | by++
tst.w %d4
bgt.s .Lfc_decBx
add.w %d3,%d4 | err += 2*by + 1
add.w %d3,%d4
addq.w #1,%d4
bra.w .Lfc_loop
.Lfc_decBx:
subq.w #1,%d2 | bx--
add.w %d3,%d4 | err += 2*(by - bx) + 1
add.w %d3,%d4
sub.w %d2,%d4
sub.w %d2,%d4
addq.w #1,%d4
bra.w .Lfc_loop
.Lfc_done:
lea SP_FC_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| ---- Apply 4-plane mask at (a4) -------------------------------
| Input: d0.w = mask, d7.b = color, a4 = group ptr
| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
| Trashes: d0, d1
| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
| Plane N: color bit N set -> OR mask, clear -> AND ~mask.
.Lfc_applyMask:
move.w %d0,%d1
not.w %d1 | d1 = notMask
btst #0,%d7
beq.s .Lfc_am0a
or.w %d0,(%a4)+
bra.s .Lfc_am1
.Lfc_am0a:
and.w %d1,(%a4)+
.Lfc_am1:
btst #1,%d7
beq.s .Lfc_am1a
or.w %d0,(%a4)+
bra.s .Lfc_am2
.Lfc_am1a:
and.w %d1,(%a4)+
.Lfc_am2:
btst #2,%d7
beq.s .Lfc_am2a
or.w %d0,(%a4)+
bra.s .Lfc_am3
.Lfc_am2a:
and.w %d1,(%a4)+
.Lfc_am3:
btst #3,%d7
beq.s .Lfc_am3a
or.w %d0,(%a4)+
rts
.Lfc_am3a:
and.w %d1,(%a4)+
rts
.align 2
| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
leftMaskLut:
.word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF
.word 0x0FFF, 0x07FF, 0x03FF, 0x01FF
.word 0x00FF, 0x007F, 0x003F, 0x001F
.word 0x000F, 0x0007, 0x0003, 0x0001
| rightMaskLut[i] = ~((1 << (15 - i)) - 1), indexed by bitLast (0..15)
rightMaskLut:
.word 0x8000, 0xC000, 0xE000, 0xF000
.word 0xF800, 0xFC00, 0xFE00, 0xFF00
.word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF

File diff suppressed because it is too large Load diff

853
src/port/atarist/lineSpan.s Normal file
View file

@ -0,0 +1,853 @@
| Atari ST word-interleaved planar drawLine -- 68000 hand-rolled.
|
| Bresenham line walker with 16-way color dispatch. Per pixel:
| * 4-plane word RMW with branchless OR/AND chosen at compile time
| * bit mask via 16-entry word table; group offset via (x>>4)<<3
| * y*160 = (y<<5)+(y<<7)
|
| Caller MUST guarantee the entire line lies on-surface (full clip
| precheck). Partial-clip lines fall back to the C walker.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStDrawLine(uint8_t *base,
| int16_t x0, int16_t y0,
| int16_t x1, int16_t y1,
| uint8_t color);
|
| (int16_t/uint8_t args are promoted to int by the caller; hence
| the +2 / +3 offsets inside each 4-byte slot below.)
|
| Register allocation in the inner loop:
| d2.w = x (current pixel)
| d3.w = y (current pixel)
| d4.w = err
| d5.w = dx (>= 0)
| d6.w = -dy_abs (<= 0; "Bresenham uses -dy")
| d7 = sx (long; moveq #1 or #-1, low word used for .w add)
| a4 = sy (long; sign-extended)
| a3 = base
| a5 = bitMaskWordLut
| a2 = scratch (per-pixel: base + byteOff)
| d0,d1 = scratch
|
| Stack scratch:
| sp+0..1 iter counter (max(dx, dy_abs) + 1)
.text
.equ SP_SAVED, 44
.equ SP_LOCAL, 4
.equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL)
.equ SP_BASE, SP_OFF + 0
.equ SP_X0, SP_OFF + 4 + 2
.equ SP_Y0, SP_OFF + 8 + 2
.equ SP_X1, SP_OFF + 12 + 2
.equ SP_Y1, SP_OFF + 16 + 2
.equ SP_COLOR, SP_OFF + 20 + 3
| ---- DL_PLOT: 4-plane word RMW for hardcoded color ----
| Inputs: d2.w = x, d3.w = y, a3 = base, a5 = bitMaskWordLut
| Trashes: d0, d1, a2
| \color is an assemble-time constant: the .if chain picks OR (bit
| set) or AND (bit clear) per plane, so no runtime color test.
.macro DL_PLOT color
| byteOff = y*160 + (x>>4)*8
move.w %d3,%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0 | y << 5
lsl.l #7,%d1 | y << 7
add.l %d1,%d0 | d0 = y * 160
move.w %d2,%d1
lsr.w #4,%d1
lsl.w #3,%d1 | (x>>4) * 8
ext.l %d1
add.l %d1,%d0 | d0 = byteOff
lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff
| d1 = bitMask, d0 = notMask
move.w %d2,%d1
and.w #15,%d1
add.w %d1,%d1
move.w (%a5,%d1.w),%d1
move.w %d1,%d0
not.w %d0
| per-plane RMW with postinc (drops 4 cyc per RMW vs
| displacement (d8,An) = 16 cyc, plain (An)+ = 12 cyc).
.if ((\color) & 1)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.if ((\color) & 2)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.if ((\color) & 4)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.if ((\color) & 8)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.endm
| ---- DL_BODY: full Bresenham loop body for hardcoded color ----
| Iterates via the word counter at 0(sp); branches to the shared
| .LdlStDone epilogue when the counter hits zero.
.macro DL_BODY color
.LdlStLoop_\color:
DL_PLOT \color
| e2 = 2 * err
move.w %d4,%d0
add.w %d0,%d0 | d0 = e2
| if (e2 >= dy) { err += dy; x += sx; } (dy = -dy_abs <= 0)
cmp.w %d6,%d0
blt.s .LdlStNoX_\color
add.w %d6,%d4
add.w %d7,%d2
.LdlStNoX_\color:
| if (e2 <= dx) { err += dx; y += sy; }
cmp.w %d5,%d0
bgt.s .LdlStNoY_\color
add.w %d5,%d4
add.w %a4,%d3 | sy.w from a4
.LdlStNoY_\color:
subq.w #1,0(%sp)
bne.w .LdlStLoop_\color
bra.w .LdlStDone
.endm
| ---- _surface68kStDrawLine entry --------------------------------
.globl _surface68kStDrawLine
_surface68kStDrawLine:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_LOCAL(%sp),%sp
| Load base & lut.
move.l SP_BASE(%sp),%a3
lea bitMaskWordLut(%pc),%a5
| x = x0, y = y0
move.w SP_X0(%sp),%d2
move.w SP_Y0(%sp),%d3
| dx = abs(x1 - x0), sx = sign(x1 - x0)
move.w SP_X1(%sp),%d5
sub.w %d2,%d5 | d5 = x1 - x0
bge.s .LdlSxPos
neg.w %d5
moveq #-1,%d7
bra.s .LdlSxDone
.LdlSxPos:
moveq #1,%d7
.LdlSxDone:
| dy_abs in d6, sy in d0 (-> a4)
move.w SP_Y1(%sp),%d6
sub.w %d3,%d6 | d6 = y1 - y0
bge.s .LdlSyPos
neg.w %d6
moveq #-1,%d0
bra.s .LdlSyDone
.LdlSyPos:
moveq #1,%d0
.LdlSyDone:
ext.l %d0
movea.l %d0,%a4 | a4 = sy
| iter counter = max(dx, dy_abs) + 1
move.w %d5,%d0
cmp.w %d6,%d0
bge.s .LdlNitDone
move.w %d6,%d0
.LdlNitDone:
addq.w #1,%d0
move.w %d0,0(%sp)
| err = dx - dy_abs (== dx + dy where dy negative)
move.w %d5,%d4
sub.w %d6,%d4 | d4 = err
neg.w %d6 | d6 = -dy_abs (negative)
| Dispatch on color (low 4 bits) -> 16 specialized loops.
| (a6 is only needed transiently for the jump.)
moveq #0,%d0
move.b SP_COLOR(%sp),%d0
and.w #0x0F,%d0
add.w %d0,%d0
add.w %d0,%d0 | * 4 for bra.w table
lea .LdlStTable(%pc),%a6
jmp 0(%a6,%d0.w)
.LdlStTable:
bra.w .LdlStLoop_0
bra.w .LdlStLoop_1
bra.w .LdlStLoop_2
bra.w .LdlStLoop_3
bra.w .LdlStLoop_4
bra.w .LdlStLoop_5
bra.w .LdlStLoop_6
bra.w .LdlStLoop_7
bra.w .LdlStLoop_8
bra.w .LdlStLoop_9
bra.w .LdlStLoop_10
bra.w .LdlStLoop_11
bra.w .LdlStLoop_12
bra.w .LdlStLoop_13
bra.w .LdlStLoop_14
bra.w .LdlStLoop_15
DL_BODY 0
DL_BODY 1
DL_BODY 2
DL_BODY 3
DL_BODY 4
DL_BODY 5
DL_BODY 6
DL_BODY 7
DL_BODY 8
DL_BODY 9
DL_BODY 10
DL_BODY 11
DL_BODY 12
DL_BODY 13
DL_BODY 14
DL_BODY 15
.LdlStDone:
lea SP_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
bitMaskWordLut:
.word 0x8000, 0x4000, 0x2000, 0x1000
.word 0x0800, 0x0400, 0x0200, 0x0100
.word 0x0080, 0x0040, 0x0020, 0x0010
.word 0x0008, 0x0004, 0x0002, 0x0001
| ---- surface68kStFillSpan ---------------------------------------
|
| Single-row span fill: leading-mask group + middle long-fills +
| trailing-mask group, all in one frame. Caller pre-clips so the
| span is fully on-surface.
|
| void surface68kStFillSpan(uint8_t *base,
| int16_t left, int16_t right,
| int16_t y, uint8_t color);
|
| Caller guarantees: 0 <= left <= right < 320, 0 <= y < 200.
|
| Register layout:
| a3 = base
| a4 = current group pointer
| d2.w = leftMask (then trailing trampoline target)
| d3.w = rightMask
| d4.w = numGroups - 1 (middle iter count when > 0)
| d5.l = loLong (planes 0+1 long template)
| d6.l = hiLong (planes 2+3 long template)
| d7.b = color (low nibble; tested via btst)
| d0,d1 = scratch
.equ SP_FS_SAVED, 44
.equ SP_FS_OFF, (SP_FS_SAVED + 4)
.equ SP_FS_BASE, SP_FS_OFF + 0
.equ SP_FS_LEFT, SP_FS_OFF + 4 + 2
.equ SP_FS_RIGHT, SP_FS_OFF + 8 + 2
.equ SP_FS_Y, SP_FS_OFF + 12 + 2
.equ SP_FS_COLOR, SP_FS_OFF + 16 + 3
.globl _surface68kStFillSpan
_surface68kStFillSpan:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_FS_BASE(%sp),%a3
moveq #0,%d7
move.b SP_FS_COLOR(%sp),%d7 | d7 = color
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
moveq #0,%d5
btst #1,%d7
beq.s .LfsLoBit1
move.w #-1,%d5
.LfsLoBit1:
btst #0,%d7
beq.s .LfsLoBit0
ori.l #0xFFFF0000,%d5
.LfsLoBit0:
| hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0)
moveq #0,%d6
btst #3,%d7
beq.s .LfsHiBit3
move.w #-1,%d6
.LfsHiBit3:
btst #2,%d7
beq.s .LfsHiBit2
ori.l #0xFFFF0000,%d6
.LfsHiBit2:
| rowBase = base + y*160 -> a4 (y*160 = (y<<5)+(y<<7))
move.w SP_FS_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | d0 = y*160
lea 0(%a3,%d0.l),%a4
| left in d0, right in d1
move.w SP_FS_LEFT(%sp),%d0
move.w SP_FS_RIGHT(%sp),%d1
| bitFirst in d2, bitLast in d3
move.w %d0,%d2
and.w #15,%d2
move.w %d1,%d3
and.w #15,%d3
| a4 += groupFirst * 8
| numGroups = groupLast - groupFirst (in d4)
move.w %d0,%d4
lsr.w #4,%d4 | d4 = groupFirst
move.w %d4,%d0 | save groupFirst into d0
lsl.w #3,%d0 | d0 = groupFirst*8
ext.l %d0
add.l %d0,%a4
move.w %d1,%d0
lsr.w #4,%d0 | d0 = groupLast
sub.w %d4,%d0 | d0 = groupLast - groupFirst
move.w %d0,%d4 | d4 = numGroups
| leftMask = (1 << (16 - bitFirst)) - 1
| (register shift of 16 when bitFirst == 0 is legal on the 68000:
| 0x10000 - 1 = 0xFFFF, the all-ones mask, as intended)
moveq #16,%d0
sub.w %d2,%d0 | d0 = 16 - bitFirst (1..16)
moveq #1,%d2
lsl.l %d0,%d2 | 1 << (16 - bitFirst)
subq.l #1,%d2 | d2.w = leftMask
| rightMask = ~((1 << (15 - bitLast)) - 1)
moveq #15,%d0
sub.w %d3,%d0 | d0 = 15 - bitLast (0..15)
moveq #1,%d3
lsl.l %d0,%d3 | 1 << (15 - bitLast)
subq.l #1,%d3 | inverse mask
not.w %d3 | d3.w = rightMask
| If numGroups == 0, single-group: mask = leftMask & rightMask
tst.w %d4
bne.s .LfsMulti
and.w %d2,%d3 | d3 = combinedMask
move.w %d3,%d2
bsr.s .LfsApplyMask
bra.w .LfsDone
.LfsMulti:
| Leading mask (d2 already = leftMask)
bsr.s .LfsApplyMask
addq.l #8,%a4 | next group (applyMask does not advance a4)
| numMid = numGroups - 1
subq.w #1,%d4
beq.s .LfsTrailing
.LfsMidLoop:
move.l %d5,(%a4)+
move.l %d6,(%a4)+
subq.w #1,%d4
bne.s .LfsMidLoop
.LfsTrailing:
move.w %d3,%d2 | d2 = rightMask
bsr.s .LfsApplyMask
.LfsDone:
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| Apply 4-plane word RMW at (a4) using mask in d2 (or notMask in d0).
| Plane N: if (color bit N) OR mask else AND notMask.
| Inputs: a4, d2.w = mask, d7.b = color
| Trashes: d0
| Returns via rts. a4 is NOT advanced (displacement addressing);
| contrast with fillCircle's applyMask, which postincs by 8.
.LfsApplyMask:
move.w %d2,%d0
not.w %d0 | d0 = notMask
btst #0,%d7
beq.s .LfsAm0a
or.w %d2,(%a4)
bra.s .LfsAm1
.LfsAm0a:
and.w %d0,(%a4)
.LfsAm1:
btst #1,%d7
beq.s .LfsAm1a
or.w %d2,2(%a4)
bra.s .LfsAm2
.LfsAm1a:
and.w %d0,2(%a4)
.LfsAm2:
btst #2,%d7
beq.s .LfsAm2a
or.w %d2,4(%a4)
bra.s .LfsAm3
.LfsAm2a:
and.w %d0,4(%a4)
.LfsAm3:
btst #3,%d7
beq.s .LfsAm3a
or.w %d2,6(%a4)
rts
.LfsAm3a:
and.w %d0,6(%a4)
rts
| ---- surface68kStFillRectSingleGroup -----------------------------
|
| Fill rect when groupFirst == groupLast (thin/single-column rect).
| Caller pre-computes firstGroupPtr = base + y*160 + groupFirst*8
| and the mask = leftMask & rightMask.
|
| void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr,
| uint16_t mask,
| uint16_t h,
| uint8_t color);
|
| Dispatched on color (low nibble) -> 16 specialized loops with
| hardcoded OR/AND per plane. Inner loop is 4 plane word RMWs +
| advance row + branch. h == 0 is guarded at entry.
|
| drawLine V routes to fillRect 1xH which lands here.
.equ SP_FRG_SAVED, 24 | d2-d5/a2-a3 = 6 longs
.equ SP_FRG_OFF, (SP_FRG_SAVED + 4)
.equ SP_FRG_PTR, SP_FRG_OFF + 0
.equ SP_FRG_MASK, SP_FRG_OFF + 4 + 2
.equ SP_FRG_H, SP_FRG_OFF + 8 + 2
.equ SP_FRG_COLOR, SP_FRG_OFF + 12 + 3
| Per-row body: d3 = mask, d4 = notMask, d5 = row counter,
| a3 = group pointer (postincs through the 4 plane words).
.macro FRG_LOOP color
.Lfrg_loop_\color:
.if ((\color) & 1)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 2)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 4)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 8)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
lea 152(%a3),%a3 | a3 is 8 past the group after the postincs; +152 = same group, next row (160-8)
subq.w #1,%d5
bne.w .Lfrg_loop_\color
bra.w .Lfrg_done
.endm
.globl _surface68kStFillRectSingleGroup
_surface68kStFillRectSingleGroup:
movem.l %d2-%d5/%a2-%a3,-(%sp)
move.l SP_FRG_PTR(%sp),%a3
move.w SP_FRG_MASK(%sp),%d3
move.w SP_FRG_H(%sp),%d5
tst.w %d5
beq.w .Lfrg_done | h == 0: nothing to draw
move.w %d3,%d4
not.w %d4 | d4 = notMask
| Color dispatch
moveq #0,%d2
move.b SP_FRG_COLOR(%sp),%d2
and.w #0x0F,%d2
add.w %d2,%d2
add.w %d2,%d2 | * 4 for bra.w table
lea .Lfrg_table(%pc),%a2
jmp 0(%a2,%d2.w)
.Lfrg_table:
bra.w .Lfrg_loop_0
bra.w .Lfrg_loop_1
bra.w .Lfrg_loop_2
bra.w .Lfrg_loop_3
bra.w .Lfrg_loop_4
bra.w .Lfrg_loop_5
bra.w .Lfrg_loop_6
bra.w .Lfrg_loop_7
bra.w .Lfrg_loop_8
bra.w .Lfrg_loop_9
bra.w .Lfrg_loop_10
bra.w .Lfrg_loop_11
bra.w .Lfrg_loop_12
bra.w .Lfrg_loop_13
bra.w .Lfrg_loop_14
bra.w .Lfrg_loop_15
FRG_LOOP 0
FRG_LOOP 1
FRG_LOOP 2
FRG_LOOP 3
FRG_LOOP 4
FRG_LOOP 5
FRG_LOOP 6
FRG_LOOP 7
FRG_LOOP 8
FRG_LOOP 9
FRG_LOOP 10
FRG_LOOP 11
FRG_LOOP 12
FRG_LOOP 13
FRG_LOOP 14
FRG_LOOP 15
.Lfrg_done:
movem.l (%sp)+,%d2-%d5/%a2-%a3
rts
| ---- surface68kStFillRectMulti -------------------------------------
|
| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
| Dispatched on color (low nibble) -> 16 specialized H-row loops.
|
| void surface68kStFillRectMulti(uint8_t *base,
| int16_t x, int16_t y,
| uint16_t w, uint16_t h,
| uint8_t color);
|
| Per row body (per color C):
| 1. Leading mask: 4 hardcoded plane RMW with leftMask
| 2. Middle: numMid groups of 2 long-writes (loLong, hiLong)
| 3. Trailing mask: 4 hardcoded plane RMW with rightMask
| 4. Advance rowBase by 160; decrement h; loop.
|
| Register layout in inner loop:
| d2.w = leftMask d3.w = rightMask
| d4.w = ~leftMask d5.w = ~rightMask
| d6.l = loLong d7.l = hiLong
| a3 = rowBase (advances by 160 each iter)
| a4 = a_grp (per-row scratch)
| d0,d1 = scratch
|
| Stack scratch (4 bytes at sp+0):
| 0..1 numMid (word, reload per row for mid loop)
| 2..3 h (word, decrement per row)
.equ SP_FRM_SAVED, 44
.equ SP_FRM_LOCAL, 4
.equ SP_FRM_OFF, (SP_FRM_SAVED + 4 + SP_FRM_LOCAL)
.equ SP_FRM_BASE, SP_FRM_OFF + 0
.equ SP_FRM_X, SP_FRM_OFF + 4 + 2
.equ SP_FRM_Y, SP_FRM_OFF + 8 + 2
.equ SP_FRM_W, SP_FRM_OFF + 12 + 2
.equ SP_FRM_H, SP_FRM_OFF + 16 + 2
.equ SP_FRM_COLOR, SP_FRM_OFF + 20 + 3
.macro FRM_LOOP color
.LfrM_loop_\color:
| Leading mask at (a4)+, walking from row start
move.l %a3,%a4 | a4 = current row's groupFirst byte
.if ((\color) & 1)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
.if ((\color) & 2)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
.if ((\color) & 4)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
.if ((\color) & 8)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
| a4 now points to next group (8 bytes past row start).
| Middle long-fill (numMid reloaded from 0(sp) each row)
move.w 0(%sp),%d0
tst.w %d0 | (redundant after move.w, which sets CCR; harmless)
beq.s .LfrM_skipMid_\color
.LfrM_midLoop_\color:
move.l %d6,(%a4)+
move.l %d7,(%a4)+
subq.w #1,%d0
bne.s .LfrM_midLoop_\color
.LfrM_skipMid_\color:
| Trailing mask at (a4)+
.if ((\color) & 1)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
.if ((\color) & 2)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
.if ((\color) & 4)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
.if ((\color) & 8)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
| Advance row (a3 unchanged through the body)
lea 160(%a3),%a3
subq.w #1,2(%sp) | h--
bne.w .LfrM_loop_\color
bra.w .LfrM_done
.endm
| ---- _surface68kStFillRectMulti entry ---------------------------
| NOTE(review): h is not checked for zero before the subq/bne row
| loop -- h == 0 would wrap the counter. Presumably the pre-clip
| contract guarantees h >= 1; confirm against the C caller.
.globl _surface68kStFillRectMulti
_surface68kStFillRectMulti:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_FRM_LOCAL(%sp),%sp
| Load color, build loLong (d6) and hiLong (d7)
moveq #0,%d0
move.b SP_FRM_COLOR(%sp),%d0
moveq #0,%d6
btst #1,%d0
beq.s .LfrM_lo1
move.w #-1,%d6
.LfrM_lo1:
btst #0,%d0
beq.s .LfrM_lo0
ori.l #0xFFFF0000,%d6
.LfrM_lo0:
moveq #0,%d7
btst #3,%d0
beq.s .LfrM_hi3
move.w #-1,%d7
.LfrM_hi3:
btst #2,%d0
beq.s .LfrM_hi2
ori.l #0xFFFF0000,%d7
.LfrM_hi2:
| Compute group ptrs and masks
| groupFirst = x >> 4; groupFirstByteOff = groupFirst * 8
| bitFirst = x & 15
move.w SP_FRM_X(%sp),%d0
move.w SP_FRM_W(%sp),%d1
add.w %d0,%d1
subq.w #1,%d1 | d1 = x + w - 1 (last pixel)
| leftMask via LUT[bitFirst]
move.w %d0,%d2
and.w #15,%d2
add.w %d2,%d2
lea frmLeftMaskLut(%pc),%a2
move.w (%a2,%d2.w),%d2 | d2 = leftMask
move.w %d2,%d4
not.w %d4 | d4 = notLeftMask
| rightMask via LUT[bitLast]
move.w %d1,%d3
and.w #15,%d3
add.w %d3,%d3
lea frmRightMaskLut(%pc),%a2
move.w (%a2,%d3.w),%d3 | d3 = rightMask
move.w %d3,%d5
not.w %d5 | d5 = notRightMask
| numMid = (last >> 4) - (x >> 4) - 1
| (movea.w sign-extends, but x and lastPixel are 0..319 here,
| so the round-trip through a2 is exact)
move.w %d1,%a2 | a2.w = lastPixel (temp)
move.l %a2,%d1
lsr.w #4,%d1 | groupLast (low word)
move.w %d0,%a2
move.l %a2,%d0
lsr.w #4,%d0 | groupFirst
move.w %d0,%a4 | a4.w = groupFirst (save for byteOff calc)
sub.w %d0,%d1 | d1 = groupLast - groupFirst
subq.w #1,%d1 | d1 = numMid (>= 0 since multi-group caller)
move.w %d1,0(%sp) | numMid -> stack
| h -> stack
move.w SP_FRM_H(%sp),%d1
move.w %d1,2(%sp)
| a3 = base + y*160 + groupFirst*8
move.w SP_FRM_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | y*160
move.l SP_FRM_BASE(%sp),%a3
add.l %d0,%a3 | rowBase = base + y*160
move.l %a4,%d0 | groupFirst
lsl.w #3,%d0 | * 8
ext.l %d0
add.l %d0,%a3 | + groupFirst*8
| Dispatch on color
moveq #0,%d0
move.b SP_FRM_COLOR(%sp),%d0
and.w #0x0F,%d0
add.w %d0,%d0
add.w %d0,%d0
lea .LfrM_table(%pc),%a2
jmp 0(%a2,%d0.w)
.LfrM_table:
bra.w .LfrM_loop_0
bra.w .LfrM_loop_1
bra.w .LfrM_loop_2
bra.w .LfrM_loop_3
bra.w .LfrM_loop_4
bra.w .LfrM_loop_5
bra.w .LfrM_loop_6
bra.w .LfrM_loop_7
bra.w .LfrM_loop_8
bra.w .LfrM_loop_9
bra.w .LfrM_loop_10
bra.w .LfrM_loop_11
bra.w .LfrM_loop_12
bra.w .LfrM_loop_13
bra.w .LfrM_loop_14
bra.w .LfrM_loop_15
FRM_LOOP 0
FRM_LOOP 1
FRM_LOOP 2
FRM_LOOP 3
FRM_LOOP 4
FRM_LOOP 5
FRM_LOOP 6
FRM_LOOP 7
FRM_LOOP 8
FRM_LOOP 9
FRM_LOOP 10
FRM_LOOP 11
FRM_LOOP 12
FRM_LOOP 13
FRM_LOOP 14
FRM_LOOP 15
.LfrM_done:
lea SP_FRM_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| Same LUTs as in fillCircle.s; duplicated locally so each .o file's
| PC-rel lea can reach them within its own .text segment.
frmLeftMaskLut:
.word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF
.word 0x0FFF, 0x07FF, 0x03FF, 0x01FF
.word 0x00FF, 0x007F, 0x003F, 0x001F
.word 0x000F, 0x0007, 0x0003, 0x0001
frmRightMaskLut:
.word 0x8000, 0xC000, 0xE000, 0xF000
.word 0xF800, 0xFC00, 0xFE00, 0xFF00
.word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
| ---- surface68kStLongFill ----------------------------------------
|
| Bulk long-fill helper for full-row fills (surfaceClear, fillRect
| 320x200). Writes numGroups groups of 8 bytes (loLong, hiLong)
| starting at dst. Uses movem.l d2-d7 (3 groups = 24 bytes per
| batch) plus a tail pair to amortize loop overhead.
|
| void surface68kStLongFill(uint8_t *dst,
| uint16_t numGroups,
| uint32_t loLong,
| uint32_t hiLong);
|
| Per-batch cost: movem.l (56 cyc) + subq (8) + bne (10) = 74 cyc
| for 24 bytes -- ~3 cyc/byte vs ~5 cyc/byte for the straight C
| do-while of two move.l writes.
.equ SP_LF_SAVED, 24 | d2-d7 = 6 longs
.equ SP_LF_OFF, (SP_LF_SAVED + 4)
.equ SP_LF_DST, SP_LF_OFF + 0
.equ SP_LF_NGROUPS, SP_LF_OFF + 4 + 2
.equ SP_LF_LO, SP_LF_OFF + 8
.equ SP_LF_HI, SP_LF_OFF + 12
.globl _surface68kStLongFill
_surface68kStLongFill:
movem.l %d2-%d7,-(%sp)
move.l SP_LF_DST(%sp),%a0
move.l SP_LF_LO(%sp),%d2
move.l SP_LF_HI(%sp),%d3
move.w SP_LF_NGROUPS(%sp),%d0
| Set up d2-d7 = lo, hi, lo, hi, lo, hi (movem writes
| in d-reg order, so this gives the right alternation
| for 3 consecutive 8-byte groups).
move.l %d2,%d4
move.l %d2,%d6
move.l %d3,%d5
move.l %d3,%d7
| numBatches = numGroups / 3 (quotient), tail = remainder
| NOTE(review): ext.l sign-extends, so numGroups >= 0x8000 would
| feed divu a huge dividend and overflow. Fine for the 4000-group
| maximum of a full 320x200 screen (200 rows * 20 groups).
ext.l %d0
divu.w #3,%d0
move.l %d0,%d1
swap %d1 | d1.w = remainder (divu: quotient low, remainder high)
tst.w %d0 | quotient
beq.s .Llf_tail
.Llf_loop:
movem.l %d2-%d7,(%a0)
lea 24(%a0),%a0
subq.w #1,%d0
bne.s .Llf_loop
.Llf_tail:
| Remainder: 0, 1, or 2 groups of 8 bytes.
| (numGroups == 0 falls straight through: quotient and
| remainder are both zero, so nothing is written.)
tst.w %d1
beq.s .Llf_done
move.l %d2,(%a0)+
move.l %d3,(%a0)+
subq.w #1,%d1
beq.s .Llf_done
move.l %d2,(%a0)+
move.l %d3,(%a0)+
.Llf_done:
movem.l (%sp)+,%d2-%d7
rts

View file

@ -0,0 +1,202 @@
| ST byte-aligned sprite save / restore -- plane-major byte copies.
|
| Phase 10.5 dropped the chunky <-> planar conversion entirely: the
| save buffer now holds raw plane-major bytes (per row: plane0,
| plane1, plane2, plane3 for each 8-pixel tile column, w/8 tile
| columns per row), so save and restore are plain byte copies.
|
| The SP_LUT stack slot below is a leftover of the old 256-entry
| plane-spread LUT scheme. It is still loaded into a2 by both
| routines but never read afterwards.
| NOTE(review): the C signatures below list 6 parameters while
| SP_LUT addresses a 7th stack slot -- presumably the callers still
| pass the (now unused) LUT pointer; confirm against hal.c.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
|
| void surface68kStSpriteSaveByteAligned(uint8_t *base,
| uint16_t x, uint16_t y,
| uint16_t w, uint16_t h,
| uint8_t *dstChunky);
|
| void surface68kStSpriteRestoreByteAligned(uint8_t *base,
| uint16_t x, uint16_t y,
| uint16_t w, uint16_t h,
| const uint8_t *srcChunky);
.text
.equ SP_SAVED, 44
.equ SP_OFF, (SP_SAVED + 4)
.equ SP_BASE, SP_OFF + 0
.equ SP_X, SP_OFF + 4 + 2
.equ SP_Y, SP_OFF + 8 + 2
.equ SP_W, SP_OFF + 12 + 2
.equ SP_H, SP_OFF + 16 + 2
.equ SP_CHUNKY, SP_OFF + 20
.equ SP_LUT, SP_OFF + 24
| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
| a0 -> plane 0 byte (high or low half), strides 2 to next plane
| word within the 8-byte group
| a1 -> output planar bytes (advanced by 4)
|
| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
| plane-major bytes (per row: plane0, plane1, plane2, plane3 per
| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
| lookups + shifts + ORs.
.macro SAVE_TILECOL
move.b (%a0),(%a1)+ | plane 0
move.b 2(%a0),(%a1)+ | plane 1
move.b 4(%a0),(%a1)+ | plane 2
move.b 6(%a0),(%a1)+ | plane 3
.endm
| ---- _surface68kStSpriteSaveByteAligned -------------------------
| Copies a wx h byte-aligned sprite rectangle from the interleaved
| planar surface into a plane-major byte buffer (see SAVE_TILECOL).
| NOTE(review): no guards for w < 8 or h == 0 -- either would wrap
| the subq/bne counters. Presumably callers guarantee w is a
| multiple of 8 (>= 8) and h >= 1; confirm.
.globl _surface68kStSpriteSaveByteAligned
_surface68kStSpriteSaveByteAligned:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_BASE(%sp),%a3
move.l SP_CHUNKY(%sp),%a1
| LUT pointer comes in via stack arg -- guaranteed
| long-aligned because gcc passes ptr args via
| move.l on a long-aligned sp slot. Avoids the BSS
| misalignment problem on TOS .PRG (BSS pads only to
| 2 bytes, even uint32_t slots can land at mod-4 = 2).
| NOTE(review): a2 is never read below -- the LUT went away in
| phase 10.5, so this load (and the arg slot) is dead code.
move.l SP_LUT(%sp),%a2
move.w SP_W(%sp),%d5
lsr.w #3,%d5 | d5 = tileCols (w / 8)
move.w SP_H(%sp),%d6 | d6 = h
move.w SP_X(%sp),%d7
| a4 = base + y*160 + (x>>4)*8
move.w SP_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0 | y << 5
lsl.l #7,%d1 | y << 7
add.l %d1,%d0 | y * 160
lea 0(%a3,%d0.l),%a4
moveq #0,%d0
move.w %d7,%d0
lsr.w #4,%d0
lsl.w #3,%d0
ext.l %d0
add.l %d0,%a4
| Initial half offset: (x & 8) >> 3 = 0 or 1
| (big-endian plane word: high byte at +0, low byte at +1)
and.w #8,%d7
lsr.w #3,%d7
.LsaveRow:
move.w %d5,%d3 | d3 = tileCols
moveq #0,%d2
move.w %d7,%d2
lea 0(%a4,%d2.l),%a0 | a0 = first plane-0 byte
.LsaveCol:
SAVE_TILECOL
| Advance a0: bit 0 = 0 -> high, advance to low (+1).
| bit 0 = 1 -> low, advance to next group's high (+7).
move.l %a0,%d4
btst #0,%d4
bne.s .LsaveColWasLo
addq.l #1,%a0
bra.s .LsaveColNext
.LsaveColWasLo:
lea 7(%a0),%a0
.LsaveColNext:
subq.w #1,%d3
bne.w .LsaveCol
lea 160(%a4),%a4 | next surface row
subq.w #1,%d6
bne.w .LsaveRow
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
| a0 -> plane 0 byte (high or low half), +2 per plane word within
| the 8-byte group
| a1 -> input planar bytes (advanced by 4)
|
| Phase 10.5: dropped chunky -> planar conversion (no LUT needed).
| Buffer layout matches SAVE_TILECOL: per row, plane0/1/2/3 per
| tile col.
.macro RESTORE_TILECOL
move.b (%a1)+,(%a0) | plane 0
move.b (%a1)+,2(%a0) | plane 1
move.b (%a1)+,4(%a0) | plane 2
move.b (%a1)+,6(%a0) | plane 3
.endm
| ---- _surface68kStSpriteRestoreByteAligned ----------------------
| Inverse of the save routine: copies a plane-major byte buffer
| back into the interleaved planar surface (see RESTORE_TILECOL).
| NOTE(review): same missing w < 8 / h == 0 guards as the save
| routine -- presumably guaranteed by callers; confirm.
.globl _surface68kStSpriteRestoreByteAligned
_surface68kStSpriteRestoreByteAligned:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_BASE(%sp),%a3
move.l SP_CHUNKY(%sp),%a1
| NOTE(review): a2 is never read below -- the c2p LUT went away in
| phase 10.5; this load (and the arg slot) is dead code.
move.l SP_LUT(%sp),%a2 | gC2pLut passed in
| tileCols is held in a5. (The stated reason -- an older macro
| trashing d5 -- no longer applies to the byte-copy
| RESTORE_TILECOL, but keeping it in a5 is harmless.)
move.w SP_W(%sp),%d0
lsr.w #3,%d0
movea.w %d0,%a5
move.w SP_H(%sp),%d6
move.w SP_X(%sp),%d7
| a4 = base + y*160 + (x>>4)*8
move.w SP_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0
lea 0(%a3,%d0.l),%a4
moveq #0,%d0
move.w %d7,%d0
lsr.w #4,%d0
lsl.w #3,%d0
ext.l %d0
add.l %d0,%a4
| Initial half offset: (x & 8) >> 3 = 0 or 1
and.w #8,%d7
lsr.w #3,%d7
.LrestoreRow:
move.w %a5,%d3 | d3 = tileCols (from a5)
moveq #0,%d2
move.w %d7,%d2
lea 0(%a4,%d2.l),%a0
.LrestoreCol:
RESTORE_TILECOL
| Advance a0: even -> low half (+1), odd -> next group's high (+7).
move.l %a0,%d4
btst #0,%d4
bne.s .LrestoreColWasLo
addq.l #1,%a0
bra.s .LrestoreColNext
.LrestoreColWasLo:
lea 7(%a0),%a0
.LrestoreColNext:
subq.w #1,%d3
bne.w .LrestoreCol
lea 160(%a4),%a4 | next surface row
subq.w #1,%d6
bne.w .LrestoreRow
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts