From 6c03d93e8824fe54717c0aae852458169432f6f2 Mon Sep 17 00:00:00 2001
From: Scott Duensing <scott@duensing.com>
Date: Fri, 1 May 2026 13:00:11 -0500
Subject: [PATCH] Amiga and ST improvements.

---
 make/amiga.mk                  |   2 +-
 make/atarist.mk                |   2 +-
 src/port/amiga/hal.c           | 103 ++++++++-----
 src/port/atarist/hal.c         |  98 ++++++++-----
 src/shared68k/draw68k_inline.h | 201 ++++++++++++++++++++++++++
 src/shared68k/surface68k.s     | 255 +++++++++++++++++++++++++++++++++
 6 files changed, 589 insertions(+), 72 deletions(-)
 create mode 100644 src/shared68k/draw68k_inline.h
 create mode 100644 src/shared68k/surface68k.s

diff --git a/make/amiga.mk b/make/amiga.mk
index 4e2f86b..63741b4 100644
--- a/make/amiga.mk
+++ b/make/amiga.mk
@@ -13,7 +13,7 @@ BINDIR   := $(BUILD)/bin
 # independently. -I on $(SRC_PORT)/amiga lets ptplayer.h resolve
 # <SDI_compiler.h> from the port-local shim alongside our HAL code.
 PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer
-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(PTPLAYER_DIR)
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR)
 # OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses
 # CIA-B + audio.device interrupts via the OS rather than taking over
 # Paula directly), matching the way our HAL cooperates with Intuition.
diff --git a/make/atarist.mk b/make/atarist.mk
index a30f42e..c8d7536 100644
--- a/make/atarist.mk
+++ b/make/atarist.mk
@@ -7,7 +7,7 @@ BUILD    := $(REPO_DIR)/build/$(PLATFORM)
 LIBDIR   := $(BUILD)/lib
 BINDIR   := $(BUILD)/bin
 
-CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim
+CFLAGS  := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K)
 LDFLAGS :=
 
 # libxmp-lite shared with the DOS port. Built as a static archive that
diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c
index b1d4b85..fe6997b 100644
--- a/src/port/amiga/hal.c
+++ b/src/port/amiga/hal.c
@@ -44,6 +44,7 @@
 
 #include "hal.h"
 #include "surfaceInternal.h"
+#include "draw68k_inline.h"
 
 extern struct Custom custom;
 
@@ -602,22 +603,56 @@ void halShutdown(void) {
 }
 
 
-// Amiga has no asm fast paths yet; cross-platform code falls back to
-// its C implementations whenever these return false.
+// Shared 68k fast paths for the chunky surface buffer (src/shared68k/
+// surface68k.s). Same primitives used by the Atari ST port -- the
+// stage / surfaces are an identical 4bpp packed layout on both.
+extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
+extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte);
+extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte);
+
+
 bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
-    (void)s;
-    (void)doubled;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the asm path; decline everything else */
+    }
+    surface68kClearLong(s->pixels, (uint16_t)doubled);   /* fills the whole pixel buffer with `doubled` */
+    return true;
 }
 
 
+// Fast path bands:
+//   - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per
+//     row via surface68kFillRectFull. No nibble fixups needed -- both
+//     nibbles in every byte get the same value, and each row starts at
+//     base + y*160, which is even whenever the base is word-aligned.
+//   - x % 4 == 0 && w even (byte-aligned AND word-aligned): inner
+//     bytes via the asm. The (x % 4 == 0) part is the 68000 alignment
+//     requirement for the move.l writes inside the asm -- byte index
+//     = x/2, so x must be a multiple of 4 for the byte index to be
+//     even.
+//   - everything else: fall through to C's fillRectClipped, which
+//     does per-byte writes (no alignment needed) and handles the
+//     leading / trailing nibble RMW correctly.
 bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
-    (void)s;
-    (void)x;
-    (void)y;
-    (void)w;
-    (void)h;
-    (void)colorIndex;
+    uint8_t  doubled;       /* colour nibble replicated into both halves of a byte */
+
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the asm path; decline everything else */
+    }
+    if (h == 0u || w == 0u) {
+        return true;        /* clipped-out: nothing to do, but we "handled" it */
+    }
+    doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu));
+
+    if (x == 0 && w == (uint16_t)SURFACE_WIDTH) {   /* band 1: full-width rows */
+        surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled);
+        return true;
+    }
+    if (((x & 3) == 0) && ((w & 1u) == 0u)) {       /* band 2: even byte offset, whole bytes only */
+        uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
+        surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled);   /* w >> 1 = bytes per row */
+        return true;
+    }
     return false;
 }
 
@@ -652,42 +687,40 @@ bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) {
 
 
 bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) {
-    (void)s;
-    (void)x;
-    (void)y;
-    (void)colorIndex;
-    return false;
+    uint8_t nibLo;          /* colour index masked to 4 bits */
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4));   /* helper does no bounds check */
+    return true;
 }
 
 
 bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) {
-    (void)s;
-    (void)x0;
-    (void)y0;
-    (void)x1;
-    (void)y1;
-    (void)colorIndex;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex);   /* unclipped: both endpoints must be on-surface */
+    return true;
 }
 
 
 bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
-    (void)s;
-    (void)cx;
-    (void)cy;
-    (void)r;
-    (void)colorIndex;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex);   /* unclipped: cx/cy +/- r must be on-surface */
+    return true;
 }
 
 
 bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
-    (void)s;
-    (void)cx;
-    (void)cy;
-    (void)r;
-    (void)colorIndex;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    draw68kCircleFill(s->pixels, cx, cy, r, colorIndex);   /* unclipped: bounding box must be on-surface */
+    return true;
 }
 
 
diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c
index c9fd03c..2efcf32 100644
--- a/src/port/atarist/hal.c
+++ b/src/port/atarist/hal.c
@@ -36,6 +36,7 @@
 
 #include "hal.h"
 #include "surfaceInternal.h"
+#include "draw68k_inline.h"
 
 // ----- Constants -----
 
@@ -600,22 +601,51 @@ void halShutdown(void) {
 }
 
 
-// ST has no asm fast paths yet; cross-platform code falls back to its
-// C implementations when these return false.
+// Shared 68k fast paths for the chunky surface buffer (src/shared68k/
+// surface68k.s). Same primitives used by the Amiga port -- the stage /
+// surfaces are identical 4bpp packed layout on both 68k targets.
+extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
+extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte);
+extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte);
+
+
 bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
-    (void)s;
-    (void)doubled;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the asm path; decline everything else */
+    }
+    surface68kClearLong(s->pixels, (uint16_t)doubled);   /* fills the whole pixel buffer with `doubled` */
+    return true;
 }
 
 
+// Fast path bands:
+//   - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per
+//     row via surface68kFillRectFull. Always word-aligned.
+//   - x % 4 == 0 && w even (word-aligned): byte index = x/2, so x must
+//     be a multiple of 4 for the move.l writes inside the asm to land
+//     on even addresses (68000 address-error rule).
+//   - everything else: fall through to C's fillRectClipped, which is
+//     per-byte and tolerates any alignment.
 bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
-    (void)s;
-    (void)x;
-    (void)y;
-    (void)w;
-    (void)h;
-    (void)colorIndex;
+    uint8_t doubled;        /* colour nibble replicated into both halves of a byte */
+
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the asm path; decline everything else */
+    }
+    if (h == 0u || w == 0u) {
+        return true;        /* empty rect: nothing to draw, report it as handled */
+    }
+    doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu));
+
+    if (x == 0 && w == (uint16_t)SURFACE_WIDTH) {   /* band 1: full-width rows */
+        surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled);
+        return true;
+    }
+    if (((x & 3) == 0) && ((w & 1u) == 0u)) {       /* band 2: even byte offset, whole bytes only */
+        uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
+        surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled);   /* w >> 1 = bytes per row */
+        return true;
+    }
     return false;
 }
 
@@ -650,42 +680,40 @@ bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) {
 
 
 bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) {
-    (void)s;
-    (void)x;
-    (void)y;
-    (void)colorIndex;
-    return false;
+    uint8_t nibLo;          /* colour index masked to 4 bits */
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4));   /* helper does no bounds check */
+    return true;
 }
 
 
 bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) {
-    (void)s;
-    (void)x0;
-    (void)y0;
-    (void)x1;
-    (void)y1;
-    (void)colorIndex;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex);   /* unclipped: both endpoints must be on-surface */
+    return true;
 }
 
 
 bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
-    (void)s;
-    (void)cx;
-    (void)cy;
-    (void)r;
-    (void)colorIndex;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex);   /* unclipped: cx/cy +/- r must be on-surface */
+    return true;
 }
 
 
 bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
-    (void)s;
-    (void)cx;
-    (void)cy;
-    (void)r;
-    (void)colorIndex;
-    return false;
+    if (s != stageGet()) {
+        return false;       /* only the stage takes the fast path; decline everything else */
+    }
+    draw68kCircleFill(s->pixels, cx, cy, r, colorIndex);   /* unclipped: bounding box must be on-surface */
+    return true;
 }
 
 
diff --git a/src/shared68k/draw68k_inline.h b/src/shared68k/draw68k_inline.h
new file mode 100644
index 0000000..091b520
--- /dev/null
+++ b/src/shared68k/draw68k_inline.h
@@ -0,0 +1,201 @@
+// Shared 68k C-side fast paths for drawing primitives.
+//
+// Header-only `static inline` so both Amiga and Atari ST hal.c can
+// pull the same code with only an extra -I path -- the surface buffer
+// layout is identical between the two ports (4bpp packed, 160 bytes
+// per row, 32000 bytes per surface), so the per-pixel write logic
+// is portable.
+//
+// All functions assume the caller has already verified the rect /
+// circle bounding box is fully on-surface (which is what the
+// halFast* contract guarantees). No clip checks inside the hot
+// loops, no per-pixel surfaceMarkDirtyRect calls -- the caller marks
+// the bounding box dirty once after the call.
+//
+// gcc-amigaos / gcc-atari-mint both inline these aggressively at -O2,
+// so the wrapper functions in each port's hal.c compile to a single
+// tight m68k loop with no JSR per pixel.
+
+#ifndef JOEYLIB_DRAW68K_INLINE_H
+#define JOEYLIB_DRAW68K_INLINE_H
+
+#include "joey/types.h"
+#include "joey/surface.h"
+
+
+// Plot a single pixel at (x, y) without bounds checking. nibLo is
+// the colorIndex's low nibble, nibHi is (low << 4) precomputed by
+// the caller so the hot loop doesn't do the shift per call.
+static inline void draw68kPlotPixel(uint8_t *pixels, int16_t x, int16_t y,
+                                    uint8_t nibLo, uint8_t nibHi) {
+    uint8_t *byte = &pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW
+                            + ((uint16_t)x >> 1)];   /* two pixels per byte */
+    if (x & 1) {            /* odd x: pixel is the low nibble, keep the high one */
+        *byte = (uint8_t)((*byte & 0xF0u) | nibLo);
+    } else {                /* even x: pixel is the high nibble, keep the low one */
+        *byte = (uint8_t)((*byte & 0x0Fu) | nibHi);
+    }
+}
+
+
+// Bresenham midpoint-circle outline. Plots all 8 octants per
+// iteration. Caller must verify cx +/- r and cy +/- r are all in
+// surface bounds (the halFastDrawCircle contract).
+static inline void draw68kCircleOutline(uint8_t *pixels, int16_t cx, int16_t cy,
+                                        uint16_t r, uint8_t colorIndex) {
+    int16_t x;              /* offset along the major axis, starts at r */
+    int16_t y;              /* offset along the minor axis, starts at 0 */
+    int16_t err;            /* decision variable choosing when x steps inward */
+    uint8_t nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    uint8_t nibHi = (uint8_t)(nibLo << 4);
+
+    if (r == 0u) {          /* radius 0 degenerates to a single pixel */
+        draw68kPlotPixel(pixels, cx, cy, nibLo, nibHi);
+        return;
+    }
+
+    x = (int16_t)r;
+    y = 0;
+    err = (int16_t)(1 - x);
+    while (x >= y) {        /* walk one octant, mirror each point into the other seven */
+        draw68kPlotPixel(pixels, (int16_t)(cx + x), (int16_t)(cy + y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - x), (int16_t)(cy + y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx + x), (int16_t)(cy - y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - x), (int16_t)(cy - y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx + y), (int16_t)(cy + x), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - y), (int16_t)(cy + x), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx + y), (int16_t)(cy - x), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - y), (int16_t)(cy - x), nibLo, nibHi);
+        y++;
+        if (err <= 0) {     /* keep x, only y advanced */
+            err = (int16_t)(err + y + y + 1);
+        } else {            /* step x inward as well */
+            x--;
+            err = (int16_t)(err + y + y - x - x + 1);
+        }
+    }
+}
+
+
+// Filled circle: for each y from 0..r, find the largest x with
+// x*x + y*y <= r*r and emit the symmetric horizontal span. Caller
+// must verify the bounding box is on-surface.
+//
+// Each span is a horizontal run on a single row, so we go straight
+// to the byte-fill path used by halFastFillRect-style code rather
+// than calling per-pixel helpers.
+static inline void draw68kCircleFill(uint8_t *pixels, int16_t cx, int16_t cy,
+                                     uint16_t r, uint8_t colorIndex) {
+    int16_t  y;             /* row offset from cy, 0..r */
+    int16_t  x;             /* current half-width of the span */
+    uint16_t xx;            /* x*x, maintained incrementally */
+    uint16_t yy;            /* y*y, maintained incrementally */
+    uint16_t r2;            /* r*r */
+    uint8_t  nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    uint8_t  nibHi = (uint8_t)(nibLo << 4);
+    uint8_t  doubled = (uint8_t)(nibHi | nibLo);   /* both pixels of a byte in the fill colour */
+
+    if (r == 0u) {          /* radius 0 degenerates to a single pixel */
+        draw68kPlotPixel(pixels, cx, cy, nibLo, nibHi);
+        return;
+    }
+
+    xx = (uint16_t)(r * r);
+    r2 = xx;
+    yy = 0;
+    x  = (int16_t)r;
+
+    for (y = 0; y <= (int16_t)r; y++) {
+        int16_t  spanLeft;
+        int16_t  spanRight;
+        int16_t  rowsRemaining;
+        int16_t  rowYDelta;
+
+        while (xx + yy > r2) {   /* shrink x until x*x + y*y <= r*r */
+            xx = (uint16_t)(xx - (uint16_t)((uint16_t)x + (uint16_t)x - 1u));   /* (x-1)^2 = x^2 - (2x - 1) */
+            x--;
+        }
+
+        spanLeft  = (int16_t)(cx - x);
+        spanRight = (int16_t)(cx + x);
+        rowsRemaining = (y == 0) ? 1 : 2;   /* at y == 0 rows cy+y and cy-y coincide: draw once */
+        for (rowYDelta = 0; rowYDelta < rowsRemaining; rowYDelta++) {
+            int16_t  rowY = (rowYDelta == 0) ? (int16_t)(cy + y) : (int16_t)(cy - y);
+            uint8_t *rowBase = &pixels[(uint16_t)rowY * (uint16_t)SURFACE_BYTES_PER_ROW];
+            int16_t  px = spanLeft;
+
+            /* Leading partial nibble. */
+            if (px & 1) {   /* span starts on an odd pixel: low nibble only */
+                uint8_t *byte = &rowBase[(uint16_t)px >> 1];
+                *byte = (uint8_t)((*byte & 0xF0u) | nibLo);
+                px++;
+            }
+            /* Middle whole bytes. */
+            {
+                int16_t midBytes = (int16_t)((spanRight + 1 - px) >> 1);   /* remaining pixels / 2 */
+                if (midBytes > 0) {
+                    uint8_t *p = &rowBase[(uint16_t)px >> 1];
+                    int16_t  i;
+                    for (i = 0; i < midBytes; i++) {
+                        p[i] = doubled;
+                    }
+                    px = (int16_t)(px + (midBytes << 1));
+                }
+            }
+            /* Trailing partial nibble. */
+            if (px <= spanRight) {   /* one even pixel left: high nibble only */
+                uint8_t *byte = &rowBase[(uint16_t)px >> 1];
+                *byte = (uint8_t)((*byte & 0x0Fu) | nibHi);
+            }
+        }
+
+        yy = (uint16_t)(yy + (uint16_t)((uint16_t)y + (uint16_t)y + 1u));   /* (y+1)^2 = y^2 + 2y + 1 */
+    }
+}
+
+
+// Bresenham line plot. Caller must verify both endpoints are on
+// surface (halFastDrawLine contract). Falls through to a tight
+// inner loop with no per-pixel function calls.
+static inline void draw68kLine(uint8_t *pixels, int16_t x0, int16_t y0,
+                               int16_t x1, int16_t y1, uint8_t colorIndex) {
+    int16_t  dx;            /* |x1 - x0| */
+    int16_t  dy;            /* |y1 - y0| */
+    int16_t  sx;            /* x step direction: +1 or -1 */
+    int16_t  sy;            /* y step direction: +1 or -1 */
+    int16_t  err;           /* running error term, starts at dx - dy */
+    int16_t  e2;            /* 2 * err, compared against -dy and dx */
+    uint8_t  nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    uint8_t  nibHi = (uint8_t)(nibLo << 4);
+
+    dx = (int16_t)(x1 - x0);
+    if (dx < 0) {
+        dx = (int16_t)(-dx);
+    }
+    dy = (int16_t)(y1 - y0);
+    if (dy < 0) {
+        dy = (int16_t)(-dy);
+    }
+    sx = (x0 < x1) ? 1 : -1;
+    sy = (y0 < y1) ? 1 : -1;
+    err = (int16_t)(dx - dy);
+
+    for (;;) {
+        draw68kPlotPixel(pixels, x0, y0, nibLo, nibHi);
+        if (x0 == x1 && y0 == y1) {   /* the end point is plotted before leaving */
+            break;
+        }
+        e2 = (int16_t)(err + err);
+        if (e2 > -dy) {     /* advance along x */
+            err = (int16_t)(err - dy);
+            x0  = (int16_t)(x0 + sx);
+        }
+        if (e2 < dx) {      /* advance along y (both may fire: diagonal step) */
+            err = (int16_t)(err + dx);
+            y0  = (int16_t)(y0 + sy);
+        }
+    }
+}
+
+
+#endif /* JOEYLIB_DRAW68K_INLINE_H */
diff --git a/src/shared68k/surface68k.s b/src/shared68k/surface68k.s
new file mode 100644
index 0000000..7e3186a
--- /dev/null
+++ b/src/shared68k/surface68k.s
@@ -0,0 +1,255 @@
+| Shared 68000 fast paths for the chunky surface buffer.
+|
+| Both Amiga and Atari ST keep the JoeyLib stage / surfaces in the
+| same 4bpp packed (2 px / byte, 160 bytes / row, 32000 bytes / surface)
+| format -- the c2p / blit-to-screen step that converts to native
+| pixel layout happens only at present time. So the per-byte buffer
+| operations (fill, clear) can be shared across both ports.
+|
+| Calling convention: cdecl (gcc-amigaos / gcc-atari-mint).
+|   d0/d1/a0/a1 are caller-save (scratch).
+|   d2-d7/a2-a6 are callee-save (must be preserved).
+|   16-bit args occupy a 4-byte int slot (low word at +2, low byte at +3).
+|
+| GAS m68k syntax. Assembled by the gcc driver via binutils m68k-as.
+
+                .text
+
+
+| ----------------------------------------------------------------
+| void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
+|
+| Fill 32000 bytes (SURFACE_PIXELS_SIZE) starting at `pixels` with
+| `fillByte`. Unrolled move.l with the byte replicated into a 32-bit
+| long, so each loop pass writes 32 bytes (8 longs) instead of 1.
+| ----------------------------------------------------------------
+                .globl  _surface68kClearLong
+
+                .equ    SURF_CLEAR_BYTES, 32000
+                .equ    SURF_CLEAR_LONGS, (SURF_CLEAR_BYTES / 4)
+                .equ    SURF_CLEAR_PER_ITER, 8
+                .equ    SURF_CLEAR_ITERS, (SURF_CLEAR_LONGS / SURF_CLEAR_PER_ITER)
+
+                | Save d2 (used as dbra counter). a0/a1/d0/d1 are
+                | caller-save so we may clobber them freely. Stack
+                | offset to args = 4 (saved d2) + 4 (return PC) = 8.
+                .equ    CLR_SAVED, 4
+
+_surface68kClearLong:
+                move.l  %d2,-(%sp)                      | save callee-save d2
+
+                move.l   4+CLR_SAVED(%sp),%a0           | pixels
+                | Zero d0 BEFORE the move.b -- m68k move.b only touches
+                | the low byte of the destination, so d0[31..8] would
+                | otherwise hold whatever garbage the caller left in d0
+                | and pollute the OR-replicate chain below.
+                moveq   #0,%d0
+                move.b   8+CLR_SAVED+3(%sp),%d0         | fillByte (low byte of int)
+
+                | Build a long with fillByte replicated four times.
+                | d0 = $000000FB -> $0000FBFB -> $FBFBFBFB
+                move.l  %d0,%d1
+                lsl.l   #8,%d1                          | d1 = $0000FB00
+                or.l    %d1,%d0                         | d0 = $0000FBFB
+                move.l  %d0,%d1
+                swap    %d1                             | d1 = $FBFB0000
+                or.l    %d1,%d0                         | d0 = $FBFBFBFB
+
+                move.l  %a0,%a1                         | dest cursor
+                move.w  #(SURF_CLEAR_ITERS - 1),%d2     | dbra count
+
+.LclearLoop:
+                move.l  %d0,(%a1)+                      | 8 longs = 32 bytes per pass
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                dbra    %d2,.LclearLoop                 | SURF_CLEAR_ITERS passes in total
+
+                move.l  (%sp)+,%d2                      | restore d2
+                rts
+
+
+| ----------------------------------------------------------------
+| void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h,
+|                             uint16_t fillByte);
+|
+| Full-width (320 px = 160 byte) rectangle fill. `pixels` is the
+| surface base; the row at `y` is `pixels + y * 160`. Fills `h`
+| consecutive rows using move.l writes.
+| ----------------------------------------------------------------
+                .globl  _surface68kFillRectFull
+
+                .equ    SURF_FRF_BYTES_PER_ROW, 160
+
+                | Save d2/d3 (used for fillByte replicate scratch and
+                | row counter). 8 bytes saved + 4 ret PC = 12 to args.
+                .equ    FRF_SAVED, 8
+
+_surface68kFillRectFull:
+                movem.l %d2-%d3,-(%sp)                  | save callee-save d2/d3
+
+                move.l   4+FRF_SAVED(%sp),%a0           | pixels base
+                | Zero d0/d1/d2 BEFORE move.b/move.w -- those only
+                | update sub-register portions and would otherwise carry
+                | the caller's garbage upper bits into the replicate OR.
+                moveq   #0,%d0
+                moveq   #0,%d1
+                moveq   #0,%d2
+                move.w   8+FRF_SAVED+2(%sp),%d1         | y (low word)
+                move.w  12+FRF_SAVED+2(%sp),%d2         | h (low word)
+                move.b  16+FRF_SAVED+3(%sp),%d0         | fillByte
+
+                | Bail on degenerate height.
+                tst.w   %d2
+                ble     .Lfrf_done                      | h == 0 (or bit 15 set): nothing to fill
+
+                | Build replicated fillByte in d0.
+                move.l  %d0,%d3
+                lsl.l   #8,%d3
+                or.l    %d3,%d0
+                move.l  %d0,%d3
+                swap    %d3
+                or.l    %d3,%d0                         | d0 = byte * $01010101
+
+                | row pointer = pixels + y * 160 = pixels + y*128 + y*32
+                ext.l   %d1                             | sign-extend y word to a long
+                move.l  %d1,%d3
+                lsl.l   #7,%d3                          | y * 128
+                lsl.l   #5,%d1                          | y *  32
+                add.l   %d3,%d1                         | y * 160
+                add.l   %d1,%a0                         | a0 = first row to fill
+
+                subq.w  #1,%d2                          | row dbra count
+
+.Lfrf_rowLoop:
+                move.l  %a0,%a1                         | a1 = write cursor within this row
+                | 40 longs per row, fully unrolled.
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                move.l  %d0,(%a1)+
+                lea     SURF_FRF_BYTES_PER_ROW(%a0),%a0 | advance to the next row
+                dbra    %d2,.Lfrf_rowLoop
+
+.Lfrf_done:
+                movem.l (%sp)+,%d2-%d3                  | restore d2/d3
+                rts
+
+
+| ----------------------------------------------------------------
+| void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes,
+|                                    uint16_t h, uint16_t fillByte);
+|
+| Per-row whole-byte fill for a fillRect whose leading / trailing
+| nibble fixups are already done (or unneeded: x and w both even).
+| `rowFirst` is the first FULLY-INSIDE byte on row 0 and must be an
+| EVEN address (move.l is used); later rows are at +160 each.
+| `midBytes` is how many full bytes per row to fill.
+| ----------------------------------------------------------------
+                .globl  _surface68kFillRectByteAligned
+
+                .equ    SURF_FRB_STRIDE, 160
+
+                | Save d2-d6 (5 regs = 20 bytes). Args at +20+4 = +24.
+                .equ    FRB_SAVED, 20
+
+_surface68kFillRectByteAligned:
+                movem.l %d2-%d6,-(%sp)                  | save callee-save d2-d6
+
+                move.l   4+FRB_SAVED(%sp),%a0           | rowFirst
+                | Zero scratch regs BEFORE the sub-register loads --
+                | move.w / move.b only touch low portions, leaving
+                | caller's garbage in the upper bits which would
+                | otherwise pollute the replicate-OR chain below.
+                moveq   #0,%d0
+                moveq   #0,%d1
+                moveq   #0,%d2
+                move.w   8+FRB_SAVED+2(%sp),%d1         | midBytes
+                move.w  12+FRB_SAVED+2(%sp),%d2         | h
+                move.b  16+FRB_SAVED+3(%sp),%d0         | fillByte
+
+                tst.w   %d1
+                beq     .Lfrb_done                      | midBytes == 0: nothing to fill
+                tst.w   %d2
+                ble     .Lfrb_done                      | h == 0 (or bit 15 set): nothing to fill
+
+                | Build replicated fillByte in d0.
+                move.l  %d0,%d3
+                lsl.l   #8,%d3
+                or.l    %d3,%d0
+                move.l  %d0,%d3
+                swap    %d3
+                or.l    %d3,%d0                         | d0 = byte * $01010101
+
+                | Pre-compute long-count and trailing-byte residual.
+                move.w  %d1,%d4
+                lsr.w   #2,%d4                          | longCount
+                move.w  %d1,%d5
+                and.w   #3,%d5                          | trailBytes
+
+                subq.w  #1,%d2                          | row dbra count
+
+.Lfrb_rowLoop:
+                move.l  %a0,%a1                         | a1 = write cursor within this row
+                move.w  %d4,%d6                         | longs left
+                tst.w   %d6
+                beq     .Lfrb_tail                      | fewer than 4 bytes per row: bytes only
+                subq.w  #1,%d6                          | dbra counts down to -1
+.Lfrb_longLoop:
+                move.l  %d0,(%a1)+
+                dbra    %d6,.Lfrb_longLoop
+.Lfrb_tail:
+                move.w  %d5,%d6                         | trailing bytes
+                tst.w   %d6
+                beq     .Lfrb_rowDone                   | midBytes was a multiple of 4
+                subq.w  #1,%d6                          | dbra counts down to -1
+.Lfrb_byteLoop:
+                move.b  %d0,(%a1)+
+                dbra    %d6,.Lfrb_byteLoop
+.Lfrb_rowDone:
+                lea     SURF_FRB_STRIDE(%a0),%a0        | advance to the next row
+                dbra    %d2,.Lfrb_rowLoop
+
+.Lfrb_done:
+                movem.l (%sp)+,%d2-%d6                  | restore d2-d6
+                rts