From 6c03d93e8824fe54717c0aae852458169432f6f2 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Fri, 1 May 2026 13:00:11 -0500 Subject: [PATCH] Amiga and ST improvements. --- make/amiga.mk | 2 +- make/atarist.mk | 2 +- src/port/amiga/hal.c | 103 ++++++++----- src/port/atarist/hal.c | 98 ++++++++----- src/shared68k/draw68k_inline.h | 201 ++++++++++++++++++++++++++ src/shared68k/surface68k.s | 255 +++++++++++++++++++++++++++++++++ 6 files changed, 589 insertions(+), 72 deletions(-) create mode 100644 src/shared68k/draw68k_inline.h create mode 100644 src/shared68k/surface68k.s diff --git a/make/amiga.mk b/make/amiga.mk index 4e2f86b..63741b4 100644 --- a/make/amiga.mk +++ b/make/amiga.mk @@ -13,7 +13,7 @@ BINDIR := $(BUILD)/bin # independently. -I on $(SRC_PORT)/amiga lets ptplayer.h resolve # from the port-local shim alongside our HAL code. PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(PTPLAYER_DIR) +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) # OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses # CIA-B + audio.device interrupts via the OS rather than taking over # Paula directly), matching the way our HAL cooperates with Intuition. 
diff --git a/make/atarist.mk b/make/atarist.mk index a30f42e..c8d7536 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM) LIBDIR := $(BUILD)/lib BINDIR := $(BUILD)/bin -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) LDFLAGS := # libxmp-lite shared with the DOS port. Built as a static archive that diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index b1d4b85..fe6997b 100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -44,6 +44,7 @@ #include "hal.h" #include "surfaceInternal.h" +#include "draw68k_inline.h" extern struct Custom custom; @@ -602,22 +603,56 @@ void halShutdown(void) { } -// Amiga has no asm fast paths yet; cross-platform code falls back to -// its C implementations whenever these return false. +// Shared 68k fast paths for the chunky surface buffer (src/shared68k/ +// surface68k.s). Same primitives used by the Atari ST port -- the +// stage / surfaces are an identical 4bpp packed layout on both. +extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte); +extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte); +extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte); + + bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { - (void)s; - (void)doubled; - return false; + if (s != stageGet()) { + return false; + } + surface68kClearLong(s->pixels, (uint16_t)doubled); + return true; } +// Fast path bands: +// - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per +// row via surface68kFillRectFull. 
No nibble fixups needed -- both +// nibbles in every byte get the same value, and rowFirst is the +// surface base which is always word-aligned by calloc. +// - x % 4 == 0 && w even (byte-aligned AND word-aligned): inner +// bytes via the asm. The (x % 4 == 0) part is the 68000 alignment +// requirement for the move.l writes inside the asm -- byte index +// = x/2, so x must be a multiple of 4 for the byte index to be +// even. +// - everything else: fall through to C's fillRectClipped, which +// does per-byte writes (no alignment needed) and handles the +// leading / trailing nibble RMW correctly. bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - (void)s; - (void)x; - (void)y; - (void)w; - (void)h; - (void)colorIndex; + uint8_t doubled; + + if (s != stageGet()) { + return false; + } + if (h == 0u || w == 0u) { + return true; /* clipped-out: nothing to do, but we "handled" it */ + } + doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu)); + + if (x == 0 && w == (uint16_t)SURFACE_WIDTH) { + surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled); + return true; + } + if (((x & 3) == 0) && ((w & 1u) == 0u)) { + uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled); + return true; + } return false; } @@ -652,42 +687,40 @@ bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - (void)s; - (void)x; - (void)y; - (void)colorIndex; - return false; + uint8_t nibLo; + if (s != stageGet()) { + return false; + } + nibLo = (uint8_t)(colorIndex & 0x0Fu); + draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4)); + return true; } bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { - (void)s; - (void)x0; - (void)y0; - 
(void)x1; - (void)y1; - (void)colorIndex; - return false; + if (s != stageGet()) { + return false; + } + draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex); + return true; } bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { - (void)s; - (void)cx; - (void)cy; - (void)r; - (void)colorIndex; - return false; + if (s != stageGet()) { + return false; + } + draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex); + return true; } bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { - (void)s; - (void)cx; - (void)cy; - (void)r; - (void)colorIndex; - return false; + if (s != stageGet()) { + return false; + } + draw68kCircleFill(s->pixels, cx, cy, r, colorIndex); + return true; } diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index c9fd03c..2efcf32 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -36,6 +36,7 @@ #include "hal.h" #include "surfaceInternal.h" +#include "draw68k_inline.h" // ----- Constants ----- @@ -600,22 +601,51 @@ void halShutdown(void) { } -// ST has no asm fast paths yet; cross-platform code falls back to its -// C implementations when these return false. +// Shared 68k fast paths for the chunky surface buffer (src/shared68k/ +// surface68k.s). Same primitives used by the Amiga port -- the stage / +// surfaces are identical 4bpp packed layout on both 68k targets. 
+extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte); +extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte); +extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte); + + bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { - (void)s; - (void)doubled; - return false; + if (s != stageGet()) { + return false; + } + surface68kClearLong(s->pixels, (uint16_t)doubled); + return true; } +// Fast path bands: +// - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per +// row via surface68kFillRectFull. Always word-aligned. +// - x % 4 == 0 && w even (word-aligned): byte index = x/2, so x must +// be a multiple of 4 for the move.l writes inside the asm to land +// on even addresses (68000 address-error rule). +// - everything else: fall through to C's fillRectClipped, which is +// per-byte and tolerates any alignment. bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - (void)s; - (void)x; - (void)y; - (void)w; - (void)h; - (void)colorIndex; + uint8_t doubled; + + if (s != stageGet()) { + return false; + } + if (h == 0u || w == 0u) { + return true; + } + doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu)); + + if (x == 0 && w == (uint16_t)SURFACE_WIDTH) { + surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled); + return true; + } + if (((x & 3) == 0) && ((w & 1u) == 0u)) { + uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled); + return true; + } return false; } @@ -650,42 +680,40 @@ bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - (void)s; - (void)x; - (void)y; - (void)colorIndex; - return false; + uint8_t nibLo; + if (s != stageGet()) { + return false; + 
} + nibLo = (uint8_t)(colorIndex & 0x0Fu); + draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4)); + return true; } bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { - (void)s; - (void)x0; - (void)y0; - (void)x1; - (void)y1; - (void)colorIndex; - return false; + if (s != stageGet()) { + return false; + } + draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex); + return true; } bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { - (void)s; - (void)cx; - (void)cy; - (void)r; - (void)colorIndex; - return false; + if (s != stageGet()) { + return false; + } + draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex); + return true; } bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { - (void)s; - (void)cx; - (void)cy; - (void)r; - (void)colorIndex; - return false; + if (s != stageGet()) { + return false; + } + draw68kCircleFill(s->pixels, cx, cy, r, colorIndex); + return true; } diff --git a/src/shared68k/draw68k_inline.h b/src/shared68k/draw68k_inline.h new file mode 100644 index 0000000..091b520 --- /dev/null +++ b/src/shared68k/draw68k_inline.h @@ -0,0 +1,201 @@ +// Shared 68k C-side fast paths for drawing primitives. +// +// Header-only `static inline` so both Amiga and Atari ST hal.c can +// pull the same code without makefile changes -- the surface buffer +// layout is identical between the two ports (4bpp packed, 160 bytes +// per row, 32000 bytes per surface), so the per-pixel write logic +// is portable. +// +// All functions assume the caller has already verified the rect / +// circle bounding box is fully on-surface (which is what the +// halFast* contract guarantees). No clip checks inside the hot +// loops, no per-pixel surfaceMarkDirtyRect calls -- the caller marks +// the bounding box dirty once after the call. 
+//
+// gcc-amigaos / gcc-atari-mint both inline these aggressively at -O2,
+// so the wrapper functions in each port's hal.c compile to a single
+// tight m68k loop with no JSR per pixel.
+
+#ifndef JOEYLIB_DRAW68K_INLINE_H
+#define JOEYLIB_DRAW68K_INLINE_H
+
+#include "joey/types.h"
+#include "joey/surface.h"
+
+
+// Plot a single pixel at (x, y) without bounds checking. Even x is
+// the high nibble of byte x/2, odd x the low nibble. nibLo / nibHi
+// are the color in each position, precomputed once by the caller.
+static inline void draw68kPlotPixel(uint8_t *pixels, int16_t x, int16_t y,
+                                    uint8_t nibLo, uint8_t nibHi) {
+    uint8_t *byte = &pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW
+                            + ((uint16_t)x >> 1)];  // row start + x / 2
+    if (x & 1) {
+        *byte = (uint8_t)((*byte & 0xF0u) | nibLo);  // odd x: keep the high nibble
+    } else {
+        *byte = (uint8_t)((*byte & 0x0Fu) | nibHi);  // even x: keep the low nibble
+    }
+}
+
+
+// Bresenham midpoint-circle outline. Plots all 8 octants per
+// iteration. Caller must verify cx +/- r and cy +/- r are all in
+// surface bounds (the halFastDrawCircle contract).
+static inline void draw68kCircleOutline(uint8_t *pixels, int16_t cx, int16_t cy,
+                                        uint16_t r, uint8_t colorIndex) {
+    int16_t x;
+    int16_t y;
+    int16_t err;
+    uint8_t nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    uint8_t nibHi = (uint8_t)(nibLo << 4);
+
+    if (r == 0u) {  // radius 0 is just the center pixel
+        draw68kPlotPixel(pixels, cx, cy, nibLo, nibHi);
+        return;
+    }
+
+    x = (int16_t)r;  // start at (r, 0)
+    y = 0;
+    err = (int16_t)(1 - x);  // decision variable, starts at 1 - r
+    while (x >= y) {  // walk while x >= y; the 8 plots below mirror each point
+        draw68kPlotPixel(pixels, (int16_t)(cx + x), (int16_t)(cy + y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - x), (int16_t)(cy + y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx + x), (int16_t)(cy - y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - x), (int16_t)(cy - y), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx + y), (int16_t)(cy + x), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - y), (int16_t)(cy + x), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx + y), (int16_t)(cy - x), nibLo, nibHi);
+        draw68kPlotPixel(pixels, (int16_t)(cx - y), (int16_t)(cy - x), nibLo, nibHi);
+        y++;
+        if (err <= 0) {
+            err = (int16_t)(err + y + y + 1);  // keep x: err += 2y + 1 (y already stepped)
+        } else {
+            x--;
+            err = (int16_t)(err + y + y - x - x + 1);  // step x in: err += 2(y - x) + 1
+        }
+    }
+}
+
+
+// Filled circle: for each y from 0..r, find the largest x with
+// x*x + y*y <= r*r and emit the symmetric horizontal span. Caller
+// must verify the bounding box is on-surface.
+//
+// Each span is a horizontal run on a single row, so it is written
+// as leading nibble / whole bytes / trailing nibble instead of one
+// draw68kPlotPixel call per pixel.
+static inline void draw68kCircleFill(uint8_t *pixels, int16_t cx, int16_t cy,
+                                     uint16_t r, uint8_t colorIndex) {
+    int16_t y;
+    int16_t x;
+    uint16_t xx;  // running x * x
+    uint16_t yy;  // running y * y
+    uint16_t r2;  // r * r, fixed for the whole fill
+    uint8_t nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    uint8_t nibHi = (uint8_t)(nibLo << 4);
+    uint8_t doubled = (uint8_t)(nibHi | nibLo);  // color in both nibbles of a byte
+
+    if (r == 0u) {  // radius 0 is just the center pixel
+        draw68kPlotPixel(pixels, cx, cy, nibLo, nibHi);
+        return;
+    }
+
+    xx = (uint16_t)(r * r);  // truncated to 16 bits: exact only while r <= 255
+    r2 = xx;
+    yy = 0;
+    x = (int16_t)r;
+
+    for (y = 0; y <= (int16_t)r; y++) {
+        int16_t spanLeft;
+        int16_t spanRight;
+        int16_t rowsRemaining;
+        int16_t rowYDelta;
+
+        while (xx + yy > r2) {  // shrink x until x*x + y*y <= r*r
+            xx = (uint16_t)(xx - (uint16_t)((uint16_t)x + (uint16_t)x - 1u));  // (x-1)^2 = x^2 - (2x - 1)
+            x--;
+        }
+
+        spanLeft = (int16_t)(cx - x);
+        spanRight = (int16_t)(cx + x);
+        rowsRemaining = (y == 0) ? 1 : 2;  // at y == 0, cy + y and cy - y are the same row
+        for (rowYDelta = 0; rowYDelta < rowsRemaining; rowYDelta++) {
+            int16_t rowY = (rowYDelta == 0) ? (int16_t)(cy + y) : (int16_t)(cy - y);
+            uint8_t *rowBase = &pixels[(uint16_t)rowY * (uint16_t)SURFACE_BYTES_PER_ROW];
+            int16_t px = spanLeft;
+
+            /* Leading partial nibble: odd px owns only the low nibble of its byte. */
+            if (px & 1) {
+                uint8_t *byte = &rowBase[(uint16_t)px >> 1];
+                *byte = (uint8_t)((*byte & 0xF0u) | nibLo);
+                px++;
+            }
+            /* Middle whole bytes: px is even here, so each byte is two span pixels. */
+            {
+                int16_t midBytes = (int16_t)((spanRight + 1 - px) >> 1);
+                if (midBytes > 0) {
+                    uint8_t *p = &rowBase[(uint16_t)px >> 1];
+                    int16_t i;
+                    for (i = 0; i < midBytes; i++) {
+                        p[i] = doubled;
+                    }
+                    px = (int16_t)(px + (midBytes << 1));
+                }
+            }
+            /* Trailing partial nibble: one even pixel left, high nibble only. */
+            if (px <= spanRight) {
+                uint8_t *byte = &rowBase[(uint16_t)px >> 1];
+                *byte = (uint8_t)((*byte & 0x0Fu) | nibHi);
+            }
+        }
+
+        yy = (uint16_t)(yy + (uint16_t)((uint16_t)y + (uint16_t)y + 1u));  // (y+1)^2 = y^2 + 2y + 1
+    }
+}
+
+
+// Bresenham line plot. Caller must verify both endpoints are on
+// surface (halFastDrawLine contract). Each step plots through the
+// static inline draw68kPlotPixel helper; nothing is clipped here.
+static inline void draw68kLine(uint8_t *pixels, int16_t x0, int16_t y0,
+                               int16_t x1, int16_t y1, uint8_t colorIndex) {
+    int16_t dx;  // |x1 - x0|
+    int16_t dy;  // |y1 - y0|
+    int16_t sx;  // x step direction: +1 or -1
+    int16_t sy;  // y step direction: +1 or -1
+    int16_t err;
+    int16_t e2;
+    uint8_t nibLo = (uint8_t)(colorIndex & 0x0Fu);
+    uint8_t nibHi = (uint8_t)(nibLo << 4);
+
+    dx = (int16_t)(x1 - x0);
+    if (dx < 0) {
+        dx = (int16_t)(-dx);
+    }
+    dy = (int16_t)(y1 - y0);
+    if (dy < 0) {
+        dy = (int16_t)(-dy);
+    }
+    sx = (x0 < x1) ? 1 : -1;
+    sy = (y0 < y1) ? 1 : -1;
+    err = (int16_t)(dx - dy);
+
+    for (;;) {
+        draw68kPlotPixel(pixels, x0, y0, nibLo, nibHi);
+        if (x0 == x1 && y0 == y1) {  // end point is plotted before leaving
+            break;
+        }
+        e2 = (int16_t)(err + err);  // doubled error, tested against both axes
+        if (e2 > -dy) {  // step in x
+            err = (int16_t)(err - dy);
+            x0 = (int16_t)(x0 + sx);
+        }
+        if (e2 < dx) {  // step in y (both ifs can fire: diagonal step)
+            err = (int16_t)(err + dx);
+            y0 = (int16_t)(y0 + sy);
+        }
+    }
+}
+
+
+#endif /* JOEYLIB_DRAW68K_INLINE_H */
diff --git a/src/shared68k/surface68k.s b/src/shared68k/surface68k.s
new file mode 100644
index 0000000..7e3186a
--- /dev/null
+++ b/src/shared68k/surface68k.s
@@ -0,0 +1,255 @@
+| Shared 68000 fast paths for the chunky surface buffer.
+|
+| Both Amiga and Atari ST keep the JoeyLib stage / surfaces in the
+| same 4bpp packed (2 px / byte, 160 bytes / row, 32000 bytes / surface)
+| format -- the c2p / blit-to-screen step that converts to native
+| pixel layout happens only at present time. So the per-byte buffer
+| operations (fill, clear) can be shared across both ports.
+|
+| Calling convention: cdecl (gcc-amigaos / gcc-atari-mint).
+| d0/d1/a0/a1 are caller-save (scratch).
+| d2-d7/a2-a6 are callee-save (must be preserved).
+| 16-bit args occupy a 4-byte int slot; the value is the low word (+2).
+|
+| GAS m68k syntax. Assembled by the gcc driver via binutils m68k-as.
+
+        .text
+
+
+| ----------------------------------------------------------------
+| void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
+|
+| Fill 32000 bytes (SURFACE_PIXELS_SIZE) starting at `pixels` with
+| `fillByte`.
Unrolled move.l with the byte expanded to a 32-bit
+| longword so we can write 32 bytes per loop body instead of 1.
+| ----------------------------------------------------------------
+        .globl  _surface68kClearLong
+
+        .equ    SURF_CLEAR_BYTES, 32000
+        .equ    SURF_CLEAR_LONGS, (SURF_CLEAR_BYTES / 4)
+        .equ    SURF_CLEAR_PER_ITER, 8
+        .equ    SURF_CLEAR_ITERS, (SURF_CLEAR_LONGS / SURF_CLEAR_PER_ITER)
+
+        | Save d2 (used as dbra counter). a0/a1/d0/d1 are
+        | caller-save so we may clobber them freely. Stack
+        | offset to args = 4 (saved d2) + 4 (return PC) = 8.
+        .equ    CLR_SAVED, 4
+
+_surface68kClearLong:
+        move.l  %d2,-(%sp)
+
+        move.l  4+CLR_SAVED(%sp),%a0            | pixels
+        | Zero d0 BEFORE the move.b -- m68k move.b only touches
+        | the low byte of the destination, so d0[31..8] would
+        | otherwise hold whatever garbage the caller left in d0
+        | and pollute the OR-replicate chain below.
+        moveq   #0,%d0
+        move.b  8+CLR_SAVED+3(%sp),%d0          | fillByte (low byte of int)
+
+        | Build a long with fillByte replicated four times.
+        | d0 = $000000FB -> $0000FBFB -> $FBFBFBFB
+        move.l  %d0,%d1
+        lsl.l   #8,%d1                          | d1 = $0000FB00
+        or.l    %d1,%d0                         | d0 = $0000FBFB
+        move.l  %d0,%d1
+        swap    %d1                             | d1 = $FBFB0000
+        or.l    %d1,%d0                         | d0 = $FBFBFBFB
+
+        move.l  %a0,%a1                         | dest cursor
+        move.w  #(SURF_CLEAR_ITERS - 1),%d2     | dbra count
+
+.LclearLoop:
+        move.l  %d0,(%a1)+                      | SURF_CLEAR_PER_ITER (8) longs = 32 bytes per pass
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        dbra    %d2,.LclearLoop                 | SURF_CLEAR_ITERS passes in total
+
+        move.l  (%sp)+,%d2                      | restore callee-save d2
+        rts
+
+
+| ----------------------------------------------------------------
+| void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h,
+|                             uint16_t fillByte);
+|
+| Full-width (320 px = 160 byte) rectangle fill. `pixels` is the
+| surface base; the row at `y` is `pixels + y * 160`. Fills `h`
+| consecutive rows using move.l writes.
+| ----------------------------------------------------------------
+        .globl  _surface68kFillRectFull
+
+        .equ    SURF_FRF_BYTES_PER_ROW, 160
+
+        | Save d2/d3 (used for fillByte replicate scratch and
+        | row counter). 8 bytes saved + 4 ret PC = 12 to args.
+        .equ    FRF_SAVED, 8
+
+_surface68kFillRectFull:
+        movem.l %d2-%d3,-(%sp)
+
+        move.l  4+FRF_SAVED(%sp),%a0            | pixels base
+        | Zero d0/d1 BEFORE move.b/move.w -- those only update
+        | sub-register portions and would otherwise carry the
+        | caller's garbage upper bits into the replicate OR.
+        moveq   #0,%d0
+        moveq   #0,%d1
+        moveq   #0,%d2
+        move.w  8+FRF_SAVED+2(%sp),%d1          | y (low word)
+        move.w  12+FRF_SAVED+2(%sp),%d2         | h (low word)
+        move.b  16+FRF_SAVED+3(%sp),%d0         | fillByte
+
+        | Bail on degenerate height.
+        tst.w   %d2
+        ble     .Lfrf_done                      | signed test: h == 0 or h >= $8000 skips the fill
+
+        | Build replicated fillByte in d0.
+        move.l  %d0,%d3
+        lsl.l   #8,%d3
+        or.l    %d3,%d0
+        move.l  %d0,%d3
+        swap    %d3
+        or.l    %d3,%d0                         | d0 = byte * $01010101
+
+        | row pointer = pixels + y * 160 = pixels + y*128 + y*32
+        ext.l   %d1                             | sign-extend y from word to long
+        move.l  %d1,%d3
+        lsl.l   #7,%d3                          | y * 128
+        lsl.l   #5,%d1                          | y * 32
+        add.l   %d3,%d1                         | y * 160
+        add.l   %d1,%a0                         | a0 = first row to fill
+
+        subq.w  #1,%d2                          | row dbra count
+
+.Lfrf_rowLoop:
+        move.l  %a0,%a1                         | a1 = write cursor within this row
+        | 40 longs per row, fully unrolled.
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        move.l  %d0,(%a1)+
+        lea     SURF_FRF_BYTES_PER_ROW(%a0),%a0 | advance a0 to the next row
+        dbra    %d2,.Lfrf_rowLoop
+
+.Lfrf_done:
+        movem.l (%sp)+,%d2-%d3                  | restore callee-save d2/d3
+        rts
+
+
+| ----------------------------------------------------------------
+| void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes,
+|                                    uint16_t h, uint16_t fillByte);
+|
+| Per-row middle-bytes fill for a fillRect where the leading and
+| trailing nibble fixups have already been done (or none are
+| needed because x and w are both even). `rowFirst` points at the
+| first FULLY-INSIDE byte of the rect on row 0; subsequent rows
+| are at +160. `midBytes` is how many full bytes per row to fill.
+| ----------------------------------------------------------------
+        .globl  _surface68kFillRectByteAligned
+
+        .equ    SURF_FRB_STRIDE, 160
+
+        | Save d2-d6 (5 regs = 20 bytes). Args at +20+4 = +24.
+        .equ    FRB_SAVED, 20
+
+_surface68kFillRectByteAligned:
+        movem.l %d2-%d6,-(%sp)
+
+        move.l  4+FRB_SAVED(%sp),%a0            | rowFirst
+        | Zero scratch regs BEFORE the sub-register loads --
+        | move.w / move.b only touch low portions, leaving
+        | caller's garbage in the upper bits which would
+        | otherwise pollute the replicate-OR chain below.
+        moveq   #0,%d0
+        moveq   #0,%d1
+        moveq   #0,%d2
+        move.w  8+FRB_SAVED+2(%sp),%d1          | midBytes
+        move.w  12+FRB_SAVED+2(%sp),%d2         | h
+        move.b  16+FRB_SAVED+3(%sp),%d0         | fillByte
+
+        tst.w   %d1
+        beq     .Lfrb_done                      | midBytes == 0: nothing to write
+        tst.w   %d2
+        ble     .Lfrb_done                      | signed test: h == 0 or h >= $8000 skips the fill
+
+        | Build replicated fillByte in d0.
+        move.l  %d0,%d3
+        lsl.l   #8,%d3
+        or.l    %d3,%d0
+        move.l  %d0,%d3
+        swap    %d3
+        or.l    %d3,%d0                         | d0 = fillByte in all four bytes
+
+        | Pre-compute long-count and trailing-byte residual.
+        move.w  %d1,%d4
+        lsr.w   #2,%d4                          | longCount
+        move.w  %d1,%d5
+        and.w   #3,%d5                          | trailBytes
+
+        subq.w  #1,%d2                          | row dbra count
+
+.Lfrb_rowLoop:
+        move.l  %a0,%a1                         | a1 = write cursor within this row
+        move.w  %d4,%d6                         | longs left
+        tst.w   %d6
+        beq     .Lfrb_tail                      | fewer than 4 bytes: byte loop only
+        subq.w  #1,%d6                          | dbra runs count + 1 times
+.Lfrb_longLoop:
+        move.l  %d0,(%a1)+                      | long write: a1 must be even on a 68000
+        dbra    %d6,.Lfrb_longLoop
+.Lfrb_tail:
+        move.w  %d5,%d6                         | trailing bytes
+        tst.w   %d6
+        beq     .Lfrb_rowDone
+        subq.w  #1,%d6                          | dbra runs count + 1 times
+.Lfrb_byteLoop:
+        move.b  %d0,(%a1)+
+        dbra    %d6,.Lfrb_byteLoop
+.Lfrb_rowDone:
+        lea     SURF_FRB_STRIDE(%a0),%a0        | advance a0 to the next row
+        dbra    %d2,.Lfrb_rowLoop
+
+.Lfrb_done:
+        movem.l (%sp)+,%d2-%d6                  | restore callee-save d2-d6
+        rts