Amiga and ST improvements.

This commit is contained in:
Scott Duensing 2026-05-01 13:00:11 -05:00
parent 2eaa16a815
commit 6c03d93e88
6 changed files with 589 additions and 72 deletions

View file

@ -13,7 +13,7 @@ BINDIR := $(BUILD)/bin
# independently. -I on $(SRC_PORT)/amiga lets ptplayer.h resolve
# <SDI_compiler.h> from the port-local shim alongside our HAL code.
PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(PTPLAYER_DIR)
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR)
# OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses
# CIA-B + audio.device interrupts via the OS rather than taking over
# Paula directly), matching the way our HAL cooperates with Intuition.

View file

@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM)
LIBDIR := $(BUILD)/lib
BINDIR := $(BUILD)/bin
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K)
LDFLAGS :=
# libxmp-lite shared with the DOS port. Built as a static archive that

View file

@ -44,6 +44,7 @@
#include "hal.h"
#include "surfaceInternal.h"
#include "draw68k_inline.h"
extern struct Custom custom;
@ -602,22 +603,56 @@ void halShutdown(void) {
}
// Shared 68k fast paths for the chunky surface buffer (src/shared68k/
// surface68k.s). Same primitives used by the Atari ST port -- the
// stage / surfaces are an identical 4bpp packed layout on both.
// The halFast* wrappers below return false when they decline a
// request, and cross-platform code then falls back to its C
// implementation.
//
//   surface68kClearLong           - fill the whole surface with fillByte.
//   surface68kFillRectFull        - fill h full-width rows starting at row y.
//   surface68kFillRectByteAligned - fill midBytes bytes on each of h rows,
//                                   the first row starting at rowFirst.
extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte);
extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte);
// Clear the stage with the shared 68k long-word fill.
//
// `doubled` is handed to the asm routine unchanged as the fill byte
// (by its name, the color nibble replicated into both halves of the
// byte -- confirm against the caller).
//
// Returns true when the clear was done here, false when `s` is not
// the stage and the caller must use its C implementation.
bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
    if (s != stageGet()) {
        return false;
    }
    surface68kClearLong(s->pixels, (uint16_t)doubled);
    return true;
}
// Fast path bands:
//   - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per
//     row via surface68kFillRectFull. No nibble fixups needed -- both
//     nibbles in every byte get the same value, and rowFirst is the
//     surface base which is always word-aligned by calloc.
//   - x % 4 == 0 && w even (byte-aligned AND word-aligned): inner
//     bytes via the asm. The (x % 4 == 0) part is the 68000 alignment
//     requirement for the move.l writes inside the asm -- byte index
//     = x/2, so x must be a multiple of 4 for the byte index to be
//     even.
//   - everything else: return false so C's fillRectClipped runs; it
//     does per-byte writes (no alignment needed) and handles the
//     leading / trailing nibble RMW correctly.
//
// Returns true when the rectangle was filled here (or was empty),
// false when the caller must fall back to C. Surfaces other than the
// stage are always declined.
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    uint8_t doubled;

    if (s != stageGet()) {
        return false;
    }
    if (h == 0u || w == 0u) {
        return true; /* clipped-out: nothing to do, but we "handled" it */
    }

    // Same 4-bit color in both nibbles so whole bytes can be stored.
    doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu));

    if (x == 0 && w == (uint16_t)SURFACE_WIDTH) {
        surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled);
        return true;
    }
    if (((x & 3) == 0) && ((w & 1u) == 0u)) {
        // Two pixels per byte: byte column is x / 2, byte count is w / 2.
        uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
        surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled);
        return true;
    }
    return false;
}
@ -652,43 +687,41 @@ bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) {
// Plot one pixel on the stage through the shared inline helper.
// No bounds check is made here; the helper indexes the buffer
// directly, so (x, y) must already be on-surface.
//
// Returns true when the pixel was written, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) {
    uint8_t nibLo;

    if (s != stageGet()) {
        return false;
    }
    // The helper wants the color both as a low nibble and pre-shifted
    // into the high nibble.
    nibLo = (uint8_t)(colorIndex & 0x0Fu);
    draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4));
    return true;
}
// Draw a line on the stage with the shared inline Bresenham routine.
// draw68kLine does no clipping, so both endpoints must already be
// on-surface.
//
// Returns true when the line was drawn, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) {
    if (s != stageGet()) {
        return false;
    }
    draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex);
    return true;
}
// Draw a circle outline on the stage with the shared inline routine.
// draw68kCircleOutline does no clipping, so cx +/- r and cy +/- r
// must already be on-surface.
//
// Returns true when the outline was drawn, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    if (s != stageGet()) {
        return false;
    }
    draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex);
    return true;
}
// Draw a filled circle on the stage with the shared inline routine.
// draw68kCircleFill does no clipping, so the circle's bounding box
// must already be on-surface.
//
// Returns true when the circle was filled, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    if (s != stageGet()) {
        return false;
    }
    draw68kCircleFill(s->pixels, cx, cy, r, colorIndex);
    return true;
}
bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {

View file

@ -36,6 +36,7 @@
#include "hal.h"
#include "surfaceInternal.h"
#include "draw68k_inline.h"
// ----- Constants -----
@ -600,22 +601,51 @@ void halShutdown(void) {
}
// Shared 68k fast paths for the chunky surface buffer (src/shared68k/
// surface68k.s). Same primitives used by the Amiga port -- the stage /
// surfaces are identical 4bpp packed layout on both 68k targets.
// The halFast* wrappers below return false when they decline a
// request, and cross-platform code then falls back to its C
// implementation.
//
//   surface68kClearLong           - fill the whole surface with fillByte.
//   surface68kFillRectFull        - fill h full-width rows starting at row y.
//   surface68kFillRectByteAligned - fill midBytes bytes on each of h rows,
//                                   the first row starting at rowFirst.
extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte);
extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte);
// Clear the stage with the shared 68k long-word fill.
//
// `doubled` is handed to the asm routine unchanged as the fill byte
// (by its name, the color nibble replicated into both halves of the
// byte -- confirm against the caller).
//
// Returns true when the clear was done here, false when `s` is not
// the stage and the caller must use its C implementation.
bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
    if (s != stageGet()) {
        return false;
    }
    surface68kClearLong(s->pixels, (uint16_t)doubled);
    return true;
}
// Fast path bands:
//   - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per
//     row via surface68kFillRectFull. Always word-aligned.
//   - x % 4 == 0 && w even (word-aligned): byte index = x/2, so x must
//     be a multiple of 4 for the move.l writes inside the asm to land
//     on even addresses (68000 address-error rule).
//   - everything else: return false so C's fillRectClipped runs; it is
//     per-byte and tolerates any alignment.
//
// Returns true when the rectangle was filled here (or was empty),
// false when the caller must fall back to C. Surfaces other than the
// stage are always declined.
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    uint8_t doubled;

    if (s != stageGet()) {
        return false;
    }
    if (h == 0u || w == 0u) {
        return true; /* empty rect: nothing to draw, report it handled */
    }

    // Same 4-bit color in both nibbles so whole bytes can be stored.
    doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu));

    if (x == 0 && w == (uint16_t)SURFACE_WIDTH) {
        surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled);
        return true;
    }
    if (((x & 3) == 0) && ((w & 1u) == 0u)) {
        // Two pixels per byte: byte column is x / 2, byte count is w / 2.
        uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
        surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled);
        return true;
    }
    return false;
}
@ -650,43 +680,41 @@ bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) {
// Plot one pixel on the stage through the shared inline helper.
// No bounds check is made here; the helper indexes the buffer
// directly, so (x, y) must already be on-surface.
//
// Returns true when the pixel was written, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) {
    uint8_t nibLo;

    if (s != stageGet()) {
        return false;
    }
    // The helper wants the color both as a low nibble and pre-shifted
    // into the high nibble.
    nibLo = (uint8_t)(colorIndex & 0x0Fu);
    draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4));
    return true;
}
// Draw a line on the stage with the shared inline Bresenham routine.
// draw68kLine does no clipping, so both endpoints must already be
// on-surface.
//
// Returns true when the line was drawn, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) {
    if (s != stageGet()) {
        return false;
    }
    draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex);
    return true;
}
// Draw a circle outline on the stage with the shared inline routine.
// draw68kCircleOutline does no clipping, so cx +/- r and cy +/- r
// must already be on-surface.
//
// Returns true when the outline was drawn, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    if (s != stageGet()) {
        return false;
    }
    draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex);
    return true;
}
// Draw a filled circle on the stage with the shared inline routine.
// draw68kCircleFill does no clipping, so the circle's bounding box
// must already be on-surface.
//
// Returns true when the circle was filled, false when `s` is not the
// stage and the caller must use its C implementation.
bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    if (s != stageGet()) {
        return false;
    }
    draw68kCircleFill(s->pixels, cx, cy, r, colorIndex);
    return true;
}
bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {

View file

@ -0,0 +1,201 @@
// Shared 68k C-side fast paths for drawing primitives.
//
// Header-only `static inline` so both Amiga and Atari ST hal.c can
// pull the same code without adding an object file to either
// makefile (each port only needs this header's directory on its
// include path) -- the surface buffer layout is identical between
// the two ports (4bpp packed, 160 bytes per row, 32000 bytes per
// surface), so the per-pixel write logic is portable.
//
// All functions assume the caller has already verified the rect /
// circle bounding box is fully on-surface (which is what the
// halFast* contract guarantees). No clip checks inside the hot
// loops, no per-pixel surfaceMarkDirtyRect calls -- the caller marks
// the bounding box dirty once after the call.
//
// gcc-amigaos / gcc-atari-mint both inline these aggressively at -O2,
// so the wrapper functions in each port's hal.c compile to a single
// tight m68k loop with no JSR per pixel.
#ifndef JOEYLIB_DRAW68K_INLINE_H
#define JOEYLIB_DRAW68K_INLINE_H
#include "joey/types.h"
#include "joey/surface.h"
// Write one 4-bit pixel at (x, y); no bounds checking is done.
// Two pixels share a byte: an even x owns the high nibble, an odd x
// the low nibble. The caller supplies the color twice -- nibLo is the
// low-nibble form, nibHi the same value already shifted left by 4 --
// so no shift is needed per call.
static inline void draw68kPlotPixel(uint8_t *pixels, int16_t x, int16_t y,
                                    uint8_t nibLo, uint8_t nibHi) {
    uint8_t *cell = pixels + ((uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW
                              + ((uint16_t)x >> 1));
    // Keep the neighbouring pixel's nibble, replace ours.
    const uint8_t keepMask = (x & 1) ? 0xF0u : 0x0Fu;
    const uint8_t ours     = (x & 1) ? nibLo : nibHi;

    *cell = (uint8_t)((*cell & keepMask) | ours);
}
// Midpoint-circle outline. Each step plots the current (dx, dy)
// offset mirrored into all 8 octants, then advances dy and lets the
// decision value say when dx has to shrink. A radius of 0 plots the
// centre pixel only. Caller must verify cx +/- r and cy +/- r are
// all in surface bounds (the halFastDrawCircle contract).
static inline void draw68kCircleOutline(uint8_t *pixels, int16_t cx, int16_t cy,
                                        uint16_t r, uint8_t colorIndex) {
    const uint8_t lowNib  = (uint8_t)(colorIndex & 0x0Fu);
    const uint8_t highNib = (uint8_t)(lowNib << 4);
    int16_t dx;
    int16_t dy;
    int16_t decision;

    if (r == 0u) {
        draw68kPlotPixel(pixels, cx, cy, lowNib, highNib);
        return;
    }

    dx       = (int16_t)r;
    dy       = 0;
    decision = (int16_t)(1 - dx);

    while (dx >= dy) {
        // Octants with the long axis horizontal.
        draw68kPlotPixel(pixels, (int16_t)(cx + dx), (int16_t)(cy + dy), lowNib, highNib);
        draw68kPlotPixel(pixels, (int16_t)(cx - dx), (int16_t)(cy + dy), lowNib, highNib);
        draw68kPlotPixel(pixels, (int16_t)(cx + dx), (int16_t)(cy - dy), lowNib, highNib);
        draw68kPlotPixel(pixels, (int16_t)(cx - dx), (int16_t)(cy - dy), lowNib, highNib);
        // Octants with the long axis vertical (offsets swapped).
        draw68kPlotPixel(pixels, (int16_t)(cx + dy), (int16_t)(cy + dx), lowNib, highNib);
        draw68kPlotPixel(pixels, (int16_t)(cx - dy), (int16_t)(cy + dx), lowNib, highNib);
        draw68kPlotPixel(pixels, (int16_t)(cx + dy), (int16_t)(cy - dx), lowNib, highNib);
        draw68kPlotPixel(pixels, (int16_t)(cx - dy), (int16_t)(cy - dx), lowNib, highNib);

        dy++;
        if (decision > 0) {
            // Step inward; the update uses the already-decremented dx.
            dx--;
            decision = (int16_t)(decision + dy + dy - dx - dx + 1);
        } else {
            decision = (int16_t)(decision + dy + dy + 1);
        }
    }
}
// Filled circle built from horizontal spans. For each dy in 0..r the
// half-width shrinks to the largest value with
// half*half + dy*dy <= r*r, and the span [cx - half, cx + half] is
// written on row cy + dy and, when dy != 0, on row cy - dy as well.
// The squares are tracked incrementally, so the loop has no multiply.
// A radius of 0 plots the centre pixel only. Caller must verify the
// bounding box is on-surface.
//
// Spans go straight into the packed buffer: an odd left edge patches
// a low nibble, whole bytes in between take the doubled color, and a
// leftover pixel on the right patches a high nibble.
static inline void draw68kCircleFill(uint8_t *pixels, int16_t cx, int16_t cy,
                                     uint16_t r, uint8_t colorIndex) {
    const uint8_t lowNib  = (uint8_t)(colorIndex & 0x0Fu);
    const uint8_t highNib = (uint8_t)(lowNib << 4);
    const uint8_t bothNib = (uint8_t)(highNib | lowNib);
    uint16_t radiusSq;
    uint16_t halfSq;
    uint16_t dySq;
    int16_t  half;
    int16_t  dy;

    if (r == 0u) {
        draw68kPlotPixel(pixels, cx, cy, lowNib, highNib);
        return;
    }

    radiusSq = (uint16_t)(r * r);
    halfSq   = radiusSq;
    dySq     = 0;
    half     = (int16_t)r;

    for (dy = 0; dy <= (int16_t)r; dy++) {
        int16_t left;
        int16_t right;
        int16_t pass;
        int16_t passCount;

        // (half - 1)^2 = half^2 - (2 * half - 1)
        while (halfSq + dySq > radiusSq) {
            halfSq = (uint16_t)(halfSq - (uint16_t)((uint16_t)half + (uint16_t)half - 1u));
            half--;
        }
        left  = (int16_t)(cx - half);
        right = (int16_t)(cx + half);

        // The centre row is written once; every other dy has a mirror row.
        passCount = (dy != 0) ? 2 : 1;
        for (pass = 0; pass < passCount; pass++) {
            int16_t  rowY;
            int16_t  px;
            int16_t  wholeBytes;
            uint8_t *row;

            if (pass == 0) {
                rowY = (int16_t)(cy + dy);
            } else {
                rowY = (int16_t)(cy - dy);
            }
            row = pixels + (uint16_t)rowY * (uint16_t)SURFACE_BYTES_PER_ROW;
            px  = left;

            // Odd start: only the low nibble of the first byte is ours.
            if (px & 1) {
                uint8_t *cell = row + ((uint16_t)px >> 1);
                *cell = (uint8_t)((*cell & 0xF0u) | lowNib);
                px++;
            }

            // Bytes whose two pixels both lie inside the span.
            wholeBytes = (int16_t)((right + 1 - px) >> 1);
            if (wholeBytes > 0) {
                uint8_t *dst = row + ((uint16_t)px >> 1);
                uint8_t *end = dst + wholeBytes;
                while (dst != end) {
                    *dst++ = bothNib;
                }
                px = (int16_t)(px + (wholeBytes << 1));
            }

            // One pixel left over: it is the high nibble of the last byte.
            if (px <= right) {
                uint8_t *cell = row + ((uint16_t)px >> 1);
                *cell = (uint8_t)((*cell & 0x0Fu) | highNib);
            }
        }

        // (dy + 1)^2 = dy^2 + (2 * dy + 1)
        dySq = (uint16_t)(dySq + (uint16_t)((uint16_t)dy + (uint16_t)dy + 1u));
    }
}
// Bresenham line from (x0, y0) to (x1, y1), both endpoints included.
// A single error accumulator decides at each step whether to move in
// x, in y, or in both. Caller must verify both endpoints are on
// surface (halFastDrawLine contract); no clipping happens here.
static inline void draw68kLine(uint8_t *pixels, int16_t x0, int16_t y0,
                               int16_t x1, int16_t y1, uint8_t colorIndex) {
    const uint8_t lowNib  = (uint8_t)(colorIndex & 0x0Fu);
    const uint8_t highNib = (uint8_t)(lowNib << 4);
    int16_t spanX = (int16_t)(x1 - x0);
    int16_t spanY = (int16_t)(y1 - y0);
    int16_t stepX;
    int16_t stepY;
    int16_t accum;

    // Absolute extents; the direction is carried by stepX / stepY.
    if (spanX < 0) {
        spanX = (int16_t)(-spanX);
    }
    if (spanY < 0) {
        spanY = (int16_t)(-spanY);
    }
    if (x0 < x1) {
        stepX = 1;
    } else {
        stepX = -1;
    }
    if (y0 < y1) {
        stepY = 1;
    } else {
        stepY = -1;
    }
    accum = (int16_t)(spanX - spanY);

    draw68kPlotPixel(pixels, x0, y0, lowNib, highNib);
    while (x0 != x1 || y0 != y1) {
        const int16_t twice = (int16_t)(accum + accum);

        if (twice > -spanY) {
            accum = (int16_t)(accum - spanY);
            x0 = (int16_t)(x0 + stepX);
        }
        if (twice < spanX) {
            accum = (int16_t)(accum + spanX);
            y0 = (int16_t)(y0 + stepY);
        }
        draw68kPlotPixel(pixels, x0, y0, lowNib, highNib);
    }
}
#endif /* JOEYLIB_DRAW68K_INLINE_H */

255
src/shared68k/surface68k.s Normal file
View file

@ -0,0 +1,255 @@
| Shared 68000 fast paths for the chunky surface buffer.
|
| Both Amiga and Atari ST keep the JoeyLib stage / surfaces in the
| same 4bpp packed (2 px / byte, 160 bytes / row, 32000 bytes / surface)
| format -- the c2p / blit-to-screen step that converts to native
| pixel layout happens only at present time. So the per-byte buffer
| operations (fill, clear) can be shared across both ports.
|
| Calling convention: cdecl (gcc-amigaos / gcc-atari-mint).
| d0/d1/a0/a1 are caller-save (scratch).
| d2-d7/a2-a6 are callee-save (must be preserved).
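| 16-bit arguments are widened to a 4-byte int slot on the call stack;
| on big-endian 68k the value is the slot's low word (offset +2) and
| its low byte is at offset +3.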
|
| GAS m68k syntax. Assembled by the gcc driver via binutils m68k-as.
.text
| ----------------------------------------------------------------
| void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
|
| Fill 32000 bytes (SURFACE_PIXELS_SIZE) starting at `pixels` with
| the low byte of `fillByte`. The byte is replicated into a 32-bit
| longword and stored with an unrolled run of move.l, so each loop
| body writes 32 bytes instead of 1.
|
| `pixels` must be an even address: the stores are move.l, which
| the 68000 only accepts on word-aligned addresses.
| Clobbers d0/d1/a0/a1; d2 is saved and restored.
| ----------------------------------------------------------------
.globl _surface68kClearLong
| 32000 bytes = 8000 longs; 8 longs per pass = 1000 passes.
.equ SURF_CLEAR_BYTES, 32000
.equ SURF_CLEAR_LONGS, (SURF_CLEAR_BYTES / 4)
.equ SURF_CLEAR_PER_ITER, 8
.equ SURF_CLEAR_ITERS, (SURF_CLEAR_LONGS / SURF_CLEAR_PER_ITER)
| Save d2 (used as dbra counter). a0/a1/d0/d1 are
| caller-save so we may clobber them freely. Stack
| offset to args = 4 (saved d2) + 4 (return PC) = 8.
.equ CLR_SAVED, 4
_surface68kClearLong:
move.l %d2,-(%sp)
move.l 4+CLR_SAVED(%sp),%a0 | pixels
| Zero d0 BEFORE the move.b -- m68k move.b only touches
| the low byte of the destination, so d0[31..8] would
| otherwise hold whatever garbage the caller left in d0
| and pollute the OR-replicate chain below.
moveq #0,%d0
move.b 8+CLR_SAVED+3(%sp),%d0 | fillByte (low byte of int)
| Build a long with fillByte replicated four times.
| d0 = $000000FB -> $0000FBFB -> $FBFBFBFB
move.l %d0,%d1
lsl.l #8,%d1
or.l %d1,%d0 | d0 = $0000FBFB
move.l %d0,%d1
swap %d1 | d1 = $FBFB0000
or.l %d1,%d0 | d0 = $FBFBFBFB
move.l %a0,%a1 | dest cursor
| dbra runs the body count+1 times, hence ITERS - 1.
move.w #(SURF_CLEAR_ITERS - 1),%d2 | dbra count
.LclearLoop:
| SURF_CLEAR_PER_ITER (8) longword stores = 32 bytes per pass.
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
dbra %d2,.LclearLoop
move.l (%sp)+,%d2
rts
| ----------------------------------------------------------------
| void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h,
| uint16_t fillByte);
|
| Full-width (320 px = 160 byte) rectangle fill. `pixels` is the
| surface base; the row at `y` is `pixels + y * 160`. Fills `h`
| consecutive rows using move.l writes.
| ----------------------------------------------------------------
.globl _surface68kFillRectFull
.equ SURF_FRF_BYTES_PER_ROW, 160
| Save d2/d3 (used for fillByte replicate scratch and
| row counter). 8 bytes saved + 4 ret PC = 12 to args.
.equ FRF_SAVED, 8
_surface68kFillRectFull:
movem.l %d2-%d3,-(%sp)
move.l 4+FRF_SAVED(%sp),%a0 | pixels base
| Zero d0/d1 BEFORE move.b/move.w -- those only update
| sub-register portions and would otherwise carry the
| caller's garbage upper bits into the replicate OR.
moveq #0,%d0
moveq #0,%d1
moveq #0,%d2
move.w 8+FRF_SAVED+2(%sp),%d1 | y (low word)
move.w 12+FRF_SAVED+2(%sp),%d2 | h (low word)
move.b 16+FRF_SAVED+3(%sp),%d0 | fillByte
| Bail on degenerate height.
tst.w %d2
ble .Lfrf_done
| Build replicated fillByte in d0.
move.l %d0,%d3
lsl.l #8,%d3
or.l %d3,%d0
move.l %d0,%d3
swap %d3
or.l %d3,%d0 | d0 = byte * $01010101
| row pointer = pixels + y * 160 = pixels + y*128 + y*32
ext.l %d1
move.l %d1,%d3
lsl.l #7,%d3 | y * 128
lsl.l #5,%d1 | y * 32
add.l %d3,%d1 | y * 160
add.l %d1,%a0 | a0 = first row to fill
subq.w #1,%d2 | row dbra count
.Lfrf_rowLoop:
move.l %a0,%a1
| 40 longs per row, fully unrolled.
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
move.l %d0,(%a1)+
lea SURF_FRF_BYTES_PER_ROW(%a0),%a0
dbra %d2,.Lfrf_rowLoop
.Lfrf_done:
movem.l (%sp)+,%d2-%d3
rts
| ----------------------------------------------------------------
| void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes,
| uint16_t h, uint16_t fillByte);
|
| Per-row middle-bytes fill for a fillRect where the leading and
| trailing nibble fixups have already been done (or none are
| needed because x and w are both even). `rowFirst` points at the
| first FULLY-INSIDE byte of the rect on row 0; subsequent rows
| are at +160. `midBytes` is how many full bytes per row to fill.
|
| Each row is written as midBytes / 4 longwords followed by
| midBytes % 4 single bytes. The longword stores start at
| `rowFirst`, so it must be an even address whenever
| midBytes >= 4 (68000 move.l alignment).
| Clobbers d0/d1/a0/a1; d2-d6 are saved and restored.
| ----------------------------------------------------------------
.globl _surface68kFillRectByteAligned
.equ SURF_FRB_STRIDE, 160
| Save d2-d6 (5 regs = 20 bytes). Args at +20+4 = +24.
.equ FRB_SAVED, 20
_surface68kFillRectByteAligned:
movem.l %d2-%d6,-(%sp)
move.l 4+FRB_SAVED(%sp),%a0 | rowFirst
| Zero scratch regs BEFORE the sub-register loads --
| move.w / move.b only touch low portions, leaving
| caller's garbage in the upper bits which would
| otherwise pollute the replicate-OR chain below.
moveq #0,%d0
moveq #0,%d1
moveq #0,%d2
move.w 8+FRB_SAVED+2(%sp),%d1 | midBytes
move.w 12+FRB_SAVED+2(%sp),%d2 | h
move.b 16+FRB_SAVED+3(%sp),%d0 | fillByte
| Nothing to do for zero width. The height test is
| signed, so h == 0 and any h >= $8000 both skip.
tst.w %d1
beq .Lfrb_done
tst.w %d2
ble .Lfrb_done
| Build replicated fillByte in d0.
move.l %d0,%d3
lsl.l #8,%d3
or.l %d3,%d0
move.l %d0,%d3
swap %d3
or.l %d3,%d0
| Pre-compute long-count and trailing-byte residual.
| Both are the same for every row, so they live in
| d4/d5 and are copied into d6 as the loop counter.
move.w %d1,%d4
lsr.w #2,%d4 | longCount
move.w %d1,%d5
and.w #3,%d5 | trailBytes
| dbra runs the body count+1 times, hence h - 1.
subq.w #1,%d2 | row dbra count
.Lfrb_rowLoop:
move.l %a0,%a1 | a1 = write cursor for this row
move.w %d4,%d6 | longs left
| Skip the long loop when fewer than 4 bytes per row.
tst.w %d6
beq .Lfrb_tail
subq.w #1,%d6
.Lfrb_longLoop:
move.l %d0,(%a1)+
dbra %d6,.Lfrb_longLoop
.Lfrb_tail:
move.w %d5,%d6 | trailing bytes
| Skip the byte loop when midBytes is a multiple of 4.
tst.w %d6
beq .Lfrb_rowDone
subq.w #1,%d6
.Lfrb_byteLoop:
move.b %d0,(%a1)+
dbra %d6,.Lfrb_byteLoop
.Lfrb_rowDone:
| Advance the row base by one stride.
lea SURF_FRB_STRIDE(%a0),%a0
dbra %d2,.Lfrb_rowLoop
.Lfrb_done:
movem.l (%sp)+,%d2-%d6
rts