Major Atari ST work.

This commit is contained in:
Scott Duensing 2026-05-04 00:35:41 -05:00
parent b1e24b4650
commit 818dc801db
13 changed files with 3480 additions and 198 deletions

View file

@ -37,6 +37,7 @@ LIB_OBJS := \
$(patsubst $(SRC_PORT)/atarist/%.s,$(BUILD)/obj/port/%.o,$(PORT_S_SRCS)) \ $(patsubst $(SRC_PORT)/atarist/%.s,$(BUILD)/obj/port/%.o,$(PORT_S_SRCS)) \
$(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \ $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \
$(BUILD)/obj/codegen/spriteEmit68k.o \ $(BUILD)/obj/codegen/spriteEmit68k.o \
$(BUILD)/obj/codegen/spriteEmitInterleaved68k.o \
$(BUILD)/obj/codegen/spriteCompile.o $(BUILD)/obj/codegen/spriteCompile.o
LIB := $(LIBDIR)/libjoey.a LIB := $(LIBDIR)/libjoey.a

View file

@ -37,7 +37,7 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
#elif defined(JOEYLIB_PLATFORM_AMIGA) #elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitDrawPlanar68k(out, sp, shift); return spriteEmitDrawPlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST) #elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitDraw68k(out, sp, shift); return spriteEmitDrawInterleaved68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS) #elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitDrawIigs(out, sp, shift); return spriteEmitDrawIigs(out, sp, shift);
#else #else
@ -57,7 +57,7 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
#elif defined(JOEYLIB_PLATFORM_AMIGA) #elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitSavePlanar68k(out, sp, shift); return spriteEmitSavePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST) #elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitSave68k(out, sp, shift); return spriteEmitSaveInterleaved68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS) #elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitSaveIigs(out, sp, shift); return spriteEmitSaveIigs(out, sp, shift);
#else #else
@ -73,7 +73,7 @@ static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t sh
#elif defined(JOEYLIB_PLATFORM_AMIGA) #elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitRestorePlanar68k(out, sp, shift); return spriteEmitRestorePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST) #elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitRestore68k(out, sp, shift); return spriteEmitRestoreInterleaved68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS) #elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitRestoreIigs(out, sp, shift); return spriteEmitRestoreIigs(out, sp, shift);
#else #else
@ -139,6 +139,15 @@ bool spriteCompile(SpriteT *sp) {
free(scratch); free(scratch);
return false; return false;
} }
if (totalSize == 0) {
/* Platforms whose emitter returns 0 for every (shift, op) have
* no compiled bytes -- spriteCompiledDraw / SaveUnder /
* RestoreUnder would dereference a degenerate slot or chunky
* shadow. Bail so sp->slot stays NULL and the dispatcher
* routes through the interpreted halSpriteXxxPlanes path. */
free(scratch);
return false;
}
slot = codegenArenaAlloc(totalSize); slot = codegenArenaAlloc(totalSize);
if (slot == NULL) { if (slot == NULL) {
@ -684,6 +693,68 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes);
} }
#elif defined(JOEYLIB_PLATFORM_ATARIST)
/* ST word-interleaved planar runtime dispatch. The JIT routine takes
* one arg: groupBase = pd->base + y*160 + (x>>4)*8 (the address of
* the first 16-pixel group the sprite touches). It walks rows by
* adda.w #160 at the end of each row. Per (row, tile_col, plane) it
* emits up to one move.b / clr.b / andi.b+ori.b / ori.b chain at
* d16(a0).
*
* shift selection (in spriteInternal.h SPRITE_SHIFT_INDEX):
* 0 : byte-aligned x with x mod 16 == 0 (first tile col high half)
* 1 : byte-aligned x with x mod 16 == 8 (first tile col low half)
* 2+ : non-byte-aligned x, never compiled (emitter returns 0); the
* per-shift offset is SPRITE_NOT_COMPILED so the dispatcher
* falls back to halSpriteDrawPlanes. */
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*CompiledDrawFn)(uint8_t *firstGroup);
    uint8_t alignIdx;
    uint16_t entryOffset;
    uint8_t *planeBase;
    uint8_t *firstGroup;
    CompiledDrawFn entry;

    alignIdx = SPRITE_SHIFT_INDEX(x);
    entryOffset = sp->routineOffsets[alignIdx][SPRITE_OP_DRAW];
    if (entryOffset == SPRITE_NOT_COMPILED) {
        /* No JIT routine for this alignment (x mod 8 != 0). Because
         * COMPILED_SPRITE_WRITES_PLANES is 1 on ST, the dispatcher
         * suppresses the interpreted planes hook whenever it takes
         * the compiled path, so for uncompiled alignments we must
         * invoke the interpreted draw ourselves before returning. */
        halSpriteDrawPlanes(dst, sp, x, y);
        return;
    }
    planeBase = halSurfacePlanePtr(dst, 0);
    if (planeBase == NULL) {
        return;
    }
    /* Address of the first 16-pixel group the sprite touches:
     * base + y*160 + (x>>4)*8. The JIT routine advances rows on its
     * own (adda.w #160 per scanline), so this is its only argument. */
    firstGroup = planeBase
        + (uint16_t)y * 160u
        + (uint16_t)((uint16_t)x >> 4) * 8u;
    entry = (CompiledDrawFn)(codegenArenaBase() + sp->slot->offset + entryOffset);
    entry(firstGroup);
}
/* Save/Restore aren't compiled on ST yet (emitter returns 0). The
* dispatcher's check on sp->routineOffsets[shift][SPRITE_OP_SAVE/_RESTORE]
* == SPRITE_NOT_COMPILED already routes those through the
* interpreted halSpriteSavePlanes / halSpriteRestorePlanes. These
* stubs exist only to satisfy the linker. */
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    /* Linker stub: save-under is never compiled on ST (emitter
     * returns 0), so the dispatcher always takes the interpreted
     * halSpriteSavePlanes path and this body is never meaningful. */
    (void)backup;
    (void)y;
    (void)x;
    (void)sp;
    (void)src;
}
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    /* Linker stub: restore-under is never compiled on ST, so the
     * interpreted halSpriteRestorePlanes path always runs instead. */
    (void)backup;
    (void)dst;
}
#else #else
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {

View file

@ -0,0 +1,220 @@
// 68k sprite codegen for ST word-interleaved planar layout. Emits a
// cdecl-callable routine `void draw(uint8_t *groupBase)` that walks
// the sprite's tile data and writes plane bytes via `d16(a0)` chains.
//
// ST planar layout reminder (doc/atarist_planar.md): one buffer; per
// scanline 20 groups of 8 bytes; per group, 4 plane words back-to-
// back. groupBase points at the FIRST group the sprite touches:
// pd->base + y * 160 + (x >> 4) * 8
//
// Shift index for ST is bit 3 of x (whether the sprite starts in the
// high half or low half of the first group). x mod 8 != 0 falls back
// to the interpreter (returns 0 from this emitter so sp->slot stays
// NULL for those alignments).
//
// Per (row, tile_col, plane) we emit one of:
// * nothing (op byte = 0, all transparent)
// * move.b #pbN, d16(a0) (op = 0xFF, full replace, 6 bytes)
// * clr.b d16(a0) (op = 0xFF AND pbN = 0, 4 bytes)
// * andi.b #~op, d16(a0) (op partial, pbN = 0, 6 bytes)
// * ori.b #pbN, d16(a0) (op partial, pbN == op, 6 bytes)
// * andi.b #~op + ori.b #pbN (mixed, 12 bytes)
//
// d16 is the byte offset from groupBase to the target plane byte.
// Layout of the byte offset:
// shift 0: byteOff = (col >> 1) * 8 + plane*2 + (col & 1)
// shift 1: byteOff = ((col + 1) >> 1) * 8 + plane*2 + (1 - (col & 1))
// Each tile column is 8 sprite pixels = exactly half a 16-pixel
// group, alternating high (offset 0) and low (offset 1) bytes of
// each plane word.
//
// Per row we adda.w #160, a0 to advance to the next scanline.
#include "joey/sprite.h"
#include "joey/surface.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
// ----- Constants -----
#define TILE_PIXELS 8
#define TILE_BYTES 32
#define TILE_BYTES_PER_ROW 4
#define ST_BYTES_PER_ROW 160
// ----- Helpers -----
/* Store a 16-bit value in big-endian (68k) byte order at out.
 * Returns the number of bytes written (always 2) so callers can
 * advance a cursor with `cursor += writeBE16(...)`. */
static uint16_t writeBE16(uint8_t *out, uint16_t value) {
    *out++ = (uint8_t)(value >> 8);
    *out = (uint8_t)value;
    return 2u;
}
// Build the 4 plane bytes + opacity byte for one (row, tileCol)
// pair. pbN bit 7 is sprite pixel 0 (leftmost), bit 0 is pixel 7.
// op bit N is set iff that pixel's color != 0.
/* Convert one 8-pixel row slice of the sprite's chunky 4bpp tile
 * data into planar form: four plane bytes (bit 7 = leftmost sprite
 * pixel, bit 0 = pixel 7) plus an opacity byte whose bit N is set
 * iff pixel N's color index is non-zero.
 *
 * sp      : sprite whose tileData is read (4bpp, two pixels/byte,
 *           high nibble is the even pixel).
 * row     : sprite pixel row (0 .. heightTiles*8 - 1).
 * tileCol : tile column (0 .. widthTiles - 1).
 * outPb0..outPb3 : receive plane 0..3 bytes.
 * outOp   : receives the opacity byte. */
static void buildPlaneBytes(const SpriteT *sp, uint16_t row, uint16_t tileCol,
                            uint8_t *outPb0, uint8_t *outPb1,
                            uint8_t *outPb2, uint8_t *outPb3,
                            uint8_t *outOp) {
    uint16_t tileY = (uint16_t)(row >> 3);
    uint16_t inTileY = (uint16_t)(row & 7u);
    uint16_t wTiles = sp->widthTiles;
    /* Use the TILE_* layout constants instead of the magic literals
     * 32/4/8 so the addressing stays in sync with the layout macros
     * defined at the top of this file. */
    const uint8_t *tileBytes = sp->tileData + (uint32_t)(tileY * wTiles + tileCol) * (uint32_t)TILE_BYTES;
    const uint8_t *tileRow = tileBytes + (uint32_t)inTileY * TILE_BYTES_PER_ROW;
    uint8_t pb0 = 0u;
    uint8_t pb1 = 0u;
    uint8_t pb2 = 0u;
    uint8_t pb3 = 0u;
    uint8_t op = 0u;
    uint8_t p;
    uint8_t packed;
    uint8_t color;
    uint8_t bit;
    for (p = 0; p < TILE_PIXELS; p++) {
        /* Two pixels per byte: even pixel in the high nibble. */
        packed = tileRow[p >> 1];
        color = (p & 1u) ? (uint8_t)(packed & 0x0Fu) : (uint8_t)(packed >> 4);
        bit = (uint8_t)(0x80u >> p);
        if (color != 0u) {
            op = (uint8_t)(op | bit);
            /* Scatter the 4-bit color index across the four planes. */
            if (color & 1u) pb0 = (uint8_t)(pb0 | bit);
            if (color & 2u) pb1 = (uint8_t)(pb1 | bit);
            if (color & 4u) pb2 = (uint8_t)(pb2 | bit);
            if (color & 8u) pb3 = (uint8_t)(pb3 | bit);
        }
    }
    *outPb0 = pb0;
    *outPb1 = pb1;
    *outPb2 = pb2;
    *outPb3 = pb3;
    *outOp = op;
}
// Emit code for one plane byte at d16(a0). Returns bytes written.
// op=opacity byte, pb=plane byte (subset of op).
/* Emit 68k machine code updating one plane byte at d16(a0).
 * op = opacity byte (which pixels are drawn), pb = plane byte (a
 * subset of op). Returns the number of code bytes written at
 * out + cursor (0, 4, 6, or 12). Big-endian words are written
 * inline (high byte first) rather than through writeBE16. */
static uint16_t emitPlaneByte(uint8_t *out, uint16_t cursor, uint16_t d16, uint8_t op, uint8_t pb) {
    uint8_t *dst = out + cursor;
    uint16_t len = 0u;
    uint8_t clearBits = (uint8_t)(~op & 0xFFu);

    if (op == 0u) {
        /* Fully transparent: no code at all. */
        return 0u;
    }
    if (op == 0xFFu && pb == 0u) {
        /* All 8 pixels opaque, plane bits all zero:
         * clr.b d16(a0) -- opcode 0x4228 + d16, 4 bytes. */
        dst[len++] = 0x42u;
        dst[len++] = 0x28u;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
        return len;
    }
    if (op == 0xFFu) {
        /* All 8 pixels opaque: replace the whole byte.
         * move.b #pb, d16(a0) -- opcode 0x117C + imm word + d16, 6 bytes. */
        dst[len++] = 0x11u;
        dst[len++] = 0x7Cu;
        dst[len++] = 0x00u;
        dst[len++] = pb;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
        return len;
    }
    /* Partial opacity: clear bits where needed, then set bits where
     * needed. Each step is skipped when it would be a no-op, which
     * reproduces the andi-only / ori-only / andi+ori cases. */
    if (pb != op) {
        /* andi.b #~op, d16(a0) -- opcode 0x0228 + imm word + d16, 6 bytes. */
        dst[len++] = 0x02u;
        dst[len++] = 0x28u;
        dst[len++] = 0x00u;
        dst[len++] = clearBits;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
    }
    if (pb != 0u) {
        /* ori.b #pb, d16(a0) -- opcode 0x0028 + imm word + d16, 6 bytes. */
        dst[len++] = 0x00u;
        dst[len++] = 0x28u;
        dst[len++] = 0x00u;
        dst[len++] = pb;
        dst[len++] = (uint8_t)(d16 >> 8);
        dst[len++] = (uint8_t)(d16 & 0xFFu);
    }
    return len;
}
// ----- Emit API -----
/* Emit one complete JIT draw routine for an ST sprite at a
 * byte-aligned x position.
 *
 * out   : destination code buffer (caller sizes it via a prior pass).
 * sp    : sprite whose tile pixels are baked into immediates.
 * shift : alignment index from SPRITE_SHIFT_INDEX; only 0 and 1 are
 *         compiled (see below).
 *
 * Returns the number of code bytes written, or 0 when this shift is
 * not compiled so the dispatcher falls back to the interpreter. The
 * emitted routine is cdecl: void draw(uint8_t *groupBase). */
uint16_t spriteEmitDrawInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;   /* write position in out; also the final code size */
    uint16_t row;      /* sprite pixel row */
    uint16_t col;      /* 8-pixel tile column */
    uint16_t plane;    /* bitplane index 0..3 */
    uint16_t heightPx; /* sprite height in pixels */
    uint16_t wTiles;   /* sprite width in tile columns */
    uint8_t pb[4];     /* plane bytes for the current (row, col) */
    uint8_t op;        /* opacity byte: bit set = non-transparent pixel */
    /* Only shifts 0 and 1 emit code. shift 0 = first tile col in
     * high half (x mod 16 == 0). shift 1 = first tile col in low
     * half (x mod 16 == 8). Other byte alignments fall through to
     * the interpreter via halSpriteDrawPlanes. */
    if (shift > 1u) {
        return 0u;
    }
    cursor = 0u;
    heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    wTiles = sp->widthTiles;
    /* Prologue: movea.l 4(sp), a0. Opcode 0x206F + d16=4. 4 bytes.
     * (cdecl: the single groupBase argument sits just above the
     * return address on entry.) */
    cursor += writeBE16(out + cursor, 0x206Fu);
    cursor += writeBE16(out + cursor, 0x0004u);
    for (row = 0; row < heightPx; row++) {
        if (row > 0u) {
            /* adda.w #160, a0. Opcode 0xD0FC + imm word. 4 bytes.
             * Advances a0 one ST scanline (160 bytes). */
            cursor += writeBE16(out + cursor, 0xD0FCu);
            cursor += writeBE16(out + cursor, (uint16_t)ST_BYTES_PER_ROW);
        }
        for (col = 0; col < wTiles; col++) {
            buildPlaneBytes(sp, row, col, &pb[0], &pb[1], &pb[2], &pb[3], &op);
            if (op == 0u) {
                continue; /* whole tile column row is transparent: emit nothing */
            }
            for (plane = 0; plane < 4u; plane++) {
                uint16_t d16;
                if (shift == 0u) {
                    /* Each tile column is half a 16-pixel group:
                     * col 0 (high) -> +0, col 1 (low) -> +1, col 2
                     * (high of group 1) -> +8, ... plus plane*2 for
                     * the plane word within the group. */
                    d16 = (uint16_t)((col >> 1) * 8 + plane * 2 + (col & 1u));
                } else {
                    /* Shifted by 8px: col 0 (low) -> +1, col 1
                     * (high of group 1) -> +8, ... */
                    d16 = (uint16_t)(((col + 1u) >> 1) * 8 + plane * 2 + (1u - (col & 1u)));
                }
                cursor += emitPlaneByte(out, cursor, d16, op, pb[plane]);
            }
        }
    }
    /* Epilogue: rts. */
    cursor += writeBE16(out + cursor, 0x4E75u);
    return cursor;
}
/* Save / restore aren't implemented yet -- returning 0 so they fall
* through to the C interpreter (halSpriteSavePlanes / halSpriteRestorePlanes
* fast paths cover the byte-aligned case). */
uint16_t spriteEmitSaveInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    /* No ST save-under JIT yet: returning 0 marks every shift as
     * not-compiled so callers stay on the interpreted path. */
    (void)shift;
    (void)sp;
    (void)out;
    return 0u;
}
uint16_t spriteEmitRestoreInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    /* No ST restore-under JIT yet: always report zero code bytes so
     * the interpreted restore path handles every alignment. */
    (void)shift;
    (void)sp;
    (void)out;
    return 0u;
}

View file

@ -57,4 +57,15 @@ uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t sh
uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
// Word-interleaved planar 68k emitter (ST). Calling convention for
// the emitted bytes:
// void draw(uint8_t *groupBase);
// where groupBase = pd->base + y*160 + (x>>4)*8. Shifts 0 and 1 emit
// real bytes (x mod 16 == 0 for shift 0, x mod 16 == 8 for shift 1);
// other shifts return 0 so the cross-platform dispatcher falls back
// to halSpriteDrawPlanes.
uint16_t spriteEmitDrawInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitSaveInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestoreInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
#endif #endif

View file

@ -31,7 +31,11 @@
// paths still need the hooks unconditionally on every platform -- the // paths still need the hooks unconditionally on every platform -- the
// chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook // chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook
// is the only draw. // is the only draw.
#if defined(JOEYLIB_PLATFORM_AMIGA) /* ST also runs pure planar post-Phase-9 (s->pixels NULL); the JIT
* routine writes plane bytes directly, so the chunky interpreter
* is a no-op and the halSpriteDrawPlanes hook would be a redundant
* second draw. Same rationale as Amiga. */
#if defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
#define COMPILED_SPRITE_WRITES_PLANES 1 #define COMPILED_SPRITE_WRITES_PLANES 1
#else #else
#define COMPILED_SPRITE_WRITES_PLANES 0 #define COMPILED_SPRITE_WRITES_PLANES 0

View file

@ -16,9 +16,15 @@
// Per-platform shift index used by the dispatcher. Chunky 4bpp ports // Per-platform shift index used by the dispatcher. Chunky 4bpp ports
// store one nibble per pixel pair so the only sub-byte alignment is // store one nibble per pixel pair so the only sub-byte alignment is
// x % 2. Amiga planar packs 8 pixels per plane byte so all 8 // x % 2. Amiga planar packs 8 pixels per plane byte so all 8
// alignments matter. // alignments matter. ST word-interleaved planar groups 16 pixels
// per word; for byte-aligned x (x mod 8 == 0) the only meaningful
// distinction is high vs low byte of the plane word, which is bit
// 3 of x (== (x >> 3) & 1). Other shifts (x mod 8 != 0) emit 0
// from the JIT and route to the interpreter.
#if defined(JOEYLIB_PLATFORM_AMIGA) #if defined(JOEYLIB_PLATFORM_AMIGA)
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7)) #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7))
#elif defined(JOEYLIB_PLATFORM_ATARIST)
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)(((x) & 7) ? 2u : (uint8_t)(((x) >> 3) & 1u)))
#else #else
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1)) #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1))
#endif #endif

View file

@ -141,11 +141,13 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; /* Skip the chunky path on planar ports (pixels NULL). */
srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; if (dst->pixels != NULL && src->pixels != NULL) {
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
if (!halFastTileCopy(dstRow0, srcRow0)) { srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
copyTileOpaque(dstRow0, srcRow0); if (!halFastTileCopy(dstRow0, srcRow0)) {
copyTileOpaque(dstRow0, srcRow0);
}
} }
halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy); halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy);
surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
@ -173,11 +175,13 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; /* Skip the chunky path on planar ports (pixels NULL). */
srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; if (dst->pixels != NULL && src->pixels != NULL) {
dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
copyTileMasked(dstRow0, srcRow0, transparentIndex); if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
copyTileMasked(dstRow0, srcRow0, transparentIndex);
}
} }
halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex); halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex);
surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
@ -199,8 +203,9 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F));
if (!halFastTileFill(s, bx, by, if (s->pixels != NULL
(uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { && !halFastTileFill(s, bx, by,
(uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) {
uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
uint8_t i; uint8_t i;
for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) { for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) {
@ -232,16 +237,22 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
} }
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
src = &in->pixels[0]; src = &in->pixels[0];
if (!halFastTilePaste(dstRow, src)) { /* Skip the chunky write path on planar ports (dst->pixels NULL) --
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { * mirrors tileSnap's pixels-NULL short-circuit. Saves the dstRow
dstRow[0] = src[0]; * SURFACE_ROW_OFFSET multiply + halFastTilePaste jsr/rts per call
dstRow[1] = src[1]; * on ST/Amiga where the planar path below does the real work. */
dstRow[2] = src[2]; if (dst->pixels != NULL) {
dstRow[3] = src[3]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
dstRow += SURFACE_BYTES_PER_ROW; if (!halFastTilePaste(dstRow, src)) {
src += TILE_BYTES_PER_ROW; for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
dstRow[0] = src[0];
dstRow[1] = src[1];
dstRow[2] = src[2];
dstRow[3] = src[3];
dstRow += SURFACE_BYTES_PER_ROW;
src += TILE_BYTES_PER_ROW;
}
} }
} }
halTilePastePlanes(dst, bx, by, &in->pixels[0]); halTilePastePlanes(dst, bx, by, &in->pixels[0]);

View file

@ -39,6 +39,13 @@
#define ST_MFP_IMRA ((volatile uint8_t *)0xFFFFFA13L) #define ST_MFP_IMRA ((volatile uint8_t *)0xFFFFFA13L)
#define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL) #define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL)
// YM2149 (sound chip) supervisor-only ports. Index reg 7 (mixer)
// controls per-channel tone + noise enables; reg 8/9/A are volumes
// for channels A/B/C; regs 0/1, 2/3, 4/5 are tone period for those
// channels; reg 6 is noise period.
#define ST_YM_SELECT ((volatile uint8_t *)0xFFFF8800L)
#define ST_YM_DATA ((volatile uint8_t *)0xFFFF8802L)
#define MFP_TA_BIT 0x20 #define MFP_TA_BIT 0x20
#define MFP_TACR_STOP 0x00 #define MFP_TACR_STOP 0x00
#define MFP_TACR_DIV200 0x07 #define MFP_TACR_DIV200 0x07
@ -90,6 +97,32 @@ static long installTimerA(void) {
gNeedRefill[0] = 0; gNeedRefill[0] = 0;
gNeedRefill[1] = 0; gNeedRefill[1] = 0;
// YM2149 setup for PWM-via-volume on channel A:
// reg 7 (mixer): set bits 0 (tone A off) and 3 (noise A off);
// preserve bits 6+7 (I/O port directions, used
// by TOS for floppy / keyboard / printer).
// reg 8 (channel A volume): start at 0 to avoid a pop at start.
//
// Without the mixer setup, whatever state TOS left noise A in
// gets gated by our 12 kHz volume writes -- if noise A was on,
// a constant volume = constant hiss. Standard PWM-DAC trick is
// to disable both tone and noise so the volume reg is a pure
// 4-bit amplitude DAC.
//
// We can't reliably read back YM regs on the ST (the data port
// returns last-write, not register contents), so we OR in the
// disable bits over an assumed-safe TOS-default mask. Bit 6 set
// (port A output) matches stock TOS; bit 7 set (port B output)
// matches the centronics-printer direction TOS configures.
*ST_YM_SELECT = 7;
*ST_YM_DATA = 0xFF; // all tones + noises off; I/O ports A+B output (TOS default)
*ST_YM_SELECT = 8;
*ST_YM_DATA = 0; // channel A volume = 0 to avoid a pop at start
*ST_YM_SELECT = 9;
*ST_YM_DATA = 0; // channel B volume = 0
*ST_YM_SELECT = 10;
*ST_YM_DATA = 0; // channel C volume = 0
// MFP Timer A: stop, install our vector, set prescaler 200 + data // MFP Timer A: stop, install our vector, set prescaler 200 + data
// 1 (= 2.4576 MHz / 200 = 12288 Hz), then start. // 1 (= 2.4576 MHz / 200 = 12288 Hz), then start.
*ST_MFP_TACR = MFP_TACR_STOP; *ST_MFP_TACR = MFP_TACR_STOP;
@ -108,6 +141,10 @@ static long uninstallTimerA(void) {
(void)Setexc(VEC_MFP_TA, (long)gOldTimerAVec); (void)Setexc(VEC_MFP_TA, (long)gOldTimerAVec);
gOldTimerAVec = NULL; gOldTimerAVec = NULL;
} }
/* Silence channel A volume so handoff back to TOS is clean (no
* residual DC level on the speaker). */
*ST_YM_SELECT = 8;
*ST_YM_DATA = 0;
return 0; return 0;
} }

282
src/port/atarist/circle.s Normal file
View file

@ -0,0 +1,282 @@
| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled.
|
| Mirrors src/port/amiga/circle.s in spirit but for ST's single
| word-interleaved planar buffer:
| * Per scanline: 20 groups of 8 bytes; each group is 4 plane
| words back-to-back (p0_word, p1_word, p2_word, p3_word).
| * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15).
| * Plane N's word at row y, group g: base + y*160 + g*8 + N*2.
|
| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp40
| words) gives a branchless 4-plane RMW per pixel. 8 octants are
| inlined per Bresenham iter; no bsr.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStCircleOutline(uint8_t *base,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| Register allocation:
| d2.w = bx (Bresenham)
| d3.w = by (Bresenham)
| d4.w = err (Bresenham)
| d5.w = cx (cached)
| a4 = cy (cached, sign-extended)
| a3 = base
| a5 = bitMaskWordLut
| d0,d1,d6,d7 = scratch
|
| Scratch block (40 bytes) at sp+0..39. Each xp record is 8 bytes:
|   groupOff_word (2), bitMask_word (2), 4 bytes unused (the notMask
|   is derived from bitMask with not.w at plot time, not stored).
|   groupOff = (x >> 4) * 8 (byte offset of the group within a row)
|   bitMask  = 1 << (15 - (x & 15)) -- a WORD, since ST plane data
|   is word-wide (unlike Amiga's byte masks).
|
| sp+0..7:   xp1 record (cx + bx)
| sp+8..15:  xp2 record (cx - bx)
| sp+16..23: xp3 record (cx + by)
| sp+24..31: xp4 record (cx - by)
| sp+32..33: yp1_off = (cy + by) * 160
| sp+34..35: yp2_off = (cy - by) * 160
| sp+36..37: yp3_off = (cy + bx) * 160
| sp+38..39: yp4_off = (cy - bx) * 160
| Total: 40 bytes.
.text
| ---- BIT_MASK_WORD: build 1 << (15 - (x & 15)) ---------
| Look up via 16-entry table (a5 holds base). Cheaper than variable
| shift on 68000 (which is 8 + 2n cyc). Table is 32 bytes (16 words).
| Returns word in d_out.
| ---- XP_REC: build xp record at sp+slot for xp = cx <signOp> <xreg>
|   signOp: add or sub
|   xreg:   %d2 (bx) or %d3 (by)
|   slot:   0, 8, 16, or 24
|   Stores the groupOff word at slot(sp) and the bitMask word at
|   slot+2(sp); the record's remaining 4 bytes are left untouched.
|   Trashes: d6, d7 only (reads d5 = cx and a5 = LUT base)
.macro XP_REC slot, signOp, xreg
move.w %d5,%d6 | d6 = cx
\signOp\().w \xreg,%d6 | d6 = xp
move.w %d6,%d7
lsr.w #4,%d7 | d7 = group (xp >> 4)
lsl.w #3,%d7 | d7 = group * 8 (byte offset of group within row)
and.w #15,%d6 | d6 = xp & 15 (bit index 0..15)
add.w %d6,%d6 | d6 *= 2 (word index into LUT)
move.w (%a5,%d6.w),%d6 | d6 = bitMask word = 1 << (15 - (xp & 15))
move.w %d7,\slot(%sp) | groupOff word
move.w %d6,\slot+2(%sp) | bitMask word
.endm
| ---- YP_REC: store (yp * 160) at sp+slot ---------
| yp = cy <signOp> <yreg>; trashes d0, d6.
.macro YP_REC slot, signOp, yreg
move.l %a4,%d6
\signOp\().w \yreg,%d6 | d6.w = yp
move.w %d6,%d0
lsl.w #5,%d6 | d6 = yp << 5
lsl.w #7,%d0 | d0 = yp << 7
add.w %d6,%d0 | d0 = yp * 160
move.w %d0,\slot(%sp)
.endm
| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ----
|   slotYp: 32, 34, 36, or 38 (yp_off word slot)
|   slotXp: 0, 8, 16, or 24 (xp record slot)
|   color: literal 0..15 (each .if arm picks or.w to set the plane
|   bit or and.w to clear it, so there are no runtime branches)
|   Trashes: d0, d1, d7, a2 (a2 is saved by the entry movem.l, so
|   clobbering it inside the loop is safe)
.macro PLOT_FIXED slotYp, slotXp, color
move.w \slotYp(%sp),%d0 | d0 = yp_off (row * 160)
add.w \slotXp(%sp),%d0 | d0 += groupOff
move.w \slotXp+2(%sp),%d1 | d1 = bitMask word
move.w %d1,%d7
not.w %d7 | d7 = notMask (bitMask inverted)
lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff (group ptr)
| 4 plane word RMWs at (a2)+, postinc walks p0->p1->p2->p3
.if ((\color) & 1)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.if ((\color) & 2)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.if ((\color) & 4)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.if ((\color) & 8)
or.w %d1,(%a2)+
.else
and.w %d7,(%a2)+
.endif
.endm
| ---- PLOT_8: 8 octant pixels for hardcoded color ----
.macro PLOT_8 color
PLOT_FIXED 32, 0, \color | (cx+bx, cy+by)
PLOT_FIXED 32, 8, \color | (cx-bx, cy+by)
PLOT_FIXED 34, 0, \color | (cx+bx, cy-by)
PLOT_FIXED 34, 8, \color | (cx-bx, cy-by)
PLOT_FIXED 36, 16, \color | (cx+by, cy+bx)
PLOT_FIXED 36, 24, \color | (cx-by, cy+bx)
PLOT_FIXED 38, 16, \color | (cx+by, cy-bx)
PLOT_FIXED 38, 24, \color | (cx-by, cy-bx)
.endm
| ---- CO_BODY: full Bresenham loop body for hardcoded color ----
.macro CO_BODY color
XP_REC 0, add, %d2 | xp1 = cx+bx
XP_REC 8, sub, %d2 | xp2 = cx-bx
XP_REC 16, add, %d3 | xp3 = cx+by
XP_REC 24, sub, %d3 | xp4 = cx-by
YP_REC 32, add, %d3 | yp1 = (cy+by)*160
YP_REC 34, sub, %d3 | yp2 = (cy-by)*160
YP_REC 36, add, %d2 | yp3 = (cy+bx)*160
YP_REC 38, sub, %d2 | yp4 = (cy-bx)*160
PLOT_8 \color
addq.w #1,%d3
tst.w %d4
bgt .LcoStDecX_\color
add.w %d3,%d4
add.w %d3,%d4
addq.w #1,%d4
bra.w .LcoStLoop_\color
.LcoStDecX_\color:
subq.w #1,%d2
add.w %d3,%d4
add.w %d3,%d4
sub.w %d2,%d4
sub.w %d2,%d4
addq.w #1,%d4
bra.w .LcoStLoop_\color
.endm
.macro CO_LOOP_HDR color
.LcoStLoop_\color:
cmp.w %d3,%d2
bcs.w .LcoStDone
CO_BODY \color
.endm
| ---- Function entry ----
| Stack on entry (after movem.l of 11 regs + lea):
| sp+0..39: scratch (40 bytes)
| sp+40..83: movem (44 bytes)
| sp+84..87: return PC
| sp+88+0: base (uint8_t *)
| sp+88+4: cx (int promoted, .w at +88+4+2)
| sp+88+8: cy (int promoted, .w at +88+8+2)
| sp+88+12: r (int promoted, .w at +88+12+2)
| sp+88+16: color (int promoted, byte at +88+16+3)
.equ SP_SAVED, 44
.equ SP_LOCAL, 40
.equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL)
.equ SP_BASE, SP_OFF + 0
.equ SP_CX, SP_OFF + 4 + 2
.equ SP_CY, SP_OFF + 8 + 2
.equ SP_R, SP_OFF + 12 + 2
.equ SP_COLOR, SP_OFF + 16 + 3
.globl _surface68kStCircleOutline
_surface68kStCircleOutline:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_LOCAL(%sp),%sp
| Load base (a3) and bitMaskLut (a5).
move.l SP_BASE(%sp),%a3
lea bitMaskWordLut(%pc),%a5
| Cache cx in d5, cy (sign-extended) in a4.
move.w SP_CX(%sp),%d5
move.w SP_CY(%sp),%d6
ext.l %d6
movea.l %d6,%a4
| Bresenham init.
move.w SP_R(%sp),%d2 | bx = r
moveq #0,%d3 | by = 0
moveq #1,%d4
sub.w %d2,%d4 | err = 1 - bx
| Dispatch on color (low 4 bits) -> one of 16 main loops.
moveq #0,%d6
move.b SP_COLOR(%sp),%d6
and.w #0x0F,%d6
add.w %d6,%d6
add.w %d6,%d6 | * 4 for bra.w table
lea .LcoStTable(%pc),%a6
jmp 0(%a6,%d6.w)
.LcoStTable:
bra.w .LcoStLoop_0
bra.w .LcoStLoop_1
bra.w .LcoStLoop_2
bra.w .LcoStLoop_3
bra.w .LcoStLoop_4
bra.w .LcoStLoop_5
bra.w .LcoStLoop_6
bra.w .LcoStLoop_7
bra.w .LcoStLoop_8
bra.w .LcoStLoop_9
bra.w .LcoStLoop_10
bra.w .LcoStLoop_11
bra.w .LcoStLoop_12
bra.w .LcoStLoop_13
bra.w .LcoStLoop_14
bra.w .LcoStLoop_15
CO_LOOP_HDR 0
CO_LOOP_HDR 1
CO_LOOP_HDR 2
CO_LOOP_HDR 3
CO_LOOP_HDR 4
CO_LOOP_HDR 5
CO_LOOP_HDR 6
CO_LOOP_HDR 7
CO_LOOP_HDR 8
CO_LOOP_HDR 9
CO_LOOP_HDR 10
CO_LOOP_HDR 11
CO_LOOP_HDR 12
CO_LOOP_HDR 13
CO_LOOP_HDR 14
CO_LOOP_HDR 15
.LcoStDone:
lea SP_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
bitMaskWordLut:
.word 0x8000, 0x4000, 0x2000, 0x1000
.word 0x0800, 0x0400, 0x0200, 0x0100
.word 0x0080, 0x0040, 0x0020, 0x0010
.word 0x0008, 0x0004, 0x0002, 0x0001

View file

@ -0,0 +1,292 @@
| Atari ST word-interleaved planar fillCircle -- 68000 hand-rolled.
|
| Bresenham midpoint circle, 4 horizontal spans per Bresenham iter,
| paired by shared x-range so leftMask/rightMask are computed once
| per pair:
| Pair A: x in [cx-bx, cx+bx], rows y = cy+by, cy-by
| Pair B: x in [cx-by, cx+by], rows y = cy+bx, cy-bx
|
| Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r)
| is fully on-surface. Off-surface circles fall back to the C walker.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStFillCircle(uint8_t *base,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| (uint16_t/uint8_t args are promoted to int by the caller; hence
| the +2 / +3 byte offsets inside each 4-byte slot below.)
|
| Register allocation across the loop:
| d2.w = bx (Bresenham, starts at r)
| d3.w = by (Bresenham, starts at 0)
| d4.w = err
| d5.l = loLong (planes 0+1 long template)
| d6.l = hiLong (planes 2+3 long template)
| d7.b = color (low nibble; tested via btst)
| a3 = base
| a4 = scratch / current group pointer
| d0,d1 = scratch
|
| Stack scratch (8 bytes at 0(sp)..7(sp)):
| 0..1 leftMask (word; per pair)
| 2..3 rightMask (word; per pair)
| 4..5 numGroups (word; per pair)
| 6..7 groupFirstByteOff (word; per pair)
.text
.equ SP_FC_SAVED, 44
.equ SP_FC_LOCAL, 8
.equ SP_FC_OFF, (SP_FC_SAVED + 4 + SP_FC_LOCAL)
.equ SP_FC_BASE, SP_FC_OFF + 0
.equ SP_FC_CX, SP_FC_OFF + 4 + 2
.equ SP_FC_CY, SP_FC_OFF + 8 + 2
.equ SP_FC_R, SP_FC_OFF + 12 + 2
.equ SP_FC_COLOR, SP_FC_OFF + 16 + 3
| ---- COMPUTE_PAIR_MASKS macro -----------------------------------
| Input: d0.w = left, d1.w = right
| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
| 6(sp) groupFirstByteOff
| Trashes: d0, d1
| Requires: a5 = leftMaskLut, a6 = rightMaskLut (set up at entry).
| (No labels: straightline.)
.macro COMPUTE_PAIR_MASKS
move.w %d0,0(%sp) | stash left
move.w %d1,2(%sp) | stash right
| groupFirst & groupFirstByteOff (8 bytes per 16-pixel group)
move.w %d0,%d1
lsr.w #4,%d1 | groupFirst
move.w %d1,%d0
lsl.w #3,%d0 | groupFirstByteOff
move.w %d0,6(%sp)
| numGroups = (right >> 4) - groupFirst
move.w 2(%sp),%d0
lsr.w #4,%d0 | groupLast
sub.w %d1,%d0 | numGroups
move.w %d0,4(%sp)
| leftMask via LUT[bitFirst]; a5 = leftMaskLut base
move.w 0(%sp),%d0
and.w #15,%d0
add.w %d0,%d0
move.w (%a5,%d0.w),%d1
move.w %d1,0(%sp)
| rightMask via LUT[bitLast]; a6 = rightMaskLut base
move.w 2(%sp),%d0
and.w #15,%d0
add.w %d0,%d0
move.w (%a6,%d0.w),%d1
move.w %d1,2(%sp)
.endm
| ---- SPAN_BODY macro --------------------------------------------
| Render one row span using the pair masks at 0(sp)..7(sp).
| Input: d0.w = y (signed)
| a3 = base, d5 = loLong, d6 = hiLong, d7 = color
| Trashes: d0, d1, a4
| Labels use gas's \@ (unique per-expansion counter) so the macro
| can be expanded several times within one function.
.macro SPAN_BODY
| a4 = base + y*160 (y*160 = (y<<5)+(y<<7), no mul)
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | y*160
lea 0(%a3,%d0.l),%a4
| a4 += groupFirstByteOff
moveq #0,%d0
move.w 6(%sp),%d0
add.l %d0,%a4
| numGroups in d1; zero means span fits in one 16-pixel group
move.w 4(%sp),%d1
tst.w %d1
bne.s .Lsb_multi\@
| single-group: combinedMask = leftMask & rightMask
move.w 0(%sp),%d0
and.w 2(%sp),%d0
bsr .Lfc_applyMask
bra.w .Lsb_done\@
.Lsb_multi\@:
| leading mask. applyMask postinc-advances a4 by 8
| (the 4 plane RMWs each advance by 2 via (a4)+).
| applyMask trashes d1, so reload numGroups after bsr.
move.w 0(%sp),%d0
bsr .Lfc_applyMask
move.w 4(%sp),%d1 | reload numGroups
subq.w #1,%d1 | d1 = numMid
beq.s .Lsb_skipMid\@
.Lsb_midLoop\@:
move.l %d5,(%a4)+
move.l %d6,(%a4)+
subq.w #1,%d1
bne.s .Lsb_midLoop\@
.Lsb_skipMid\@:
| trailing mask
move.w 2(%sp),%d0
bsr .Lfc_applyMask
.Lsb_done\@:
.endm
| ---- _surface68kStFillCircle entry ------------------------------
.globl _surface68kStFillCircle
_surface68kStFillCircle:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_FC_LOCAL(%sp),%sp
| base, color
move.l SP_FC_BASE(%sp),%a3
moveq #0,%d7
move.b SP_FC_COLOR(%sp),%d7
| LUT bases (PC-relative indexed has only 8-bit
| displacement, so cache full pointers in a-regs).
lea leftMaskLut(%pc),%a5
lea rightMaskLut(%pc),%a6
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
| (high word = plane 0 word of a group, low word = plane 1)
moveq #0,%d5
btst #1,%d7
beq.s .Lfc_lo1
move.w #-1,%d5
.Lfc_lo1:
btst #0,%d7
beq.s .Lfc_lo0
ori.l #0xFFFF0000,%d5
.Lfc_lo0:
| hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0)
moveq #0,%d6
btst #3,%d7
beq.s .Lfc_hi3
move.w #-1,%d6
.Lfc_hi3:
btst #2,%d7
beq.s .Lfc_hi2
ori.l #0xFFFF0000,%d6
.Lfc_hi2:
| Bresenham init: bx=r, by=0, err=1-bx
move.w SP_FC_R(%sp),%d2
moveq #0,%d3
moveq #1,%d4
sub.w %d2,%d4
.Lfc_loop:
| loop while bx >= by (bcs = unsigned <; fine, both non-negative)
cmp.w %d3,%d2
bcs.w .Lfc_done
| --- Pair A: x range = (cx - bx, cx + bx)
move.w SP_FC_CX(%sp),%d0
move.w %d0,%d1
sub.w %d2,%d0 | left = cx - bx
add.w %d2,%d1 | right = cx + bx
COMPUTE_PAIR_MASKS
| Span A1: y = cy + by
move.w SP_FC_CY(%sp),%d0
add.w %d3,%d0
SPAN_BODY
| Span A2: y = cy - by
move.w SP_FC_CY(%sp),%d0
sub.w %d3,%d0
SPAN_BODY
| --- Pair B: x range = (cx - by, cx + by)
move.w SP_FC_CX(%sp),%d0
move.w %d0,%d1
sub.w %d3,%d0 | left = cx - by
add.w %d3,%d1 | right = cx + by
COMPUTE_PAIR_MASKS
| Span B1: y = cy + bx
move.w SP_FC_CY(%sp),%d0
add.w %d2,%d0
SPAN_BODY
| Span B2: y = cy - bx
move.w SP_FC_CY(%sp),%d0
sub.w %d2,%d0
SPAN_BODY
| --- Bresenham step
addq.w #1,%d3 | by++
tst.w %d4
bgt.s .Lfc_decBx
add.w %d3,%d4 | err += 2*by + 1
add.w %d3,%d4
addq.w #1,%d4
bra.w .Lfc_loop
.Lfc_decBx:
subq.w #1,%d2 | bx--
add.w %d3,%d4 | err += 2*(by - bx) + 1
add.w %d3,%d4
sub.w %d2,%d4
sub.w %d2,%d4
addq.w #1,%d4
bra.w .Lfc_loop
.Lfc_done:
lea SP_FC_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| ---- Apply 4-plane mask at (a4) -------------------------------
| Input: d0.w = mask, d7.b = color, a4 = group ptr
| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
| Trashes: d0, d1
| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
| Plane N: color bit N set -> OR mask, clear -> AND ~mask.
.Lfc_applyMask:
move.w %d0,%d1
not.w %d1 | d1 = notMask
btst #0,%d7
beq.s .Lfc_am0a
or.w %d0,(%a4)+
bra.s .Lfc_am1
.Lfc_am0a:
and.w %d1,(%a4)+
.Lfc_am1:
btst #1,%d7
beq.s .Lfc_am1a
or.w %d0,(%a4)+
bra.s .Lfc_am2
.Lfc_am1a:
and.w %d1,(%a4)+
.Lfc_am2:
btst #2,%d7
beq.s .Lfc_am2a
or.w %d0,(%a4)+
bra.s .Lfc_am3
.Lfc_am2a:
and.w %d1,(%a4)+
.Lfc_am3:
btst #3,%d7
beq.s .Lfc_am3a
or.w %d0,(%a4)+
rts
.Lfc_am3a:
and.w %d1,(%a4)+
rts
.align 2
| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
leftMaskLut:
.word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF
.word 0x0FFF, 0x07FF, 0x03FF, 0x01FF
.word 0x00FF, 0x007F, 0x003F, 0x001F
.word 0x000F, 0x0007, 0x0003, 0x0001
| rightMaskLut[i] = ~((1 << (15 - i)) - 1), indexed by bitLast (0..15)
rightMaskLut:
.word 0x8000, 0xC000, 0xE000, 0xF000
.word 0xF800, 0xFC00, 0xFE00, 0xFF00
.word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF

File diff suppressed because it is too large Load diff

853
src/port/atarist/lineSpan.s Normal file
View file

@ -0,0 +1,853 @@
| Atari ST word-interleaved planar drawLine -- 68000 hand-rolled.
|
| Bresenham line walker with 16-way color dispatch. Per pixel:
| * 4-plane word RMW with branchless OR/AND chosen at compile time
| * bit mask via 16-entry word table; group offset via (x>>4)<<3
| * y*160 = (y<<5)+(y<<7)
|
| Caller MUST guarantee the entire line lies on-surface (full clip
| precheck). Partial-clip lines fall back to the C walker.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStDrawLine(uint8_t *base,
| int16_t x0, int16_t y0,
| int16_t x1, int16_t y1,
| uint8_t color);
|
| (int16_t/uint8_t args are promoted to int by the caller; hence
| the +2 / +3 offsets inside each 4-byte slot below.)
|
| Register allocation in the inner loop:
| d2.w = x (current pixel)
| d3.w = y (current pixel)
| d4.w = err
| d5.w = dx (>= 0)
| d6.w = -dy_abs (<= 0; "Bresenham uses -dy")
| d7 = sx (long; moveq #1 or #-1, low word used for .w add)
| a4 = sy (long; sign-extended)
| a3 = base
| a5 = bitMaskWordLut
| a2 = scratch (per-pixel: base + byteOff)
| d0,d1 = scratch
|
| Stack scratch:
| sp+0..1 iter counter (max(dx, dy_abs) + 1)
.text
.equ SP_SAVED, 44
.equ SP_LOCAL, 4
.equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL)
.equ SP_BASE, SP_OFF + 0
.equ SP_X0, SP_OFF + 4 + 2
.equ SP_Y0, SP_OFF + 8 + 2
.equ SP_X1, SP_OFF + 12 + 2
.equ SP_Y1, SP_OFF + 16 + 2
.equ SP_COLOR, SP_OFF + 20 + 3
| ---- DL_PLOT: 4-plane word RMW for hardcoded color ----
| Inputs: d2.w = x, d3.w = y, a3 = base, a5 = bitMaskWordLut
| Trashes: d0, d1, a2
| \color is an assemble-time constant: the .if chain picks OR (bit
| set) or AND (bit clear) per plane, so no runtime color test.
.macro DL_PLOT color
| byteOff = y*160 + (x>>4)*8
move.w %d3,%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0 | y << 5
lsl.l #7,%d1 | y << 7
add.l %d1,%d0 | d0 = y * 160
move.w %d2,%d1
lsr.w #4,%d1
lsl.w #3,%d1 | (x>>4) * 8
ext.l %d1
add.l %d1,%d0 | d0 = byteOff
lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff
| d1 = bitMask, d0 = notMask
move.w %d2,%d1
and.w #15,%d1
add.w %d1,%d1
move.w (%a5,%d1.w),%d1
move.w %d1,%d0
not.w %d0
| per-plane RMW with postinc (drops 4 cyc per RMW vs
| displacement (d8,An) = 16 cyc, plain (An)+ = 12 cyc).
.if ((\color) & 1)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.if ((\color) & 2)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.if ((\color) & 4)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.if ((\color) & 8)
or.w %d1,(%a2)+
.else
and.w %d0,(%a2)+
.endif
.endm
| ---- DL_BODY: full Bresenham loop body for hardcoded color ----
| Iterates via the word counter at 0(sp); branches to the shared
| .LdlStDone epilogue when the counter hits zero.
.macro DL_BODY color
.LdlStLoop_\color:
DL_PLOT \color
| e2 = 2 * err
move.w %d4,%d0
add.w %d0,%d0 | d0 = e2
| if (e2 >= dy) { err += dy; x += sx; } (dy = -dy_abs <= 0)
cmp.w %d6,%d0
blt.s .LdlStNoX_\color
add.w %d6,%d4
add.w %d7,%d2
.LdlStNoX_\color:
| if (e2 <= dx) { err += dx; y += sy; }
cmp.w %d5,%d0
bgt.s .LdlStNoY_\color
add.w %d5,%d4
add.w %a4,%d3 | sy.w from a4
.LdlStNoY_\color:
subq.w #1,0(%sp)
bne.w .LdlStLoop_\color
bra.w .LdlStDone
.endm
| ---- _surface68kStDrawLine entry --------------------------------
.globl _surface68kStDrawLine
_surface68kStDrawLine:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_LOCAL(%sp),%sp
| Load base & lut.
move.l SP_BASE(%sp),%a3
lea bitMaskWordLut(%pc),%a5
| x = x0, y = y0
move.w SP_X0(%sp),%d2
move.w SP_Y0(%sp),%d3
| dx = abs(x1 - x0), sx = sign(x1 - x0)
move.w SP_X1(%sp),%d5
sub.w %d2,%d5 | d5 = x1 - x0
bge.s .LdlSxPos
neg.w %d5
moveq #-1,%d7
bra.s .LdlSxDone
.LdlSxPos:
moveq #1,%d7
.LdlSxDone:
| dy_abs in d6, sy in d0 (-> a4)
move.w SP_Y1(%sp),%d6
sub.w %d3,%d6 | d6 = y1 - y0
bge.s .LdlSyPos
neg.w %d6
moveq #-1,%d0
bra.s .LdlSyDone
.LdlSyPos:
moveq #1,%d0
.LdlSyDone:
ext.l %d0
movea.l %d0,%a4 | a4 = sy
| iter counter = max(dx, dy_abs) + 1
move.w %d5,%d0
cmp.w %d6,%d0
bge.s .LdlNitDone
move.w %d6,%d0
.LdlNitDone:
addq.w #1,%d0
move.w %d0,0(%sp)
| err = dx - dy_abs (== dx + dy where dy negative)
move.w %d5,%d4
sub.w %d6,%d4 | d4 = err
neg.w %d6 | d6 = -dy_abs (negative)
| Dispatch on color (low 4 bits) -> 16 specialized loops.
| (a6 is only needed transiently for the jump.)
moveq #0,%d0
move.b SP_COLOR(%sp),%d0
and.w #0x0F,%d0
add.w %d0,%d0
add.w %d0,%d0 | * 4 for bra.w table
lea .LdlStTable(%pc),%a6
jmp 0(%a6,%d0.w)
.LdlStTable:
bra.w .LdlStLoop_0
bra.w .LdlStLoop_1
bra.w .LdlStLoop_2
bra.w .LdlStLoop_3
bra.w .LdlStLoop_4
bra.w .LdlStLoop_5
bra.w .LdlStLoop_6
bra.w .LdlStLoop_7
bra.w .LdlStLoop_8
bra.w .LdlStLoop_9
bra.w .LdlStLoop_10
bra.w .LdlStLoop_11
bra.w .LdlStLoop_12
bra.w .LdlStLoop_13
bra.w .LdlStLoop_14
bra.w .LdlStLoop_15
DL_BODY 0
DL_BODY 1
DL_BODY 2
DL_BODY 3
DL_BODY 4
DL_BODY 5
DL_BODY 6
DL_BODY 7
DL_BODY 8
DL_BODY 9
DL_BODY 10
DL_BODY 11
DL_BODY 12
DL_BODY 13
DL_BODY 14
DL_BODY 15
.LdlStDone:
lea SP_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
bitMaskWordLut:
.word 0x8000, 0x4000, 0x2000, 0x1000
.word 0x0800, 0x0400, 0x0200, 0x0100
.word 0x0080, 0x0040, 0x0020, 0x0010
.word 0x0008, 0x0004, 0x0002, 0x0001
| ---- surface68kStFillSpan ---------------------------------------
|
| Single-row span fill: leading-mask group + middle long-fills +
| trailing-mask group, all in one frame. Caller pre-clips so the
| span is fully on-surface.
|
| void surface68kStFillSpan(uint8_t *base,
| int16_t left, int16_t right,
| int16_t y, uint8_t color);
|
| Caller guarantees: 0 <= left <= right < 320, 0 <= y < 200.
|
| Register layout:
| a3 = base
| a4 = current group pointer
| d2.w = leftMask (then trailing trampoline target)
| d3.w = rightMask
| d4.w = numGroups - 1 (middle iter count when > 0)
| d5.l = loLong (planes 0+1 long template)
| d6.l = hiLong (planes 2+3 long template)
| d7.b = color (low nibble; tested via btst)
| d0,d1 = scratch
.equ SP_FS_SAVED, 44
.equ SP_FS_OFF, (SP_FS_SAVED + 4)
.equ SP_FS_BASE, SP_FS_OFF + 0
.equ SP_FS_LEFT, SP_FS_OFF + 4 + 2
.equ SP_FS_RIGHT, SP_FS_OFF + 8 + 2
.equ SP_FS_Y, SP_FS_OFF + 12 + 2
.equ SP_FS_COLOR, SP_FS_OFF + 16 + 3
.globl _surface68kStFillSpan
_surface68kStFillSpan:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_FS_BASE(%sp),%a3
moveq #0,%d7
move.b SP_FS_COLOR(%sp),%d7 | d7 = color
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
moveq #0,%d5
btst #1,%d7
beq.s .LfsLoBit1
move.w #-1,%d5
.LfsLoBit1:
btst #0,%d7
beq.s .LfsLoBit0
ori.l #0xFFFF0000,%d5
.LfsLoBit0:
| hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0)
moveq #0,%d6
btst #3,%d7
beq.s .LfsHiBit3
move.w #-1,%d6
.LfsHiBit3:
btst #2,%d7
beq.s .LfsHiBit2
ori.l #0xFFFF0000,%d6
.LfsHiBit2:
| rowBase = base + y*160 -> a4 (y*160 = (y<<5)+(y<<7))
move.w SP_FS_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | d0 = y*160
lea 0(%a3,%d0.l),%a4
| left in d0, right in d1
move.w SP_FS_LEFT(%sp),%d0
move.w SP_FS_RIGHT(%sp),%d1
| bitFirst in d2, bitLast in d3
move.w %d0,%d2
and.w #15,%d2
move.w %d1,%d3
and.w #15,%d3
| a4 += groupFirst * 8
| numGroups = groupLast - groupFirst (in d4)
move.w %d0,%d4
lsr.w #4,%d4 | d4 = groupFirst
move.w %d4,%d0 | save groupFirst into d0
lsl.w #3,%d0 | d0 = groupFirst*8
ext.l %d0
add.l %d0,%a4
move.w %d1,%d0
lsr.w #4,%d0 | d0 = groupLast
sub.w %d4,%d0 | d0 = groupLast - groupFirst
move.w %d0,%d4 | d4 = numGroups
| leftMask = (1 << (16 - bitFirst)) - 1
| (register shift of 16 when bitFirst == 0 is legal on the 68000:
| 0x10000 - 1 = 0xFFFF, the all-ones mask, as intended)
moveq #16,%d0
sub.w %d2,%d0 | d0 = 16 - bitFirst (1..16)
moveq #1,%d2
lsl.l %d0,%d2 | 1 << (16 - bitFirst)
subq.l #1,%d2 | d2.w = leftMask
| rightMask = ~((1 << (15 - bitLast)) - 1)
moveq #15,%d0
sub.w %d3,%d0 | d0 = 15 - bitLast (0..15)
moveq #1,%d3
lsl.l %d0,%d3 | 1 << (15 - bitLast)
subq.l #1,%d3 | inverse mask
not.w %d3 | d3.w = rightMask
| If numGroups == 0, single-group: mask = leftMask & rightMask
tst.w %d4
bne.s .LfsMulti
and.w %d2,%d3 | d3 = combinedMask
move.w %d3,%d2
bsr.s .LfsApplyMask
bra.w .LfsDone
.LfsMulti:
| Leading mask (d2 already = leftMask)
bsr.s .LfsApplyMask
addq.l #8,%a4 | next group (applyMask does not advance a4)
| numMid = numGroups - 1
subq.w #1,%d4
beq.s .LfsTrailing
.LfsMidLoop:
move.l %d5,(%a4)+
move.l %d6,(%a4)+
subq.w #1,%d4
bne.s .LfsMidLoop
.LfsTrailing:
move.w %d3,%d2 | d2 = rightMask
bsr.s .LfsApplyMask
.LfsDone:
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| Apply 4-plane word RMW at (a4) using mask in d2 (or notMask in d0).
| Plane N: if (color bit N) OR mask else AND notMask.
| Inputs: a4, d2.w = mask, d7.b = color
| Trashes: d0
| Returns via rts. a4 is NOT advanced (displacement addressing);
| contrast with fillCircle's applyMask, which postincs by 8.
.LfsApplyMask:
move.w %d2,%d0
not.w %d0 | d0 = notMask
btst #0,%d7
beq.s .LfsAm0a
or.w %d2,(%a4)
bra.s .LfsAm1
.LfsAm0a:
and.w %d0,(%a4)
.LfsAm1:
btst #1,%d7
beq.s .LfsAm1a
or.w %d2,2(%a4)
bra.s .LfsAm2
.LfsAm1a:
and.w %d0,2(%a4)
.LfsAm2:
btst #2,%d7
beq.s .LfsAm2a
or.w %d2,4(%a4)
bra.s .LfsAm3
.LfsAm2a:
and.w %d0,4(%a4)
.LfsAm3:
btst #3,%d7
beq.s .LfsAm3a
or.w %d2,6(%a4)
rts
.LfsAm3a:
and.w %d0,6(%a4)
rts
| ---- surface68kStFillRectSingleGroup -----------------------------
|
| Fill rect when groupFirst == groupLast (thin/single-column rect).
| Caller pre-computes firstGroupPtr = base + y*160 + groupFirst*8
| and the mask = leftMask & rightMask.
|
| void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr,
| uint16_t mask,
| uint16_t h,
| uint8_t color);
|
| Dispatched on color (low nibble) -> 16 specialized loops with
| hardcoded OR/AND per plane. Inner loop is 4 plane word RMWs +
| advance row + branch. h == 0 is guarded at entry.
|
| drawLine V routes to fillRect 1xH which lands here.
.equ SP_FRG_SAVED, 24 | d2-d5/a2-a3 = 6 longs
.equ SP_FRG_OFF, (SP_FRG_SAVED + 4)
.equ SP_FRG_PTR, SP_FRG_OFF + 0
.equ SP_FRG_MASK, SP_FRG_OFF + 4 + 2
.equ SP_FRG_H, SP_FRG_OFF + 8 + 2
.equ SP_FRG_COLOR, SP_FRG_OFF + 12 + 3
| Per-row body: d3 = mask, d4 = notMask, d5 = row counter,
| a3 = group pointer (postincs through the 4 plane words).
.macro FRG_LOOP color
.Lfrg_loop_\color:
.if ((\color) & 1)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 2)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 4)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 8)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
lea 152(%a3),%a3 | a3 is 8 past the group after the postincs; +152 = same group, next row (160-8)
subq.w #1,%d5
bne.w .Lfrg_loop_\color
bra.w .Lfrg_done
.endm
.globl _surface68kStFillRectSingleGroup
_surface68kStFillRectSingleGroup:
movem.l %d2-%d5/%a2-%a3,-(%sp)
move.l SP_FRG_PTR(%sp),%a3
move.w SP_FRG_MASK(%sp),%d3
move.w SP_FRG_H(%sp),%d5
tst.w %d5
beq.w .Lfrg_done | h == 0: nothing to draw
move.w %d3,%d4
not.w %d4 | d4 = notMask
| Color dispatch
moveq #0,%d2
move.b SP_FRG_COLOR(%sp),%d2
and.w #0x0F,%d2
add.w %d2,%d2
add.w %d2,%d2 | * 4 for bra.w table
lea .Lfrg_table(%pc),%a2
jmp 0(%a2,%d2.w)
.Lfrg_table:
bra.w .Lfrg_loop_0
bra.w .Lfrg_loop_1
bra.w .Lfrg_loop_2
bra.w .Lfrg_loop_3
bra.w .Lfrg_loop_4
bra.w .Lfrg_loop_5
bra.w .Lfrg_loop_6
bra.w .Lfrg_loop_7
bra.w .Lfrg_loop_8
bra.w .Lfrg_loop_9
bra.w .Lfrg_loop_10
bra.w .Lfrg_loop_11
bra.w .Lfrg_loop_12
bra.w .Lfrg_loop_13
bra.w .Lfrg_loop_14
bra.w .Lfrg_loop_15
FRG_LOOP 0
FRG_LOOP 1
FRG_LOOP 2
FRG_LOOP 3
FRG_LOOP 4
FRG_LOOP 5
FRG_LOOP 6
FRG_LOOP 7
FRG_LOOP 8
FRG_LOOP 9
FRG_LOOP 10
FRG_LOOP 11
FRG_LOOP 12
FRG_LOOP 13
FRG_LOOP 14
FRG_LOOP 15
.Lfrg_done:
movem.l (%sp)+,%d2-%d5/%a2-%a3
rts
| ---- surface68kStFillRectMulti -------------------------------------
|
| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
| Dispatched on color (low nibble) -> 16 specialized H-row loops.
|
| void surface68kStFillRectMulti(uint8_t *base,
| int16_t x, int16_t y,
| uint16_t w, uint16_t h,
| uint8_t color);
|
| Per row body (per color C):
| 1. Leading mask: 4 hardcoded plane RMW with leftMask
| 2. Middle: numMid groups of 2 long-writes (loLong, hiLong)
| 3. Trailing mask: 4 hardcoded plane RMW with rightMask
| 4. Advance rowBase by 160; decrement h; loop.
|
| Register layout in inner loop:
| d2.w = leftMask d3.w = rightMask
| d4.w = ~leftMask d5.w = ~rightMask
| d6.l = loLong d7.l = hiLong
| a3 = rowBase (advances by 160 each iter)
| a4 = a_grp (per-row scratch)
| d0,d1 = scratch
|
| Stack scratch (4 bytes at sp+0):
| 0..1 numMid (word, reload per row for mid loop)
| 2..3 h (word, decrement per row)
.equ SP_FRM_SAVED, 44
.equ SP_FRM_LOCAL, 4
.equ SP_FRM_OFF, (SP_FRM_SAVED + 4 + SP_FRM_LOCAL)
.equ SP_FRM_BASE, SP_FRM_OFF + 0
.equ SP_FRM_X, SP_FRM_OFF + 4 + 2
.equ SP_FRM_Y, SP_FRM_OFF + 8 + 2
.equ SP_FRM_W, SP_FRM_OFF + 12 + 2
.equ SP_FRM_H, SP_FRM_OFF + 16 + 2
.equ SP_FRM_COLOR, SP_FRM_OFF + 20 + 3
.macro FRM_LOOP color
.LfrM_loop_\color:
| Leading mask at (a4)+, walking from row start
move.l %a3,%a4 | a4 = current row's groupFirst byte
.if ((\color) & 1)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
.if ((\color) & 2)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
.if ((\color) & 4)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
.if ((\color) & 8)
or.w %d2,(%a4)+
.else
and.w %d4,(%a4)+
.endif
| a4 now points to next group (8 bytes past row start).
| Middle long-fill (numMid reloaded from 0(sp) each row)
move.w 0(%sp),%d0
tst.w %d0 | (redundant after move.w, which sets CCR; harmless)
beq.s .LfrM_skipMid_\color
.LfrM_midLoop_\color:
move.l %d6,(%a4)+
move.l %d7,(%a4)+
subq.w #1,%d0
bne.s .LfrM_midLoop_\color
.LfrM_skipMid_\color:
| Trailing mask at (a4)+
.if ((\color) & 1)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
.if ((\color) & 2)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
.if ((\color) & 4)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
.if ((\color) & 8)
or.w %d3,(%a4)+
.else
and.w %d5,(%a4)+
.endif
| Advance row (a3 unchanged through the body)
lea 160(%a3),%a3
subq.w #1,2(%sp) | h--
bne.w .LfrM_loop_\color
bra.w .LfrM_done
.endm
| ---- _surface68kStFillRectMulti entry ---------------------------
| NOTE(review): h is not checked for zero before the subq/bne row
| loop -- h == 0 would wrap the counter. Presumably the pre-clip
| contract guarantees h >= 1; confirm against the C caller.
.globl _surface68kStFillRectMulti
_surface68kStFillRectMulti:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_FRM_LOCAL(%sp),%sp
| Load color, build loLong (d6) and hiLong (d7)
moveq #0,%d0
move.b SP_FRM_COLOR(%sp),%d0
moveq #0,%d6
btst #1,%d0
beq.s .LfrM_lo1
move.w #-1,%d6
.LfrM_lo1:
btst #0,%d0
beq.s .LfrM_lo0
ori.l #0xFFFF0000,%d6
.LfrM_lo0:
moveq #0,%d7
btst #3,%d0
beq.s .LfrM_hi3
move.w #-1,%d7
.LfrM_hi3:
btst #2,%d0
beq.s .LfrM_hi2
ori.l #0xFFFF0000,%d7
.LfrM_hi2:
| Compute group ptrs and masks
| groupFirst = x >> 4; groupFirstByteOff = groupFirst * 8
| bitFirst = x & 15
move.w SP_FRM_X(%sp),%d0
move.w SP_FRM_W(%sp),%d1
add.w %d0,%d1
subq.w #1,%d1 | d1 = x + w - 1 (last pixel)
| leftMask via LUT[bitFirst]
move.w %d0,%d2
and.w #15,%d2
add.w %d2,%d2
lea frmLeftMaskLut(%pc),%a2
move.w (%a2,%d2.w),%d2 | d2 = leftMask
move.w %d2,%d4
not.w %d4 | d4 = notLeftMask
| rightMask via LUT[bitLast]
move.w %d1,%d3
and.w #15,%d3
add.w %d3,%d3
lea frmRightMaskLut(%pc),%a2
move.w (%a2,%d3.w),%d3 | d3 = rightMask
move.w %d3,%d5
not.w %d5 | d5 = notRightMask
| numMid = (last >> 4) - (x >> 4) - 1
| (movea.w sign-extends, but x and lastPixel are 0..319 here,
| so the round-trip through a2 is exact)
move.w %d1,%a2 | a2.w = lastPixel (temp)
move.l %a2,%d1
lsr.w #4,%d1 | groupLast (low word)
move.w %d0,%a2
move.l %a2,%d0
lsr.w #4,%d0 | groupFirst
move.w %d0,%a4 | a4.w = groupFirst (save for byteOff calc)
sub.w %d0,%d1 | d1 = groupLast - groupFirst
subq.w #1,%d1 | d1 = numMid (>= 0 since multi-group caller)
move.w %d1,0(%sp) | numMid -> stack
| h -> stack
move.w SP_FRM_H(%sp),%d1
move.w %d1,2(%sp)
| a3 = base + y*160 + groupFirst*8
move.w SP_FRM_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | y*160
move.l SP_FRM_BASE(%sp),%a3
add.l %d0,%a3 | rowBase = base + y*160
move.l %a4,%d0 | groupFirst
lsl.w #3,%d0 | * 8
ext.l %d0
add.l %d0,%a3 | + groupFirst*8
| Dispatch on color
moveq #0,%d0
move.b SP_FRM_COLOR(%sp),%d0
and.w #0x0F,%d0
add.w %d0,%d0
add.w %d0,%d0
lea .LfrM_table(%pc),%a2
jmp 0(%a2,%d0.w)
.LfrM_table:
bra.w .LfrM_loop_0
bra.w .LfrM_loop_1
bra.w .LfrM_loop_2
bra.w .LfrM_loop_3
bra.w .LfrM_loop_4
bra.w .LfrM_loop_5
bra.w .LfrM_loop_6
bra.w .LfrM_loop_7
bra.w .LfrM_loop_8
bra.w .LfrM_loop_9
bra.w .LfrM_loop_10
bra.w .LfrM_loop_11
bra.w .LfrM_loop_12
bra.w .LfrM_loop_13
bra.w .LfrM_loop_14
bra.w .LfrM_loop_15
FRM_LOOP 0
FRM_LOOP 1
FRM_LOOP 2
FRM_LOOP 3
FRM_LOOP 4
FRM_LOOP 5
FRM_LOOP 6
FRM_LOOP 7
FRM_LOOP 8
FRM_LOOP 9
FRM_LOOP 10
FRM_LOOP 11
FRM_LOOP 12
FRM_LOOP 13
FRM_LOOP 14
FRM_LOOP 15
.LfrM_done:
lea SP_FRM_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| Same LUTs as in fillCircle.s; duplicated locally so each .o file's
| PC-rel lea can reach them within its own .text segment.
frmLeftMaskLut:
.word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF
.word 0x0FFF, 0x07FF, 0x03FF, 0x01FF
.word 0x00FF, 0x007F, 0x003F, 0x001F
.word 0x000F, 0x0007, 0x0003, 0x0001
frmRightMaskLut:
.word 0x8000, 0xC000, 0xE000, 0xF000
.word 0xF800, 0xFC00, 0xFE00, 0xFF00
.word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
| ---- surface68kStLongFill ----------------------------------------
|
| Bulk long-fill helper for full-row fills (surfaceClear, fillRect
| 320x200). Writes numGroups groups of 8 bytes (loLong, hiLong)
| starting at dst. Uses movem.l d2-d7 (3 groups = 24 bytes per
| batch) plus a tail pair to amortize loop overhead.
|
| void surface68kStLongFill(uint8_t *dst,
| uint16_t numGroups,
| uint32_t loLong,
| uint32_t hiLong);
|
| Per-batch cost: movem.l (56 cyc) + subq (8) + bne (10) = 74 cyc
| for 24 bytes -- ~3 cyc/byte vs ~5 cyc/byte for the straight C
| do-while of two move.l writes.
.equ SP_LF_SAVED, 24 | d2-d7 = 6 longs
.equ SP_LF_OFF, (SP_LF_SAVED + 4)
.equ SP_LF_DST, SP_LF_OFF + 0
.equ SP_LF_NGROUPS, SP_LF_OFF + 4 + 2
.equ SP_LF_LO, SP_LF_OFF + 8
.equ SP_LF_HI, SP_LF_OFF + 12
.globl _surface68kStLongFill
_surface68kStLongFill:
movem.l %d2-%d7,-(%sp)
move.l SP_LF_DST(%sp),%a0
move.l SP_LF_LO(%sp),%d2
move.l SP_LF_HI(%sp),%d3
move.w SP_LF_NGROUPS(%sp),%d0
| Set up d2-d7 = lo, hi, lo, hi, lo, hi (movem writes
| in d-reg order, so this gives the right alternation
| for 3 consecutive 8-byte groups).
move.l %d2,%d4
move.l %d2,%d6
move.l %d3,%d5
move.l %d3,%d7
| numBatches = numGroups / 3 (quotient), tail = remainder
| NOTE(review): ext.l sign-extends, so numGroups >= 0x8000 would
| feed divu a huge dividend and overflow. Fine for the 4000-group
| maximum of a full 320x200 screen (200 rows * 20 groups).
ext.l %d0
divu.w #3,%d0
move.l %d0,%d1
swap %d1 | d1.w = remainder (divu: quotient low, remainder high)
tst.w %d0 | quotient
beq.s .Llf_tail
.Llf_loop:
movem.l %d2-%d7,(%a0)
lea 24(%a0),%a0
subq.w #1,%d0
bne.s .Llf_loop
.Llf_tail:
| Remainder: 0, 1, or 2 groups of 8 bytes.
| (numGroups == 0 falls straight through: quotient and
| remainder are both zero, so nothing is written.)
tst.w %d1
beq.s .Llf_done
move.l %d2,(%a0)+
move.l %d3,(%a0)+
subq.w #1,%d1
beq.s .Llf_done
move.l %d2,(%a0)+
move.l %d3,(%a0)+
.Llf_done:
movem.l (%sp)+,%d2-%d7
rts

View file

@ -0,0 +1,202 @@
| ST byte-aligned sprite save / restore -- plane-major byte copies.
|
| Phase 10.5 dropped the chunky <-> planar conversion entirely: the
| save buffer now holds raw plane-major bytes (per row: plane0,
| plane1, plane2, plane3 for each 8-pixel tile column, w/8 tile
| columns per row), so save and restore are plain byte copies.
|
| The SP_LUT stack slot below is a leftover of the old 256-entry
| plane-spread LUT scheme. It is still loaded into a2 by both
| routines but never read afterwards.
| NOTE(review): the C signatures below list 6 parameters while
| SP_LUT addresses a 7th stack slot -- presumably the callers still
| pass the (now unused) LUT pointer; confirm against hal.c.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
|
| void surface68kStSpriteSaveByteAligned(uint8_t *base,
| uint16_t x, uint16_t y,
| uint16_t w, uint16_t h,
| uint8_t *dstChunky);
|
| void surface68kStSpriteRestoreByteAligned(uint8_t *base,
| uint16_t x, uint16_t y,
| uint16_t w, uint16_t h,
| const uint8_t *srcChunky);
.text
.equ SP_SAVED, 44
.equ SP_OFF, (SP_SAVED + 4)
.equ SP_BASE, SP_OFF + 0
.equ SP_X, SP_OFF + 4 + 2
.equ SP_Y, SP_OFF + 8 + 2
.equ SP_W, SP_OFF + 12 + 2
.equ SP_H, SP_OFF + 16 + 2
.equ SP_CHUNKY, SP_OFF + 20
.equ SP_LUT, SP_OFF + 24
| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
| a0 -> plane 0 byte (high or low half), strides 2 to next plane
| word within the 8-byte group
| a1 -> output planar bytes (advanced by 4)
|
| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
| plane-major bytes (per row: plane0, plane1, plane2, plane3 per
| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
| lookups + shifts + ORs.
.macro SAVE_TILECOL
move.b (%a0),(%a1)+ | plane 0
move.b 2(%a0),(%a1)+ | plane 1
move.b 4(%a0),(%a1)+ | plane 2
move.b 6(%a0),(%a1)+ | plane 3
.endm
| ---- _surface68kStSpriteSaveByteAligned -------------------------
| Copies a wx h byte-aligned sprite rectangle from the interleaved
| planar surface into a plane-major byte buffer (see SAVE_TILECOL).
| NOTE(review): no guards for w < 8 or h == 0 -- either would wrap
| the subq/bne counters. Presumably callers guarantee w is a
| multiple of 8 (>= 8) and h >= 1; confirm.
.globl _surface68kStSpriteSaveByteAligned
_surface68kStSpriteSaveByteAligned:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_BASE(%sp),%a3
move.l SP_CHUNKY(%sp),%a1
| LUT pointer comes in via stack arg -- guaranteed
| long-aligned because gcc passes ptr args via
| move.l on a long-aligned sp slot. Avoids the BSS
| misalignment problem on TOS .PRG (BSS pads only to
| 2 bytes, even uint32_t slots can land at mod-4 = 2).
| NOTE(review): a2 is never read below -- the LUT went away in
| phase 10.5, so this load (and the arg slot) is dead code.
move.l SP_LUT(%sp),%a2
move.w SP_W(%sp),%d5
lsr.w #3,%d5 | d5 = tileCols (w / 8)
move.w SP_H(%sp),%d6 | d6 = h
move.w SP_X(%sp),%d7
| a4 = base + y*160 + (x>>4)*8
move.w SP_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0 | y << 5
lsl.l #7,%d1 | y << 7
add.l %d1,%d0 | y * 160
lea 0(%a3,%d0.l),%a4
moveq #0,%d0
move.w %d7,%d0
lsr.w #4,%d0
lsl.w #3,%d0
ext.l %d0
add.l %d0,%a4
| Initial half offset: (x & 8) >> 3 = 0 or 1
| (big-endian plane word: high byte at +0, low byte at +1)
and.w #8,%d7
lsr.w #3,%d7
.LsaveRow:
move.w %d5,%d3 | d3 = tileCols
moveq #0,%d2
move.w %d7,%d2
lea 0(%a4,%d2.l),%a0 | a0 = first plane-0 byte
.LsaveCol:
SAVE_TILECOL
| Advance a0: bit 0 = 0 -> high, advance to low (+1).
| bit 0 = 1 -> low, advance to next group's high (+7).
move.l %a0,%d4
btst #0,%d4
bne.s .LsaveColWasLo
addq.l #1,%a0
bra.s .LsaveColNext
.LsaveColWasLo:
lea 7(%a0),%a0
.LsaveColNext:
subq.w #1,%d3
bne.w .LsaveCol
lea 160(%a4),%a4 | next surface row
subq.w #1,%d6
bne.w .LsaveRow
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
| a0 -> plane 0 byte (high or low half), +2 per plane word within
| the 8-byte group
| a1 -> input planar bytes (advanced by 4)
|
| Phase 10.5: dropped chunky -> planar conversion (no LUT needed).
| Buffer layout matches SAVE_TILECOL: per row, plane0/1/2/3 per
| tile col.
.macro RESTORE_TILECOL
move.b (%a1)+,(%a0) | plane 0
move.b (%a1)+,2(%a0) | plane 1
move.b (%a1)+,4(%a0) | plane 2
move.b (%a1)+,6(%a0) | plane 3
.endm
| ---- _surface68kStSpriteRestoreByteAligned ----------------------
| Inverse of the save routine: copies a plane-major byte buffer
| back into the interleaved planar surface (see RESTORE_TILECOL).
| NOTE(review): same missing w < 8 / h == 0 guards as the save
| routine -- presumably guaranteed by callers; confirm.
.globl _surface68kStSpriteRestoreByteAligned
_surface68kStSpriteRestoreByteAligned:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_BASE(%sp),%a3
move.l SP_CHUNKY(%sp),%a1
| NOTE(review): a2 is never read below -- the c2p LUT went away in
| phase 10.5; this load (and the arg slot) is dead code.
move.l SP_LUT(%sp),%a2 | gC2pLut passed in
| tileCols is held in a5. (The stated reason -- an older macro
| trashing d5 -- no longer applies to the byte-copy
| RESTORE_TILECOL, but keeping it in a5 is harmless.)
move.w SP_W(%sp),%d0
lsr.w #3,%d0
movea.w %d0,%a5
move.w SP_H(%sp),%d6
move.w SP_X(%sp),%d7
| a4 = base + y*160 + (x>>4)*8
move.w SP_Y(%sp),%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0
lea 0(%a3,%d0.l),%a4
moveq #0,%d0
move.w %d7,%d0
lsr.w #4,%d0
lsl.w #3,%d0
ext.l %d0
add.l %d0,%a4
| Initial half offset: (x & 8) >> 3 = 0 or 1
and.w #8,%d7
lsr.w #3,%d7
.LrestoreRow:
move.w %a5,%d3 | d3 = tileCols (from a5)
moveq #0,%d2
move.w %d7,%d2
lea 0(%a4,%d2.l),%a0
.LrestoreCol:
RESTORE_TILECOL
| Advance a0: even -> low half (+1), odd -> next group's high (+7).
move.l %a0,%d4
btst #0,%d4
bne.s .LrestoreColWasLo
addq.l #1,%a0
bra.s .LrestoreColNext
.LrestoreColWasLo:
lea 7(%a0),%a0
.LrestoreColNext:
subq.w #1,%d3
bne.w .LrestoreCol
lea 160(%a4),%a4 | next surface row
subq.w #1,%d6
bne.w .LrestoreRow
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts