diff --git a/make/atarist.mk b/make/atarist.mk index 38bc119..b516aa2 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -37,6 +37,7 @@ LIB_OBJS := \ $(patsubst $(SRC_PORT)/atarist/%.s,$(BUILD)/obj/port/%.o,$(PORT_S_SRCS)) \ $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \ $(BUILD)/obj/codegen/spriteEmit68k.o \ + $(BUILD)/obj/codegen/spriteEmitInterleaved68k.o \ $(BUILD)/obj/codegen/spriteCompile.o LIB := $(LIBDIR)/libjoey.a diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index 750283f..e051adc 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -37,7 +37,7 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift #elif defined(JOEYLIB_PLATFORM_AMIGA) return spriteEmitDrawPlanar68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_ATARIST) - return spriteEmitDraw68k(out, sp, shift); + return spriteEmitDrawInterleaved68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitDrawIigs(out, sp, shift); #else @@ -57,7 +57,7 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift #elif defined(JOEYLIB_PLATFORM_AMIGA) return spriteEmitSavePlanar68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_ATARIST) - return spriteEmitSave68k(out, sp, shift); + return spriteEmitSaveInterleaved68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitSaveIigs(out, sp, shift); #else @@ -73,7 +73,7 @@ static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t sh #elif defined(JOEYLIB_PLATFORM_AMIGA) return spriteEmitRestorePlanar68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_ATARIST) - return spriteEmitRestore68k(out, sp, shift); + return spriteEmitRestoreInterleaved68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitRestoreIigs(out, sp, shift); #else @@ -139,6 +139,15 @@ bool spriteCompile(SpriteT *sp) { free(scratch); return false; } + if (totalSize == 0) { + /* Platforms whose emitter 
returns 0 for every (shift, op) have + * no compiled bytes -- spriteCompiledDraw / SaveUnder / + * RestoreUnder would dereference a degenerate slot or chunky + * shadow. Bail so sp->slot stays NULL and the dispatcher + * routes through the interpreted halSpriteXxxPlanes path. */ + free(scratch); + return false; + } slot = codegenArenaAlloc(totalSize); if (slot == NULL) { @@ -684,6 +693,68 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); } +#elif defined(JOEYLIB_PLATFORM_ATARIST) + +/* ST word-interleaved planar runtime dispatch. The JIT routine takes + * one arg: groupBase = pd->base + y*160 + (x>>4)*8 (the address of + * the first 16-pixel group the sprite touches). It walks rows by + * adda.w #160 at the end of each row. Per (row, tile_col, plane) it + * emits up to one move.b / clr.b / andi.b+ori.b / ori.b chain at + * d16(a0). + * + * shift selection (in spriteInternal.h SPRITE_SHIFT_INDEX): + * 0 : byte-aligned x with x mod 16 == 0 (first tile col high half) + * 1 : byte-aligned x with x mod 16 == 8 (first tile col low half) + * 2+ : non-byte-aligned x, never compiled (emitter returns 0); the + * per-shift offset is SPRITE_NOT_COMPILED so the dispatcher + * falls back to halSpriteDrawPlanes. */ +void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { + typedef void (*DrawFn)(uint8_t *groupBase); + uint8_t shift; + uint16_t routeOffset; + uint8_t *base; + uint8_t *groupBase; + DrawFn fn; + + shift = SPRITE_SHIFT_INDEX(x); + routeOffset = sp->routineOffsets[shift][SPRITE_OP_DRAW]; + if (routeOffset == SPRITE_NOT_COMPILED) { + /* Non-byte-aligned x: cross-platform spriteDraw will call + * halSpriteDrawPlanes after this returns (since the dispatcher + * already chose the compiled path based on sp->slot != NULL, + * but COMPILED_SPRITE_WRITES_PLANES is 1 on ST so it normally + * suppresses the planes hook). 
For non-aligned shifts we + * deliberately want the interpreted planes hook to run, so + * delegate via halSpriteDrawPlanes here. */ + halSpriteDrawPlanes(dst, sp, x, y); + return; + } + base = halSurfacePlanePtr(dst, 0); + if (base == NULL) { + return; + } + groupBase = base + + (uint16_t)y * 160u + + (uint16_t)((uint16_t)x >> 4) * 8u; + fn = (DrawFn)(codegenArenaBase() + sp->slot->offset + routeOffset); + fn(groupBase); +} + + +/* Save/Restore aren't compiled on ST yet (emitter returns 0). The + * dispatcher's check on sp->routineOffsets[shift][SPRITE_OP_SAVE/_RESTORE] + * == SPRITE_NOT_COMPILED already routes those through the + * interpreted halSpriteSavePlanes / halSpriteRestorePlanes. These + * stubs exist only to satisfy the linker. */ +void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { + (void)src; (void)sp; (void)x; (void)y; (void)backup; +} + + +void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { + (void)dst; (void)backup; +} + #else void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { diff --git a/src/codegen/spriteEmitInterleaved68k.c b/src/codegen/spriteEmitInterleaved68k.c new file mode 100644 index 0000000..32a7f5c --- /dev/null +++ b/src/codegen/spriteEmitInterleaved68k.c @@ -0,0 +1,220 @@ +// 68k sprite codegen for ST word-interleaved planar layout. Emits a +// cdecl-callable routine `void draw(uint8_t *groupBase)` that walks +// the sprite's tile data and writes plane bytes via `d16(a0)` chains. +// +// ST planar layout reminder (doc/atarist_planar.md): one buffer; per +// scanline 20 groups of 8 bytes; per group, 4 plane words back-to- +// back. groupBase points at the FIRST group the sprite touches: +// pd->base + y * 160 + (x >> 4) * 8 +// +// Shift index for ST is bit 3 of x (whether the sprite starts in the +// high half or low half of the first group). 
x mod 8 != 0 falls back +// to the interpreter (returns 0 from this emitter so sp->slot stays +// NULL for those alignments). +// +// Per (row, tile_col, plane) we emit one of: +// * nothing (op byte = 0, all transparent) +// * move.b #pbN, d16(a0) (op = 0xFF, full replace, 6 bytes) +// * clr.b d16(a0) (op = 0xFF AND pbN = 0, 4 bytes) +// * andi.b #~op, d16(a0) (op partial, pbN = 0, 6 bytes) +// * ori.b #pbN, d16(a0) (op partial, pbN == op, 6 bytes) +// * andi.b #~op + ori.b #pbN (mixed, 12 bytes) +// +// d16 is the byte offset from groupBase to the target plane byte. +// Layout of the byte offset: +// shift 0: byteOff = (col >> 1) * 8 + plane*2 + (col & 1) +// shift 1: byteOff = ((col + 1) >> 1) * 8 + plane*2 + (1 - (col & 1)) +// Each tile column is 8 sprite pixels = exactly half a 16-pixel +// group, alternating high (offset 0) and low (offset 1) bytes of +// each plane word. +// +// Per row we adda.w #160, a0 to advance to the next scanline. + +#include "joey/sprite.h" +#include "joey/surface.h" +#include "spriteEmitter.h" +#include "spriteInternal.h" + + +// ----- Constants ----- + +#define TILE_PIXELS 8 +#define TILE_BYTES 32 +#define TILE_BYTES_PER_ROW 4 +#define ST_BYTES_PER_ROW 160 + + +// ----- Helpers ----- + +static uint16_t writeBE16(uint8_t *out, uint16_t value) { + out[0] = (uint8_t)(value >> 8); + out[1] = (uint8_t)(value & 0xFFu); + return 2; +} + + +// Build the 4 plane bytes + opacity byte for one (row, tileCol) +// pair. pbN bit 7 is sprite pixel 0 (leftmost), bit 0 is pixel 7. +// op bit N is set iff that pixel's color != 0. 
+static void buildPlaneBytes(const SpriteT *sp, uint16_t row, uint16_t tileCol, + uint8_t *outPb0, uint8_t *outPb1, + uint8_t *outPb2, uint8_t *outPb3, + uint8_t *outOp) { + uint16_t tileY = (uint16_t)(row >> 3); + uint16_t inTileY = (uint16_t)(row & 7u); + uint16_t wTiles = sp->widthTiles; + const uint8_t *tileBytes = sp->tileData + (uint32_t)(tileY * wTiles + tileCol) * 32u; + const uint8_t *tileRow = tileBytes + (uint32_t)inTileY * TILE_BYTES_PER_ROW; + uint8_t pb0 = 0u; + uint8_t pb1 = 0u; + uint8_t pb2 = 0u; + uint8_t pb3 = 0u; + uint8_t op = 0u; + uint8_t p; + uint8_t b; + uint8_t color; + uint8_t bit; + + for (p = 0; p < 8u; p++) { + b = tileRow[p >> 1]; + color = (p & 1u) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); + bit = (uint8_t)(0x80u >> p); + if (color != 0u) { + op = (uint8_t)(op | bit); + if (color & 1u) pb0 = (uint8_t)(pb0 | bit); + if (color & 2u) pb1 = (uint8_t)(pb1 | bit); + if (color & 4u) pb2 = (uint8_t)(pb2 | bit); + if (color & 8u) pb3 = (uint8_t)(pb3 | bit); + } + } + *outPb0 = pb0; + *outPb1 = pb1; + *outPb2 = pb2; + *outPb3 = pb3; + *outOp = op; +} + + +// Emit code for one plane byte at d16(a0). Returns bytes written. +// op=opacity byte, pb=plane byte (subset of op). +static uint16_t emitPlaneByte(uint8_t *out, uint16_t cursor, uint16_t d16, uint8_t op, uint8_t pb) { + uint16_t start = cursor; + + if (op == 0u) { + return 0u; /* nothing to emit */ + } + if (op == 0xFFu) { + /* All 8 pixels opaque: replace the byte. */ + if (pb == 0u) { + /* clr.b d16(a0). Opcode 0x4228 + d16. 4 bytes. */ + cursor += writeBE16(out + cursor, 0x4228u); + cursor += writeBE16(out + cursor, d16); + } else { + /* move.b #pb, d16(a0). Opcode 0x117C + #imm word + d16. 6 bytes. */ + cursor += writeBE16(out + cursor, 0x117Cu); + cursor += writeBE16(out + cursor, (uint16_t)pb); + cursor += writeBE16(out + cursor, d16); + } + return (uint16_t)(cursor - start); + } + /* Partial opacity. pb is a subset of op. 
*/ + if (pb == 0u) { + /* All opaque pixels have plane bit 0: just clear those bits. */ + /* andi.b #~op, d16(a0). Opcode 0x0228 + #imm word + d16. 6 bytes. */ + cursor += writeBE16(out + cursor, 0x0228u); + cursor += writeBE16(out + cursor, (uint16_t)(~op & 0xFFu)); + cursor += writeBE16(out + cursor, d16); + return (uint16_t)(cursor - start); + } + if (pb == op) { + /* All opaque pixels have plane bit 1: just set those bits. */ + /* ori.b #op, d16(a0). Opcode 0x0028 + #imm word + d16. 6 bytes. */ + cursor += writeBE16(out + cursor, 0x0028u); + cursor += writeBE16(out + cursor, (uint16_t)op); + cursor += writeBE16(out + cursor, d16); + return (uint16_t)(cursor - start); + } + /* Mixed: clear opaque bits, then set the plane bits. */ + cursor += writeBE16(out + cursor, 0x0228u); + cursor += writeBE16(out + cursor, (uint16_t)(~op & 0xFFu)); + cursor += writeBE16(out + cursor, d16); + cursor += writeBE16(out + cursor, 0x0028u); + cursor += writeBE16(out + cursor, (uint16_t)pb); + cursor += writeBE16(out + cursor, d16); + return (uint16_t)(cursor - start); +} + + +// ----- Emit API ----- + +uint16_t spriteEmitDrawInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t plane; + uint16_t heightPx; + uint16_t wTiles; + uint8_t pb[4]; + uint8_t op; + + /* Only shifts 0 and 1 emit code. shift 0 = first tile col in + * high half (x mod 16 == 0). shift 1 = first tile col in low + * half (x mod 16 == 8). Other byte alignments fall through to + * the interpreter via halSpriteDrawPlanes. */ + if (shift > 1u) { + return 0u; + } + + cursor = 0u; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + wTiles = sp->widthTiles; + + /* Prologue: movea.l 4(sp), a0. Opcode 0x206F + d16=4. 4 bytes. */ + cursor += writeBE16(out + cursor, 0x206Fu); + cursor += writeBE16(out + cursor, 0x0004u); + + for (row = 0; row < heightPx; row++) { + if (row > 0u) { + /* adda.w #160, a0. Opcode 0xD0FC + imm word. 4 bytes. 
*/ + cursor += writeBE16(out + cursor, 0xD0FCu); + cursor += writeBE16(out + cursor, (uint16_t)ST_BYTES_PER_ROW); + } + for (col = 0; col < wTiles; col++) { + buildPlaneBytes(sp, row, col, &pb[0], &pb[1], &pb[2], &pb[3], &op); + if (op == 0u) { + continue; /* whole tile column row is transparent */ + } + for (plane = 0; plane < 4u; plane++) { + uint16_t d16; + if (shift == 0u) { + /* col 0 (high) -> +0, col 1 (low) -> +1, col 2 + * (high group 1) -> +8, ... */ + d16 = (uint16_t)((col >> 1) * 8 + plane * 2 + (col & 1u)); + } else { + /* col 0 (low) -> +1, col 1 (high group 1) -> +8, ... */ + d16 = (uint16_t)(((col + 1u) >> 1) * 8 + plane * 2 + (1u - (col & 1u))); + } + cursor += emitPlaneByte(out, cursor, d16, op, pb[plane]); + } + } + } + + /* Epilogue: rts. */ + cursor += writeBE16(out + cursor, 0x4E75u); + return cursor; +} + + +/* Save / restore aren't implemented yet -- returning 0 so they fall + * through to the C interpreter (halSpriteSavePlanes / halSpriteRestorePlanes + * fast paths cover the byte-aligned case). */ +uint16_t spriteEmitSaveInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + (void)out; (void)sp; (void)shift; + return 0u; +} + + +uint16_t spriteEmitRestoreInterleaved68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + (void)out; (void)sp; (void)shift; + return 0u; +} diff --git a/src/codegen/spriteEmitter.h b/src/codegen/spriteEmitter.h index acd7169..694fe01 100644 --- a/src/codegen/spriteEmitter.h +++ b/src/codegen/spriteEmitter.h @@ -57,4 +57,15 @@ uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t sh uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +// Word-interleaved planar 68k emitter (ST). Calling convention for +// the emitted bytes: +// void draw(uint8_t *groupBase); +// where groupBase = pd->base + y*160 + (x>>4)*8. 
Shifts 0 and 1 emit +// real bytes (x mod 16 == 0 for shift 0, x mod 16 == 8 for shift 1); +// other shifts return 0 so the cross-platform dispatcher falls back +// to halSpriteDrawPlanes. +uint16_t spriteEmitDrawInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitSaveInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitRestoreInterleaved68k (uint8_t *out, const SpriteT *sp, uint8_t shift); + #endif diff --git a/src/core/sprite.c b/src/core/sprite.c index 177ca53..0fea341 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -31,7 +31,11 @@ // paths still need the hooks unconditionally on every platform -- the // chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook // is the only draw. -#if defined(JOEYLIB_PLATFORM_AMIGA) +/* ST also runs pure planar post-Phase-9 (s->pixels NULL); the JIT + * routine writes plane bytes directly, so the chunky interpreter + * is a no-op and the halSpriteDrawPlanes hook would be a redundant + * second draw. Same rationale as Amiga. */ +#if defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) #define COMPILED_SPRITE_WRITES_PLANES 1 #else #define COMPILED_SPRITE_WRITES_PLANES 0 diff --git a/src/core/spriteInternal.h b/src/core/spriteInternal.h index 99a6bd5..d2059c9 100644 --- a/src/core/spriteInternal.h +++ b/src/core/spriteInternal.h @@ -16,9 +16,15 @@ // Per-platform shift index used by the dispatcher. Chunky 4bpp ports // store one nibble per pixel pair so the only sub-byte alignment is // x % 2. Amiga planar packs 8 pixels per plane byte so all 8 -// alignments matter. +// alignments matter. ST word-interleaved planar groups 16 pixels +// per word; for byte-aligned x (x mod 8 == 0) the only meaningful +// distinction is high vs low byte of the plane word, which is bit +// 3 of x (== (x >> 3) & 1). Other shifts (x mod 8 != 0) emit 0 +// from the JIT and route to the interpreter. 
#if defined(JOEYLIB_PLATFORM_AMIGA) #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7)) +#elif defined(JOEYLIB_PLATFORM_ATARIST) +#define SPRITE_SHIFT_INDEX(x) ((uint8_t)(((x) & 7) ? 2u : (uint8_t)(((x) >> 3) & 1u))) #else #define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1)) #endif diff --git a/src/core/tile.c b/src/core/tile.c index d84b585..16e36ef 100644 --- a/src/core/tile.c +++ b/src/core/tile.c @@ -141,11 +141,13 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); - dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; - srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; - - if (!halFastTileCopy(dstRow0, srcRow0)) { - copyTileOpaque(dstRow0, srcRow0); + /* Skip the chunky path on planar ports (pixels NULL). */ + if (dst->pixels != NULL && src->pixels != NULL) { + dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; + if (!halFastTileCopy(dstRow0, srcRow0)) { + copyTileOpaque(dstRow0, srcRow0); + } } halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, @@ -173,11 +175,13 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); - dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; - srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; - - if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { - copyTileMasked(dstRow0, srcRow0, transparentIndex); + /* Skip the chunky path on planar ports (pixels NULL). 
*/ + if (dst->pixels != NULL && src->pixels != NULL) { + dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; + if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { + copyTileMasked(dstRow0, srcRow0, transparentIndex); + } } halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, @@ -199,8 +203,9 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); - if (!halFastTileFill(s, bx, by, - (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { + if (s->pixels != NULL + && !halFastTileFill(s, bx, by, + (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; uint8_t i; for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) { @@ -232,16 +237,22 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) { } pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); - dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; src = &in->pixels[0]; - if (!halFastTilePaste(dstRow, src)) { - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - dstRow[0] = src[0]; - dstRow[1] = src[1]; - dstRow[2] = src[2]; - dstRow[3] = src[3]; - dstRow += SURFACE_BYTES_PER_ROW; - src += TILE_BYTES_PER_ROW; + /* Skip the chunky write path on planar ports (dst->pixels NULL) -- + * mirrors tileSnap's pixels-NULL short-circuit. Saves the dstRow + * SURFACE_ROW_OFFSET multiply + halFastTilePaste jsr/rts per call + * on ST/Amiga where the planar path below does the real work. 
*/ + if (dst->pixels != NULL) { + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; + if (!halFastTilePaste(dstRow, src)) { + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstRow[0] = src[0]; + dstRow[1] = src[1]; + dstRow[2] = src[2]; + dstRow[3] = src[3]; + dstRow += SURFACE_BYTES_PER_ROW; + src += TILE_BYTES_PER_ROW; + } } } halTilePastePlanes(dst, bx, by, &in->pixels[0]); diff --git a/src/port/atarist/audio.c b/src/port/atarist/audio.c index f6c5552..687e83b 100644 --- a/src/port/atarist/audio.c +++ b/src/port/atarist/audio.c @@ -39,6 +39,13 @@ #define ST_MFP_IMRA ((volatile uint8_t *)0xFFFFFA13L) #define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL) +// YM2149 (sound chip) supervisor-only ports. Index reg 7 (mixer) +// controls per-channel tone + noise enables; reg 8/9/A are volumes +// for channels A/B/C; regs 0/1, 2/3, 4/5 are tone period for those +// channels; reg 6 is noise period. +#define ST_YM_SELECT ((volatile uint8_t *)0xFFFF8800L) +#define ST_YM_DATA ((volatile uint8_t *)0xFFFF8802L) + #define MFP_TA_BIT 0x20 #define MFP_TACR_STOP 0x00 #define MFP_TACR_DIV200 0x07 @@ -90,6 +97,32 @@ static long installTimerA(void) { gNeedRefill[0] = 0; gNeedRefill[1] = 0; + // YM2149 setup for PWM-via-volume on channel A: + // reg 7 (mixer): set bits 0 (tone A off) and 3 (noise A off); + // preserve bits 6+7 (I/O port directions, used + // by TOS for floppy / keyboard / printer). + // reg 8 (channel A volume): start at 0 to avoid a pop at start. + // + // Without the mixer setup, whatever state TOS left noise A in + // gets gated by our 12 kHz volume writes -- if noise A was on, + // a constant volume = constant hiss. Standard PWM-DAC trick is + // to disable both tone and noise so the volume reg is a pure + // 4-bit amplitude DAC. + // + // We can't reliably read back YM regs on the ST (the data port + // returns last-write, not register contents), so we OR in the + // disable bits over an assumed-safe TOS-default mask. 
Bit 6 set + // (port A output) matches stock TOS; bit 7 set (port B output) + // matches the centronics-printer direction TOS configures. + *ST_YM_SELECT = 7; + *ST_YM_DATA = 0xFF; // all tones + noises off; I/O ports A+B output (TOS default) + *ST_YM_SELECT = 8; + *ST_YM_DATA = 0; // channel A volume = 0 to avoid a pop at start + *ST_YM_SELECT = 9; + *ST_YM_DATA = 0; // channel B volume = 0 + *ST_YM_SELECT = 10; + *ST_YM_DATA = 0; // channel C volume = 0 + // MFP Timer A: stop, install our vector, set prescaler 200 + data // 1 (= 2.4576 MHz / 200 = 12288 Hz), then start. *ST_MFP_TACR = MFP_TACR_STOP; @@ -108,6 +141,10 @@ static long uninstallTimerA(void) { (void)Setexc(VEC_MFP_TA, (long)gOldTimerAVec); gOldTimerAVec = NULL; } + /* Silence channel A volume so handoff back to TOS is clean (no + * residual DC level on the speaker). */ + *ST_YM_SELECT = 8; + *ST_YM_DATA = 0; return 0; } diff --git a/src/port/atarist/circle.s b/src/port/atarist/circle.s new file mode 100644 index 0000000..b7c65c9 --- /dev/null +++ b/src/port/atarist/circle.s @@ -0,0 +1,282 @@ +| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled. +| +| Mirrors src/port/amiga/circle.s in spirit but for ST's single +| word-interleaved planar buffer: +| * Per scanline: 20 groups of 8 bytes; each group is 4 plane +| words back-to-back (p0_word, p1_word, p2_word, p3_word). +| * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15). +| * Plane N's word at row y, group g: base + y*160 + g*8 + N*2. +| +| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp40 +| words) gives a branchless 4-plane RMW per pixel. 8 octants are +| inlined per Bresenham iter; no bsr. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kStCircleOutline(uint8_t *base, +| uint16_t cx, uint16_t cy, +| uint16_t r, uint8_t color); +| +| Register allocation: +| d2.w = bx (Bresenham) +| d3.w = by (Bresenham) +| d4.w = err (Bresenham) +| d5.w = cx (cached) +| a4 = cy (cached, sign-extended) +| a3 = base +| a5 = bitMaskWordLut +| d0,d1,d6,d7 = scratch +| +| Scratch block (40 bytes) at sp+0..39. Unlike the Amiga version, +| which uses byte masks, ST plane words need a WORD bit mask, so +| each xp record is 8 bytes: +| groupOff_word (2 bytes), bitMask_word (2 bytes), pad (4 bytes) +| where groupOff = (x >> 4) * 8 (byte offset of group within row) +| and bitMask = 1 << (15 - (x & 15)). notMask is not stored; +| PLOT_FIXED recomputes it from bitMask with not.w at plot time. +| +| sp+0..7: xp1 record (cx + bx) +| sp+8..15: xp2 record (cx - bx) +| sp+16..23: xp3 record (cx + by) +| sp+24..31: xp4 record (cx - by) +| sp+32..33: yp1_off (cy + by) * 160 +| sp+34..35: yp2_off (cy - by) * 160 +| sp+36..37: yp3_off (cy + bx) * 160 +| sp+38..39: yp4_off (cy - bx) * 160 +| Total: 40 bytes. + + .text + + +| ---- bit mask lookup: build 1 << (15 - (x & 15)) --------- +| Done inline in XP_REC via a 16-entry word table (a5 holds the +| base). Cheaper than a variable shift on 68000 (which is 8 + 2n +| cyc). Table is 32 bytes (16 words). 
+ +| ---- XP_REC: build xp record at sp+slot for xp = cx +| signOp: add or sub +| xreg: %d2 (bx) or %d3 (by) +| slot: 0, 8, 16, or 24 +| Trashes: d0, d1, d6, d7 + + .macro XP_REC slot, signOp, xreg + move.w %d5,%d6 | d6 = cx + \signOp\().w \xreg,%d6 | d6 = xp + move.w %d6,%d7 + lsr.w #4,%d7 | d7 = group + lsl.w #3,%d7 | d7 = group * 8 (byte offset) + and.w #15,%d6 | d6 = xp & 15 (0..15) + add.w %d6,%d6 | d6 *= 2 (word index) + move.w (%a5,%d6.w),%d6 | d6 = bitMask word + move.w %d7,\slot(%sp) | groupOff word + move.w %d6,\slot+2(%sp) | bitMask word + .endm + + +| ---- YP_REC: store (yp * 160) at sp+slot --------- +| yp = cy ; trashes d0, d6. + + .macro YP_REC slot, signOp, yreg + move.l %a4,%d6 + \signOp\().w \yreg,%d6 | d6.w = yp + move.w %d6,%d0 + lsl.w #5,%d6 | d6 = yp << 5 + lsl.w #7,%d0 | d0 = yp << 7 + add.w %d6,%d0 | d0 = yp * 160 + move.w %d0,\slot(%sp) + .endm + + +| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ---- +| slotYp: 32, 34, 36, or 38 (yp_off word slot) +| slotXp: 0, 8, 16, or 24 (xp record slot) +| color: literal 0..15 +| Trashes: d0, d1, d7 + + .macro PLOT_FIXED slotYp, slotXp, color + move.w \slotYp(%sp),%d0 | d0 = yp_off + add.w \slotXp(%sp),%d0 | d0 += groupOff + move.w \slotXp+2(%sp),%d1 | d1 = bitMask word + move.w %d1,%d7 + not.w %d7 | d7 = notMask + lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff (group ptr) + | 4 plane word RMWs at (a2)+, postinc walks p0->p1->p2->p3 + .if ((\color) & 1) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .if ((\color) & 2) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .if ((\color) & 4) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .if ((\color) & 8) + or.w %d1,(%a2)+ + .else + and.w %d7,(%a2)+ + .endif + .endm + + +| ---- PLOT_8: 8 octant pixels for hardcoded color ---- + + .macro PLOT_8 color + PLOT_FIXED 32, 0, \color | (cx+bx, cy+by) + PLOT_FIXED 32, 8, \color | (cx-bx, cy+by) + PLOT_FIXED 34, 0, \color | (cx+bx, cy-by) + PLOT_FIXED 34, 8, \color | (cx-bx, cy-by) + 
PLOT_FIXED 36, 16, \color | (cx+by, cy+bx) + PLOT_FIXED 36, 24, \color | (cx-by, cy+bx) + PLOT_FIXED 38, 16, \color | (cx+by, cy-bx) + PLOT_FIXED 38, 24, \color | (cx-by, cy-bx) + .endm + + +| ---- CO_BODY: full Bresenham loop body for hardcoded color ---- + + .macro CO_BODY color + XP_REC 0, add, %d2 | xp1 = cx+bx + XP_REC 8, sub, %d2 | xp2 = cx-bx + XP_REC 16, add, %d3 | xp3 = cx+by + XP_REC 24, sub, %d3 | xp4 = cx-by + YP_REC 32, add, %d3 | yp1 = (cy+by)*160 + YP_REC 34, sub, %d3 | yp2 = (cy-by)*160 + YP_REC 36, add, %d2 | yp3 = (cy+bx)*160 + YP_REC 38, sub, %d2 | yp4 = (cy-bx)*160 + + PLOT_8 \color + + addq.w #1,%d3 + tst.w %d4 + bgt .LcoStDecX_\color + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .LcoStLoop_\color +.LcoStDecX_\color: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .LcoStLoop_\color + .endm + + + .macro CO_LOOP_HDR color +.LcoStLoop_\color: + cmp.w %d3,%d2 + bcs.w .LcoStDone + CO_BODY \color + .endm + + +| ---- Function entry ---- +| Stack on entry (after movem.l of 11 regs + lea): +| sp+0..39: scratch (40 bytes) +| sp+40..83: movem (44 bytes) +| sp+84..87: return PC +| sp+88+0: base (uint8_t *) +| sp+88+4: cx (int promoted, .w at +88+4+2) +| sp+88+8: cy (int promoted, .w at +88+8+2) +| sp+88+12: r (int promoted, .w at +88+12+2) +| sp+88+16: color (int promoted, byte at +88+16+3) + + .equ SP_SAVED, 44 + .equ SP_LOCAL, 40 + .equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL) + .equ SP_BASE, SP_OFF + 0 + .equ SP_CX, SP_OFF + 4 + 2 + .equ SP_CY, SP_OFF + 8 + 2 + .equ SP_R, SP_OFF + 12 + 2 + .equ SP_COLOR, SP_OFF + 16 + 3 + + .globl _surface68kStCircleOutline + +_surface68kStCircleOutline: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_LOCAL(%sp),%sp + + | Load base (a3) and bitMaskLut (a5). + move.l SP_BASE(%sp),%a3 + lea bitMaskWordLut(%pc),%a5 + + | Cache cx in d5, cy (sign-extended) in a4. + move.w SP_CX(%sp),%d5 + move.w SP_CY(%sp),%d6 + ext.l %d6 + movea.l %d6,%a4 + + | Bresenham init. 
+ move.w SP_R(%sp),%d2 | bx = r + moveq #0,%d3 | by = 0 + moveq #1,%d4 + sub.w %d2,%d4 | err = 1 - bx + + | Dispatch on color (low 4 bits) -> one of 16 main loops. + moveq #0,%d6 + move.b SP_COLOR(%sp),%d6 + and.w #0x0F,%d6 + add.w %d6,%d6 + add.w %d6,%d6 | * 4 for bra.w table + lea .LcoStTable(%pc),%a6 + jmp 0(%a6,%d6.w) + +.LcoStTable: + bra.w .LcoStLoop_0 + bra.w .LcoStLoop_1 + bra.w .LcoStLoop_2 + bra.w .LcoStLoop_3 + bra.w .LcoStLoop_4 + bra.w .LcoStLoop_5 + bra.w .LcoStLoop_6 + bra.w .LcoStLoop_7 + bra.w .LcoStLoop_8 + bra.w .LcoStLoop_9 + bra.w .LcoStLoop_10 + bra.w .LcoStLoop_11 + bra.w .LcoStLoop_12 + bra.w .LcoStLoop_13 + bra.w .LcoStLoop_14 + bra.w .LcoStLoop_15 + + CO_LOOP_HDR 0 + CO_LOOP_HDR 1 + CO_LOOP_HDR 2 + CO_LOOP_HDR 3 + CO_LOOP_HDR 4 + CO_LOOP_HDR 5 + CO_LOOP_HDR 6 + CO_LOOP_HDR 7 + CO_LOOP_HDR 8 + CO_LOOP_HDR 9 + CO_LOOP_HDR 10 + CO_LOOP_HDR 11 + CO_LOOP_HDR 12 + CO_LOOP_HDR 13 + CO_LOOP_HDR 14 + CO_LOOP_HDR 15 + +.LcoStDone: + lea SP_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15. +bitMaskWordLut: + .word 0x8000, 0x4000, 0x2000, 0x1000 + .word 0x0800, 0x0400, 0x0200, 0x0100 + .word 0x0080, 0x0040, 0x0020, 0x0010 + .word 0x0008, 0x0004, 0x0002, 0x0001 diff --git a/src/port/atarist/fillCircle.s b/src/port/atarist/fillCircle.s new file mode 100644 index 0000000..ba508df --- /dev/null +++ b/src/port/atarist/fillCircle.s @@ -0,0 +1,292 @@ +| Atari ST word-interleaved planar fillCircle -- 68000 hand-rolled. +| +| Bresenham midpoint circle, 4 horizontal spans per Bresenham iter, +| paired by shared x-range so leftMask/rightMask are computed once +| per pair: +| Pair A: x in [cx-bx, cx+bx], rows y = cy+by, cy-by +| Pair B: x in [cx-by, cx+by], rows y = cy+bx, cy-bx +| +| Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r) +| is fully on-surface. Off-surface circles fall back to the C walker. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kStFillCircle(uint8_t *base, +| uint16_t cx, uint16_t cy, +| uint16_t r, uint8_t color); +| +| Register allocation across the loop: +| d2.w = bx (Bresenham, starts at r) +| d3.w = by (Bresenham, starts at 0) +| d4.w = err +| d5.l = loLong (planes 0+1 long template) +| d6.l = hiLong (planes 2+3 long template) +| d7.b = color (low nibble; tested via btst) +| a3 = base +| a4 = scratch / current group pointer +| d0,d1 = scratch +| +| Stack scratch (8 bytes at 0(sp)..7(sp)): +| 0..1 leftMask (word; per pair) +| 2..3 rightMask (word; per pair) +| 4..5 numGroups (word; per pair) +| 6..7 groupFirstByteOff (word; per pair) + + .text + + + .equ SP_FC_SAVED, 44 + .equ SP_FC_LOCAL, 8 + .equ SP_FC_OFF, (SP_FC_SAVED + 4 + SP_FC_LOCAL) + .equ SP_FC_BASE, SP_FC_OFF + 0 + .equ SP_FC_CX, SP_FC_OFF + 4 + 2 + .equ SP_FC_CY, SP_FC_OFF + 8 + 2 + .equ SP_FC_R, SP_FC_OFF + 12 + 2 + .equ SP_FC_COLOR, SP_FC_OFF + 16 + 3 + + +| ---- COMPUTE_PAIR_MASKS macro ----------------------------------- +| Input: d0.w = left, d1.w = right +| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups, +| 6(sp) groupFirstByteOff +| Trashes: d0, d1 +| (No labels: straightline.) 
+ + .macro COMPUTE_PAIR_MASKS + move.w %d0,0(%sp) | stash left + move.w %d1,2(%sp) | stash right + | groupFirst & groupFirstByteOff + move.w %d0,%d1 + lsr.w #4,%d1 | groupFirst + move.w %d1,%d0 + lsl.w #3,%d0 | groupFirstByteOff + move.w %d0,6(%sp) + | numGroups = (right >> 4) - groupFirst + move.w 2(%sp),%d0 + lsr.w #4,%d0 | groupLast + sub.w %d1,%d0 | numGroups + move.w %d0,4(%sp) + | leftMask via LUT[bitFirst]; a5 = leftMaskLut base + move.w 0(%sp),%d0 + and.w #15,%d0 + add.w %d0,%d0 + move.w (%a5,%d0.w),%d1 + move.w %d1,0(%sp) + | rightMask via LUT[bitLast]; a6 = rightMaskLut base + move.w 2(%sp),%d0 + and.w #15,%d0 + add.w %d0,%d0 + move.w (%a6,%d0.w),%d1 + move.w %d1,2(%sp) + .endm + + +| ---- SPAN_BODY macro -------------------------------------------- +| Render one row span using the pair masks at 0(sp)..7(sp). +| Input: d0.w = y (signed) +| a3 = base, d5 = loLong, d6 = hiLong, d7 = color +| Trashes: d0, d1, a4 +| Macro takes an idx parameter for unique labels. + + .macro SPAN_BODY + | a4 = base + y*160 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 | y*160 + lea 0(%a3,%d0.l),%a4 + | a4 += groupFirstByteOff + moveq #0,%d0 + move.w 6(%sp),%d0 + add.l %d0,%a4 + | numGroups in d1 + move.w 4(%sp),%d1 + tst.w %d1 + bne.s .Lsb_multi\@ + | single-group: combinedMask = leftMask & rightMask + move.w 0(%sp),%d0 + and.w 2(%sp),%d0 + bsr .Lfc_applyMask + bra.w .Lsb_done\@ +.Lsb_multi\@: + | leading mask. applyMask postinc-advances a4 by 8 + | (the 4 plane RMWs each advance by 2 via (a4)+). + | applyMask trashes d1, so reload numGroups after bsr. 
+ move.w 0(%sp),%d0 + bsr .Lfc_applyMask + move.w 4(%sp),%d1 | reload numGroups + subq.w #1,%d1 | d1 = numMid + beq.s .Lsb_skipMid\@ +.Lsb_midLoop\@: + move.l %d5,(%a4)+ + move.l %d6,(%a4)+ + subq.w #1,%d1 + bne.s .Lsb_midLoop\@ +.Lsb_skipMid\@: + | trailing mask + move.w 2(%sp),%d0 + bsr .Lfc_applyMask +.Lsb_done\@: + .endm + + + .globl _surface68kStFillCircle + +_surface68kStFillCircle: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_FC_LOCAL(%sp),%sp + + | base, color + move.l SP_FC_BASE(%sp),%a3 + moveq #0,%d7 + move.b SP_FC_COLOR(%sp),%d7 + + | LUT bases (PC-relative indexed has only 8-bit + | displacement, so cache full pointers in a-regs). + lea leftMaskLut(%pc),%a5 + lea rightMaskLut(%pc),%a6 + + | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0) + moveq #0,%d5 + btst #1,%d7 + beq.s .Lfc_lo1 + move.w #-1,%d5 +.Lfc_lo1: + btst #0,%d7 + beq.s .Lfc_lo0 + ori.l #0xFFFF0000,%d5 +.Lfc_lo0: + | hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0) + moveq #0,%d6 + btst #3,%d7 + beq.s .Lfc_hi3 + move.w #-1,%d6 +.Lfc_hi3: + btst #2,%d7 + beq.s .Lfc_hi2 + ori.l #0xFFFF0000,%d6 +.Lfc_hi2: + + | Bresenham init: bx=r, by=0, err=1-bx + move.w SP_FC_R(%sp),%d2 + moveq #0,%d3 + moveq #1,%d4 + sub.w %d2,%d4 + +.Lfc_loop: + cmp.w %d3,%d2 + bcs.w .Lfc_done + + | --- Pair A: x range = (cx - bx, cx + bx) + move.w SP_FC_CX(%sp),%d0 + move.w %d0,%d1 + sub.w %d2,%d0 | left = cx - bx + add.w %d2,%d1 | right = cx + bx + COMPUTE_PAIR_MASKS + + | Span A1: y = cy + by + move.w SP_FC_CY(%sp),%d0 + add.w %d3,%d0 + SPAN_BODY + + | Span A2: y = cy - by + move.w SP_FC_CY(%sp),%d0 + sub.w %d3,%d0 + SPAN_BODY + + | --- Pair B: x range = (cx - by, cx + by) + move.w SP_FC_CX(%sp),%d0 + move.w %d0,%d1 + sub.w %d3,%d0 | left = cx - by + add.w %d3,%d1 | right = cx + by + COMPUTE_PAIR_MASKS + + | Span B1: y = cy + bx + move.w SP_FC_CY(%sp),%d0 + add.w %d2,%d0 + SPAN_BODY + + | Span B2: y = cy - bx + move.w SP_FC_CY(%sp),%d0 + sub.w %d2,%d0 + SPAN_BODY + + | --- Bresenham step + addq.w #1,%d3 + tst.w 
%d4 + bgt.s .Lfc_decBx + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .Lfc_loop +.Lfc_decBx: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .Lfc_loop + + +.Lfc_done: + lea SP_FC_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + +| ---- Apply 4-plane mask at (a4) ------------------------------- +| Input: d0.w = mask, d7.b = color, a4 = group ptr +| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8. +| Trashes: d0, d1 +| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane +| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc). + +.Lfc_applyMask: + move.w %d0,%d1 + not.w %d1 | d1 = notMask + btst #0,%d7 + beq.s .Lfc_am0a + or.w %d0,(%a4)+ + bra.s .Lfc_am1 +.Lfc_am0a: + and.w %d1,(%a4)+ +.Lfc_am1: + btst #1,%d7 + beq.s .Lfc_am1a + or.w %d0,(%a4)+ + bra.s .Lfc_am2 +.Lfc_am1a: + and.w %d1,(%a4)+ +.Lfc_am2: + btst #2,%d7 + beq.s .Lfc_am2a + or.w %d0,(%a4)+ + bra.s .Lfc_am3 +.Lfc_am2a: + and.w %d1,(%a4)+ +.Lfc_am3: + btst #3,%d7 + beq.s .Lfc_am3a + or.w %d0,(%a4)+ + rts +.Lfc_am3a: + and.w %d1,(%a4)+ + rts + + + .align 2 +| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15) +leftMaskLut: + .word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF + .word 0x0FFF, 0x07FF, 0x03FF, 0x01FF + .word 0x00FF, 0x007F, 0x003F, 0x001F + .word 0x000F, 0x0007, 0x0003, 0x0001 + +| rightMaskLut[i] = ~((1 << (15 - i)) - 1), indexed by bitLast (0..15) +rightMaskLut: + .word 0x8000, 0xC000, 0xE000, 0xF000 + .word 0xF800, 0xFC00, 0xFE00, 0xFF00 + .word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0 + .word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 2e77041..77a5c5c 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -36,6 +36,8 @@ #include "hal.h" #include "surfaceInternal.h" +#include "spriteInternal.h" +#include "joey/tile.h" #include "draw68k_inline.h" // ----- Constants ----- @@ -45,8 +47,32 @@ // with each word holding the 16 one-bit 
samples for one bitplane. #define ST_BYTES_PER_ROW 160 #define ST_GROUPS_PER_ROW 20 +#define ST_BYTES_PER_GROUP 8 // 4 plane words back-to-back +#define ST_PLANE_OFF_BYTES 2 // step between adjacent plane words within a group +#define ST_BITPLANES 4 +#define ST_PLANAR_SIZE (ST_BYTES_PER_ROW * SURFACE_HEIGHT) #define ST_SCREEN_ALIGN 256 + +// ----- Per-surface planar storage (project_planar_68k_plan, ST Phase 2) ----- +// +// ST has word-interleaved planar: ONE 32000-byte buffer per surface +// holds all 4 planes packed 8 bytes per 16-pixel group. Per-scanline +// layout (160 bytes) is 20 groups of 8 bytes, where each group holds +// p0_word, p1_word, p2_word, p3_word back-to-back. Compare with +// Amiga's 4 separate plane buffers -- same total bytes, very different +// access pattern. +// +// The stage gets its own SHADOW planar buffer (NOT aliased to +// gScreenBase) so drawing primitives don't appear until stagePresent +// memcpy's shadow -> screen. Same rationale as Amiga's per-stage +// shadow planes. +typedef struct { + uint8_t *base; // 32000-byte interleaved planar buffer + uint8_t *raw; // unaligned malloc result for free() + bool ownsBuffer; // true = we malloc'd, false = aliased +} StPlanarT; + // Shifter palette registers: 16 words at $FFFF8240..$FFFF825F. #define ST_PALETTE_REGS ((volatile uint16_t *)0xFFFF8240L) @@ -65,8 +91,50 @@ // ----- Prototypes ----- +// Phase 10: planar primitive helpers must be visible everywhere they +// could inline. Defined up here (between StPlanarT and the rest of +// the prototype block) so every halFast* / fillSpan / circle walker +// can fold the 4-plane RMW directly into its body. always_inline +// hammers the point home for gcc-mint's conservative inliner. 
+ +static inline __attribute__((always_inline)) void stApplyMaskToGroup(uint8_t *groupBase, uint16_t mask, uint8_t color) { + uint16_t notMask = (uint16_t)~mask; + uint16_t *pw = (uint16_t *)groupBase; + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | mask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | mask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | mask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = (uint16_t)(pw[3] | mask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } +} + + +static inline __attribute__((always_inline)) void stPlanarSetPixel(StPlanarT *pd, int16_t x, int16_t y, uint8_t color) { + uint16_t group = (uint16_t)((uint16_t)x >> 4); + uint16_t bitMask = (uint16_t)(1u << (15u - ((uint16_t)x & 15u))); + uint16_t notMask = (uint16_t)~bitMask; + uint16_t *pw = (uint16_t *)(pd->base + + (uint16_t)y * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP); + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | bitMask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | bitMask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | bitMask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = (uint16_t)(pw[3] | bitMask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } +} + + +static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPlanarT *pd, int16_t x, int16_t y) { + uint16_t group = (uint16_t)((uint16_t)x >> 4); + uint16_t bitMask = (uint16_t)(1u << (15u - ((uint16_t)x & 15u))); + const uint16_t *pw = (const uint16_t *)(pd->base + + (uint16_t)y * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP); + uint8_t c = 0u; + if (pw[0] & bitMask) { c = (uint8_t)(c | 1u); } + if (pw[1] & bitMask) { c = (uint8_t)(c | 2u); } + if (pw[2] & bitMask) { c = (uint8_t)(c | 4u); } + if (pw[3] & bitMask) { c = (uint8_t)(c | 8u); 
} + return c; +} static uint16_t quantizeColorToSt(uint16_t orgb); -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd); static void flattenScbPalettes(const SurfaceT *src); static void initC2pLut(void); static void writeDiagnostics(void); @@ -138,6 +206,55 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]; static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; static bool gCacheValid = false; +// 256-long plane-spread LUT for the asm sprite SAVE path (defined in +// spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each +// of b's 8 bits is placed at the bit-0 position of the corresponding +// pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT +// entry left by N to get plane N's contribution; OR'd across 4 planes +// gives the full chunky long. Initialized lazily. +// +// LUT used by surface68kStSpriteSaveByteAligned. The asm reads via +// `move.l (a_ptr, d0.l), d4` which requires the LUT to be long- +// aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse, +// the cascading offsets from the odd-sized gC2pLut put even +// `uint32_t` BSS slots at addr mod 4 == 2. +// +// Fix: malloc the LUT. mintlib's malloc returns long-aligned memory. +// The pointer is passed to the asm via the C-side wrapper (so the +// asm reads it from the stack, where it's guaranteed long-aligned +// regardless of where the static pointer slot lives). 
+static uint32_t *gStPlaneSpreadLutPtr = NULL; +static bool gStPlaneSpreadLutReady = false; + +static bool initStPlaneSpreadLut(void) { + int b; + int i; + + if (gStPlaneSpreadLutReady) { + return true; + } + gStPlaneSpreadLutPtr = (uint32_t *)malloc(256 * sizeof(uint32_t)); + if (gStPlaneSpreadLutPtr == NULL) { + return false; + } + + for (b = 0; b < 256; b++) { + uint32_t v = 0u; + for (i = 0; i < 8; i++) { + if (b & (0x80 >> i)) { + int byteIdx = i >> 1; + int isHigh = ((i & 1) == 0); + int bitInLong = (3 - byteIdx) * 8 + (isHigh ? 4 : 0); + v |= (uint32_t)1u << bitInLong; + } + } + gStPlaneSpreadLutPtr[b] = v; + } + gStPlaneSpreadLutReady = true; + return true; +} + + // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt // (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] // = the 2-bit plane-byte contribution for source byte `src` at @@ -146,27 +263,12 @@ static bool gCacheValid = false; // the same table feeds both halves of an ST plane word: positions // 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low // byte. Built once by initC2pLut on the first halPresent call. -static uint8_t gC2pLut[4 * 1024]; +/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */ +uint8_t gC2pLut[4 * 1024]; static bool gC2pLutReady = false; // ----- Internal helpers (alphabetical) ----- -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd) { - int16_t y; - const uint8_t *srcLine; - uint16_t *dstLine; - - if (!gC2pLutReady) { - initC2pLut(); - } - for (y = y0; y < y1; y++) { - srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW]; - dstLine = (uint16_t *)&gScreenBase[y * ST_BYTES_PER_ROW]; - chunkyToPlanarRowSt(srcLine, dstLine, groupStart, groupEnd, gC2pLut); - } -} - - // Scan the surface's SCB and record one transition entry for each // run of the same palette index. 
gBandCount is the number of // distinct bands; gBandStart[i] is the display line where band i @@ -499,20 +601,31 @@ const char *halLastError(void) { void halPresent(const SurfaceT *src) { - int16_t y; - uint8_t minWord; - uint8_t maxWord; - uint16_t groupStart; - uint16_t groupEnd; + StPlanarT *pd; + int16_t y; + uint8_t minWord; + uint8_t maxWord; + uint16_t groupStart; + uint16_t groupEnd; + uint16_t byteStart; + uint16_t byteLen; if (src == NULL || !gModeSet) { return; } + pd = (StPlanarT *)src->portData; + if (pd == NULL) { + return; + } refreshPaletteStateIfNeeded(src); - // Walk per-row dirty bands: each c2p group covers 16 px = 4 chunky - // words, so groupStart = minWord/4 and groupEnd = maxWord/4 + 1 - // converts dirty-word units to c2pRange's group units. + // Phase 9: planar shadow -> screen RAM. Same dirty-word band + // tracking the c2p path used; just memcpy the planar bytes for + // each band instead of running c2p on the chunky shadow. Each + // dirty word covers 4 pixels = ?of one group = quarter of an + // 8-byte group. We round to whole groups (8 bytes each) for a + // simple aligned memcpy, since planar groups are the natural + // copy unit. for (y = 0; y < SURFACE_HEIGHT; y++) { minWord = gStageMinWord[y]; maxWord = gStageMaxWord[y]; @@ -521,7 +634,11 @@ void halPresent(const SurfaceT *src) { } groupStart = (uint16_t)(minWord >> 2); groupEnd = (uint16_t)((maxWord >> 2) + 1); - c2pRange(src, y, (int16_t)(y + 1), groupStart, groupEnd); + byteStart = (uint16_t)(groupStart * ST_BYTES_PER_GROUP); + byteLen = (uint16_t)((groupEnd - groupStart) * ST_BYTES_PER_GROUP); + memcpy(&gScreenBase[(uint16_t)y * ST_BYTES_PER_ROW + byteStart], + &pd->base [(uint16_t)y * ST_BYTES_PER_ROW + byteStart], + byteLen); } } @@ -563,6 +680,15 @@ void halShutdown(void) { return; } + // Stop the audio Timer A first. 
The audio HAL has its own + // halAudioShutdown that disables Timer A and restores the vector, + // but cross-platform joeyShutdown doesn't call it -- if a sketch + // forgets joeyAudioShutdown(), Timer A keeps firing after our + // code unloads and TOS panics on the first dangling vector hit. + // Calling halAudioShutdown here is idempotent (gReady guard), + // so explicit-shutdown sketches still work. + halAudioShutdown(); + // Disable MFP Timer B and restore the exception vectors before // changing the screen -- a late ISR firing mid-Setscreen would // write palette into whatever buffer TOS remapped. @@ -587,112 +713,279 @@ void halShutdown(void) { extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte); extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte); extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte); +extern void surface68kStCircleOutline(uint8_t *base, uint16_t cx, uint16_t cy, uint16_t r, uint8_t color); +extern void surface68kStDrawLine(uint8_t *base, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t color); +extern void surface68kStFillSpan(uint8_t *base, int16_t left, int16_t right, int16_t y, uint8_t color); +extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint16_t r, uint8_t color); +extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color); +extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color); +extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong); +extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut); +extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const 
uint8_t *c2pLut); +// Phase 9: clear the entire planar buffer to a 4-bit color. Build an +// 8-byte group template (4 plane words: 0xFFFF or 0x0000 each by +// color bit) then stream it across all 4000 groups via long stores. bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { + StPlanarT *pd; + uint8_t color; + uint32_t loLong; + uint32_t hiLong; + uint32_t *p32; + uint16_t groups; + if (s != stageGet()) { return false; } - surface68kClearLong(s->pixels, (uint16_t)doubled); + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return true; + } + color = (uint8_t)(doubled & 0x0Fu); + /* Per-group: [p0_word][p1_word][p2_word][p3_word] = 8 bytes = 2 longs. + * loLong = (p0_word << 16) | p1_word; hiLong = (p2_word << 16) | p3_word. */ + loLong = ((color & 1u) ? 0xFFFF0000ul : 0ul) + | ((color & 2u) ? 0x0000FFFFul : 0ul); + hiLong = ((color & 4u) ? 0xFFFF0000ul : 0ul) + | ((color & 8u) ? 0x0000FFFFul : 0ul); + (void)p32; + (void)groups; + surface68kStLongFill(pd->base, + (uint16_t)(ST_PLANAR_SIZE / ST_BYTES_PER_GROUP), + loLong, hiLong); return true; } -// Fast path bands: -// - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per -// row via surface68kFillRectFull. Always word-aligned. -// - x % 4 == 0 && w even (word-aligned): byte index = x/2, so x must -// be a multiple of 4 for the move.l writes inside the asm to land -// on even addresses (68000 address-error rule). -// - everything else: fall through to C's fillRectClipped, which is -// per-byte and tolerates any alignment. +// Phase 9: pure short-circuit. halFillRectPlanes (called by cross- +// platform fillRect right after this) does the actual planar fill; +// we just claim ownership so the chunky fillRectClipped fallback +// never runs. 
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - uint8_t doubled; - + (void)x; (void)y; (void)w; (void)h; (void)colorIndex; if (s != stageGet()) { return false; } - if (h == 0u || w == 0u) { - return true; - } - doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu)); - - if (x == 0 && w == (uint16_t)SURFACE_WIDTH) { - surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled); - return true; - } - if (((x & 3) == 0) && ((w & 1u) == 0u)) { - uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; - surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled); - return true; - } - return false; + return true; } +// Phase 9: claim every halFastTile* call so the cross-platform chunky +// fallback (which would dereference NULL s->pixels) never fires. The +// halTileXxxPlanes hook called separately by tile.c does the planar +// work. bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { - (void)dstRow0; - (void)srcRow0; - return false; + (void)dstRow0; (void)srcRow0; + return true; } bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { - (void)dstRow0; - (void)srcRow0; - (void)transparent; - return false; -} - - -bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { - (void)dstRow0; - (void)srcTilePixels; - return false; -} - - -bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { - (void)dstTilePixels; - (void)srcRow0; - return false; -} - - -bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - uint8_t nibLo; - if (s != stageGet()) { - return false; - } - nibLo = (uint8_t)(colorIndex & 0x0Fu); - draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4)); + (void)dstRow0; (void)srcRow0; (void)transparent; return true; } -bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t 
colorIndex) { +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { + (void)dstRow0; (void)srcTilePixels; + return true; +} + + +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { + (void)dstTilePixels; (void)srcRow0; + return true; +} + + +// Phase 9: planar-only. Chunky shadow is gone; only the planar buffer +// gets the pixel. +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { + StPlanarT *pd; + if (s != stageGet()) { return false; } - draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex); + pd = (StPlanarT *)s->portData; + if (pd != NULL) { + stPlanarSetPixel(pd, (int16_t)x, (int16_t)y, (uint8_t)(colorIndex & 0x0Fu)); + } + return true; +} + + +// Phase 9 planar walkers. Same Bresenham as the cross-platform +// fallback, but writing to the planar buffer via stPlanarSetPixel. +// Mirror the Amiga amigaPlanarLine / amigaPlanarCircleOutline / +// amigaPlanarCircleFill structure. Phase 10 hand-rolled asm replaces +// these (drawCircle.s already exists for Amiga; ST will get its own). +static void stPlanarLine(StPlanarT *pd, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t color) { + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t err; + int16_t e2; + + dx = (int16_t)((x1 > x0) ? (x1 - x0) : (x0 - x1)); + dy = (int16_t)(-((y1 > y0) ? (y1 - y0) : (y0 - y1))); + sx = (int16_t)((x0 < x1) ? 1 : -1); + sy = (int16_t)((y0 < y1) ? 
1 : -1); + err = (int16_t)(dx + dy); + while (1) { + if (x0 >= 0 && x0 < SURFACE_WIDTH && y0 >= 0 && y0 < SURFACE_HEIGHT) { + stPlanarSetPixel(pd, x0, y0, color); + } + if (x0 == x1 && y0 == y1) { + break; + } + e2 = (int16_t)(2 * err); + if (e2 >= dy) { err = (int16_t)(err + dy); x0 = (int16_t)(x0 + sx); } + if (e2 <= dx) { err = (int16_t)(err + dx); y0 = (int16_t)(y0 + sy); } + } +} + + +static void stPlanarCircleOutline(StPlanarT *pd, int16_t cx, int16_t cy, uint16_t r, uint8_t color) { + int16_t bx = (int16_t)r; + int16_t by = 0; + int16_t err = (int16_t)(1 - bx); + int16_t px; + int16_t py; + + while (bx >= by) { + /* 8 octants. Per-pixel clip since Bresenham can leave the + * surface for circles touching the edge. */ + #define ST_PLOT(X, Y) do { px = (X); py = (Y); if (px >= 0 && px < SURFACE_WIDTH && py >= 0 && py < SURFACE_HEIGHT) { stPlanarSetPixel(pd, px, py, color); } } while (0) + ST_PLOT((int16_t)(cx + bx), (int16_t)(cy + by)); + ST_PLOT((int16_t)(cx - bx), (int16_t)(cy + by)); + ST_PLOT((int16_t)(cx + bx), (int16_t)(cy - by)); + ST_PLOT((int16_t)(cx - bx), (int16_t)(cy - by)); + ST_PLOT((int16_t)(cx + by), (int16_t)(cy + bx)); + ST_PLOT((int16_t)(cx - by), (int16_t)(cy + bx)); + ST_PLOT((int16_t)(cx + by), (int16_t)(cy - bx)); + ST_PLOT((int16_t)(cx - by), (int16_t)(cy - bx)); + #undef ST_PLOT + by++; + if (err > 0) { + bx--; + err = (int16_t)(err + 2 * (by - bx) + 1); + } else { + err = (int16_t)(err + 2 * by + 1); + } + } +} + + +// Phase 10: group-aware span fill -- the same leading-mask / +// full-group / trailing-mask decomposition halFillRectPlanes uses, +// but for one row. Replaces the per-pixel walk that gave fillCircle +// r=40 ~1 ops/sec. +static void stPlanarFillSpan(StPlanarT *pd, int16_t x0, int16_t x1, int16_t y, uint8_t color) { + int16_t left; + int16_t right; + + if (y < 0 || y >= SURFACE_HEIGHT) { + return; + } + left = (x0 < x1) ? x0 : x1; + right = (x0 > x1) ? 
x0 : x1; + if (left < 0) { left = 0; } + if (right >= SURFACE_WIDTH) { right = SURFACE_WIDTH - 1; } + if (left > right) { + return; + } + surface68kStFillSpan(pd->base, left, right, y, color); +} + + +static void stPlanarCircleFill(StPlanarT *pd, int16_t cx, int16_t cy, uint16_t r, uint8_t color) { + int16_t bx = (int16_t)r; + int16_t by = 0; + int16_t err = (int16_t)(1 - bx); + + while (bx >= by) { + stPlanarFillSpan(pd, (int16_t)(cx - bx), (int16_t)(cx + bx), (int16_t)(cy + by), color); + stPlanarFillSpan(pd, (int16_t)(cx - bx), (int16_t)(cx + bx), (int16_t)(cy - by), color); + stPlanarFillSpan(pd, (int16_t)(cx - by), (int16_t)(cx + by), (int16_t)(cy + bx), color); + stPlanarFillSpan(pd, (int16_t)(cx - by), (int16_t)(cx + by), (int16_t)(cy - bx), color); + by++; + if (err > 0) { + bx--; + err = (int16_t)(err + 2 * (by - bx) + 1); + } else { + err = (int16_t)(err + 2 * by + 1); + } + } +} + + +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + StPlanarT *pd; + if (s != stageGet()) { + return false; + } + pd = (StPlanarT *)s->portData; + if (pd != NULL) { + // Asm walker assumes fully on-surface; partial-clip lines fall + // back to the C walker which clips per-pixel. 
+ if (x0 >= 0 && x0 < SURFACE_WIDTH && y0 >= 0 && y0 < SURFACE_HEIGHT + && x1 >= 0 && x1 < SURFACE_WIDTH && y1 >= 0 && y1 < SURFACE_HEIGHT) { + surface68kStDrawLine(pd->base, x0, y0, x1, y1, (uint8_t)(colorIndex & 0x0Fu)); + } else { + stPlanarLine(pd, x0, y0, x1, y1, (uint8_t)(colorIndex & 0x0Fu)); + } + } return true; } bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + StPlanarT *pd; if (s != stageGet()) { return false; } - draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex); + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return true; + } + /* Off-surface circles fall back to the per-pixel C walker which + * does the clip per plot; the asm assumes fully-on-surface so it + * can drop the clip check from the inner loop. */ + if ((int32_t)cx - (int32_t)r < 0 + || (int32_t)cx + (int32_t)r >= SURFACE_WIDTH + || (int32_t)cy - (int32_t)r < 0 + || (int32_t)cy + (int32_t)r >= SURFACE_HEIGHT) { + stPlanarCircleOutline(pd, cx, cy, r, (uint8_t)(colorIndex & 0x0Fu)); + } else { + surface68kStCircleOutline(pd->base, (uint16_t)cx, (uint16_t)cy, r, + (uint8_t)(colorIndex & 0x0Fu)); + } return true; } bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + StPlanarT *pd; if (s != stageGet()) { return false; } - draw68kCircleFill(s->pixels, cx, cy, r, colorIndex); + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return true; + } + // Off-surface bounding box falls back to the C span walker, which + // clips each span; the asm assumes the whole circle is on-surface. 
+ if ((int32_t)cx - (int32_t)r < 0 + || (int32_t)cx + (int32_t)r >= SURFACE_WIDTH + || (int32_t)cy - (int32_t)r < 0 + || (int32_t)cy + (int32_t)r >= SURFACE_HEIGHT) { + stPlanarCircleFill(pd, cx, cy, r, (uint8_t)(colorIndex & 0x0Fu)); + } else { + surface68kStFillCircle(pd->base, (uint16_t)cx, (uint16_t)cy, r, + (uint8_t)(colorIndex & 0x0Fu)); + } return true; } @@ -736,16 +1029,14 @@ bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t ma } +// Phase 9: short-circuit. Cross-platform blitRect calls +// halBlitRectPlanes after halFastBlitRect; the planar work happens +// there, so we just claim ownership to skip the chunky copy that +// would dereference NULL dstRow0. bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { - (void)dstRow0; - (void)dstX; - (void)srcRow0; - (void)srcX; - (void)copyW; - (void)copyH; - (void)srcRowBytes; - (void)transparent; - return false; + (void)dstRow0; (void)dstX; (void)srcRow0; (void)srcX; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; + return true; } @@ -783,113 +1074,1045 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t mat } +// Phase 9: short-circuit. halTileFillPlanes does the planar work +// after this returns true; the chunky fallback that would run on +// false would dereference NULL s->pixels. bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { - (void)s; - (void)bx; - (void)by; - (void)fillWord; - return false; + (void)bx; (void)by; (void)fillWord; + if (s != stageGet()) { + return false; + } + return true; } -// Phase-1 planar plumbing: portData hooks declared and exported, but -// returning NULL keeps the ST port operating in the legacy -// chunky-with-c2p model. Phase 4 replaces this with an interleaved -// planar buffer + stride blob, and rewrites every halFast* primitive -// to read/write planes directly. 
+// Phase 2: allocate a shadow word-interleaved planar buffer per +// surface. Both stage and non-stage get their own buffer (gScreenBase +// remains the single display target). +// +// LONG alignment is required, not just word: the full-row long-fill +// path and circle.s both do `move.l` writes on this buffer, and +// 68000 address-errors on long access to a word-aligned-but-not- +// long-aligned destination. mintlib's malloc usually returns long- +// aligned blocks, but TOS heaps can land at odd offsets after a +// few allocations -- over-allocate by 4 bytes and align up here. +// Symptom of getting this wrong: intermittent return-to-desktop +// after the red startup paint as the first long write hits an +// odd-by-2 base. void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + StPlanarT *pd; + uint8_t *raw; + uintptr_t addr; + (void)s; (void)isStage; - return NULL; + + pd = (StPlanarT *)calloc(1, sizeof(StPlanarT)); + if (pd == NULL) { + return NULL; + } + + raw = (uint8_t *)malloc(ST_PLANAR_SIZE + 4u); + if (raw == NULL) { + free(pd); + return NULL; + } + addr = (uintptr_t)raw; + addr = (addr + 3u) & ~(uintptr_t)3u; /* round up to long-aligned */ + pd->raw = raw; + pd->base = (uint8_t *)addr; + pd->ownsBuffer = true; + memset(pd->base, 0, ST_PLANAR_SIZE); + return pd; } void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + StPlanarT *pd; + (void)s; (void)isStage; - (void)portData; + if (portData == NULL) { + return; + } + pd = (StPlanarT *)portData; + if (pd->ownsBuffer && pd->raw != NULL) { + free(pd->raw); + } + free(pd); } -// ST planar dual-write isn't implemented yet (interleaved word-planar -// layout needs a different code path than Amiga's separate plane -// buffers). Stub for now; chunky shadow + c2p still drives display. +// Phase 3: dual-write to the word-interleaved planar shadow buffer. 
+// Chunky shadow (s->pixels) is still the source-of-truth for display +// (c2p at present); the planar buffer becomes authoritative at +// Phase 9 switch flip. +// +// Per row: split [x, x+w) into a leading partial group (bits +// 15..15-bitFirst within the leading word -> mask = (1<<(16-bitFirst)) +// - 1), zero or more full groups, and a trailing partial group +// (bits 15..15-bitLast -> mask = ~((1<<(15-bitLast)) - 1)). For each +// of the 4 plane words within a group, the bit value of the color +// index controls OR-with-mask (set) vs AND-with-not-mask (clear). +// Single-group case (groupFirst == groupLast) collapses to one word +// RMW per plane with the combined mask. +// (stApplyMaskToGroup is defined inline near the top of the file.) + + void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - (void)s; - (void)x; - (void)y; - (void)w; - (void)h; - (void)colorIndex; + StPlanarT *pd; + uint16_t groupFirst; + uint16_t groupLast; + uint8_t *rowBase; + + if (s == NULL || w == 0u || h == 0u) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + + /* Phase 10 fast path: x == 0 AND w == SURFACE_WIDTH means the rect + * spans every group on every row with no edge masks. movem.l-based + * asm long-fill batches 24 bytes per call. UBER fillRect 320x200 + * lands here. */ + if (x == 0 && w == SURFACE_WIDTH) { + uint32_t loLong = ((colorIndex & 1u) ? 0xFFFF0000ul : 0ul) + | ((colorIndex & 2u) ? 0x0000FFFFul : 0ul); + uint32_t hiLong = ((colorIndex & 4u) ? 0xFFFF0000ul : 0ul) + | ((colorIndex & 8u) ? 
0x0000FFFFul : 0ul); + surface68kStLongFill(pd->base + (uint16_t)y * ST_BYTES_PER_ROW, + (uint16_t)((uint16_t)h * ST_GROUPS_PER_ROW), + loLong, hiLong); + return; + } + + groupFirst = (uint16_t)((uint16_t)x >> 4); + groupLast = (uint16_t)(((uint16_t)x + w - 1u) >> 4); + + if (groupFirst == groupLast) { + uint16_t bitFirst = (uint16_t)((uint16_t)x & 15u); + uint16_t bitLast = (uint16_t)(((uint16_t)x + w - 1u) & 15u); + uint16_t leftMask = (uint16_t)((1ul << (16u - bitFirst)) - 1ul); + uint16_t rightMask = (uint16_t)~((1ul << (15u - bitLast)) - 1ul); + uint16_t mask = (uint16_t)(leftMask & rightMask); + rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + surface68kStFillRectSingleGroup(rowBase + groupFirst * ST_BYTES_PER_GROUP, + mask, h, colorIndex); + return; + } + + /* Phase 10.5: multi-group case (groupFirst != groupLast) handled + * by 16-way-color-dispatched asm with hoisted mask state. ~3-5x + * faster than the C loop with inlined stApplyMaskToGroup. */ + surface68kStFillRectMulti(pd->base, x, y, w, h, colorIndex); } void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { - (void)dst; - (void)src; + StPlanarT *dstPd; + StPlanarT *srcPd; + + if (dst == NULL || src == NULL) { + return; + } + dstPd = (StPlanarT *)dst->portData; + srcPd = (StPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + memcpy(dstPd->base, srcPd->base, ST_PLANAR_SIZE); } +// ----- Phases 4-7: per-pixel / tile / sprite / blit planar primitives ----- +// +// These implementations dual-write the planar shadow alongside the +// chunky shadow that cross-platform code maintains. They use simple +// per-pixel walks for clarity and correctness; Phase 10 will replace +// the hot ones (fillRect, drawPixel, sprite codegen) with hand-rolled +// asm. 
The emphasis here is "correct first, fast later" -- Phase 9 +// flips the read source from chunky to planar and we'll see immediately +// (DRAW hash vs IIgs reference) whether each primitive landed bits +// in the right place. + +// stPlanarSetPixel and stPlanarGetPixel are defined inline near the +// top of the file (between StPlanarT and the prototype block) so +// every callsite folds the 4-plane RMW into its body. + + +// Phase 5 tile ops. 8x8 tiles at byte position (bx, by) start at +// pixel (bx*8, by*8). 8 pixels wide always covers exactly half a +// 16-pixel group: high half (bits 15..8) when bx is even, low half +// (bits 7..0) when bx is odd. Per-row work is 4 plane half-word RMWs. void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { - (void)s; (void)bx; (void)by; (void)colorIndex; + StPlanarT *pd; + uint16_t group; + uint16_t halfMask; + uint8_t *gp; + + if (s == NULL) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + group = (uint16_t)((uint16_t)bx >> 1); + halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu; + gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP; + surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex); } + + +// Phase 10: group-aware tile paste. Per row: extract 8 pixels from +// 4 chunky bytes, build 4 plane bytes (one per plane), drop them +// into the high or low half of the 4 plane words at this group -- +// 4 word RMWs per row instead of 64 per-pixel calls. +static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; + + +// Phase 10: tile paste/snap reuse the asm sprite save/restore +// helpers -- identical per-row work patterns at byte-aligned +// positions. Width 8 = single tile column = single half-group +// write per plane. The asm walker handles 8 rows just as well +// as a sprite's variable height. 
+void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *tileBytes) { + StPlanarT *pd; + uint16_t group; + uint8_t *dstAddr; + int16_t row; + + if (dst == NULL || tileBytes == NULL) { + return; + } + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5: TileT.pixels holds plane-major bytes (4 plane bytes + * per row * 8 rows = 32 bytes). Direct byte copy to the planar + * buffer; no chunky <-> planar conversion. Mirrors the sibling + * halTileCopyPlanes pattern but reads from the contiguous tile + * buffer. Drops the asm-walker entry/exit overhead. */ + group = (uint16_t)((uint16_t)bx >> 1); + dstAddr = pd->base + + (uint16_t)by * 8u * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP + + (uint16_t)(bx & 1u); + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstAddr[0] = tileBytes[0]; + dstAddr[2] = tileBytes[1]; + dstAddr[4] = tileBytes[2]; + dstAddr[6] = tileBytes[3]; + dstAddr += ST_BYTES_PER_ROW; + tileBytes += TILE_BYTES_PER_ROW; + } +} + + +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *tileOut) { + const StPlanarT *pd; + uint16_t group; + const uint8_t *srcAddr; + int16_t row; + + if (src == NULL || tileOut == NULL) { + return; + } + pd = (const StPlanarT *)src->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5: write plane-major bytes to TileT (4 per row * 8 rows). */ + group = (uint16_t)((uint16_t)bx >> 1); + srcAddr = pd->base + + (uint16_t)by * 8u * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP + + (uint16_t)(bx & 1u); + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + tileOut[0] = srcAddr[0]; + tileOut[1] = srcAddr[2]; + tileOut[2] = srcAddr[4]; + tileOut[3] = srcAddr[6]; + srcAddr += ST_BYTES_PER_ROW; + tileOut += TILE_BYTES_PER_ROW; + } +} + + +/* Slow-path C versions kept (renamed) for reference; not in the + * active call chain. 
*/ +static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + StPlanarT *pd; + uint16_t group; + uint16_t halfMask; + uint16_t notHalfMask; + bool isHigh; + uint8_t *rowBase; + int16_t row; + int16_t pix; + uint16_t *pw; + uint8_t b; + uint8_t color; + uint8_t pb0; + uint8_t pb1; + uint8_t pb2; + uint8_t pb3; + uint8_t bit; + + if (dst == NULL || chunkyTile == NULL) { + return; + } + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + group = (uint16_t)((uint16_t)bx >> 1); + isHigh = ((bx & 1u) == 0u); + halfMask = isHigh ? 0xFF00u : 0x00FFu; + notHalfMask = (uint16_t)~halfMask; + rowBase = pd->base + + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP; + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + pb0 = pb1 = pb2 = pb3 = 0u; + for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) { + b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)]; + color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); + bit = kStTileBitLut[pix]; + if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); } + if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); } + if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); } + if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); } + } + pw = (uint16_t *)rowBase; + if (isHigh) { + pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8)); + pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8)); + pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8)); + pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8)); + } else { + pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0); + pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1); + pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2); + pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3); + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +// Phase 10: group-aware tile snap. 
Read 4 plane half-words for the +// row's group, distribute the 8 plane bits per plane into chunky +// nibbles. 4 word reads per row + 4 chunky bytes per row, no +// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes +// above; kept for reference as the C-only fallback. +static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + const StPlanarT *pd; + uint16_t group; + uint16_t halfShift; + const uint8_t *rowBase; + int16_t row; + int16_t pair; + const uint16_t *pw; + uint8_t pb0; + uint8_t pb1; + uint8_t pb2; + uint8_t pb3; + uint8_t bitHi; + uint8_t bitLo; + uint8_t hi; + uint8_t lo; + + if (src == NULL || chunkyTileOut == NULL) { + return; + } + pd = (const StPlanarT *)src->portData; + if (pd == NULL) { + return; + } + group = (uint16_t)((uint16_t)bx >> 1); + halfShift = ((bx & 1u) == 0u) ? 8u : 0u; + rowBase = pd->base + + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP; + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + pw = (const uint16_t *)rowBase; + pb0 = (uint8_t)(pw[0] >> halfShift); + pb1 = (uint8_t)(pw[1] >> halfShift); + pb2 = (uint8_t)(pw[2] >> halfShift); + pb3 = (uint8_t)(pw[3] >> halfShift); + for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) { + bitHi = kStTileBitLut[pair * 2]; + bitLo = kStTileBitLut[pair * 2 + 1]; + hi = 0u; + lo = 0u; + if (pb0 & bitHi) hi = (uint8_t)(hi | 1u); + if (pb1 & bitHi) hi = (uint8_t)(hi | 2u); + if (pb2 & bitHi) hi = (uint8_t)(hi | 4u); + if (pb3 & bitHi) hi = (uint8_t)(hi | 8u); + if (pb0 & bitLo) lo = (uint8_t)(lo | 1u); + if (pb1 & bitLo) lo = (uint8_t)(lo | 2u); + if (pb2 & bitLo) lo = (uint8_t)(lo | 4u); + if (pb3 & bitLo) lo = (uint8_t)(lo | 8u); + chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo); + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +// Phase 10: direct planar->planar tile copy. Each tile occupies one +// half-byte of one plane word per plane per row (8 rows total). 
+// We just byte-copy 4 plane bytes per row -- no chunky scratch, no +// bit transpose, no LUT. ~640 cyc per tile vs ~5000 cyc for the +// snap+paste path. void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { - (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; + StPlanarT *dstPd; + const StPlanarT *srcPd; + uint8_t *dstAddr; + const uint8_t *srcAddr; + uint16_t srcGroup; + uint16_t dstGroup; + int16_t row; + + if (dst == NULL || src == NULL) { + return; + } + dstPd = (StPlanarT *)dst->portData; + srcPd = (const StPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + srcGroup = (uint16_t)((uint16_t)srcBx >> 1); + dstGroup = (uint16_t)((uint16_t)dstBx >> 1); + srcAddr = srcPd->base + + (uint16_t)srcBy * 8u * ST_BYTES_PER_ROW + + srcGroup * ST_BYTES_PER_GROUP + + (uint16_t)(srcBx & 1u); + dstAddr = dstPd->base + + (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW + + dstGroup * ST_BYTES_PER_GROUP + + (uint16_t)(dstBx & 1u); + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstAddr[0] = srcAddr[0]; /* plane 0 byte (high or low half) */ + dstAddr[2] = srcAddr[2]; /* plane 1 */ + dstAddr[4] = srcAddr[4]; /* plane 2 */ + dstAddr[6] = srcAddr[6]; /* plane 3 */ + srcAddr += ST_BYTES_PER_ROW; + dstAddr += ST_BYTES_PER_ROW; + } } + + void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { - (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; + StPlanarT *dstPd; + uint8_t scratch[TILE_BYTES]; + int16_t row; + uint16_t dstX0; + uint16_t dstY0; + uint16_t group; + uint16_t halfOff; + uint8_t *dstByte; + uint8_t p0; + uint8_t p1; + uint8_t p2; + uint8_t p3; + uint8_t xK0; + uint8_t xK1; + uint8_t xK2; + uint8_t xK3; + uint8_t mask; + uint8_t notMask; + + if (dst == NULL || src == NULL) { + return; + } + dstPd = (StPlanarT 
*)dst->portData; + if (dstPd == NULL) { + return; + } + /* Phase 10.5: bulk-plane fast path. scratch holds plane-major bytes + * (4 plane bytes per row * 8 rows). For each row, build a "non- + * transparent" mask = OR of (plane_byte XOR replicated transparent + * bit) -- 1s where the source pixel != transparent. Then 4 byte + * RMWs (one per plane) write the row at byte-aligned dst. + * + * For transparent=0 this collapses to mask = p0|p1|p2|p3. + * Replaces the prior 64-iteration per-pixel SetPixel walker. */ + halTileSnapPlanes(src, srcBx, srcBy, scratch); + dstX0 = (uint16_t)((uint16_t)dstBx * TILE_PIXELS_PER_SIDE); + dstY0 = (uint16_t)((uint16_t)dstBy * TILE_PIXELS_PER_SIDE); + group = (uint16_t)(dstX0 >> 4); + halfOff = (uint16_t)((dstX0 & 8u) >> 3u); + dstByte = dstPd->base + dstY0 * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP + halfOff; + + xK0 = (transparentIndex & 1u) ? 0xFFu : 0u; + xK1 = (transparentIndex & 2u) ? 0xFFu : 0u; + xK2 = (transparentIndex & 4u) ? 0xFFu : 0u; + xK3 = (transparentIndex & 8u) ? 
0xFFu : 0u; + + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + p0 = scratch[row * 4 + 0]; + p1 = scratch[row * 4 + 1]; + p2 = scratch[row * 4 + 2]; + p3 = scratch[row * 4 + 3]; + mask = (uint8_t)((p0 ^ xK0) | (p1 ^ xK1) | (p2 ^ xK2) | (p3 ^ xK3)); + if (mask != 0u) { + notMask = (uint8_t)~mask; + dstByte[0] = (uint8_t)((dstByte[0] & notMask) | (p0 & mask)); + dstByte[2] = (uint8_t)((dstByte[2] & notMask) | (p1 & mask)); + dstByte[4] = (uint8_t)((dstByte[4] & notMask) | (p2 & mask)); + dstByte[6] = (uint8_t)((dstByte[6] & notMask) | (p3 & mask)); + } + dstByte += ST_BYTES_PER_ROW; + } } -void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { - (void)dst; (void)bx; (void)by; (void)chunkyTile; -} -void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { - (void)src; (void)bx; (void)by; (void)chunkyTileOut; + + +// Phase 10 fast path: byte-aligned, fully-on-surface sprite draw. +// Builds 4 plane bytes + 1 opacity byte from each tile-column row +// in one pass, then does 4 word RMWs per group half. ~7x faster +// than the per-pixel walker for the typical (byte-aligned) case. +// +// Per row of a tile column: 4 chunky bytes -> 8 nibbles -> {plane0 +// byte, plane1 byte, plane2 byte, plane3 byte, opacity byte}. The +// opacity byte has bits set where the sprite pixel is non-zero; +// transparent pixels (color 0) leave the destination plane bits +// alone via the (word AND ~opMask) | (planeBits AND opMask) RMW. +// +// 8 pixels at byte-aligned x always cover exactly one half of one +// group: high half if (x mod 16) == 0, low half if (x mod 16) == 8. +// We branch once per tile column on (dstX & 8). 
+static void stSpriteDrawByteAligned(StPlanarT *pd, const SpriteT *sp, int16_t x, int16_t y) { + uint16_t wTiles = sp->widthTiles; + int16_t srcH = (int16_t)(sp->heightTiles * 8); + uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + int16_t row; + + for (row = 0; row < srcH; row++) { + int16_t tileY = (int16_t)(row >> 3); + int16_t inTileY = (int16_t)(row & 7); + const uint8_t *tileRowBase = sp->tileData + (uint32_t)tileY * wTiles * 32u + (uint32_t)inTileY * 4u; + int16_t tileCol; + + for (tileCol = 0; tileCol < (int16_t)wTiles; tileCol++) { + const uint8_t *trp = tileRowBase + (uint32_t)tileCol * 32u; + uint8_t b0 = trp[0]; + uint8_t b1 = trp[1]; + uint8_t b2 = trp[2]; + uint8_t b3 = trp[3]; + uint8_t pb0 = 0u; + uint8_t pb1 = 0u; + uint8_t pb2 = 0u; + uint8_t pb3 = 0u; + uint8_t pop = 0u; + uint8_t c; + + /* 8 pixels per tile column: hi(b0),lo(b0),hi(b1),lo(b1), + * hi(b2),lo(b2),hi(b3),lo(b3) at bit positions 7..0 + * within the eventual plane byte. Walk inline -- no LUT + * loop overhead. 
*/ + c = (uint8_t)(b0 >> 4); if (c) { pop = (uint8_t)(pop | 0x80u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u); } + c = (uint8_t)(b0 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x40u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u); } + c = (uint8_t)(b1 >> 4); if (c) { pop = (uint8_t)(pop | 0x20u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u); } + c = (uint8_t)(b1 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x10u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u); } + c = (uint8_t)(b2 >> 4); if (c) { pop = (uint8_t)(pop | 0x08u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u); } + c = (uint8_t)(b2 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x04u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u); } + c = (uint8_t)(b3 >> 4); if (c) { pop = (uint8_t)(pop | 0x02u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u); } + c = (uint8_t)(b3 & 0x0Fu); if (c) { pop = (uint8_t)(pop | 0x01u); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u); } + + if (pop != 0u) { + int16_t dstX = (int16_t)(x + tileCol * 8); + uint16_t group = 
(uint16_t)((uint16_t)dstX >> 4); + uint16_t *pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + uint16_t opMask; + uint16_t notOpMask; + uint16_t pv0; + uint16_t pv1; + uint16_t pv2; + uint16_t pv3; + + if ((dstX & 8) == 0) { + opMask = (uint16_t)((uint16_t)pop << 8); + pv0 = (uint16_t)((uint16_t)pb0 << 8); + pv1 = (uint16_t)((uint16_t)pb1 << 8); + pv2 = (uint16_t)((uint16_t)pb2 << 8); + pv3 = (uint16_t)((uint16_t)pb3 << 8); + } else { + opMask = (uint16_t)pop; + pv0 = (uint16_t)pb0; + pv1 = (uint16_t)pb1; + pv2 = (uint16_t)pb2; + pv3 = (uint16_t)pb3; + } + notOpMask = (uint16_t)~opMask; + pw[0] = (uint16_t)((pw[0] & notOpMask) | pv0); + pw[1] = (uint16_t)((pw[1] & notOpMask) | pv1); + pw[2] = (uint16_t)((pw[2] & notOpMask) | pv2); + pw[3] = (uint16_t)((pw[3] & notOpMask) | pv3); + } + } + rowBase += ST_BYTES_PER_ROW; + } } + + +// Phase 10: sprite walker with hoisted state. rowBase advances by +// 160 per row instead of recomputing y*160 per pixel; tile-row +// pointer is advanced once per tile column (8 cols) instead of +// recomputed per pixel; the per-pixel inner block is the inlined +// stPlanarSetPixel body so there's no nested function entry / y*160 +// re-derivation. Major rewrite of the dispatcher path that drove +// the 0.06x sprite gap before this commit. 
/* Phase 10 sprite walker with hoisted state: rowBase advances by one
 * surface row per iteration instead of recomputing y*160 per pixel,
 * and the tile-row pointer advances once per tile column (8 pixels)
 * rather than per pixel. Byte-aligned, fully-on-surface draws take
 * the bulk stSpriteDrawByteAligned fast path. */
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    StPlanarT *planar;
    uint16_t wTiles;
    int16_t srcW;
    int16_t srcH;
    int16_t dstXStart;
    int16_t dstYStart;
    int16_t clipL = 0;
    int16_t clipT = 0;
    int16_t r;
    uint8_t *rowBase;

    if (s == NULL || sp == NULL || sp->tileData == NULL) {
        return;
    }
    planar = (StPlanarT *)s->portData;
    if (planar == NULL) {
        return;
    }
    wTiles = sp->widthTiles;
    srcW = (int16_t)(wTiles * 8);
    srcH = (int16_t)(sp->heightTiles * 8);

    /* Fast path: byte-aligned x and fully on-surface -> bulk write
     * of 8 pixels per tile column into one group half. */
    if ((x & 7) == 0
        && x >= 0 && (x + srcW) <= SURFACE_WIDTH
        && y >= 0 && (y + srcH) <= SURFACE_HEIGHT) {
        stSpriteDrawByteAligned(planar, sp, x, y);
        return;
    }

    /* Clip against all four surface edges. */
    dstXStart = x;
    dstYStart = y;
    if (dstXStart < 0) { clipL = (int16_t)(-dstXStart); srcW = (int16_t)(srcW - clipL); dstXStart = 0; }
    if (dstYStart < 0) { clipT = (int16_t)(-dstYStart); srcH = (int16_t)(srcH - clipT); dstYStart = 0; }
    if (dstXStart >= SURFACE_WIDTH || dstYStart >= SURFACE_HEIGHT || srcW <= 0 || srcH <= 0) {
        return;
    }
    if (dstXStart + srcW > SURFACE_WIDTH) { srcW = (int16_t)(SURFACE_WIDTH - dstXStart); }
    if (dstYStart + srcH > SURFACE_HEIGHT) { srcH = (int16_t)(SURFACE_HEIGHT - dstYStart); }

    rowBase = planar->base + (uint16_t)dstYStart * ST_BYTES_PER_ROW;
    for (r = 0; r < srcH; r++) {
        int16_t srcY = (int16_t)(clipT + r);
        const uint8_t *tileRowBase = sp->tileData
            + (uint32_t)(srcY >> 3) * wTiles * 32u
            + (uint32_t)(srcY & 7) * TILE_BYTES_PER_ROW;
        int16_t srcX = clipL;
        int16_t srcXEnd = (int16_t)(clipL + srcW);
        int16_t outCol = 0;

        /* Walk in tile-column chunks; each chunk advances the tile
         * pointer once and yields up to 8 contiguous source pixels. */
        while (srcX < srcXEnd) {
            const uint8_t *tileRow = tileRowBase + (uint32_t)(srcX >> 3) * 32u;
            int16_t inTile = (int16_t)(srcX & 7);
            int16_t stop = (int16_t)(srcX + (8 - inTile));
            if (stop > srcXEnd) {
                stop = srcXEnd;
            }
            for (; srcX < stop; srcX++) {
                uint8_t packed = tileRow[inTile >> 1];
                uint8_t nib = (inTile & 1) ? (uint8_t)(packed & 0x0Fu)
                                           : (uint8_t)(packed >> 4);
                if (nib != 0u) {
                    /* Inlined stPlanarSetPixel body: one bit RMW per plane. */
                    int16_t dstX = (int16_t)(dstXStart + outCol);
                    uint16_t bitMask = (uint16_t)(1u << (15u - ((uint16_t)dstX & 15u)));
                    uint16_t notMask = (uint16_t)~bitMask;
                    uint16_t *pw = (uint16_t *)(rowBase
                                 + (uint16_t)((uint16_t)dstX >> 4) * ST_BYTES_PER_GROUP);
                    pw[0] = (nib & 1u) ? (uint16_t)(pw[0] | bitMask) : (uint16_t)(pw[0] & notMask);
                    pw[1] = (nib & 2u) ? (uint16_t)(pw[1] | bitMask) : (uint16_t)(pw[1] & notMask);
                    pw[2] = (nib & 4u) ? (uint16_t)(pw[2] | bitMask) : (uint16_t)(pw[2] & notMask);
                    pw[3] = (nib & 8u) ? (uint16_t)(pw[3] | bitMask) : (uint16_t)(pw[3] & notMask);
                }
                inTile++;
                outCol++;
            }
        }
        rowBase += ST_BYTES_PER_ROW;
    }
}


/* Per-pixel chunky -> planar blit with optional transparency.
 * transparent >= 16 means "no transparent index" (copy everything). */
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    StPlanarT *planar;
    int16_t r;
    int16_t c;

    if (dst == NULL || srcBytes == NULL) {
        return;
    }
    planar = (StPlanarT *)dst->portData;
    if (planar == NULL) {
        return;
    }
    for (r = 0; r < copyH; r++) {
        const uint8_t *line = &srcBytes[(srcY0 + r) * srcRowBytes];
        for (c = 0; c < copyW; c++) {
            int16_t sx = (int16_t)(srcX0 + c);
            uint8_t packed = line[sx >> 1];
            uint8_t color = (sx & 1) ? (uint8_t)(packed & 0x0Fu)
                                     : (uint8_t)(packed >> 4);
            if (transparent < 16u && color == (uint8_t)transparent) {
                continue;
            }
            stPlanarSetPixel(planar, (int16_t)(x + c), (int16_t)(y + r), color);
        }
    }
}
return; + } + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + for (row = 0; row < copyH; row++) { + srcRow = &srcBytes[(srcY0 + row) * srcRowBytes]; + for (col = 0; col < copyW; col++) { + srcXCol = (int16_t)(srcX0 + col); + b = srcRow[srcXCol >> 1]; + color = (srcXCol & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); + if (transparent < 16u && color == (uint8_t)transparent) { + continue; + } + stPlanarSetPixel(pd, (int16_t)(x + col), (int16_t)(y + row), color); + } + } } + + +// Phase 10 fast paths for save/restore. Hand-rolled asm +// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <-> +// plane bit transpose via ASL+ROXL and walks rows/tile columns. The +// C wrappers below are kept as a fallback / reference; they're not +// in the critical path now that the asm versions are wired in. +static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) { + int16_t bytesPerRow = (int16_t)(w >> 1); + int16_t tileCols = (int16_t)(w >> 3); + const uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + int16_t row; + int16_t tileCol; + + for (row = 0; row < (int16_t)h; row++) { + uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow]; + for (tileCol = 0; tileCol < tileCols; tileCol++) { + int16_t srcX = (int16_t)(x + tileCol * 8); + uint16_t group = (uint16_t)((uint16_t)srcX >> 4); + uint16_t shift = ((srcX & 8) == 0) ? 
8u : 0u; + const uint16_t *pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + uint8_t pb0 = (uint8_t)(pw[0] >> shift); + uint8_t pb1 = (uint8_t)(pw[1] >> shift); + uint8_t pb2 = (uint8_t)(pw[2] >> shift); + uint8_t pb3 = (uint8_t)(pw[3] >> shift); + int16_t pair; + for (pair = 0; pair < 4; pair++) { + uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2)); + uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1)); + uint8_t hi = 0u; + uint8_t lo = 0u; + if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); } + if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); } + if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); } + if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); } + if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); } + if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); } + if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); } + if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); } + dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo); + } + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) { + int16_t bytesPerRow = (int16_t)(w >> 1); + int16_t tileCols = (int16_t)(w >> 3); + uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + int16_t row; + int16_t tileCol; + + for (row = 0; row < (int16_t)h; row++) { + const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow]; + for (tileCol = 0; tileCol < tileCols; tileCol++) { + uint8_t b0 = srcRow[tileCol * 4 + 0]; + uint8_t b1 = srcRow[tileCol * 4 + 1]; + uint8_t b2 = srcRow[tileCol * 4 + 2]; + uint8_t b3 = srcRow[tileCol * 4 + 3]; + uint8_t pb0 = 0u; + uint8_t pb1 = 0u; + uint8_t pb2 = 0u; + uint8_t pb3 = 0u; + uint8_t c; + int16_t dstX; + uint16_t group; + uint16_t *pw; + uint16_t halfMask; + uint16_t notHalfMask; + + c = (uint8_t)(b0 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u); + c = 
(uint8_t)(b0 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u); + c = (uint8_t)(b1 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u); + c = (uint8_t)(b1 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u); + c = (uint8_t)(b2 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u); + c = (uint8_t)(b2 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u); + c = (uint8_t)(b3 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u); + c = (uint8_t)(b3 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u); + + dstX = (int16_t)(x + tileCol * 8); + group = (uint16_t)((uint16_t)dstX >> 4); + pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + if ((dstX & 8) == 0) { + halfMask = 0xFF00u; + pw[0] = (uint16_t)((pw[0] & 0x00FFu) | ((uint16_t)pb0 << 8)); + pw[1] = (uint16_t)((pw[1] & 0x00FFu) | ((uint16_t)pb1 << 8)); + pw[2] = (uint16_t)((pw[2] & 0x00FFu) | ((uint16_t)pb2 << 8)); + pw[3] = (uint16_t)((pw[3] & 0x00FFu) | ((uint16_t)pb3 << 8)); + } else { + halfMask = 0x00FFu; + pw[0] = (uint16_t)((pw[0] & 0xFF00u) | (uint16_t)pb0); + pw[1] = (uint16_t)((pw[1] & 0xFF00u) | (uint16_t)pb1); + pw[2] = (uint16_t)((pw[2] & 0xFF00u) | 
(uint16_t)pb2); + pw[3] = (uint16_t)((pw[3] & 0xFF00u) | (uint16_t)pb3); + } + (void)halfMask; + (void)notHalfMask; + } + rowBase += ST_BYTES_PER_ROW; + } +} + + +// Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies +// inline. Each pixel's group address differs only in (x), so we +// can compute base+row*160 once per row and just do per-pixel +// (group, bitMask, 4 plane RMW). 2x speedup over the per-pixel +// stPlanarSetPixel form. void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { - (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; + StPlanarT *pd; + int16_t row; + int16_t pair; + int16_t pairs; + uint8_t *pp; + const uint8_t *rowBase; + + if (s == NULL || dstPlaneBytes == NULL || w == 0u || h == 0u) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5 fast path: byte-aligned, fully on-surface. + * Asm walker does direct planar byte copy (LUT pointer unused). 
*/ + if ((x & 7) == 0 && (w & 7) == 0 + && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH + && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) { + surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL); + return; + } + + pairs = (int16_t)(w >> 1); + rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + for (row = 0; row < (int16_t)h; row++) { + pp = &dstPlaneBytes[(uint16_t)row * (uint16_t)pairs]; + for (pair = 0; pair < pairs; pair++) { + int16_t px; + uint16_t group; + uint16_t bitMask; + const uint16_t *pw; + uint8_t hi; + uint8_t lo; + + px = (int16_t)(x + pair * 2); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + hi = 0u; + if (pw[0] & bitMask) { hi = (uint8_t)(hi | 1u); } + if (pw[1] & bitMask) { hi = (uint8_t)(hi | 2u); } + if (pw[2] & bitMask) { hi = (uint8_t)(hi | 4u); } + if (pw[3] & bitMask) { hi = (uint8_t)(hi | 8u); } + + px = (int16_t)(x + pair * 2 + 1); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + lo = 0u; + if (pw[0] & bitMask) { lo = (uint8_t)(lo | 1u); } + if (pw[1] & bitMask) { lo = (uint8_t)(lo | 2u); } + if (pw[2] & bitMask) { lo = (uint8_t)(lo | 4u); } + if (pw[3] & bitMask) { lo = (uint8_t)(lo | 8u); } + + pp[pair] = (uint8_t)((hi << 4) | lo); + } + rowBase += ST_BYTES_PER_ROW; + } } + + void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { - (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes; + StPlanarT *pd; + int16_t row; + int16_t pair; + int16_t pairs; + uint8_t b; + const uint8_t *pp; + uint8_t *rowBase; + + if (s == NULL || srcPlaneBytes == NULL || w == 0u || h == 0u) { + return; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return; + } + /* Phase 10.5 fast path: 
byte-aligned, fully on-surface. + * Asm walker does direct planar byte copy (LUT pointer unused). */ + if ((x & 7) == 0 && (w & 7) == 0 + && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH + && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) { + surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL); + return; + } + + pairs = (int16_t)(w >> 1); + rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; + for (row = 0; row < (int16_t)h; row++) { + pp = &srcPlaneBytes[(uint16_t)row * (uint16_t)pairs]; + for (pair = 0; pair < pairs; pair++) { + int16_t px; + uint16_t group; + uint16_t bitMask; + uint16_t notMask; + uint16_t *pw; + uint8_t color; + + b = pp[pair]; + + px = (int16_t)(x + pair * 2); + color = (uint8_t)(b >> 4); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + notMask = (uint16_t)~bitMask; + pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | bitMask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | bitMask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | bitMask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = (uint16_t)(pw[3] | bitMask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } + + px = (int16_t)(x + pair * 2 + 1); + color = (uint8_t)(b & 0x0Fu); + group = (uint16_t)((uint16_t)px >> 4); + bitMask = (uint16_t)(1u << (15u - ((uint16_t)px & 15u))); + notMask = (uint16_t)~bitMask; + pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); + if (color & 1u) { pw[0] = (uint16_t)(pw[0] | bitMask); } else { pw[0] = (uint16_t)(pw[0] & notMask); } + if (color & 2u) { pw[1] = (uint16_t)(pw[1] | bitMask); } else { pw[1] = (uint16_t)(pw[1] & notMask); } + if (color & 4u) { pw[2] = (uint16_t)(pw[2] | bitMask); } else { pw[2] = (uint16_t)(pw[2] & notMask); } + if (color & 8u) { pw[3] = 
(uint16_t)(pw[3] | bitMask); } else { pw[3] = (uint16_t)(pw[3] & notMask); } + } + rowBase += ST_BYTES_PER_ROW; + } } -/* Phase 9 chunky reader hooks -- ST is still chunky-shadow + c2p, - * so reads come from s->pixels just like DOS / IIgs. */ +// Phase 7: pixel reader. Pre-Phase-9 reads from the chunky shadow +// (s->pixels) since that's the source-of-truth during transition. +// Once Phase 9 sets s->pixels = NULL the planar shadow becomes +// authoritative and we walk the 4 plane bits at (x, y). uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { - uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; - if (x & 1) return (uint8_t)(byte & 0x0Fu); - return (uint8_t)((byte & 0xF0u) >> 4); + if (s->pixels != NULL) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); + } + { + StPlanarT *pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + return stPlanarGetPixel(pd, x, y); + } +} + + +// Phase 9: derive 160 chunky bytes per row from the word-interleaved +// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's +// amigaPlanesToChunkyRow but per-group instead of per-byte. Used by +// halSurfaceHash and halSurfaceSaveFileChunky. 
+static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) { + uint16_t group; + uint16_t p; + uint16_t bitMask; + uint8_t pix; + const uint16_t *gp; + + for (group = 0; group < ST_GROUPS_PER_ROW; group++) { + gp = (const uint16_t *)(pd->base + + (uint16_t)y * ST_BYTES_PER_ROW + + group * ST_BYTES_PER_GROUP); + for (p = 0; p < 16u; p++) { + bitMask = (uint16_t)(1u << (15u - p)); + pix = 0u; + if (gp[0] & bitMask) { pix = (uint8_t)(pix | 1u); } + if (gp[1] & bitMask) { pix = (uint8_t)(pix | 2u); } + if (gp[2] & bitMask) { pix = (uint8_t)(pix | 4u); } + if (gp[3] & bitMask) { pix = (uint8_t)(pix | 8u); } + if ((p & 1u) == 0u) { + dstChunkyRow[group * 8u + (p >> 1)] = (uint8_t)(pix << 4); + } else { + dstChunkyRow[group * 8u + (p >> 1)] = (uint8_t)(dstChunkyRow[group * 8u + (p >> 1)] | pix); + } + } + } } uint32_t halSurfaceHash(const SurfaceT *s) { - uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; - const uint8_t *p; - const uint16_t *w; + StPlanarT *pd; + uint16_t lo = 0xACE1u; + uint16_t hi = 0x1357u; + uint16_t n; + uint16_t v; + int16_t row; + uint16_t col; uint8_t b; - p = s->pixels; - blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); - do { - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); - blocks--; - } while (blocks > 0u); - p = s->scb; - for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { - b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + const uint16_t *w; + + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + /* Pixel hash: derive equivalent chunky bytes from the planar + * shadow row by row, fold them into the same SURFACE_HASH_MIX_BYTE + * the chunky ports 
use so cross-port hash comparisons stay valid. */ + for (row = 0; row < SURFACE_HEIGHT; row++) { + stPlanarToChunkyRow(pd, row, chunkyRow); + for (col = 0; col < SURFACE_BYTES_PER_ROW; col++) { + b = chunkyRow[col]; + SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + } + /* SCB + palette mix unchanged from chunky days. */ + { + const uint8_t *sp = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *sp++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } } w = &s->palette[0][0]; for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { @@ -901,39 +2124,108 @@ uint32_t halSurfaceHash(const SurfaceT *s) { } +// Phase 9: planar-only. The chunky shadow is gone; surface copy is +// 32000 bytes of planar data. halSurfaceCopyPlanes already handles +// the planar copy via memcpy of pd->base. This stub only guards the +// pre-Phase-9 contract; cross-platform surfaceCopy still calls both +// halSurfaceCopyChunky and halSurfaceCopyPlanes. void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { - memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); + (void)dst; (void)src; } +// Phase 9: read chunky from file into a temporary scratch buffer, +// then c2p once into the planar shadow. The .joeysurface file format +// is still chunky 4bpp on disk (cross-port asset interchange); the +// in-memory representation is what changes. 
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { - return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; + StPlanarT *pd; + uint8_t *scratch; + int16_t y; + bool ok; + + pd = (StPlanarT *)dst->portData; + if (pd == NULL) { + return false; + } + scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE); + if (scratch == NULL) { + return false; + } + ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE); + if (ok) { + if (!gC2pLutReady) { + initC2pLut(); + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW]; + uint16_t *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW]; + chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut); + } + } + free(scratch); + return ok; } +// Phase 9: derive chunky bytes from the planar shadow row by row, +// stream to file. Avoids needing a full 32 KB scratch buffer. bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { - return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; + StPlanarT *pd; + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + int16_t y; + + pd = (StPlanarT *)src->portData; + if (pd == NULL) { + return false; + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + stPlanarToChunkyRow(pd, y, chunkyRow); + if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) { + return false; + } + } + return true; } +// Phase 9: no chunky storage on the ST. Cross-platform code treats +// NULL as "port has no chunky shadow" (same contract Amiga uses). uint8_t *halSurfaceAllocPixels(void) { - return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); -} - - -void halSurfaceFreePixels(uint8_t *pixels) { - free(pixels); -} - - -uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { - (void)s; (void)planeIdx; return NULL; } +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); /* free(NULL) is a no-op; symmetric for non-planar ports. 
*/ +} + + +// ST is word-interleaved: one buffer holds all 4 planes per group +// back-to-back. There's no per-plane base, but we overload planeIdx +// 0 to return the single buffer base so the cross-platform sprite +// dispatcher (spriteCompiledDraw) can hand it to the ST JIT +// routine, which computes plane offsets internally via d16(a0) +// chains. planeIdx >= 1 returns NULL since they don't make sense +// in interleaved layout. +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + StPlanarT *pd; + + if (s == NULL || planeIdx != 0u) { + return NULL; + } + pd = (StPlanarT *)s->portData; + if (pd == NULL) { + return NULL; + } + return pd->base; +} + + +// Phase 9: stage has no chunky shadow either. Cross-platform stageAlloc +// stores NULL in s->pixels and skips the chunky memset. uint8_t *halStageAllocPixels(void) { - return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); + return NULL; } diff --git a/src/port/atarist/lineSpan.s b/src/port/atarist/lineSpan.s new file mode 100644 index 0000000..242b7b4 --- /dev/null +++ b/src/port/atarist/lineSpan.s @@ -0,0 +1,853 @@ +| Atari ST word-interleaved planar drawLine -- 68000 hand-rolled. +| +| Bresenham line walker with 16-way color dispatch. Per pixel: +| * 4-plane word RMW with branchless OR/AND chosen at compile time +| * bit mask via 16-entry word table; group offset via (x>>4)<<3 +| * y*160 = (y<<5)+(y<<7) +| +| Caller MUST guarantee the entire line lies on-surface (full clip +| precheck). Partial-clip lines fall back to the C walker. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kStDrawLine(uint8_t *base, +| int16_t x0, int16_t y0, +| int16_t x1, int16_t y1, +| uint8_t color); +| +| Register allocation in the inner loop: +| d2.w = x (current pixel) +| d3.w = y (current pixel) +| d4.w = err +| d5.w = dx (>= 0) +| d6.w = -dy_abs (<= 0; "Bresenham uses -dy") +| d7 = sx (long; moveq #1 or #-1, low word used for .w add) +| a4 = sy (long; sign-extended) +| a3 = base +| a5 = bitMaskWordLut +| a2 = scratch (per-pixel: base + byteOff) +| d0,d1 = scratch +| +| Stack scratch: +| sp+0..1 iter counter (max(dx, dy_abs) + 1) + + .text + + + .equ SP_SAVED, 44 + .equ SP_LOCAL, 4 + .equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL) + .equ SP_BASE, SP_OFF + 0 + .equ SP_X0, SP_OFF + 4 + 2 + .equ SP_Y0, SP_OFF + 8 + 2 + .equ SP_X1, SP_OFF + 12 + 2 + .equ SP_Y1, SP_OFF + 16 + 2 + .equ SP_COLOR, SP_OFF + 20 + 3 + + +| ---- DL_PLOT: 4-plane word RMW for hardcoded color ---- +| Inputs: d2.w = x, d3.w = y, a3 = base, a5 = bitMaskWordLut +| Trashes: d0, d1, a2 + + .macro DL_PLOT color + | byteOff = y*160 + (x>>4)*8 + move.w %d3,%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 | y << 5 + lsl.l #7,%d1 | y << 7 + add.l %d1,%d0 | d0 = y * 160 + move.w %d2,%d1 + lsr.w #4,%d1 + lsl.w #3,%d1 | (x>>4) * 8 + ext.l %d1 + add.l %d1,%d0 | d0 = byteOff + lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff + | d1 = bitMask, d0 = notMask + move.w %d2,%d1 + and.w #15,%d1 + add.w %d1,%d1 + move.w (%a5,%d1.w),%d1 + move.w %d1,%d0 + not.w %d0 + | per-plane RMW with postinc (drops 4 cyc per RMW vs + | displacement (d8,An) = 16 cyc, plain (An)+ = 12 cyc). 
+ .if ((\color) & 1) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .if ((\color) & 2) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .if ((\color) & 4) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .if ((\color) & 8) + or.w %d1,(%a2)+ + .else + and.w %d0,(%a2)+ + .endif + .endm + + +| ---- DL_BODY: full Bresenham loop body for hardcoded color ---- + + .macro DL_BODY color +.LdlStLoop_\color: + DL_PLOT \color + | e2 = 2 * err + move.w %d4,%d0 + add.w %d0,%d0 | d0 = e2 + | if (e2 >= dy) { err += dy; x += sx; } + cmp.w %d6,%d0 + blt.s .LdlStNoX_\color + add.w %d6,%d4 + add.w %d7,%d2 +.LdlStNoX_\color: + | if (e2 <= dx) { err += dx; y += sy; } + cmp.w %d5,%d0 + bgt.s .LdlStNoY_\color + add.w %d5,%d4 + add.w %a4,%d3 | sy.w from a4 +.LdlStNoY_\color: + subq.w #1,0(%sp) + bne.w .LdlStLoop_\color + bra.w .LdlStDone + .endm + + + .globl _surface68kStDrawLine + +_surface68kStDrawLine: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_LOCAL(%sp),%sp + + | Load base & lut. + move.l SP_BASE(%sp),%a3 + lea bitMaskWordLut(%pc),%a5 + + | x = x0, y = y0 + move.w SP_X0(%sp),%d2 + move.w SP_Y0(%sp),%d3 + + | dx = abs(x1 - x0), sx = sign(x1 - x0) + move.w SP_X1(%sp),%d5 + sub.w %d2,%d5 | d5 = x1 - x0 + bge.s .LdlSxPos + neg.w %d5 + moveq #-1,%d7 + bra.s .LdlSxDone +.LdlSxPos: + moveq #1,%d7 +.LdlSxDone: + + | dy_abs in d6, sy in d0 (-> a4) + move.w SP_Y1(%sp),%d6 + sub.w %d3,%d6 | d6 = y1 - y0 + bge.s .LdlSyPos + neg.w %d6 + moveq #-1,%d0 + bra.s .LdlSyDone +.LdlSyPos: + moveq #1,%d0 +.LdlSyDone: + ext.l %d0 + movea.l %d0,%a4 | a4 = sy + + | iter counter = max(dx, dy_abs) + 1 + move.w %d5,%d0 + cmp.w %d6,%d0 + bge.s .LdlNitDone + move.w %d6,%d0 +.LdlNitDone: + addq.w #1,%d0 + move.w %d0,0(%sp) + + | err = dx - dy_abs (== dx + dy where dy negative) + move.w %d5,%d4 + sub.w %d6,%d4 | d4 = err + neg.w %d6 | d6 = -dy_abs (negative) + + | Dispatch on color (low 4 bits) -> 16 specialized loops. 
+ moveq #0,%d0 + move.b SP_COLOR(%sp),%d0 + and.w #0x0F,%d0 + add.w %d0,%d0 + add.w %d0,%d0 | * 4 for bra.w table + lea .LdlStTable(%pc),%a6 + jmp 0(%a6,%d0.w) + +.LdlStTable: + bra.w .LdlStLoop_0 + bra.w .LdlStLoop_1 + bra.w .LdlStLoop_2 + bra.w .LdlStLoop_3 + bra.w .LdlStLoop_4 + bra.w .LdlStLoop_5 + bra.w .LdlStLoop_6 + bra.w .LdlStLoop_7 + bra.w .LdlStLoop_8 + bra.w .LdlStLoop_9 + bra.w .LdlStLoop_10 + bra.w .LdlStLoop_11 + bra.w .LdlStLoop_12 + bra.w .LdlStLoop_13 + bra.w .LdlStLoop_14 + bra.w .LdlStLoop_15 + + DL_BODY 0 + DL_BODY 1 + DL_BODY 2 + DL_BODY 3 + DL_BODY 4 + DL_BODY 5 + DL_BODY 6 + DL_BODY 7 + DL_BODY 8 + DL_BODY 9 + DL_BODY 10 + DL_BODY 11 + DL_BODY 12 + DL_BODY 13 + DL_BODY 14 + DL_BODY 15 + +.LdlStDone: + lea SP_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15. +bitMaskWordLut: + .word 0x8000, 0x4000, 0x2000, 0x1000 + .word 0x0800, 0x0400, 0x0200, 0x0100 + .word 0x0080, 0x0040, 0x0020, 0x0010 + .word 0x0008, 0x0004, 0x0002, 0x0001 + + +| ---- surface68kStFillSpan --------------------------------------- +| +| Single-row span fill: leading-mask group + middle long-fills + +| trailing-mask group, all in one frame. Caller pre-clips so the +| span is fully on-surface. +| +| void surface68kStFillSpan(uint8_t *base, +| int16_t left, int16_t right, +| int16_t y, uint8_t color); +| +| Caller guarantees: 0 <= left <= right < 320, 0 <= y < 200. 
+| +| Register layout: +| a3 = base +| a4 = current group pointer +| d2.w = leftMask (then trailing trampoline target) +| d3.w = rightMask +| d4.w = numGroups - 1 (middle iter count when > 0) +| d5.l = loLong (planes 0+1 long template) +| d6.l = hiLong (planes 2+3 long template) +| d7.b = color (low nibble; tested via btst) +| d0,d1 = scratch + + .equ SP_FS_SAVED, 44 + .equ SP_FS_OFF, (SP_FS_SAVED + 4) + .equ SP_FS_BASE, SP_FS_OFF + 0 + .equ SP_FS_LEFT, SP_FS_OFF + 4 + 2 + .equ SP_FS_RIGHT, SP_FS_OFF + 8 + 2 + .equ SP_FS_Y, SP_FS_OFF + 12 + 2 + .equ SP_FS_COLOR, SP_FS_OFF + 16 + 3 + + + .globl _surface68kStFillSpan + +_surface68kStFillSpan: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.l SP_FS_BASE(%sp),%a3 + moveq #0,%d7 + move.b SP_FS_COLOR(%sp),%d7 | d7 = color + + | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0) + moveq #0,%d5 + btst #1,%d7 + beq.s .LfsLoBit1 + move.w #-1,%d5 +.LfsLoBit1: + btst #0,%d7 + beq.s .LfsLoBit0 + ori.l #0xFFFF0000,%d5 +.LfsLoBit0: + | hiLong = ((c&4)?0xFFFF0000:0) | ((c&8)?0x0000FFFF:0) + moveq #0,%d6 + btst #3,%d7 + beq.s .LfsHiBit3 + move.w #-1,%d6 +.LfsHiBit3: + btst #2,%d7 + beq.s .LfsHiBit2 + ori.l #0xFFFF0000,%d6 +.LfsHiBit2: + + | rowBase = base + y*160 -> a4 + move.w SP_FS_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 | d0 = y*160 + lea 0(%a3,%d0.l),%a4 + + | left in d0, right in d1 + move.w SP_FS_LEFT(%sp),%d0 + move.w SP_FS_RIGHT(%sp),%d1 + + | bitFirst in d2, bitLast in d3 + move.w %d0,%d2 + and.w #15,%d2 + move.w %d1,%d3 + and.w #15,%d3 + + | a4 += groupFirst * 8 + | numGroups = groupLast - groupFirst (in d4) + move.w %d0,%d4 + lsr.w #4,%d4 | d4 = groupFirst + move.w %d4,%d0 | save groupFirst into d0 + lsl.w #3,%d0 | d0 = groupFirst*8 + ext.l %d0 + add.l %d0,%a4 + move.w %d1,%d0 + lsr.w #4,%d0 | d0 = groupLast + sub.w %d4,%d0 | d0 = groupLast - groupFirst + move.w %d0,%d4 | d4 = numGroups + + | leftMask = (1 << (16 - bitFirst)) - 1 + moveq #16,%d0 + sub.w %d2,%d0 | d0 = 16 - 
bitFirst (1..16) + moveq #1,%d2 + lsl.l %d0,%d2 | 1 << (16 - bitFirst) + subq.l #1,%d2 | d2.w = leftMask + + | rightMask = ~((1 << (15 - bitLast)) - 1) + moveq #15,%d0 + sub.w %d3,%d0 | d0 = 15 - bitLast (0..15) + moveq #1,%d3 + lsl.l %d0,%d3 | 1 << (15 - bitLast) + subq.l #1,%d3 | inverse mask + not.w %d3 | d3.w = rightMask + + | If numGroups == 0, single-group: mask = leftMask & rightMask + tst.w %d4 + bne.s .LfsMulti + + and.w %d2,%d3 | d3 = combinedMask + move.w %d3,%d2 + bsr.s .LfsApplyMask + bra.w .LfsDone + +.LfsMulti: + | Leading mask (d2 already = leftMask) + bsr.s .LfsApplyMask + addq.l #8,%a4 | next group + + | numMid = numGroups - 1 + subq.w #1,%d4 + beq.s .LfsTrailing + +.LfsMidLoop: + move.l %d5,(%a4)+ + move.l %d6,(%a4)+ + subq.w #1,%d4 + bne.s .LfsMidLoop + +.LfsTrailing: + move.w %d3,%d2 | d2 = rightMask + bsr.s .LfsApplyMask + +.LfsDone: + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + +| Apply 4-plane word RMW at (a4) using mask in d2 (or notMask in d0). +| Plane N: if (color bit N) OR mask else AND notMask. +| Inputs: a4, d2.w = mask, d7.b = color +| Trashes: d0 +| Returns via rts. + +.LfsApplyMask: + move.w %d2,%d0 + not.w %d0 | d0 = notMask + btst #0,%d7 + beq.s .LfsAm0a + or.w %d2,(%a4) + bra.s .LfsAm1 +.LfsAm0a: + and.w %d0,(%a4) +.LfsAm1: + btst #1,%d7 + beq.s .LfsAm1a + or.w %d2,2(%a4) + bra.s .LfsAm2 +.LfsAm1a: + and.w %d0,2(%a4) +.LfsAm2: + btst #2,%d7 + beq.s .LfsAm2a + or.w %d2,4(%a4) + bra.s .LfsAm3 +.LfsAm2a: + and.w %d0,4(%a4) +.LfsAm3: + btst #3,%d7 + beq.s .LfsAm3a + or.w %d2,6(%a4) + rts +.LfsAm3a: + and.w %d0,6(%a4) + rts + + +| ---- surface68kStFillRectSingleGroup ----------------------------- +| +| Fill rect when groupFirst == groupLast (thin/single-column rect). +| Caller pre-computes firstGroupPtr = base + y*160 + groupFirst*8 +| and the mask = leftMask & rightMask. 
+| +| void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, +| uint16_t mask, +| uint16_t h, +| uint8_t color); +| +| Dispatched on color (low nibble) -> 16 specialized loops with +| hardcoded OR/AND per plane. Inner loop is 4 plane word RMWs + +| advance row + branch. +| +| drawLine V routes to fillRect 1xH which lands here. + + .equ SP_FRG_SAVED, 24 | d2-d5/a2-a3 = 6 longs + .equ SP_FRG_OFF, (SP_FRG_SAVED + 4) + .equ SP_FRG_PTR, SP_FRG_OFF + 0 + .equ SP_FRG_MASK, SP_FRG_OFF + 4 + 2 + .equ SP_FRG_H, SP_FRG_OFF + 8 + 2 + .equ SP_FRG_COLOR, SP_FRG_OFF + 12 + 3 + + + .macro FRG_LOOP color +.Lfrg_loop_\color: + .if ((\color) & 1) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 2) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 4) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 8) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + lea 152(%a3),%a3 | a3 now at row start; advance to next row (160-8) + subq.w #1,%d5 + bne.w .Lfrg_loop_\color + bra.w .Lfrg_done + .endm + + + .globl _surface68kStFillRectSingleGroup + +_surface68kStFillRectSingleGroup: + movem.l %d2-%d5/%a2-%a3,-(%sp) + + move.l SP_FRG_PTR(%sp),%a3 + move.w SP_FRG_MASK(%sp),%d3 + move.w SP_FRG_H(%sp),%d5 + tst.w %d5 + beq.w .Lfrg_done + move.w %d3,%d4 + not.w %d4 | d4 = notMask + + | Color dispatch + moveq #0,%d2 + move.b SP_FRG_COLOR(%sp),%d2 + and.w #0x0F,%d2 + add.w %d2,%d2 + add.w %d2,%d2 | * 4 for bra.w table + lea .Lfrg_table(%pc),%a2 + jmp 0(%a2,%d2.w) + +.Lfrg_table: + bra.w .Lfrg_loop_0 + bra.w .Lfrg_loop_1 + bra.w .Lfrg_loop_2 + bra.w .Lfrg_loop_3 + bra.w .Lfrg_loop_4 + bra.w .Lfrg_loop_5 + bra.w .Lfrg_loop_6 + bra.w .Lfrg_loop_7 + bra.w .Lfrg_loop_8 + bra.w .Lfrg_loop_9 + bra.w .Lfrg_loop_10 + bra.w .Lfrg_loop_11 + bra.w .Lfrg_loop_12 + bra.w .Lfrg_loop_13 + bra.w .Lfrg_loop_14 + bra.w .Lfrg_loop_15 + + FRG_LOOP 0 + FRG_LOOP 1 + FRG_LOOP 2 + FRG_LOOP 3 + FRG_LOOP 4 + FRG_LOOP 5 + FRG_LOOP 6 + FRG_LOOP 7 + FRG_LOOP 
8 + FRG_LOOP 9 + FRG_LOOP 10 + FRG_LOOP 11 + FRG_LOOP 12 + FRG_LOOP 13 + FRG_LOOP 14 + FRG_LOOP 15 + +.Lfrg_done: + movem.l (%sp)+,%d2-%d5/%a2-%a3 + rts + + +| ---- surface68kStFillRectMulti ------------------------------------- +| +| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips. +| Dispatched on color (low nibble) -> 16 specialized H-row loops. +| +| void surface68kStFillRectMulti(uint8_t *base, +| int16_t x, int16_t y, +| uint16_t w, uint16_t h, +| uint8_t color); +| +| Per row body (per color C): +| 1. Leading mask: 4 hardcoded plane RMW with leftMask +| 2. Middle: numMid groups of 2 long-writes (loLong, hiLong) +| 3. Trailing mask: 4 hardcoded plane RMW with rightMask +| 4. Advance rowBase by 160; decrement h; loop. +| +| Register layout in inner loop: +| d2.w = leftMask d3.w = rightMask +| d4.w = ~leftMask d5.w = ~rightMask +| d6.l = loLong d7.l = hiLong +| a3 = rowBase (advances by 160 each iter) +| a4 = a_grp (per-row scratch) +| d0,d1 = scratch +| +| Stack scratch (4 bytes at sp+0): +| 0..1 numMid (word, reload per row for mid loop) +| 2..3 h (word, decrement per row) + + .equ SP_FRM_SAVED, 44 + .equ SP_FRM_LOCAL, 4 + .equ SP_FRM_OFF, (SP_FRM_SAVED + 4 + SP_FRM_LOCAL) + .equ SP_FRM_BASE, SP_FRM_OFF + 0 + .equ SP_FRM_X, SP_FRM_OFF + 4 + 2 + .equ SP_FRM_Y, SP_FRM_OFF + 8 + 2 + .equ SP_FRM_W, SP_FRM_OFF + 12 + 2 + .equ SP_FRM_H, SP_FRM_OFF + 16 + 2 + .equ SP_FRM_COLOR, SP_FRM_OFF + 20 + 3 + + + .macro FRM_LOOP color +.LfrM_loop_\color: + | Leading mask at (a4)+, walking from row start + move.l %a3,%a4 | a4 = current row's groupFirst byte + .if ((\color) & 1) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + .if ((\color) & 2) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + .if ((\color) & 4) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + .if ((\color) & 8) + or.w %d2,(%a4)+ + .else + and.w %d4,(%a4)+ + .endif + | a4 now points to next group (8 bytes past row start). 
+ | Middle long-fill + move.w 0(%sp),%d0 + tst.w %d0 + beq.s .LfrM_skipMid_\color +.LfrM_midLoop_\color: + move.l %d6,(%a4)+ + move.l %d7,(%a4)+ + subq.w #1,%d0 + bne.s .LfrM_midLoop_\color +.LfrM_skipMid_\color: + | Trailing mask at (a4)+ + .if ((\color) & 1) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + .if ((\color) & 2) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + .if ((\color) & 4) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + .if ((\color) & 8) + or.w %d3,(%a4)+ + .else + and.w %d5,(%a4)+ + .endif + | Advance row (a3 unchanged through the body) + lea 160(%a3),%a3 + subq.w #1,2(%sp) + bne.w .LfrM_loop_\color + bra.w .LfrM_done + .endm + + + .globl _surface68kStFillRectMulti + +_surface68kStFillRectMulti: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_FRM_LOCAL(%sp),%sp + + | Load color, build loLong (d6) and hiLong (d7) + moveq #0,%d0 + move.b SP_FRM_COLOR(%sp),%d0 + moveq #0,%d6 + btst #1,%d0 + beq.s .LfrM_lo1 + move.w #-1,%d6 +.LfrM_lo1: + btst #0,%d0 + beq.s .LfrM_lo0 + ori.l #0xFFFF0000,%d6 +.LfrM_lo0: + moveq #0,%d7 + btst #3,%d0 + beq.s .LfrM_hi3 + move.w #-1,%d7 +.LfrM_hi3: + btst #2,%d0 + beq.s .LfrM_hi2 + ori.l #0xFFFF0000,%d7 +.LfrM_hi2: + + | Compute group ptrs and masks + | groupFirst = x >> 4; groupFirstByteOff = groupFirst * 8 + | bitFirst = x & 15 + move.w SP_FRM_X(%sp),%d0 + move.w SP_FRM_W(%sp),%d1 + add.w %d0,%d1 + subq.w #1,%d1 | d1 = x + w - 1 (last pixel) + + | leftMask via LUT[bitFirst] + move.w %d0,%d2 + and.w #15,%d2 + add.w %d2,%d2 + lea frmLeftMaskLut(%pc),%a2 + move.w (%a2,%d2.w),%d2 | d2 = leftMask + move.w %d2,%d4 + not.w %d4 | d4 = notLeftMask + + | rightMask via LUT[bitLast] + move.w %d1,%d3 + and.w #15,%d3 + add.w %d3,%d3 + lea frmRightMaskLut(%pc),%a2 + move.w (%a2,%d3.w),%d3 | d3 = rightMask + move.w %d3,%d5 + not.w %d5 | d5 = notRightMask + + | numMid = (last >> 4) - (x >> 4) - 1 + move.w %d1,%a2 | a2.w = lastPixel (temp) + move.l %a2,%d1 + lsr.w #4,%d1 | groupLast (low word) + move.w %d0,%a2 + 
move.l %a2,%d0 + lsr.w #4,%d0 | groupFirst + move.w %d0,%a4 | a4.w = groupFirst (save for byteOff calc) + sub.w %d0,%d1 | d1 = groupLast - groupFirst + subq.w #1,%d1 | d1 = numMid (>= 0 since multi-group caller) + move.w %d1,0(%sp) | numMid -> stack + + | h -> stack + move.w SP_FRM_H(%sp),%d1 + move.w %d1,2(%sp) + + | a3 = base + y*160 + groupFirst*8 + move.w SP_FRM_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 | y*160 + move.l SP_FRM_BASE(%sp),%a3 + add.l %d0,%a3 | rowBase = base + y*160 + move.l %a4,%d0 | groupFirst + lsl.w #3,%d0 | * 8 + ext.l %d0 + add.l %d0,%a3 | + groupFirst*8 + + | Dispatch on color + moveq #0,%d0 + move.b SP_FRM_COLOR(%sp),%d0 + and.w #0x0F,%d0 + add.w %d0,%d0 + add.w %d0,%d0 + lea .LfrM_table(%pc),%a2 + jmp 0(%a2,%d0.w) + +.LfrM_table: + bra.w .LfrM_loop_0 + bra.w .LfrM_loop_1 + bra.w .LfrM_loop_2 + bra.w .LfrM_loop_3 + bra.w .LfrM_loop_4 + bra.w .LfrM_loop_5 + bra.w .LfrM_loop_6 + bra.w .LfrM_loop_7 + bra.w .LfrM_loop_8 + bra.w .LfrM_loop_9 + bra.w .LfrM_loop_10 + bra.w .LfrM_loop_11 + bra.w .LfrM_loop_12 + bra.w .LfrM_loop_13 + bra.w .LfrM_loop_14 + bra.w .LfrM_loop_15 + + FRM_LOOP 0 + FRM_LOOP 1 + FRM_LOOP 2 + FRM_LOOP 3 + FRM_LOOP 4 + FRM_LOOP 5 + FRM_LOOP 6 + FRM_LOOP 7 + FRM_LOOP 8 + FRM_LOOP 9 + FRM_LOOP 10 + FRM_LOOP 11 + FRM_LOOP 12 + FRM_LOOP 13 + FRM_LOOP 14 + FRM_LOOP 15 + +.LfrM_done: + lea SP_FRM_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +| Same LUTs as in fillCircle.s; duplicated locally so each .o file's +| PC-rel lea can reach them within its own .text segment. 
+frmLeftMaskLut: + .word 0xFFFF, 0x7FFF, 0x3FFF, 0x1FFF + .word 0x0FFF, 0x07FF, 0x03FF, 0x01FF + .word 0x00FF, 0x007F, 0x003F, 0x001F + .word 0x000F, 0x0007, 0x0003, 0x0001 + +frmRightMaskLut: + .word 0x8000, 0xC000, 0xE000, 0xF000 + .word 0xF800, 0xFC00, 0xFE00, 0xFF00 + .word 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0 + .word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF + + +| ---- surface68kStLongFill ---------------------------------------- +| +| Bulk long-fill helper for full-row fills (surfaceClear, fillRect +| 320x200). Writes numGroups groups of 8 bytes (loLong, hiLong) +| starting at dst. Uses movem.l d2-d7 (3 groups = 24 bytes per +| batch) plus a tail pair to amortize loop overhead. +| +| void surface68kStLongFill(uint8_t *dst, +| uint16_t numGroups, +| uint32_t loLong, +| uint32_t hiLong); +| +| Per-batch cost: movem.l (56 cyc) + subq (8) + bne (10) = 74 cyc +| for 24 bytes -- ~3 cyc/byte vs ~5 cyc/byte for the straight C +| do-while of two move.l writes. + + .equ SP_LF_SAVED, 24 | d2-d7 = 6 longs + .equ SP_LF_OFF, (SP_LF_SAVED + 4) + .equ SP_LF_DST, SP_LF_OFF + 0 + .equ SP_LF_NGROUPS, SP_LF_OFF + 4 + 2 + .equ SP_LF_LO, SP_LF_OFF + 8 + .equ SP_LF_HI, SP_LF_OFF + 12 + + + .globl _surface68kStLongFill + +_surface68kStLongFill: + movem.l %d2-%d7,-(%sp) + + move.l SP_LF_DST(%sp),%a0 + move.l SP_LF_LO(%sp),%d2 + move.l SP_LF_HI(%sp),%d3 + move.w SP_LF_NGROUPS(%sp),%d0 + + | Set up d2-d7 = lo, hi, lo, hi, lo, hi (movem writes + | in d-reg order, so this gives the right alternation + | for 3 consecutive 8-byte groups). 
+ move.l %d2,%d4 + move.l %d2,%d6 + move.l %d3,%d5 + move.l %d3,%d7 + + | numBatches = numGroups / 3 (quotient), tail = remainder + ext.l %d0 + divu.w #3,%d0 + move.l %d0,%d1 + swap %d1 | d1.w = remainder + tst.w %d0 | quotient + beq.s .Llf_tail +.Llf_loop: + movem.l %d2-%d7,(%a0) + lea 24(%a0),%a0 + subq.w #1,%d0 + bne.s .Llf_loop + +.Llf_tail: + | Remainder: 0, 1, or 2 groups of 8 bytes + tst.w %d1 + beq.s .Llf_done + move.l %d2,(%a0)+ + move.l %d3,(%a0)+ + subq.w #1,%d1 + beq.s .Llf_done + move.l %d2,(%a0)+ + move.l %d3,(%a0)+ + +.Llf_done: + movem.l (%sp)+,%d2-%d7 + rts diff --git a/src/port/atarist/spriteAsm.s b/src/port/atarist/spriteAsm.s new file mode 100644 index 0000000..b1b233c --- /dev/null +++ b/src/port/atarist/spriteAsm.s @@ -0,0 +1,202 @@ +| ST byte-aligned sprite save / restore via 256-entry plane-spread +| LUT. The LUT entry for each plane byte value is a 32-bit "spread" +| where each plane byte bit lands at the corresponding plane-0 bit +| position of the 4-byte chunky output. For plane N, we shift the +| LUT entry left by N to put bits at the plane-N positions, then OR +| the 4 plane contributions together to get the chunky long. +| +| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut +| in hal.c: +| +| gStPlaneSpreadLut[b] for plane byte b: +| bit i of b (i = 0 = MSB = leftmost pixel) maps to bit +| bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4) +| of the long. Plane 0's bits land at nibble bit 0 of each +| chunky byte; left-shift the LUT entry by N for plane N. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
C signatures: +| +| void surface68kStSpriteSaveByteAligned(uint8_t *base, +| uint16_t x, uint16_t y, +| uint16_t w, uint16_t h, +| uint8_t *dstChunky); +| +| void surface68kStSpriteRestoreByteAligned(uint8_t *base, +| uint16_t x, uint16_t y, +| uint16_t w, uint16_t h, +| const uint8_t *srcChunky); + + .text + + + .equ SP_SAVED, 44 + .equ SP_OFF, (SP_SAVED + 4) + .equ SP_BASE, SP_OFF + 0 + .equ SP_X, SP_OFF + 4 + 2 + .equ SP_Y, SP_OFF + 8 + 2 + .equ SP_W, SP_OFF + 12 + 2 + .equ SP_H, SP_OFF + 16 + 2 + .equ SP_CHUNKY, SP_OFF + 20 + .equ SP_LUT, SP_OFF + 24 + + +| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer. +| a0 -> plane 0 byte (high or low half), strides 2 to next plane +| a1 -> output planar bytes (advanced by 4) +| a2 -> unused (LUT no longer needed) +| +| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds +| plane-major bytes (per row: plane0, plane1, plane2, plane3 per +| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT +| lookups + shifts + ORs. + + .macro SAVE_TILECOL + move.b (%a0),(%a1)+ | plane 0 + move.b 2(%a0),(%a1)+ | plane 1 + move.b 4(%a0),(%a1)+ | plane 2 + move.b 6(%a0),(%a1)+ | plane 3 + .endm + + + .globl _surface68kStSpriteSaveByteAligned + +_surface68kStSpriteSaveByteAligned: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.l SP_BASE(%sp),%a3 + move.l SP_CHUNKY(%sp),%a1 + | LUT pointer comes in via stack arg -- guaranteed + | long-aligned because gcc passes ptr args via + | move.l on a long-aligned sp slot. Avoids the BSS + | misalignment problem on TOS .PRG (BSS pads only to + | 2 bytes, even uint32_t slots can land at mod-4 = 2). 
+ move.l SP_LUT(%sp),%a2 + + move.w SP_W(%sp),%d5 + lsr.w #3,%d5 | d5 = tileCols + move.w SP_H(%sp),%d6 | d6 = h + move.w SP_X(%sp),%d7 + + | a4 = base + y*160 + (x>>4)*8 + move.w SP_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 | y << 5 + lsl.l #7,%d1 | y << 7 + add.l %d1,%d0 | y * 160 + lea 0(%a3,%d0.l),%a4 + moveq #0,%d0 + move.w %d7,%d0 + lsr.w #4,%d0 + lsl.w #3,%d0 + ext.l %d0 + add.l %d0,%a4 + + | Initial half offset: (x & 8) >> 3 = 0 or 1 + and.w #8,%d7 + lsr.w #3,%d7 + +.LsaveRow: + move.w %d5,%d3 | d3 = tileCols + moveq #0,%d2 + move.w %d7,%d2 + lea 0(%a4,%d2.l),%a0 | a0 = first plane-0 byte + +.LsaveCol: + SAVE_TILECOL + | Advance a0: bit 0 = 0 -> high, advance to low (+1). + | bit 0 = 1 -> low, advance to next group's high (+7). + move.l %a0,%d4 + btst #0,%d4 + bne.s .LsaveColWasLo + addq.l #1,%a0 + bra.s .LsaveColNext +.LsaveColWasLo: + lea 7(%a0),%a0 +.LsaveColNext: + subq.w #1,%d3 + bne.w .LsaveCol + + lea 160(%a4),%a4 + subq.w #1,%d6 + bne.w .LsaveRow + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + +| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes. +| a0 -> plane 0 byte (high or low half) +| a1 -> input planar bytes (advanced by 4) +| a2 -> unused (LUT no longer needed) +| +| Phase 10.5: dropped chunky -> planar conversion. Buffer layout +| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col. + + .macro RESTORE_TILECOL + move.b (%a1)+,(%a0) | plane 0 + move.b (%a1)+,2(%a0) | plane 1 + move.b (%a1)+,4(%a0) | plane 2 + move.b (%a1)+,6(%a0) | plane 3 + .endm + + + .globl _surface68kStSpriteRestoreByteAligned + +_surface68kStSpriteRestoreByteAligned: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.l SP_BASE(%sp),%a3 + move.l SP_CHUNKY(%sp),%a1 + move.l SP_LUT(%sp),%a2 | gC2pLut passed in + + | tileCols is held in a5 (not d5) because the macro + | trashes d5 (uses it for pb3). 
+ move.w SP_W(%sp),%d0 + lsr.w #3,%d0 + movea.w %d0,%a5 + move.w SP_H(%sp),%d6 + move.w SP_X(%sp),%d7 + + move.w SP_Y(%sp),%d0 + ext.l %d0 + move.l %d0,%d1 + lsl.l #5,%d0 + lsl.l #7,%d1 + add.l %d1,%d0 + lea 0(%a3,%d0.l),%a4 + moveq #0,%d0 + move.w %d7,%d0 + lsr.w #4,%d0 + lsl.w #3,%d0 + ext.l %d0 + add.l %d0,%a4 + + and.w #8,%d7 + lsr.w #3,%d7 + +.LrestoreRow: + move.w %a5,%d3 | d3 = tileCols (from a5) + moveq #0,%d2 + move.w %d7,%d2 + lea 0(%a4,%d2.l),%a0 + +.LrestoreCol: + RESTORE_TILECOL + move.l %a0,%d4 + btst #0,%d4 + bne.s .LrestoreColWasLo + addq.l #1,%a0 + bra.s .LrestoreColNext +.LrestoreColWasLo: + lea 7(%a0),%a0 +.LrestoreColNext: + subq.w #1,%d3 + bne.w .LrestoreCol + + lea 160(%a4),%a4 + subq.w #1,%d6 + bne.w .LrestoreRow + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts