From b1e24b4650c8cce2787df6f1426f0a8204c6a561 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Sun, 3 May 2026 01:44:39 -0500 Subject: [PATCH] Amiga parity with IIgs! --- examples/audio/audio.c | 4 +- examples/joy/joy.c | 4 +- examples/keys/keys.c | 17 +- examples/sprite/sprite.c | 70 +- examples/uber/uber.c | 83 +- include/joey/debug.h | 1 + include/joey/present.h | 16 +- include/joey/sprite.h | 17 +- include/joey/surface.h | 9 + make/amiga.mk | 9 +- make/atarist.mk | 8 +- make/dos.mk | 8 +- make/iigs.mk | 64 +- src/codegen/spriteCompile.c | 136 ++- src/codegen/spriteEmit68k.c | 15 + src/codegen/spriteEmitIigs.c | 12 + src/codegen/spriteEmitPlanar68k.c | 505 ++++++++ src/codegen/spriteEmitX86.c | 12 + src/codegen/spriteEmitter.h | 15 + src/core/debug.c | 70 +- src/core/draw.c | 100 +- src/core/hal.h | 172 ++- src/core/present.c | 48 +- src/core/sprite.c | 82 +- src/core/spriteInternal.h | 10 + src/core/surface.c | 63 +- src/core/surfaceInternal.h | 30 + src/core/tile.c | 12 +- src/port/amiga/circle.s | 270 +++++ src/port/amiga/hal.c | 1857 ++++++++++++++++++++++++++--- src/port/atarist/hal.c | 174 ++- src/port/dos/hal.c | 169 ++- src/port/iigs/hal.c | 201 +++- src/port/iigs/peislam.asm | 67 +- src/shared68k/surface68k.s | 250 ++++ tools/diff-uber-hashes | 93 ++ tools/diff-uber-perf | 132 ++ 37 files changed, 4312 insertions(+), 493 deletions(-) create mode 100644 src/codegen/spriteEmitPlanar68k.c create mode 100644 src/port/amiga/circle.s create mode 100755 tools/diff-uber-hashes create mode 100755 tools/diff-uber-perf diff --git a/examples/audio/audio.c b/examples/audio/audio.c index 90ac866..163c14c 100644 --- a/examples/audio/audio.c +++ b/examples/audio/audio.c @@ -171,11 +171,11 @@ int main(void) { if (flashFrames > 0) { fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_BAR); - stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H); + stagePresent(); flashFrames--; if (flashFrames == 0) { fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_HINT); - 
stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H); + stagePresent(); } } } diff --git a/examples/joy/joy.c b/examples/joy/joy.c index 2683706..24de236 100644 --- a/examples/joy/joy.c +++ b/examples/joy/joy.c @@ -80,8 +80,10 @@ static void buildPalette(SurfaceT *screen) { static void drawAndPresent(SurfaceT *screen, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t color) { + /* fillRect marks the rect dirty; stagePresent flushes only that + * dirty band. */ fillRect(screen, x, y, (uint16_t)w, (uint16_t)h, color); - stagePresentRect(x, y, (uint16_t)w, (uint16_t)h); + stagePresent(); } diff --git a/examples/keys/keys.c b/examples/keys/keys.c index 31bbfef..253977b 100644 --- a/examples/keys/keys.c +++ b/examples/keys/keys.c @@ -158,8 +158,6 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur int16_t row; JoeyKeyE key; bool lit; - int16_t x; - int16_t y; for (row = 0; row < GRID_ROWS; row++) { for (col = 0; col < GRID_COLS; col++) { @@ -171,10 +169,10 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur if (lit == gCellLit[row][col]) { continue; } + /* drawCell marks the cell's rect dirty; stagePresent + * flushes that one band. */ drawCell(screen, col, row, lit); - x = (int16_t)(MARGIN_X + col * (CELL_W + GAP)); - y = (int16_t)(MARGIN_Y + row * (CELL_H + GAP)); - stagePresentRect(x, y, CELL_W, CELL_H); + stagePresent(); gCellLit[row][col] = lit; } } @@ -195,19 +193,16 @@ static void updateCursor(SurfaceT *screen, int16_t cursorCol, int16_t cursorRow) if (gLastCursorX != mouseX || gLastCursorY != mouseY) { if (gLastCursorCol != CELL_NONE) { drawCell(screen, gLastCursorCol, gLastCursorRow, gCellLit[gLastCursorRow][gLastCursorCol]); - stagePresentRect( - (int16_t)(MARGIN_X + gLastCursorCol * (CELL_W + GAP)), - (int16_t)(MARGIN_Y + gLastCursorRow * (CELL_H + GAP)), - CELL_W, CELL_H); } else if (gLastCursorX >= 0 && gLastCursorY >= 0) { // Old cursor was in a gap region. Stamp background over it. 
fillRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H, COLOR_BACKGROUND); - stagePresentRect(gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H); } } drawCursor(screen, mouseX, mouseY); - stagePresentRect(mouseX, mouseY, CURSOR_W, CURSOR_H); + /* All draw calls above marked their rects dirty; one stagePresent + * flushes the union (cursor erase + cursor draw). */ + stagePresent(); gLastCursorX = mouseX; gLastCursorY = mouseY; diff --git a/examples/sprite/sprite.c b/examples/sprite/sprite.c index 3c7509b..49e5a96 100644 --- a/examples/sprite/sprite.c +++ b/examples/sprite/sprite.c @@ -15,11 +15,11 @@ #define BALL_TILES_Y (BALL_H / 8) #define BALL_TILE_BYTES (BALL_TILES_X * BALL_TILES_Y * TILE_BYTES) -// SaveUnder must store rounded-up byte boundaries: x rounded down to -// even, width rounded up to even. Worst case for BALL_W=16 (already -// even) is 8 bytes per row + alignment slack of 1 byte; size for the -// pessimistic case so the buffer never overflows. -#define BALL_BACKUP_BYTES (((BALL_W + 2) >> 1) * BALL_H) +// SaveUnder rounds x down to the platform's storage alignment: 2 px +// for chunky 4bpp (1 extra byte/row worst case), 8 px for planar +// 4-plane (4 extra bytes/row worst case -- one per plane). The +4 +// covers the planar case and is a no-op overhead on chunky. +#define BALL_BACKUP_BYTES (((BALL_W >> 1) + 4) * BALL_H) #define BALL_PALETTE_IDX 0 @@ -100,18 +100,14 @@ int main(void) { int16_t y; int16_t vx; int16_t vy; - int16_t oldX; - int16_t oldY; - uint16_t oldW; - uint16_t oldH; - int16_t unionX; - int16_t unionY; - int16_t unionRight; - int16_t unionBottom; bool haveBackup; config.hostMode = HOST_MODE_TAKEOVER; - config.codegenBytes = 8 * 1024; + /* Amiga planar emits 8 pre-shifted DRAW variants per sprite (one + * per x % 8 alignment) so the codegen arena needs roughly 8x what + * the chunky two-shift case asks for. 32 KB fits a 16x16 ball + * with all variants. 
*/ + config.codegenBytes = 32UL * 1024; config.maxSurfaces = 4; config.audioBytes = 64UL * 1024; config.assetBytes = 128UL * 1024; @@ -155,7 +151,7 @@ int main(void) { haveBackup = false; spriteSaveAndDraw(screen, ball, x, y, &backup); - stagePresentRect(backup.x, backup.y, backup.width, backup.height); + stagePresent(); haveBackup = true; for (;;) { @@ -164,19 +160,15 @@ int main(void) { break; } - // Stash the prior ball's region before restoring the bytes - // under it. Do all off-screen work (restore + move + draw) - // first, then waitVBL + ONE stagePresentRect covering both - // old and new regions. Putting waitVBL immediately before the - // present lets the present land inside the VBL window so the - // CRT never sees a half-updated framebuffer (matters most on - // single-buffered chunky targets like IIgs SHR; on planar - // c2p platforms it also avoids c2p racing the raster). - oldX = backup.x; - oldY = backup.y; - oldW = backup.width; - oldH = backup.height; - + // Do all off-screen work first (restore + move + draw), then + // ONE stagePresent flushes the union of dirty bands set by + // restoreUnder + draw. Add a joeyWaitVBL() before the present + // to land it inside the VBL window so the CRT never sees a + // half-updated framebuffer (matters most on single-buffered + // chunky targets like IIgs SHR; on planar c2p platforms it + // also avoids c2p racing the raster). VBL wait is omitted + // here so the demo runs at the sprite pipeline's native + // throughput -- expect tearing on the ball. if (haveBackup) { spriteRestoreUnder(screen, &backup); } @@ -190,27 +182,7 @@ int main(void) { spriteSaveAndDraw(screen, ball, x, y, &backup); - // Bounding box of (old rect) U (new rect). For typical - // small-step motion the rects overlap heavily so the union - // is barely larger than one ball. - unionX = (oldX < backup.x) ? oldX : backup.x; - unionY = (oldY < backup.y) ? oldY : backup.y; - unionRight = (int16_t)((oldX + oldW > backup.x + backup.width) - ? 
(oldX + oldW) - : (backup.x + backup.width)); - unionBottom = (int16_t)((oldY + oldH > backup.y + backup.height) - ? (oldY + oldH) - : (backup.y + backup.height)); - - // VBL wait removed -- the demo runs at the native compute speed - // of save+restore+draw+presentRect so we can SEE the sprite - // pipeline's actual throughput. Expect tearing on the ball - // since the present can land mid-scan; that's the cost of - // showing real frame rate. Add joeyWaitVBL() back here for - // tear-free 60 Hz motion. - stagePresentRect(unionX, unionY, - (uint16_t)(unionRight - unionX), - (uint16_t)(unionBottom - unionY)); + stagePresent(); haveBackup = true; } diff --git a/examples/uber/uber.c b/examples/uber/uber.c index 5538b48..eb9e243 100644 --- a/examples/uber/uber.c +++ b/examples/uber/uber.c @@ -28,7 +28,16 @@ // 4-frame measurement window. Long enough that loop overhead doesn't // dominate; short enough to keep the full demo run under ~10 sec. -#define UBER_FRAMES 4u +/* 16 frames per timed op gives 4x the iter-count resolution of the + * earlier 4-frame budget. Exposes the actual per-op cost on slow + * ops where 4 frames produced the same iter count on different + * framerates -- e.g. drawCircle r=80 read as "4 iters / 4 frames" + * on both 60 Hz IIgs (16.7 ms/frame, 67 ms window) and 50 Hz Amiga + * (20 ms/frame, 80 ms window) even though per-op cost was equal, + * just because 4 ops at 16-17 ms happen to fit both windows. The + * 16-frame budget extends the windows to 267 ms / 320 ms; quantum + * gap shrinks to ~6%. Total run time scales 4x (~80 sec each). */ +#define UBER_FRAMES 16u typedef void (*OpFn)(void); @@ -44,9 +53,10 @@ static TileT gTileScratch; // Run `op` in a tight loop until `targetFrames` joeyFrameCount ticks // have elapsed. Returns iterations completed. 
-static unsigned long runForFrames(OpFn op, unsigned int targetFrames) { +static unsigned long runForFrames(OpFn op, unsigned int targetFrames, uint16_t *actualFramesOut) { unsigned long count; uint16_t startFrame; + uint16_t endFrame; count = 0UL; @@ -57,29 +67,50 @@ static unsigned long runForFrames(OpFn op, unsigned int targetFrames) { op(); count++; } + /* Capture the actual elapsed frames -- the last iter typically + * overruns the target. Using actual instead of target as the + * ops/sec divisor stays honest for ops slower than 1 frame + * (where count is forced low while real time stretches well + * past targetFrames). */ + endFrame = joeyFrameCount(); + *actualFramesOut = (uint16_t)(endFrame - startFrame); + if (*actualFramesOut == 0u) { + *actualFramesOut = 1u; /* defensive: avoid div-by-zero */ + } return count; } // Time and log one op. Reports iters / N frames AND the derived // ops/sec so per-port results are directly comparable against IIgs -// regardless of CPU speed or display refresh rate. +// regardless of CPU speed or display refresh rate. Also logs an +// FNV-1a hash of the surface state after timing -- this is the +// pixel-perfect comparison input for the cross-port validation +// harness (tools/diff-uber-hashes.py). Captured against IIgs as the +// golden reference; planar 68k rewrites validate by matching it. static void timeOp(const char *name, OpFn op) { unsigned long iters; unsigned long opsPerSec; + uint16_t actualFrames; + uint32_t hash; gCurName = name; - iters = runForFrames(op, UBER_FRAMES); + iters = runForFrames(op, UBER_FRAMES, &actualFrames); if (iters == 0UL) { joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name); return; } - opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES; - joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n", - name, iters, UBER_FRAMES, opsPerSec); + /* Divide by ACTUAL elapsed frames, not the target. 
For sub-frame + * ops actualFrames ~= UBER_FRAMES so the answer is unchanged; + * for ops that overrun (slow stagePresent etc.), this stops + * inflating ops/sec. */ + opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)actualFrames; + hash = surfaceHash(gStage); + joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec | hash=%08lX\n", + name, iters, actualFrames, opsPerSec, (unsigned long)hash); } @@ -125,8 +156,6 @@ static void op_spriteRestore (void) { spriteRestoreUnder(gStage, &gBackup); static void op_spriteSaveAndDraw (void) { spriteSaveAndDraw (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); } static void op_stagePresent (void) { stagePresent(); } -static void op_stagePresentRect8(void) { stagePresentRect( 40, 30, 16, 16); } -static void op_stagePresentRectF(void) { stagePresentRect( 0, 0, 320, 200); } static void op_inputPoll (void) { joeyInputPoll(); } static void op_keyDown (void) { (void)joeyKeyDown(KEY_A); } @@ -229,10 +258,14 @@ static void runAllTests(void) { timeOp("spriteRestoreUnder", op_spriteRestore); timeOp("spriteSaveAndDraw", op_spriteSaveAndDraw); - // Present. + // Present. One warm-up call before each timed loop primes any + // per-port one-time setup (Amiga: copper list rebuild after the + // paletteSet / scbSetRange tests dirty the cache; without warm-up + // the rebuild's MakeScreen + MrgCop + WaitTOF chain consumes the + // entire 4-frame measurement window) so we measure steady-state + // throughput rather than first-call penalty. + stagePresent(); timeOp("stagePresent full", op_stagePresent); - timeOp("stagePresentRect 8b",op_stagePresentRect8); - timeOp("stagePresentRect F", op_stagePresentRectF); // Input. 
timeOp("joeyInputPoll", op_inputPoll); @@ -253,12 +286,19 @@ static void runAllTests(void) { int main(void) { - JoeyConfigT config; - uint16_t pal[16]; - int i; + JoeyConfigT config; + uint16_t pal[16]; + int i; + uint16_t startFrame; + uint16_t endFrame; + uint16_t elapsedFrames; + unsigned long elapsedMs; config.hostMode = HOST_MODE_TAKEOVER; - config.codegenBytes = 8 * 1024; + /* 32 KB fits the 8 pre-shifted DRAW variants the Amiga planar + * compiled sprite emitter generates. UL on the multiply because + * ORCA-C's 16-bit int overflows on 32 * 1024. */ + config.codegenBytes = 32UL * 1024; config.maxSurfaces = 4; config.audioBytes = 64UL * 1024; config.assetBytes = 128UL * 1024; @@ -266,6 +306,11 @@ int main(void) { if (!joeyInit(&config)) { return 1; } + /* joeyFrameCount is VBL-driven, so it only ticks after halInit + * installed its VBL ISR -- captured here is "everything from now + * to press-any-key". Pre-init setup time is small and not the + * cost the user is chasing; runAllTests dominates. */ + startFrame = joeyFrameCount(); gStage = stageGet(); if (gStage == NULL) { @@ -337,6 +382,12 @@ int main(void) { runAllTests(); + endFrame = joeyFrameCount(); + elapsedFrames = (uint16_t)(endFrame - startFrame); + elapsedMs = ((unsigned long)elapsedFrames * 1000UL) / (unsigned long)joeyFrameHz(); + joeyLogF("UBER: total wall time: %lu ms (%u frames @ %u Hz)\n", + elapsedMs, elapsedFrames, (unsigned)joeyFrameHz()); + // Done. Green screen + waitForKey. 
surfaceClear(gStage, 2); stagePresent(); diff --git a/include/joey/debug.h b/include/joey/debug.h index 161fdef..c446530 100644 --- a/include/joey/debug.h +++ b/include/joey/debug.h @@ -5,6 +5,7 @@ void joeyLog (const char *msg); void joeyLogF (const char *fmt, ...); +void joeyLogFlush(void); void joeyLogReset(void); #endif diff --git a/include/joey/present.h b/include/joey/present.h index 521baf7..cdc4abc 100644 --- a/include/joey/present.h +++ b/include/joey/present.h @@ -15,14 +15,14 @@ #include "types.h" // Flip the dirty regions of the stage to the display, then clear the -// dirty state. Cheap when nothing has changed since the last call. +// dirty state. Cheap when nothing has changed since the last call +// (gStageAnyDirty short-circuit). Drawing primitives mark dirty as +// a side effect, so callers only need to call stagePresent at the +// end of a frame -- everything they drew shows up. +// +// To present a region you didn't draw with the standard primitives +// (e.g. direct framebuffer poking), call surfaceMarkDirtyRect on +// the same rect first, then stagePresent. void stagePresent(void); -// Flip a specific rectangular region of the stage to the display, -// regardless of dirty state. Coordinates are clipped to the surface; -// negative or zero dimensions are no-ops. Does not consult or modify -// the dirty arrays -- callers mixing stagePresentRect with stagePresent -// in the same frame may see redundant work on the next stagePresent. -void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h); - #endif diff --git a/include/joey/sprite.h b/include/joey/sprite.h index 0a34ee8..c66474c 100644 --- a/include/joey/sprite.h +++ b/include/joey/sprite.h @@ -27,13 +27,16 @@ #include "surface.h" #include "types.h" -// Sprites always write to a 4bpp packed SurfaceT, never to display -// memory directly (halPresent owns that path). 
The codegen emits 2 -// shift variants on every platform: shift 0 for even x (sprite byte -// boundaries match destination byte boundaries) and shift 1 for odd -// x (each destination byte combines two adjacent sprite bytes' -// nibbles). -#define JOEY_SPRITE_SHIFT_COUNT 2 +// Sprite codegen emits per-shift variants. Chunky 4bpp ports (DOS, +// IIgs, Atari ST) only need 2 shifts -- pixel offset 0 (sprite/dest +// byte boundaries align) and offset 1 (every dest byte combines two +// sprite bytes' nibbles). Planar ports (Amiga -- 8 px per plane byte) +// need 8 shifts: one for each x % 8 alignment, so smooth horizontal +// motion at any pixel position uses pre-shifted source bytes without +// runtime bit-shifting. Allocate the max so routineOffsets[] has +// slots for every variant; chunky ports leave shifts 2..7 as +// SPRITE_NOT_COMPILED, planar ports use all 8. +#define JOEY_SPRITE_SHIFT_COUNT 8 typedef enum { SPRITE_FLAGS_NONE = 0 diff --git a/include/joey/surface.h b/include/joey/surface.h index 6f6e9d9..dcab6b9 100644 --- a/include/joey/surface.h +++ b/include/joey/surface.h @@ -58,4 +58,13 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path); // identity (no reallocation). bool surfaceLoadFile(SurfaceT *dst, const char *path); +// FNV-1a 32-bit hash of the surface's logical pixel content (color +// indices in row-major order, 0..15 per pixel). Same logical pixels +// produce the same hash on every port regardless of internal storage +// format -- so a hash captured on IIgs (chunky) compares directly +// against the same op's output on Amiga (planar) once the planar +// rewrite is done. Used by the UBER validation harness to +// pixel-compare ports against an IIgs golden reference. +uint32_t surfaceHash(const SurfaceT *s); + #endif diff --git a/make/amiga.mk b/make/amiga.mk index 63741b4..ecd08eb 100644 --- a/make/amiga.mk +++ b/make/amiga.mk @@ -13,7 +13,7 @@ BINDIR := $(BUILD)/bin # independently. 
-I on $(SRC_PORT)/amiga lets ptplayer.h resolve # from the port-local shim alongside our HAL code. PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) -MMD -MP $(CFLAGS_EXTRA) # OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses # CIA-B + audio.device interrupts via the OS rather than taking over # Paula directly), matching the way our HAL cooperates with Intuition. @@ -52,6 +52,7 @@ LIB_OBJS := \ $(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \ $(BUILD)/obj/port/ptplayer.o \ $(BUILD)/obj/codegen/spriteEmit68k.o \ + $(BUILD)/obj/codegen/spriteEmitPlanar68k.o \ $(BUILD)/obj/codegen/spriteCompile.o LIB := $(LIBDIR)/libjoey.a @@ -156,3 +157,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx clean-amiga: rm -rf $(BUILD) + +# Pull in per-object header-dependency files generated by gcc -MMD/-MP. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# the .c files that include it, leaving a frankenstein binary where +# different TUs see different struct layouts. 
+-include $(LIB_OBJS:.o=.d) diff --git a/make/atarist.mk b/make/atarist.mk index c8d7536..38bc119 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM) LIBDIR := $(BUILD)/lib BINDIR := $(BUILD)/bin -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) -MMD -MP LDFLAGS := # libxmp-lite shared with the DOS port. Built as a static archive that @@ -148,3 +148,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx clean-atarist: rm -rf $(BUILD) + +# Pull in per-object header-dependency files generated by gcc -MMD/-MP. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# the .c files that include it, leaving a frankenstein binary where +# different TUs see different struct layouts. +-include $(LIB_OBJS:.o=.d) diff --git a/make/dos.mk b/make/dos.mk index ee960ec..15ac57a 100644 --- a/make/dos.mk +++ b/make/dos.mk @@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM) LIBDIR := $(BUILD)/lib BINDIR := $(BUILD)/bin -CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include +CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -MMD -MP ASFLAGS := -f coff LDFLAGS := @@ -138,3 +138,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx clean-dos: rm -rf $(BUILD) + +# Pull in per-object header-dependency files generated by gcc -MMD/-MP. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# the .c files that include it, leaving a frankenstein binary where +# different TUs see different struct layouts. 
+-include $(LIB_OBJS:.o=.d) diff --git a/make/iigs.mk b/make/iigs.mk index a049d5c..d895384 100644 --- a/make/iigs.mk +++ b/make/iigs.mk @@ -51,11 +51,11 @@ IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32 LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS) -# HELLO and PATTERN are intentionally omitted from this list. The UBER -# demo (below) exercises every public API, including what those two -# small examples covered, and the IIgs disk image was running out of -# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/ -# for reference and for other ports that want them. +# HELLO is omitted from the disk because UBER exercises everything it +# does and the disk was tight. PATTERN is included as the SCB / palette +# golden-reference for cross-port debugging. +PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c +PATTERN_BIN := $(BINDIR)/PATTERN DRAW_SRC := $(EXAMPLES)/draw/draw.c DRAW_BIN := $(BINDIR)/DRAW KEYS_SRC := $(EXAMPLES)/keys/keys.c @@ -120,24 +120,44 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh # everywhere, so library asm can take SurfaceT* args via one # consistent ABI (small-mm 16-bit pointers truncated bank bytes, # which broke any asm that wanted to address bank-1 stage memory). +# Per-binary header dependency files. iix-build.sh -M emits one .d +# alongside each binary covering every header transitively included +# by the C sources in that binary's build. Pulled in via -include at +# the bottom of this file so editing a shared header (e.g. +# surfaceInternal.h) triggers a rebuild of every IIgs binary that +# transitively depends on it. 
+DEP_DIR := $(BUILD)/dep +PATTERN_DEP := $(DEP_DIR)/PATTERN.d +DRAW_DEP := $(DEP_DIR)/DRAW.d +KEYS_DEP := $(DEP_DIR)/KEYS.d +JOY_DEP := $(DEP_DIR)/JOY.d +SPRITE_DEP := $(DEP_DIR)/SPRITE.d +UBER_DEP := $(DEP_DIR)/UBER.d +AUDIO_DEP := $(DEP_DIR)/AUDIO.d + +$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(PATTERN_DEP) $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS) + $(IIGS_IIX) chtyp -t S16 $@ + $(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(DRAW_DEP) $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ $(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(KEYS_DEP) $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ $(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(JOY_DEP) $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(SPRITE_DEP) $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # UBER bumps user stack to 16 KB. ORCA-C's default user stack is small @@ -147,8 +167,8 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) # decimal formatter in uber.c also uses larger stack-local buffers # (line[96], num[16]) than typical demos. 16 KB is plenty of headroom. 
$(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -s 16384 -M $(UBER_DEP) $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime @@ -170,17 +190,23 @@ AUDIO_DATA_FILES := $(AUDIO_SFX) endif $(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS) + @mkdir -p $(dir $@) $(DEP_DIR) + $(IIGS_BUILD) -b -M $(AUDIO_DEP) $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # Assemble a ProDOS 2img containing the examples, ready to mount in # GSplus alongside a GS/OS boot volume. iigs-disk: $(DISK_IMG) -$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) +$(DISK_IMG): $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) @mkdir -p $(dir $@) - $(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES) + $(IIGS_PACKAGE) $@ $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES) clean-iigs: rm -rf $(BUILD) + +# Pull in per-binary header-dependency files generated by iix-build.sh -M. +# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild +# IIgs binaries that include it -- the IIgs's iix toolchain has no native +# -MMD analog, so iix-build.sh shells out to host gcc for the scan. 
+-include $(PATTERN_DEP) $(DRAW_DEP) $(KEYS_DEP) $(JOY_DEP) $(SPRITE_DEP) $(UBER_DEP) $(AUDIO_DEP) diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index a01c59f..750283f 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -14,6 +14,7 @@ #include "joey/sprite.h" #include "joey/surface.h" #include "codegenArenaInternal.h" +#include "hal.h" #include "spriteEmitter.h" #include "spriteInternal.h" #include "surfaceInternal.h" @@ -33,7 +34,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitDrawX86(out, sp, shift); -#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) +#elif defined(JOEYLIB_PLATFORM_AMIGA) + return spriteEmitDrawPlanar68k(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitDraw68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitDrawIigs(out, sp, shift); @@ -51,7 +54,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitSaveX86(out, sp, shift); -#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) +#elif defined(JOEYLIB_PLATFORM_AMIGA) + return spriteEmitSavePlanar68k(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitSave68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitSaveIigs(out, sp, shift); @@ -65,7 +70,9 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitRestoreX86(out, sp, shift); -#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) +#elif defined(JOEYLIB_PLATFORM_AMIGA) + return spriteEmitRestorePlanar68k(out, sp, shift); 
+#elif defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitRestore68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitRestoreIigs(out, sp, shift); @@ -114,6 +121,13 @@ bool spriteCompile(SpriteT *sp) { if (sp->tileData == NULL) { return false; } + /* Amiga (post-Phase 9) uses spriteEmitPlanar68k.c which writes + * directly to bitplanes. DRAW emits a unique pre-shifted variant + * per shift in 0..7 (smooth horizontal motion at any pixel x); + * SAVE/RESTORE emit only shift 0 and shift 1 since shifted variants + * 1..7 share identical bytes (plain memcpy of widthTiles+1 plane + * bytes per row). The post-emit pass below aliases slots 2..7 + * for save/restore to slot 1's bytes. */ scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES); if (scratch == NULL) { @@ -150,6 +164,16 @@ bool spriteCompile(SpriteT *sp) { } } } +#if defined(JOEYLIB_PLATFORM_AMIGA) + /* Save/restore bytes for any non-zero shift are identical (plain + * memcpy of widthTiles+1 plane bytes per row). The emitter emits + * them once at slot 1; alias slots 2..7 here so the dispatcher + * gate (sprite.c) sees them as compiled. */ + for (shift = 2; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { + sp->routineOffsets[shift][SPRITE_OP_SAVE] = sp->routineOffsets[1][SPRITE_OP_SAVE]; + sp->routineOffsets[shift][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE]; + } +#endif sp->slot = slot; free(scratch); return true; @@ -554,6 +578,112 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { } } +#elif defined(JOEYLIB_PLATFORM_AMIGA) + +/* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with + * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to + * bitplanes. Compute byteOff = y*40 + x/8 and pass plane[i]+byteOff + * as the 4 plane args. shift = x % 8 selects the variant; today only + * shift 0 emits non-zero bytes, so callers should already have + * gated on routineOffsets[shift][op] != SPRITE_NOT_COMPILED. 
+ * + * For non-zero shifts (x not 8-px-aligned), the dispatcher in + * src/core/sprite.c (spriteDraw / spriteSaveUnder / spriteRestoreUnder) + * sees SPRITE_NOT_COMPILED for the shift and falls back to the + * interpreter, which handles arbitrary x via halSpriteDrawPlanes / + * halSpriteSavePlanes / halSpriteRestorePlanes. */ + +#define AMIGA_BYTES_PER_ROW_LOCAL 40 + +void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { + typedef void (*DrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3); + uint8_t shift; + uint16_t byteOff; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + DrawFn fn; + + shift = (uint8_t)(x & 7); + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)x >> 3)); + p0 = halSurfacePlanePtr(dst, 0); if (p0 == NULL) return; + p1 = halSurfacePlanePtr(dst, 1); + p2 = halSurfacePlanePtr(dst, 2); + p3 = halSurfacePlanePtr(dst, 3); + fn = (DrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_DRAW]); + fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff); +} + + +void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { + typedef void (*SaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup); + uint8_t shift; + int16_t clippedX; + uint16_t widthPx; + uint16_t heightPx; + uint16_t byteOff; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + SaveFn fn; + + shift = (uint8_t)(x & 7); + clippedX = (int16_t)(x & ~7); + widthPx = (uint16_t)(sp->widthTiles * 8); + heightPx = (uint16_t)(sp->heightTiles * 8); + /* Shifts 1..7 spill into one extra plane byte per row (= +8 px). 
*/ + if (shift != 0u) { + widthPx = (uint16_t)(widthPx + 8u); + } + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)clippedX >> 3)); + + backup->sprite = sp; + backup->x = clippedX; + backup->y = y; + backup->width = widthPx; + backup->height = heightPx; + /* 4 planes * h * (widthPx/8) bytes = h * widthPx/2. */ + backup->sizeBytes = (uint16_t)((uint16_t)heightPx * (widthPx >> 1)); + + p0 = halSurfacePlanePtr(src, 0); if (p0 == NULL) return; + p1 = halSurfacePlanePtr(src, 1); + p2 = halSurfacePlanePtr(src, 2); + p3 = halSurfacePlanePtr(src, 3); + fn = (SaveFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]); + fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); +} + + +void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { + typedef void (*RestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup); + SpriteT *sp; + uint8_t shift; + uint16_t byteOff; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + RestoreFn fn; + + sp = backup->sprite; + /* backup->x is 8-px aligned (clippedX from save), so x & 7 is + * useless for picking the original shift. Encode it via + * backup->width: == widthTiles*8 means shift 0; > means shifted. + * Shifted slots 1..7 all alias to the same restore bytes, so + * slot 1 stands in for any non-zero shift. */ + shift = (uint8_t)(backup->width > (uint16_t)(sp->widthTiles * 8) ? 
1u : 0u); + byteOff = (uint16_t)((uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)backup->x >> 3)); + + p0 = halSurfacePlanePtr(dst, 0); if (p0 == NULL) return; + p1 = halSurfacePlanePtr(dst, 1); + p2 = halSurfacePlanePtr(dst, 2); + p3 = halSurfacePlanePtr(dst, 3); + fn = (RestoreFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]); + fn(p0 + byteOff, p1 + byteOff, p2 + byteOff, p3 + byteOff, backup->bytes); +} + #else void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { diff --git a/src/codegen/spriteEmit68k.c b/src/codegen/spriteEmit68k.c index b86851c..3afb2fa 100644 --- a/src/codegen/spriteEmit68k.c +++ b/src/codegen/spriteEmit68k.c @@ -166,6 +166,13 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint8_t value; uint8_t opaqueMask; + // Chunky 4bpp has only two nibble-alignment positions; the + // dispatcher uses x & 1 so shifts 2..7 are unreachable. Bail + // early so the arena slot stays SPRITE_NOT_COMPILED. + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); @@ -225,6 +232,10 @@ uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); @@ -248,6 +259,10 @@ uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 
1u : 0u)); diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c index 4975c1c..a960297 100644 --- a/src/codegen/spriteEmitIigs.c +++ b/src/codegen/spriteEmitIigs.c @@ -189,6 +189,10 @@ uint16_t spriteEmitSaveIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t spriteBytesPerRow; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); copyBytes = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); @@ -205,6 +209,10 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t spriteBytesPerRow; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); copyBytes = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); @@ -258,6 +266,10 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint8_t nextOpaqueMask; bool wide; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); diff --git a/src/codegen/spriteEmitPlanar68k.c b/src/codegen/spriteEmitPlanar68k.c new file mode 100644 index 0000000..d201e80 --- /dev/null +++ b/src/codegen/spriteEmitPlanar68k.c @@ -0,0 +1,505 @@ +// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow). +// +// Emits PIC routines that write directly to the four bitplanes via 4 +// address-register pointers (a0..a3 = plane[0..3] base + byteOff, +// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this). +// +// Calling convention (cdecl on m68k-amigaos-gcc): +// draw(p0, p1, p2, p3): +// args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane. +// loaded into a0..a3 by the prologue. +// save(p0, p1, p2, p3, backup): +// 5 args; backup at 20(sp), loaded into a4. 
+// restore(p0, p1, p2, p3, backup): +// same as save but reads backup, writes planes. +// +// Per-byte plane write encoding decisions: +// - all-transparent (mask=0): skip the byte entirely +// - all-opaque (mask=0xFF): move.b #imm, d16(an) (6 bytes) +// - mixed (0> 8) & 0xFFu); + out[1] = (uint8_t)(value & 0xFFu); + return 2u; +} + + +// movea.l , an -- load arg at SP+disp into An. +// Encoding: 0010 nnn 001 010 111 + disp16 +// = 0x2057 + (n << 9), where n is dst An. +// a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F. +static const uint16_t kMoveaSpToAn[] = { + 0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu +}; + + +// adda.w #imm, an -- adds 16-bit signed imm to An (sign-extended). +// Encoding: 1101 nnn 011 111 100 + imm +// = 0xD0FC + (n << 9). +static const uint16_t kAddaWImmToAn[] = { + 0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu +}; + + +// ANDI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half). +// Opcode: 0000 0010 00 000 000 (size=byte, mode=Dn, reg=D0) +#define ANDI_B_IMM_D0 0x0200u + +// ORI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half). +// Opcode: 0000 0000 00 000 000 +#define ORI_B_IMM_D0 0x0000u + + +// MOVE.B d16(An), D0 -- 4 bytes (opcode + disp). +// Encoding: 0001 000 000 mode reg +// = size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn), +// src mode=101 (d16,An), src reg=An. +// = 0001000 000 101 nnn = 0x1028 + An. +static const uint16_t kMoveBD16AnToD0[] = { + 0x1028u, 0x1029u, 0x102Au, 0x102Bu +}; + + +// MOVE.B D0, d16(An) -- 4 bytes (opcode + disp). +// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9). +static const uint16_t kMoveBD0ToD16An[] = { + 0x1140u, 0x1340u, 0x1540u, 0x1740u +}; + + +// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp). +// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9). +// (Was 0x113C earlier -- that's mode=100=predec; mode=101=d16(An) +// is the bit difference. 
Predec emits a 4-byte instruction with no +// disp word, so the byte stream went out of sync and every +// subsequent instruction decoded into garbage.) +static const uint16_t kMoveBImmToD16An[] = { + 0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu +}; + + +// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp). -- used by save/restore (backup in a4) +// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9). +static const uint16_t kMoveBA4PostincToD16An[] = { + 0x115Cu, 0x135Cu, 0x155Cu, 0x175Cu +}; + + +// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp). -- used by save (planes -> backup) +// Encoding: 1001 100 011 mode reg +// Wait, MOVE.B src,(a4)+ : dst mode = 011 (an+), dst reg = 100 (A4), +// so dst reg=100, dst mode=011 -> opcode high = 0001 100 011 ... +// = 0001100011 mode reg = 0x18C0.. +// 0001 100 011 101 nnn = 0x18E8 + An. +static const uint16_t kMoveBD16AnToA4Postinc[] = { + 0x18E8u, 0x18E9u, 0x18EAu, 0x18EBu +}; + + +// MOVEM.L reglist, -(SP) -- 4 bytes (opcode + reglist mask). +// Opcode 0x48E7. Predec mask is REVERSED vs all other modes: +// bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2, +// bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7. +#define MOVEM_L_PUSH_OPCODE 0x48E7u +#define MOVEM_L_MASK_A2_A3 0x0030u /* bits 5,4 = A2,A3 (predec order) */ +#define MOVEM_L_MASK_A2_A3_A4 0x0038u /* bits 5,4,3 = A2,A3,A4 */ + +// MOVEM.L (SP)+, reglist -- 4 bytes (opcode + reglist mask). +// Opcode 0x4CDF. Postinc mask follows the standard layout: +// bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7. +#define MOVEM_L_POP_OPCODE 0x4CDFu +#define MOVEM_L_MASK_POP_A2_A3 0x0C00u /* bits 11,10 = A3,A2 */ +#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u /* bits 12,11,10 = A4,A3,A2 */ + +// RTS opcode. +#define OPCODE_RTS 0x4E75u + + +// ----- Emit helpers ----- + +// For shift 0 (byte-aligned x), the sprite's chunky tile data converts +// directly to plane bytes without any sub-byte shifting. 
For each +// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky +// bytes (= 8 pixels) and produce one plane byte; we also produce a +// mask byte indicating which pixel positions are non-transparent +// (any plane bit != 0 in the source means non-transparent if +// transparent index is 0, the JoeyLib convention). +// +// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows +// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the +// sprite. For plane-byte column `c` of row `r`: +// tileX = c (since each plane byte covers exactly one tile column) +// tileY = r / 8 +// inTileY = r % 8 +// chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3 +// +// `col` must be in [0, widthTiles); callers handle out-of-range cols +// (used when computing shifted variants that span widthTiles+1 output +// bytes per row) by passing a sentinel and checking against widthTiles +// before invoking this helper. +static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col, + uint8_t *planeBytes /*[4]*/, uint8_t *maskByte) +{ + uint16_t tileX; + uint16_t tileY; + uint16_t inTileY; + const uint8_t *tile; + const uint8_t *chunky; + uint8_t nibbles[8]; + uint8_t b0, b1, b2, b3; + uint16_t p; + uint8_t bitMask; + uint8_t pix; + + tileX = col; + tileY = row >> 3; + inTileY = row & 7u; + + tile = sp->tileData + (uint32_t)((tileY * sp->widthTiles + tileX) * 32u); + chunky = tile + inTileY * 4u; + + nibbles[0] = (uint8_t)(chunky[0] >> 4); + nibbles[1] = (uint8_t)(chunky[0] & 0x0Fu); + nibbles[2] = (uint8_t)(chunky[1] >> 4); + nibbles[3] = (uint8_t)(chunky[1] & 0x0Fu); + nibbles[4] = (uint8_t)(chunky[2] >> 4); + nibbles[5] = (uint8_t)(chunky[2] & 0x0Fu); + nibbles[6] = (uint8_t)(chunky[3] >> 4); + nibbles[7] = (uint8_t)(chunky[3] & 0x0Fu); + + b0 = 0u; b1 = 0u; b2 = 0u; b3 = 0u; + *maskByte = 0u; + for (p = 0; p < 8u; p++) { + pix = nibbles[p]; + if (pix == TRANSPARENT_NIBBLE) { + continue; + } + bitMask = (uint8_t)(0x80u >> 
p); + *maskByte = (uint8_t)(*maskByte | bitMask); + if (pix & 1u) b0 = (uint8_t)(b0 | bitMask); + if (pix & 2u) b1 = (uint8_t)(b1 | bitMask); + if (pix & 4u) b2 = (uint8_t)(b2 | bitMask); + if (pix & 8u) b3 = (uint8_t)(b3 | bitMask); + } + planeBytes[0] = b0; + planeBytes[1] = b1; + planeBytes[2] = b2; + planeBytes[3] = b3; +} + + +// Shifted variant: produces 4 plane bytes and 1 mask byte for output +// column `outCol` (0..widthTiles inclusive) of row `row` when the +// sprite is shifted right by `shift` pixels (1..7). For shift 0, +// callers should use planeByteAndMaskAt directly (faster, no spill). +// +// Each output byte is composed of bits drawn from up to two source +// plane bytes: +// leftPart = src[outCol-1] << (8 - shift) (high (shift) bits) +// rightPart = src[outCol] >> shift (low (8-shift) bits) +// with src[-1] and src[widthTiles] treated as 0/transparent. The +// resulting plane byte is leftPart | rightPart; the mask byte is the +// shifted union of the per-byte source masks. 
+static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol, + uint8_t shift, uint16_t widthTiles, + uint8_t *planeBytes /*[4]*/, uint8_t *maskByte) +{ + uint8_t leftPlanes[AMIGA_BITPLANES]; + uint8_t leftMask; + uint8_t rightPlanes[AMIGA_BITPLANES]; + uint8_t rightMask; + uint8_t i; + + leftMask = 0u; + rightMask = 0u; + for (i = 0; i < AMIGA_BITPLANES; i++) { + leftPlanes[i] = 0u; + rightPlanes[i] = 0u; + } + + if (outCol > 0u && (uint16_t)(outCol - 1u) < widthTiles) { + planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), leftPlanes, &leftMask); + } + if (outCol < widthTiles) { + planeByteAndMaskAt(sp, row, outCol, rightPlanes, &rightMask); + } + + *maskByte = (uint8_t)(((leftMask << (8u - shift)) & 0xFFu) | + ((rightMask >> shift) & 0xFFu)); + for (i = 0; i < AMIGA_BITPLANES; i++) { + planeBytes[i] = (uint8_t)(((leftPlanes[i] << (8u - shift)) & 0xFFu) | + ((rightPlanes[i] >> shift) & 0xFFu)); + } +} + + +// Emit code that merges one plane byte into d16(an) where d16 is the +// row-relative byte offset (0 since we re-base each row by adda.w). +// The choice of all-opaque vs mixed encoding cuts code size when many +// pixels are opaque (typical for sprite interiors). +static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor, + uint8_t an, uint8_t disp, + uint8_t maskByte, uint8_t srcByte) +{ + if (maskByte == 0u) { + return cursor; /* nothing to write */ + } + if (maskByte == 0xFFu) { + /* All-opaque shortcut: move.b #src, d16(an). */ + cursor += writeBE16(out + cursor, kMoveBImmToD16An[an]); + cursor += writeBE16(out + cursor, (uint16_t)srcByte); + cursor += writeBE16(out + cursor, (uint16_t)disp); + return cursor; + } + /* Mixed: load existing, clear mask bits, OR in src, write back. 
*/ + cursor += writeBE16(out + cursor, kMoveBD16AnToD0[an]); + cursor += writeBE16(out + cursor, (uint16_t)disp); + cursor += writeBE16(out + cursor, ANDI_B_IMM_D0); + cursor += writeBE16(out + cursor, (uint16_t)((~maskByte) & 0xFFu)); + cursor += writeBE16(out + cursor, ORI_B_IMM_D0); + cursor += writeBE16(out + cursor, (uint16_t)srcByte); + cursor += writeBE16(out + cursor, kMoveBD0ToD16An[an]); + cursor += writeBE16(out + cursor, (uint16_t)disp); + return cursor; +} + + +// ----- Public API ----- + +uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t heightPx; + uint16_t widthTiles; + uint16_t bytesPerRow; /* per plane, per row */ + uint8_t planeBytes[AMIGA_BITPLANES]; + uint8_t maskByte; + uint8_t i; + + if (shift > 7u) { + return 0u; + } + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + widthTiles = (uint16_t)sp->widthTiles; + bytesPerRow = (uint16_t)(widthTiles + (shift == 0u ? 0u : 1u)); + + /* Prologue: m68k cdecl callee-saves a2-a6; we clobber a2 and a3 + * loading plane pointers, so push them first. After the push, all + * stack arg displacements shift by +8 (two longs). 
*/ + cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3); + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(8u + 4u + i * 4u)); + } + + for (row = 0; row < heightPx; row++) { + for (col = 0; col < bytesPerRow; col++) { + if (shift == 0u) { + planeByteAndMaskAt(sp, row, col, planeBytes, &maskByte); + } else { + planeByteAndMaskShifted(sp, row, col, shift, widthTiles, planeBytes, &maskByte); + } + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor = emitMergeByteToD16An(out, cursor, i, (uint8_t)col, + maskByte, planeBytes[i]); + } + } + if (row + 1u < heightPx) { + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); + } + } + } + + /* Epilogue: restore a2-a3, rts. */ + cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3); + cursor += writeBE16(out + cursor, OPCODE_RTS); + return cursor; +} + + +// SAVE: planes -> backup. backup is one contiguous 4*H*W/8 byte buffer +// laid out as 4 plane stripes, matching halSpriteSavePlanes format +// (so cross-platform save buffer is interchangeable). +// +// Per row: for each plane, copy bytesPerRow bytes from d16(an) to +// (a4)+. After the row's reads, the planes need to advance by 40, +// while a4 advances naturally via post-increment. +// +// Plane stripes are sequential in backup. We could either (a) do all +// rows of plane 0, then plane 1, etc. (matches halSpriteSavePlanes +// layout), or (b) interleave rows of all 4 planes (different layout). +// halSpriteSavePlanes does (a) -- 4 separate plane stripes. The +// emitted code below matches that layout for compat. 
+uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t heightPx; + uint16_t bytesPerRow; + uint8_t i; + + /* Shifts 2..7 reuse shift 1's bytes (identical memcpy). The + * spriteCompile post-emit pass aliases their routineOffsets to + * slot 1 so this routine is emitted once. */ + if (shift > 1u) { + return 0u; + } + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); + + /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane + * pointers + backup pointer. After the push, all stack arg disps + * shift by +12 (three longs). */ + cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); + } + /* a4 = backup. */ + cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); + + /* Plane-major: for each plane, walk all rows. After this routine, + * each An has advanced by H*40 (one frame full); we don't need to + * unwind because the function returns. We DO need to reset An + * back to start before walking the NEXT plane though. + * + * Simpler alternative: row-major (interleaved). Per row, copy + * bytesPerRow bytes from each plane to (a4)+, then advance all + * 4 planes by 40. Net: a4 advances by 4*H*bytesPerRow; planes + * advance by H*40. Backup layout becomes interleaved (plane0_row0, + * plane1_row0, plane2_row0, plane3_row0, plane0_row1, ...). + * + * That doesn't match halSpriteSavePlanes' plane-major layout. Need + * to either (a) match it -- emit per-plane outer loop with a4 + * stride between planes -- or (b) change halSpriteSavePlanes to + * interleaved. 
Picking (b) is simpler in emitted code, but ALSO + * requires updating halSpriteRestorePlanes and halSpriteRestoreUnder + * fallback math. + * + * For now: use plane-major matching halSpriteSavePlanes. Per + * plane: walk rows, copy bytes from d16(an) to (a4)+, advance an + * by 40 after each row except the last; reset an back to start + * before next plane. */ + for (i = 0; i < AMIGA_BITPLANES; i++) { + for (row = 0; row < heightPx; row++) { + for (col = 0; col < bytesPerRow; col++) { + cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]); + cursor += writeBE16(out + cursor, (uint16_t)col); + } + if (row + 1u < heightPx) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); + } + } + /* Reset An back to the plane base for next iteration. The + * total advance was (heightPx - 1) * 40. Subtract that. */ + if (i + 1u < AMIGA_BITPLANES) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); + } + } + + cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); + cursor += writeBE16(out + cursor, OPCODE_RTS); + return cursor; +} + + +// RESTORE: backup -> planes. Mirror of save. Uses MOVE.B (a4)+, d16(an). +uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t row; + uint16_t col; + uint16_t heightPx; + uint16_t bytesPerRow; + uint8_t i; + + if (shift > 1u) { + return 0u; + } + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); + + /* Callee-save a2/a3/a4; arg disps shift by +12. 
*/ + cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); + for (i = 0; i < AMIGA_BITPLANES; i++) { + cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); + } + cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); + cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); + + for (i = 0; i < AMIGA_BITPLANES; i++) { + for (row = 0; row < heightPx; row++) { + for (col = 0; col < bytesPerRow; col++) { + cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]); + cursor += writeBE16(out + cursor, (uint16_t)col); + } + if (row + 1u < heightPx) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); + } + } + if (i + 1u < AMIGA_BITPLANES) { + cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); + cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); + } + } + + cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); + cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); + cursor += writeBE16(out + cursor, OPCODE_RTS); + return cursor; +} diff --git a/src/codegen/spriteEmitX86.c b/src/codegen/spriteEmitX86.c index b0c1bbf..226eb9c 100644 --- a/src/codegen/spriteEmitX86.c +++ b/src/codegen/spriteEmitX86.c @@ -200,6 +200,10 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint8_t v3; uint8_t m; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); @@ -313,6 +317,10 @@ uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 
1u : 0u)); @@ -339,6 +347,10 @@ uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t heightPx; uint16_t copyBytes; + if (shift > 1u) { + return 0u; + } + cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); diff --git a/src/codegen/spriteEmitter.h b/src/codegen/spriteEmitter.h index 8fbe359..acd7169 100644 --- a/src/codegen/spriteEmitter.h +++ b/src/codegen/spriteEmitter.h @@ -42,4 +42,19 @@ uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitSave68k (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +// Planar 68k emitters (Amiga). Distinct from the chunky 68k emitters +// above because the destination addressing is across 4 separate +// bitplane buffers, not a single packed-pixel surface. Calling +// convention for the emitted bytes (cdecl): +// void draw (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3); +// void save (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup); +// void restore (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup); +// Each pi is plane_base + byteOff (= y*40 + x/8 already added by the +// dispatcher). Returns 0 for shifts not yet implemented (today only +// shift 0 == byte-aligned x is emitted; shifts 1..7 fall back to the +// cross-platform interpreter). +uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift); + #endif diff --git a/src/core/debug.c b/src/core/debug.c index b748ac2..8dbad17 100644 --- a/src/core/debug.c +++ b/src/core/debug.c @@ -1,11 +1,18 @@ -// Cross-platform "where did it hang?" logger. 
Each call opens -// joeylog.txt, appends a line, fflushes, closes. Slow but durable -// -- the last line in the file is guaranteed to be on disk before -// any subsequent operation that might hang. +// Cross-platform "where did it hang?" logger. Holds joeylog.txt open +// across calls; libc's stdio buffer absorbs writes (~4 KB) and the +// final fclose at program exit (via atexit) gets the buffer to disk. // -// Build only as needed for diagnostics; remove the calls when the -// bug is fixed. The hang on ST kept us looking at the wrong layer -// without this kind of trace. +// Earlier rev opened+closed per call for crash durability ("last line +// guaranteed on disk if we hang"); that cost ~1 second per call +// through GoldenGate's ProDOS FST emulation -- a 50-line UBER run +// burned ~5 minutes in IO. Even per-line fflush is too expensive +// because every fflush forces an FST WRITE, and host-OS file IO time +// isn't tracked by the IIgs VBL counter so wall-time logs underreport. +// +// Tradeoff: if the program crashes mid-run, buffered log lines may +// not reach disk. For UBER and similar batch demos that's acceptable; +// for hang-debugging where durability matters, call joeyLogFlush() +// at the suspected hang points. #include #include @@ -15,6 +22,27 @@ static const char *kLogPath = "joeylog.txt"; +static FILE *gLogFp = NULL; +/* 16 KB is enough for UBER's full log (~5 KB) plus generous headroom, + * so the file never auto-flushes mid-run. ORCA-C / libnix default + * buffers are only ~512 bytes; with that, a 50-line log triggers ~10 + * ProDOS / AmigaDOS WRITEs through the host FST, each of which is + * untracked-host-time (seconds). Buffer the whole thing in memory and + * let the atexit fclose flush once. */ +#define JOEY_LOG_BUF_BYTES 16384 +static char gLogBuf[JOEY_LOG_BUF_BYTES]; + + +/* Lazy-open. Returns NULL if the open failed (silently disable). 
*/ +static FILE *logFile(void) { + if (gLogFp == NULL) { + gLogFp = fopen(kLogPath, "a"); + if (gLogFp != NULL) { + (void)setvbuf(gLogFp, gLogBuf, _IOFBF, sizeof(gLogBuf)); + } + } + return gLogFp; +} void joeyLog(const char *msg) { @@ -22,13 +50,12 @@ void joeyLog(const char *msg) { if (msg == NULL) { return; } - fp = fopen(kLogPath, "a"); + fp = logFile(); if (fp == NULL) { return; } fputs(msg, fp); fputc('\n', fp); - fclose(fp); } @@ -38,7 +65,7 @@ void joeyLogF(const char *fmt, ...) { if (fmt == NULL) { return; } - fp = fopen(kLogPath, "a"); + fp = logFile(); if (fp == NULL) { return; } @@ -46,14 +73,27 @@ void joeyLogF(const char *fmt, ...) { vfprintf(fp, fmt, args); va_end(args); fputc('\n', fp); - fclose(fp); +} + + +void joeyLogFlush(void) { + if (gLogFp != NULL) { + fflush(gLogFp); + } } void joeyLogReset(void) { - FILE *fp; - fp = fopen(kLogPath, "w"); - if (fp != NULL) { - fclose(fp); + if (gLogFp != NULL) { + fclose(gLogFp); + gLogFp = NULL; + } + /* Truncate by opening for write then closing; subsequent + * joeyLog* will reopen for append. */ + { + FILE *fp = fopen(kLogPath, "w"); + if (fp != NULL) { + fclose(fp); + } } } diff --git a/src/core/draw.c b/src/core/draw.c index c60d092..802351e 100644 --- a/src/core/draw.c +++ b/src/core/draw.c @@ -186,13 +186,17 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 continue; } - // Highest-tier asm fast path: seed-test + walk-left + walk-right - // + 1-row fill + scan-above + scan-below + push, all in one - // cross-segment call. The asm caches row addr / match decoder - // across every sub-operation. C just pops and dispatches; this - // path completes the entire per-seed work and computes the row - // address itself, so we don't pay y*160 in C unless we fall back. - { + /* Phase 9: planar ports have NULL s->pixels and the asm fast + * paths take a chunky-row pointer. Skip them on planar; the C + * fallback below uses halSamplePixel which works on both + * storage layouts. 
*/ + if (s->pixels != NULL) { + // Highest-tier asm fast path: seed-test + walk-left + walk-right + // + 1-row fill + scan-above + scan-below + push, all in one + // cross-segment call. The asm caches row addr / match decoder + // across every sub-operation. C just pops and dispatches; this + // path completes the entire per-seed work and computes the row + // address itself, so we don't pay y*160 in C unless we fall back. bool seedMatched; if (halFastFloodWalkAndScans(s->pixels, x, y, matchColor, newNibble, matchEqual, @@ -203,22 +207,27 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 } } - // Fallback path needs row; compute it here so the asm path - // above doesn't pay for an unused y*160 multiply on every iter. - row = &s->pixels[SURFACE_ROW_OFFSET(y)]; + /* Fallback path: compute row only if chunky; halFastFloodWalk + * needs it but isn't implemented on Amiga. */ + row = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(y)] : NULL; // Tier-2 asm fast path: combined seed test + walk-left + // walk-right in one cross-segment call. Falls back to the // pure-C walks below on ports without an asm implementation. { bool seedMatched; - if (halFastFloodWalk(row, x, matchColor, newNibble, matchEqual, - &seedMatched, &leftX, &rightX)) { + if (row != NULL && halFastFloodWalk(row, x, matchColor, newNibble, matchEqual, + &seedMatched, &leftX, &rightX)) { + if (!seedMatched) { + continue; + } + } else if (halFloodWalkPlanes(s, x, y, matchColor, newNibble, matchEqual, + &seedMatched, &leftX, &rightX)) { if (!seedMatched) { continue; } } else { - pix = srcPixel(row, x); + pix = halSamplePixel(s, x, y); pixMatch = (pix == matchColor); if (matchEqual) { if (!pixMatch) { @@ -233,7 +242,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Walk left to find the start of the matching run. 
leftX = x; while (leftX > 0) { - pix = srcPixel(row, (int16_t)(leftX - 1)); + pix = halSamplePixel(s, (int16_t)(leftX - 1), y); pixMatch = (pix == matchColor); if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) { break; @@ -244,7 +253,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Walk right to find the end. rightX = x; while (rightX < SURFACE_WIDTH - 1) { - pix = srcPixel(row, (int16_t)(rightX + 1)); + pix = halSamplePixel(s, (int16_t)(rightX + 1), y); pixMatch = (pix == matchColor); if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) { break; @@ -256,12 +265,18 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Fill the span. Bypass fillRect's clipping wrapper: walk-out // already guaranteed leftX/rightX are in [0..SURFACE_WIDTH-1] - // and the seed-pop bounds check did the same for y. + // and the seed-pop bounds check did the same for y. We DO + // need the planar dual-write (which fillRect's wrapper would + // call), so invoke halFillRectPlanes explicitly after the + // chunky span fill -- otherwise PLANAR_PRESENT builds (and, + // post-Phase-9, every build) display flood-filled regions + // as the unfilled background. { int16_t spanW = (int16_t)(rightX - leftX + 1); if (!halFastFillRect(s, leftX, y, (uint16_t)spanW, 1, newNibble)) { fillRectClipped(s, leftX, y, spanW, 1, newNibble); } + halFillRectPlanes(s, leftX, y, (uint16_t)spanW, 1, newNibble); } // Scan rows above and below for run boundaries. The hot @@ -291,19 +306,26 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 } scanY = (int16_t)(y + 1); } - scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)]; + scanRow = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(scanY)] : NULL; // Prefer the combined scan+push asm path (one call per - // scan, no markBuf and no per-pixel C edge walk). 
- if (!halFastFloodScanAndPush(scanRow, leftX, rightX, + // scan, no markBuf and no per-pixel C edge walk). Skip + // the asm tiers if we don't have a chunky row pointer + // (Phase 9 planar ports). + if (scanRow == NULL || + !halFastFloodScanAndPush(scanRow, leftX, rightX, matchColor, newNibble, matchEqual, scanY, stackX, stackY, &sp, FLOOD_STACK_SIZE)) { - if (!halFastFloodScanRow(scanRow, leftX, rightX, - matchColor, newNibble, matchEqual, - floodMarkBuf)) { + if ((scanRow == NULL || + !halFastFloodScanRow(scanRow, leftX, rightX, + matchColor, newNibble, matchEqual, + floodMarkBuf)) && + !halFloodScanRowPlanes(s, leftX, rightX, scanY, + matchColor, newNibble, matchEqual, + floodMarkBuf)) { // C fallback: fill markBuf the slow way. for (i = 0; i < spanLen; i++) { - pix = srcPixel(scanRow, (int16_t)(leftX + i)); + pix = halSamplePixel(s, (int16_t)(leftX + i), scanY); pixMatch = (pix == matchColor); floodMarkBuf[i] = (uint8_t)(matchEqual ? (pixMatch ? 1 : 0) @@ -621,12 +643,12 @@ void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t if (!halFastFillRect(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex)) { fillRectClipped(s, sx, sy, sw, sh, colorIndex); } + halFillRectPlanes(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex); surfaceMarkDirtyRect(s, sx, sy, sw, sh); } void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { - uint8_t *row; uint8_t seedColor; if (s == NULL) { @@ -635,8 +657,9 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return; } - row = &s->pixels[SURFACE_ROW_OFFSET(y)]; - seedColor = srcPixel(row, x); + /* halSamplePixel reads from whichever storage the port uses -- + * works on both chunky (s->pixels) and planar (s->portData) ports. 
*/ + seedColor = halSamplePixel(s, x, y); if ((seedColor & 0x0F) == (newColor & 0x0F)) { return; } @@ -645,7 +668,6 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor) { - uint8_t *row; uint8_t pix; if (s == NULL) { @@ -654,8 +676,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8 if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return; } - row = &s->pixels[SURFACE_ROW_OFFSET(y)]; - pix = srcPixel(row, x); + pix = halSamplePixel(s, x, y); // Starting on a boundary pixel or already-filled pixel: nothing // to do. if ((pix & 0x0F) == (boundaryColor & 0x0F)) { @@ -669,25 +690,16 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8 uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) { - uint8_t byte; - if (s == NULL) { return 0; } if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return 0; } - - /* Cast to uint16_t before shift -- already validated x >= 0, - * unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */ - byte = s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; - if (x & 1) { - return (uint8_t)(byte & 0x0F); - } - /* `byte >> 4` is uint8_t but ORCA-C promotes to int (signed 16-bit) - * for the shift, then narrows -- triggers ~SSHIFTRIGHT. The - * mask-then-shift sidesteps the promotion path. */ - return (uint8_t)((byte & 0xF0u) >> 4); + /* halSamplePixel reads from whichever storage the port uses -- + * chunky ports return a nibble extracted from s->pixels; planar + * ports read 4 plane bits and assemble the nibble. 
*/ + return halSamplePixel(s, x, y); } @@ -725,6 +737,8 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) { } } } + halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0, + copyW, copyH, srcRowBytes, 0xFFFFu); surfaceMarkDirtyRect(dst, x, y, copyW, copyH); } @@ -768,6 +782,8 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t } } } + halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0, + copyW, copyH, srcRowBytes, (uint16_t)transparent); surfaceMarkDirtyRect(dst, x, y, copyW, copyH); } diff --git a/src/core/hal.h b/src/core/hal.h index fec0777..05cdbd8 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -9,8 +9,11 @@ #ifndef JOEYLIB_HAL_H #define JOEYLIB_HAL_H +#include + #include "joey/core.h" #include "joey/input.h" +#include "joey/sprite.h" #include "joey/surface.h" // Per-port one-shot initialization. Called from joeyInit after config @@ -27,17 +30,131 @@ void halShutdown(void); // backs the library-owned stage surface. Ports that have a // hardware-friendly pin location for the back buffer (IIgs $01/2000 // with SHR shadow inhibited) return that address here; ports with no -// such constraint just malloc/free. +// such constraint just malloc/free. Planar 68k ports may return NULL +// if the surface is planar-only and has no chunky shadow. uint8_t *halStageAllocPixels(void); void halStageFreePixels(uint8_t *pixels); -// Present the entire source surface to the display. -void halPresent(const SurfaceT *src); +// Allocate / release the per-surface portData blob (see SurfaceT in +// surfaceInternal.h). Chunky ports return NULL from Init -- they keep +// portData unused and operate on the chunky `pixels` buffer. Planar +// 68k ports allocate a per-surface struct here describing the +// bitplane storage (Amiga: 4 separate plane buffers + stride; ST: one +// interleaved buffer + stride). 
Called by surfaceCreate / stageAlloc +// after pixels is allocated; freed by surfaceDestroy / stageFree +// before pixels is freed. `isStage` lets the port short-circuit for +// the stage if its planes are display-owned (e.g. Amiga's BitMap +// planes from OpenScreen) rather than allocated per surface. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage); +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData); -// Present a rectangular region of the source surface. The caller has -// already validated and clipped the rect to be fully inside the -// surface bounds and to have positive extents. -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h); +// Phase 3 planar dual-write: called from cross-platform fillRect AFTER +// the chunky shadow has been written, with the same already-clipped +// (x, y, w, h) and the raw color index 0..15. Planar ports update +// the bitplanes with the rect's bit pattern (per-plane bit value = +// (color >> plane) & 1). Chunky ports (DOS, IIgs) provide a no-op +// stub. Called unconditionally so cross-platform code doesn't have +// to know the port is planar; the per-port stub is the cheapest +// possible thing on chunky ports. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex); + +// Phase 3 planar dual-write for surfaceCopy: called from cross-platform +// surfaceCopy AFTER the chunky pixel buffer is memcpy'd. Planar ports +// also memcpy the bitplanes from src to dst so JOEYLIB_PLANAR_PRESENT +// builds see correct planes. dst and src are non-NULL and distinct +// (caller's no-op guards already passed). +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src); + +// Phase 5 planar dual-write for tile ops. Called from cross-platform +// tile.c AFTER the chunky path completes. (bx, by) are tile-grid +// coords (0..39 horiz, 0..24 vert; surface is 40x25 tiles). +// transparentIndex for tileCopyMasked: pixel value to skip. 
tilePaste +// reads from a packed 32-byte chunky TileT (4 bytes/row x 8 rows). +// All Amiga impls operate on the off-screen shadow planes via +// AmigaPlanarT; chunky-port stubs are no-ops. tileSnap is read-only +// so has no planar dual-write hook. +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex); +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy); +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex); +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile); + +// tileSnap: cross-platform code reads s->pixels chunky bytes into a +// 32-byte TileT. On planar ports (s->pixels NULL) the chunky read +// crashes -- this hook is the planar derivation: reads bitplane bits +// for the tile rect and assembles 32 chunky bytes (4 bytes/row x 8 +// rows) into chunkyTileOut. Chunky ports (s->pixels valid) implement +// this as a no-op since the cross-platform fallback already filled +// chunkyTileOut from s->pixels. +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut); + +// Phase 6 planar dual-write for spriteDraw. Called from cross-platform +// sprite.c AFTER spriteCompiledDraw or spriteDrawInterpreted has +// updated the chunky shadow. (x, y) is the destination top-left in +// surface pixels (may be partially off-surface; the hook does its own +// clipping). Walks the sprite's chunky tile data and updates dst +// surface planes for every non-transparent pixel (nibble != 0). +// Save/restore have NO planar dual-write yet -- after spriteSaveUnder +// + spriteDraw + spriteRestoreUnder under JOEYLIB_PLANAR_PRESENT, the +// planes still show the sprite (chunky restored, planes unchanged). 
+// Workable approach for that needs a parallel plane backup buffer; +// deferred until apps actually depend on PLANAR_PRESENT save/restore. +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y); + +// Phase 8 planar dual-write for asset blits and full surface loads. +// halBlitRectPlanes is called from surfaceBlit / surfaceBlitMasked +// AFTER the chunky path. transparent == 0xFFFF means opaque blit; any +// other value is a nibble (0..15) to skip. srcBytes is the asset's +// raw chunky pixel buffer; srcRowBytes is its stride. (x, y) is the +// already-clipped destination top-left in dst surface pixels; +// srcX0/srcY0 is where in the asset the visible region starts after +// clip; copyW/copyH is the visible region size in pixels. +// +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent); + +// Phase 9 sprite save/restore plane data. Chunky ports already hold +// pixel data in backup->bytes via the cross-platform memcpy. Planar +// ports (Amiga) DO have chunky NULL, so backup->bytes is unused by +// the chunky path -- we repurpose it to hold per-plane bytes. Layout: +// 4 plane stripes of (h * bytesPerPlaneRow) bytes each, where +// bytesPerPlaneRow = w/8 (sprite x and w are guaranteed 2-pixel +// aligned by spriteSaveUnder; planar requires further 8-pixel +// rounding -- see Amiga impl notes). Total bytes: +// 4 * h * w/8 = h * w/2 = same as chunky. backup->sizeBytes capacity +// works on both ports. Chunky-port impls are no-ops; Amiga writes / +// reads plane bytes via AmigaPlanarT. +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes); +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes); + +// Phase 9 reader hooks. 
Cross-platform code calls these instead of +// reading from s->pixels directly so it works regardless of whether +// the port stores chunky or planar as the source of truth. Chunky +// ports (DOS, IIgs) implement these reading from s->pixels (cheap); +// Amiga reads from the bitplanes in AmigaPlanarT. (x, y) bounds are +// already validated by the caller. +// +// halSamplePixel: returns the 0..15 nibble at (x, y). +// halSurfaceHash: returns the FNV-style hash of pixel + scb + palette +// that surfaceHash currently computes by walking s->pixels. Allows +// ports to use their native pixel storage instead. +// halSurfaceCopyChunky: cross-platform surfaceCopy used to memcpy +// s->pixels src->dst; on planar ports there is no chunky to copy +// (planes already covered by halSurfaceCopyPlanes). Chunky ports +// do the memcpy here; Amiga is a no-op. +// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread / +// fwrite of the pixel data. Chunky ports stream directly to/from +// s->pixels; Amiga uses a scratch buffer + c2p (load) or +// plane->chunky derivation (save). +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y); +uint32_t halSurfaceHash(const SurfaceT *s); +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src); +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp); +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp); + +// Present the dirty regions of the source surface to the display. +// The cross-platform stagePresent walks the dirty arrays before +// calling this; ports may use the dirty arrays themselves to skip +// untouched rows. +void halPresent(const SurfaceT *src); // Optional: returns a port-specific error message string for the last // HAL failure, or NULL if none. Ports may return NULL always. @@ -73,9 +190,23 @@ uint16_t halFrameHz(void); // Audio: per-port engine setup, module + SFX playback, teardown. // halAudioInit returns true if the platform has a working engine. 
-// All entry points are safe to call when init failed -- they become -// no-ops. See joey/audio.h for the public API contract that wraps -// these. +// Per-surface chunky pixel allocation. Chunky ports (DOS, IIgs, ST +// while still chunky) allocate SURFACE_PIXELS_SIZE bytes (calloc- +// style, zero-filled). Pure-planar Amiga returns NULL -- there's no +// chunky shadow; cross-platform code that previously read s->pixels +// goes through halSamplePixel / halSurfaceCopyChunky / etc. instead. +// halSurfaceFreePixels mirrors free(); NULL is a valid input on +// planar ports. +uint8_t *halSurfaceAllocPixels(void); +void halSurfaceFreePixels(uint8_t *pixels); + +// Get a pointer to the start of bitplane `planeIdx` (0..3) for surface +// `s`. Returns NULL on chunky ports (no planes). On Amiga returns +// pd->planes[planeIdx] from the AmigaPlanarT struct in portData. +// Used by the planar sprite codegen dispatcher to compute the 4 +// plane addresses to hand the emitted asm. +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx); + bool halAudioInit(void); void halAudioShutdown(void); void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop); @@ -185,6 +316,21 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut); +// Planar variants of halFastFloodWalk / halFastFloodScanRow. Take a +// SurfaceT* instead of a chunky-row pointer so they work on planar +// ports (Amiga post-Phase 9) where s->pixels is NULL. Same semantics; +// chunky ports return false (the chunky variants above are faster +// when a chunky row is available). Replace the per-pixel +// halSamplePixel walk on planar ports. 
+bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + bool *seedMatched, + int16_t *leftXOut, int16_t *rightXOut); + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + uint8_t *markBuf); + // surfaceBlit / surfaceBlitMasked rect-copy helper. Caller has done // the clip math: dstRow0 / srcRow0 point at row 0 of the source/dest // regions, dstX / srcX are intra-row pixel offsets, copyW/copyH are @@ -333,6 +479,12 @@ extern uint16_t gFloodRightX; #undef halFastFloodScanAndPush #define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false) +// IIgs is chunky; the planar flood hooks are never reachable. +#undef halFloodWalkPlanes +#define halFloodWalkPlanes(_s, _sx, _y, _mc, _nc, _me, _sm, _lx, _rx) (false) +#undef halFloodScanRowPlanes +#define halFloodScanRowPlanes(_s, _lx, _rx, _sy, _mc, _nc, _me, _mb) (false) + // Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX / // gFloodRightX; macro reads those into the caller's out-ptrs. #undef halFastFloodWalkAndScans diff --git a/src/core/present.c b/src/core/present.c index 1184f27..a550c66 100644 --- a/src/core/present.c +++ b/src/core/present.c @@ -2,8 +2,7 @@ // // stagePresent walks the per-row dirty bands set by drawing primitives // and asks the port HAL to flip just those rows to the display, then -// resets the dirty state. stagePresentRect bypasses dirty tracking -// entirely and slams a caller-specified rectangle (after clipping). +// resets the dirty state. 
#include @@ -25,48 +24,3 @@ void stagePresent(void) { halPresent(stage); stageDirtyClearAll(); } - - -void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h) { - SurfaceT *stage; - int16_t sx; - int16_t sy; - int16_t sw; - int16_t sh; - - stage = stageGet(); - if (stage == NULL) { - return; - } - - sx = x; - sy = y; - sw = (int16_t)w; - sh = (int16_t)h; - - if (sw <= 0 || sh <= 0) { - return; - } - if (sx < 0) { - sw += sx; - sx = 0; - } - if (sy < 0) { - sh += sy; - sy = 0; - } - if (sx >= SURFACE_WIDTH || sy >= SURFACE_HEIGHT) { - return; - } - if (sx + sw > SURFACE_WIDTH) { - sw = SURFACE_WIDTH - sx; - } - if (sy + sh > SURFACE_HEIGHT) { - sh = SURFACE_HEIGHT - sy; - } - if (sw <= 0 || sh <= 0) { - return; - } - - halPresentRect(stage, sx, sy, (uint16_t)sw, (uint16_t)sh); -} diff --git a/src/core/sprite.c b/src/core/sprite.c index 80bf036..177ca53 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -10,6 +10,7 @@ #include "joey/sprite.h" #include "codegenArenaInternal.h" +#include "hal.h" #include "spriteInternal.h" #include "surfaceInternal.h" @@ -22,6 +23,20 @@ // Color 0 is always transparent for sprites (DESIGN.md contract). #define TRANSPARENT_NIBBLE 0 +// On Amiga (post-Phase 9 / Phase 6 redux) the compiled sprite emitter +// writes directly to the bitplanes, so the halSpritePlanes hooks are +// pure duplicate work after a compiled call. On other ports the +// hooks are either no-op stubs (chunky-only IIgs/DOS) or the only +// thing writing planes (ST: chunky-shadow + planes). Slow / interpreter +// paths still need the hooks unconditionally on every platform -- the +// chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook +// is the only draw. 
+#if defined(JOEYLIB_PLATFORM_AMIGA) +#define COMPILED_SPRITE_WRITES_PLANES 1 +#else +#define COMPILED_SPRITE_WRITES_PLANES 0 +#endif + // ----- Prototypes ----- @@ -144,14 +159,20 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y return; } - for (row = 0; row < h; row++) { - dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; - for (col = 0; col < w; col++) { - nibble = srcNibble(sp, (int16_t)(sx + col), (int16_t)(sy + row)); - if (nibble == TRANSPARENT_NIBBLE) { - continue; + /* Skip the chunky write loop on planar ports (s->pixels == NULL). + * halSpriteDrawPlanes is called by the spriteDraw caller and does + * its own clip + plane write, so the dirty mark + planar update + * happen there. Phase 9 dropped the chunky shadow on Amiga. */ + if (s->pixels != NULL) { + for (row = 0; row < h; row++) { + dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; + for (col = 0; col < w; col++) { + nibble = srcNibble(sp, (int16_t)(sx + col), (int16_t)(sy + row)); + if (nibble == TRANSPARENT_NIBBLE) { + continue; + } + writeDstNibble(dstRow, (int16_t)(dx + col), nibble); } - writeDstNibble(dstRow, (int16_t)(dx + col), nibble); } } surfaceMarkDirtyRect(s, dx, dy, w, h); @@ -200,6 +221,13 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y, if (src == NULL || widthTiles == 0 || heightTiles == 0) { return NULL; } + /* Phase 9: planar ports have NULL src->pixels. Capturing a sprite + * from such a surface needs a planar-to-chunky derivation hook; + * not implemented yet, so refuse the call. Apps targeting Amiga + * should ship sprites as static tile data instead. */ + if (src->pixels == NULL) { + return NULL; + } // Source x/y must be on a tile boundary so each captured tile lands // on whole bytes -- mid-byte snapshots would lose half a pixel at // the left edge. @@ -284,10 +312,14 @@ void spriteDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y) { // need clip math (they walk fixed offsets). 
if (sp->slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { spriteCompiledDraw(s, sp, x, y); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteDrawPlanes(s, sp, x, y); + } surfaceMarkDirtyRect(s, x, y, (int16_t)widthPx, (int16_t)heightPx); return; } spriteDrawInterpreted(s, sp, x, y); + halSpriteDrawPlanes(s, sp, x, y); } @@ -332,7 +364,7 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac uint16_t saveIdx; uint16_t drawIdx; uint8_t *offsetsBase; - shift = (uint8_t)(x & 1); + shift = SPRITE_SHIFT_INDEX(x); saveIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); drawIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_DRAW); offsetsBase = (uint8_t *)sp->routineOffsets; @@ -340,6 +372,10 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac *(uint16_t *)(offsetsBase + (drawIdx << 1)) != SPRITE_NOT_COMPILED) { spriteCompiledSaveUnder(s, sp, x, y, backup); spriteCompiledDraw (s, sp, x, y); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes); + halSpriteDrawPlanes(s, sp, x, y); + } surfaceMarkDirtyRect (s, x, y, (int16_t)widthPx, (int16_t)heightPx); return; } @@ -630,13 +666,18 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1)); if (routeOffset != SPRITE_NOT_COMPILED) { spriteCompiledRestoreUnder(s, backup); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes); + } surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh); return; } } - /* Slow / interpreted memcpy fallback. */ - { + /* Slow / interpreted memcpy fallback. Skip the chunky memcpy if + * the port has no chunky shadow (Phase 9 Amiga: s->pixels NULL); + * halSpriteRestorePlanes below does the planar restore. 
*/ + if (s->pixels != NULL) { int16_t row; int16_t byteStart; uint8_t *dstRow; @@ -650,6 +691,7 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { (size_t)copyBytes); } } + halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes); surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh); } @@ -684,11 +726,14 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { uint16_t routeIdx; uint16_t routeOffset; - shift = (uint8_t)(x & 1); + shift = SPRITE_SHIFT_INDEX(x); routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1)); if (routeOffset != SPRITE_NOT_COMPILED) { spriteCompiledSaveUnder(s, sp, x, y, backup); + if (!COMPILED_SPRITE_WRITES_PLANES) { + halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes); + } return; } } @@ -744,11 +789,16 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit // backup with bytes==NULL. return; } - for (row = 0; row < h; row++) { - srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; - memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes], - &srcRow[byteStart], - (size_t)copyBytes); + /* Chunky save path: skip on planar ports (s->pixels NULL). + * halSpriteSavePlanes below covers the planar case. 
*/ + if (s->pixels != NULL) { + for (row = 0; row < h; row++) { + srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW]; + memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes], + &srcRow[byteStart], + (size_t)copyBytes); + } } + halSpriteSavePlanes(s, clippedX, dy, (uint16_t)clippedW, (uint16_t)h, backup->bytes); } /* end slow path */ } diff --git a/src/core/spriteInternal.h b/src/core/spriteInternal.h index 8e4733a..99a6bd5 100644 --- a/src/core/spriteInternal.h +++ b/src/core/spriteInternal.h @@ -13,6 +13,16 @@ #define SPRITE_OP_RESTORE 2 #define SPRITE_OP_COUNT 3 +// Per-platform shift index used by the dispatcher. Chunky 4bpp ports +// store one nibble per pixel pair so the only sub-byte alignment is +// x % 2. Amiga planar packs 8 pixels per plane byte so all 8 +// alignments matter. +#if defined(JOEYLIB_PLATFORM_AMIGA) +#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7)) +#else +#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1)) +#endif + // Sentinel stored in routineOffsets[shift][op] when that op's emitter // returned 0 bytes (i.e., the platform doesn't implement compiled // codegen for that op yet). 
Distinct from a real offset of 0, which diff --git a/src/core/surface.c b/src/core/surface.c index 486620f..229b5f0 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -65,9 +65,10 @@ void surfaceCopy(SurfaceT *dst, const SurfaceT *src) { if (dst == NULL || src == NULL || dst == src) { return; } - memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); + halSurfaceCopyChunky(dst, src); /* memcpy on chunky ports; no-op on planar */ memcpy(dst->scb, src->scb, sizeof(src->scb)); memcpy(dst->palette, src->palette, sizeof(src->palette)); + halSurfaceCopyPlanes(dst, src); /* 4 plane memcpys on planar ports; no-op on chunky */ surfaceMarkDirtyAll(dst); } @@ -79,11 +80,10 @@ SurfaceT *surfaceCreate(void) { if (s == NULL) { return NULL; } - s->pixels = (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); - if (s->pixels == NULL) { - free(s); - return NULL; - } + /* halSurfaceAllocPixels returns NULL on planar ports (Amiga); the + * primary storage is the port-allocated planes via portData below. */ + s->pixels = halSurfaceAllocPixels(); + s->portData = halSurfaceAllocPortData(s, false); paletteInitDefault(s); return s; } @@ -96,11 +96,44 @@ void surfaceDestroy(SurfaceT *s) { if (s == gStage) { return; } - free(s->pixels); + halSurfaceFreePortData(s, false, s->portData); + halSurfaceFreePixels(s->pixels); free(s); } +// Cheapest deterministic hash that still detects per-byte changes: +// (hash << 1) ^ byte, a single 16-bit accumulator. ORCA-C / 65816 +// compiles to ASL + EOR -- about 35 cyc per byte. A 32-bit multiply +// FNV-style hash takes ~200 cyc per byte via ~UMUL4, which adds +// 80+ seconds to a UBER run on IIgs. Discrimination is weaker than +// FNV but plenty for cross-port validation: we only need "did the +// same logical-pixel sequence produce the same hash?" -- not +// crypto-grade collision resistance over arbitrary inputs. 
+// +// Walks the chunky pixel buffer byte-by-byte, the same logical-pixel +// ordering on every chunky-format port (IIgs, DOS, Amiga and ST +// while still chunky). When the planar rewrite drops s->pixels on +// Amiga/ST this function will need a HAL hook (halSurfaceHash) to +// read planes natively while producing the same logical hash. +/* Cross-port FNV-style hash of pixels + SCB + palette. The hash logic + * (multiplier streams, byte ordering for palette) is identical across + * ports, but the pixel READS go through the port HAL so chunky ports + * walk s->pixels and planar ports walk plane bits and assemble nibble + * pairs into chunky bytes for the hash. Both produce the same logical- + * pixel hash because they hash the same logical pixel sequence in the + * same chunky byte order. SCB and palette are still hashed inline + * here because they live in the SurfaceT struct on every port (no + * port-specific storage) and the byte/value-with-explicit-byte-order + * walks are already endian-independent. */ +uint32_t surfaceHash(const SurfaceT *s) { + if (s == NULL) { + return 0u; + } + return halSurfaceHash(s); +} + + bool surfaceLoadFile(SurfaceT *dst, const char *path) { FILE *fp; long fileSize; @@ -125,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) { fclose(fp); return false; } - if (fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) { + if (!halSurfaceLoadFileChunky(dst, fp)) { fclose(fp); return false; } @@ -153,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) { if (fp == NULL) { return false; } - if (fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) { + if (!halSurfaceSaveFileChunky(src, fp)) { fclose(fp); return false; } @@ -228,13 +261,14 @@ bool stageAlloc(void) { if (gStage == NULL) { return false; } + /* halStageAllocPixels returns NULL on planar ports (Amiga) where + * the chunky shadow doesn't exist; the planes from portData are + * the source of truth. 
NULL pixels is no longer a failure. */ gStage->pixels = halStageAllocPixels(); - if (gStage->pixels == NULL) { - free(gStage); - gStage = NULL; - return false; + if (gStage->pixels != NULL) { + memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE); } - memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE); + gStage->portData = halSurfaceAllocPortData(gStage, true); stageDirtyClearAll(); paletteInitDefault(gStage); return true; @@ -255,6 +289,7 @@ void stageFree(void) { if (gStage == NULL) { return; } + halSurfaceFreePortData(gStage, true, gStage->portData); halStageFreePixels(gStage->pixels); free(gStage); gStage = NULL; diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h index 45017a5..c7743d9 100644 --- a/src/core/surfaceInternal.h +++ b/src/core/surfaceInternal.h @@ -14,8 +14,17 @@ // auto-mirroring to $E1). Caller-side `s->pixels[i]` syntax is // unchanged; only allocation/copy paths in surface.c shift to a // two-buffer model. +// +// portData is per-port opaque storage. On chunky ports (IIgs, DOS) it +// stays NULL -- pixels is the source of truth. On planar ports +// (Amiga, Atari ST) it points to a port-private struct describing the +// 4 bitplanes (Amiga: 4 separate plane buffers + stride; ST: single +// interleaved buffer + stride). Cross-platform code never touches it +// directly -- all primitive access goes through halFast* on planar +// ports. See project_planar_68k_plan.md for the full architecture. struct SurfaceT { uint8_t *pixels; + void *portData; uint8_t scb[SURFACE_HEIGHT]; uint16_t palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; }; @@ -38,6 +47,18 @@ struct SurfaceT { extern uint8_t gStageMinWord[SURFACE_HEIGHT]; extern uint8_t gStageMaxWord[SURFACE_HEIGHT]; +// Per-byte mixer for surfaceHash. Two-stream: lo *= 31 + b, hi *= 251 + b. +// Strength-reduced to shifts so ORCA-C doesn't emit `~UMUL2` (~150 cyc +// per call); 32 KB hashed twice -> ~5 minutes per UBER run. 
The +// shift form is 16-bit-equivalent (mod 2^16) so hash values are +// identical to the original `* 31u` / `* 251u` form. +// lo *= 31 == (lo << 5) - lo +// hi *= 251 == (hi << 8) - (hi << 2) - hi +#define SURFACE_HASH_MIX_BYTE(lo_, hi_, b_) do { \ + (lo_) = (uint16_t)(((((lo_) << 5) - (lo_)) + (b_))); \ + (hi_) = (uint16_t)((((hi_) << 8) - ((hi_) << 2) - (hi_)) + (b_)); \ +} while (0) + // Stage SCB / palette dirty flags. scbSet* and paletteSet set them // true when the stage's data is modified; the per-port present code // checks the flags and clears after upload. Replaces a per-frame @@ -50,6 +71,15 @@ extern bool gStagePaletteDirty; // bands are widened to cover the rect. If `s` is any other surface, // the call is a no-op -- non-stage surfaces never get presented, so // they don't carry dirty state. +// +// Planar ports rely on the chunky shadow + c2p path through Phase 8. +// Planar-native primitives (Phases 3+) dual-write: they update both +// the chunky pixels and the bitplanes in the same call, so c2p at +// present time always derives correct planes from up-to-date chunky. +// Phase 9 deletes the chunky shadow + c2p; only at that point will +// per-row planar-vs-chunky tracking even be a possible question, and +// the plan is to avoid it entirely there too (planes become the only +// source of truth). 
void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h); // Shorthand for "every row, full width" -- used by surfaceClear and diff --git a/src/core/tile.c b/src/core/tile.c index e451425..d84b585 100644 --- a/src/core/tile.c +++ b/src/core/tile.c @@ -147,6 +147,7 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, if (!halFastTileCopy(dstRow0, srcRow0)) { copyTileOpaque(dstRow0, srcRow0); } + halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -178,6 +179,7 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { copyTileMasked(dstRow0, srcRow0, transparentIndex); } + halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex); surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -209,6 +211,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { row += SURFACE_BYTES_PER_ROW; } } + halTileFillPlanes(s, bx, by, colorIndex); surfaceMarkDirtyRect(s, (int16_t)pixelX, (int16_t)pixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -241,6 +244,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) { src += TILE_BYTES_PER_ROW; } } + halTilePastePlanes(dst, bx, by, &in->pixels[0]); surfaceMarkDirtyRect(dst, (int16_t)pixelX, (int16_t)pixelY, TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); } @@ -261,9 +265,12 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) { } pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); - srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; dst = &out->pixels[0]; - if (!halFastTileSnap(dst, srcRow)) { + /* On planar ports (s->pixels NULL) the chunky 
read path is + * skipped; halTileSnapPlanes below derives the tile bytes from + * the bitplanes. */ + if (src->pixels != NULL && !halFastTileSnap(dst, &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)])) { + srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { dst[0] = srcRow[0]; dst[1] = srcRow[1]; @@ -273,4 +280,5 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) { dst += TILE_BYTES_PER_ROW; } } + halTileSnapPlanes(src, bx, by, &out->pixels[0]); } diff --git a/src/port/amiga/circle.s b/src/port/amiga/circle.s new file mode 100644 index 0000000..e36503c --- /dev/null +++ b/src/port/amiga/circle.s @@ -0,0 +1,270 @@ +| Amiga planar circle outline V4 -- 16-way color-specialized. +| +| Per Bresenham iter: +| 1. Precompute 4 xp records (xp_byte_w + bitMask_b + notMask_b) for +| cx +/- bx and cx +/- by, stored at sp+0..15 (4 records x 4 bytes). +| 2. Precompute 4 yp40 words for cy +/- by and cy +/- bx, stored at +| sp+16..23 (4 words x 2 bytes). +| 3. Plot 8 octant pixels with hardcoded color: each pixel does 4 +| branchless plane RMW ops (or.b for set bits, and.b for clear +| bits) -- no btst, no per-plane branch. +| 4. Bresenham step. +| +| At function entry the color is masked to 4 bits and used as the index +| into a 16-entry jump table that selects the matching main loop. +| Each main loop has the color hardcoded into the per-plane RMW ops. +| +| The branchless plot saves ~20-28 cyc per plane vs V3's btst+branch +| pattern -- ~640-900 cyc per Bresenham iter. +| +| ABI: cdecl. d2-d7/a2-a6 callee-save. 
+| +| void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1, +| uint8_t *p2, uint8_t *p3, +| uint16_t cx, uint16_t cy, +| uint16_t r, uint8_t color); +| +| Register allocation across the iter loop: +| d2.w = bx (Bresenham) +| d3.w = by (Bresenham) +| d4.w = err (Bresenham) +| d5.w = cx (cached) +| a4 = cy (cached, sign-extended) +| a0..a3 = plane bases +| a5 = bitMaskLut +| d0,d1,d6,d7 = scratch in precompute / plot +| +| Scratch block (24 bytes) at sp+0..23: +| sp+0..3: xp1 record [xp_byte_w, bitMask_b, notMask_b] for cx+bx +| sp+4..7: xp2 record for cx-bx +| sp+8..11: xp3 record for cx+by +| sp+12..15: xp4 record for cx-by +| sp+16..17: yp1 word (cy+by) * 40 +| sp+18..19: yp2 word (cy-by) * 40 +| sp+20..21: yp3 word (cy+bx) * 40 +| sp+22..23: yp4 word (cy-bx) * 40 + + .text + + +| ---- XP_REC: build xp record at sp+slot for xp = cx ---- +| signOp: add or sub +| xreg: %d2 (bx) or %d3 (by) +| slot: 0, 4, 8, or 12 +| Trashes: d0, d1, d6, d7 + + .macro XP_REC slot, signOp, xreg + move.w %d5,%d6 + \signOp\().w \xreg,%d6 | d6 = xp + move.w %d6,%d7 + lsr.w #3,%d7 | d7 = xp >> 3 (xp_byte) + and.w #7,%d6 | d6 = xp & 7 + move.b (%a5,%d6.w),%d6 | d6 = bitMask + move.b %d6,%d1 + not.b %d1 | d1 = notMask + move.w %d7,\slot(%sp) | xp_byte word + move.b %d6,\slot+2(%sp) | bitMask byte + move.b %d1,\slot+3(%sp) | notMask byte + .endm + + +| ---- YP_REC: build yp40 word at sp+slot for yp = cy ---- + + .macro YP_REC slot, signOp, yreg + move.l %a4,%d6 + \signOp\().w \yreg,%d6 | d6.w = yp + move.w %d6,%d0 + lsl.w #3,%d6 | d6 = yp << 3 + lsl.w #5,%d0 | d0 = yp << 5 + add.w %d6,%d0 | d0 = yp * 40 + move.w %d0,\slot(%sp) + .endm + + +| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ---- +| slotYp: 16, 18, 20, or 22 (yp40 word slot) +| slotXp: 0, 4, 8, or 12 (xp record slot) +| color: literal 0..15 +| Trashes: d0, d1, d7 + + .macro PLOT_FIXED slotYp, slotXp, color + move.w \slotYp(%sp),%d0 | d0 = yp40 + add.w \slotXp(%sp),%d0 | d0 += xp_byte + move.b \slotXp+2(%sp),%d1 
| d1.b = bitMask + move.b \slotXp+3(%sp),%d7 | d7.b = notMask + .if ((\color) & 1) + or.b %d1,(%a0,%d0.w) + .else + and.b %d7,(%a0,%d0.w) + .endif + .if ((\color) & 2) + or.b %d1,(%a1,%d0.w) + .else + and.b %d7,(%a1,%d0.w) + .endif + .if ((\color) & 4) + or.b %d1,(%a2,%d0.w) + .else + and.b %d7,(%a2,%d0.w) + .endif + .if ((\color) & 8) + or.b %d1,(%a3,%d0.w) + .else + and.b %d7,(%a3,%d0.w) + .endif + .endm + + +| ---- PLOT_8: plot all 8 octant pixels for a given hardcoded color ---- + + .macro PLOT_8 color + PLOT_FIXED 16, 0, \color | (cx+bx, cy+by) + PLOT_FIXED 16, 4, \color | (cx-bx, cy+by) + PLOT_FIXED 18, 0, \color | (cx+bx, cy-by) + PLOT_FIXED 18, 4, \color | (cx-bx, cy-by) + PLOT_FIXED 20, 8, \color | (cx+by, cy+bx) + PLOT_FIXED 20, 12, \color | (cx-by, cy+bx) + PLOT_FIXED 22, 8, \color | (cx+by, cy-bx) + PLOT_FIXED 22, 12, \color | (cx-by, cy-bx) + .endm + + +| ---- CO_BODY: full Bresenham loop body for a hardcoded color ---- +| Generates the per-iter precompute, branchless plot, and Bresenham +| step. Uses unique labels via \color suffix. 
+ + .macro CO_BODY color + XP_REC 0, add, %d2 | xp1 = cx+bx + XP_REC 4, sub, %d2 | xp2 = cx-bx + XP_REC 8, add, %d3 | xp3 = cx+by + XP_REC 12, sub, %d3 | xp4 = cx-by + YP_REC 16, add, %d3 | yp1 = cy+by + YP_REC 18, sub, %d3 | yp2 = cy-by + YP_REC 20, add, %d2 | yp3 = cy+bx + YP_REC 22, sub, %d2 | yp4 = cy-bx + + PLOT_8 \color + + addq.w #1,%d3 + tst.w %d4 + bgt .LcoDecX_\color + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .LcoLoop_\color +.LcoDecX_\color: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .LcoLoop_\color + .endm + + +| ---- CO_LOOP_HDR: emit a labelled loop header for a color ---- + + .macro CO_LOOP_HDR color +.LcoLoop_\color: + cmp.w %d3,%d2 + bcs.w .LcoDone + CO_BODY \color + .endm + + +| ---- Function entry ---- + + .equ SP_SAVED, 44 + .equ SP_LOCAL, 24 + + .equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL) + + .equ SP_P0, SP_OFF + 0 + .equ SP_P1, SP_OFF + 4 + .equ SP_P2, SP_OFF + 8 + .equ SP_P3, SP_OFF + 12 + .equ SP_CX, SP_OFF + 16 + 2 + .equ SP_CY, SP_OFF + 20 + 2 + .equ SP_R, SP_OFF + 24 + 2 + .equ SP_COLOR, SP_OFF + 28 + 3 + + .globl _surface68kAmigaCircleOutline + +_surface68kAmigaCircleOutline: + movem.l %d2-%d7/%a2-%a6,-(%sp) + lea -SP_LOCAL(%sp),%sp + + | Plane bases. + move.l SP_P0(%sp),%a0 + move.l SP_P1(%sp),%a1 + move.l SP_P2(%sp),%a2 + move.l SP_P3(%sp),%a3 + lea bitMaskLut(%pc),%a5 + + | Cache cx in d5, cy (sign-extended) in a4. + move.w SP_CX(%sp),%d5 + move.w SP_CY(%sp),%d6 + ext.l %d6 + movea.l %d6,%a4 + + | Bresenham init. + move.w SP_R(%sp),%d2 | bx = r + moveq #0,%d3 | by = 0 + moveq #1,%d4 + sub.w %d2,%d4 | err = 1 - bx + + | Dispatch on color (low 4 bits) -> one of 16 main loops. + | Each table entry is a bra.w (4 bytes), so index *= 4. 
+ moveq #0,%d6 + move.b SP_COLOR(%sp),%d6 + and.w #0x0F,%d6 + add.w %d6,%d6 + add.w %d6,%d6 + lea .LcoTable(%pc),%a6 + jmp 0(%a6,%d6.w) + +.LcoTable: + bra.w .LcoLoop_0 + bra.w .LcoLoop_1 + bra.w .LcoLoop_2 + bra.w .LcoLoop_3 + bra.w .LcoLoop_4 + bra.w .LcoLoop_5 + bra.w .LcoLoop_6 + bra.w .LcoLoop_7 + bra.w .LcoLoop_8 + bra.w .LcoLoop_9 + bra.w .LcoLoop_10 + bra.w .LcoLoop_11 + bra.w .LcoLoop_12 + bra.w .LcoLoop_13 + bra.w .LcoLoop_14 + bra.w .LcoLoop_15 + + CO_LOOP_HDR 0 + CO_LOOP_HDR 1 + CO_LOOP_HDR 2 + CO_LOOP_HDR 3 + CO_LOOP_HDR 4 + CO_LOOP_HDR 5 + CO_LOOP_HDR 6 + CO_LOOP_HDR 7 + CO_LOOP_HDR 8 + CO_LOOP_HDR 9 + CO_LOOP_HDR 10 + CO_LOOP_HDR 11 + CO_LOOP_HDR 12 + CO_LOOP_HDR 13 + CO_LOOP_HDR 14 + CO_LOOP_HDR 15 + +.LcoDone: + lea SP_LOCAL(%sp),%sp + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + + + .align 2 +bitMaskLut: + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index fe6997b..c51b5a9 100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -23,8 +23,11 @@ #include #include +#include "joey/debug.h" + #include #include +#include #include #include #include @@ -43,6 +46,7 @@ #include #include "hal.h" +#include "spriteInternal.h" #include "surfaceInternal.h" #include "draw68k_inline.h" @@ -59,11 +63,31 @@ static void removeVblServer(void); #define AMIGA_BITPLANES 4 #define AMIGA_BYTES_PER_ROW 40 +#define AMIGA_PLANE_SIZE (AMIGA_BYTES_PER_ROW * SURFACE_HEIGHT) + + +// ----- Per-surface planar storage (project_planar_68k_plan) ----- +// +// Phase 1 carved out the SurfaceT.portData hook. This struct is what +// it points to on Amiga: the 4 plane base pointers + stride. For the +// stage, planes[i] aliases gPlanes[i] (Intuition-allocated, already +// in chip RAM, already on display). For non-stage surfaces planes[i] +// gets its own AllocMem(MEMF_CHIP) so the blitter can reach it. 
+// +// Direct fields rather than a union because the consumer is asm / +// inline C that wants minimal indirection in the inner loop. +typedef struct { + uint8_t *planes[AMIGA_BITPLANES]; + uint16_t bytesPerRow; // = AMIGA_BYTES_PER_ROW (40) + uint16_t bytesPerPlane; // = AMIGA_PLANE_SIZE (8000) + bool ownsPlanes; // true = AllocMem'd (free at destroy); + // false = aliased to gPlanes (don't free) +} AmigaPlanarT; // ----- Prototypes ----- +static void amigaPlanarSetPixel(AmigaPlanarT *pd, int16_t x, int16_t y, uint8_t color); static void buildCopperList(const SurfaceT *src); -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t byteStart, uint16_t byteEnd); static void dumpCopperList(void); static void installCopperList(void); static void uploadFirstBandPalette(const SurfaceT *src); @@ -83,8 +107,12 @@ static struct UCopList *gNewUCL = NULL; // built but not yet installed // demo after the initial paint) leave both alone. MrgCop + LoadView + // WaitTOF is hundreds of milliseconds on a 7 MHz 68000, so skipping // them on clean frames is a major win. -static uint8_t gCachedScb [SURFACE_HEIGHT]; -static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; +/* Long-aligned so memcmpLongs (uint32_t pointer compare) won't + * address-error on 68000. SurfaceT.scb/palette are at long-aligned + * offsets (8 and 208), so the source side is safe; these cached + * mirrors need the same property. */ +static uint8_t gCachedScb [SURFACE_HEIGHT] __attribute__((aligned(4))); +static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4))); static bool gCacheValid = false; // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow @@ -144,42 +172,10 @@ static void initC2pLut(void) { } -// Convert a range of chunky scanlines [y0, y1) to Amiga planar over -// planar-byte columns [byteStart, byteEnd). 
Per row the work is dropped -// into chunkyToPlanarRow (src/port/amiga/c2p.s) which is ~5x faster -// than the old per-pixel C inner loop GCC emits for m68k. -// -// Each planar byte corresponds to 8 horizontal pixels = 4 source bytes -// at 4bpp packed; partial-rect callers should round byteStart down and -// byteEnd up to keep the 8-pixel alignment. -static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t byteStart, uint16_t byteEnd) { - const uint8_t *srcLine; - UBYTE *p0; - UBYTE *p1; - UBYTE *p2; - UBYTE *p3; - int16_t y; - uint16_t numBytes; - - if (byteStart >= byteEnd) { - return; - } - if (!gC2pLutReady) { - initC2pLut(); - } - numBytes = (uint16_t)(byteEnd - byteStart); - - for (y = y0; y < y1; y++) { - // 4 source bytes per planar byte: source-byte offset = - // byteStart * 4 within the chunky row. - srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW + byteStart * 4]; - p0 = &gPlanes[0][y * AMIGA_BYTES_PER_ROW + byteStart]; - p1 = &gPlanes[1][y * AMIGA_BYTES_PER_ROW + byteStart]; - p2 = &gPlanes[2][y * AMIGA_BYTES_PER_ROW + byteStart]; - p3 = &gPlanes[3][y * AMIGA_BYTES_PER_ROW + byteStart]; - chunkyToPlanarRow(srcLine, p0, p1, p2, p3, numBytes, gC2pLut); - } -} +// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own +// per-row chunkyToPlanarRow loop -- the only code path that still +// converts chunky to planar today, since asset loading is the only +// surface mutation that doesn't go through a planar-aware primitive.) // Build a user copper list for per-scanline palette (SCB emulation). @@ -360,16 +356,39 @@ static void dumpCopperList(void) { } +/* Long-aligned compare. SCB is 200 bytes, palette is 16*16*2 = 512 + * bytes; both gCached* are statically aligned and src->scb/palette + * sit at long-aligned offsets in SurfaceT. 
libnix memcmp walks + * byte-by-byte (~10 cyc/byte = ~7 ms for 712 bytes); a long-pointer + * inline compare drops that to ~2 ms, which dominates per-call + * overhead for tight stagePresentRect loops where there's nothing + * to actually present. Returns nonzero on first mismatch. */ +static bool memcmpLongs(const void *a, const void *b, uint16_t bytes) { + const uint32_t *pa = (const uint32_t *)a; + const uint32_t *pb = (const uint32_t *)b; + uint16_t n = (uint16_t)(bytes >> 2); + while (n > 0u) { + if (*pa != *pb) { + return true; + } + pa++; + pb++; + n--; + } + return false; +} + + // Returns true if the SCB table or palette RGB values differ from the // last presented frame, or if no frame has been presented yet. static bool paletteOrScbChanged(const SurfaceT *src) { if (!gCacheValid) { return true; } - if (memcmp(gCachedScb, src->scb, sizeof(gCachedScb)) != 0) { + if (memcmpLongs(gCachedScb, src->scb, (uint16_t)sizeof(gCachedScb))) { return true; } - if (memcmp(gCachedPalette, src->palette, sizeof(gCachedPalette)) != 0) { + if (memcmpLongs(gCachedPalette, src->palette, (uint16_t)sizeof(gCachedPalette))) { return true; } return false; @@ -380,10 +399,14 @@ static bool paletteOrScbChanged(const SurfaceT *src) { // state visible to the display differs from what the surface carries // now. On clean frames we skip the AllocMem + MrgCop + LoadView + // WaitTOF chain entirely. +static uint32_t gCopperRebuildCount = 0; +static uint32_t gPresentCallCount = 0; + static void updateCopperIfNeeded(const SurfaceT *src) { if (!paletteOrScbChanged(src)) { return; } + gCopperRebuildCount++; uploadFirstBandPalette(src); buildCopperList(src); installCopperList(); @@ -393,6 +416,15 @@ static void updateCopperIfNeeded(const SurfaceT *src) { } +/* Diag hook: callable from anywhere via the linker symbol. */ +void amigaDumpPresentCounters(const char *label) { + joeyLogF("amiga-perf: %s: present=%lu copperRebuild=%lu", + label != NULL ? 
label : "?", + (unsigned long)gPresentCallCount, + (unsigned long)gCopperRebuildCount); +} + + // Load the first band's palette into the screen's ColorMap so the // Intuition-generated frame-start copper writes those values on each // frame. This acts as a safety net: even if our user copper list does @@ -419,11 +451,50 @@ static void uploadFirstBandPalette(const SurfaceT *src) { bool halInit(const JoeyConfigT *config) { uint16_t i; + uint16_t j; (void)config; + // Allocate our own BitMap with explicitly-non-interleaved planes, + // then hand it to OpenScreen via SA_BitMap. Why not let Intuition + // build one for us: + // * AllocBitMap is V39+ and we target OCS / Kickstart 1.3 + // (V34), so we have to do this manually with InitBitMap + + // per-plane AllocMem. + // * Without SA_BitMap, OpenScreen on AmigaOS 2.x+ / AROS may + // return a BitMap with interleaved planes (single allocation, + // stride = depth * bytes_per_row). Our c2pRange and every + // planar primitive assume stride = bytes_per_row per plane; + // interleaved layout would silently corrupt with no + // diagnostic. + // * Forcing the layout also locks in chip-RAM placement (display + // DMA can only fetch from chip RAM) regardless of host + // defaults. + gBitMap = (struct BitMap *)AllocMem((ULONG)sizeof(struct BitMap), + (ULONG)(MEMF_PUBLIC | MEMF_CLEAR)); + if (gBitMap == NULL) { + return false; + } + InitBitMap(gBitMap, (LONG)AMIGA_BITPLANES, (LONG)SURFACE_WIDTH, (LONG)SURFACE_HEIGHT); + for (i = 0; i < AMIGA_BITPLANES; i++) { + gPlanes[i] = (UBYTE *)AllocMem((ULONG)AMIGA_PLANE_SIZE, + (ULONG)(MEMF_CHIP | MEMF_CLEAR)); + if (gPlanes[i] == NULL) { + for (j = 0; j < i; j++) { + FreeMem(gPlanes[j], (ULONG)AMIGA_PLANE_SIZE); + gPlanes[j] = NULL; + } + FreeMem(gBitMap, (ULONG)sizeof(struct BitMap)); + gBitMap = NULL; + return false; + } + gBitMap->Planes[i] = gPlanes[i]; + } + // SA_DisplayID pins us to OCS PAL low-res so Intuition opens a - // real planar screen rather than an RTG substitute. 
+ // real planar screen rather than an RTG substitute. SA_BitMap + // makes Intuition use OUR pre-allocated planes; CloseScreen will + // not free them -- our halShutdown does. gScreen = OpenScreenTags(NULL, (ULONG)SA_Width, (ULONG)SURFACE_WIDTH, (ULONG)SA_Height, (ULONG)SURFACE_HEIGHT, @@ -434,19 +505,19 @@ bool halInit(const JoeyConfigT *config) { (ULONG)SA_Title, (ULONG)"JoeyLib", (ULONG)SA_Type, (ULONG)CUSTOMSCREEN, (ULONG)SA_Quiet, (ULONG)TRUE, + (ULONG)SA_BitMap, (ULONG)gBitMap, TAG_DONE); if (gScreen == NULL) { - return false; - } - gBitMap = gScreen->RastPort.BitMap; - for (i = 0; i < AMIGA_BITPLANES; i++) { - gPlanes[i] = gBitMap->Planes[i]; - if (gPlanes[i] == NULL) { - CloseScreen(gScreen); - gScreen = NULL; - return false; + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (gPlanes[i] != NULL) { + FreeMem(gPlanes[i], (ULONG)AMIGA_PLANE_SIZE); + gPlanes[i] = NULL; + } } + FreeMem(gBitMap, (ULONG)sizeof(struct BitMap)); + gBitMap = NULL; + return false; } // Force COLOR00 to black so the overscan/border region around the // 320x200 display is black until the app's palette load takes over @@ -464,51 +535,128 @@ const char *halLastError(void) { } -void halPresent(const SurfaceT *src) { - int16_t y; - uint8_t minWord; - uint8_t maxWord; - uint16_t byteStart; - uint16_t byteEnd; +// Phase 9 switch flip: present is now always a per-row planar memcpy +// from the off-screen shadow planes (where every drawing primitive +// dual-writes today) into gPlanes[] (the displayed BitMap). c2p is +// gone; the chunky `s->pixels` shadow is still maintained by the +// halFast* primitives but no longer drives display. Phase 10 will +// either (a) BPLPTR-swap shadow <-> display planes (zero-copy) or +// (b) stop writing chunky in the fast paths to recover the dual- +// write cost. Per-row dirty tracking is reused: only dirty bands +// memcpy. +/* Helper: copy a rect (firstRow..lastRow inclusive) from each shadow + * plane into the displayed plane. 
One bounding-box copy per plane
 * covers the whole dirty band.  (NOTE(review): an earlier revision
 * used exec CopyMemQuick here; the FAT path below now calls memcpy,
 * which tolerates any alignment.)  Long alignment is still required
 * regardless: the THIN path stores raw uint32_t through casted
 * pointers, and a 68000 address-errors on a misaligned long access.
 * byteStart rounds DOWN to long boundary; copyLen rounds UP past the
 * right edge.  Over-copies up to 3 bytes per side, which stays
 * inside the plane buffer (AMIGA_PLANE_SIZE = 8000). */
+static void amigaPresentRectInner(AmigaPlanarT *pd, int16_t firstRow, int16_t lastRow,
+                                  uint16_t byteStart, uint16_t bytesPerRow) {
+    uint16_t alignedStart;
+    uint16_t alignedEnd;
+    uint16_t alignedBytesPerRow;
+    uint16_t offset;
+    uint16_t copyLen;
+    uint16_t i;
+    int16_t  row;
+
+    alignedStart       = (uint16_t)(byteStart & ~3u);
+    alignedEnd         = (uint16_t)((byteStart + bytesPerRow + 3u) & ~3u);
+    alignedBytesPerRow = (uint16_t)(alignedEnd - alignedStart);
+
+    /* THIN path: rect is narrow enough that the bounding-box memcpy
+     * would over-copy.  e.g. an 8x8 rect at x=40 has 32 bytes of
+     * actual data vs 284 bytes of bounding box (9x over-copy).  For
+     * narrow rects, per-row inline long-copy beats memcpy because
+     * memcpy has function-call dispatch per call AND we only need a
+     * couple of long stores per row.  Threshold tuned so
+     * stagePresentRect 8b lands here, stagePresentRect F doesn't. */
+    if (alignedBytesPerRow <= (AMIGA_BYTES_PER_ROW / 2u)) {
+        offset = (uint16_t)((uint16_t)firstRow * AMIGA_BYTES_PER_ROW + alignedStart);
+        for (i = 0; i < AMIGA_BITPLANES; i++) {
+            uint8_t *src = pd->planes[i] + offset;
+            uint8_t *dst = gPlanes[i] + offset;
+            for (row = firstRow; row <= lastRow; row++) {
+                uint32_t       *d32 = (uint32_t *)dst;
+                const uint32_t *s32 = (const uint32_t *)src;
+                uint16_t        n   = (uint16_t)(alignedBytesPerRow >> 2);
+                do {
+                    *d32++ = *s32++;
+                } while (--n);
+                src += AMIGA_BYTES_PER_ROW;
+                dst += AMIGA_BYTES_PER_ROW;
+            }
+        }
+        return;
+    }
+
+    /* FAT path: bounding-box memcpy.
Single libc call per plane; + * over-copies the L/R margins of inner rows but that overhead + * amortizes when the rect spans a large fraction of the row. */ + offset = (uint16_t)((uint16_t)firstRow * AMIGA_BYTES_PER_ROW + alignedStart); + copyLen = (uint16_t)((uint16_t)(lastRow - firstRow) * AMIGA_BYTES_PER_ROW + + alignedBytesPerRow); + for (i = 0; i < AMIGA_BITPLANES; i++) { + memcpy(gPlanes[i] + offset, pd->planes[i] + offset, copyLen); + } +} + + +void halPresent(const SurfaceT *src) { + AmigaPlanarT *pd; + int16_t y; + int16_t firstRow; + int16_t lastRow; + uint8_t minWord; + uint8_t maxWord; + uint8_t unionMinWord; + uint8_t unionMaxWord; + uint16_t byteStart; + uint16_t bytesPerRow; + + gPresentCallCount++; if (src == NULL || gScreen == NULL) { return; } updateCopperIfNeeded(src); - // Walk per-row dirty bands: each planar byte covers 8 px = 2 chunky - // words, so byteStart = minWord/2 and byteEnd = maxWord/2 + 1 - // converts dirty-word units to the planar-byte units c2pRange wants. + pd = (AmigaPlanarT *)src->portData; + if (pd == NULL) { + return; + } + + /* Reduce the per-row dirty bands to a bounding box. */ + firstRow = -1; + lastRow = -1; + unionMinWord = 0xFFu; + unionMaxWord = 0u; for (y = 0; y < SURFACE_HEIGHT; y++) { minWord = gStageMinWord[y]; maxWord = gStageMaxWord[y]; if (minWord > maxWord) { continue; } - byteStart = (uint16_t)(minWord >> 1); - byteEnd = (uint16_t)((maxWord >> 1) + 1); - c2pRange(src, y, (int16_t)(y + 1), byteStart, byteEnd); + if (firstRow < 0) { + firstRow = y; + } + lastRow = y; + if (minWord < unionMinWord) { + unionMinWord = minWord; + } + if (maxWord > unionMaxWord) { + unionMaxWord = maxWord; + } } -} - - -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - uint16_t byteStart; - uint16_t byteEnd; - - if (src == NULL || gScreen == NULL) { + if (firstRow < 0) { return; } - updateCopperIfNeeded(src); - // Each planar byte covers 8 horizontal pixels. 
Round dirty pixel - // range to the enclosing planar-byte range so we never miss an - // edge pixel while still honoring the rect width. - byteStart = (uint16_t)(x >> 3); - byteEnd = (uint16_t)(((uint16_t)x + w + 7) >> 3); - if (byteEnd > AMIGA_BYTES_PER_ROW) { - byteEnd = AMIGA_BYTES_PER_ROW; - } - c2pRange(src, y, y + (int16_t)h, byteStart, byteEnd); + + /* Each planar byte covers 8 px = 2 chunky words. */ + byteStart = (uint16_t)(unionMinWord >> 1); + bytesPerRow = (uint16_t)(((unionMaxWord >> 1) + 1u) - byteStart); + amigaPresentRectInner(pd, firstRow, lastRow, byteStart, bytesPerRow); } @@ -581,6 +729,9 @@ uint16_t halFrameHz(void) { void halShutdown(void) { + uint16_t i; + + amigaDumpPresentCounters("halShutdown"); // Tear down the VBL server before closing the screen so the // interrupt chain is clean if anything else is watching. removeVblServer(); @@ -594,6 +745,19 @@ void halShutdown(void) { Permit(); CloseScreen(gScreen); gScreen = NULL; + } + // We allocated the BitMap and its planes manually (see halInit) + // so SA_BitMap could pin Intuition to non-interleaved layout. + // CloseScreen with an SA_BitMap'd screen does NOT free our + // BitMap or planes -- we own them and must clean up here. 
+    if (gBitMap != NULL) {
+        for (i = 0; i < AMIGA_BITPLANES; i++) {
+            if (gPlanes[i] != NULL) {
+                FreeMem(gPlanes[i], (ULONG)AMIGA_PLANE_SIZE);
+                gPlanes[i] = NULL;
+            }
+        }
+        FreeMem(gBitMap, (ULONG)sizeof(struct BitMap));
+        gBitMap = NULL;
+    }
     if (gNewUCL != NULL) {
@@ -609,58 +773,795 @@
 extern void surface68kClearLong(uint8_t *pixels, uint16_t fillByte);
 extern void surface68kFillRectFull(uint8_t *pixels, int16_t y, uint16_t h, uint16_t fillByte);
 extern void surface68kFillRectByteAligned(uint8_t *rowFirst, uint16_t midBytes, uint16_t h, uint16_t fillByte);
+extern void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
+                                      uint16_t numMid, uint8_t leftMask, uint8_t rightMask,
+                                      uint8_t fb0, uint8_t fb1, uint8_t fb2, uint8_t fb3);
+extern void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
+                                         uint16_t cx, uint16_t cy, uint16_t r, uint8_t color);
+// (Phase 3's planar dual-write -- bitplanes updated alongside the
+// chunky shadow -- is superseded by the Phase 9 model below.)
+// Phase 9: Amiga is pure planar.  Every halFast* below returns true
+// to suppress the cross-platform chunky fallback path -- there is no
+// chunky shadow on Amiga (s->pixels is NULL post-Phase 9).  The actual
+// planar work happens in dedicated halXxxPlanes hooks called by
+// cross-platform code AFTER each halFast*.
+//
+// halFastSurfaceClear inlines the planar fill (no separate hook).
+// All other halFast* are short-circuit stubs that return true.
 bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
+    AmigaPlanarT *pd;
+    uint8_t       colorIndex;
+    uint16_t      i;
+    uint8_t       planeByte;
+
     if (s != stageGet()) {
         return false;
     }
-    surface68kClearLong(s->pixels, (uint16_t)doubled);
+    colorIndex = (uint8_t)(doubled & 0x0Fu);
+    pd = (AmigaPlanarT *)s->portData;
+    if (pd != NULL) {
+        for (i = 0; i < AMIGA_BITPLANES; i++) {
+            planeByte = ((colorIndex >> i) & 1u) ?
0xFFu : 0x00u; + memset(pd->planes[i], planeByte, AMIGA_PLANE_SIZE); + } + } return true; } -// Fast path bands: -// - x == 0 && w == SURFACE_WIDTH (full row): one move.l-stream per -// row via surface68kFillRectFull. No nibble fixups needed -- both -// nibbles in every byte get the same value, and rowFirst is the -// surface base which is always word-aligned by calloc. -// - x % 4 == 0 && w even (byte-aligned AND word-aligned): inner -// bytes via the asm. The (x % 4 == 0) part is the 68000 alignment -// requirement for the move.l writes inside the asm -- byte index -// = x/2, so x must be a multiple of 4 for the byte index to be -// even. -// - everything else: fall through to C's fillRectClipped, which -// does per-byte writes (no alignment needed) and handles the -// leading / trailing nibble RMW correctly. bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - uint8_t doubled; - + /* Pure short-circuit: halFillRectPlanes (called by cross-platform + * fillRect after this) does the actual planar fill with full + * partial-byte mask handling. We just claim ownership so the C + * chunky fallback never runs. */ + (void)x; (void)y; (void)w; (void)h; (void)colorIndex; if (s != stageGet()) { return false; } - if (h == 0u || w == 0u) { - return true; /* clipped-out: nothing to do, but we "handled" it */ - } - doubled = (uint8_t)(((colorIndex & 0x0Fu) << 4) | (colorIndex & 0x0Fu)); - - if (x == 0 && w == (uint16_t)SURFACE_WIDTH) { - surface68kFillRectFull(s->pixels, y, h, (uint16_t)doubled); - return true; - } - if (((x & 3) == 0) && ((w & 1u) == 0u)) { - uint8_t *rowFirst = &s->pixels[(uint16_t)y * (uint16_t)SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; - surface68kFillRectByteAligned(rowFirst, w >> 1, h, (uint16_t)doubled); - return true; - } - return false; + return true; } +// Phase 3 planar dual-write for fillRect: writes the four off-screen +// shadow plane buffers alongside the chunky shadow. 
Caller (cross- +// platform fillRect) has already done the chunky write via +// halFastFillRect or fillRectClipped. The shadow planes are off- +// screen so this is invisible until stagePresent. +// +// Layout reminder (see docs/amiga_planar.md): each plane byte covers +// 8 horizontal pixels; bit 7 = leftmost pixel of that byte. So a +// rect clipped to [x, x+w) needs: +// * bytes [x/8 .. (x+w-1)/8] in each plane row +// * leading partial byte if (x % 8) != 0 (only bits [7-x%8 .. 0] +// get touched -- the upper bits stay) +// * trailing partial byte if ((x+w-1) % 8) != 7 (only bits [7 .. +// 7-(x+w-1)%8] get touched) +// * single-byte case (byteFirst == byteLast) collapses to one +// read-modify-write with a combined mask. +// For each plane, the bit value at every pixel in the rect is +// constant: (colorIndex >> plane) & 1. Set bit -> OR with mask; +// clear bit -> AND with ~mask. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + AmigaPlanarT *pd; + uint16_t byteFirst; + uint16_t byteLast; + uint16_t numMid; + uint8_t leftMask; + uint8_t rightMask; + uint16_t plane; + int16_t row; + int16_t yEnd; + uint8_t bitVal; + uint8_t fullByte; + uint8_t *p; + uint8_t *planeBase; + + if (s == NULL || w == 0u || h == 0u) { + return; + } + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + + /* Variable shifts on 68000 cost 8 cyc per bit shifted -- for shifts + * by up to 7 that's ~30 cyc per mask, ~60 cyc both. LUTs compile + * to a single byte load. fillCircle r=40 calls halFillRectPlanes + * 160 times, so this saves ~160*60=9600 cyc per fillCircle. 
*/ + static const uint8_t kLeftMaskLut[8] = { + 0xFFu, 0x7Fu, 0x3Fu, 0x1Fu, 0x0Fu, 0x07u, 0x03u, 0x01u + }; + static const uint8_t kRightMaskLut[8] = { + 0x80u, 0xC0u, 0xE0u, 0xF0u, 0xF8u, 0xFCu, 0xFEu, 0xFFu + }; + + byteFirst = (uint16_t)((uint16_t)x >> 3); + byteLast = (uint16_t)(((uint16_t)x + w - 1u) >> 3); + + leftMask = kLeftMaskLut [(uint16_t)x & 7u]; + rightMask = kRightMaskLut[((uint16_t)x + w - 1u) & 7u]; + + yEnd = y + (int16_t)h; + + /* Full-row fast path: no partial-byte RMW on either edge, so each + * plane is a pure long-fill of (h * 40) bytes. fillRect 320x200 + * is the dominant case and lands here; saves 200 rows of leading + * byte RMW + 200 of trailing byte RMW per plane = 1600 chip-bus + * read+write cycles per plane on top of the actual data write. */ + if (byteFirst == 0u && byteLast == (uint16_t)(AMIGA_BYTES_PER_ROW - 1u)) { + uint16_t totalLongs; + uint16_t groups; + uint16_t tail; + uint32_t fillLong; + uint32_t *p32; + + /* AMIGA_BYTES_PER_ROW = 40 = 10 longs / row. 8x-unrolled inner + * loop amortizes the dec+bne to ~2 cyc per store; net ~12 cyc + * per long including chip-bus contention. Tail handles the + * 0..7 longs that don't fit a full group. */ + totalLongs = (uint16_t)((uint16_t)h * (AMIGA_BYTES_PER_ROW / 4u)); + groups = (uint16_t)(totalLongs >> 3); + tail = (uint16_t)(totalLongs & 7u); + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 
0xFFu : 0x00u; + fillLong = (uint32_t)fullByte * 0x01010101UL; + p32 = (uint32_t *)(pd->planes[plane] + + (uint16_t)y * AMIGA_BYTES_PER_ROW); + if (groups > 0u) { + uint16_t n = groups; + do { + p32[0] = fillLong; + p32[1] = fillLong; + p32[2] = fillLong; + p32[3] = fillLong; + p32[4] = fillLong; + p32[5] = fillLong; + p32[6] = fillLong; + p32[7] = fillLong; + p32 += 8; + } while (--n); + } + { + uint16_t t = tail; + while (t > 0u) { + *p32++ = fillLong; + t--; + } + } + } + return; + } + + /* Byte-aligned partial-row fast path: when both edges are full + * bytes (leftMask == rightMask == 0xFF) every byte in the row is + * a full overwrite -- no RMW needed. UBER fillRect 80x80 at x=120 + * lands here (byteFirst=15, byteLast=24). Plane bases are + * MEMF_FAST-allocated long-aligned, and y*40 is also a multiple + * of 4, so rowP alignment is determined by byteFirst alone -- + * computed once, not per-row. */ + if (leftMask == 0xFFu && rightMask == 0xFFu) { + uint16_t nBytes = (uint16_t)(byteLast - byteFirst + 1u); + uint8_t alignBytes = (uint8_t)((4u - (byteFirst & 3u)) & 3u); + uint16_t midBytes; + uint16_t midLongs; + uint16_t tailBytes; + + if (alignBytes > nBytes) { + alignBytes = (uint8_t)nBytes; + } + midBytes = (uint16_t)(nBytes - alignBytes); + midLongs = (uint16_t)(midBytes >> 2); + tailBytes = (uint16_t)(midBytes & 3u); + + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 
0xFFu : 0x00u; + uint32_t fillLong = (uint32_t)fullByte * 0x01010101UL; + planeBase = pd->planes[plane]; + uint8_t *rowP = planeBase + + (uint16_t)y * AMIGA_BYTES_PER_ROW + + byteFirst; + for (row = y; row < yEnd; row++) { + uint8_t *pp = rowP; + uint8_t ab = alignBytes; + uint16_t ml = midLongs; + uint16_t tb = tailBytes; + while (ab > 0u) { + *pp++ = fullByte; + ab--; + } + if (ml > 0u) { + uint32_t *p32 = (uint32_t *)pp; + do { + *p32++ = fillLong; + } while (--ml); + pp = (uint8_t *)p32; + } + while (tb > 0u) { + *pp++ = fullByte; + tb--; + } + rowP += AMIGA_BYTES_PER_ROW; + } + } + return; + } + + /* Hoist bitVal-dependent setup outside the row loop. Two + * specialized per-plane paths (OR for set, AND-NOT for clear) + * give gcc -O2 simple branchless inner loops. Row pointer is + * advanced by += AMIGA_BYTES_PER_ROW instead of recomputed per + * row -- saves the per-iter multiply. + * + * Single-byte case (byteFirst == byteLast) uses the combined + * mask; multi-byte case uses leading + middle long-fill + + * trailing. The middle long-fill path is identical to the + * earlier code (align, long stores, drain) but lifted into the + * per-plane scope so the constants are loop-invariant. 
*/ + { + uint8_t notLeftMask = (uint8_t)~leftMask; + uint8_t notRightMask = (uint8_t)~rightMask; + + if (byteFirst == byteLast) { + uint8_t mask = (uint8_t)(leftMask & rightMask); + uint8_t notMask = (uint8_t)~mask; + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + planeBase = pd->planes[plane]; + p = planeBase + (uint16_t)y * AMIGA_BYTES_PER_ROW + byteFirst; + if ((colorIndex >> plane) & 1u) { + /* OR path */ + for (row = y; row < yEnd; row++) { + *p = (uint8_t)(*p | mask); + p += AMIGA_BYTES_PER_ROW; + } + } else { + /* AND-NOT path */ + for (row = y; row < yEnd; row++) { + *p = (uint8_t)(*p & notMask); + p += AMIGA_BYTES_PER_ROW; + } + } + } + return; + } + + numMid = (uint16_t)(byteLast - byteFirst - 1u); + + /* Hoist middle-region alignment outside both per-plane and + * per-row loops. midStart = planeBase + y*40 + byteFirst + 1. + * Plane bases are MEMF_FAST long-aligned and y*40 is a + * multiple of 4, so midStart's alignment is determined by + * (byteFirst + 1) & 3 alone -- constant across planes/rows. */ + uint8_t midAlignBytes = (uint8_t)((4u - ((byteFirst + 1u) & 3u)) & 3u); + uint16_t midRem; + uint16_t midLongs; + uint16_t midTail; + + if (midAlignBytes > numMid) { + midAlignBytes = (uint8_t)numMid; + } + midRem = (uint16_t)(numMid - midAlignBytes); + midLongs = (uint16_t)(midRem >> 2); + midTail = (uint16_t)(midRem & 3u); + + /* Small-numMid byte-only path: when there are no full longs to + * fill, the unified long-fill machinery's runtime ml/tb checks + * cost more than they save. UBER fillRect 16x16 (numMid=1) + * lands here. */ + if (midLongs == 0u) { + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 0xFFu : 0x00u; + uint8_t leadBits = (uint8_t)(bitVal ? leftMask : notLeftMask); + uint8_t trailBits = (uint8_t)(bitVal ? 
rightMask : notRightMask); + planeBase = pd->planes[plane]; + p = planeBase + (uint16_t)y * AMIGA_BYTES_PER_ROW + byteFirst; + + if (bitVal) { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint16_t m = numMid; + *pp = (uint8_t)(*pp | leadBits); pp++; + while (m > 0u) { *pp++ = fullByte; m--; } + *pp = (uint8_t)(*pp | trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } else { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint16_t m = numMid; + *pp = (uint8_t)(*pp & leadBits); pp++; + while (m > 0u) { *pp++ = fullByte; m--; } + *pp = (uint8_t)(*pp & trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } + } + return; + } + + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + bitVal = (uint8_t)((colorIndex >> plane) & 1u); + fullByte = bitVal ? 0xFFu : 0x00u; + uint32_t fillLong = (uint32_t)fullByte * 0x01010101UL; + uint8_t leadBits = (uint8_t)(bitVal ? leftMask : notLeftMask); + uint8_t trailBits= (uint8_t)(bitVal ? rightMask : notRightMask); + planeBase = pd->planes[plane]; + p = planeBase + (uint16_t)y * AMIGA_BYTES_PER_ROW + byteFirst; + + if (bitVal) { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint8_t ab = midAlignBytes; + uint16_t ml = midLongs; + uint16_t tb = midTail; + *pp = (uint8_t)(*pp | leadBits); pp++; + while (ab > 0u) { *pp++ = fullByte; ab--; } + { + uint32_t *p32 = (uint32_t *)pp; + do { *p32++ = fillLong; } while (--ml); + pp = (uint8_t *)p32; + } + while (tb > 0u) { *pp++ = fullByte; tb--; } + *pp = (uint8_t)(*pp | trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } else { + for (row = y; row < yEnd; row++) { + uint8_t *pp = p; + uint8_t ab = midAlignBytes; + uint16_t ml = midLongs; + uint16_t tb = midTail; + *pp = (uint8_t)(*pp & leadBits); pp++; + while (ab > 0u) { *pp++ = fullByte; ab--; } + { + uint32_t *p32 = (uint32_t *)pp; + do { *p32++ = fillLong; } while (--ml); + pp = (uint8_t *)p32; + } + while (tb > 0u) { *pp++ = fullByte; tb--; } + *pp = (uint8_t)(*pp & trailBits); + p += AMIGA_BYTES_PER_ROW; + } + } + } + 
} +} + + +// Phase 5 planar dual-write for tile ops, fully planar after Phase 9 +// dropped the chunky shadow. All tiles are 8-pixel aligned (8x8 blocks +// at multiples of 8), so plane writes are byte-aligned -- one plane +// byte per row, 8 rows per tile, no edge masks. Stride between rows +// in a plane is AMIGA_BYTES_PER_ROW (40). + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + AmigaPlanarT *pd; + uint16_t plane; + uint8_t fillByte; + uint8_t *p; + uint8_t row; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + fillByte = ((colorIndex >> plane) & 1u) ? 0xFFu : 0x00u; + p = pd->planes[plane] + (uint16_t)by * 8u * AMIGA_BYTES_PER_ROW + bx; + for (row = 0; row < 8u; row++) { + *p = fillByte; + p += AMIGA_BYTES_PER_ROW; + } + } +} + + +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + AmigaPlanarT *dstPd; + AmigaPlanarT *srcPd; + uint16_t plane; + uint8_t *dp; + const uint8_t *sp; + uint8_t row; + + dstPd = (AmigaPlanarT *)dst->portData; + srcPd = (AmigaPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + dp = dstPd->planes[plane] + (uint16_t)dstBy * 8u * AMIGA_BYTES_PER_ROW + dstBx; + sp = srcPd->planes[plane] + (uint16_t)srcBy * 8u * AMIGA_BYTES_PER_ROW + srcBx; + for (row = 0; row < 8u; row++) { + *dp = *sp; + dp += AMIGA_BYTES_PER_ROW; + sp += AMIGA_BYTES_PER_ROW; + } + } +} + + +// Pure-planar masked copy. For each row of the 8x8 tile, read 4 src +// plane bytes; compute a per-pixel "non-transparent" mask via XOR +// against the transparent index's per-plane bit pattern (a pixel +// matches transparent IFF all 4 plane bits match transparent's 4 +// bits = OR of 4 XOR'd bytes is 0 in that bit). 
Then for each plane, +// dst = (dst & ~mask) | (src & mask) -- copy src bits at mask-set +// positions, preserve dst bits elsewhere. +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + AmigaPlanarT *dstPd; + AmigaPlanarT *srcPd; + uint8_t transparent; + uint8_t transBitByte[AMIGA_BITPLANES]; + uint16_t i; + uint8_t row; + uint16_t srcByteOff; + uint16_t dstByteOff; + uint8_t srcPlaneBytes[AMIGA_BITPLANES]; + uint8_t maskByte; + + dstPd = (AmigaPlanarT *)dst->portData; + srcPd = (AmigaPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + transparent = (uint8_t)(transparentIndex & 0x0Fu); + /* Per-plane "all bits set if transparent's bit at this plane is 1 + * else all 0" -- so XOR gives bit set where pixel differs from + * transparent in that plane. */ + for (i = 0; i < AMIGA_BITPLANES; i++) { + transBitByte[i] = ((transparent >> i) & 1u) ? 0xFFu : 0x00u; + } + + for (row = 0; row < 8u; row++) { + srcByteOff = (uint16_t)((uint16_t)srcBy * 8u + row) * AMIGA_BYTES_PER_ROW + srcBx; + dstByteOff = (uint16_t)((uint16_t)dstBy * 8u + row) * AMIGA_BYTES_PER_ROW + dstBx; + srcPlaneBytes[0] = srcPd->planes[0][srcByteOff]; + srcPlaneBytes[1] = srcPd->planes[1][srcByteOff]; + srcPlaneBytes[2] = srcPd->planes[2][srcByteOff]; + srcPlaneBytes[3] = srcPd->planes[3][srcByteOff]; + /* maskByte: bit set where pixel differs from transparent in + * ANY plane -- i.e., where the pixel is non-transparent. */ + maskByte = (uint8_t)((srcPlaneBytes[0] ^ transBitByte[0]) + | (srcPlaneBytes[1] ^ transBitByte[1]) + | (srcPlaneBytes[2] ^ transBitByte[2]) + | (srcPlaneBytes[3] ^ transBitByte[3])); + for (i = 0; i < AMIGA_BITPLANES; i++) { + uint8_t existing = dstPd->planes[i][dstByteOff]; + dstPd->planes[i][dstByteOff] = (uint8_t)((existing & (uint8_t)~maskByte) + | (srcPlaneBytes[i] & maskByte)); + } + } +} + + +// Phase 8 planar dual-write for asset blits. 
Walks the asset's +// chunky pixel buffer in the already-clipped (srcX0, srcY0).. +// (srcX0+copyW, srcY0+copyH) range and sets dst plane bits per +// pixel via amigaPlanarSetPixel. transparent == 0xFFFF means opaque +// (no skip); any nibble value 0..15 means skip that color. Asset +// row stride is srcRowBytes (asset width may be < SURFACE_WIDTH). +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + AmigaPlanarT *pd; + int16_t row; + int16_t col; + int16_t sx; + const uint8_t *srcRow; + uint8_t byte; + uint8_t nibble; + bool hasMask; + uint8_t transNibble; + + pd = (AmigaPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + hasMask = (transparent <= 0x0Fu); + transNibble = (uint8_t)(transparent & 0x0Fu); + + for (row = 0; row < copyH; row++) { + srcRow = &srcBytes[(uint16_t)(srcY0 + row) * (uint16_t)srcRowBytes]; + for (col = 0; col < copyW; col++) { + sx = (int16_t)(srcX0 + col); + byte = srcRow[sx >> 1]; + nibble = (sx & 1) ? (uint8_t)(byte & 0x0Fu) : (uint8_t)(byte >> 4); + if (hasMask && nibble == transNibble) { + continue; + } + amigaPlanarSetPixel(pd, (int16_t)(x + col), (int16_t)(y + row), nibble); + } + } +} + + +// Phase 9 sprite save/restore plane-backup hooks. +// Sprite save at (x, y, w, h) writes 4 plane stripes into backup +// buffer; restore reads them back. x and w are 2-pixel aligned by +// cross-platform code; we round x DOWN and w UP to 8-pixel boundaries +// here so plane writes are byte-aligned. Backup layout (matches the +// h * w/2 = 4 * h * w/8 sizing the cross-platform code allocates): +// bytes [0 .. h*bpr ): plane 0 rows +// bytes [h*bpr .. 2*h*bpr ): plane 1 rows +// bytes [2*h*bpr .. 3*h*bpr ): plane 2 rows +// bytes [3*h*bpr .. 4*h*bpr ): plane 3 rows +// where bpr = bytesPerPlaneRow = roundedW/8. 
+// +// If the rect's rounded width is wider than the chunky-sized backup +// would hold (h * (w/2) bytes), we silently truncate -- the planar +// stripes for partial-byte-aligned sprites won't fit. This case is +// rare for tile-aligned sprites; document if it bites. + +static void amigaSpriteRoundRect(int16_t *xp, uint16_t *wp, uint16_t *bprp) { + int16_t xIn = *xp; + uint16_t wIn = *wp; + int16_t xOut = (int16_t)(xIn & ~7); /* round down to 8-pixel */ + uint16_t span = (uint16_t)(((uint16_t)xIn + wIn) - (uint16_t)xOut); + uint16_t wOut = (uint16_t)((span + 7u) & ~7u); + *xp = xOut; + *wp = wOut; + *bprp = (uint16_t)(wOut >> 3); +} + + +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { + AmigaPlanarT *pd; + uint16_t bpr; + uint16_t planeStripe; + uint16_t i; + uint16_t row; + uint8_t *dst; + const uint8_t *src; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL || dstPlaneBytes == NULL) { + return; + } + amigaSpriteRoundRect(&x, &w, &bpr); + planeStripe = (uint16_t)((uint16_t)h * bpr); + for (i = 0; i < AMIGA_BITPLANES; i++) { + dst = dstPlaneBytes + i * planeStripe; + for (row = 0; row < (uint16_t)h; row++) { + src = pd->planes[i] + ((uint16_t)y + row) * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3); + memcpy(dst, src, bpr); + dst += bpr; + } + } +} + + +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + AmigaPlanarT *pd; + uint16_t bpr; + uint16_t planeStripe; + uint16_t i; + uint16_t row; + uint8_t *dst; + const uint8_t *src; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL || srcPlaneBytes == NULL) { + return; + } + amigaSpriteRoundRect(&x, &w, &bpr); + planeStripe = (uint16_t)((uint16_t)h * bpr); + for (i = 0; i < AMIGA_BITPLANES; i++) { + src = srcPlaneBytes + i * planeStripe; + for (row = 0; row < (uint16_t)h; row++) { + dst = pd->planes[i] + ((uint16_t)y + row) * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3); + memcpy(dst, src, 
bpr); + src += bpr; + } + } +} + + +/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes + * from a freshly-loaded chunky pixel buffer (s->pixels). */ +static void amigaPopulatePlanesFromChunky(SurfaceT *s) { + AmigaPlanarT *pd; + int16_t y; + const uint8_t *srcLine; + UBYTE *p0; + UBYTE *p1; + UBYTE *p2; + UBYTE *p3; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + if (!gC2pLutReady) { + initC2pLut(); + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + srcLine = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut); + } +} + + +// Phase 6 planar dual-write for sprite draw. Walks the sprite's +// chunky tile data with the same clipping the cross-platform code +// applies, calling amigaPlanarSetPixel for every non-transparent +// pixel (nibble != 0). Bypasses the compiled fast path entirely -- +// the chunky compiled draw already ran by the time we get here, so +// we just mirror its pixel set into the planes. +// +// Sprite tile data layout: tileData = wTiles * hTiles tiles, each +// tile = 8 rows x 4 chunky bytes. Tiles laid out row-major. +// For pixel (px, py) within the sprite: +// tileX = px / 8, tileY = py / 8 +// inTileX = px % 8, inTileY = py % 8 +// tileBase = tileData + (tileY * wTiles + tileX) * 32 +// byte = tileBase[inTileY * 4 + inTileX/2] +// nibble = (inTileX & 1) ? byte & 0x0F : byte >> 4 +// +// Save/restore have no equivalent planar dual-write yet (the +// SpriteBackupT only has chunky storage); workable approach needs +// a parallel plane-data buffer. Apps that depend on PLANAR_PRESENT +// save/restore semantics will see stale planes after restore. 
+void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + AmigaPlanarT *pd; + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t w; + int16_t h; + int16_t row; + int16_t col; + int16_t spritePx; + int16_t spritePy; + int16_t tileX; + int16_t tileY; + int16_t inTileX; + int16_t inTileY; + uint16_t wTiles; + const uint8_t *tile; + uint8_t byte; + uint8_t nibble; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return; + } + wTiles = sp->widthTiles; + w = (int16_t)(wTiles * 8); + h = (int16_t)(sp->heightTiles * 8); + dx = x; + dy = y; + + /* Clip dst rect against surface. sx/sy track the offset INTO the + * sprite that the clipped region starts at. Same logic as + * sprite.c:clipRect() but inlined since that helper is static. */ + sx = 0; + sy = 0; + if (dx < 0) { sx = (int16_t)(-dx); w = (int16_t)(w - sx); dx = 0; } + if (dy < 0) { sy = (int16_t)(-dy); h = (int16_t)(h - sy); dy = 0; } + if (dx >= SURFACE_WIDTH || dy >= SURFACE_HEIGHT || w <= 0 || h <= 0) { + return; + } + if (dx + w > SURFACE_WIDTH) { w = (int16_t)(SURFACE_WIDTH - dx); } + if (dy + h > SURFACE_HEIGHT) { h = (int16_t)(SURFACE_HEIGHT - dy); } + + for (row = 0; row < h; row++) { + spritePy = (int16_t)(sy + row); + tileY = (int16_t)(spritePy >> 3); /* / 8 */ + inTileY = (int16_t)(spritePy & 7); + for (col = 0; col < w; col++) { + spritePx = (int16_t)(sx + col); + tileX = (int16_t)(spritePx >> 3); + inTileX = (int16_t)(spritePx & 7); + tile = sp->tileData + (uint32_t)((tileY * wTiles + tileX) * 32); + byte = tile[inTileY * 4 + (inTileX >> 1)]; + nibble = (inTileX & 1) ? (uint8_t)(byte & 0x0Fu) : (uint8_t)(byte >> 4); + if (nibble != 0u) { + amigaPlanarSetPixel(pd, (int16_t)(dx + col), (int16_t)(dy + row), nibble); + } + } + } +} + + +// Phase 9 plane-to-chunky derivation for tileSnap. 
Reads 8 plane +// bytes (1 byte per row x 8 rows) from each of 4 planes for the +// 8-pixel-aligned tile column at bx, then assembles 32 chunky bytes +// (4 per row x 8 rows, packed 2 px/byte high-then-low nibble) into +// On Amiga, TileT.pixels is opaque port-specific storage (cross- +// platform tile.c never reads it directly when s->pixels is NULL). +// We use those 32 bytes as 4 planes x 8 rows, plane-major: +// bytes [0..7] = plane 0, rows 0..7 +// bytes [8..15] = plane 1, rows 0..7 +// bytes [16..23] = plane 2, rows 0..7 +// bytes [24..31] = plane 3, rows 0..7 +// snap/paste then become 32 plain byte loads + stores -- no chunky +// <-> planar conversion at all. The previous c2p-based path paid +// 4 KB LUT lookups + bit shuffling per pixel; this is ~50x cheaper. +#define AMIGA_TILE_PLANE_STRIDE 8 +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *tileOut) { + AmigaPlanarT *pd; + uint16_t row; + uint16_t rowBase; + uint8_t plane; + + pd = (AmigaPlanarT *)src->portData; + if (pd == NULL) { + return; + } + rowBase = (uint16_t)((uint16_t)by * 8u) * AMIGA_BYTES_PER_ROW + bx; + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + const uint8_t *p = pd->planes[plane] + rowBase; + uint8_t *q = tileOut + plane * AMIGA_TILE_PLANE_STRIDE; + for (row = 0; row < 8u; row++) { + q[row] = p[row * AMIGA_BYTES_PER_ROW]; + } + } +} + + +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *tileBytes) { + AmigaPlanarT *pd; + uint8_t row; + uint8_t plane; + uint16_t rowBase; + + pd = (AmigaPlanarT *)dst->portData; + if (pd == NULL) { + return; + } + /* TileT.pixels[] holds plane-major planar bytes (see + * halTileSnapPlanes header for layout). Paste = 32 byte stores + * with no chunky -> planar conversion. 
*/ + rowBase = (uint16_t)((uint16_t)by * 8u) * AMIGA_BYTES_PER_ROW + bx; + for (plane = 0; plane < AMIGA_BITPLANES; plane++) { + const uint8_t *q = tileBytes + plane * AMIGA_TILE_PLANE_STRIDE; + uint8_t *p = pd->planes[plane] + rowBase; + for (row = 0; row < 8u; row++) { + p[row * AMIGA_BYTES_PER_ROW] = q[row]; + } + } +} + + +// Phase 3 planar dual-write for surfaceCopy: 4 plane memcpys after +// the cross-platform chunky pixel memcpy. Both src and dst planes +// are off-screen shadow buffers; the displayed gPlanes[] is updated +// only at stagePresent. +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + AmigaPlanarT *dstPd; + AmigaPlanarT *srcPd; + uint16_t i; + + dstPd = (AmigaPlanarT *)dst->portData; + srcPd = (AmigaPlanarT *)src->portData; + if (dstPd == NULL || srcPd == NULL) { + return; + } + for (i = 0; i < AMIGA_BITPLANES; i++) { + memcpy(dstPd->planes[i], srcPd->planes[i], AMIGA_PLANE_SIZE); + } +} + + +/* Phase 9: tile halFast hooks return true to suppress chunky fallback. + * The actual planar work happens in halTileFillPlanes / halTileCopyPlanes + * / etc. (called by cross-platform tile.c after each halFast). tileSnap + * outputs a chunky TileT -- see halTileSnapPlanes-style derivation + * inside the snap planar work added below if needed. For now tileSnap + * skips its output (TileT will be all-zeros) when called on Amiga; + * apps that depend on tileSnap on Amiga need a planar-to-chunky + * derivation hook (TODO if any UBER/demo path actually exercises it). 
*/ bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { (void)dstRow0; (void)srcRow0; - return false; + return true; } @@ -668,58 +1569,304 @@ bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t tra (void)dstRow0; (void)srcRow0; (void)transparent; - return false; + return true; } bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { (void)dstRow0; (void)srcTilePixels; - return false; + return true; } bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { (void)dstTilePixels; (void)srcRow0; - return false; + return true; +} + + +// Phase 4 planar dual-write helper: set one pixel's bit in each of +// the four shadow planes. Caller (the per-primitive walker below) has +// already validated (x, y) is on-surface. byteOff is computed once +// and reused across all four planes since stride is the same in each. +/* Per-pixel plane RMW with the color-bit classification hoisted by + * the caller. set0..set3 are 0xFF if that plane's color bit is 1 + * (OR-in bitMask), 0 if 0 (AND-out bitMask). Each plane updates with + * `*pn = (*pn & ~bitMask) | (setN & bitMask)` -- branch-free, 4 + * RMWs per pixel. Inlined into shape walkers so the outer loop pays + * no function-call overhead. */ +/* Bit-mask LUT for the per-pixel macro -- replaces a runtime + * `0x80u >> (x & 7)` (8 cyc per bit shifted on 68000) with a single + * byte load. Saves ~25 cyc per pixel inside circle / line walkers. 
*/ +static const uint8_t kAmigaPlanePutPixelBitLut[8] = { + 0x80u, 0x40u, 0x20u, 0x10u, 0x08u, 0x04u, 0x02u, 0x01u +}; + +#define AMIGA_PLANE_PUT_PIXEL(pd_, x_, y_, set0_, set1_, set2_, set3_) do { \ + uint16_t byteOff_ = (uint16_t)((uint16_t)(y_) * AMIGA_BYTES_PER_ROW + ((uint16_t)(x_) >> 3)); \ + uint8_t bitMask_ = kAmigaPlanePutPixelBitLut[(uint16_t)(x_) & 7u]; \ + uint8_t notMask_ = (uint8_t)~bitMask_; \ + uint8_t *p0_ = &(pd_)->planes[0][byteOff_]; \ + uint8_t *p1_ = &(pd_)->planes[1][byteOff_]; \ + uint8_t *p2_ = &(pd_)->planes[2][byteOff_]; \ + uint8_t *p3_ = &(pd_)->planes[3][byteOff_]; \ + *p0_ = (uint8_t)((*p0_ & notMask_) | ((set0_) & bitMask_)); \ + *p1_ = (uint8_t)((*p1_ & notMask_) | ((set1_) & bitMask_)); \ + *p2_ = (uint8_t)((*p2_ & notMask_) | ((set2_) & bitMask_)); \ + *p3_ = (uint8_t)((*p3_ & notMask_) | ((set3_) & bitMask_)); \ +} while (0) + + +static void amigaPlanarSetPixel(AmigaPlanarT *pd, int16_t x, int16_t y, uint8_t color) { + uint16_t byteOff; + uint8_t bitMask; + uint8_t notMask; + uint8_t *p0; + uint8_t *p1; + uint8_t *p2; + uint8_t *p3; + + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3)); + bitMask = (uint8_t)(0x80u >> ((uint16_t)x & 7u)); + notMask = (uint8_t)~bitMask; + + /* Unroll the 4-plane loop. Loop counter + array indexing inside + * the hot per-pixel path is the gating cost on circle outlines: + * UBER drawCircle r=80 calls this ~640 times per call. 
*/ + p0 = &pd->planes[0][byteOff]; + p1 = &pd->planes[1][byteOff]; + p2 = &pd->planes[2][byteOff]; + p3 = &pd->planes[3][byteOff]; + + if (color & 0x01u) { *p0 = (uint8_t)(*p0 | bitMask); } else { *p0 = (uint8_t)(*p0 & notMask); } + if (color & 0x02u) { *p1 = (uint8_t)(*p1 | bitMask); } else { *p1 = (uint8_t)(*p1 & notMask); } + if (color & 0x04u) { *p2 = (uint8_t)(*p2 | bitMask); } else { *p2 = (uint8_t)(*p2 & notMask); } + if (color & 0x08u) { *p3 = (uint8_t)(*p3 | bitMask); } else { *p3 = (uint8_t)(*p3 & notMask); } } bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - uint8_t nibLo; + AmigaPlanarT *pd; + uint8_t nibLo; if (s != stageGet()) { return false; } nibLo = (uint8_t)(colorIndex & 0x0Fu); - draw68kPlotPixel(s->pixels, (int16_t)x, (int16_t)y, nibLo, (uint8_t)(nibLo << 4)); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + amigaPlanarSetPixel(pd, (int16_t)x, (int16_t)y, nibLo); + } return true; } +// Bresenham's diagonal line, planar-only walk. Same algorithm as +// cross-platform drawLine's fallback so the pixel set matches the +// chunky walker (draw68kLine) bit-for-bit. +static void amigaPlanarLine(AmigaPlanarT *pd, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t color) { + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t err; + int16_t e2; + uint8_t set0; + uint8_t set1; + uint8_t set2; + uint8_t set3; + + set0 = (color & 0x01u) ? 0xFFu : 0x00u; + set1 = (color & 0x02u) ? 0xFFu : 0x00u; + set2 = (color & 0x04u) ? 0xFFu : 0x00u; + set3 = (color & 0x08u) ? 0xFFu : 0x00u; + + dx = (int16_t)((x1 > x0) ? (x1 - x0) : (x0 - x1)); + dy = (int16_t)(-((y1 > y0) ? (y1 - y0) : (y0 - y1))); + sx = (int16_t)((x0 < x1) ? 1 : -1); + sy = (int16_t)((y0 < y1) ? 
1 : -1); + err = (int16_t)(dx + dy); + while (1) { + AMIGA_PLANE_PUT_PIXEL(pd, x0, y0, set0, set1, set2, set3); + if (x0 == x1 && y0 == y1) { + break; + } + e2 = (int16_t)(2 * err); + if (e2 >= dy) { + err = (int16_t)(err + dy); + x0 = (int16_t)(x0 + sx); + } + if (e2 <= dx) { + err = (int16_t)(err + dx); + y0 = (int16_t)(y0 + sy); + } + } +} + + bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + AmigaPlanarT *pd; if (s != stageGet()) { return false; } - draw68kLine(s->pixels, x0, y0, x1, y1, colorIndex); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + amigaPlanarLine(pd, x0, y0, x1, y1, (uint8_t)(colorIndex & 0x0Fu)); + } return true; } +// 8-octant midpoint circle outline, planar-only walk. Mirrors +// drawCircle's cross-platform fallback exactly so plane bits land at +// the same pixels as the chunky walker (draw68kCircleOutline). +static void amigaPlanarCircleOutline(AmigaPlanarT *pd, int16_t cx, int16_t cy, uint16_t r, uint8_t color) { + int16_t x; + int16_t y; + int16_t err; + uint8_t set0; + uint8_t set1; + uint8_t set2; + uint8_t set3; + + /* Classify each plane once: 0xFF if color bit is 1 (set bitMask), + * 0 if bit is 0 (clear bitMask). The per-pixel macro then folds + * this into a branch-free RMW. */ + set0 = (color & 0x01u) ? 0xFFu : 0x00u; + set1 = (color & 0x02u) ? 0xFFu : 0x00u; + set2 = (color & 0x04u) ? 0xFFu : 0x00u; + set3 = (color & 0x08u) ? 
0xFFu : 0x00u; + + x = (int16_t)r; + y = 0; + err = (int16_t)(1 - x); + while (x >= y) { + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + x), (int16_t)(cy + y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - x), (int16_t)(cy + y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + x), (int16_t)(cy - y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - x), (int16_t)(cy - y), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + y), (int16_t)(cy + x), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - y), (int16_t)(cy + x), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx + y), (int16_t)(cy - x), set0, set1, set2, set3); + AMIGA_PLANE_PUT_PIXEL(pd, (int16_t)(cx - y), (int16_t)(cy - x), set0, set1, set2, set3); + y++; + if (err <= 0) { + err = (int16_t)(err + y + y + 1); + } else { + x--; + err = (int16_t)(err + y + y - x - x + 1); + } + } +} + + bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + AmigaPlanarT *pd; if (s != stageGet()) { return false; } - draw68kCircleOutline(s->pixels, cx, cy, r, colorIndex); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + surface68kAmigaCircleOutline(pd->planes[0], pd->planes[1], + pd->planes[2], pd->planes[3], + (uint16_t)cx, (uint16_t)cy, r, + (uint8_t)(colorIndex & 0x0Fu)); + } return true; } +/* Single-row 4-plane span fill via shared68k asm. Caller pre-computes + * the left/right partial-byte masks; fillByte per plane is just + * 0xFF/0x00 based on colorIndex bit. The asm body avoids per-byte + * function-call dispatch and the C compiler's per-iter overhead -- + * critical for fillCircle r=40 which pre-asm was paying ~50 ms/call + * for 80 spans. 
*/ +static inline __attribute__((always_inline)) +void amigaFillSpanInline(AmigaPlanarT *pd, int16_t spanX, int16_t spanY, + uint16_t spanW, uint8_t colorIndex) { + static const uint8_t kLM[8] = {0xFFu,0x7Fu,0x3Fu,0x1Fu,0x0Fu,0x07u,0x03u,0x01u}; + static const uint8_t kRM[8] = {0x80u,0xC0u,0xE0u,0xF0u,0xF8u,0xFCu,0xFEu,0xFFu}; + + uint16_t byteFirst = (uint16_t)((uint16_t)spanX >> 3); + uint16_t lastBit = (uint16_t)(spanX + spanW - 1); + uint16_t byteLast = (uint16_t)(lastBit >> 3); + uint8_t leftMask = kLM[(uint16_t)spanX & 7u]; + uint8_t rightMask = kRM[lastBit & 7u]; + uint16_t rowOff = (uint16_t)((uint16_t)spanY * AMIGA_BYTES_PER_ROW + byteFirst); + uint8_t fb0 = ((colorIndex >> 0) & 1u) ? 0xFFu : 0x00u; + uint8_t fb1 = ((colorIndex >> 1) & 1u) ? 0xFFu : 0x00u; + uint8_t fb2 = ((colorIndex >> 2) & 1u) ? 0xFFu : 0x00u; + uint8_t fb3 = ((colorIndex >> 3) & 1u) ? 0xFFu : 0x00u; + uint8_t *p0 = pd->planes[0] + rowOff; + uint8_t *p1 = pd->planes[1] + rowOff; + uint8_t *p2 = pd->planes[2] + rowOff; + uint8_t *p3 = pd->planes[3] + rowOff; + + if (byteFirst == byteLast) { + /* Single-byte case kept in C: the asm path post-increments + * the pointer between leading and trailing RMW, which would + * read the wrong byte if both edges land on the same byte. + * One-byte spans are rare anyway (~1 of 80 in fillCircle r=40) + * so the C overhead is fine here. 
*/ + uint8_t mask = (uint8_t)(leftMask & rightMask); + uint8_t notMask = (uint8_t)~mask; + uint8_t *pp[AMIGA_BITPLANES]; + uint8_t fb[AMIGA_BITPLANES]; + uint8_t i; + pp[0] = p0; pp[1] = p1; pp[2] = p2; pp[3] = p3; + fb[0] = fb0; fb[1] = fb1; fb[2] = fb2; fb[3] = fb3; + for (i = 0; i < AMIGA_BITPLANES; i++) { + *pp[i] = (uint8_t)((*pp[i] & notMask) | (fb[i] & mask)); + } + return; + } + + { + uint16_t numMid = (uint16_t)(byteLast - byteFirst - 1u); + surface68kFillSpan4Planes(p0, p1, p2, p3, numMid, leftMask, rightMask, + fb0, fb1, fb2, fb3); + } +} + + bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + AmigaPlanarT *pd; + int16_t x; + int16_t y; + int16_t err; + int16_t spanX; + uint16_t spanW; + if (s != stageGet()) { return false; } - draw68kCircleFill(s->pixels, cx, cy, r, colorIndex); + pd = (AmigaPlanarT *)s->portData; + if (pd != NULL) { + /* Inline per-span plane fill -- avoids 4 halFillRectPlanes + * dispatches per midpoint iter (~320 dispatches for r=40). 
*/ + x = (int16_t)r; + y = 0; + err = (int16_t)(1 - x); + while (x >= y) { + spanX = (int16_t)(cx - x); + spanW = (uint16_t)(2 * x + 1); + amigaFillSpanInline(pd, spanX, (int16_t)(cy + y), spanW, colorIndex); + amigaFillSpanInline(pd, spanX, (int16_t)(cy - y), spanW, colorIndex); + spanX = (int16_t)(cx - y); + spanW = (uint16_t)(2 * y + 1); + amigaFillSpanInline(pd, spanX, (int16_t)(cy + x), spanW, colorIndex); + amigaFillSpanInline(pd, spanX, (int16_t)(cy - x), spanW, colorIndex); + y++; + if (err <= 0) { + err = (int16_t)(err + y + y + 1); + } else { + x--; + err = (int16_t)(err + y + y - x - x + 1); + } + } + } return true; } @@ -749,19 +1896,6 @@ bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t ma } -bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { - (void)dstRow0; - (void)dstX; - (void)srcRow0; - (void)srcX; - (void)copyW; - (void)copyH; - (void)srcRowBytes; - (void)transparent; - return false; -} - - bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { (void)row; (void)leftX; @@ -796,20 +1930,471 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t mat } +/* Plane-aware nibble at (x, y) given the 4 plane row bases. Reads one + * byte per plane and assembles the 4-bit color from a single bit + * position. Caller is responsible for valid (x, y). 
*/ +static uint8_t amigaNibbleFromPlanes(uint8_t * const planes[AMIGA_BITPLANES], int16_t x) { + uint16_t byteOff; + uint8_t bitMask; + uint8_t color; + + byteOff = (uint16_t)((uint16_t)x >> 3); + bitMask = (uint8_t)(0x80u >> ((uint16_t)x & 7u)); + color = 0u; + if (planes[0][byteOff] & bitMask) color = (uint8_t)(color | 0x01u); + if (planes[1][byteOff] & bitMask) color = (uint8_t)(color | 0x02u); + if (planes[2][byteOff] & bitMask) color = (uint8_t)(color | 0x04u); + if (planes[3][byteOff] & bitMask) color = (uint8_t)(color | 0x08u); + return color; +} + + +/* Build the 4 plane row pointers for a given y. */ +static void amigaPlaneRowPtrs(const SurfaceT *s, int16_t y, uint8_t **out /* [4] */) { + AmigaPlanarT *pd; + uint16_t yOff; + uint8_t i; + + pd = (AmigaPlanarT *)s->portData; + yOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW); + for (i = 0; i < AMIGA_BITPLANES; i++) { + out[i] = pd->planes[i] + yOff; + } +} + + +bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + AmigaPlanarT *pd; + uint8_t *rowPlanes[AMIGA_BITPLANES]; + int16_t leftX; + int16_t rightX; + uint8_t pix; + bool pixMatch; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return false; + } + matchColor = (uint8_t)(matchColor & 0x0Fu); + newColor = (uint8_t)(newColor & 0x0Fu); + amigaPlaneRowPtrs(s, y, rowPlanes); + + pix = amigaNibbleFromPlanes(rowPlanes, startX); + pixMatch = (pix == matchColor); + if (matchEqual ? !pixMatch : (pixMatch || pix == newColor)) { + *seedMatched = false; + return true; + } + *seedMatched = true; + + leftX = startX; + while (leftX > 0) { + pix = amigaNibbleFromPlanes(rowPlanes, (int16_t)(leftX - 1)); + pixMatch = (pix == matchColor); + if (matchEqual ? 
!pixMatch : (pixMatch || pix == newColor)) { + break; + } + leftX--; + } + + rightX = startX; + while (rightX < SURFACE_WIDTH - 1) { + pix = amigaNibbleFromPlanes(rowPlanes, (int16_t)(rightX + 1)); + pixMatch = (pix == matchColor); + if (matchEqual ? !pixMatch : (pixMatch || pix == newColor)) { + break; + } + rightX++; + } + + *leftXOut = leftX; + *rightXOut = rightX; + return true; +} + + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + AmigaPlanarT *pd; + uint8_t *rowPlanes[AMIGA_BITPLANES]; + int16_t byteCol; + int16_t byteColFirst; + int16_t byteColLast; + int16_t bit; + int16_t x; + int16_t markIdx; + uint8_t p0, p1, p2, p3; + uint8_t bitMask; + uint8_t pix; + bool pixMatch; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return false; + } + matchColor = (uint8_t)(matchColor & 0x0Fu); + newColor = (uint8_t)(newColor & 0x0Fu); + amigaPlaneRowPtrs(s, scanY, rowPlanes); + + byteColFirst = (int16_t)(leftX >> 3); + byteColLast = (int16_t)(rightX >> 3); + for (byteCol = byteColFirst; byteCol <= byteColLast; byteCol++) { + p0 = rowPlanes[0][byteCol]; + p1 = rowPlanes[1][byteCol]; + p2 = rowPlanes[2][byteCol]; + p3 = rowPlanes[3][byteCol]; + for (bit = 0; bit < 8; bit++) { + x = (int16_t)((byteCol << 3) + bit); + if (x < leftX || x > rightX) { + continue; + } + bitMask = (uint8_t)(0x80u >> bit); + pix = 0u; + if (p0 & bitMask) pix = (uint8_t)(pix | 0x01u); + if (p1 & bitMask) pix = (uint8_t)(pix | 0x02u); + if (p2 & bitMask) pix = (uint8_t)(pix | 0x04u); + if (p3 & bitMask) pix = (uint8_t)(pix | 0x08u); + pixMatch = (pix == matchColor); + markIdx = (int16_t)(x - leftX); + markBuf[markIdx] = (uint8_t)(matchEqual + ? (pixMatch ? 1 : 0) + : ((!pixMatch && pix != newColor) ? 
1 : 0)); + } + } + return true; +} + + bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { + /* Phase 9: chunky write skipped; halTileFillPlanes (called by + * cross-platform tile.c after this) does the planar fill. */ (void)s; (void)bx; (void)by; (void)fillWord; - return false; + return true; +} + + +bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + /* Phase 9: chunky write skipped; halBlitRectPlanes (called by + * cross-platform surfaceBlit after this) does the planar work. */ + (void)dstRow0; (void)dstX; (void)srcRow0; (void)srcX; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; + return true; +} + + +/* ===== Phase 9 reader hooks: pure-planar Amiga implementations ===== + * + * Cross-platform code that USED to read s->pixels (chunky shadow) now + * goes through these. On Amiga the chunky shadow doesn't exist; + * pixels are derived on demand by walking the plane bits. + * + * Per-pixel assembly: for pixel (x, y), read bit (7 - x%8) from plane + * byte at row*40 + x/8 in each of the 4 planes. Color index = sum + * of (bit_p << p). */ + +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + AmigaPlanarT *pd; + uint16_t byteOff; + uint8_t bitMask; + uint8_t color; + uint16_t i; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW + ((uint16_t)x >> 3)); + bitMask = (uint8_t)(0x80u >> ((uint16_t)x & 7u)); + color = 0u; + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (pd->planes[i][byteOff] & bitMask) { + color = (uint8_t)(color | (1u << i)); + } + } + return color; +} + + +/* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes + * (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky. + * Walks 8 pixels per planar-byte column; per pixel assembles nibble + * from 4 plane bits. 
Output: 4 chunky bytes per planar-byte column + * (since 8 pixels = 4 chunky bytes at 2px/byte). */ +static void amigaPlanesToChunkyRow(const AmigaPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) { + uint16_t col; + uint16_t byteOff; + uint8_t b0, b1, b2, b3; + uint8_t pix; + uint8_t bitMask; + uint16_t p; + + for (col = 0; col < AMIGA_BYTES_PER_ROW; col++) { + byteOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW + col); + b0 = pd->planes[0][byteOff]; + b1 = pd->planes[1][byteOff]; + b2 = pd->planes[2][byteOff]; + b3 = pd->planes[3][byteOff]; + /* For each of 8 pixels in this planar byte (bit 7 = leftmost), + * assemble nibble from the 4 plane bits and pack into chunky + * bytes (high nibble = even pixel, low nibble = odd pixel). */ + for (p = 0; p < 8u; p++) { + bitMask = (uint8_t)(0x80u >> p); + pix = 0u; + if (b0 & bitMask) pix = (uint8_t)(pix | 1u); + if (b1 & bitMask) pix = (uint8_t)(pix | 2u); + if (b2 & bitMask) pix = (uint8_t)(pix | 4u); + if (b3 & bitMask) pix = (uint8_t)(pix | 8u); + if ((p & 1u) == 0u) { + dstChunkyRow[col * 4u + (p >> 1)] = (uint8_t)(pix << 4); + } else { + dstChunkyRow[col * 4u + (p >> 1)] = (uint8_t)(dstChunkyRow[col * 4u + (p >> 1)] | pix); + } + } + } +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + AmigaPlanarT *pd; + uint16_t lo = 0xACE1u, hi = 0x1357u; + uint16_t n, v; + int16_t row; + uint8_t b; + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + const uint16_t *w; + + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return 0u; + } + /* Pixel hash: derive chunky one row at a time, fold byte-by-byte + * via the shared SURFACE_HASH_MIX_BYTE so cross-port hash + * matches. */ + for (row = 0; row < SURFACE_HEIGHT; row++) { + amigaPlanesToChunkyRow(pd, row, chunkyRow); + for (n = 0; n < SURFACE_BYTES_PER_ROW; n++) { + b = chunkyRow[n]; + SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + } + /* SCB: byte sequence, endian-independent. 
*/ + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = s->scb[n]; + SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + /* Palette: read uint16 values, fold high-then-low for endian- + * independence. */ + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return ((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + /* Amiga has no chunky shadow. Plane copy happens in + * halSurfaceCopyPlanes (called separately by surfaceCopy). */ + (void)dst; + (void)src; +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + AmigaPlanarT *pd; + uint8_t *scratch; + uint8_t *srcLine; + int16_t y; + UBYTE *p0; + UBYTE *p1; + UBYTE *p2; + UBYTE *p3; + bool ok; + + pd = (AmigaPlanarT *)dst->portData; + if (pd == NULL) { + return false; + } + /* fread the chunky file payload into a scratch buffer, then c2p + * directly into our planes. The scratch is a one-shot AllocMem + * (PUBLIC, not chip) since chunkyToPlanarRow only reads it. 
*/ + scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC); + if (scratch == NULL) { + return false; + } + ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE); + if (ok) { + if (!gC2pLutReady) { + initC2pLut(); + } + for (y = 0; y < SURFACE_HEIGHT; y++) { + srcLine = &scratch[y * SURFACE_BYTES_PER_ROW]; + p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW; + chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut); + } + } + FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE); + return ok; +} + + +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + AmigaPlanarT *pd; + uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; + int16_t y; + + pd = (AmigaPlanarT *)src->portData; + if (pd == NULL) { + return false; + } + /* Per row: derive chunky from planes, write 160 bytes. Less + * efficient than a single fwrite of a full buffer but avoids + * needing a 32 KB scratch allocation. */ + for (y = 0; y < SURFACE_HEIGHT; y++) { + amigaPlanesToChunkyRow(pd, y, chunkyRow); + if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) { + return false; + } + } + return true; } uint8_t *halStageAllocPixels(void) { - return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); + /* Phase 9: Amiga has no chunky shadow. The stage pixels pointer + * stays NULL; cross-platform code reads pixels via halSamplePixel + * (or other halXxxChunky hooks) which read from planes. NULL is + * a valid return -- cross-platform stageAlloc treats NULL as + * "port has no chunky storage" and skips the chunky memset. */ + return NULL; } void halStageFreePixels(uint8_t *pixels) { + /* halStageAllocPixels returned NULL on Amiga, so this is always + * NULL (free(NULL) is well-defined no-op). Symmetric for any + * future port that does allocate stage pixels. 
*/ free(pixels); } + + +uint8_t *halSurfaceAllocPixels(void) { + /* Same rationale as halStageAllocPixels: no chunky on Amiga. */ + return NULL; +} + + +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + AmigaPlanarT *pd; + if (planeIdx >= AMIGA_BITPLANES) { + return NULL; + } + pd = (AmigaPlanarT *)s->portData; + if (pd == NULL) { + return NULL; + } + return pd->planes[planeIdx]; +} + + +// Allocate the per-surface planar storage: an AmigaPlanarT plus 4 +// off-screen chip-RAM plane buffers. The stage gets its OWN shadow +// planes (NOT aliased to gPlanes / Intuition's BitMap) so drawing +// primitives that dual-write to planes don't immediately appear on +// screen -- the display is updated atomically at stagePresent time +// (c2p chunky->gPlanes today; memcpy shadow->gPlanes under +// JOEYLIB_PLANAR_PRESENT; pointer-swap or alias in Phase 9). +// Aliasing the stage to gPlanes was tried and reverted because every +// drawing primitive showed AS IT HAPPENED, which broke the "draw +// invisibly, palette flips with content at present" semantic Pattern +// and apps depend on. See project_planar_68k_plan.md Phase 3 notes. +// +// Returns NULL on allocation failure. Cross-platform code stores the +// result in s->portData; primitives access via (AmigaPlanarT *) +// s->portData. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + AmigaPlanarT *pd; + uint16_t i; + + (void)s; + (void)isStage; + pd = (AmigaPlanarT *)AllocMem((ULONG)sizeof(AmigaPlanarT), + (ULONG)(MEMF_CHIP | MEMF_CLEAR)); + if (pd == NULL) { + return NULL; + } + pd->bytesPerRow = AMIGA_BYTES_PER_ROW; + pd->bytesPerPlane = AMIGA_PLANE_SIZE; + + /* Both stage and non-stage: AllocMem fresh planes, MEMF_CLEAR + * for the JoeyLib contract that color 0 = black at surface + * allocation. Stage uses these as off-screen back planes (display + * is gPlanes[], CHIP, updated only by halPresent). 
Non-stage uses + * these as the surface's only planes. + * + * MEMF_FAST (no MEMF_CHIP) explicitly demands fast RAM. The + * shadow planes are CPU-only (no blitter/copper access); freeing + * them from chip RAM cuts halPresent's chip-bus contention in + * half (read FAST + write CHIP instead of read CHIP + write CHIP) + * and gives RMW drawing primitives 2-3x speedup. If fast RAM is + * unavailable (bare A500, no expansion), we fall back to chip + * via the loop below. */ + for (i = 0; i < AMIGA_BITPLANES; i++) { + pd->planes[i] = (uint8_t *)AllocMem((ULONG)AMIGA_PLANE_SIZE, + (ULONG)(MEMF_FAST | MEMF_CLEAR)); + if (pd->planes[i] == NULL) { + /* No fast RAM available; fall back to chip. */ + pd->planes[i] = (uint8_t *)AllocMem((ULONG)AMIGA_PLANE_SIZE, + (ULONG)(MEMF_CHIP | MEMF_CLEAR)); + joeyLogF("amiga: shadow plane %u in CHIP (fast unavailable, addr=$%08lX)", + (unsigned)i, (unsigned long)pd->planes[i]); + } else { + joeyLogF("amiga: shadow plane %u in FAST (addr=$%08lX)", + (unsigned)i, (unsigned long)pd->planes[i]); + } + if (pd->planes[i] == NULL) { + /* Roll back any planes already allocated. 
*/ + while (i > 0u) { + i--; + FreeMem(pd->planes[i], (ULONG)AMIGA_PLANE_SIZE); + } + FreeMem(pd, (ULONG)sizeof(AmigaPlanarT)); + return NULL; + } + } + pd->ownsPlanes = true; + return pd; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + AmigaPlanarT *pd; + uint16_t i; + + (void)s; + (void)isStage; + if (portData == NULL) { + return; + } + pd = (AmigaPlanarT *)portData; + if (pd->ownsPlanes) { + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (pd->planes[i] != NULL) { + FreeMem(pd->planes[i], (ULONG)AMIGA_PLANE_SIZE); + } + } + } + FreeMem(pd, (ULONG)sizeof(AmigaPlanarT)); +} diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 2efcf32..2e77041 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -526,26 +526,6 @@ void halPresent(const SurfaceT *src) { } -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - uint16_t groupStart; - uint16_t groupEnd; - - if (src == NULL || !gModeSet) { - return; - } - refreshPaletteStateIfNeeded(src); - // Each c2p group covers 16 horizontal pixels. Round dirty pixel - // range to the enclosing group range to keep the planar word - // alignment without missing edge pixels. - groupStart = (uint16_t)(x >> 4); - groupEnd = (uint16_t)(((uint16_t)x + w + 15) >> 4); - if (groupEnd > ST_GROUPS_PER_ROW) { - groupEnd = ST_GROUPS_PER_ROW; - } - c2pRange(src, y, y + (int16_t)h, groupStart, groupEnd); -} - - // Vsync() is XBIOS opcode 37; mintlib exposes it directly. It blocks // until the next 50 Hz (PAL) or 60 Hz (NTSC) vertical blank. 
void halWaitVBL(void) { @@ -730,6 +710,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t } +bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual; + (void)seedMatched; (void)leftXOut; (void)rightXOut; + return false; +} + + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual; + (void)markBuf; + return false; +} + + bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { (void)row; (void)leftX; @@ -798,6 +792,146 @@ bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { } +// Phase-1 planar plumbing: portData hooks declared and exported, but +// returning NULL keeps the ST port operating in the legacy +// chunky-with-c2p model. Phase 4 replaces this with an interleaved +// planar buffer + stride blob, and rewrites every halFast* primitive +// to read/write planes directly. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + (void)s; + (void)isStage; + return NULL; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + (void)s; + (void)isStage; + (void)portData; +} + + +// ST planar dual-write isn't implemented yet (interleaved word-planar +// layout needs a different code path than Amiga's separate plane +// buffers). Stub for now; chunky shadow + c2p still drives display. 
+void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; +} + + +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + (void)dst; + (void)src; +} + + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + (void)s; (void)bx; (void)by; (void)colorIndex; +} +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; +} +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; +} +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + (void)dst; (void)bx; (void)by; (void)chunkyTile; +} +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + (void)src; (void)bx; (void)by; (void)chunkyTileOut; +} +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + (void)s; (void)sp; (void)x; (void)y; +} +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; +} +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; +} +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; 
(void)srcPlaneBytes; +} + + +/* Phase 9 chunky reader hooks -- ST is still chunky-shadow + c2p, + * so reads come from s->pixels just like DOS / IIgs. */ +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; + const uint8_t *p; + const uint16_t *w; + uint8_t b; + p = s->pixels; + blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); + do { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + blocks--; + } while (blocks > 0u); + p = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return ((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +uint8_t *halSurfaceAllocPixels(void) { + return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); +} + + +void halSurfaceFreePixels(uint8_t *pixels) 
{ + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + (void)s; (void)planeIdx; + return NULL; +} + + uint8_t *halStageAllocPixels(void) { return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); } diff --git a/src/port/dos/hal.c b/src/port/dos/hal.c index 7e446c2..d1ca693 100644 --- a/src/port/dos/hal.c +++ b/src/port/dos/hal.c @@ -244,21 +244,6 @@ void halPresent(const SurfaceT *src) { } -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - int16_t py; - int16_t yEnd; - - if (src == NULL || gVgaMem == NULL) { - return; - } - uploadPaletteIfNeeded(src); - yEnd = y + (int16_t)h; - for (py = y; py < yEnd; py++) { - expandAndWriteLine(src, py, x, w, &gVgaMem[py * VGA_STRIDE]); - } -} - - // VGA mode 13h vertical refresh on a real CRT runs at ~70 Hz. We // detect the start of vertical retrace by polling input status // register 1 ($3DA) bit 3: 1 = currently in vretrace. To get a @@ -423,6 +408,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t } +bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual; + (void)seedMatched; (void)leftXOut; (void)rightXOut; + return false; +} + + +bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual; + (void)markBuf; + return false; +} + + bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { (void)row; (void)leftX; @@ -499,3 +498,143 @@ uint8_t *halStageAllocPixels(void) { void halStageFreePixels(uint8_t *pixels) { 
free(pixels); } + + +// DOS / VGA mode 13h is chunky-native (8bpp linear). portData is +// unused; the chunky `pixels` buffer feeds the present-time +// nearest-neighbor copy to VGA RAM. +void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + (void)s; + (void)isStage; + return NULL; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + (void)s; + (void)isStage; + (void)portData; +} + + +// DOS has no bitplanes -- chunky pixels are the source of truth and +// expandAndWriteLine derives the VGA DAC indices straight from them. +// This hook is a stub here; the cross-platform fillRect calls it +// unconditionally. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; +} + + +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + (void)dst; + (void)src; +} + + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + (void)s; (void)bx; (void)by; (void)colorIndex; +} +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; +} +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; +} +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + (void)dst; (void)bx; (void)by; (void)chunkyTile; +} +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + (void)src; (void)bx; (void)by; (void)chunkyTileOut; +} +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + (void)s; (void)sp; (void)x; (void)y; +} +void halBlitRectPlanes(SurfaceT *dst, 
int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; +} +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; +} +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes; +} + + +/* Phase 9 reader hooks: chunky ports use the original s->pixels-based + * paths. */ + +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; + const uint8_t *p; + const uint16_t *w; + uint8_t b; + p = s->pixels; + blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); + do { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + blocks--; + } while (blocks > 0u); + p = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return 
((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +uint8_t *halSurfaceAllocPixels(void) { + return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); +} + + +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + (void)s; (void)planeIdx; + return NULL; +} diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index 5cad5b7..237fcab 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -26,12 +26,25 @@ // crowd up against the 64 KB-per-bank limit). #include +#include #include #include "joey/debug.h" #include "hal.h" #include "surfaceInternal.h" +/* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick + * ($2503) and returns the low 16 bits of the system's tick counter + * (firmware VBL ISR-driven). Polling $C019 from C user code missed + * transitions for any op over ~1 ms; the system's tick counter is + * updated by the actual interrupt handler so it stays accurate + * regardless of caller polling rate. Tick rate matches the video + * field rate -- 60 Hz on NTSC, 50 Hz on PAL. */ +extern uint16_t iigsGetTickWord(void); +/* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. */ +extern uint16_t iigsReadHzParam(void); +static uint16_t gFrameHz = 60u; + // hal.c is the single TU that calls into joeyDraw.asm. Cross- // platform draw.c / tile.c / etc. 
dispatch through halFast* // functions defined here; they never reference the asm symbols @@ -210,6 +223,7 @@ bool halInit(const JoeyConfigT *config) { // is unreliable from halInit's calling context, so we don't try // it here -- the first present will set up SCB to 320 mode. iigsInitRowLut(); + gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u; gModeSet = true; return true; } @@ -234,40 +248,6 @@ void halPresent(const SurfaceT *src) { } -void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - uint16_t copyBytes; - int16_t byteStart; - uint16_t srcOffset; - - if (src == NULL) { - return; - } - - uploadScbAndPaletteIfNeeded(src); - - // Pixel copy: byte-aligned runs per scanline. x is always >= 0 - // after API-level clipping. Use unsigned shifts to avoid - // ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t. - byteStart = (int16_t)((uint16_t)x >> 1); - copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart); - - if (copyBytes == 0 || h == 0) { - return; - } - - // Pixel copy: prefer the PEI-slam variant when the rect satisfies - // its contract (copyBytes even, 2..80). Sprite-rect presents - // (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or - // odd-byte rects fall back to MVN, which has no width cap. - srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart); - if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) { - iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h); - } else { - iigsBlitRectStageToShr(srcOffset, copyBytes, h); - } -} - - void halShutdown(void) { if (gModeSet) { *IIGS_NEWVIDEO_REG = gPreviousNewVideo; @@ -305,6 +285,142 @@ void halStageFreePixels(uint8_t *pixels) { } +// IIgs is chunky-native: portData is unused. The chunky `pixels` +// buffer at $01:2000 is the stage's pixel storage and the source for +// stagePresent's PEI-slam to $E1. 
+void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) { + (void)s; + (void)isStage; + return NULL; +} + + +void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) { + (void)s; + (void)isStage; + (void)portData; +} + + +// IIgs SHR is chunky-native; no bitplanes to update. +void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; +} + + +void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) { + (void)dst; + (void)src; +} + + +void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + (void)s; (void)bx; (void)by; (void)colorIndex; +} +void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; +} +void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex; +} +void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { + (void)dst; (void)bx; (void)by; (void)chunkyTile; +} +void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { + (void)src; (void)bx; (void)by; (void)chunkyTileOut; +} +void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) { + (void)s; (void)sp; (void)x; (void)y; +} +void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; + (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent; +} +void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, 
uint16_t h, uint8_t *dstPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes; +} +void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) { + (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes; +} + + +/* Phase 9 chunky reader hooks: IIgs reads from s->pixels just like + * the legacy paths did. Same logic as the DOS port. */ +uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { + uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + if (x & 1) return (uint8_t)(byte & 0x0Fu); + return (uint8_t)((byte & 0xF0u) >> 4); +} + + +uint32_t halSurfaceHash(const SurfaceT *s) { + uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v; + const uint8_t *p; + const uint16_t *w; + uint8_t b; + p = s->pixels; + blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8); + do { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + blocks--; + } while (blocks > 0u); + p = s->scb; + for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) { + b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + w = &s->palette[0][0]; + for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) { + v = *w++; + b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b); + } + return ((uint32_t)hi << 16) | (uint32_t)lo; +} + + +void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); +} + + +bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { + return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +bool 
halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { + return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; +} + + +uint8_t *halSurfaceAllocPixels(void) { + return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); +} + + +void halSurfaceFreePixels(uint8_t *pixels) { + free(pixels); +} + + +uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) { + (void)s; (void)planeIdx; + return NULL; +} + + // $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active // scan. To produce a rising-edge wait (one VBL per call), first spin // while VBL is currently active (bit 7 = 0), then spin until VBL @@ -333,24 +449,11 @@ void halWaitVBL(void) { // byte and the counter never advances. The explicit lda > / sta > // pattern uses long-mode addressing throughout, which is // DBR-independent. -static uint16_t gFrameCount = 0; -static uint8_t gPrevInVbl = 0; - uint16_t halFrameCount(void) { - uint8_t now; - uint16_t cnt; - - now = (*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0; - if (now && !gPrevInVbl) { - cnt = gFrameCount; - cnt = (uint16_t)(cnt + 1u); - gFrameCount = cnt; - } - gPrevInVbl = now; - return gFrameCount; + return iigsGetTickWord(); } uint16_t halFrameHz(void) { - return 60u; + return gFrameHz; } diff --git a/src/port/iigs/peislam.asm b/src/port/iigs/peislam.asm index 74fa672..0c1b6ff 100644 --- a/src/port/iigs/peislam.asm +++ b/src/port/iigs/peislam.asm @@ -1,15 +1,66 @@ -* peislam.asm - placeholder. -* -* The original PEI-slam-per-row helper was removed; its functionality -* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam -* with per-row dirty skip). This stub remains so the build's -* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load -* segment and the linker keeps the same segment-bank layout it had -* when peislam.asm was a real translation unit. +* peislam.asm - originally a PEI-slam helper, now hosts the GetTick +* and ReadBParam trampolines. 
The PEI-slam logic was rolled into +* iigsBlitStageToShr in joeyDraw.asm. keep PEISLAM case on + +* Stub kept so the PEISLAM load segment stays present (the build's +* PORT_ASM_SRCS_ALL wildcard pulls in this file by name). peislamStub start IIGSASM rtl end + + +**************************************************************** +* uint16_t iigsGetTickWord(void) +* +* Calls Misc Toolset GetTick ($2503) and returns the low 16 bits of +* the 32-bit tick counter. The system increments this counter from +* the actual VBL hardware interrupt, so it stays accurate regardless +* of caller polling rate -- C-side polling of $C019 missed transitions +* for any op over ~1 ms. +* +* GetTick output convention: caller pushes 4 bytes of output space, +* tool dispatcher writes the LongWord into them. We pull the low 16 +* bits into A (ORCA-C Word return convention -- A holds the result, +* not Y; verified against jIIgs.asm asmGetVbl) and discard the high +* 16 into X. +* +* ORCA-C cdecl ABI: caller has M=I=16. Word return in A. +**************************************************************** + +iigsGetTickWord start IIGSASM + pha ; output space high word + pha ; output space low word + ldx #$2503 ; _GetTick + jsl $E10000 + + pla ; A = low 16 bits (return value) + plx ; discard high 16 bits + rtl + end + + +**************************************************************** +* uint16_t iigsReadHzParam(void) +* +* Reads battery RAM parameter hrtz50or60 ($1D) via _ReadBParam ($0C03) +* and returns the raw value: 0 = NTSC (60 Hz), 1 = PAL (50 Hz). +* +* GetTick fires from the hardware VBL ISR, so its rate matches the +* video field rate -- 60 Hz on NTSC, 50 Hz on PAL. halFrameHz must +* report whichever this machine actually runs so wall-clock math +* (frames * 1000 / halFrameHz) is correct on both. 
+**************************************************************** + +iigsReadHzParam start IIGSASM + pha ; output space (Word) + pea $001D ; hrtz50or60 parameter ID + ldx #$0C03 ; _ReadBParam + jsl $E10000 + + pla ; A = result (ORCA-C Word return) + rtl + end diff --git a/src/shared68k/surface68k.s b/src/shared68k/surface68k.s index 7e3186a..4d68fb2 100644 --- a/src/shared68k/surface68k.s +++ b/src/shared68k/surface68k.s @@ -253,3 +253,253 @@ _surface68kFillRectByteAligned: .Lfrb_done: movem.l (%sp)+,%d2-%d6 rts + + +| ---------------------------------------------------------------- +| void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1, +| uint8_t *p2, uint8_t *p3, +| uint16_t numMid, +| uint8_t leftMask, uint8_t rightMask, +| uint8_t fb0, uint8_t fb1, +| uint8_t fb2, uint8_t fb3); +| +| Fill ONE planar row across 4 planes -- the per-row body of +| halFillRectPlanes lifted into asm. Each pN points at the leading +| byte (already advanced by planeBase + y*40 + byteFirst on the C +| side). leftMask and rightMask are the partial-byte masks for the +| left/right edges; numMid is the count of full bytes between them. +| fbN is 0x00 or 0xFF, the per-plane fill byte (caller pre-classifies +| (colorIndex >> N) & 1 -> 0xFF or 0x00). +| +| Used by Amiga halFastFillCircle (one call per scanline span) and +| Amiga halFillRectPlanes (one call per row of the rect). Replaces +| the C inner loop whose ~13 cyc/byte was the gating cost on +| fillCircle r=40 even after C-side inlining. +| +| Mask convention is uniform for all planes: +| leading byte := (*p & ~leftMask) | (fbN & leftMask) +| middle bytes := fbN +| trailing byte := (*p & ~rightMask) | (fbN & rightMask) +| -- branchless: the same arithmetic produces "set" or "clear" based +| on whether fbN is 0xFF or 0x00. +| +| ABI: m68k cdecl. d2-d7/a2-a6 callee-save (movem'd here). +| Stack offset to first arg after MOVEM: 11 regs * 4 = 44 bytes saved +| + 4 ret PC = 48. 
+| ---------------------------------------------------------------- + .globl _surface68kFillSpan4Planes + + .equ SP_SAVED, 44 + .equ SP_RPC, 4 + .equ SP_OFF, (SP_SAVED + SP_RPC) + + .equ SP_P0, SP_OFF + 0 + .equ SP_P1, SP_OFF + 4 + .equ SP_P2, SP_OFF + 8 + .equ SP_P3, SP_OFF + 12 + .equ SP_NMID, SP_OFF + 16 + 2 | int -> low word at +2 + .equ SP_LMASK, SP_OFF + 20 + 3 | int -> low byte at +3 + .equ SP_RMASK, SP_OFF + 24 + 3 + .equ SP_FB0, SP_OFF + 28 + 3 + .equ SP_FB1, SP_OFF + 32 + 3 + .equ SP_FB2, SP_OFF + 36 + 3 + .equ SP_FB3, SP_OFF + 40 + 3 + +| Macro: per-plane work fully inlined. Args: +| plane_an = the address register holding this plane's pointer. +| fb_off = the stack offset for this plane's fillByte. +| Uses d6/d7 as scratch; d1=leftMask, d2=~leftMask, d3=rightMask, +| d4=~rightMask; d0=numMid-1 (only valid if mid_count > 0). The mid +| loop is skipped via .LfsSkipMid_ when numMid was 0 at entry -- +| the per-plane caller branches to the right tail label. +| +| Hand-unrolled 4x rather than using bsr+rts to dodge ~12 cyc per +| return + the per-plane re-test of numMid that the previous build +| paid. The mid-loop label suffix is the plane index so all four +| inline copies can coexist without label collisions. +| +| Plain text version of the per-plane body (translate to asm 4x with +| different a-regs and fb stack offsets): +| +| move.b (an),%d6 +| and.b %d2,%d6 +| move.b fb,%d7 +| and.b %d1,%d7 +| or.b %d7,%d6 +| move.b %d6,(an)+ +| < if has-middle path: > +| move.w %d0,%d7 +| .midN: +| move.b fb,(an)+ +| dbra %d7,.midN +| < trailing: > +| move.b (an),%d6 +| and.b %d4,%d6 +| move.b fb,%d7 +| and.b %d3,%d7 +| or.b %d7,%d6 +| move.b %d6,(an) + +_surface68kFillSpan4Planes: + movem.l %d2-%d7/%a2-%a6,-(%sp) + + move.b SP_LMASK(%sp),%d1 + move.b %d1,%d2 + not.b %d2 + move.b SP_RMASK(%sp),%d3 + move.b %d3,%d4 + not.b %d4 + + move.l SP_P0(%sp),%a0 + move.l SP_P1(%sp),%a1 + move.l SP_P2(%sp),%a2 + move.l SP_P3(%sp),%a3 + + | One-time numMid test. 
d0.w = numMid; if 0 jump to + | the no-middle entry, otherwise pre-decrement for dbra + | and fall into the with-middle entry. Both paths + | unroll all 4 planes. + move.w SP_NMID(%sp),%d0 + beq .LfsNoMid + subq.w #1,%d0 + + | ---- WITH-MIDDLE PATH ---- + | Plane 0 + move.b (%a0),%d6 + and.b %d2,%d6 + move.b SP_FB0(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a0)+ + move.w %d0,%d7 +.LfsMid0: move.b %d5,(%a0)+ + dbra %d7,.LfsMid0 + move.b (%a0),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a0) + + | Plane 1 + move.b (%a1),%d6 + and.b %d2,%d6 + move.b SP_FB1(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a1)+ + move.w %d0,%d7 +.LfsMid1: move.b %d5,(%a1)+ + dbra %d7,.LfsMid1 + move.b (%a1),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a1) + + | Plane 2 + move.b (%a2),%d6 + and.b %d2,%d6 + move.b SP_FB2(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a2)+ + move.w %d0,%d7 +.LfsMid2: move.b %d5,(%a2)+ + dbra %d7,.LfsMid2 + move.b (%a2),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a2) + + | Plane 3 + move.b (%a3),%d6 + and.b %d2,%d6 + move.b SP_FB3(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a3)+ + move.w %d0,%d7 +.LfsMid3: move.b %d5,(%a3)+ + dbra %d7,.LfsMid3 + move.b (%a3),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a3) + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts + +.LfsNoMid: + | ---- NO-MIDDLE PATH (just leading + trailing) ---- + | Plane 0 + move.b (%a0),%d6 + and.b %d2,%d6 + move.b SP_FB0(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a0)+ + move.b (%a0),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a0) + + | Plane 1 + move.b (%a1),%d6 + and.b %d2,%d6 + move.b SP_FB1(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a1)+ + move.b 
(%a1),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a1) + + | Plane 2 + move.b (%a2),%d6 + and.b %d2,%d6 + move.b SP_FB2(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a2)+ + move.b (%a2),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a2) + + | Plane 3 + move.b (%a3),%d6 + and.b %d2,%d6 + move.b SP_FB3(%sp),%d5 + move.b %d5,%d7 + and.b %d1,%d7 + or.b %d7,%d6 + move.b %d6,(%a3)+ + move.b (%a3),%d6 + and.b %d4,%d6 + move.b %d5,%d7 + and.b %d3,%d7 + or.b %d7,%d6 + move.b %d6,(%a3) + + movem.l (%sp)+,%d2-%d7/%a2-%a6 + rts diff --git a/tools/diff-uber-hashes b/tools/diff-uber-hashes new file mode 100755 index 0000000..6b36814 --- /dev/null +++ b/tools/diff-uber-hashes @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +"""Compare two UBER joeylog.txt files by per-op surface hash. + +Used by the planar 68k rewrite (project_planar_68k_plan.md): IIgs +captures the golden reference, each 68k port re-runs UBER after a +primitive conversion, and this tool tells you which ops produced +different pixels. Without this, "looks right visually" misses the +subtle mismatches that cascade into hard-to-debug corruption. + +Usage: + tools/diff-uber-hashes + +Exit code: + 0 = all hashes match + 1 = at least one mismatch + 2 = usage error or missing file +""" + +import re +import sys + +# Match e.g.: +# UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4 +LINE_RE = re.compile( + r"UBER:\s+(?P[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+\d+\s+ops/sec\s+\|\s+hash=(?P[0-9A-Fa-f]+)" +) + + +def parse_log(path): + """Return ordered dict {op_name: hash} from a UBER log file. + + Multiple runs may be concatenated in the same log (joeyLog appends) + -- in that case the LAST hash for each op wins, matching the most + recent run. 
+ """ + hashes = {} + with open(path) as f: + for line in f: + m = LINE_RE.search(line) + if m: + hashes[m.group("op").strip()] = m.group("hash").upper() + return hashes + + +def main(argv): + if len(argv) != 3: + sys.stderr.write( + "usage: diff-uber-hashes \n" + ) + return 2 + + try: + ref = parse_log(argv[1]) + test = parse_log(argv[2]) + except OSError as e: + sys.stderr.write(f"error: {e}\n") + return 2 + + if not ref: + sys.stderr.write(f"error: no UBER hash lines found in {argv[1]}\n") + return 2 + if not test: + sys.stderr.write(f"error: no UBER hash lines found in {argv[2]}\n") + return 2 + + mismatches = 0 + matches = 0 + for op, ref_hash in ref.items(): + test_hash = test.get(op) + if test_hash is None: + print(f" MISSING in test: {op} (ref={ref_hash})") + mismatches += 1 + elif test_hash != ref_hash: + print(f" MISMATCH {op}: ref={ref_hash} test={test_hash}") + mismatches += 1 + else: + matches += 1 + + extras = [op for op in test if op not in ref] + for op in extras: + print(f" EXTRA in test: {op} (test={test[op]})") + + total = len(ref) + len(extras) + print() + if mismatches == 0 and not extras: + print(f"OK: {matches}/{total} ops match") + return 0 + print(f"FAIL: {matches} match, {mismatches} mismatch, {len(extras)} extras") + return 1 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tools/diff-uber-perf b/tools/diff-uber-perf new file mode 100755 index 0000000..82df37d --- /dev/null +++ b/tools/diff-uber-perf @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +"""Compare two UBER joeylog.txt files by per-op ops/sec. + +Sibling of diff-uber-hashes (which compares pixel correctness). This +tool drives Phase 10 of project_planar_68k_plan.md: pick the +biggest perf gaps vs the IIgs reference and target asm/algorithmic +optimization at those. + +Usage: + tools/diff-uber-perf [--threshold 1.0] + +Output is sorted by speed ratio (test/ref) ascending, so the worst +gaps print first. Ops missing from either log are flagged. 
The +threshold flag (default 1.0) marks ops below that ratio as FAIL -- +project_perf_directive.md says "IIgs is the perf floor; every +other target must match or beat it", so parity = 1.0x. Use +--threshold 0.8 for the project_planar_68k_plan looser acceptance. + +Exit code: + 0 = all common ops at >= threshold + 1 = at least one op below threshold (or missing) + 2 = usage error or missing file +""" + +import re +import sys + +# Match e.g.: +# UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4 +LINE_RE = re.compile( + r"UBER:\s+(?P[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+(?P\d+)\s+ops/sec" +) + + +def parse_log(path): + """Return ordered dict {op_name: ops_per_sec} from a UBER log file. + + Multiple runs may be concatenated (joeyLog appends); last value + for each op wins, matching the most recent run. + """ + perf = {} + with open(path) as f: + for line in f: + m = LINE_RE.search(line) + if m: + perf[m.group("op").strip()] = int(m.group("ops")) + return perf + + +def main(argv): + threshold = 1.0 + args = [] + i = 1 + while i < len(argv): + if argv[i] == "--threshold" and i + 1 < len(argv): + try: + threshold = float(argv[i + 1]) + except ValueError: + sys.stderr.write(f"error: bad threshold {argv[i+1]}\n") + return 2 + i += 2 + else: + args.append(argv[i]) + i += 1 + + if len(args) != 2: + sys.stderr.write( + "usage: diff-uber-perf [--threshold 1.0]\n" + ) + return 2 + + try: + ref = parse_log(args[0]) + test = parse_log(args[1]) + except OSError as e: + sys.stderr.write(f"error: {e}\n") + return 2 + + if not ref: + sys.stderr.write(f"error: no UBER lines found in {args[0]}\n") + return 2 + if not test: + sys.stderr.write(f"error: no UBER lines found in {args[1]}\n") + return 2 + + rows = [] + for op, ref_ops in ref.items(): + test_ops = test.get(op) + if test_ops is None: + rows.append((op, ref_ops, None, None, "MISSING")) + continue + if ref_ops == 0: + ratio = float("inf") if test_ops > 0 else 1.0 + else: + ratio = test_ops / 
ref_ops + status = "ok" if ratio >= threshold else "FAIL" + rows.append((op, ref_ops, test_ops, ratio, status)) + + extras = [(op, None, test[op], None, "EXTRA") for op in test if op not in ref] + + # Sort: missing/fail first by worst ratio, then ok ascending by ratio. + def sort_key(row): + op, refv, testv, ratio, status = row + if status == "MISSING": + return (0, 0.0, op) + if status == "EXTRA": + return (3, 0.0, op) + return (1 if status == "FAIL" else 2, ratio, op) + + rows.sort(key=sort_key) + + op_w = max(len(op) for op in ref) if ref else 8 + op_w = max(op_w, max((len(op) for op in test), default=8), len("op")) + + print(f"{'op':<{op_w}} {'ref':>10} {'test':>10} {'ratio':>7} status") + print(f"{'-'*op_w} {'-'*10} {'-'*10} {'-'*7} ------") + fails = 0 + for op, refv, testv, ratio, status in rows + extras: + refs = "" if refv is None else str(refv) + tests = "" if testv is None else str(testv) + rats = "" if ratio is None else f"{ratio:.2f}x" + print(f"{op:<{op_w}} {refs:>10} {tests:>10} {rats:>7} {status}") + if status in ("FAIL", "MISSING"): + fails += 1 + + print() + print(f"threshold: {threshold:.2f}x ({len(rows)} ops compared, {fails} below threshold)") + return 1 if fails > 0 else 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv))