Amiga parity with IIgs!

This commit is contained in:
Scott Duensing 2026-05-03 01:44:39 -05:00
parent 6c03d93e88
commit b1e24b4650
37 changed files with 4312 additions and 493 deletions

View file

@ -171,11 +171,11 @@ int main(void) {
if (flashFrames > 0) {
fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_BAR);
stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H);
stagePresent();
flashFrames--;
if (flashFrames == 0) {
fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_HINT);
stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H);
stagePresent();
}
}
}

View file

@ -80,8 +80,10 @@ static void buildPalette(SurfaceT *screen) {
static void drawAndPresent(SurfaceT *screen, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t color) {
/* fillRect marks the rect dirty; stagePresent flushes only that
* dirty band. */
fillRect(screen, x, y, (uint16_t)w, (uint16_t)h, color);
stagePresentRect(x, y, (uint16_t)w, (uint16_t)h);
stagePresent();
}

View file

@ -158,8 +158,6 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur
int16_t row;
JoeyKeyE key;
bool lit;
int16_t x;
int16_t y;
for (row = 0; row < GRID_ROWS; row++) {
for (col = 0; col < GRID_COLS; col++) {
@ -171,10 +169,10 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur
if (lit == gCellLit[row][col]) {
continue;
}
/* drawCell marks the cell's rect dirty; stagePresent
* flushes that one band. */
drawCell(screen, col, row, lit);
x = (int16_t)(MARGIN_X + col * (CELL_W + GAP));
y = (int16_t)(MARGIN_Y + row * (CELL_H + GAP));
stagePresentRect(x, y, CELL_W, CELL_H);
stagePresent();
gCellLit[row][col] = lit;
}
}
@ -195,19 +193,16 @@ static void updateCursor(SurfaceT *screen, int16_t cursorCol, int16_t cursorRow)
if (gLastCursorX != mouseX || gLastCursorY != mouseY) {
if (gLastCursorCol != CELL_NONE) {
drawCell(screen, gLastCursorCol, gLastCursorRow, gCellLit[gLastCursorRow][gLastCursorCol]);
stagePresentRect(
(int16_t)(MARGIN_X + gLastCursorCol * (CELL_W + GAP)),
(int16_t)(MARGIN_Y + gLastCursorRow * (CELL_H + GAP)),
CELL_W, CELL_H);
} else if (gLastCursorX >= 0 && gLastCursorY >= 0) {
// Old cursor was in a gap region. Stamp background over it.
fillRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H, COLOR_BACKGROUND);
stagePresentRect(gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H);
}
}
drawCursor(screen, mouseX, mouseY);
stagePresentRect(mouseX, mouseY, CURSOR_W, CURSOR_H);
/* All draw calls above marked their rects dirty; one stagePresent
* flushes the union (cursor erase + cursor draw). */
stagePresent();
gLastCursorX = mouseX;
gLastCursorY = mouseY;

View file

@ -15,11 +15,11 @@
#define BALL_TILES_Y (BALL_H / 8)
#define BALL_TILE_BYTES (BALL_TILES_X * BALL_TILES_Y * TILE_BYTES)
// SaveUnder must store rounded-up byte boundaries: x rounded down to
// even, width rounded up to even. Worst case for BALL_W=16 (already
// even) is 8 bytes per row + alignment slack of 1 byte; size for the
// pessimistic case so the buffer never overflows.
#define BALL_BACKUP_BYTES (((BALL_W + 2) >> 1) * BALL_H)
// SaveUnder rounds x down to the platform's storage alignment: 2 px
// for chunky 4bpp (1 extra byte/row worst case), 8 px for planar
// 4-plane (4 extra bytes/row worst case -- one per plane). The +4
// covers the planar case; on chunky it is just a few bytes of unused
// slack per row.
#define BALL_BACKUP_BYTES (((BALL_W >> 1) + 4) * BALL_H)
#define BALL_PALETTE_IDX 0
@ -100,18 +100,14 @@ int main(void) {
int16_t y;
int16_t vx;
int16_t vy;
int16_t oldX;
int16_t oldY;
uint16_t oldW;
uint16_t oldH;
int16_t unionX;
int16_t unionY;
int16_t unionRight;
int16_t unionBottom;
bool haveBackup;
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
/* Amiga planar emits 8 pre-shifted DRAW variants per sprite (one
* per x % 8 alignment) so the codegen arena needs roughly 8x what
* the chunky two-shift case asks for. 32 KB fits a 16x16 ball
* with all variants. */
config.codegenBytes = 32UL * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
@ -155,7 +151,7 @@ int main(void) {
haveBackup = false;
spriteSaveAndDraw(screen, ball, x, y, &backup);
stagePresentRect(backup.x, backup.y, backup.width, backup.height);
stagePresent();
haveBackup = true;
for (;;) {
@ -164,19 +160,15 @@ int main(void) {
break;
}
// Stash the prior ball's region before restoring the bytes
// under it. Do all off-screen work (restore + move + draw)
// first, then waitVBL + ONE stagePresentRect covering both
// old and new regions. Putting waitVBL immediately before the
// present lets the present land inside the VBL window so the
// CRT never sees a half-updated framebuffer (matters most on
// single-buffered chunky targets like IIgs SHR; on planar
// c2p platforms it also avoids c2p racing the raster).
oldX = backup.x;
oldY = backup.y;
oldW = backup.width;
oldH = backup.height;
// Do all off-screen work first (restore + move + draw), then
// ONE stagePresent flushes the union of dirty bands set by
// restoreUnder + draw. Adding a joeyWaitVBL() immediately before
// the present would land it inside the VBL window so the CRT
// never sees a half-updated framebuffer (matters most on
// single-buffered chunky targets like IIgs SHR; on planar c2p
// platforms it also avoids c2p racing the raster). The VBL wait
// is deliberately omitted here so the demo runs at the sprite
// pipeline's native throughput -- expect tearing on the ball.
if (haveBackup) {
spriteRestoreUnder(screen, &backup);
}
@ -190,27 +182,7 @@ int main(void) {
spriteSaveAndDraw(screen, ball, x, y, &backup);
// Bounding box of (old rect) U (new rect). For typical
// small-step motion the rects overlap heavily so the union
// is barely larger than one ball.
unionX = (oldX < backup.x) ? oldX : backup.x;
unionY = (oldY < backup.y) ? oldY : backup.y;
unionRight = (int16_t)((oldX + oldW > backup.x + backup.width)
? (oldX + oldW)
: (backup.x + backup.width));
unionBottom = (int16_t)((oldY + oldH > backup.y + backup.height)
? (oldY + oldH)
: (backup.y + backup.height));
// VBL wait removed -- the demo runs at the native compute speed
// of save+restore+draw+presentRect so we can SEE the sprite
// pipeline's actual throughput. Expect tearing on the ball
// since the present can land mid-scan; that's the cost of
// showing real frame rate. Add joeyWaitVBL() back here for
// tear-free 60 Hz motion.
stagePresentRect(unionX, unionY,
(uint16_t)(unionRight - unionX),
(uint16_t)(unionBottom - unionY));
stagePresent();
haveBackup = true;
}

View file

@ -28,7 +28,16 @@
// 4-frame measurement window. Long enough that loop overhead doesn't
// dominate; short enough to keep the full demo run under ~10 sec.
#define UBER_FRAMES 4u
/* 16 frames per timed op gives 4x the iter-count resolution of the
* earlier 4-frame budget. Exposes the actual per-op cost on slow
* ops where 4 frames produced the same iter count on different
* framerates -- e.g. drawCircle r=80 read as "4 iters / 4 frames"
* on both 60 Hz IIgs (16.7 ms/frame, 67 ms window) and 50 Hz Amiga
* (20 ms/frame, 80 ms window) even though per-op cost was equal,
* just because 4 ops at 16-17 ms happen to fit both windows. The
* 16-frame budget extends the windows to 267 ms / 320 ms; quantum
* gap shrinks to ~6%. Total run time scales 4x (~80 sec each). */
#define UBER_FRAMES 16u
typedef void (*OpFn)(void);
@ -44,9 +53,10 @@ static TileT gTileScratch;
// Run `op` in a tight loop until `targetFrames` joeyFrameCount ticks
// have elapsed. Returns iterations completed.
static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
static unsigned long runForFrames(OpFn op, unsigned int targetFrames, uint16_t *actualFramesOut) {
unsigned long count;
uint16_t startFrame;
uint16_t endFrame;
count = 0UL;
@ -57,29 +67,50 @@ static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
op();
count++;
}
/* Capture the actual elapsed frames -- the last iter typically
* overruns the target. Using actual instead of target as the
* ops/sec divisor stays honest for ops slower than 1 frame
* (where count is forced low while real time stretches well
* past targetFrames). */
endFrame = joeyFrameCount();
*actualFramesOut = (uint16_t)(endFrame - startFrame);
if (*actualFramesOut == 0u) {
*actualFramesOut = 1u; /* defensive: avoid div-by-zero */
}
return count;
}
// Time and log one op. Reports iters / N frames AND the derived
// ops/sec so per-port results are directly comparable against IIgs
// regardless of CPU speed or display refresh rate.
// regardless of CPU speed or display refresh rate. Also logs an
// FNV-1a hash of the surface state after timing -- this is the
// pixel-perfect comparison input for the cross-port validation
// harness (tools/diff-uber-hashes.py). Captured against IIgs as the
// golden reference; planar 68k rewrites validate by matching it.
static void timeOp(const char *name, OpFn op) {
unsigned long iters;
unsigned long opsPerSec;
uint16_t actualFrames;
uint32_t hash;
gCurName = name;
iters = runForFrames(op, UBER_FRAMES);
iters = runForFrames(op, UBER_FRAMES, &actualFrames);
if (iters == 0UL) {
joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name);
return;
}
opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES;
joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n",
name, iters, UBER_FRAMES, opsPerSec);
/* Divide by ACTUAL elapsed frames, not the target. For sub-frame
* ops actualFrames ~= UBER_FRAMES so the answer is unchanged;
* for ops that overrun (slow stagePresent etc.), this stops
* inflating ops/sec. */
opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)actualFrames;
hash = surfaceHash(gStage);
joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec | hash=%08lX\n",
name, iters, actualFrames, opsPerSec, (unsigned long)hash);
}
@ -125,8 +156,6 @@ static void op_spriteRestore (void) { spriteRestoreUnder(gStage, &gBackup);
static void op_spriteSaveAndDraw (void) { spriteSaveAndDraw (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); }
static void op_stagePresent (void) { stagePresent(); }
static void op_stagePresentRect8(void) { stagePresentRect( 40, 30, 16, 16); }
static void op_stagePresentRectF(void) { stagePresentRect( 0, 0, 320, 200); }
static void op_inputPoll (void) { joeyInputPoll(); }
static void op_keyDown (void) { (void)joeyKeyDown(KEY_A); }
@ -229,10 +258,14 @@ static void runAllTests(void) {
timeOp("spriteRestoreUnder", op_spriteRestore);
timeOp("spriteSaveAndDraw", op_spriteSaveAndDraw);
// Present.
// Present. One warm-up call before the timed loop primes any
// per-port one-time setup (Amiga: copper list rebuild after the
// paletteSet / scbSetRange tests dirty the cache; without warm-up
// the rebuild's MakeScreen + MrgCop + WaitTOF chain consumed the
// entire measurement window under the earlier 4-frame budget) so
// we measure steady-state throughput rather than first-call penalty.
stagePresent();
timeOp("stagePresent full", op_stagePresent);
timeOp("stagePresentRect 8b",op_stagePresentRect8);
timeOp("stagePresentRect F", op_stagePresentRectF);
// Input.
timeOp("joeyInputPoll", op_inputPoll);
@ -256,9 +289,16 @@ int main(void) {
JoeyConfigT config;
uint16_t pal[16];
int i;
uint16_t startFrame;
uint16_t endFrame;
uint16_t elapsedFrames;
unsigned long elapsedMs;
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
/* 32 KB fits the 8 pre-shifted DRAW variants the Amiga planar
* compiled sprite emitter generates. UL on the multiply because
* ORCA-C's 16-bit int overflows on 32 * 1024. */
config.codegenBytes = 32UL * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
@ -266,6 +306,11 @@ int main(void) {
if (!joeyInit(&config)) {
return 1;
}
/* joeyFrameCount is VBL-driven, so it only ticks after halInit
* installed its VBL ISR -- captured here is "everything from now
* to press-any-key". Pre-init setup time is small and not the
* cost the user is chasing; runAllTests dominates. */
startFrame = joeyFrameCount();
gStage = stageGet();
if (gStage == NULL) {
@ -337,6 +382,12 @@ int main(void) {
runAllTests();
endFrame = joeyFrameCount();
elapsedFrames = (uint16_t)(endFrame - startFrame);
elapsedMs = ((unsigned long)elapsedFrames * 1000UL) / (unsigned long)joeyFrameHz();
joeyLogF("UBER: total wall time: %lu ms (%u frames @ %u Hz)\n",
elapsedMs, elapsedFrames, (unsigned)joeyFrameHz());
// Done. Green screen + waitForKey.
surfaceClear(gStage, 2);
stagePresent();

View file

@ -5,6 +5,7 @@
void joeyLog (const char *msg);
void joeyLogF (const char *fmt, ...);
void joeyLogFlush(void);
void joeyLogReset(void);
#endif

View file

@ -15,14 +15,14 @@
#include "types.h"
// Flip the dirty regions of the stage to the display, then clear the
// dirty state. Cheap when nothing has changed since the last call.
// dirty state. Cheap when nothing has changed since the last call
// (gStageAnyDirty short-circuit). Drawing primitives mark dirty as
// a side effect, so callers only need to call stagePresent at the
// end of a frame -- everything they drew shows up.
//
// To present a region you didn't draw with the standard primitives
// (e.g. direct framebuffer poking), call surfaceMarkDirtyRect on
// the same rect first, then stagePresent.
void stagePresent(void);
// Flip a specific rectangular region of the stage to the display,
// regardless of dirty state. Coordinates are clipped to the surface;
// negative or zero dimensions are no-ops. Does not consult or modify
// the dirty arrays -- callers mixing stagePresentRect with stagePresent
// in the same frame may see redundant work on the next stagePresent.
void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h);
#endif

View file

@ -27,13 +27,16 @@
#include "surface.h"
#include "types.h"
// Sprites always write to a 4bpp packed SurfaceT, never to display
// memory directly (halPresent owns that path). The codegen emits 2
// shift variants on every platform: shift 0 for even x (sprite byte
// boundaries match destination byte boundaries) and shift 1 for odd
// x (each destination byte combines two adjacent sprite bytes'
// nibbles).
#define JOEY_SPRITE_SHIFT_COUNT 2
// Sprite codegen emits per-shift variants. Chunky 4bpp ports (DOS,
// IIgs, Atari ST) only need 2 shifts -- pixel offset 0 (sprite/dest
// byte boundaries align) and offset 1 (every dest byte combines two
// sprite bytes' nibbles). Planar ports (Amiga -- 8 px per plane byte)
// need 8 shifts: one for each x % 8 alignment, so smooth horizontal
// motion at any pixel position uses pre-shifted source bytes without
// runtime bit-shifting. Allocate the max so routineOffsets[] has
// slots for every variant; chunky ports leave shifts 2..7 as
// SPRITE_NOT_COMPILED, planar ports use all 8.
#define JOEY_SPRITE_SHIFT_COUNT 8
typedef enum {
SPRITE_FLAGS_NONE = 0

View file

@ -58,4 +58,13 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path);
// identity (no reallocation).
bool surfaceLoadFile(SurfaceT *dst, const char *path);
// FNV-1a 32-bit hash of the surface's logical pixel content (color
// indices in row-major order, 0..15 per pixel). Same logical pixels
// produce the same hash on every port regardless of internal storage
// format -- so a hash captured on IIgs (chunky) compares directly
// against the same op's output on Amiga (planar) once the planar
// rewrite is done. Used by the UBER validation harness to
// pixel-compare ports against an IIgs golden reference.
uint32_t surfaceHash(const SurfaceT *s);
#endif

View file

@ -13,7 +13,7 @@ BINDIR := $(BUILD)/bin
# independently. -I on $(SRC_PORT)/amiga lets ptplayer.h resolve
# <SDI_compiler.h> from the port-local shim alongside our HAL code.
PTPLAYER_DIR := $(REPO_DIR)/toolchains/amiga/ptplayer
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR)
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_AMIGA -m68000 -fomit-frame-pointer -noixemul -D__OSCOMPAT -I$(SRC_PORT)/amiga -I$(SRC_68K) -I$(PTPLAYER_DIR) -MMD -MP $(CFLAGS_EXTRA)
# OSCOMPAT=1 selects PTPlayer's audio.device-friendly variant (uses
# CIA-B + audio.device interrupts via the OS rather than taking over
# Paula directly), matching the way our HAL cooperates with Intuition.
@ -52,6 +52,7 @@ LIB_OBJS := \
$(patsubst $(SRC_68K)/%.s,$(BUILD)/obj/68k/%.o,$(SHARED_S)) \
$(BUILD)/obj/port/ptplayer.o \
$(BUILD)/obj/codegen/spriteEmit68k.o \
$(BUILD)/obj/codegen/spriteEmitPlanar68k.o \
$(BUILD)/obj/codegen/spriteCompile.o
LIB := $(LIBDIR)/libjoey.a
@ -156,3 +157,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx
clean-amiga:
rm -rf $(BUILD)
# Pull in per-object header-dependency files generated by gcc -MMD/-MP.
# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
# the .c files that include it, leaving a frankenstein binary where
# different TUs see different struct layouts.
-include $(LIB_OBJS:.o=.d)

View file

@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM)
LIBDIR := $(BUILD)/lib
BINDIR := $(BUILD)/bin
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K)
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_ATARIST -m68000 -fomit-frame-pointer -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -I$(REPO_DIR)/toolchains/atarist/include-shim -I$(SRC_68K) -MMD -MP
LDFLAGS :=
# libxmp-lite shared with the DOS port. Built as a static archive that
@ -148,3 +148,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx
clean-atarist:
rm -rf $(BUILD)
# Pull in per-object header-dependency files generated by gcc -MMD/-MP.
# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
# the .c files that include it, leaving a frankenstein binary where
# different TUs see different struct layouts.
-include $(LIB_OBJS:.o=.d)

View file

@ -7,7 +7,7 @@ BUILD := $(REPO_DIR)/build/$(PLATFORM)
LIBDIR := $(BUILD)/lib
BINDIR := $(BUILD)/bin
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include
CFLAGS := $(COMMON_CFLAGS) -DJOEYLIB_PLATFORM_DOS -march=i386 -m32 -I$(REPO_DIR)/toolchains/audio/libxmp-lite/include -MMD -MP
ASFLAGS := -f coff
LDFLAGS :=
@ -138,3 +138,9 @@ $(DATA_DIR)/test.sfx: $(REPO_DIR)/assets/test.sfx
clean-dos:
rm -rf $(BUILD)
# Pull in per-object header-dependency files generated by gcc -MMD/-MP.
# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
# the .c files that include it, leaving a frankenstein binary where
# different TUs see different struct layouts.
-include $(LIB_OBJS:.o=.d)

View file

@ -51,11 +51,11 @@ IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32
LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS)
# HELLO and PATTERN are intentionally omitted from this list. The UBER
# demo (below) exercises every public API, including what those two
# small examples covered, and the IIgs disk image was running out of
# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/
# for reference and for other ports that want them.
# HELLO is omitted from the disk because UBER exercises everything it
# does and the disk was tight. PATTERN is included as the SCB / palette
# golden-reference for cross-port debugging.
PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c
PATTERN_BIN := $(BINDIR)/PATTERN
DRAW_SRC := $(EXAMPLES)/draw/draw.c
DRAW_BIN := $(BINDIR)/DRAW
KEYS_SRC := $(EXAMPLES)/keys/keys.c
@ -120,24 +120,44 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh
# everywhere, so library asm can take SurfaceT* args via one
# consistent ABI (small-mm 16-bit pointers truncated bank bytes,
# which broke any asm that wanted to address bank-1 stage memory).
# Per-binary header dependency files. iix-build.sh -M emits one .d
# alongside each binary covering every header transitively included
# by the C sources in that binary's build. Pulled in via -include at
# the bottom of this file so editing a shared header (e.g.
# surfaceInternal.h) triggers a rebuild of every IIgs binary that
# transitively depends on it.
DEP_DIR := $(BUILD)/dep
PATTERN_DEP := $(DEP_DIR)/PATTERN.d
DRAW_DEP := $(DEP_DIR)/DRAW.d
KEYS_DEP := $(DEP_DIR)/KEYS.d
JOY_DEP := $(DEP_DIR)/JOY.d
SPRITE_DEP := $(DEP_DIR)/SPRITE.d
UBER_DEP := $(DEP_DIR)/UBER.d
AUDIO_DEP := $(DEP_DIR)/AUDIO.d
$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -M $(PATTERN_DEP) $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
$(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -M $(DRAW_DEP) $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
$(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -M $(KEYS_DEP) $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
$(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -M $(JOY_DEP) $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
$(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -M $(SPRITE_DEP) $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
# UBER bumps user stack to 16 KB. ORCA-C's default user stack is small
@ -147,8 +167,8 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
# decimal formatter in uber.c also uses larger stack-local buffers
# (line[96], num[16]) than typical demos. 16 KB is plenty of headroom.
$(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -s 16384 -M $(UBER_DEP) $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
# Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime
@ -170,17 +190,23 @@ AUDIO_DATA_FILES := $(AUDIO_SFX)
endif
$(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
@mkdir -p $(dir $@) $(DEP_DIR)
$(IIGS_BUILD) -b -M $(AUDIO_DEP) $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
# Assemble a ProDOS 2img containing the examples, ready to mount in
# GSplus alongside a GS/OS boot volume.
iigs-disk: $(DISK_IMG)
$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
$(DISK_IMG): $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
@mkdir -p $(dir $@)
$(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)
$(IIGS_PACKAGE) $@ $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)
clean-iigs:
rm -rf $(BUILD)
# Pull in per-binary header-dependency files generated by iix-build.sh -M.
# Without this, editing a header (e.g. surfaceInternal.h) doesn't rebuild
# IIgs binaries that include it -- the IIgs's iix toolchain has no native
# -MMD analog, so iix-build.sh shells out to host gcc for the scan.
-include $(PATTERN_DEP) $(DRAW_DEP) $(KEYS_DEP) $(JOY_DEP) $(SPRITE_DEP) $(UBER_DEP) $(AUDIO_DEP)

View file

@ -14,6 +14,7 @@
#include "joey/sprite.h"
#include "joey/surface.h"
#include "codegenArenaInternal.h"
#include "hal.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
#include "surfaceInternal.h"
@ -33,7 +34,9 @@
static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
#if defined(JOEYLIB_PLATFORM_DOS)
return spriteEmitDrawX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
#elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitDrawPlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitDraw68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitDrawIigs(out, sp, shift);
@ -51,7 +54,9 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
#if defined(JOEYLIB_PLATFORM_DOS)
return spriteEmitSaveX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
#elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitSavePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitSave68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitSaveIigs(out, sp, shift);
@ -65,7 +70,9 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
#if defined(JOEYLIB_PLATFORM_DOS)
return spriteEmitRestoreX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
#elif defined(JOEYLIB_PLATFORM_AMIGA)
return spriteEmitRestorePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitRestore68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitRestoreIigs(out, sp, shift);
@ -114,6 +121,13 @@ bool spriteCompile(SpriteT *sp) {
if (sp->tileData == NULL) {
return false;
}
/* Amiga (post-Phase 9) uses spriteEmitPlanar68k.c which writes
* directly to bitplanes. DRAW emits a unique pre-shifted variant
* per shift in 0..7 (smooth horizontal motion at any pixel x);
* SAVE/RESTORE emit only shift 0 and shift 1 since shifted variants
* 1..7 share identical bytes (plain memcpy of widthTiles+1 plane
* bytes per row). The post-emit pass below aliases slots 2..7
* for save/restore to slot 1's bytes. */
scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES);
if (scratch == NULL) {
@ -150,6 +164,16 @@ bool spriteCompile(SpriteT *sp) {
}
}
}
#if defined(JOEYLIB_PLATFORM_AMIGA)
/* Save/restore bytes for any non-zero shift are identical (plain
* memcpy of widthTiles+1 plane bytes per row). The emitter emits
* them once at slot 1; alias slots 2..7 here so the dispatcher
* gate (sprite.c) sees them as compiled. */
for (shift = 2; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
sp->routineOffsets[shift][SPRITE_OP_SAVE] = sp->routineOffsets[1][SPRITE_OP_SAVE];
sp->routineOffsets[shift][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE];
}
#endif
sp->slot = slot;
free(scratch);
return true;
@ -554,6 +578,112 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
}
}
#elif defined(JOEYLIB_PLATFORM_AMIGA)
/* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with
 * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to
 * bitplanes. Compute byteOff = y*40 + x/8 and pass plane[i]+byteOff
 * as the 4 plane args. shift = x % 8 selects the variant: DRAW has a
 * distinct pre-shifted routine for every shift 0..7, while SAVE and
 * RESTORE are emitted for shifts 0 and 1 only and spriteCompile
 * aliases slots 2..7 to slot 1. Callers should already have gated on
 * routineOffsets[shift][op] != SPRITE_NOT_COMPILED.
 *
 * When a slot is SPRITE_NOT_COMPILED, the dispatcher in
 * src/core/sprite.c (spriteDraw / spriteSaveUnder / spriteRestoreUnder)
 * falls back to the interpreter, which handles arbitrary x via
 * halSpriteDrawPlanes / halSpriteSavePlanes / halSpriteRestorePlanes. */
#define AMIGA_BYTES_PER_ROW_LOCAL 40
/* Amiga planar DRAW dispatcher: invokes the compiled routine for the
 * x % 8 alignment of `x`, handing it the four plane pointers already
 * advanced to the byte holding pixel (x, y).
 *
 *   dst  - surface whose bitplanes receive the sprite.
 *   sp   - compiled sprite; sp->slot and
 *          sp->routineOffsets[x & 7][SPRITE_OP_DRAW] are used without
 *          validation, so the caller must have checked them.
 *   x, y - destination pixel position; no clipping is done here.
 *
 * Returns without drawing when plane 0 is unavailable. */
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*PlanarDrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
    uint8_t      *plane0;
    uint8_t      *plane1;
    uint8_t      *plane2;
    uint8_t      *plane3;
    uint8_t       variant;
    uint16_t      planeOff;
    PlanarDrawFn  routine;

    /* Low three bits of x pick the pre-shifted variant; the remaining
     * bits give the byte column within the row. */
    variant  = (uint8_t)(x & 7);
    planeOff = (uint16_t)(((uint16_t)x >> 3) + (uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL);

    /* Only plane 0 is NULL-checked; planes 1..3 are fetched after it. */
    plane0 = halSurfacePlanePtr(dst, 0);
    if (plane0 == NULL) {
        return;
    }
    plane1 = halSurfacePlanePtr(dst, 1);
    plane2 = halSurfacePlanePtr(dst, 2);
    plane3 = halSurfacePlanePtr(dst, 3);

    /* Routine address = arena base + this sprite's slot + per-variant offset. */
    routine = (PlanarDrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_DRAW]);
    routine(plane0 + planeOff, plane1 + planeOff, plane2 + planeOff, plane3 + planeOff);
}
/* Amiga planar SAVE dispatcher: records in `backup` the 8-px-aligned
 * rectangle the sprite will cover at (x, y), then calls the compiled
 * save routine to copy those plane bytes into backup->bytes.
 *
 *   src    - surface whose bitplanes are read.
 *   sp     - compiled sprite; sp->slot and
 *            sp->routineOffsets[x & 7][SPRITE_OP_SAVE] are used
 *            without validation.
 *   x, y   - pixel position the sprite is about to be drawn at.
 *   backup - receives sprite, x (rounded down to a multiple of 8), y,
 *            width, height, sizeBytes and the saved bytes.
 *
 * The backup header fields are written before the plane-0 NULL check,
 * so they are populated even when the function returns early without
 * saving any bytes. */
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    typedef void (*PlanarSaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
    uint8_t      *plane0;
    uint8_t      *plane1;
    uint8_t      *plane2;
    uint8_t      *plane3;
    uint8_t       variant;
    int16_t       alignedX;
    uint16_t      spanPx;
    uint16_t      rowsPx;
    uint16_t      planeOff;
    PlanarSaveFn  routine;

    variant  = (uint8_t)(x & 7);
    alignedX = (int16_t)(x & ~7);
    rowsPx   = (uint16_t)(sp->heightTiles * 8);

    /* A sprite at a non-zero shift straddles one extra plane byte per
     * row, i.e. the saved span is 8 px wider. */
    spanPx = (uint16_t)(sp->widthTiles * 8);
    if (variant != 0u) {
        spanPx = (uint16_t)(spanPx + 8u);
    }

    planeOff = (uint16_t)(((uint16_t)alignedX >> 3) + (uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL);

    backup->sprite = sp;
    backup->x      = alignedX;
    backup->y      = y;
    backup->width  = spanPx;
    backup->height = rowsPx;
    /* Four planes, each rowsPx * (spanPx / 8) bytes => rowsPx * spanPx / 2. */
    backup->sizeBytes = (uint16_t)((uint16_t)rowsPx * (spanPx >> 1));

    plane0 = halSurfacePlanePtr(src, 0);
    if (plane0 == NULL) {
        return;
    }
    plane1 = halSurfacePlanePtr(src, 1);
    plane2 = halSurfacePlanePtr(src, 2);
    plane3 = halSurfacePlanePtr(src, 3);

    routine = (PlanarSaveFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_SAVE]);
    routine(plane0 + planeOff, plane1 + planeOff, plane2 + planeOff, plane3 + planeOff, backup->bytes);
}
/* Amiga planar RESTORE dispatcher: calls the compiled restore routine
 * to copy backup->bytes back into the four bitplanes at the position
 * recorded in `backup`.
 *
 *   dst    - surface whose bitplanes are rewritten.
 *   backup - backup filled in by spriteCompiledSaveUnder; its sprite,
 *            x, y, width and bytes fields are read.
 *
 * Returns without restoring when plane 0 is unavailable. */
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    typedef void (*PlanarRestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
    SpriteT         *owner;
    uint8_t         *plane0;
    uint8_t         *plane1;
    uint8_t         *plane2;
    uint8_t         *plane3;
    uint8_t          variant;
    uint16_t         planeOff;
    PlanarRestoreFn  routine;

    owner = backup->sprite;

    /* backup->x was rounded down to a multiple of 8 when saved, so the
     * original shift cannot be recovered from x & 7. Recover it from
     * the saved width instead: a span wider than the sprite's own
     * widthTiles*8 means the save was at a non-zero shift. Slot 1
     * stands in for every non-zero shift. */
    if (backup->width > (uint16_t)(owner->widthTiles * 8)) {
        variant = (uint8_t)1u;
    } else {
        variant = (uint8_t)0u;
    }

    planeOff = (uint16_t)(((uint16_t)backup->x >> 3) + (uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL);

    plane0 = halSurfacePlanePtr(dst, 0);
    if (plane0 == NULL) {
        return;
    }
    plane1 = halSurfacePlanePtr(dst, 1);
    plane2 = halSurfacePlanePtr(dst, 2);
    plane3 = halSurfacePlanePtr(dst, 3);

    routine = (PlanarRestoreFn)(codegenArenaBase() + owner->slot->offset + owner->routineOffsets[variant][SPRITE_OP_RESTORE]);
    routine(plane0 + planeOff, plane1 + planeOff, plane2 + planeOff, plane3 + planeOff, backup->bytes);
}
#else
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {

View file

@ -166,6 +166,13 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint8_t value;
uint8_t opaqueMask;
// Chunky 4bpp has only two nibble-alignment positions; the
// dispatcher uses x & 1 so shifts 2..7 are unreachable. Bail
// early so the arena slot stays SPRITE_NOT_COMPILED.
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
@ -225,6 +232,10 @@ uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t heightPx;
uint16_t copyBytes;
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
@ -248,6 +259,10 @@ uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t heightPx;
uint16_t copyBytes;
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));

View file

@ -189,6 +189,10 @@ uint16_t spriteEmitSaveIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t spriteBytesPerRow;
uint16_t copyBytes;
if (shift > 1u) {
return 0u;
}
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
copyBytes = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
@ -205,6 +209,10 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t spriteBytesPerRow;
uint16_t copyBytes;
if (shift > 1u) {
return 0u;
}
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
copyBytes = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
@ -258,6 +266,10 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint8_t nextOpaqueMask;
bool wide;
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);

View file

@ -0,0 +1,505 @@
// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow).
//
// Emits PIC routines that write directly to the four bitplanes via 4
// address-register pointers (a0..a3 = plane[0..3] base + byteOff,
// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this).
//
// Calling convention (cdecl on m68k-amigaos-gcc):
// draw(p0, p1, p2, p3):
// args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane.
// loaded into a0..a3 by the prologue.
// save(p0, p1, p2, p3, backup):
// 5 args; backup at 20(sp), loaded into a4.
// restore(p0, p1, p2, p3, backup):
// same as save but reads backup, writes planes.
//
// Per-byte plane write encoding decisions:
// - all-transparent (mask=0): skip the byte entirely
// - all-opaque (mask=0xFF): move.b #imm, d16(an) (6 bytes)
// - mixed (0<mask<0xFF): move.b d16(an), d0;
// andi.b #~mask, d0;
// ori.b #imm, d0;
// move.b d0, d16(an) (4+4+4+4 = 16 bytes)
//
// Per row advance: 4 plane pointers each get adda.w #SURFACE_WIDTH/8
// = adda.w #40, an (4 bytes encoded each, 16 bytes total per row).
// We omit the advance after the last row.
//
// Shift handling: shifts 0..7 are pre-baked. The dispatcher selects
// the variant via x % 8 and pre-computes byteOff = y*40 + (x & ~7)/8
// (i.e. round x DOWN to 8-pixel boundary). The variant for shift s
// then emits to (widthTiles + 1) plane bytes per row when s != 0
// (the rightmost shift bits spill into one extra plane byte) and to
// widthTiles plane bytes per row when s == 0.
//
// The emitter assumes sprite width is a multiple of 8 (= a multiple
// of one tile = a multiple of 8 pixels) so plane bytes per row are
// integer. JoeyLib sprites are always tile-multiple by API contract.
#include "joey/sprite.h"
#include "joey/surface.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
// ----- Constants -----
#define TILE_PIXELS 8
#define TILE_BYTES 32
#define TILE_BYTES_PER_ROW 4
#define TRANSPARENT_NIBBLE 0
#define AMIGA_BITPLANES 4
#define AMIGA_BYTES_PER_ROW 40
// ----- Instruction encoding helpers -----
// Stores `value` at out[0..1] in big-endian (68k instruction stream)
// byte order. Returns the number of bytes written (always 2) so the
// caller can add it straight onto its output cursor.
static uint16_t writeBE16(uint8_t *out, uint16_t value) {
    const uint8_t highByte = (uint8_t)((value >> 8) & 0xFFu);
    const uint8_t lowByte  = (uint8_t)(value & 0xFFu);

    *out       = highByte;
    *(out + 1) = lowByte;
    return 2u;
}
// movea.l d16(SP), an -- load arg at SP+disp into An.
// Encoding: 0010 nnn 001 101 111 + disp16
// = 0x206F + (n << 9), where n is dst An.
// (Source EA is mode 101 = d16(An) with reg 111 = A7/SP; mode 010
// would be plain (SP) with no displacement word.)
// a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F.
static const uint16_t kMoveaSpToAn[] = {
0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu
};
// adda.w #imm, an -- adds 16-bit signed imm to An (sign-extended).
// Encoding: 1101 nnn 011 111 100 + imm
// = 0xD0FC + (n << 9).
static const uint16_t kAddaWImmToAn[] = {
0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu
};
// ANDI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0010 00 000 000 (size=byte, mode=Dn, reg=D0)
#define ANDI_B_IMM_D0 0x0200u
// ORI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0000 00 000 000
#define ORI_B_IMM_D0 0x0000u
// MOVE.B d16(An), D0 -- 4 bytes (opcode + disp).
// Encoding: 0001 ddd mmm MMM rrr with
// size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn),
// src mode=101 (d16,An), src reg=An.
// = 0001 000 000 101 nnn = 0x1028 + An.
// Indexed by plane register a0..a3.
static const uint16_t kMoveBD16AnToD0[] = {
0x1028u, 0x1029u, 0x102Au, 0x102Bu
};
// MOVE.B D0, d16(An) -- 4 bytes (opcode + disp).
// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9).
// Indexed by plane register a0..a3.
static const uint16_t kMoveBD0ToD16An[] = {
0x1140u, 0x1340u, 0x1540u, 0x1740u
};
// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp).
// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9).
// (Was 0x113C earlier -- that's dst mode=100=predec; dst mode=101=d16(An)
// is the bit difference. Predec emits a 4-byte instruction with no
// disp word, so the byte stream went out of sync and every
// subsequent instruction decoded into garbage.)
static const uint16_t kMoveBImmToD16An[] = {
0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu
};
// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp). -- used by restore (backup in a4 -> planes)
// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9).
static const uint16_t kMoveBA4PostincToD16An[] = {
0x115Cu, 0x135Cu, 0x155Cu, 0x175Cu
};
// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp). -- used by save (planes -> backup)
// Destination is (a4)+: dst reg = 100 (A4), dst mode = 011 ((An)+).
// Source is d16(An): src mode = 101, src reg = An.
// Encoding: 0001 100 011 101 nnn = 0x18E8 + An.
static const uint16_t kMoveBD16AnToA4Postinc[] = {
0x18E8u, 0x18E9u, 0x18EAu, 0x18EBu
};
// MOVEM.L reglist, -(SP) -- 4 bytes (opcode + reglist mask).
// Opcode 0x48E7. Predec mask is REVERSED vs all other modes:
// bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2,
// bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7.
#define MOVEM_L_PUSH_OPCODE 0x48E7u
#define MOVEM_L_MASK_A2_A3 0x0030u /* bits 5,4 = A2,A3 (predec order) */
#define MOVEM_L_MASK_A2_A3_A4 0x0038u /* bits 5,4,3 = A2,A3,A4 */
// MOVEM.L (SP)+, reglist -- 4 bytes (opcode + reglist mask).
// Opcode 0x4CDF. Postinc mask follows the standard layout:
// bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7.
#define MOVEM_L_POP_OPCODE 0x4CDFu
#define MOVEM_L_MASK_POP_A2_A3 0x0C00u /* bits 11,10 = A3,A2 */
#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u /* bits 12,11,10 = A4,A3,A2 */
// RTS opcode.
#define OPCODE_RTS 0x4E75u
// ----- Emit helpers -----
// For shift 0 (byte-aligned x), the sprite's chunky tile data converts
// directly to plane bytes without any sub-byte shifting. For each
// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky
// bytes (= 8 pixels) and produce one plane byte; we also produce a
// mask byte indicating which pixel positions are non-transparent
// (any plane bit != 0 in the source means non-transparent if
// transparent index is 0, the JoeyLib convention).
//
// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows
// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the
// sprite. For plane-byte column `c` of row `r`:
// tileX = c (since each plane byte covers exactly one tile column)
// tileY = r / 8
// inTileY = r % 8
// chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3
//
// `col` must be in [0, widthTiles); callers handle out-of-range cols
// (used when computing shifted variants that span widthTiles+1 output
// bytes per row) by passing a sentinel and checking against widthTiles
// before invoking this helper.
static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col,
uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
{
uint16_t tileX;
uint16_t tileY;
uint16_t inTileY;
const uint8_t *tile;
const uint8_t *chunky;
uint8_t nibbles[8];
uint8_t b0, b1, b2, b3;
uint16_t p;
uint8_t bitMask;
uint8_t pix;
tileX = col;
tileY = row >> 3;
inTileY = row & 7u;
tile = sp->tileData + (uint32_t)((tileY * sp->widthTiles + tileX) * 32u);
chunky = tile + inTileY * 4u;
nibbles[0] = (uint8_t)(chunky[0] >> 4);
nibbles[1] = (uint8_t)(chunky[0] & 0x0Fu);
nibbles[2] = (uint8_t)(chunky[1] >> 4);
nibbles[3] = (uint8_t)(chunky[1] & 0x0Fu);
nibbles[4] = (uint8_t)(chunky[2] >> 4);
nibbles[5] = (uint8_t)(chunky[2] & 0x0Fu);
nibbles[6] = (uint8_t)(chunky[3] >> 4);
nibbles[7] = (uint8_t)(chunky[3] & 0x0Fu);
b0 = 0u; b1 = 0u; b2 = 0u; b3 = 0u;
*maskByte = 0u;
for (p = 0; p < 8u; p++) {
pix = nibbles[p];
if (pix == TRANSPARENT_NIBBLE) {
continue;
}
bitMask = (uint8_t)(0x80u >> p);
*maskByte = (uint8_t)(*maskByte | bitMask);
if (pix & 1u) b0 = (uint8_t)(b0 | bitMask);
if (pix & 2u) b1 = (uint8_t)(b1 | bitMask);
if (pix & 4u) b2 = (uint8_t)(b2 | bitMask);
if (pix & 8u) b3 = (uint8_t)(b3 | bitMask);
}
planeBytes[0] = b0;
planeBytes[1] = b1;
planeBytes[2] = b2;
planeBytes[3] = b3;
}
// Shifted variant: produces 4 plane bytes and 1 mask byte for output
// column `outCol` (0..widthTiles inclusive) of row `row` when the
// sprite is shifted right by `shift` pixels (1..7). For shift 0,
// callers should use planeByteAndMaskAt directly (faster, no spill).
//
// Each output byte is composed of bits drawn from up to two source
// plane bytes:
// leftPart = src[outCol-1] << (8 - shift) (high (shift) bits)
// rightPart = src[outCol] >> shift (low (8-shift) bits)
// with src[-1] and src[widthTiles] treated as 0/transparent. The
// resulting plane byte is leftPart | rightPart; the mask byte is the
// shifted union of the per-byte source masks.
static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol,
                                    uint8_t shift, uint16_t widthTiles,
                                    uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
{
    uint8_t  prevPlanes[AMIGA_BITPLANES] = { 0u };
    uint8_t  curPlanes[AMIGA_BITPLANES]  = { 0u };
    uint8_t  prevMask = 0u;
    uint8_t  curMask  = 0u;
    uint8_t  plane;

    /* Source column to the left of outCol supplies the bits that were
     * pushed out of it by the right shift. Absent (stays zero) for the
     * first output column. */
    if (outCol >= 1u && outCol <= widthTiles) {
        planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), prevPlanes, &prevMask);
    }
    /* Source column at outCol supplies the remaining bits. Absent for
     * the extra spill column (outCol == widthTiles). */
    if (outCol < widthTiles) {
        planeByteAndMaskAt(sp, row, outCol, curPlanes, &curMask);
    }

    *maskByte = (uint8_t)(((prevMask << (8u - shift)) & 0xFFu) |
                          ((curMask >> shift) & 0xFFu));
    for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
        planeBytes[plane] = (uint8_t)(((prevPlanes[plane] << (8u - shift)) & 0xFFu) |
                                      ((curPlanes[plane] >> shift) & 0xFFu));
    }
}
// Emit code that merges one plane byte into d16(an), where `an` is the
// plane register index (0..3 = a0..a3) and `disp` is the byte offset
// from the register's current row base. `maskByte` has a 1 for every
// opaque pixel; `srcByte` holds the plane bits for those pixels.
// Returns the updated cursor. Three encodings are used:
//   mask == 0x00 : nothing emitted
//   mask == 0xFF : move.b #src, d16(an)
//   otherwise    : read-modify-write through d0
static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor,
                                     uint8_t an, uint8_t disp,
                                     uint8_t maskByte, uint8_t srcByte)
{
    uint16_t words[8];
    uint16_t count;
    uint16_t w;

    if (maskByte == 0u) {
        return cursor; /* fully transparent: leave the destination alone */
    }

    if (maskByte == 0xFFu) {
        /* Fully opaque: overwrite the destination byte directly. */
        words[0] = kMoveBImmToD16An[an];
        words[1] = (uint16_t)srcByte;
        words[2] = (uint16_t)disp;
        count    = 3u;
    } else {
        /* Mixed: load, clear the opaque bits, OR in the sprite bits,
         * store back. */
        words[0] = kMoveBD16AnToD0[an];
        words[1] = (uint16_t)disp;
        words[2] = ANDI_B_IMM_D0;
        words[3] = (uint16_t)((~maskByte) & 0xFFu);
        words[4] = ORI_B_IMM_D0;
        words[5] = (uint16_t)srcByte;
        words[6] = kMoveBD0ToD16An[an];
        words[7] = (uint16_t)disp;
        count    = 8u;
    }

    for (w = 0; w < count; w++) {
        cursor += writeBE16(out + cursor, words[w]);
    }
    return cursor;
}
// ----- Public API -----
// Emits the DRAW routine for `sp` pre-shifted right by `shift` pixels
// (0..7) into `out`. Returns the number of bytes emitted, or 0 when
// `shift` is out of range. Shifted variants cover one extra plane byte
// per row for the bits that spill past the sprite's right edge.
uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t pos;
    uint16_t y;
    uint16_t byteCol;
    uint16_t rows;
    uint16_t tilesWide;
    uint16_t rowBytes; /* plane bytes touched per row */
    uint8_t  bits[AMIGA_BITPLANES];
    uint8_t  opaque;
    uint8_t  plane;

    if (shift > 7u) {
        return 0u;
    }

    rows      = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    tilesWide = (uint16_t)sp->widthTiles;
    rowBytes  = (uint16_t)(tilesWide + (shift == 0u ? 0u : 1u));
    pos       = 0;

    /* Prologue: a2/a3 are callee-saved and get overwritten with plane
     * pointers, so push them. The push moves SP down by 8, so the
     * first argument sits at 12(sp) rather than 4(sp). */
    pos += writeBE16(out + pos, MOVEM_L_PUSH_OPCODE);
    pos += writeBE16(out + pos, MOVEM_L_MASK_A2_A3);
    for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
        pos += writeBE16(out + pos, kMoveaSpToAn[plane]);
        pos += writeBE16(out + pos, (uint16_t)(8u + 4u + plane * 4u));
    }

    for (y = 0; y < rows; y++) {
        /* Step all four plane pointers to the next scanline before
         * every row except the first (so none trails the last row). */
        if (y != 0u) {
            for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
                pos += writeBE16(out + pos, kAddaWImmToAn[plane]);
                pos += writeBE16(out + pos, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
        for (byteCol = 0; byteCol < rowBytes; byteCol++) {
            if (shift != 0u) {
                planeByteAndMaskShifted(sp, y, byteCol, shift, tilesWide, bits, &opaque);
            } else {
                planeByteAndMaskAt(sp, y, byteCol, bits, &opaque);
            }
            for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
                pos = emitMergeByteToD16An(out, pos, plane, (uint8_t)byteCol,
                                           opaque, bits[plane]);
            }
        }
    }

    /* Epilogue: pop a2/a3 and return. */
    pos += writeBE16(out + pos, MOVEM_L_POP_OPCODE);
    pos += writeBE16(out + pos, MOVEM_L_MASK_POP_A2_A3);
    pos += writeBE16(out + pos, OPCODE_RTS);
    return pos;
}
// SAVE: planes -> backup. Emits a routine that copies the sprite's
// footprint out of the four bitplanes into one contiguous backup
// buffer and returns the routine's length in bytes.
//
// Backup layout is plane-major: all rows of plane 0, then all rows of
// plane 1, and so on -- 4 * heightPx * bytesPerRow bytes in total.
// The original note on this routine says this matches the
// halSpriteSavePlanes format so the buffers are interchangeable.
//
// Only shifts 0 and 1 are emitted: shift 0 copies widthTiles bytes per
// row, shift 1 copies widthTiles + 1. Shifts 2..7 would copy the same
// bytes as shift 1, so 0 is returned for them; per the original note,
// the spriteCompile post-emit pass aliases their routineOffsets to
// slot 1.
uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t heightPx;
    uint16_t bytesPerRow;
    uint8_t i;

    if (shift > 1u) {
        return 0u;
    }

    cursor = 0;
    heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));

    /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane
     * pointers + backup pointer. After the push, all stack arg disps
     * shift by +12 (three longs). */
    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
    }
    /* a4 = backup. */
    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));

    /* Plane-major copy. Plane i is addressed through its own register
     * (a0..a3), so once its rows are copied that register is never
     * read again and does not need rewinding: a0/a1 are scratch and
     * a2/a3 are reloaded by the epilogue. a4 walks the backup buffer
     * via post-increment. */
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        for (row = 0; row < heightPx; row++) {
            for (col = 0; col < bytesPerRow; col++) {
                /* move.b col(ai), (a4)+ */
                cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]);
                cursor += writeBE16(out + cursor, (uint16_t)col);
            }
            /* Step this plane to the next scanline, except after the
             * last row. */
            if (row + 1u < heightPx) {
                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }

    /* Epilogue: pop a2/a3/a4 and return. */
    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
    cursor += writeBE16(out + cursor, OPCODE_RTS);
    return cursor;
}
// RESTORE: backup -> planes. Mirror of save: emits a routine that
// copies a plane-major backup buffer (all rows of plane 0, then plane
// 1, ...) back into the four bitplanes using MOVE.B (a4)+, d16(an).
// Returns the routine's length in bytes.
//
// Only shifts 0 and 1 are emitted (widthTiles and widthTiles + 1
// bytes per row respectively); 0 is returned for shifts 2..7.
uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t heightPx;
    uint16_t bytesPerRow;
    uint8_t i;

    if (shift > 1u) {
        return 0u;
    }

    cursor = 0;
    heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));

    /* Callee-save a2/a3/a4; arg disps shift by +12. */
    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
    }
    /* a4 = backup. */
    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));

    /* Plane-major copy. Each plane has its own pointer register
     * (a0..a3) that is not read again once its rows are written, so no
     * rewind is emitted between planes: a0/a1 are scratch and a2/a3
     * are reloaded by the epilogue. */
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        for (row = 0; row < heightPx; row++) {
            for (col = 0; col < bytesPerRow; col++) {
                /* move.b (a4)+, col(ai) */
                cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]);
                cursor += writeBE16(out + cursor, (uint16_t)col);
            }
            /* Step this plane to the next scanline, except after the
             * last row. */
            if (row + 1u < heightPx) {
                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }

    /* Epilogue: pop a2/a3/a4 and return. */
    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
    cursor += writeBE16(out + cursor, OPCODE_RTS);
    return cursor;
}

View file

@ -200,6 +200,10 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint8_t v3;
uint8_t m;
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
@ -313,6 +317,10 @@ uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t heightPx;
uint16_t copyBytes;
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
@ -339,6 +347,10 @@ uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t heightPx;
uint16_t copyBytes;
if (shift > 1u) {
return 0u;
}
cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));

View file

@ -42,4 +42,19 @@ uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitSave68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
// Planar 68k emitters (Amiga). Distinct from the chunky 68k emitters
// above because the destination addressing is across 4 separate
// bitplane buffers, not a single packed-pixel surface. Calling
// convention for the emitted bytes (cdecl):
// void draw (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
// void save (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
// void restore (uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
// Each pi is plane_base + byteOff (= y*40 + x/8 already added by the
// dispatcher). Each emitter returns 0 for shifts it does not generate:
// draw emits shifts 0..7; save and restore emit only shifts 0 and 1
// (shifts 2..7 copy the same bytes as shift 1 and reuse its routine).
uint16_t spriteEmitDrawPlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitSavePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestorePlanar68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
#endif

View file

@ -1,11 +1,18 @@
// Cross-platform "where did it hang?" logger. Each call opens
// joeylog.txt, appends a line, fflushes, closes. Slow but durable
// -- the last line in the file is guaranteed to be on disk before
// any subsequent operation that might hang.
// Cross-platform "where did it hang?" logger. Holds joeylog.txt open
// across calls; libc's stdio buffer absorbs writes (~4 KB) and the
// final fclose at program exit (via atexit) gets the buffer to disk.
//
// Build only as needed for diagnostics; remove the calls when the
// bug is fixed. The hang on ST kept us looking at the wrong layer
// without this kind of trace.
// Earlier rev opened+closed per call for crash durability ("last line
// guaranteed on disk if we hang"); that cost ~1 second per call
// through GoldenGate's ProDOS FST emulation -- a 50-line UBER run
// burned ~5 minutes in IO. Even per-line fflush is too expensive
// because every fflush forces an FST WRITE, and host-OS file IO time
// isn't tracked by the IIgs VBL counter so wall-time logs underreport.
//
// Tradeoff: if the program crashes mid-run, buffered log lines may
// not reach disk. For UBER and similar batch demos that's acceptable;
// for hang-debugging where durability matters, call joeyLogFlush()
// at the suspected hang points.
#include <stdio.h>
#include <stdarg.h>
@ -15,6 +22,27 @@
static const char *kLogPath = "joeylog.txt";
static FILE *gLogFp = NULL;
/* 16 KB is enough for UBER's full log (~5 KB) plus generous headroom,
* so the file never auto-flushes mid-run. ORCA-C / libnix default
* buffers are only ~512 bytes; with that, a 50-line log triggers ~10
* ProDOS / AmigaDOS WRITEs through the host FST, each of which is
* untracked-host-time (seconds). Buffer the whole thing in memory and
* let the atexit fclose flush once. */
#define JOEY_LOG_BUF_BYTES 16384
static char gLogBuf[JOEY_LOG_BUF_BYTES];
/* Returns the shared log stream, opening kLogPath for append on first
 * use and attaching gLogBuf as a fully-buffered stdio buffer. Returns
 * NULL when the open fails; gLogFp stays NULL in that case, so callers
 * simply skip logging. */
static FILE *logFile(void) {
    FILE *opened;

    if (gLogFp != NULL) {
        return gLogFp;
    }
    opened = fopen(kLogPath, "a");
    if (opened == NULL) {
        return NULL;
    }
    (void)setvbuf(opened, gLogBuf, _IOFBF, sizeof(gLogBuf));
    gLogFp = opened;
    return gLogFp;
}
void joeyLog(const char *msg) {
@ -22,13 +50,12 @@ void joeyLog(const char *msg) {
if (msg == NULL) {
return;
}
fp = fopen(kLogPath, "a");
fp = logFile();
if (fp == NULL) {
return;
}
fputs(msg, fp);
fputc('\n', fp);
fclose(fp);
}
@ -38,7 +65,7 @@ void joeyLogF(const char *fmt, ...) {
if (fmt == NULL) {
return;
}
fp = fopen(kLogPath, "a");
fp = logFile();
if (fp == NULL) {
return;
}
@ -46,14 +73,27 @@ void joeyLogF(const char *fmt, ...) {
vfprintf(fp, fmt, args);
va_end(args);
fputc('\n', fp);
fclose(fp);
}
/* Pushes any buffered log lines to the file. Does nothing when the
 * log has not been opened. */
void joeyLogFlush(void) {
    if (gLogFp == NULL) {
        return;
    }
    fflush(gLogFp);
}
/* Empties the log file. Any open log stream is closed first so its
 * buffered lines are written out before the truncate rather than
 * after it; the next joeyLog* call reopens the file for append. */
void joeyLogReset(void) {
    FILE *fp;

    if (gLogFp != NULL) {
        fclose(gLogFp);
        gLogFp = NULL;
    }
    /* Truncate by opening for write then closing. This is the only
     * stream opened here, so nothing is left unclosed. */
    fp = fopen(kLogPath, "w");
    if (fp != NULL) {
        fclose(fp);
    }
}

View file

@ -186,13 +186,17 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
continue;
}
/* Phase 9: planar ports have NULL s->pixels and the asm fast
* paths take a chunky-row pointer. Skip them on planar; the C
* fallback below uses halSamplePixel which works on both
* storage layouts. */
if (s->pixels != NULL) {
// Highest-tier asm fast path: seed-test + walk-left + walk-right
// + 1-row fill + scan-above + scan-below + push, all in one
// cross-segment call. The asm caches row addr / match decoder
// across every sub-operation. C just pops and dispatches; this
// path completes the entire per-seed work and computes the row
// address itself, so we don't pay y*160 in C unless we fall back.
{
bool seedMatched;
if (halFastFloodWalkAndScans(s->pixels, x, y,
matchColor, newNibble, matchEqual,
@ -203,22 +207,27 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
}
}
// Fallback path needs row; compute it here so the asm path
// above doesn't pay for an unused y*160 multiply on every iter.
row = &s->pixels[SURFACE_ROW_OFFSET(y)];
/* Fallback path: compute row only if chunky; halFastFloodWalk
* needs it but isn't implemented on Amiga. */
row = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(y)] : NULL;
// Tier-2 asm fast path: combined seed test + walk-left +
// walk-right in one cross-segment call. Falls back to the
// pure-C walks below on ports without an asm implementation.
{
bool seedMatched;
if (halFastFloodWalk(row, x, matchColor, newNibble, matchEqual,
if (row != NULL && halFastFloodWalk(row, x, matchColor, newNibble, matchEqual,
&seedMatched, &leftX, &rightX)) {
if (!seedMatched) {
continue;
}
} else if (halFloodWalkPlanes(s, x, y, matchColor, newNibble, matchEqual,
&seedMatched, &leftX, &rightX)) {
if (!seedMatched) {
continue;
}
} else {
pix = srcPixel(row, x);
pix = halSamplePixel(s, x, y);
pixMatch = (pix == matchColor);
if (matchEqual) {
if (!pixMatch) {
@ -233,7 +242,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
// Walk left to find the start of the matching run.
leftX = x;
while (leftX > 0) {
pix = srcPixel(row, (int16_t)(leftX - 1));
pix = halSamplePixel(s, (int16_t)(leftX - 1), y);
pixMatch = (pix == matchColor);
if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) {
break;
@ -244,7 +253,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
// Walk right to find the end.
rightX = x;
while (rightX < SURFACE_WIDTH - 1) {
pix = srcPixel(row, (int16_t)(rightX + 1));
pix = halSamplePixel(s, (int16_t)(rightX + 1), y);
pixMatch = (pix == matchColor);
if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) {
break;
@ -256,12 +265,18 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
// Fill the span. Bypass fillRect's clipping wrapper: walk-out
// already guaranteed leftX/rightX are in [0..SURFACE_WIDTH-1]
// and the seed-pop bounds check did the same for y.
// and the seed-pop bounds check did the same for y. We DO
// need the planar dual-write (which fillRect's wrapper would
// call), so invoke halFillRectPlanes explicitly after the
// chunky span fill -- otherwise PLANAR_PRESENT builds (and,
// post-Phase-9, every build) display flood-filled regions
// as the unfilled background.
{
int16_t spanW = (int16_t)(rightX - leftX + 1);
if (!halFastFillRect(s, leftX, y, (uint16_t)spanW, 1, newNibble)) {
fillRectClipped(s, leftX, y, spanW, 1, newNibble);
}
halFillRectPlanes(s, leftX, y, (uint16_t)spanW, 1, newNibble);
}
// Scan rows above and below for run boundaries. The hot
@ -291,19 +306,26 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
}
scanY = (int16_t)(y + 1);
}
scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)];
scanRow = (s->pixels != NULL) ? &s->pixels[SURFACE_ROW_OFFSET(scanY)] : NULL;
// Prefer the combined scan+push asm path (one call per
// scan, no markBuf and no per-pixel C edge walk).
if (!halFastFloodScanAndPush(scanRow, leftX, rightX,
// scan, no markBuf and no per-pixel C edge walk). Skip
// the asm tiers if we don't have a chunky row pointer
// (Phase 9 planar ports).
if (scanRow == NULL ||
!halFastFloodScanAndPush(scanRow, leftX, rightX,
matchColor, newNibble, matchEqual,
scanY, stackX, stackY,
&sp, FLOOD_STACK_SIZE)) {
if (!halFastFloodScanRow(scanRow, leftX, rightX,
if ((scanRow == NULL ||
!halFastFloodScanRow(scanRow, leftX, rightX,
matchColor, newNibble, matchEqual,
floodMarkBuf)) &&
!halFloodScanRowPlanes(s, leftX, rightX, scanY,
matchColor, newNibble, matchEqual,
floodMarkBuf)) {
// C fallback: fill markBuf the slow way.
for (i = 0; i < spanLen; i++) {
pix = srcPixel(scanRow, (int16_t)(leftX + i));
pix = halSamplePixel(s, (int16_t)(leftX + i), scanY);
pixMatch = (pix == matchColor);
floodMarkBuf[i] = (uint8_t)(matchEqual
? (pixMatch ? 1 : 0)
@ -621,12 +643,12 @@ void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t
if (!halFastFillRect(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex)) {
fillRectClipped(s, sx, sy, sw, sh, colorIndex);
}
halFillRectPlanes(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex);
surfaceMarkDirtyRect(s, sx, sy, sw, sh);
}
void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
uint8_t *row;
uint8_t seedColor;
if (s == NULL) {
@ -635,8 +657,9 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
return;
}
row = &s->pixels[SURFACE_ROW_OFFSET(y)];
seedColor = srcPixel(row, x);
/* halSamplePixel reads from whichever storage the port uses --
* works on both chunky (s->pixels) and planar (s->portData) ports. */
seedColor = halSamplePixel(s, x, y);
if ((seedColor & 0x0F) == (newColor & 0x0F)) {
return;
}
@ -645,7 +668,6 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor) {
uint8_t *row;
uint8_t pix;
if (s == NULL) {
@ -654,8 +676,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
return;
}
row = &s->pixels[SURFACE_ROW_OFFSET(y)];
pix = srcPixel(row, x);
pix = halSamplePixel(s, x, y);
// Starting on a boundary pixel or already-filled pixel: nothing
// to do.
if ((pix & 0x0F) == (boundaryColor & 0x0F)) {
@ -669,25 +690,16 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
/* Returns the colour index of the pixel at (x, y), or 0 when `s` is
 * NULL or the coordinate lies outside the surface. The read goes
 * through halSamplePixel so it does not depend on s->pixels, which
 * may be NULL on planar ports. */
uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    if (s == NULL) {
        return 0;
    }
    if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
        return 0;
    }
    /* halSamplePixel reads from whichever storage the port uses --
     * chunky ports return a nibble extracted from s->pixels; planar
     * ports read 4 plane bits and assemble the nibble. */
    return halSamplePixel(s, x, y);
}
@ -725,6 +737,8 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) {
}
}
}
halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0,
copyW, copyH, srcRowBytes, 0xFFFFu);
surfaceMarkDirtyRect(dst, x, y, copyW, copyH);
}
@ -768,6 +782,8 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t
}
}
}
halBlitRectPlanes(dst, x, y, src->pixels, srcX0, srcY0,
copyW, copyH, srcRowBytes, (uint16_t)transparent);
surfaceMarkDirtyRect(dst, x, y, copyW, copyH);
}

View file

@ -9,8 +9,11 @@
#ifndef JOEYLIB_HAL_H
#define JOEYLIB_HAL_H
#include <stdio.h>
#include "joey/core.h"
#include "joey/input.h"
#include "joey/sprite.h"
#include "joey/surface.h"
// Per-port one-shot initialization. Called from joeyInit after config
@ -27,17 +30,131 @@ void halShutdown(void);
// backs the library-owned stage surface. Ports that have a
// hardware-friendly pin location for the back buffer (IIgs $01/2000
// with SHR shadow inhibited) return that address here; ports with no
// such constraint just malloc/free.
// such constraint just malloc/free. Planar 68k ports may return NULL
// if the surface is planar-only and has no chunky shadow.
uint8_t *halStageAllocPixels(void);
void halStageFreePixels(uint8_t *pixels);
// Present the entire source surface to the display.
void halPresent(const SurfaceT *src);
// Allocate / release the per-surface portData blob (see SurfaceT in
// surfaceInternal.h). Chunky ports return NULL from Init -- they keep
// portData unused and operate on the chunky `pixels` buffer. Planar
// 68k ports allocate a per-surface struct here describing the
// bitplane storage (Amiga: 4 separate plane buffers + stride; ST: one
// interleaved buffer + stride). Called by surfaceCreate / stageAlloc
// after pixels is allocated; freed by surfaceDestroy / stageFree
// before pixels is freed. `isStage` lets the port short-circuit for
// the stage if its planes are display-owned (e.g. Amiga's BitMap
// planes from OpenScreen) rather than allocated per surface.
void *halSurfaceAllocPortData(SurfaceT *s, bool isStage);
void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData);
// Present a rectangular region of the source surface. The caller has
// already validated and clipped the rect to be fully inside the
// surface bounds and to have positive extents.
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h);
// Phase 3 planar dual-write: called from cross-platform fillRect AFTER
// the chunky shadow has been written, with the same already-clipped
// (x, y, w, h) and the raw color index 0..15. Planar ports update
// the bitplanes with the rect's bit pattern (per-plane bit value =
// (color >> plane) & 1). Chunky ports (DOS, IIgs) provide a no-op
// stub. Called unconditionally so cross-platform code doesn't have
// to know the port is planar; the per-port stub is the cheapest
// possible thing on chunky ports.
void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex);
// Phase 3 planar dual-write for surfaceCopy: called from cross-platform
// surfaceCopy AFTER the chunky pixel buffer is memcpy'd. Planar ports
// also memcpy the bitplanes from src to dst so JOEYLIB_PLANAR_PRESENT
// builds see correct planes. dst and src are non-NULL and distinct
// (caller's no-op guards already passed).
void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src);
// Phase 5 planar dual-write for tile ops. Called from cross-platform
// tile.c AFTER the chunky path completes. (bx, by) are tile-grid
// coords (0..39 horiz, 0..24 vert; surface is 40x25 tiles).
// transparentIndex for tileCopyMasked: pixel value to skip. tilePaste
// reads from a packed 32-byte chunky TileT (4 bytes/row x 8 rows).
// All Amiga impls operate on the off-screen shadow planes via
// AmigaPlanarT; chunky-port stubs are no-ops. tileSnap is read-only
// so has no planar dual-write hook.
void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex);
void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex);
void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile);
// tileSnap: cross-platform code reads s->pixels chunky bytes into a
// 32-byte TileT. On planar ports (s->pixels NULL) the chunky read
// crashes -- this hook is the planar derivation: reads bitplane bits
// for the tile rect and assembles 32 chunky bytes (4 bytes/row x 8
// rows) into chunkyTileOut. Chunky ports (s->pixels valid) implement
// this as a no-op since the cross-platform fallback already filled
// chunkyTileOut from s->pixels.
void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut);
// Phase 6 planar dual-write for spriteDraw. Called from cross-platform
// sprite.c AFTER spriteCompiledDraw or spriteDrawInterpreted has
// updated the chunky shadow. (x, y) is the destination top-left in
// surface pixels (may be partially off-surface; the hook does its own
// clipping). Walks the sprite's chunky tile data and updates dst
// surface planes for every non-transparent pixel (nibble != 0).
// This hook covers drawing only. Save/restore originally had NO planar
// dual-write -- after spriteSaveUnder + spriteDraw + spriteRestoreUnder
// under JOEYLIB_PLANAR_PRESENT, the planes still showed the sprite
// (chunky restored, planes unchanged). The plane-side save/restore is
// now handled by the Phase 9 halSpriteSavePlanes / halSpriteRestorePlanes
// hooks declared further down, which store plane bytes in the sprite
// backup buffer instead of a separate parallel plane backup.
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y);
// Phase 8 planar dual-write for asset blits and full surface loads.
// halBlitRectPlanes is called from surfaceBlit / surfaceBlitMasked
// AFTER the chunky path. transparent == 0xFFFF means opaque blit; any
// other value is a nibble (0..15) to skip. srcBytes is the asset's
// raw chunky pixel buffer; srcRowBytes is its stride. (x, y) is the
// already-clipped destination top-left in dst surface pixels;
// srcX0/srcY0 is where in the asset the visible region starts after
// clip; copyW/copyH is the visible region size in pixels.
//
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent);
// Phase 9 sprite save/restore plane data. On chunky ports the
// cross-platform memcpy already stores the saved pixels in
// backup->bytes. On planar ports (Amiga) s->pixels is NULL, so the
// chunky path leaves backup->bytes unused -- we repurpose it to hold
// per-plane bytes. Layout: 4 plane stripes of (h * bytesPerPlaneRow)
// bytes each, where bytesPerPlaneRow = w / 8 (spriteSaveUnder
// guarantees sprite x and w are 2-pixel aligned; planar requires
// further rounding to 8 pixels -- see Amiga impl notes). Total bytes:
// 4 * h * (w / 8) = h * (w / 2), the same as chunky, so the
// backup->sizeBytes capacity works on both kinds of port. Chunky-port
// impls are no-ops; Amiga writes / reads plane bytes via AmigaPlanarT.
void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
// Phase 9 reader hooks. Cross-platform code calls these instead of
// reading from s->pixels directly so it works regardless of whether
// the port stores chunky or planar as the source of truth. Chunky
// ports (DOS, IIgs) implement these reading from s->pixels (cheap);
// Amiga reads from the bitplanes in AmigaPlanarT. (x, y) bounds are
// already validated by the caller.
//
// halSamplePixel: returns the 0..15 nibble at (x, y).
// halSurfaceHash: returns the FNV-style hash of pixel + scb + palette
// that surfaceHash currently computes by walking s->pixels. Allows
// ports to use their native pixel storage instead.
// halSurfaceCopyChunky: cross-platform surfaceCopy used to memcpy
// s->pixels src->dst; on planar ports there is no chunky to copy
// (planes already covered by halSurfaceCopyPlanes). Chunky ports
// do the memcpy here; Amiga is a no-op.
// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
// fwrite of the pixel data. Chunky ports stream directly to/from
// s->pixels; Amiga uses a scratch buffer + c2p (load) or
// plane->chunky derivation (save).
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
uint32_t halSurfaceHash(const SurfaceT *s);
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
// Present the dirty regions of the source surface to the display.
// The cross-platform stagePresent walks the dirty arrays before
// calling this; ports may use the dirty arrays themselves to skip
// untouched rows.
void halPresent(const SurfaceT *src);
// Optional: returns a port-specific error message string for the last
// HAL failure, or NULL if none. Ports may return NULL always.
@ -73,9 +190,23 @@ uint16_t halFrameHz(void);
// Audio: per-port engine setup, module + SFX playback, teardown.
// halAudioInit returns true if the platform has a working engine.
// All entry points are safe to call when init failed -- they become
// no-ops. See joey/audio.h for the public API contract that wraps
// these.
// Per-surface chunky pixel allocation. Chunky ports (DOS, IIgs, ST
// while still chunky) allocate SURFACE_PIXELS_SIZE bytes (calloc-
// style, zero-filled). Pure-planar Amiga returns NULL -- there's no
// chunky shadow; cross-platform code that previously read s->pixels
// goes through halSamplePixel / halSurfaceCopyChunky / etc. instead.
// halSurfaceFreePixels mirrors free(); NULL is a valid input on
// planar ports.
uint8_t *halSurfaceAllocPixels(void);
void halSurfaceFreePixels(uint8_t *pixels);
// Get a pointer to the start of bitplane `planeIdx` (0..3) for surface
// `s`. Returns NULL on chunky ports (no planes). On Amiga returns
// pd->planes[planeIdx] from the AmigaPlanarT struct in portData.
// Used by the planar sprite codegen dispatcher to compute the 4
// plane addresses to hand the emitted asm.
uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx);
bool halAudioInit(void);
void halAudioShutdown(void);
void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop);
@ -185,6 +316,21 @@ bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y,
bool *seedMatched,
int16_t *leftXOut, int16_t *rightXOut);
// Planar variants of halFastFloodWalk / halFastFloodScanRow. Take a
// SurfaceT* instead of a chunky-row pointer so they work on planar
// ports (Amiga post-Phase 9) where s->pixels is NULL. Same semantics;
// chunky ports return false (the chunky variants above are faster
// when a chunky row is available). Replace the per-pixel
// halSamplePixel walk on planar ports.
bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y,
uint8_t matchColor, uint8_t newColor, bool matchEqual,
bool *seedMatched,
int16_t *leftXOut, int16_t *rightXOut);
bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY,
uint8_t matchColor, uint8_t newColor, bool matchEqual,
uint8_t *markBuf);
// surfaceBlit / surfaceBlitMasked rect-copy helper. Caller has done
// the clip math: dstRow0 / srcRow0 point at row 0 of the source/dest
// regions, dstX / srcX are intra-row pixel offsets, copyW/copyH are
@ -333,6 +479,12 @@ extern uint16_t gFloodRightX;
#undef halFastFloodScanAndPush
#define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false)
// IIgs is chunky; the planar flood hooks are never reachable.
#undef halFloodWalkPlanes
#define halFloodWalkPlanes(_s, _sx, _y, _mc, _nc, _me, _sm, _lx, _rx) (false)
#undef halFloodScanRowPlanes
#define halFloodScanRowPlanes(_s, _lx, _rx, _sy, _mc, _nc, _me, _mb) (false)
// Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX /
// gFloodRightX; macro reads those into the caller's out-ptrs.
#undef halFastFloodWalkAndScans

View file

@ -2,8 +2,7 @@
//
// stagePresent walks the per-row dirty bands set by drawing primitives
// and asks the port HAL to flip just those rows to the display, then
// resets the dirty state. stagePresentRect bypasses dirty tracking
// entirely and slams a caller-specified rectangle (after clipping).
// resets the dirty state.
#include <stddef.h>
@ -25,48 +24,3 @@ void stagePresent(void) {
halPresent(stage);
stageDirtyClearAll();
}
void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h) {
SurfaceT *stage;
int16_t sx;
int16_t sy;
int16_t sw;
int16_t sh;
stage = stageGet();
if (stage == NULL) {
return;
}
sx = x;
sy = y;
sw = (int16_t)w;
sh = (int16_t)h;
if (sw <= 0 || sh <= 0) {
return;
}
if (sx < 0) {
sw += sx;
sx = 0;
}
if (sy < 0) {
sh += sy;
sy = 0;
}
if (sx >= SURFACE_WIDTH || sy >= SURFACE_HEIGHT) {
return;
}
if (sx + sw > SURFACE_WIDTH) {
sw = SURFACE_WIDTH - sx;
}
if (sy + sh > SURFACE_HEIGHT) {
sh = SURFACE_HEIGHT - sy;
}
if (sw <= 0 || sh <= 0) {
return;
}
halPresentRect(stage, sx, sy, (uint16_t)sw, (uint16_t)sh);
}

View file

@ -10,6 +10,7 @@
#include "joey/sprite.h"
#include "codegenArenaInternal.h"
#include "hal.h"
#include "spriteInternal.h"
#include "surfaceInternal.h"
@ -22,6 +23,20 @@
// Color 0 is always transparent for sprites (DESIGN.md contract).
#define TRANSPARENT_NIBBLE 0
// On Amiga (post-Phase 9 / Phase 6 redux) the compiled sprite emitter
// writes directly to the bitplanes, so the halSpritePlanes hooks are
// pure duplicate work after a compiled call. On other ports the
// hooks are either no-op stubs (chunky-only IIgs/DOS) or the only
// thing writing planes (ST: chunky-shadow + planes). Slow / interpreter
// paths still need the hooks unconditionally on every platform -- the
// chunky interpreter is a no-op on Amiga (s->pixels NULL) so the hook
// is the only draw.
#if defined(JOEYLIB_PLATFORM_AMIGA)
#define COMPILED_SPRITE_WRITES_PLANES 1
#else
#define COMPILED_SPRITE_WRITES_PLANES 0
#endif
// ----- Prototypes -----
@ -144,6 +159,11 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y
return;
}
/* Skip the chunky write loop on planar ports (s->pixels == NULL).
* halSpriteDrawPlanes is called by the spriteDraw caller and does
* its own clip + plane write, so the dirty mark + planar update
* happen there. Phase 9 dropped the chunky shadow on Amiga. */
if (s->pixels != NULL) {
for (row = 0; row < h; row++) {
dstRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
for (col = 0; col < w; col++) {
@ -154,6 +174,7 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y
writeDstNibble(dstRow, (int16_t)(dx + col), nibble);
}
}
}
surfaceMarkDirtyRect(s, dx, dy, w, h);
}
@ -200,6 +221,13 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y,
if (src == NULL || widthTiles == 0 || heightTiles == 0) {
return NULL;
}
/* Phase 9: planar ports have NULL src->pixels. Capturing a sprite
* from such a surface needs a planar-to-chunky derivation hook;
* not implemented yet, so refuse the call. Apps targeting Amiga
* should ship sprites as static tile data instead. */
if (src->pixels == NULL) {
return NULL;
}
// Source x/y must be on a tile boundary so each captured tile lands
// on whole bytes -- mid-byte snapshots would lose half a pixel at
// the left edge.
@ -284,10 +312,14 @@ void spriteDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y) {
// need clip math (they walk fixed offsets).
if (sp->slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
spriteCompiledDraw(s, sp, x, y);
if (!COMPILED_SPRITE_WRITES_PLANES) {
halSpriteDrawPlanes(s, sp, x, y);
}
surfaceMarkDirtyRect(s, x, y, (int16_t)widthPx, (int16_t)heightPx);
return;
}
spriteDrawInterpreted(s, sp, x, y);
halSpriteDrawPlanes(s, sp, x, y);
}
@ -332,7 +364,7 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac
uint16_t saveIdx;
uint16_t drawIdx;
uint8_t *offsetsBase;
shift = (uint8_t)(x & 1);
shift = SPRITE_SHIFT_INDEX(x);
saveIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
drawIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_DRAW);
offsetsBase = (uint8_t *)sp->routineOffsets;
@ -340,6 +372,10 @@ void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBac
*(uint16_t *)(offsetsBase + (drawIdx << 1)) != SPRITE_NOT_COMPILED) {
spriteCompiledSaveUnder(s, sp, x, y, backup);
spriteCompiledDraw (s, sp, x, y);
if (!COMPILED_SPRITE_WRITES_PLANES) {
halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes);
halSpriteDrawPlanes(s, sp, x, y);
}
surfaceMarkDirtyRect (s, x, y, (int16_t)widthPx, (int16_t)heightPx);
return;
}
@ -630,13 +666,18 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
if (routeOffset != SPRITE_NOT_COMPILED) {
spriteCompiledRestoreUnder(s, backup);
if (!COMPILED_SPRITE_WRITES_PLANES) {
halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes);
}
surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
return;
}
}
/* Slow / interpreted memcpy fallback. */
{
/* Slow / interpreted memcpy fallback. Skip the chunky memcpy if
* the port has no chunky shadow (Phase 9 Amiga: s->pixels NULL);
* halSpriteRestorePlanes below does the planar restore. */
if (s->pixels != NULL) {
int16_t row;
int16_t byteStart;
uint8_t *dstRow;
@ -650,6 +691,7 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
(size_t)copyBytes);
}
}
halSpriteRestorePlanes(s, bx, by, bw, bh, backup->bytes);
surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
}
@ -684,11 +726,14 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
uint16_t routeIdx;
uint16_t routeOffset;
shift = (uint8_t)(x & 1);
shift = SPRITE_SHIFT_INDEX(x);
routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
if (routeOffset != SPRITE_NOT_COMPILED) {
spriteCompiledSaveUnder(s, sp, x, y, backup);
if (!COMPILED_SPRITE_WRITES_PLANES) {
halSpriteSavePlanes(s, backup->x, backup->y, backup->width, backup->height, backup->bytes);
}
return;
}
}
@ -744,11 +789,16 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
// backup with bytes==NULL.
return;
}
/* Chunky save path: skip on planar ports (s->pixels NULL).
* halSpriteSavePlanes below covers the planar case. */
if (s->pixels != NULL) {
for (row = 0; row < h; row++) {
srcRow = &s->pixels[(dy + row) * SURFACE_BYTES_PER_ROW];
memcpy(&backup->bytes[(uint16_t)row * (uint16_t)copyBytes],
&srcRow[byteStart],
(size_t)copyBytes);
}
}
halSpriteSavePlanes(s, clippedX, dy, (uint16_t)clippedW, (uint16_t)h, backup->bytes);
} /* end slow path */
}

View file

@ -13,6 +13,16 @@
#define SPRITE_OP_RESTORE 2
#define SPRITE_OP_COUNT 3
// Per-platform shift index used by the dispatcher. Chunky 4bpp ports
// pack two pixels per byte (one nibble each), so the only sub-byte
// alignment is x % 2. Amiga planar packs 8 pixels per plane byte, so
// all 8 alignments (x % 8) matter.
#if defined(JOEYLIB_PLATFORM_AMIGA)
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 7))
#else
#define SPRITE_SHIFT_INDEX(x) ((uint8_t)((x) & 1))
#endif
// Sentinel stored in routineOffsets[shift][op] when that op's emitter
// returned 0 bytes (i.e., the platform doesn't implement compiled
// codegen for that op yet). Distinct from a real offset of 0, which

View file

@ -65,9 +65,10 @@ void surfaceCopy(SurfaceT *dst, const SurfaceT *src) {
if (dst == NULL || src == NULL || dst == src) {
return;
}
memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE);
halSurfaceCopyChunky(dst, src); /* memcpy on chunky ports; no-op on planar */
memcpy(dst->scb, src->scb, sizeof(src->scb));
memcpy(dst->palette, src->palette, sizeof(src->palette));
halSurfaceCopyPlanes(dst, src); /* 4 plane memcpys on planar ports; no-op on chunky */
surfaceMarkDirtyAll(dst);
}
@ -79,11 +80,10 @@ SurfaceT *surfaceCreate(void) {
if (s == NULL) {
return NULL;
}
s->pixels = (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
if (s->pixels == NULL) {
free(s);
return NULL;
}
/* halSurfaceAllocPixels returns NULL on planar ports (Amiga); the
* primary storage is the port-allocated planes via portData below. */
s->pixels = halSurfaceAllocPixels();
s->portData = halSurfaceAllocPortData(s, false);
paletteInitDefault(s);
return s;
}
@ -96,11 +96,44 @@ void surfaceDestroy(SurfaceT *s) {
if (s == gStage) {
return;
}
free(s->pixels);
halSurfaceFreePortData(s, false, s->portData);
halSurfaceFreePixels(s->pixels);
free(s);
}
// Cheapest deterministic hash that still detects per-byte changes:
// (hash << 1) ^ byte, a single 16-bit accumulator. ORCA-C / 65816
// compiles to ASL + EOR -- about 35 cyc per byte. A 32-bit multiply
// FNV-style hash takes ~200 cyc per byte via ~UMUL4, which adds
// 80+ seconds to a UBER run on IIgs. Discrimination is weaker than
// FNV but plenty for cross-port validation: we only need "did the
// same logical-pixel sequence produce the same hash?" -- not
// crypto-grade collision resistance over arbitrary inputs.
//
// The original implementation walked the chunky pixel buffer
// byte-by-byte, the same logical-pixel ordering on every chunky-format
// port (IIgs, DOS, Amiga and ST while still chunky). Now that the
// planar rewrite has dropped s->pixels on Amiga, the pixel walk lives
// behind the halSurfaceHash HAL hook so planar ports can read planes
// natively while producing the same logical hash.
/* Cross-port FNV-style hash of pixels + SCB + palette. The hash logic
* (multiplier streams, byte ordering for palette) is identical across
* ports, but the pixel READS go through the port HAL so chunky ports
* walk s->pixels and planar ports walk plane bits and assemble nibble
* pairs into chunky bytes for the hash. Both produce the same logical-
* pixel hash because they hash the same logical pixel sequence in the
* same chunky byte order. SCB and palette are still hashed inline
* here because they live in the SurfaceT struct on every port (no
* port-specific storage) and the byte/value-with-explicit-byte-order
* walks are already endian-independent. */
uint32_t surfaceHash(const SurfaceT *s) {
    /* A NULL surface hashes to 0; for a real surface the port HAL
     * computes the hash from whatever pixel storage it owns. */
    return (s != NULL) ? halSurfaceHash(s) : 0u;
}
bool surfaceLoadFile(SurfaceT *dst, const char *path) {
FILE *fp;
long fileSize;
@ -125,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
fclose(fp);
return false;
}
if (fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) {
if (!halSurfaceLoadFileChunky(dst, fp)) {
fclose(fp);
return false;
}
@ -153,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
if (fp == NULL) {
return false;
}
if (fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) != SURFACE_PIXELS_SIZE) {
if (!halSurfaceSaveFileChunky(src, fp)) {
fclose(fp);
return false;
}
@ -228,13 +261,14 @@ bool stageAlloc(void) {
if (gStage == NULL) {
return false;
}
/* halStageAllocPixels returns NULL on planar ports (Amiga) where
* the chunky shadow doesn't exist; the planes from portData are
* the source of truth. NULL pixels is no longer a failure. */
gStage->pixels = halStageAllocPixels();
if (gStage->pixels == NULL) {
free(gStage);
gStage = NULL;
return false;
}
if (gStage->pixels != NULL) {
memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE);
}
gStage->portData = halSurfaceAllocPortData(gStage, true);
stageDirtyClearAll();
paletteInitDefault(gStage);
return true;
@ -255,6 +289,7 @@ void stageFree(void) {
if (gStage == NULL) {
return;
}
halSurfaceFreePortData(gStage, true, gStage->portData);
halStageFreePixels(gStage->pixels);
free(gStage);
gStage = NULL;

View file

@ -14,8 +14,17 @@
// auto-mirroring to $E1). Caller-side `s->pixels[i]` syntax is
// unchanged; only allocation/copy paths in surface.c shift to a
// two-buffer model.
//
// portData is per-port opaque storage. On chunky ports (IIgs, DOS) it
// stays NULL -- pixels is the source of truth. On planar ports
// (Amiga, Atari ST) it points to a port-private struct describing the
// 4 bitplanes (Amiga: 4 separate plane buffers + stride; ST: single
// interleaved buffer + stride). Cross-platform code never touches it
// directly -- all primitive access goes through halFast* on planar
// ports. See project_planar_68k_plan.md for the full architecture.
struct SurfaceT {
// Chunky pixel buffer. NULL on ports with no chunky storage (planar
// Amiga), so cross-platform code must check before dereferencing.
uint8_t *pixels;
// Per-port opaque storage; stays NULL on chunky ports.
void *portData;
// One byte per surface row.
uint8_t scb[SURFACE_HEIGHT];
// SURFACE_PALETTE_COUNT palettes of SURFACE_COLORS_PER_PALETTE 16-bit
// entries each.
uint16_t palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
};
@ -38,6 +47,18 @@ struct SurfaceT {
extern uint8_t gStageMinWord[SURFACE_HEIGHT];
extern uint8_t gStageMaxWord[SURFACE_HEIGHT];
// Per-byte mixer for surfaceHash. Two streams: lo = lo * 31 + b and
// hi = hi * 251 + b, each truncated to 16 bits.
// Strength-reduced to shifts so ORCA-C doesn't emit `~UMUL2` (~150 cyc
// per call); 32 KB hashed twice -> ~5 minutes per UBER run. The
// shift form is 16-bit-equivalent (mod 2^16) so hash values are
// identical to the original `* 31u` / `* 251u` form.
// lo * 31 == (lo << 5) - lo
// hi * 251 == (hi << 8) - (hi << 2) - hi
// Every argument is expanded more than once: pass plain lvalues for
// lo_ / hi_ and a side-effect-free expression for b_.
#define SURFACE_HASH_MIX_BYTE(lo_, hi_, b_) do { \
(lo_) = (uint16_t)(((((lo_) << 5) - (lo_)) + (b_))); \
(hi_) = (uint16_t)((((hi_) << 8) - ((hi_) << 2) - (hi_)) + (b_)); \
} while (0)
// Stage SCB / palette dirty flags. scbSet* and paletteSet set them
// true when the stage's data is modified; the per-port present code
// checks the flags and clears after upload. Replaces a per-frame
@ -50,6 +71,15 @@ extern bool gStagePaletteDirty;
// bands are widened to cover the rect. If `s` is any other surface,
// the call is a no-op -- non-stage surfaces never get presented, so
// they don't carry dirty state.
//
// Planar ports rely on the chunky shadow + c2p path through Phase 8.
// Planar-native primitives (Phases 3+) dual-write: they update both
// the chunky pixels and the bitplanes in the same call, so c2p at
// present time always derives correct planes from up-to-date chunky.
// Phase 9 deletes the chunky shadow + c2p; only at that point will
// per-row planar-vs-chunky tracking even be a possible question, and
// the plan is to avoid it entirely there too (planes become the only
// source of truth).
void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h);
// Shorthand for "every row, full width" -- used by surfaceClear and

View file

@ -147,6 +147,7 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
if (!halFastTileCopy(dstRow0, srcRow0)) {
copyTileOpaque(dstRow0, srcRow0);
}
halTileCopyPlanes(dst, dstBx, dstBy, src, srcBx, srcBy);
surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
}
@ -178,6 +179,7 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
copyTileMasked(dstRow0, srcRow0, transparentIndex);
}
halTileCopyMaskedPlanes(dst, dstBx, dstBy, src, srcBx, srcBy, transparentIndex);
surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY,
TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
}
@ -209,6 +211,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
row += SURFACE_BYTES_PER_ROW;
}
}
halTileFillPlanes(s, bx, by, colorIndex);
surfaceMarkDirtyRect(s, (int16_t)pixelX, (int16_t)pixelY,
TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
}
@ -241,6 +244,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
src += TILE_BYTES_PER_ROW;
}
}
halTilePastePlanes(dst, bx, by, &in->pixels[0]);
surfaceMarkDirtyRect(dst, (int16_t)pixelX, (int16_t)pixelY,
TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE);
}
@ -261,9 +265,12 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
}
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
dst = &out->pixels[0];
if (!halFastTileSnap(dst, srcRow)) {
/* On planar ports (s->pixels NULL) the chunky read path is
* skipped; halTileSnapPlanes below derives the tile bytes from
* the bitplanes. */
if (src->pixels != NULL && !halFastTileSnap(dst, &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)])) {
srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
dst[0] = srcRow[0];
dst[1] = srcRow[1];
@ -273,4 +280,5 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
dst += TILE_BYTES_PER_ROW;
}
}
halTileSnapPlanes(src, bx, by, &out->pixels[0]);
}

270
src/port/amiga/circle.s Normal file
View file

@ -0,0 +1,270 @@
| Amiga planar circle outline V4 -- 16-way color-specialized.
|
| Per Bresenham iter:
| 1. Precompute 4 xp records (xp_byte_w + bitMask_b + notMask_b) for
| cx +/- bx and cx +/- by, stored at sp+0..15 (4 records x 4 bytes).
| 2. Precompute 4 yp40 words for cy +/- by and cy +/- bx, stored at
| sp+16..23 (4 words x 2 bytes).
| 3. Plot 8 octant pixels with hardcoded color: each pixel does 4
| branchless plane RMW ops (or.b for set bits, and.b for clear
| bits) -- no btst, no per-plane branch.
| 4. Bresenham step.
|
| At function entry the color is masked to 4 bits and used as the index
| into a 16-entry jump table that selects the matching main loop.
| Each main loop has the color hardcoded into the per-plane RMW ops.
|
| The branchless plot saves ~20-28 cyc per plane vs V3's btst+branch
| pattern -- ~640-900 cyc per Bresenham iter.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kAmigaCircleOutline(uint8_t *p0, uint8_t *p1,
| uint8_t *p2, uint8_t *p3,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| Register allocation across the iter loop:
| d2.w = bx (Bresenham)
| d3.w = by (Bresenham)
| d4.w = err (Bresenham)
| d5.w = cx (cached)
| a4 = cy (cached, sign-extended)
| a0..a3 = plane bases
| a5 = bitMaskLut
| d0,d1,d6,d7 = scratch in precompute / plot
|
| Scratch block (24 bytes) at sp+0..23:
| sp+0..3: xp1 record [xp_byte_w, bitMask_b, notMask_b] for cx+bx
| sp+4..7: xp2 record for cx-bx
| sp+8..11: xp3 record for cx+by
| sp+12..15: xp4 record for cx-by
| sp+16..17: yp1 word (cy+by) * 40
| sp+18..19: yp2 word (cy-by) * 40
| sp+20..21: yp3 word (cy+bx) * 40
| sp+22..23: yp4 word (cy-bx) * 40
.text
| ---- XP_REC: build xp record at sp+slot for xp = cx <signOp> <xreg> ----
| Record layout (4 bytes): word xp >> 3 (byte column inside a plane row),
| byte bitMask (the pixel's bit within that byte), byte notMask (~bitMask).
| Inputs: d5.w = cx, a5 = bitMaskLut (8 entries, indexed by xp & 7).
| signOp: add or sub
| xreg: %d2 (bx) or %d3 (by)
| slot: 0, 4, 8, or 12
| Trashes: d1, d6, d7 (d0 is not touched by this macro)
| No clipping happens here: lsr.w is a logical shift, so a negative xp
| becomes a large byte column.
| NOTE(review): presumably the C caller only dispatches fully on-surface
| circles to this routine -- confirm.
.macro XP_REC slot, signOp, xreg
move.w %d5,%d6
\signOp\().w \xreg,%d6 | d6 = xp
move.w %d6,%d7
lsr.w #3,%d7 | d7 = xp >> 3 (xp_byte)
and.w #7,%d6 | d6 = xp & 7
move.b (%a5,%d6.w),%d6 | d6 = bitMask
move.b %d6,%d1
not.b %d1 | d1 = notMask
move.w %d7,\slot(%sp) | xp_byte word
move.b %d6,\slot+2(%sp) | bitMask byte
move.b %d1,\slot+3(%sp) | notMask byte
.endm
| ---- YP_REC: build yp40 word at sp+slot for yp = cy <signOp> <yreg> ----
| Stores yp * 40, computed as (yp << 5) + (yp << 3). PLOT_FIXED adds the
| xp byte column to this word to index into each plane, so 40 is the
| plane row pitch in bytes that this file hardcodes.
| Inputs: a4 = cy (sign-extended long; only the low word takes part).
| signOp: add or sub
| yreg: %d2 (bx) or %d3 (by)
| slot: 16, 18, 20, or 22
| Trashes: d0, d6
.macro YP_REC slot, signOp, yreg
move.l %a4,%d6
\signOp\().w \yreg,%d6 | d6.w = yp
move.w %d6,%d0
lsl.w #3,%d6 | d6 = yp << 3
lsl.w #5,%d0 | d0 = yp << 5
add.w %d6,%d0 | d0 = yp * 40
move.w %d0,\slot(%sp)
.endm
| ---- PLOT_FIXED: plot one pixel with hardcoded 4-bit color ----
| slotYp: 16, 18, 20, or 22 (yp40 word slot)
| slotXp: 0, 4, 8, or 12 (xp record slot)
| color: literal 0..15
| Trashes: d0, d1, d7
| Requires: a0..a3 = plane 0..3 bases, and the xp record / yp40 word for
| this iteration already stored on the stack by XP_REC / YP_REC.
| Every .if is resolved at assembly time, so each expansion holds exactly
| one read-modify-write per plane: or.b with bitMask where the color bit
| is 1, and.b with notMask where it is 0. No bounds checks are made.
.macro PLOT_FIXED slotYp, slotXp, color
move.w \slotYp(%sp),%d0 | d0 = yp40
add.w \slotXp(%sp),%d0 | d0 += xp_byte
move.b \slotXp+2(%sp),%d1 | d1.b = bitMask
move.b \slotXp+3(%sp),%d7 | d7.b = notMask
.if ((\color) & 1)
or.b %d1,(%a0,%d0.w)
.else
and.b %d7,(%a0,%d0.w)
.endif
.if ((\color) & 2)
or.b %d1,(%a1,%d0.w)
.else
and.b %d7,(%a1,%d0.w)
.endif
.if ((\color) & 4)
or.b %d1,(%a2,%d0.w)
.else
and.b %d7,(%a2,%d0.w)
.endif
.if ((\color) & 8)
or.b %d1,(%a3,%d0.w)
.else
and.b %d7,(%a3,%d0.w)
.endif
.endm
| ---- PLOT_8: plot all 8 octant pixels for a given hardcoded color ----
| Pairs each yp40 slot (16..22) with the xp record slot (0..12) that
| forms one of the eight mirrored points. When bx == by, or when by is 0,
| some of the eight points coincide; plotting them again is harmless
| because the or.b / and.b writes are idempotent.
.macro PLOT_8 color
PLOT_FIXED 16, 0, \color | (cx+bx, cy+by)
PLOT_FIXED 16, 4, \color | (cx-bx, cy+by)
PLOT_FIXED 18, 0, \color | (cx+bx, cy-by)
PLOT_FIXED 18, 4, \color | (cx-bx, cy-by)
PLOT_FIXED 20, 8, \color | (cx+by, cy+bx)
PLOT_FIXED 20, 12, \color | (cx-by, cy+bx)
PLOT_FIXED 22, 8, \color | (cx+by, cy-bx)
PLOT_FIXED 22, 12, \color | (cx-by, cy-bx)
.endm
| ---- CO_BODY: full Bresenham loop body for a hardcoded color ----
| Generates the per-iter precompute, branchless plot, and Bresenham
| step. Uses unique labels via \color suffix.
| Register contract: d2.w = bx, d3.w = by, d4.w = err, d5.w = cx,
| a4 = cy, a0..a3 = planes, a5 = bitMaskLut. Both exits branch back to
| the matching loop header label, which owns the termination test.
.macro CO_BODY color
XP_REC 0, add, %d2 | xp1 = cx+bx
XP_REC 4, sub, %d2 | xp2 = cx-bx
XP_REC 8, add, %d3 | xp3 = cx+by
XP_REC 12, sub, %d3 | xp4 = cx-by
YP_REC 16, add, %d3 | yp1 = cy+by
YP_REC 18, sub, %d3 | yp2 = cy-by
YP_REC 20, add, %d2 | yp3 = cy+bx
YP_REC 22, sub, %d2 | yp4 = cy-bx
PLOT_8 \color
addq.w #1,%d3 | by++
tst.w %d4 | err > 0 ?
bgt .LcoDecX_\color | yes: also step bx inward
add.w %d3,%d4
add.w %d3,%d4
addq.w #1,%d4 | err += 2*by + 1
bra.w .LcoLoop_\color
.LcoDecX_\color:
subq.w #1,%d2 | bx--
add.w %d3,%d4
add.w %d3,%d4
sub.w %d2,%d4
sub.w %d2,%d4
addq.w #1,%d4 | err += 2*by - 2*bx + 1
bra.w .LcoLoop_\color
.endm
| ---- CO_LOOP_HDR: emit a labelled loop header for a color ----
| Loop condition: keep iterating while bx >= by, compared as SIGNED
| words. The signed test matters for r == 0: err starts at 1, so the
| first step decrements bx to -1. An unsigned test would read that as
| 0xFFFF, never see bx < by, and keep plotting far outside the plane
| buffers. For every radius that fits on the surface bx and by stay in
| 0..32767, where signed and unsigned comparisons agree.
.macro CO_LOOP_HDR color
.LcoLoop_\color:
cmp.w %d3,%d2 | flags from bx - by
blt.w .LcoDone | bx < by (signed): all octants done
CO_BODY \color
.endm
| ---- Function entry ----
| Stack frame after the prologue, measured from sp:
|   0 .. SP_LOCAL-1     scratch block used by XP_REC / YP_REC / PLOT_FIXED
|   SP_LOCAL ..         11 saved registers (SP_SAVED = 11 * 4 bytes)
|   then 4 bytes        return address
|   SP_OFF ..           arguments, one 32-bit stack slot each
| The +2 / +3 adjustments select the low word / low byte of a 32-bit
| big-endian argument slot.
.equ SP_SAVED, 44
.equ SP_LOCAL, 24
.equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL)
.equ SP_P0, SP_OFF + 0
.equ SP_P1, SP_OFF + 4
.equ SP_P2, SP_OFF + 8
.equ SP_P3, SP_OFF + 12
.equ SP_CX, SP_OFF + 16 + 2
.equ SP_CY, SP_OFF + 20 + 2
.equ SP_R, SP_OFF + 24 + 2
.equ SP_COLOR, SP_OFF + 28 + 3
.globl _surface68kAmigaCircleOutline
_surface68kAmigaCircleOutline:
| Prologue: save callee-save registers, then reserve the scratch block.
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_LOCAL(%sp),%sp
| Plane bases.
move.l SP_P0(%sp),%a0
move.l SP_P1(%sp),%a1
move.l SP_P2(%sp),%a2
move.l SP_P3(%sp),%a3
lea bitMaskLut(%pc),%a5
| Cache cx in d5, cy (sign-extended) in a4.
move.w SP_CX(%sp),%d5
move.w SP_CY(%sp),%d6
ext.l %d6
movea.l %d6,%a4
| Bresenham init.
move.w SP_R(%sp),%d2 | bx = r
moveq #0,%d3 | by = 0
moveq #1,%d4
sub.w %d2,%d4 | err = 1 - bx
| Dispatch on color (low 4 bits) -> one of 16 main loops.
| Each table entry is a bra.w (4 bytes), so index *= 4.
moveq #0,%d6
move.b SP_COLOR(%sp),%d6
and.w #0x0F,%d6
add.w %d6,%d6
add.w %d6,%d6
lea .LcoTable(%pc),%a6
jmp 0(%a6,%d6.w)
| Jump table: entry N branches to the loop specialized for color N.
| Entries must stay in color order and remain 4 bytes each.
.LcoTable:
bra.w .LcoLoop_0
bra.w .LcoLoop_1
bra.w .LcoLoop_2
bra.w .LcoLoop_3
bra.w .LcoLoop_4
bra.w .LcoLoop_5
bra.w .LcoLoop_6
bra.w .LcoLoop_7
bra.w .LcoLoop_8
bra.w .LcoLoop_9
bra.w .LcoLoop_10
bra.w .LcoLoop_11
bra.w .LcoLoop_12
bra.w .LcoLoop_13
bra.w .LcoLoop_14
bra.w .LcoLoop_15
| The 16 color-specialized loops. Each one ends in an unconditional
| branch back to its own header, so control never falls through from
| one loop into the next; the only way out is the header's branch to
| .LcoDone.
CO_LOOP_HDR 0
CO_LOOP_HDR 1
CO_LOOP_HDR 2
CO_LOOP_HDR 3
CO_LOOP_HDR 4
CO_LOOP_HDR 5
CO_LOOP_HDR 6
CO_LOOP_HDR 7
CO_LOOP_HDR 8
CO_LOOP_HDR 9
CO_LOOP_HDR 10
CO_LOOP_HDR 11
CO_LOOP_HDR 12
CO_LOOP_HDR 13
CO_LOOP_HDR 14
CO_LOOP_HDR 15
| Epilogue: drop the scratch block, restore registers, return.
.LcoDone:
lea SP_LOCAL(%sp),%sp
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.align 2
| bitMaskLut[xp & 7]: bit of pixel xp within its plane byte; the
| leftmost pixel of a byte is the most significant bit.
bitMaskLut:
.byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01

File diff suppressed because it is too large Load diff

View file

@ -526,26 +526,6 @@ void halPresent(const SurfaceT *src) {
}
// Presents the pixel rectangle (x, y, w, h) of src through c2pRange.
// Does nothing when src is NULL or the video mode has not been set.
// The palette state is refreshed before any pixels are converted.
// NOTE(review): x is shifted as a signed value and y / h are passed on
// unclamped -- presumably callers hand in an already-clipped rect; confirm.
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
uint16_t groupStart;
uint16_t groupEnd;
if (src == NULL || !gModeSet) {
return;
}
refreshPaletteStateIfNeeded(src);
// Each c2p group covers 16 horizontal pixels. Round dirty pixel
// range to the enclosing group range to keep the planar word
// alignment without missing edge pixels.
groupStart = (uint16_t)(x >> 4);
groupEnd = (uint16_t)(((uint16_t)x + w + 15) >> 4);
// Clamp so a rect touching the right edge never runs past the row.
if (groupEnd > ST_GROUPS_PER_ROW) {
groupEnd = ST_GROUPS_PER_ROW;
}
// Row range is passed as (y, y + h); column range as group indices.
c2pRange(src, y, y + (int16_t)h, groupStart, groupEnd);
}
// Vsync() is XBIOS opcode 37; mintlib exposes it directly. It blocks
// until the next 50 Hz (PAL) or 60 Hz (NTSC) vertical blank.
void halWaitVBL(void) {
@ -730,6 +710,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t
}
// Planar flood-walk hook, unimplemented on this port: ignores every
// argument, leaves *seedMatched / *leftXOut / *rightXOut untouched and
// returns false.
// NOTE(review): false presumably means "not handled, use the chunky
// path" -- confirm against the cross-platform caller.
bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
(void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual;
(void)seedMatched; (void)leftXOut; (void)rightXOut;
return false;
}
// Planar flood scan-row hook, unimplemented on this port: ignores every
// argument, writes nothing to markBuf and returns false.
bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
(void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual;
(void)markBuf;
return false;
}
bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
(void)row;
(void)leftX;
@ -798,6 +792,146 @@ bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) {
}
// Phase-1 planar plumbing: portData hooks declared and exported, but
// returning NULL keeps the ST port operating in the legacy
// chunky-with-c2p model. Phase 4 replaces this with an interleaved
// planar buffer + stride blob, and rewrites every halFast* primitive
// to read/write planes directly.
// Port-data allocation hook. This port keeps no per-surface port data,
// so both arguments are ignored and NULL is always returned.
void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
(void)s;
(void)isStage;
return NULL;
}
// Port-data release hook. Nothing was allocated, so nothing is freed;
// all arguments are ignored.
void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
(void)s;
(void)isStage;
(void)portData;
}
// ST planar dual-write isn't implemented yet (interleaved word-planar
// layout needs a different code path than Amiga's separate plane
// buffers). Stub for now; chunky shadow + c2p still drives display.
// Plane-side hooks. On this port every one of them is a no-op that
// ignores all of its arguments; the (void) casts only silence
// unused-parameter warnings.
void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
(void)s;
(void)x;
(void)y;
(void)w;
(void)h;
(void)colorIndex;
}
// No-op: no plane data to copy between surfaces.
void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
(void)dst;
(void)src;
}
// No-op tile fill.
void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
(void)s; (void)bx; (void)by; (void)colorIndex;
}
// No-op tile copy.
void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
(void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
}
// No-op masked tile copy.
void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
(void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
}
// No-op tile paste.
void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
(void)dst; (void)bx; (void)by; (void)chunkyTile;
}
// No-op: chunkyTileOut is left exactly as the caller passed it.
void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
(void)src; (void)bx; (void)by; (void)chunkyTileOut;
}
// No-op sprite draw.
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
(void)s; (void)sp; (void)x; (void)y;
}
// No-op rect blit.
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
(void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
(void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
}
// No-op: nothing is written to dstPlaneBytes.
void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
(void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
}
// No-op: srcPlaneBytes is never read.
void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
(void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
}
/* Phase 9 chunky reader hooks -- ST is still chunky-shadow + c2p,
* so reads come from s->pixels just like DOS / IIgs. */
// Returns the 4-bit color index stored at (x, y) in the packed chunky
// buffer: two pixels per byte, even x in the high nibble and odd x in
// the low nibble. No bounds checking is performed.
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    const uint8_t packed = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];

    if ((x & 1) == 0) {
        return (uint8_t)(packed >> 4);
    }
    return (uint8_t)(packed & 0x0Fu);
}
/* Folds a surface into a 32-bit hash by feeding bytes through
 * SURFACE_HASH_MIX_BYTE in a fixed order: the chunky pixel buffer, then
 * SURFACE_HEIGHT bytes of s->scb, then SURFACE_PALETTE_ENTRIES 16-bit
 * palette words. The two 16-bit accumulators start from fixed seeds and
 * are returned as (hi << 16) | lo. */
uint32_t halSurfaceHash(const SurfaceT *s) {
uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
const uint8_t *p;
const uint16_t *w;
uint8_t b;
p = s->pixels;
/* Pixels are consumed eight bytes per pass (manually unrolled). Any
 * remainder of SURFACE_PIXELS_SIZE modulo 8 would not be hashed, and
 * the do/while always runs at least one pass. */
blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
do {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
blocks--;
} while (blocks > 0u);
/* One scb byte per scanline. */
p = s->scb;
for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
/* Palette words are split with shifts, high byte first, so the byte
 * order fed to the mixer does not depend on host endianness.
 * NOTE(review): walks the 2-D palette as one flat run starting at
 * [0][0]; assumes SURFACE_PALETTE_ENTRIES is the total word count --
 * confirm. */
w = &s->palette[0][0];
for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
v = *w++;
b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
return ((uint32_t)hi << 16) | (uint32_t)lo;
}
// Copies the packed pixel buffer (SURFACE_PIXELS_SIZE bytes) from src
// to dst. Only the pixels are copied here.
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    uint8_t       *to   = dst->pixels;
    const uint8_t *from = src->pixels;

    memcpy(to, from, SURFACE_PIXELS_SIZE);
}

// Reads exactly SURFACE_PIXELS_SIZE bytes from fp into dst's pixel
// buffer. Returns true only when the full buffer was read.
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    size_t got = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    if (got != SURFACE_PIXELS_SIZE) {
        return false;
    }
    return true;
}

// Writes src's pixel buffer (SURFACE_PIXELS_SIZE bytes) to fp.
// Returns true only when every byte was written.
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    size_t put = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    if (put != SURFACE_PIXELS_SIZE) {
        return false;
    }
    return true;
}
// Allocates one zero-filled pixel buffer of SURFACE_PIXELS_SIZE bytes.
// Returns NULL when the allocation fails.
uint8_t *halSurfaceAllocPixels(void) {
    void *zeroed = calloc(1, SURFACE_PIXELS_SIZE);

    return (uint8_t *)zeroed;
}

// Releases a pixel buffer with free(); passing NULL is harmless.
void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
}

// This port has no plane buffers: every plane index maps to NULL.
uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)planeIdx;
    (void)s;
    return NULL;
}

// Allocates the stage pixel buffer. Unlike halSurfaceAllocPixels the
// memory is NOT zeroed. Returns NULL when the allocation fails.
uint8_t *halStageAllocPixels(void) {
    void *raw = malloc(SURFACE_PIXELS_SIZE);

    return (uint8_t *)raw;
}

View file

@ -244,21 +244,6 @@ void halPresent(const SurfaceT *src) {
}
// Presents the rectangle (x, y, w, h) of src one scanline at a time:
// each row py in y .. y+h-1 is expanded by expandAndWriteLine into the
// VGA row starting at gVgaMem[py * VGA_STRIDE]. Does nothing when src
// is NULL or VGA memory is not mapped. The palette is uploaded (if
// needed) before any pixels are written.
// NOTE(review): x, y, w and h are not clamped here -- presumably the
// caller passes an already-clipped rect; confirm.
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
int16_t py;
int16_t yEnd;
if (src == NULL || gVgaMem == NULL) {
return;
}
uploadPaletteIfNeeded(src);
yEnd = y + (int16_t)h;
for (py = y; py < yEnd; py++) {
expandAndWriteLine(src, py, x, w, &gVgaMem[py * VGA_STRIDE]);
}
}
// VGA mode 13h vertical refresh on a real CRT runs at ~70 Hz. We
// detect the start of vertical retrace by polling input status
// register 1 ($3DA) bit 3: 1 = currently in vretrace. To get a
@ -423,6 +408,20 @@ bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t
}
// Planar flood-walk hook, unimplemented on this port: ignores every
// argument, leaves *seedMatched / *leftXOut / *rightXOut untouched and
// returns false.
// NOTE(review): false presumably means "not handled, use the chunky
// path" -- confirm against the cross-platform caller.
bool halFloodWalkPlanes(const SurfaceT *s, int16_t startX, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
(void)s; (void)startX; (void)y; (void)matchColor; (void)newColor; (void)matchEqual;
(void)seedMatched; (void)leftXOut; (void)rightXOut;
return false;
}
// Planar flood scan-row hook, unimplemented on this port: ignores every
// argument, writes nothing to markBuf and returns false.
bool halFloodScanRowPlanes(const SurfaceT *s, int16_t leftX, int16_t rightX, int16_t scanY, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
(void)s; (void)leftX; (void)rightX; (void)scanY; (void)matchColor; (void)newColor; (void)matchEqual;
(void)markBuf;
return false;
}
bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
(void)row;
(void)leftX;
@ -499,3 +498,143 @@ uint8_t *halStageAllocPixels(void) {
// Releases the stage pixel buffer with free(); passing NULL is harmless.
// NOTE(review): presumably the counterpart of halStageAllocPixels -- confirm.
void halStageFreePixels(uint8_t *pixels) {
free(pixels);
}
// DOS / VGA mode 13h is chunky-native (8bpp linear). portData is
// unused; the chunky `pixels` buffer feeds the present-time
// nearest-neighbor copy to VGA RAM.
// Port-data allocation hook. This port keeps no per-surface port data,
// so both arguments are ignored and NULL is always returned.
void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
(void)s;
(void)isStage;
return NULL;
}
// Port-data release hook. Nothing was allocated, so nothing is freed;
// all arguments are ignored.
void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
(void)s;
(void)isStage;
(void)portData;
}
// DOS has no bitplanes -- chunky pixels are the source of truth and
// expandAndWriteLine derives the VGA DAC indices straight from them.
// This hook is a stub here; the cross-platform fillRect calls it
// unconditionally.
// Plane-side hooks. On this port every one of them is a no-op that
// ignores all of its arguments; the (void) casts only silence
// unused-parameter warnings.
void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
(void)s;
(void)x;
(void)y;
(void)w;
(void)h;
(void)colorIndex;
}
// No-op: no plane data to copy between surfaces.
void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
(void)dst;
(void)src;
}
// No-op tile fill.
void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
(void)s; (void)bx; (void)by; (void)colorIndex;
}
// No-op tile copy.
void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
(void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
}
// No-op masked tile copy.
void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
(void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
}
// No-op tile paste.
void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
(void)dst; (void)bx; (void)by; (void)chunkyTile;
}
// No-op: chunkyTileOut is left exactly as the caller passed it.
void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
(void)src; (void)bx; (void)by; (void)chunkyTileOut;
}
// No-op sprite draw.
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
(void)s; (void)sp; (void)x; (void)y;
}
// No-op rect blit.
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
(void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
(void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
}
// No-op: nothing is written to dstPlaneBytes.
void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
(void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
}
// No-op: srcPlaneBytes is never read.
void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
(void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
}
/* Phase 9 reader hooks: chunky ports use the original s->pixels-based
* paths. */
// Returns the 4-bit color index stored at (x, y) in the packed chunky
// buffer: two pixels per byte, even x in the high nibble and odd x in
// the low nibble. No bounds checking is performed.
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    const uint8_t packed = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];

    if ((x & 1) == 0) {
        return (uint8_t)(packed >> 4);
    }
    return (uint8_t)(packed & 0x0Fu);
}
/* Folds a surface into a 32-bit hash by feeding bytes through
 * SURFACE_HASH_MIX_BYTE in a fixed order: the chunky pixel buffer, then
 * SURFACE_HEIGHT bytes of s->scb, then SURFACE_PALETTE_ENTRIES 16-bit
 * palette words. The two 16-bit accumulators start from fixed seeds and
 * are returned as (hi << 16) | lo. */
uint32_t halSurfaceHash(const SurfaceT *s) {
uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
const uint8_t *p;
const uint16_t *w;
uint8_t b;
p = s->pixels;
/* Pixels are consumed eight bytes per pass (manually unrolled). Any
 * remainder of SURFACE_PIXELS_SIZE modulo 8 would not be hashed, and
 * the do/while always runs at least one pass. */
blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
do {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
blocks--;
} while (blocks > 0u);
/* One scb byte per scanline. */
p = s->scb;
for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
/* Palette words are split with shifts, high byte first, so the byte
 * order fed to the mixer does not depend on host endianness.
 * NOTE(review): walks the 2-D palette as one flat run starting at
 * [0][0]; assumes SURFACE_PALETTE_ENTRIES is the total word count --
 * confirm. */
w = &s->palette[0][0];
for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
v = *w++;
b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
return ((uint32_t)hi << 16) | (uint32_t)lo;
}
// Copies the packed pixel buffer (SURFACE_PIXELS_SIZE bytes) from src
// to dst. Only the pixels are copied here.
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    uint8_t       *to   = dst->pixels;
    const uint8_t *from = src->pixels;

    memcpy(to, from, SURFACE_PIXELS_SIZE);
}

// Reads exactly SURFACE_PIXELS_SIZE bytes from fp into dst's pixel
// buffer. Returns true only when the full buffer was read.
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    size_t got = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    if (got != SURFACE_PIXELS_SIZE) {
        return false;
    }
    return true;
}

// Writes src's pixel buffer (SURFACE_PIXELS_SIZE bytes) to fp.
// Returns true only when every byte was written.
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    size_t put = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    if (put != SURFACE_PIXELS_SIZE) {
        return false;
    }
    return true;
}
// Allocates one zero-filled pixel buffer of SURFACE_PIXELS_SIZE bytes.
// Returns NULL when the allocation fails.
uint8_t *halSurfaceAllocPixels(void) {
    void *zeroed = calloc(1, SURFACE_PIXELS_SIZE);

    return (uint8_t *)zeroed;
}

// Releases a pixel buffer with free(); passing NULL is harmless.
void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
}

// This port has no plane buffers: every plane index maps to NULL.
uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)planeIdx;
    (void)s;
    return NULL;
}

View file

@ -26,12 +26,25 @@
// crowd up against the 64 KB-per-bank limit).
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "joey/debug.h"
#include "hal.h"
#include "surfaceInternal.h"
/* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick
* ($2503) and returns the low 16 bits of the system's tick counter
* (firmware VBL ISR-driven). Polling $C019 from C user code missed
* transitions for any op over ~1 ms; the system's tick counter is
* updated by the actual interrupt handler so it stays accurate
* regardless of caller polling rate. Tick rate matches the video
* field rate -- 60 Hz on NTSC, 50 Hz on PAL. */
extern uint16_t iigsGetTickWord(void);
/* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. */
extern uint16_t iigsReadHzParam(void);
static uint16_t gFrameHz = 60u;
// hal.c is the single TU that calls into joeyDraw.asm. Cross-
// platform draw.c / tile.c / etc. dispatch through halFast*
// functions defined here; they never reference the asm symbols
@ -210,6 +223,7 @@ bool halInit(const JoeyConfigT *config) {
// is unreliable from halInit's calling context, so we don't try
// it here -- the first present will set up SCB to 320 mode.
iigsInitRowLut();
gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u;
gModeSet = true;
return true;
}
@ -234,40 +248,6 @@ void halPresent(const SurfaceT *src) {
}
// Presents the rectangle (x, y, w, h) of the stage by copying
// byte-aligned runs, one per scanline, with one of two asm blitters.
// Does nothing when src is NULL. SCB / palette upload happens before
// the empty-rect check, so it runs even for a zero-sized rect.
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
uint16_t copyBytes;
int16_t byteStart;
uint16_t srcOffset;
if (src == NULL) {
return;
}
uploadScbAndPaletteIfNeeded(src);
// Pixel copy: byte-aligned runs per scanline. x is always >= 0
// after API-level clipping. Use unsigned shifts to avoid
// ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t.
byteStart = (int16_t)((uint16_t)x >> 1);
// Two pixels per byte: round the right edge up to a whole byte.
copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart);
if (copyBytes == 0 || h == 0) {
return;
}
// Pixel copy: prefer the PEI-slam variant when the rect satisfies
// its contract (copyBytes even, 2..80). Sprite-rect presents
// (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or
// odd-byte rects fall back to MVN, which has no width cap.
// 0x2000 is the stage buffer's base offset within its bank.
srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart);
if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) {
iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h);
} else {
iigsBlitRectStageToShr(srcOffset, copyBytes, h);
}
}
void halShutdown(void) {
if (gModeSet) {
*IIGS_NEWVIDEO_REG = gPreviousNewVideo;
@ -305,6 +285,142 @@ void halStageFreePixels(uint8_t *pixels) {
}
// IIgs is chunky-native: portData is unused. The chunky `pixels`
// buffer at $01:2000 is the stage's pixel storage and the source for
// stagePresent's PEI-slam to $E1.
// Port-data allocation hook. This port keeps no per-surface port data,
// so both arguments are ignored and NULL is always returned.
void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
(void)s;
(void)isStage;
return NULL;
}
// Port-data release hook. Nothing was allocated, so nothing is freed;
// all arguments are ignored.
void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
(void)s;
(void)isStage;
(void)portData;
}
// IIgs SHR is chunky-native; no bitplanes to update.
// Plane-side hooks. On this port every one of them is a no-op that
// ignores all of its arguments; the (void) casts only silence
// unused-parameter warnings.
void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
(void)s;
(void)x;
(void)y;
(void)w;
(void)h;
(void)colorIndex;
}
// No-op: no plane data to copy between surfaces.
void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
(void)dst;
(void)src;
}
// No-op tile fill.
void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
(void)s; (void)bx; (void)by; (void)colorIndex;
}
// No-op tile copy.
void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
(void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
}
// No-op masked tile copy.
void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
(void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
}
// No-op tile paste.
void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
(void)dst; (void)bx; (void)by; (void)chunkyTile;
}
// No-op: chunkyTileOut is left exactly as the caller passed it.
void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
(void)src; (void)bx; (void)by; (void)chunkyTileOut;
}
// No-op sprite draw.
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
(void)s; (void)sp; (void)x; (void)y;
}
// No-op rect blit.
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
(void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0;
(void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
}
// No-op: nothing is written to dstPlaneBytes.
void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
(void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
}
// No-op: srcPlaneBytes is never read.
void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
(void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
}
/* Phase 9 chunky reader hooks: IIgs reads from s->pixels just like
* the legacy paths did. Same logic as the DOS port. */
// Returns the 4-bit color index stored at (x, y) in the packed chunky
// buffer: two pixels per byte, even x in the high nibble and odd x in
// the low nibble. No bounds checking is performed.
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    const uint8_t packed = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];

    if ((x & 1) == 0) {
        return (uint8_t)(packed >> 4);
    }
    return (uint8_t)(packed & 0x0Fu);
}
/* Folds a surface into a 32-bit hash by feeding bytes through
 * SURFACE_HASH_MIX_BYTE in a fixed order: the chunky pixel buffer, then
 * SURFACE_HEIGHT bytes of s->scb, then SURFACE_PALETTE_ENTRIES 16-bit
 * palette words. The two 16-bit accumulators start from fixed seeds and
 * are returned as (hi << 16) | lo. */
uint32_t halSurfaceHash(const SurfaceT *s) {
uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
const uint8_t *p;
const uint16_t *w;
uint8_t b;
p = s->pixels;
/* Pixels are consumed eight bytes per pass (manually unrolled). Any
 * remainder of SURFACE_PIXELS_SIZE modulo 8 would not be hashed, and
 * the do/while always runs at least one pass. */
blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
do {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
blocks--;
} while (blocks > 0u);
/* One scb byte per scanline. */
p = s->scb;
for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
/* Palette words are split with shifts, high byte first, so the byte
 * order fed to the mixer does not depend on host endianness.
 * NOTE(review): walks the 2-D palette as one flat run starting at
 * [0][0]; assumes SURFACE_PALETTE_ENTRIES is the total word count --
 * confirm. */
w = &s->palette[0][0];
for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
v = *w++;
b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
return ((uint32_t)hi << 16) | (uint32_t)lo;
}
// Copies the packed pixel buffer (SURFACE_PIXELS_SIZE bytes) from src
// to dst. Only the pixels are copied here.
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    uint8_t       *to   = dst->pixels;
    const uint8_t *from = src->pixels;

    memcpy(to, from, SURFACE_PIXELS_SIZE);
}

// Reads exactly SURFACE_PIXELS_SIZE bytes from fp into dst's pixel
// buffer. Returns true only when the full buffer was read.
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
    size_t got = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    if (got != SURFACE_PIXELS_SIZE) {
        return false;
    }
    return true;
}

// Writes src's pixel buffer (SURFACE_PIXELS_SIZE bytes) to fp.
// Returns true only when every byte was written.
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
    size_t put = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);

    if (put != SURFACE_PIXELS_SIZE) {
        return false;
    }
    return true;
}
// Allocates one zero-filled pixel buffer of SURFACE_PIXELS_SIZE bytes.
// Returns NULL when the allocation fails.
uint8_t *halSurfaceAllocPixels(void) {
    void *zeroed = calloc(1, SURFACE_PIXELS_SIZE);

    return (uint8_t *)zeroed;
}

// Releases a pixel buffer with free(); passing NULL is harmless.
void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
}

// This port has no plane buffers: every plane index maps to NULL.
uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)planeIdx;
    (void)s;
    return NULL;
}
// $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active
// scan. To produce a rising-edge wait (one VBL per call), first spin
// while VBL is currently active (bit 7 = 0), then spin until VBL
@ -333,24 +449,11 @@ void halWaitVBL(void) {
// byte and the counter never advances. The explicit lda > / sta >
// pattern uses long-mode addressing throughout, which is
// DBR-independent.
static uint16_t gFrameCount = 0;
static uint8_t gPrevInVbl = 0;
// Returns the low 16 bits of the system tick counter (Misc Toolset
// GetTick, via the iigsGetTickWord trampoline). The counter is advanced
// by the firmware's VBL interrupt handler, so it stays accurate no
// matter how rarely this function is called. Counting $C019 transitions
// from C instead missed frames whenever the caller went more than about
// a millisecond between polls. The value wraps at 65536; callers should
// compare frame counts with unsigned 16-bit subtraction.
uint16_t halFrameCount(void) {
    return iigsGetTickWord();
}
// Returns the video field rate in Hz. gFrameHz defaults to 60 and is
// set to 50 by halInit when the battery-RAM hrtz50or60 parameter reads
// 1 (PAL), so wall-clock math based on halFrameCount is right on both
// NTSC and PAL machines. A hard-coded 60 here would be wrong on PAL.
uint16_t halFrameHz(void) {
    return gFrameHz;
}

View file

@ -1,15 +1,66 @@
* peislam.asm - originally a PEI-slam helper, now hosts the GetTick
* and ReadBParam trampolines.
*
* The original PEI-slam-per-row helper was removed; its functionality
* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam
* with per-row dirty skip). The file itself is kept so the build's
* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load
* segment and the linker keeps the same segment-bank layout it had
* when peislam.asm was a real translation unit.
keep PEISLAM
case on
* Stub kept so the PEISLAM load segment stays present (the build's
* PORT_ASM_SRCS_ALL wildcard pulls in this file by name).
* peislamStub: empty placeholder routine. It does no work; if it is ever
* called it returns to the caller immediately.
peislamStub start IIGSASM
rtl ; long return, nothing else to do
end
****************************************************************
* uint16_t iigsGetTickWord(void)
*
* Calls Misc Toolset GetTick ($2503) and returns the low 16 bits of
* the 32-bit tick counter. The system increments this counter from
* the actual VBL hardware interrupt, so it stays accurate regardless
* of caller polling rate -- C-side polling of $C019 missed transitions
* for any op over ~1 ms.
*
* GetTick output convention: caller pushes 4 bytes of output space,
* tool dispatcher writes the LongWord into them. We pull the low 16
* bits into A (ORCA-C Word return convention -- A holds the result,
* not Y; verified against jIIgs.asm asmGetVbl) and discard the high
* 16 into X.
*
* ORCA-C cdecl ABI: caller has M=I=16. Word return in A.
****************************************************************
* Stack discipline: the two pha instructions reserve the 4-byte result
* space, and the pla / plx pair removes exactly those 4 bytes again, so
* the stack is balanced on return. The values pushed are irrelevant;
* the tool overwrites them.
* NOTE(review): the tool call's error status is not inspected here.
iigsGetTickWord start IIGSASM
pha ; output space high word
pha ; output space low word
ldx #$2503 ; _GetTick
jsl $E10000 ; tool dispatcher entry
pla ; A = low 16 bits (return value)
plx ; discard high 16 bits
rtl
end
****************************************************************
* uint16_t iigsReadHzParam(void)
*
* Reads battery RAM parameter hrtz50or60 ($1D) via _ReadBParam ($0C03)
* and returns the raw value: 0 = NTSC (60 Hz), 1 = PAL (50 Hz).
*
* GetTick fires from the hardware VBL ISR, so its rate matches the
* video field rate -- 60 Hz on NTSC, 50 Hz on PAL. halFrameHz must
* report whichever this machine actually runs so wall-clock math
* (frames * 1000 / halFrameHz) is correct on both.
****************************************************************
* Stack discipline: pha reserves the Word result space and pea pushes
* the parameter ID as the call's input. Only one pla follows the call,
* so this relies on the tool call consuming its input word and leaving
* just the result on the stack.
* NOTE(review): the tool call's error status is not inspected here.
iigsReadHzParam start IIGSASM
pha ; output space (Word)
pea $001D ; hrtz50or60 parameter ID
ldx #$0C03 ; _ReadBParam
jsl $E10000 ; tool dispatcher entry
pla ; A = result (ORCA-C Word return)
rtl
end

View file

@ -253,3 +253,253 @@ _surface68kFillRectByteAligned:
.Lfrb_done:
movem.l (%sp)+,%d2-%d6
rts
| ----------------------------------------------------------------
| void surface68kFillSpan4Planes(uint8_t *p0, uint8_t *p1,
| uint8_t *p2, uint8_t *p3,
| uint16_t numMid,
| uint8_t leftMask, uint8_t rightMask,
| uint8_t fb0, uint8_t fb1,
| uint8_t fb2, uint8_t fb3);
|
| Fill ONE planar row across 4 planes -- the per-row body of
| halFillRectPlanes lifted into asm. Each pN points at the leading
| byte (already advanced by planeBase + y*40 + byteFirst on the C
| side). leftMask and rightMask are the partial-byte masks for the
| left/right edges; numMid is the count of full bytes between them.
| fbN is 0x00 or 0xFF, the per-plane fill byte (caller pre-classifies
| (colorIndex >> N) & 1 -> 0xFF or 0x00).
|
| Used by Amiga halFastFillCircle (one call per scanline span) and
| Amiga halFillRectPlanes (one call per row of the rect). Replaces
| the C inner loop whose ~13 cyc/byte was the gating cost on
| fillCircle r=40 even after C-side inlining.
|
| Mask convention is uniform for all planes:
| leading byte := (*p & ~leftMask) | (fbN & leftMask)
| middle bytes := fbN
| trailing byte := (*p & ~rightMask) | (fbN & rightMask)
| -- branchless: the same arithmetic produces "set" or "clear" based
| on whether fbN is 0xFF or 0x00.
|
| Bytes touched: each plane has exactly numMid + 2 bytes accessed.
| The trailing byte at pN[numMid + 1] is ALWAYS read and written back,
| even when rightMask is 0 (it is then rewritten with its old value),
| so that byte must be valid memory for every plane pointer.
|
| ABI: m68k cdecl. d2-d7/a2-a6 callee-save (movem'd here).
| Stack offset to first arg after MOVEM: 11 regs * 4 = 44 bytes saved
| + 4 ret PC = 48.
| Every argument is read from its own 4-byte stack slot; the narrow
| ones (numMid, masks, fill bytes) are taken from the low-order end of
| the slot, hence the +2 / +3 adjustments in the .equ table below.
|
| NOTE(review): of the saved registers only d2-d7 and a2-a3 are used
| by the code below; a4-a6 are saved and restored but never touched.
| Trimming the MOVEM lists would require updating SP_SAVED to match.
| ----------------------------------------------------------------
.globl _surface68kFillSpan4Planes
.equ SP_SAVED, 44
.equ SP_RPC, 4
.equ SP_OFF, (SP_SAVED + SP_RPC)
.equ SP_P0, SP_OFF + 0
.equ SP_P1, SP_OFF + 4
.equ SP_P2, SP_OFF + 8
.equ SP_P3, SP_OFF + 12
.equ SP_NMID, SP_OFF + 16 + 2 | int -> low word at +2
.equ SP_LMASK, SP_OFF + 20 + 3 | int -> low byte at +3
.equ SP_RMASK, SP_OFF + 24 + 3
.equ SP_FB0, SP_OFF + 28 + 3
.equ SP_FB1, SP_OFF + 32 + 3
.equ SP_FB2, SP_OFF + 36 + 3
.equ SP_FB3, SP_OFF + 40 + 3
| Per-plane body. There is no assembler macro here: the body is
| written out by hand once per plane in each of the two paths below.
| Register roles while the bodies run:
| d0 = numMid-1 (dbra count; only set up on the with-middle path)
| d1 = leftMask, d2 = ~leftMask
| d3 = rightMask, d4 = ~rightMask
| d5 = this plane's fill byte (reloaded from the stack per plane)
| d6/d7 = scratch (d7 also serves as the per-plane dbra counter)
| a0-a3 = write pointers for planes 0-3
| numMid is tested once at entry: numMid == 0 branches to .LfsNoMid,
| which carries its own copy of all four plane bodies without the
| middle loop.
|
| Hand-unrolled 4x rather than using bsr+rts to dodge ~12 cyc per
| return + the per-plane re-test of numMid that the previous build
| paid. The mid-loop label suffix is the plane index so all four
| inline copies can coexist without label collisions.
|
| Plain text version of the per-plane body (written out 4x with
| different a-regs and fb stack offsets):
|
| move.b (an),%d6
| and.b %d2,%d6
| move.b fb(%sp),%d5
| move.b %d5,%d7
| and.b %d1,%d7
| or.b %d7,%d6
| move.b %d6,(an)+
| < with-middle path only: >
| move.w %d0,%d7
| .LfsMidN:
| move.b %d5,(an)+
| dbra %d7,.LfsMidN
| < trailing: >
| move.b (an),%d6
| and.b %d4,%d6
| move.b %d5,%d7
| and.b %d3,%d7
| or.b %d7,%d6
| move.b %d6,(an)
_surface68kFillSpan4Planes:
movem.l %d2-%d7/%a2-%a6,-(%sp)
| Build the two masks and their complements once; they
| are shared by all four planes.
move.b SP_LMASK(%sp),%d1
move.b %d1,%d2
not.b %d2
move.b SP_RMASK(%sp),%d3
move.b %d3,%d4
not.b %d4
| Load the four plane pointers.
move.l SP_P0(%sp),%a0
move.l SP_P1(%sp),%a1
move.l SP_P2(%sp),%a2
move.l SP_P3(%sp),%a3
| One-time numMid test. d0.w = numMid; if 0 jump to
| the no-middle entry, otherwise pre-decrement for dbra
| and fall into the with-middle entry. Both paths
| unroll all 4 planes.
move.w SP_NMID(%sp),%d0
beq .LfsNoMid
subq.w #1,%d0
| ---- WITH-MIDDLE PATH ----
| Plane 0
move.b (%a0),%d6 | d6 = existing leading byte
and.b %d2,%d6 | keep the bits outside leftMask
move.b SP_FB0(%sp),%d5 | d5 = fill byte for this plane
move.b %d5,%d7
and.b %d1,%d7 | fill bits inside leftMask
or.b %d7,%d6
move.b %d6,(%a0)+ | store leading byte, advance
move.w %d0,%d7 | fresh loop counter (numMid-1)
.LfsMid0: move.b %d5,(%a0)+
dbra %d7,.LfsMid0 | writes numMid full bytes
move.b (%a0),%d6 | d6 = existing trailing byte
and.b %d4,%d6 | keep the bits outside rightMask
move.b %d5,%d7
and.b %d3,%d7 | fill bits inside rightMask
or.b %d7,%d6
move.b %d6,(%a0) | store trailing byte
| Plane 1 (same sequence on a1 / SP_FB1)
move.b (%a1),%d6
and.b %d2,%d6
move.b SP_FB1(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a1)+
move.w %d0,%d7
.LfsMid1: move.b %d5,(%a1)+
dbra %d7,.LfsMid1
move.b (%a1),%d6
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a1)
| Plane 2 (same sequence on a2 / SP_FB2)
move.b (%a2),%d6
and.b %d2,%d6
move.b SP_FB2(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a2)+
move.w %d0,%d7
.LfsMid2: move.b %d5,(%a2)+
dbra %d7,.LfsMid2
move.b (%a2),%d6
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a2)
| Plane 3 (same sequence on a3 / SP_FB3)
move.b (%a3),%d6
and.b %d2,%d6
move.b SP_FB3(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a3)+
move.w %d0,%d7
.LfsMid3: move.b %d5,(%a3)+
dbra %d7,.LfsMid3
move.b (%a3),%d6
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a3)
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
.LfsNoMid:
| ---- NO-MIDDLE PATH (just leading + trailing) ----
| The trailing byte is the one immediately after the
| leading byte; d0 is not used on this path.
| Plane 0
move.b (%a0),%d6 | leading byte: merge under leftMask
and.b %d2,%d6
move.b SP_FB0(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a0)+
move.b (%a0),%d6 | trailing byte: merge under rightMask
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a0)
| Plane 1
move.b (%a1),%d6
and.b %d2,%d6
move.b SP_FB1(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a1)+
move.b (%a1),%d6
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a1)
| Plane 2
move.b (%a2),%d6
and.b %d2,%d6
move.b SP_FB2(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a2)+
move.b (%a2),%d6
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a2)
| Plane 3
move.b (%a3),%d6
and.b %d2,%d6
move.b SP_FB3(%sp),%d5
move.b %d5,%d7
and.b %d1,%d7
or.b %d7,%d6
move.b %d6,(%a3)+
move.b (%a3),%d6
and.b %d4,%d6
move.b %d5,%d7
and.b %d3,%d7
or.b %d7,%d6
move.b %d6,(%a3)
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts

93
tools/diff-uber-hashes Executable file
View file

@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""Compare two UBER joeylog.txt files by per-op surface hash.
Used by the planar 68k rewrite (project_planar_68k_plan.md): IIgs
captures the golden reference, each 68k port re-runs UBER after a
primitive conversion, and this tool tells you which ops produced
different pixels. Without this, "looks right visually" misses the
subtle mismatches that cascade into hard-to-debug corruption.
Usage:
tools/diff-uber-hashes <reference-log> <test-log>
Exit code:
0 = all hashes match
1 = at least one mismatch
2 = usage error or missing file
"""
import re
import sys
# Match e.g.:
# UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4
LINE_RE = re.compile(
    r"UBER:\s+(?P<op>[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+\d+\s+ops/sec\s+\|\s+hash=(?P<hash>[0-9A-Fa-f]+)"
)


def parse_log(path):
    """Return ordered dict {op_name: hash} from a UBER log file.

    Multiple runs may be concatenated in the same log (joeyLog appends)
    -- in that case the LAST hash for each op wins, matching the most
    recent run. Ops keep the position of their first appearance.

    Hashes are upper-cased so the comparison is case-insensitive.

    The file is decoded as UTF-8 with undecodable bytes replaced rather
    than rejected: logs copied off the target machines can contain
    stray non-UTF-8 bytes, and a UnicodeDecodeError here would escape
    the caller's OSError handling and end the tool with a traceback
    (exit status 1, indistinguishable from "hash mismatch"). Replaced
    bytes cannot affect the ASCII-only UBER lines.

    Raises OSError if the file cannot be opened or read.
    """
    hashes = {}
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                hashes[m.group("op").strip()] = m.group("hash").upper()
    return hashes
def main(argv):
    """Compare the per-op hashes of two UBER logs and report differences.

    argv is the full argument vector (argv[0] is the program name,
    argv[1] the reference log, argv[2] the test log). One line is
    printed to stdout for every op that is missing from the test log,
    has a different hash, or appears only in the test log, followed by
    a blank line and an OK/FAIL summary.

    Returns the process exit status: 0 when every reference op matches
    and the test log has no extra ops, 1 otherwise, 2 for a usage
    error, an unreadable file, or a log with no UBER hash lines.
    """
    if len(argv) != 3:
        sys.stderr.write(
            "usage: diff-uber-hashes <reference-log> <test-log>\n"
        )
        return 2

    ref_path, test_path = argv[1], argv[2]
    try:
        ref = parse_log(ref_path)
        test = parse_log(test_path)
    except OSError as e:
        sys.stderr.write(f"error: {e}\n")
        return 2

    # A log without any UBER lines is treated as an input error, not as
    # "everything is missing"; the reference log is checked first.
    for path, table in ((ref_path, ref), (test_path, test)):
        if not table:
            sys.stderr.write(f"error: no UBER hash lines found in {path}\n")
            return 2

    matches = 0
    mismatches = 0
    for op, ref_hash in ref.items():
        if op not in test:
            print(f" MISSING in test: {op} (ref={ref_hash})")
            mismatches += 1
            continue
        test_hash = test[op]
        if test_hash == ref_hash:
            matches += 1
            continue
        print(f" MISMATCH {op}: ref={ref_hash} test={test_hash}")
        mismatches += 1

    # Ops only the test log knows about are reported after the
    # reference ops and also make the run fail.
    extras = [op for op in test if op not in ref]
    for op in extras:
        print(f" EXTRA in test: {op} (test={test[op]})")

    print()
    if mismatches or extras:
        print(f"FAIL: {matches} match, {mismatches} mismatch, {len(extras)} extras")
        return 1

    total = len(ref) + len(extras)
    print(f"OK: {matches}/{total} ops match")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

132
tools/diff-uber-perf Executable file
View file

@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""Compare two UBER joeylog.txt files by per-op ops/sec.
Sibling of diff-uber-hashes (which compares pixel correctness). This
tool drives Phase 10 of project_planar_68k_plan.md: pick the
biggest perf gaps vs the IIgs reference and target asm/algorithmic
optimization at those.
Usage:
tools/diff-uber-perf <reference-log> <test-log> [--threshold 1.0]
Output is sorted by speed ratio (test/ref) ascending, so the worst
gaps print first. Ops missing from either log are flagged. The
threshold flag (default 1.0) marks ops below that ratio as FAIL --
project_perf_directive.md says "IIgs is the perf floor; every
other target must match or beat it", so parity = 1.0x. Use
--threshold 0.8 for the project_planar_68k_plan looser acceptance.
Exit code:
0 = all common ops at >= threshold
1 = at least one op below threshold (or missing)
2 = usage error or missing file
"""
import re
import sys
# Match e.g.:
# UBER: drawCircle r=80: 56 iters / 4 frames = 840 ops/sec | hash=A1B2C3D4
LINE_RE = re.compile(
    r"UBER:\s+(?P<op>[^:]+):\s+\d+\s+iters\s+/\s+\d+\s+frames\s+=\s+(?P<ops>\d+)\s+ops/sec"
)


def parse_log(path):
    """Return ordered dict {op_name: ops_per_sec} from a UBER log file.

    Multiple runs may be concatenated (joeyLog appends); last value
    for each op wins, matching the most recent run. Ops keep the
    position of their first appearance.

    The file is decoded as UTF-8 with undecodable bytes replaced rather
    than rejected: logs copied off the target machines can contain
    stray non-UTF-8 bytes, and a UnicodeDecodeError here would escape
    the caller's OSError handling and end the tool with a traceback
    (exit status 1, indistinguishable from "op below threshold").
    Replaced bytes cannot affect the ASCII-only UBER lines.

    Raises OSError if the file cannot be opened or read.
    """
    perf = {}
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                perf[m.group("op").strip()] = int(m.group("ops"))
    return perf
def _parse_threshold(text):
    """Return text as a float ratio, or None if it is not usable.

    NaN is rejected explicitly: float() accepts "nan", and every
    ``ratio >= nan`` comparison is False, which would silently mark
    all ops as FAIL.
    """
    try:
        value = float(text)
    except ValueError:
        return None
    if value != value:  # NaN is the only float unequal to itself
        return None
    return value


def main(argv):
    """Compare the per-op ops/sec of two UBER logs and print a table.

    argv is the full argument vector (argv[0] is the program name).
    Exactly two positional arguments are expected -- the reference log
    and the test log -- plus an optional threshold given either as
    ``--threshold VALUE`` or ``--threshold=VALUE`` (default 1.0). A
    trailing ``--threshold`` with no value is treated as a positional
    argument.

    Rows are printed MISSING first, then FAIL, then ok, each group by
    ascending test/ref ratio; ops present only in the test log are
    listed last as EXTRA and do not affect the exit status.

    Returns the process exit status: 0 when every reference op is
    present and at or above the threshold, 1 when at least one is
    missing or below it, 2 for a usage error, a bad threshold, an
    unreadable file, or a log with no UBER lines.
    """
    threshold = 1.0
    args = []
    i = 1
    while i < len(argv):
        arg = argv[i]
        if arg == "--threshold" and i + 1 < len(argv):
            raw = argv[i + 1]
            i += 2
        elif arg.startswith("--threshold="):
            raw = arg[len("--threshold="):]
            i += 1
        else:
            args.append(arg)
            i += 1
            continue
        parsed = _parse_threshold(raw)
        if parsed is None:
            sys.stderr.write(f"error: bad threshold {raw}\n")
            return 2
        threshold = parsed

    if len(args) != 2:
        sys.stderr.write(
            "usage: diff-uber-perf <reference-log> <test-log> [--threshold 1.0]\n"
        )
        return 2

    try:
        ref = parse_log(args[0])
        test = parse_log(args[1])
    except OSError as e:
        sys.stderr.write(f"error: {e}\n")
        return 2

    if not ref:
        sys.stderr.write(f"error: no UBER lines found in {args[0]}\n")
        return 2
    if not test:
        sys.stderr.write(f"error: no UBER lines found in {args[1]}\n")
        return 2

    rows = []
    for op, ref_ops in ref.items():
        test_ops = test.get(op)
        if test_ops is None:
            rows.append((op, ref_ops, None, None, "MISSING"))
            continue
        if ref_ops == 0:
            # Avoid dividing by zero: any progress over a zero
            # reference counts as infinitely faster, 0 vs 0 as parity.
            ratio = float("inf") if test_ops > 0 else 1.0
        else:
            ratio = test_ops / ref_ops
        status = "ok" if ratio >= threshold else "FAIL"
        rows.append((op, ref_ops, test_ops, ratio, status))

    extras = [(op, None, test[op], None, "EXTRA") for op in test if op not in ref]

    # Sort: missing first, then fail by worst ratio, then ok ascending
    # by ratio. Extras are never part of rows; they are appended after
    # the sorted rows in test-log order.
    group = {"MISSING": 0, "FAIL": 1, "ok": 2}

    def sort_key(row):
        op, _refv, _testv, ratio, status = row
        return (group[status], 0.0 if ratio is None else ratio, op)

    rows.sort(key=sort_key)

    op_w = max(len(op) for op in ref) if ref else 8
    op_w = max(op_w, max((len(op) for op in test), default=8), len("op"))

    print(f"{'op':<{op_w}} {'ref':>10} {'test':>10} {'ratio':>7} status")
    print(f"{'-'*op_w} {'-'*10} {'-'*10} {'-'*7} ------")

    fails = 0
    for op, refv, testv, ratio, status in rows + extras:
        refs = "" if refv is None else str(refv)
        tests = "" if testv is None else str(testv)
        rats = "" if ratio is None else f"{ratio:.2f}x"
        print(f"{op:<{op_w}} {refs:>10} {tests:>10} {rats:>7} {status}")
        if status in ("FAIL", "MISSING"):
            fails += 1

    print()
    print(f"threshold: {threshold:.2f}x ({len(rows)} ops compared, {fails} below threshold)")
    return 1 if fails > 0 else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))