diff --git a/examples/audio/audio.c b/examples/audio/audio.c index bf0c9ae..90ac866 100644 --- a/examples/audio/audio.c +++ b/examples/audio/audio.c @@ -120,8 +120,8 @@ int main(void) { config.hostMode = HOST_MODE_TAKEOVER; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); diff --git a/examples/draw/draw.c b/examples/draw/draw.c index fe881c5..cd83ef1 100644 --- a/examples/draw/draw.c +++ b/examples/draw/draw.c @@ -246,8 +246,8 @@ int main(void) { config.hostMode = HOST_MODE_TAKEOVER; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); diff --git a/examples/hello/hello.c b/examples/hello/hello.c index 10a89bb..8440484 100644 --- a/examples/hello/hello.c +++ b/examples/hello/hello.c @@ -12,8 +12,8 @@ int main(void) { config.hostMode = HOST_MODE_OS; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); diff --git a/examples/joy/joy.c b/examples/joy/joy.c index d2894c2..2683706 100644 --- a/examples/joy/joy.c +++ b/examples/joy/joy.c @@ -218,8 +218,8 @@ int main(void) { config.hostMode = HOST_MODE_TAKEOVER; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); diff --git 
a/examples/keys/keys.c b/examples/keys/keys.c index 841acb5..31bbfef 100644 --- a/examples/keys/keys.c +++ b/examples/keys/keys.c @@ -225,8 +225,8 @@ int main(void) { config.hostMode = HOST_MODE_TAKEOVER; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); diff --git a/examples/pattern/pattern.c b/examples/pattern/pattern.c index 76b1e0b..2fe3286 100644 --- a/examples/pattern/pattern.c +++ b/examples/pattern/pattern.c @@ -108,8 +108,8 @@ int main(void) { config.hostMode = HOST_MODE_TAKEOVER; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); diff --git a/examples/sprite/sprite.c b/examples/sprite/sprite.c index 0abcbad..3c7509b 100644 --- a/examples/sprite/sprite.c +++ b/examples/sprite/sprite.c @@ -113,8 +113,8 @@ int main(void) { config.hostMode = HOST_MODE_TAKEOVER; config.codegenBytes = 8 * 1024; config.maxSurfaces = 4; - config.audioBytes = 64 * 1024; - config.assetBytes = 128 * 1024; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; if (!joeyInit(&config)) { fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); @@ -154,8 +154,7 @@ int main(void) { vy = 1; haveBackup = false; - spriteSaveUnder(screen, ball, x, y, &backup); - spriteDraw(screen, ball, x, y); + spriteSaveAndDraw(screen, ball, x, y, &backup); stagePresentRect(backup.x, backup.y, backup.width, backup.height); haveBackup = true; @@ -189,8 +188,7 @@ int main(void) { if (y <= 0) { y = 0; vy = (int16_t)-vy; } if (y >= SURFACE_HEIGHT - BALL_H) { y = SURFACE_HEIGHT - BALL_H; vy = (int16_t)-vy; } - spriteSaveUnder(screen, ball, x, y, 
&backup); - spriteDraw(screen, ball, x, y); + spriteSaveAndDraw(screen, ball, x, y, &backup); // Bounding box of (old rect) U (new rect). For typical // small-step motion the rects overlap heavily so the union diff --git a/examples/uber/uber.c b/examples/uber/uber.c new file mode 100644 index 0000000..5538b48 --- /dev/null +++ b/examples/uber/uber.c @@ -0,0 +1,350 @@ +// Uber demo: exercise every JoeyLib public API and measure throughput +// of the per-frame-hot ones. Results are written to joeylog.txt via +// joeyLogF. A green screen on exit means the run completed. +// +// Timing model: each test aligns to a VBL boundary via joeyWaitVBL, +// records the starting joeyFrameCount, then runs the op in a tight +// loop polling joeyFrameCount until UBER_FRAMES frames have elapsed. +// Reported metric is ops/sec, computed as iters * joeyFrameHz() / +// UBER_FRAMES so results are directly comparable across ports +// regardless of CPU speed or VBL rate. +// +// joeyFrameCount is wall-clock-based per port; the per-iter poll +// adds ~10-30 cyc per op which shows up as noise on the very +// fastest ops but is below ~5% even for ~500 cyc/op work. +// +// One-shot ops (spriteCompile) get one call each, timed by frame +// delta -- coarser but representative. + +#include +#include +#include +#include + +#include + + +// ----- Timing primitives ----- + +// 4-frame measurement window. Long enough that loop overhead doesn't +// dominate; short enough to keep the full demo run under ~10 sec. +#define UBER_FRAMES 4u + + +typedef void (*OpFn)(void); + +static const char *gCurName = "(none)"; +static SurfaceT *gStage = NULL; +static SpriteT *gSprite = NULL; +static SpriteBackupT gBackup; +static unsigned char gBackupBytes[256]; + +static TileT gTileScratch; + + +// Run `op` in a tight loop until `targetFrames` joeyFrameCount ticks +// have elapsed. Returns iterations completed. 
+static unsigned long runForFrames(OpFn op, unsigned int targetFrames) { + unsigned long count; + uint16_t startFrame; + + count = 0UL; + + joeyWaitVBL(); + startFrame = joeyFrameCount(); + + while ((uint16_t)(joeyFrameCount() - startFrame) < targetFrames) { + op(); + count++; + } + return count; +} + + +// Time and log one op. Reports iters / N frames AND the derived +// ops/sec so per-port results are directly comparable against IIgs +// regardless of CPU speed or display refresh rate. +static void timeOp(const char *name, OpFn op) { + unsigned long iters; + unsigned long opsPerSec; + + gCurName = name; + + iters = runForFrames(op, UBER_FRAMES); + + if (iters == 0UL) { + joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name); + return; + } + + opsPerSec = (iters * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES; + joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n", + name, iters, UBER_FRAMES, opsPerSec); +} + + + + +// ----- Test ops ----- + +static void op_drawPixel (void) { drawPixel (gStage, 100, 100, 5); } +static void op_drawLineH (void) { drawLine (gStage, 0, 50, 319, 50, 5); } +static void op_drawLineV (void) { drawLine (gStage, 50, 0, 50, 199, 5); } +static void op_drawLineDiag (void) { drawLine (gStage, 0, 0, 319, 199, 5); } +static void op_drawRect (void) { drawRect (gStage, 10, 10, 100, 100, 5); } +static void op_drawCircleSmall (void) { drawCircle (gStage, 160, 100, 16, 5); } +static void op_drawCircleLarge (void) { drawCircle (gStage, 160, 100, 80, 5); } +static void op_fillRectSmall (void) { fillRect (gStage, 20, 20, 16, 16, 7); } +static void op_fillRectMid (void) { fillRect (gStage, 20, 20, 80, 80, 7); } +static void op_fillRectFull (void) { fillRect (gStage, 0, 0, 320, 200, 7); } +static void op_fillCircle (void) { fillCircle (gStage, 160, 100, 40, 7); } +static void op_samplePixel (void) { (void)samplePixel(gStage, 100, 100); } +static void op_surfaceClear (void) { surfaceClear (gStage, 0); } + +static void op_paletteSet(void) { 
+ static uint16_t colors[16] = { + 0x000, 0xF00, 0x0F0, 0x00F, 0xFF0, 0xF0F, 0x0FF, 0xFFF, + 0x800, 0x080, 0x008, 0x880, 0x808, 0x088, 0x888, 0x444 + }; + paletteSet(gStage, 0, colors); +} +static void op_scbSetRange (void) { scbSetRange (gStage, 0, 199, 0); } + +static void op_tileFill (void) { tileFill (gStage, 5, 5, 7); } +static void op_tileCopy (void) { tileCopy (gStage, 6, 6, gStage, 5, 5); } +static void op_tileCopyMasked (void) { tileCopyMasked (gStage, 7, 7, gStage, 5, 5, 0); } +static void op_tilePaste (void) { tilePaste (gStage, 8, 8, &gTileScratch); } +static void op_tileSnap (void) { tileSnap (gStage, 5, 5, &gTileScratch); } + +static int16_t gSpriteX = 40; +static int16_t gSpriteY = 30; + +static void op_spriteSave (void) { spriteSaveUnder (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); } +static void op_spriteDraw (void) { spriteDraw (gStage, gSprite, gSpriteX, gSpriteY); } +static void op_spriteRestore (void) { spriteRestoreUnder(gStage, &gBackup); } +static void op_spriteSaveAndDraw (void) { spriteSaveAndDraw (gStage, gSprite, gSpriteX, gSpriteY, &gBackup); } + +static void op_stagePresent (void) { stagePresent(); } +static void op_stagePresentRect8(void) { stagePresentRect( 40, 30, 16, 16); } +static void op_stagePresentRectF(void) { stagePresentRect( 0, 0, 320, 200); } + +static void op_inputPoll (void) { joeyInputPoll(); } +static void op_keyDown (void) { (void)joeyKeyDown(KEY_A); } +static void op_keyPressed (void) { (void)joeyKeyPressed(KEY_A); } +static void op_mouseX (void) { (void)joeyMouseX(); } +static void op_joyConnected (void) { (void)joeyJoystickConnected(JOYSTICK_1); } + +static void op_audioFrameTick (void) { joeyAudioFrameTick(); } +static void op_audioIsPlaying (void) { (void)joeyAudioIsPlayingMod(); } + +static void op_surfaceMarkDirty(void) { /* drawPixel already marks; use fill instead */ + fillRect(gStage, 0, 0, 32, 32, 0); } + + +// ----- Build the ball sprite procedurally ----- + +#define BALL_TILES_X 2 +#define 
BALL_TILES_Y 2 +#define BALL_TILE_BYTES (BALL_TILES_X * BALL_TILES_Y * 32u) + +static const uint8_t gBallAuthored[16 * 8] = { + 0x00, 0x00, 0x22, 0x22, 0x22, 0x22, 0x00, 0x00, + 0x00, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x00, + 0x02, 0x22, 0x32, 0x22, 0x22, 0x22, 0x22, 0x20, + 0x02, 0x23, 0x32, 0x22, 0x22, 0x22, 0x22, 0x20, + 0x22, 0x33, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x02, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x20, + 0x02, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x20, + 0x00, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x00, + 0x00, 0x00, 0x22, 0x22, 0x22, 0x22, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x22, 0x22, 0x00, 0x00, 0x00 +}; +static uint8_t gBallTiles[BALL_TILE_BYTES]; + +static void buildBallSprite(void) { + uint16_t tx; + uint16_t ty; + uint16_t row; + uint16_t b; + uint8_t *dst; + + for (ty = 0; ty < BALL_TILES_Y; ty++) { + for (tx = 0; tx < BALL_TILES_X; tx++) { + dst = &gBallTiles[(ty * BALL_TILES_X + tx) * 32u]; + for (row = 0; row < 8; row++) { + for (b = 0; b < 4; b++) { + dst[row * 4 + b] = + gBallAuthored[((ty * 8) + row) * 8 + (tx * 4) + b]; + } + } + } + } +} + + +// ----- Main ----- + +static void runAllTests(void) { + joeyLogF("UBER: ----- begin -----\n"); + + // Surface / palette / SCB. + timeOp("surfaceClear", op_surfaceClear); + timeOp("paletteSet", op_paletteSet); + timeOp("scbSetRange", op_scbSetRange); + + // Drawing primitives. 
+ timeOp("drawPixel", op_drawPixel); + timeOp("drawLine H", op_drawLineH); + timeOp("drawLine V", op_drawLineV); + timeOp("drawLine diag", op_drawLineDiag); + timeOp("drawRect 100x100", op_drawRect); + timeOp("drawCircle r=16", op_drawCircleSmall); + timeOp("drawCircle r=80", op_drawCircleLarge); + timeOp("fillRect 16x16", op_fillRectSmall); + timeOp("fillRect 80x80", op_fillRectMid); + timeOp("fillRect 320x200", op_fillRectFull); + timeOp("fillCircle r=40", op_fillCircle); + timeOp("samplePixel", op_samplePixel); + + // Tiles. Seed scratch tile + dest cells with non-zero pixels first. + fillRect(gStage, 0, 0, 320, 64, 7); + tileSnap(gStage, 5, 5, &gTileScratch); + timeOp("tileFill", op_tileFill); + timeOp("tileCopy", op_tileCopy); + timeOp("tileCopyMasked", op_tileCopyMasked); + timeOp("tilePaste", op_tilePaste); + timeOp("tileSnap", op_tileSnap); + + // Sprites. Background must be non-empty so save-under has work + // to do (otherwise it's a 4 KB memset of zeros, atypical). + surfaceClear(gStage, 4); + timeOp("spriteSaveUnder", op_spriteSave); + timeOp("spriteDraw", op_spriteDraw); + timeOp("spriteRestoreUnder", op_spriteRestore); + timeOp("spriteSaveAndDraw", op_spriteSaveAndDraw); + + // Present. + timeOp("stagePresent full", op_stagePresent); + timeOp("stagePresentRect 8b",op_stagePresentRect8); + timeOp("stagePresentRect F", op_stagePresentRectF); + + // Input. + timeOp("joeyInputPoll", op_inputPoll); + timeOp("joeyKeyDown", op_keyDown); + timeOp("joeyKeyPressed", op_keyPressed); + timeOp("joeyMouseX", op_mouseX); + timeOp("joeyJoyConnected", op_joyConnected); + + // Audio. + timeOp("joeyAudioFrameTick", op_audioFrameTick); + timeOp("joeyAudioIsPlayingMod", op_audioIsPlaying); + + // Surface mark dirty (via fillRect's mark step). 
+ timeOp("surfaceMarkDirtyRect (via fillRect 32x32)", op_surfaceMarkDirty); + + joeyLogF("UBER: ----- end -----\n"); +} + + +int main(void) { + JoeyConfigT config; + uint16_t pal[16]; + int i; + + config.hostMode = HOST_MODE_TAKEOVER; + config.codegenBytes = 8 * 1024; + config.maxSurfaces = 4; + config.audioBytes = 64UL * 1024; + config.assetBytes = 128UL * 1024; + + if (!joeyInit(&config)) { + return 1; + } + + gStage = stageGet(); + if (gStage == NULL) { + joeyShutdown(); + return 1; + } + + // A simple visible palette so users see SOMETHING during the run. + for (i = 0; i < 16; i++) { + pal[i] = (uint16_t)((i << 8) | (i << 4) | i); // grey ramp + } + pal[ 0] = 0x000; + pal[ 1] = 0x800; // dark red (running) + pal[ 2] = 0x080; // green (done) + pal[ 3] = 0x008; // blue + pal[ 5] = 0xFF0; // yellow (test pixels) + pal[ 7] = 0xFFF; // white (fills) + pal[15] = 0xF00; // red + paletteSet(gStage, 0, pal); + scbSetRange(gStage, 0, 199, 0); + + // Indicate "running": red bar at top of screen. + surfaceClear(gStage, 0); + fillRect(gStage, 0, 0, 320, 8, 1); + stagePresent(); + + buildBallSprite(); + gSprite = spriteCreate(gBallTiles, BALL_TILES_X, BALL_TILES_Y, SPRITE_FLAGS_NONE); + if (gSprite == NULL) { + joeyLog("UBER: spriteCreate failed"); + joeyShutdown(); + return 1; + } + // spriteCompile is a one-shot. NOTE(review): no frame delta is measured here -- the log text below is a fixed string. + { + uint16_t before; + + joeyWaitVBL(); + before = joeyFrameCount(); + if (!spriteCompile(gSprite)) { + joeyLog("UBER: spriteCompile failed"); + } + while (joeyFrameCount() == before) { + /* wait for next VBL edge */ + } + joeyLogF("UBER: spriteCompile: 1 call in <= 1 frame\n"); + } + gBackup.bytes = gBackupBytes; + + // Audio: only init/shutdown is exercised. Triggering joeyAudioPlaySfx + // without first calling joeyAudioPlayMod leaves NTP's engine in a + // half-initialized state -- NTPstreamsound is designed to OVERLAY on + // an already-running module. 
Without NTPprepare/NTPplay first, the + // streamer oscillator is fired but no music tick ever advances or + // silences it, and you get a stuck high-pitched scream. UBER doesn't + // ship a MOD asset, so we skip the SFX exercise. The frame-tick and + // isPlayingMod calls below still get timed (both are no-op fast + // paths on IIgs). + if (joeyAudioInit()) { + joeyLogF("UBER: audioInit OK\n"); + } else { + joeyLogF("UBER: audioInit failed (skipping audio)\n"); + } + + // Reset stage + run all per-frame timed tests. + surfaceClear(gStage, 0); + fillRect(gStage, 0, 0, 320, 8, 1); + stagePresent(); + + runAllTests(); + + // Done. Green screen + waitForKey. + surfaceClear(gStage, 2); + stagePresent(); + + joeyLogF("UBER: press any key to exit\n"); + joeyWaitForAnyKey(); + + spriteDestroy(gSprite); + joeyShutdown(); + return 0; +} diff --git a/include/joey/core.h b/include/joey/core.h index 95f55e7..256f95e 100644 --- a/include/joey/core.h +++ b/include/joey/core.h @@ -37,4 +37,17 @@ const char *joeyVersionString(void); // always a hardware-level wait, not a software timer. void joeyWaitVBL(void); +// Monotonic 16-bit frame counter. Polled by callers; ports detect +// the rising edge inside this call (IIgs $C019, DOS $3DA, Amiga +// VPOSR) or expose a counter maintained by a VBL ISR (Atari ST). +// Caller must poll faster than 2 * joeyFrameHz() so no edge is +// missed. Used by benchmarks and frame-rate-independent animation. +uint16_t joeyFrameCount(void); + +// Nominal display frame rate in Hz: 50 (Amiga PAL), 60 (IIgs / ST +// NTSC default), 70 (VGA mode 13h). The actual VBL cadence may +// drift slightly; the value reported here is what benchmarks divide +// by to convert iters-per-N-frames to ops/sec. 
+uint16_t joeyFrameHz(void); + #endif diff --git a/include/joey/sprite.h b/include/joey/sprite.h index 32ed7bb..0a34ee8 100644 --- a/include/joey/sprite.h +++ b/include/joey/sprite.h @@ -103,6 +103,20 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit // by other writes that overlapped its captured region. void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup); +// Combined save-then-draw entry point. The common animation pattern +// captures the destination bytes about to be overwritten, then draws +// the sprite. Both ops share validation, the destination ptr is +// computed once, and a single dirty-rect mark covers both. Saves +// roughly one full dispatcher chain (~150 cyc on IIgs ORCA-C) per +// frame versus calling spriteSaveUnder + spriteDraw separately. +// +// Identical semantics to: +// spriteSaveUnder(s, sp, x, y, backup); +// spriteDraw(s, sp, x, y); +// modulo: the dirty rect is marked once for the union (which here is +// just the draw rect, since save doesn't write). +void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup); + // Snapshot an 8x8-aligned region of a SurfaceT into a new SpriteT. // The captured pixel data is copied into a sprite-owned buffer so // the source surface can be modified afterwards. Width and height diff --git a/make/amiga.mk b/make/amiga.mk index 5429346..4e2f86b 100644 --- a/make/amiga.mk +++ b/make/amiga.mk @@ -70,6 +70,8 @@ SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c SPRITE_BIN := $(BINDIR)/Sprite AUDIO_SRC := $(EXAMPLES)/audio/audio.c AUDIO_BIN := $(BINDIR)/Audio +UBER_SRC := $(EXAMPLES)/uber/uber.c +UBER_BIN := $(BINDIR)/Uber # Game data lives under bin/DATA/, ready to be copied into the # scratch JOEYLIB hard-drive dir staged by scripts/run-amiga.sh. 
@@ -78,7 +80,7 @@ DATA_DIR := $(BINDIR)/DATA DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx .PHONY: all amiga clean-amiga -all amiga: $(LIB) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) +all amiga: $(LIB) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(DATA_FILES) $(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c @mkdir -p $(dir $@) @@ -140,6 +142,10 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB) @mkdir -p $(dir $@) $(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS) +$(UBER_BIN): $(UBER_SRC) $(LIB) + @mkdir -p $(dir $@) + $(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS) + $(DATA_DIR)/test.mod: $(REPO_DIR)/assets/test.mod @mkdir -p $(DATA_DIR) cp $< $@ diff --git a/make/atarist.mk b/make/atarist.mk index 63c87d7..a30f42e 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -55,6 +55,8 @@ SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c SPRITE_BIN := $(BINDIR)/SPRITE.PRG AUDIO_SRC := $(EXAMPLES)/audio/audio.c AUDIO_BIN := $(BINDIR)/AUDIO.PRG +UBER_SRC := $(EXAMPLES)/uber/uber.c +UBER_BIN := $(BINDIR)/UBER.PRG # Game data lives under bin/DATA/, alongside the binaries Hatari picks # up when bin/ is mounted as the GEMDOS C: drive. 
audio.c fopens @@ -63,7 +65,7 @@ DATA_DIR := $(BINDIR)/DATA DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx .PHONY: all atarist clean-atarist -all atarist: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) +all atarist: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(DATA_FILES) $(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c @mkdir -p $(dir $@) @@ -132,6 +134,10 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB) @mkdir -p $(dir $@) $(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS) +$(UBER_BIN): $(UBER_SRC) $(LIB) + @mkdir -p $(dir $@) + $(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS) + $(DATA_DIR)/test.mod: $(REPO_DIR)/assets/test.mod @mkdir -p $(DATA_DIR) cp $< $@ diff --git a/make/dos.mk b/make/dos.mk index 5becb7d..ee960ec 100644 --- a/make/dos.mk +++ b/make/dos.mk @@ -49,6 +49,8 @@ SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c SPRITE_BIN := $(BINDIR)/SPRITE.EXE AUDIO_SRC := $(EXAMPLES)/audio/audio.c AUDIO_BIN := $(BINDIR)/AUDIO.EXE +UBER_SRC := $(EXAMPLES)/uber/uber.c +UBER_BIN := $(BINDIR)/UBER.EXE # Game data lives under bin/DATA/, alongside the binaries DOSBox picks # up when bin/ is mounted as C:. audio.c fopens "DATA/test.mod" etc. 
@@ -56,7 +58,7 @@ DATA_DIR := $(BINDIR)/DATA DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx .PHONY: all dos clean-dos -all dos: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) +all dos: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(DATA_FILES) $(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c @mkdir -p $(dir $@) @@ -121,6 +123,11 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB) $(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(DOS_EMBED_DPMI) $@ +$(UBER_BIN): $(UBER_SRC) $(LIB) + @mkdir -p $(dir $@) + $(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ + $(DOS_EMBED_DPMI) $@ + $(DATA_DIR)/test.mod: $(REPO_DIR)/assets/test.mod @mkdir -p $(DATA_DIR) cp $< $@ diff --git a/make/iigs.mk b/make/iigs.mk index eb107a6..a049d5c 100644 --- a/make/iigs.mk +++ b/make/iigs.mk @@ -49,23 +49,13 @@ NTP_BIN := $(BUILD)/audio/ntpplayer.bin NTP_ASM := $(BUILD)/audio/ntpdata.asm IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32 -# IMPORTANT: CODEGEN_SRCS (specifically spriteEmitIigs.c) MUST be the -# first entry after the main object in the link order. ORCA-Linker's -# bank assignment is order-sensitive: when spriteEmitIigs.c lands at -# any later position, the linker assigns SPRITECG to a bank where its -# intra-OMF-segment static-symbol relocations (emitMvnCopyRoutine, -# shiftedByteAt, writeLE16) can't be encoded -- you get cryptic -# "Addressing error" / "Unresolved reference Label: ..." failures -# whose root cause is bank packing, not source. Putting CODEGEN_SRCS -# first gives SPRITECG prime placement and the relocations resolve. -# This was the underlying cause of feedback_orca_link_segment_count -# cases 2-5 (we'd been working around it by managing _ROOT mass). 
-LIB_SRCS := $(CODEGEN_SRCS) $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) +LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS) -HELLO_SRC := $(EXAMPLES)/hello/hello.c -HELLO_BIN := $(BINDIR)/HELLO -PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c -PATTERN_BIN := $(BINDIR)/PATTERN +# HELLO and PATTERN are intentionally omitted from this list. The UBER +# demo (below) exercises every public API, including what those two +# small examples covered, and the IIgs disk image was running out of +# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/ +# for reference and for other ports that want them. DRAW_SRC := $(EXAMPLES)/draw/draw.c DRAW_BIN := $(BINDIR)/DRAW KEYS_SRC := $(EXAMPLES)/keys/keys.c @@ -74,6 +64,8 @@ JOY_SRC := $(EXAMPLES)/joy/joy.c JOY_BIN := $(BINDIR)/JOY SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c SPRITE_BIN := $(BINDIR)/SPRITE +UBER_SRC := $(EXAMPLES)/uber/uber.c +UBER_BIN := $(BINDIR)/UBER AUDIO_SRC := $(EXAMPLES)/audio/audio.c AUDIO_BIN := $(BINDIR)/AUDIO AUDIO_MOD := $(REPO_DIR)/assets/test.mod @@ -128,16 +120,6 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh # everywhere, so library asm can take SurfaceT* args via one # consistent ABI (small-mm 16-bit pointers truncated bank bytes, # which broke any asm that wanted to address bank-1 stage memory). 
-$(HELLO_BIN): $(HELLO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(HELLO_SRC) $(LIB_SRCS) - $(IIGS_IIX) chtyp -t S16 $@ - -$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) - @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS) - $(IIGS_IIX) chtyp -t S16 $@ - $(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS) @@ -158,6 +140,17 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ +# UBER bumps user stack to 16 KB. ORCA-C's default user stack is small +# (~1 KB) and vfprintf's parsing buffer + the demo's own stack-local +# format buffers were spilling past it -- the symptom was a crash to +# monitor on the second varargs-style joeyLogF call. NOTE(review): the +# hand-rolled decimal formatter (line[96], num[16]) once cited here is +# not in uber.c as committed; re-check. 16 KB is plenty of headroom. +$(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) + @mkdir -p $(dir $@) + $(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS) + $(IIGS_IIX) chtyp -t S16 $@ + # Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime # format via joeymod (which shells out to ntpconverter.php). Without # php-cli the conversion is skipped; in that case the IIgs disk just @@ -181,13 +174,13 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) $(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ -# Assemble an 800KB ProDOS 2img containing the examples, ready to -# mount in GSplus alongside a GS/OS boot volume. +# Assemble a ProDOS 2img containing the examples, ready to mount in +# GSplus alongside a GS/OS boot volume. 
iigs-disk: $(DISK_IMG) -$(DISK_IMG): $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) +$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) @mkdir -p $(dir $@) - $(IIGS_PACKAGE) $@ $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) -- $(AUDIO_DATA_FILES) + $(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES) clean-iigs: rm -rf $(BUILD) diff --git a/scripts/run-iigs.sh b/scripts/run-iigs.sh index af581e6..87d777f 100755 --- a/scripts/run-iigs.sh +++ b/scripts/run-iigs.sh @@ -1,29 +1,24 @@ #!/usr/bin/env bash # Launch the built Apple IIgs examples in GSplus. GSplus is booted from # a GS/OS 6.0.4 System disk (toolchains/emulators/support/gsos-system.po) -# with joey.2mg mounted as the data disk on slot 5 drive 2. The user -# navigates to the JOEYLIB volume in Finder and double-clicks the -# example to run it. +# with joey.2mg mounted as the data disk on slot 5 drive 2. GS/OS drops +# to Finder; the user navigates to the JOEYLIB volume and double-clicks +# whichever example they want to run. # -# Unlike the other emulators, GS/OS does not auto-run on boot -- it -# drops to Finder. The argument just prints a reminder of which -# example to launch. -# -# scripts/run-iigs.sh # boots (Pattern hint) -# scripts/run-iigs.sh hello # boots, hints HELLO -# scripts/run-iigs.sh draw # boots, hints DRAW -# -# Argument is any built example name (case-insensitive); upper-case -# it for the Finder hint and existence-check. +# No argument: GSplus has no way to dispatch a specific binary on boot +# (Finder is interactive), so this script just stages the disk and +# launches the emulator. 
The post-run trap below extracts joeylog.txt +# from the data disk so demos that left a breadcrumb file are visible +# from the host shell after the emulator exits. set -euo pipefail -if [[ $# -gt 1 ]]; then - echo "usage: $0 [example-name]" >&2 +if [[ $# -ne 0 ]]; then + echo "usage: $0" >&2 + echo " (no arguments -- launch GSplus, pick the demo in Finder)" >&2 exit 2 fi -prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) # profuse looks up its FST helpers under $GOLDEN_GATE / $ORCA_ROOT and @@ -38,18 +33,6 @@ sys_disk=$repo/toolchains/emulators/support/gsos-system.po data_disk=$repo/build/iigs/bin/joey.2mg null_c600=$repo/toolchains/emulators/support/iigs-null-c600.rom -target=${prog^^} -bin_dir=$repo/build/iigs/bin -if [[ ! -f "$bin_dir/$target" ]]; then - echo "$bin_dir/$target not built. Run 'make iigs' first." >&2 - if compgen -G "$bin_dir/*" > /dev/null; then - echo "available examples in $bin_dir:" >&2 - find "$bin_dir" -maxdepth 1 -type f -printf '%f\n' \ - | grep -vE '\.2mg$|\.txt$' >&2 || true - fi - exit 1 -fi - for f in "$gsplus" "$rom" "$sys_disk" "$data_disk" "$null_c600"; do if [[ ! -f $f ]]; then echo "missing: $f" >&2 @@ -123,7 +106,7 @@ cat <pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; - memcpy(destBytes, &destPtr, 4); - destAddr = (uint32_t)destBytes[0] - | ((uint32_t)destBytes[1] << 8) - | ((uint32_t)destBytes[2] << 16); + shift = (uint8_t)(x & 1); + destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; + destAddr = (uint32_t)destPtr; destOffset = (uint16_t)(destAddr & 0xFFFFu); destBank = (uint8_t)((destAddr >> 16) & 0xFFu); fnAddr = codegenArenaBaseAddr() @@ -248,9 +244,10 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) // fnAddr changes only on shift parity flips or sprite swaps. 
if (fnAddr != gDrawStubLastFnAddr) { - gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu); - gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu); - gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu); + const uint8_t *fnB_ = (const uint8_t *)&fnAddr; + gSpriteCallStub[ 9] = fnB_[0]; + gSpriteCallStub[10] = fnB_[1]; + gSpriteCallStub[11] = fnB_[2]; gDrawStubLastFnAddr = fnAddr; } @@ -329,20 +326,41 @@ static void patchMvnBanks(uint8_t *routine, uint16_t heightPx, uint8_t dstBank, } -// Common helper: dump a 24-bit pointer's raw bytes via memcpy -// (avoiding ORCA-C's lossy (uint32_t) pointer cast under memorymodel -// 1) and split into low 16 bits + bank. -static void splitPointer(const void *ptr, uint16_t *outLo, uint8_t *outBank) { - uint8_t bytes[4]; - uint32_t addr; - memcpy(bytes, &ptr, 4); - addr = (uint32_t)bytes[0] - | ((uint32_t)bytes[1] << 8) - | ((uint32_t)bytes[2] << 16); - *outLo = (uint16_t)(addr & 0xFFFFu); - *outBank = (uint8_t)((addr >> 16) & 0xFFu); -} +// Split a 24-bit pointer into its low 16 bits + bank byte. The +// (uint32_t) cast works correctly in ORCA/C 2.2.1 (the 2.1.0 lossy- +// bank-byte bug is fixed). To avoid invoking the ~LSHR4 32-bit-shift +// helper for the `>> 16` to extract the bank byte, we cast to +// uint32_t and then byte-alias the storage -- gets the same bytes +// with simple loads. +#define SPLIT_POINTER(_ptr, _outLo, _outBank) \ + do { \ + uint32_t spAddr_ = (uint32_t)(_ptr); \ + const uint8_t *spB_ = (const uint8_t *)&spAddr_; \ + *(_outLo) = (uint16_t)(spB_[0] | ((uint16_t)spB_[1] << 8)); \ + *(_outBank) = spB_[2]; \ + } while (0) + +// Backup-buffer pointer split cache. backup->bytes is a user-supplied +// buffer (e.g. a static array) and effectively never changes after +// the first call -- caching its split saves both Save and Restore the +// macro expansion per frame. 
+static const void *gLastBackupBytes = (const void *)0; +static uint16_t gLastBackupBytesLo = 0; +static uint8_t gLastBackupBytesBank = 0; + +#define SPLIT_BACKUP_CACHED(_bytes, _outLo, _outBank) \ + do { \ + if ((const void *)(_bytes) == gLastBackupBytes) { \ + *(_outLo) = gLastBackupBytesLo; \ + *(_outBank) = gLastBackupBytesBank; \ + } else { \ + SPLIT_POINTER((_bytes), (_outLo), (_outBank)); \ + gLastBackupBytes = (const void *)(_bytes); \ + gLastBackupBytesLo = *(_outLo); \ + gLastBackupBytesBank = *(_outBank); \ + } \ + } while (0) void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { @@ -358,6 +376,10 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_ uint32_t fnAddr; uint8_t *routine; uint8_t *screenPtr; + uint16_t cacheIdx; /* shift * SPRITE_OP_COUNT + SPRITE_OP_SAVE, computed once */ + uint8_t *cachedDst; /* &sp->cachedDstBank[0][0] + cacheIdx */ + uint8_t *cachedSrc; /* &sp->cachedSrcBank[0][0] + cacheIdx */ + uint16_t routineOffset; /* sp->routineOffsets[shift][SPRITE_OP_SAVE], computed once */ shift = (uint8_t)(x & 1); clippedX = (int16_t)(x & ~1); @@ -366,19 +388,39 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_ copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0)); screenPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)clippedX >> 1)]; - splitPointer(screenPtr, &screenLo, &screenBank); - splitPointer(backup->bytes, &backupLo, &backupBank); + SPLIT_POINTER(screenPtr, &screenLo, &screenBank); + SPLIT_BACKUP_CACHED(backup->bytes, &backupLo, &backupBank); backup->sprite = sp; backup->x = clippedX; backup->y = y; backup->width = (uint16_t)(copyBytes << 1); backup->height = heightPx; - backup->sizeBytes = (uint16_t)(copyBytes * heightPx); + /* sizeBytes is constant per (sprite, shift); cache to dodge the + * per-call ~CUMUL2 (uint16_t * uint16_t) helper. 
The byte-pointer + * arithmetic avoids reintroducing ~MUL4 for the uint16_t array + * indexing. */ + { + uint16_t *sizeCachePtr = (uint16_t *)((uint8_t *)sp->cachedSizeBytes + ((uint16_t)shift << 1)); + if (*sizeCachePtr == 0) { + *sizeCachePtr = (uint16_t)(copyBytes * heightPx); + } + backup->sizeBytes = *sizeCachePtr; + } + + /* Compute the 1D index into the cached* / routineOffsets 2D arrays + * once. ORCA-C 2.2.1 lowers `shift * SPRITE_OP_COUNT` (where + * SPRITE_OP_COUNT==3) to a ~MUL4 helper call; (shift<<1)+shift + * compiles to two ASLs and an ADC, no helper. */ + cacheIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); + cachedDst = (uint8_t *)sp->cachedDstBank + cacheIdx; + cachedSrc = (uint8_t *)sp->cachedSrcBank + cacheIdx; + /* Same byte-pointer trick as SURFACE_ROW_OFFSET to dodge ~MUL4. */ + routineOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (cacheIdx << 1)); fnAddr = codegenArenaBaseAddr() + sp->slot->offset - + (uint32_t)sp->routineOffsets[shift][SPRITE_OP_SAVE]; + + (uint32_t)routineOffset; // Stub: X = screen (source), Y = backup (destination). if (!gSaveStubInited) { @@ -401,22 +443,22 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_ gSaveStubLastYLo = backupLo; } if (fnAddr != gSaveStubLastFnAddr) { - gSpriteSaveStub[ 8] = (unsigned char)(fnAddr & 0xFFu); - gSpriteSaveStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu); - gSpriteSaveStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu); + /* Byte-alias the uint32_t to grab the 3 bank/lo/hi bytes + * without invoking ~LSHR4 for the >>16. */ + const uint8_t *fnB_ = (const uint8_t *)&fnAddr; + gSpriteSaveStub[ 8] = fnB_[0]; + gSpriteSaveStub[ 9] = fnB_[1]; + gSpriteSaveStub[10] = fnB_[2]; gSaveStubLastFnAddr = fnAddr; } // Skip the 16+ MVN-bank rewrites if the dst/src bank pair is the - // same as last call. 
Screen and backup buffer banks are stable - // for essentially every frame past the first, so this short- - // circuits ~5000 cyc/frame on the ball demo. - if (sp->cachedDstBank[shift][SPRITE_OP_SAVE] != backupBank || - sp->cachedSrcBank[shift][SPRITE_OP_SAVE] != screenBank) { - routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]; + // same as last call. + if (*cachedDst != backupBank || *cachedSrc != screenBank) { + routine = codegenArenaBase() + sp->slot->offset + routineOffset; patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank); - sp->cachedDstBank[shift][SPRITE_OP_SAVE] = backupBank; - sp->cachedSrcBank[shift][SPRITE_OP_SAVE] = screenBank; + *cachedDst = backupBank; + *cachedSrc = screenBank; } // MVN-based routine: needs M=16 / X=16; restore M=16 on exit @@ -442,6 +484,10 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { uint8_t *routine; uint8_t *screenPtr; SpriteT *sp; + uint16_t cacheIdx; /* shift * SPRITE_OP_COUNT + SPRITE_OP_RESTORE, computed once */ + uint8_t *cachedDst; + uint8_t *cachedSrc; + uint16_t routineOffset; sp = backup->sprite; heightPx = backup->height; @@ -450,12 +496,19 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { shift = (copyBytes == spriteBytesPerRow) ? 0 : 1; screenPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)]; - splitPointer(screenPtr, &screenLo, &screenBank); - splitPointer(backup->bytes, &backupLo, &backupBank); + SPLIT_POINTER(screenPtr, &screenLo, &screenBank); + SPLIT_BACKUP_CACHED(backup->bytes, &backupLo, &backupBank); + + /* Hoist 2D-array indexing -- see save-side comment. */ + cacheIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_RESTORE); + cachedDst = (uint8_t *)sp->cachedDstBank + cacheIdx; + cachedSrc = (uint8_t *)sp->cachedSrcBank + cacheIdx; + /* Same byte-pointer trick as SURFACE_ROW_OFFSET to dodge ~MUL4. 
*/ + routineOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (cacheIdx << 1)); fnAddr = codegenArenaBaseAddr() + sp->slot->offset - + (uint32_t)sp->routineOffsets[shift][SPRITE_OP_RESTORE]; + + (uint32_t)routineOffset; // Stub: X = backup (source), Y = screen (destination). if (!gRestoreStubInited) { @@ -478,20 +531,20 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { gRestoreStubLastYLo = screenLo; } if (fnAddr != gRestoreStubLastFnAddr) { - gSpriteRestoreStub[ 8] = (unsigned char)(fnAddr & 0xFFu); - gSpriteRestoreStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu); - gSpriteRestoreStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu); + const uint8_t *fnB_ = (const uint8_t *)&fnAddr; + gSpriteRestoreStub[ 8] = fnB_[0]; + gSpriteRestoreStub[ 9] = fnB_[1]; + gSpriteRestoreStub[10] = fnB_[2]; gRestoreStubLastFnAddr = fnAddr; } // Same short-circuit as save: only re-stamp the bank operands if // they actually changed since last call. - if (sp->cachedDstBank[shift][SPRITE_OP_RESTORE] != screenBank || - sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] != backupBank) { - routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]; + if (*cachedDst != screenBank || *cachedSrc != backupBank) { + routine = codegenArenaBase() + sp->slot->offset + routineOffset; patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank); - sp->cachedDstBank[shift][SPRITE_OP_RESTORE] = screenBank; - sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] = backupBank; + *cachedDst = screenBank; + *cachedSrc = backupBank; } asm { diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c index a5fda1e..4975c1c 100644 --- a/src/codegen/spriteEmitIigs.c +++ b/src/codegen/spriteEmitIigs.c @@ -31,16 +31,6 @@ #include "spriteEmitter.h" #include "spriteInternal.h" -// Pin the IIgs sprite codegen statics into their own load segment -// instead of letting them ride in _ROOT. 
_ROOT also collects every -// other unsegmented .c (init.c, sprite.c, present.c, the example -// main, ...), so growth in any of those can shift the linker's -// per-bank packing and orphan intra-file static refs (we hit this -// when DRAWPRIMS grew with the chunked PEI-slam: PATTERN's link -// reported "Unresolved reference: emitMvnCopyRoutine" purely from -// _ROOT crowding). A dedicated load segment isolates this file. -JOEYLIB_SEGMENT("SPRITECG") - // ----- Constants ----- diff --git a/src/core/asset.c b/src/core/asset.c index 173cd49..d4fd9b6 100644 --- a/src/core/asset.c +++ b/src/core/asset.c @@ -12,8 +12,6 @@ #include "joey/asset.h" #include "joey/palette.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") #define JAS_HEADER_SIZE 44 #define JAS_PIXELS_OFFSET JAS_HEADER_SIZE diff --git a/src/core/audio.c b/src/core/audio.c index 3a39207..8a4b92f 100644 --- a/src/core/audio.c +++ b/src/core/audio.c @@ -8,8 +8,6 @@ #include "joey/audio.h" #include "hal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") static bool gAudioReady = false; @@ -79,5 +77,10 @@ void joeyAudioFrameTick(void) { if (!gAudioReady) { return; } +#ifndef JOEYLIB_PLATFORM_IIGS + // IIgs: NTPstreamsound is fully DOC-IRQ-driven, halAudioFrameTick + // is an empty no-op there. Skip the wrapper JSL entirely on IIgs + // so per-frame audio cost stays at the gAudioReady branch above. halAudioFrameTick(); +#endif } diff --git a/src/core/codegenArena.c b/src/core/codegenArena.c index 4bd3eac..80a5f1a 100644 --- a/src/core/codegenArena.c +++ b/src/core/codegenArena.c @@ -23,22 +23,27 @@ #include "codegenArenaInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") // ----- Module state ----- -static uint8_t *gBase = NULL; -// gBaseAddr mirrors gBase as a 24-bit absolute address. 
ORCA-C's -// (uint32_t)pointer cast on the IIgs zeros the bank byte for some -// pointer expressions, so JSL targets read this field directly. -static uint32_t gBaseAddr = 0; +// gCodegenArenaBase / gCodegenArenaBaseAddr are non-static so spriteCompile.c can read them +// directly via extern instead of paying a JSL/RTL per access through +// the codegenArenaBase() / codegenArenaBaseAddr() wrappers. Both are +// set once at codegenArenaInit and never moved (the underlying +// Memory Manager handle is locked-in-place on IIgs). Callers MUST +// treat them as read-only. +uint8_t *gCodegenArenaBase = NULL; +// gCodegenArenaBaseAddr mirrors gCodegenArenaBase as a 24-bit +// absolute address. ORCA-C's (uint32_t)pointer cast on the IIgs +// zeros the bank byte for some pointer expressions, so JSL targets +// read this field directly. +uint32_t gCodegenArenaBaseAddr = 0; static uint32_t gTotalBytes = 0; static uint32_t gUsedBytes = 0; static ArenaSlotT *gFirstSlot = NULL; #if defined(JOEYLIB_PLATFORM_IIGS) -static Handle gBaseHandle = NULL; +static Handle gCodegenArenaBaseHandle = NULL; #endif @@ -93,7 +98,7 @@ ArenaSlotT *codegenArenaAlloc(uint32_t bytes) { ArenaSlotT *slot; ArenaSlotT *remainder; - if (gBase == NULL || bytes == 0) { + if (gCodegenArenaBase == NULL || bytes == 0) { return NULL; } for (slot = gFirstSlot; slot != NULL; slot = slot->next) { @@ -123,14 +128,11 @@ ArenaSlotT *codegenArenaAlloc(uint32_t bytes) { } -uint8_t *codegenArenaBase(void) { - return gBase; -} - - -uint32_t codegenArenaBaseAddr(void) { - return gBaseAddr; -} +// codegenArenaBase() / codegenArenaBaseAddr() are now header-only +// macros that read gCodegenArenaBase / gCodegenArenaBaseAddr +// directly, so the C function bodies that used to live here are +// gone. The wrappers cost ~30 cyc per call on IIgs and were hit +// 3x per sprite frame. 
uint32_t codegenArenaBytesTotal(void) { @@ -149,7 +151,7 @@ void codegenArenaCompact(void) { ArenaSlotT *trailing; uint32_t cursor; - if (gBase == NULL) { + if (gCodegenArenaBase == NULL) { return; } cursor = 0; @@ -158,7 +160,7 @@ void codegenArenaCompact(void) { next = slot->next; if (slot->used) { if (slot->offset != cursor) { - memmove(gBase + cursor, gBase + slot->offset, slot->size); + memmove(gCodegenArenaBase + cursor, gCodegenArenaBase + slot->offset, slot->size); slot->offset = cursor; } cursor += slot->size; @@ -200,7 +202,7 @@ void codegenArenaCompact(void) { void codegenArenaFree(ArenaSlotT *slot) { - if (slot == NULL || gBase == NULL) { + if (slot == NULL || gCodegenArenaBase == NULL) { return; } if (!slot->used) { @@ -215,21 +217,21 @@ void codegenArenaFree(ArenaSlotT *slot) { bool codegenArenaInit(uint32_t totalBytes) { - if (gBase != NULL) { + if (gCodegenArenaBase != NULL) { return true; } if (totalBytes == 0) { return false; } #if defined(JOEYLIB_PLATFORM_IIGS) - gBaseHandle = NewHandle(totalBytes, _ownerid, + gCodegenArenaBaseHandle = NewHandle(totalBytes, _ownerid, attrFixed | attrLocked | attrPage | attrNoCross, NULL); - if (gBaseHandle == NULL || _toolErr != 0) { - gBaseHandle = NULL; + if (gCodegenArenaBaseHandle == NULL || _toolErr != 0) { + gCodegenArenaBaseHandle = NULL; return false; } - HLock(gBaseHandle); + HLock(gCodegenArenaBaseHandle); // Capture the 24-bit absolute address by copying the Pointer's // raw bytes -- (uint32_t)pointer through a chain of expressions // has been observed to drop the bank byte under ORCA-C's @@ -238,35 +240,35 @@ bool codegenArenaInit(uint32_t totalBytes) { { Pointer p; uint8_t bytes[4]; - p = *gBaseHandle; - gBase = (uint8_t *)p; + p = *gCodegenArenaBaseHandle; + gCodegenArenaBase = (uint8_t *)p; memcpy(bytes, &p, 4); - gBaseAddr = (uint32_t)bytes[0] + gCodegenArenaBaseAddr = (uint32_t)bytes[0] | ((uint32_t)bytes[1] << 8) | ((uint32_t)bytes[2] << 16); } - if (gBase == NULL) { - 
DisposeHandle(gBaseHandle); - gBaseHandle = NULL; + if (gCodegenArenaBase == NULL) { + DisposeHandle(gCodegenArenaBaseHandle); + gCodegenArenaBaseHandle = NULL; return false; } #else - gBase = (uint8_t *)malloc(totalBytes); - if (gBase == NULL) { + gCodegenArenaBase = (uint8_t *)malloc(totalBytes); + if (gCodegenArenaBase == NULL) { return false; } - gBaseAddr = (uint32_t)gBase; + gCodegenArenaBaseAddr = (uint32_t)gCodegenArenaBase; #endif gFirstSlot = newSlot(0, totalBytes, false); if (gFirstSlot == NULL) { #if defined(JOEYLIB_PLATFORM_IIGS) - DisposeHandle(gBaseHandle); - gBaseHandle = NULL; + DisposeHandle(gCodegenArenaBaseHandle); + gCodegenArenaBaseHandle = NULL; #else - free(gBase); + free(gCodegenArenaBase); #endif - gBase = NULL; - gBaseAddr = 0; + gCodegenArenaBase = NULL; + gCodegenArenaBaseAddr = 0; return false; } gTotalBytes = totalBytes; @@ -279,7 +281,7 @@ void codegenArenaShutdown(void) { ArenaSlotT *slot; ArenaSlotT *next; - if (gBase == NULL) { + if (gCodegenArenaBase == NULL) { return; } for (slot = gFirstSlot; slot != NULL; slot = next) { @@ -287,13 +289,13 @@ void codegenArenaShutdown(void) { free(slot); } #if defined(JOEYLIB_PLATFORM_IIGS) - DisposeHandle(gBaseHandle); - gBaseHandle = NULL; + DisposeHandle(gCodegenArenaBaseHandle); + gCodegenArenaBaseHandle = NULL; #else - free(gBase); + free(gCodegenArenaBase); #endif - gBase = NULL; - gBaseAddr = 0; + gCodegenArenaBase = NULL; + gCodegenArenaBaseAddr = 0; gFirstSlot = NULL; gTotalBytes = 0; gUsedBytes = 0; diff --git a/src/core/codegenArenaInternal.h b/src/core/codegenArenaInternal.h index 1e28347..b3f4208 100644 --- a/src/core/codegenArenaInternal.h +++ b/src/core/codegenArenaInternal.h @@ -58,14 +58,15 @@ void codegenArenaCompact(void); // Used for spriteDraw's address computation. The base pointer is // stable for the lifetime of the arena; only slot->offset moves. -uint8_t *codegenArenaBase(void); - -// Same address as codegenArenaBase() but returned as an integer. 
The -// IIgs JSL trampoline needs the 24-bit absolute address as a number -// it can split into bank/offset bytes; ORCA-C's pointer-to-uint32_t -// cast has dropped the bank byte in some expressions, so we expose -// the integer view directly. -uint32_t codegenArenaBaseAddr(void); +// +// Direct extern access (instead of a getter function) so per-frame +// hot paths in spriteCompile.c skip the JSL/PHB/RTL/PLB the wrapper +// would impose. Both globals are read-only after codegenArenaInit; +// the function-form getters below are kept as a back-compat shim. +extern uint8_t *gCodegenArenaBase; +extern uint32_t gCodegenArenaBaseAddr; +#define codegenArenaBase() ((uint8_t *)gCodegenArenaBase) +#define codegenArenaBaseAddr() ((uint32_t)gCodegenArenaBaseAddr) // Public-API support: sum of live slot sizes, total arena size. // Difference is free space (which may be fragmented across holes diff --git a/src/core/debug.c b/src/core/debug.c index bb85d7d..b748ac2 100644 --- a/src/core/debug.c +++ b/src/core/debug.c @@ -13,8 +13,6 @@ #include "joey/platform.h" #include "joey/debug.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") static const char *kLogPath = "joeylog.txt"; diff --git a/src/core/draw.c b/src/core/draw.c index ee12b79..c60d092 100644 --- a/src/core/draw.c +++ b/src/core/draw.c @@ -12,12 +12,6 @@ #include "hal.h" #include "surfaceInternal.h" -// On IIgs, hoist all primitive functions out of _ROOT into a named -// DRAWPRIMS load segment. drawLine/drawCircle/fillCircle/floodFill/ -// floodFillBounded together push past the 64 KB-per-bank budget for -// the simpler binaries (PATTERN was the first to fail). On other -// ports this macro vanishes. 
-JOEYLIB_SEGMENT("DRAWPRIMS") // ----- Constants ----- @@ -107,25 +101,28 @@ static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_ uint8_t nibble = colorIndex & 0x0F; uint8_t doubled = (uint8_t)((nibble << 4) | nibble); int16_t row; - int16_t pxStart; - int16_t pxEnd; - int16_t midBytes; + uint16_t pxStart; + uint16_t pxEnd; + uint16_t midBytes; uint8_t *line; + /* px* and midBytes are uint16_t (clipped values are non-negative) + * so `>>1` lowers to a single LSR instead of ORCA-C's + * ~SSHIFTRIGHT helper. Same with `<<1` for midBytes. */ for (row = 0; row < h; row++) { line = &s->pixels[SURFACE_ROW_OFFSET(y + row)]; - pxStart = x; - pxEnd = x + w; + pxStart = (uint16_t)x; + pxEnd = (uint16_t)(x + w); - if (pxStart & 1) { + if (pxStart & 1u) { line[pxStart >> 1] = (uint8_t)((line[pxStart >> 1] & 0xF0) | nibble); pxStart++; } - midBytes = (pxEnd - pxStart) >> 1; - if (midBytes > 0) { + midBytes = (uint16_t)((pxEnd - pxStart) >> 1); + if (midBytes > 0u) { memset(&line[pxStart >> 1], doubled, (size_t)midBytes); - pxStart += midBytes << 1; + pxStart = (uint16_t)(pxStart + (midBytes << 1)); } if (pxStart < pxEnd) { @@ -343,7 +340,10 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 static void dstPixel(uint8_t *row, int16_t x, uint8_t nibble) { uint8_t *byte; - byte = &row[x >> 1]; + /* `(uint16_t)x >> 1` instead of `x >> 1` -- caller has already + * range-checked x non-negative, and unsigned shift dodges the + * ~SSHIFTRIGHT helper ORCA-C emits for signed `>>`. 
*/ + byte = &row[(uint16_t)x >> 1]; if (x & 1) { *byte = (uint8_t)((*byte & 0xF0) | nibble); } else { @@ -355,7 +355,7 @@ static void dstPixel(uint8_t *row, int16_t x, uint8_t nibble) { static uint8_t srcPixel(const uint8_t *row, int16_t x) { uint8_t byte; - byte = row[x >> 1]; + byte = row[(uint16_t)x >> 1]; if (x & 1) { return (uint8_t)(byte & 0x0F); } @@ -407,11 +407,13 @@ void drawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIn drawPixel(s, (int16_t)(cx + y), (int16_t)(cy - x), colorIndex); drawPixel(s, (int16_t)(cx - y), (int16_t)(cy - x), colorIndex); y++; + /* Use `+ + 1` instead of `2 * y + 1` so ORCA-C never emits + * the ~SMUL2 helper -- two ADDs are unconditionally cheaper. */ if (err <= 0) { - err = (int16_t)(err + 2 * y + 1); + err = (int16_t)(err + y + y + 1); } else { x--; - err = (int16_t)(err + 2 * (y - x) + 1); + err = (int16_t)(err + y + y - x - x + 1); } } } @@ -502,7 +504,9 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) { } if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) { - byte = &s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)]; + /* Cast to uint16_t before shift -- already validated x >= 0, + * so unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */ + byte = &s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; nibble = colorIndex & 0x0F; if (x & 1) { *byte = (uint8_t)((*byte & 0xF0) | nibble); @@ -571,20 +575,26 @@ void fillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIn // (y+1)^2 = y^2 + 2y + 1; (x-1)^2 = x^2 - 2x + 1. r is uint16_t // so xx, yy, r2 fit in uint16_t for any r where x*x+y*y can equal // r2 (i.e. r <= 255 -> r2 <= 65025). + /* Same `+ +` pattern as drawCircle so ORCA-C doesn't emit ~SMUL2 / + * ~CUMUL2 helpers for the `2 * ...` constants. spanWidth is hoisted + * because both fillRect calls in the body need it. 
*/ xx = (uint16_t)(r * r); r2 = xx; yy = 0; x = (int16_t)r; for (y = 0; y <= (int16_t)r; y++) { + uint16_t spanWidth; + while (xx + yy > r2) { - xx = (uint16_t)(xx - (uint16_t)(2 * x - 1)); + xx = (uint16_t)(xx - (uint16_t)((uint16_t)x + (uint16_t)x - 1u)); x--; } - fillRect(s, (int16_t)(cx - x), (int16_t)(cy + y), (uint16_t)(2 * x + 1), 1, colorIndex); + spanWidth = (uint16_t)((uint16_t)x + (uint16_t)x + 1u); + fillRect(s, (int16_t)(cx - x), (int16_t)(cy + y), spanWidth, 1, colorIndex); if (y > 0) { - fillRect(s, (int16_t)(cx - x), (int16_t)(cy - y), (uint16_t)(2 * x + 1), 1, colorIndex); + fillRect(s, (int16_t)(cx - x), (int16_t)(cy - y), spanWidth, 1, colorIndex); } - yy = (uint16_t)(yy + (uint16_t)(2 * y + 1)); + yy = (uint16_t)(yy + (uint16_t)((uint16_t)y + (uint16_t)y + 1u)); } } @@ -668,11 +678,16 @@ uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) { return 0; } - byte = s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)]; + /* Cast to uint16_t before shift -- already validated x >= 0, + * unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */ + byte = s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; if (x & 1) { return (uint8_t)(byte & 0x0F); } - return (uint8_t)(byte >> 4); + /* `byte >> 4` is uint8_t but ORCA-C promotes to int (signed 16-bit) + * for the shift, then narrows -- triggers ~SSHIFTRIGHT. The + * mask-then-shift sidesteps the promotion path. */ + return (uint8_t)((byte & 0xF0u) >> 4); } diff --git a/src/core/hal.h b/src/core/hal.h index 2726846..fec0777 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -58,6 +58,19 @@ void halInputPoll(void); // graphics.library WaitTOF, XBIOS Vsync, $C019 polling). void halWaitVBL(void); +// Monotonic 16-bit frame counter. Caller polls; ports either detect +// the rising edge inside this call (IIgs $C019 / DOS $3DA / Amiga +// VPOSR) or return a counter maintained by a VBL ISR (ST). Required +// caller invariant: poll faster than 2 * halFrameHz() so no edge is +// missed. 
Used by benchmarks; cheap enough for animation cadence too. +uint16_t halFrameCount(void); + +// Nominal display frame rate in Hz (50 PAL Amiga, 60 NTSC IIgs / ST, +// ~70 VGA mode 13h). Reported only -- no API contract that VBLs +// arrive at exactly this rate. Benchmarks divide by it to convert +// iters-per-N-frames to ops/sec. +uint16_t halFrameHz(void); + // Audio: per-port engine setup, module + SFX playback, teardown. // halAudioInit returns true if the platform has a working engine. // All entry points are safe to call when init failed -- they become @@ -278,11 +291,12 @@ extern uint16_t gFloodRightX; // Tile primitives operate on caller-computed row pointers; just // forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte -// offset within the surface. +// offset within the surface. Use SURFACE_ROW_OFFSET (LUT lookup) to +// dodge ORCA-C 2.2.1's ~CUMUL2 helper for the *160 multiply. #undef halFastTileFill #define halFastTileFill(_s, _bx, _by, _fw) \ - (iigsTileFillInner(&(_s)->pixels[(uint16_t)(_by) * 8 * SURFACE_BYTES_PER_ROW \ - + (uint16_t)(_bx) * 4], \ + (iigsTileFillInner(&(_s)->pixels[SURFACE_ROW_OFFSET((uint16_t)(_by) << 3) \ + + ((uint16_t)(_bx) << 2)], \ (_fw)), \ true) diff --git a/src/core/init.c b/src/core/init.c index fc263ff..d3b53f9 100644 --- a/src/core/init.c +++ b/src/core/init.c @@ -12,8 +12,6 @@ #include "hal.h" #include "surfaceInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). 
-JOEYLIB_SEGMENT("CORESYS") // 8 KB fits the largest typical sprite working set (~3-4 KB per // 32x32 sprite at all opaque) and keeps malloc requests small enough @@ -121,3 +119,13 @@ const char *joeyVersionString(void) { void joeyWaitVBL(void) { halWaitVBL(); } + + +uint16_t joeyFrameCount(void) { + return halFrameCount(); +} + + +uint16_t joeyFrameHz(void) { + return halFrameHz(); +} diff --git a/src/core/input.c b/src/core/input.c index 28e65c5..aed3020 100644 --- a/src/core/input.c +++ b/src/core/input.c @@ -15,34 +15,39 @@ #include "hal.h" #include "inputInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") -bool gKeyState [KEY_COUNT]; -bool gKeyPrev [KEY_COUNT]; +// See inputInternal.h for why these are uint8_t and not bool. +uint8_t gKeyState [KEY_COUNT]; +uint8_t gKeyPrev [KEY_COUNT]; int16_t gMouseX = 0; int16_t gMouseY = 0; -bool gMouseButtonState[MOUSE_BUTTON_COUNT]; -bool gMouseButtonPrev [MOUSE_BUTTON_COUNT]; +uint8_t gMouseButtonState[MOUSE_BUTTON_COUNT]; +uint8_t gMouseButtonPrev [MOUSE_BUTTON_COUNT]; -bool gJoyConnected [JOYSTICK_COUNT]; +uint8_t gJoyConnected [JOYSTICK_COUNT]; int8_t gJoyAxisX [JOYSTICK_COUNT]; int8_t gJoyAxisY [JOYSTICK_COUNT]; -bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT]; -bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT]; +uint8_t gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT]; +uint8_t gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT]; uint8_t gJoyDeadZone [JOYSTICK_COUNT]; #ifdef JOEYLIB_PLATFORM_IIGS extern void iigsInputSnapshot(void); -// Build-time check: iigsInputSnapshot's asm hard-codes KEY_COUNT=60 -// and the small button counts. If a future change adds/removes keys -// or buttons the asm must be updated; this declares a zero-size -// array if the math no longer matches, which is a compile error. -typedef int joey_keycount_check[(KEY_COUNT == 60) ? 1 : -1]; -typedef int joey_mousebtn_check[(MOUSE_BUTTON_COUNT == 4) ? 
1 : -1]; -typedef int joey_joybtn_check[(JOYSTICK_COUNT * JOY_BUTTON_COUNT == 4) ? 1 : -1]; +// Build-time checks: iigsInputSnapshot's asm hard-codes KEY_COUNT=60 +// and the small button counts, and walks every array one byte per +// element. If a future change adds/removes keys or buttons the asm +// must be updated; if anyone re-types the arrays back to bool the +// per-element size grows to ORCA-C's 2-byte _Bool and the asm reads +// the wrong bytes. Either condition declares a zero-size array +// below, which is a compile error. +typedef int joey_keycount_check [(KEY_COUNT == 60) ? 1 : -1]; +typedef int joey_mousebtn_check [(MOUSE_BUTTON_COUNT == 4) ? 1 : -1]; +typedef int joey_joybtn_check [(JOYSTICK_COUNT * JOY_BUTTON_COUNT == 4) ? 1 : -1]; +typedef int joey_keystate_size_check [(sizeof(gKeyState) == KEY_COUNT) ? 1 : -1]; +typedef int joey_mousebtn_size_check [(sizeof(gMouseButtonState) == MOUSE_BUTTON_COUNT) ? 1 : -1]; +typedef int joey_joybtn_size_check [(sizeof(gJoyButtonState) == JOYSTICK_COUNT * JOY_BUTTON_COUNT) ? 1 : -1]; #endif void joeyInputPoll(void) { @@ -79,8 +84,14 @@ void joeyWaitForAnyKey(void) { } +/* All six key/mouse predicates fold the lower-bound check (`<= NONE`) + * and upper-bound check (`>= COUNT`) into a single unsigned compare. + * Index 0 (KEY_NONE / MOUSE_BUTTON_NONE) is a sentinel that no HAL + * ever writes, so reading gKeyState[0] / gMouseButtonState[0] is + * always 0 -- the predicate result is unchanged but ORCA-C drops the + * compound `||` into one branch each. 
*/ bool joeyKeyDown(JoeyKeyE key) { - if (key <= KEY_NONE || key >= KEY_COUNT) { + if ((uint16_t)key >= (uint16_t)KEY_COUNT) { return false; } return gKeyState[key]; @@ -88,7 +99,7 @@ bool joeyKeyDown(JoeyKeyE key) { bool joeyKeyPressed(JoeyKeyE key) { - if (key <= KEY_NONE || key >= KEY_COUNT) { + if ((uint16_t)key >= (uint16_t)KEY_COUNT) { return false; } return gKeyState[key] && !gKeyPrev[key]; @@ -96,7 +107,7 @@ bool joeyKeyPressed(JoeyKeyE key) { bool joeyKeyReleased(JoeyKeyE key) { - if (key <= KEY_NONE || key >= KEY_COUNT) { + if ((uint16_t)key >= (uint16_t)KEY_COUNT) { return false; } return !gKeyState[key] && gKeyPrev[key]; @@ -104,7 +115,7 @@ bool joeyKeyReleased(JoeyKeyE key) { bool joeyMouseDown(JoeyMouseButtonE button) { - if (button <= MOUSE_BUTTON_NONE || button >= MOUSE_BUTTON_COUNT) { + if ((uint16_t)button >= (uint16_t)MOUSE_BUTTON_COUNT) { return false; } return gMouseButtonState[button]; @@ -112,7 +123,7 @@ bool joeyMouseDown(JoeyMouseButtonE button) { bool joeyMousePressed(JoeyMouseButtonE button) { - if (button <= MOUSE_BUTTON_NONE || button >= MOUSE_BUTTON_COUNT) { + if ((uint16_t)button >= (uint16_t)MOUSE_BUTTON_COUNT) { return false; } return gMouseButtonState[button] && !gMouseButtonPrev[button]; @@ -120,7 +131,7 @@ bool joeyMousePressed(JoeyMouseButtonE button) { bool joeyMouseReleased(JoeyMouseButtonE button) { - if (button <= MOUSE_BUTTON_NONE || button >= MOUSE_BUTTON_COUNT) { + if ((uint16_t)button >= (uint16_t)MOUSE_BUTTON_COUNT) { return false; } return !gMouseButtonState[button] && gMouseButtonPrev[button]; @@ -138,7 +149,7 @@ int16_t joeyMouseY(void) { bool joeyJoystickConnected(JoeyJoystickE js) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return false; } return gJoyConnected[js]; @@ -146,7 +157,7 @@ bool joeyJoystickConnected(JoeyJoystickE js) { int8_t joeyJoystickX(JoeyJoystickE js) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + if ((uint16_t)js >= 
(uint16_t)JOYSTICK_COUNT) { return 0; } return gJoyAxisX[js]; @@ -154,48 +165,59 @@ int8_t joeyJoystickX(JoeyJoystickE js) { int8_t joeyJoystickY(JoeyJoystickE js) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return 0; } return gJoyAxisY[js]; } +/* Joystick button predicates: ORCA-C 2.2.1 lowers `gJoyButtonState[js][button]` + * to a ~MUL4 helper per access. Compute the 1D byte index once and read + * via an explicit (uint8_t *) cast -- no helpers. */ bool joeyJoyDown(JoeyJoystickE js, JoeyJoyButtonE button) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + uint16_t idx; + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return false; } - if ((int)button < 0 || (int)button >= JOY_BUTTON_COUNT) { + if ((uint16_t)button >= (uint16_t)JOY_BUTTON_COUNT) { return false; } - return gJoyButtonState[js][button]; + idx = (uint16_t)((uint16_t)js * JOY_BUTTON_COUNT + (uint16_t)button); + return ((const uint8_t *)gJoyButtonState)[idx] != 0; } bool joeyJoyPressed(JoeyJoystickE js, JoeyJoyButtonE button) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + uint16_t idx; + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return false; } - if ((int)button < 0 || (int)button >= JOY_BUTTON_COUNT) { + if ((uint16_t)button >= (uint16_t)JOY_BUTTON_COUNT) { return false; } - return gJoyButtonState[js][button] && !gJoyButtonPrev[js][button]; + idx = (uint16_t)((uint16_t)js * JOY_BUTTON_COUNT + (uint16_t)button); + return (((const uint8_t *)gJoyButtonState)[idx] != 0) && + (((const uint8_t *)gJoyButtonPrev) [idx] == 0); } bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + uint16_t idx; + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return false; } - if ((int)button < 0 || (int)button >= JOY_BUTTON_COUNT) { + if ((uint16_t)button >= (uint16_t)JOY_BUTTON_COUNT) { return false; } - return !gJoyButtonState[js][button] && gJoyButtonPrev[js][button]; + idx = 
(uint16_t)((uint16_t)js * JOY_BUTTON_COUNT + (uint16_t)button); + return (((const uint8_t *)gJoyButtonState)[idx] == 0) && + (((const uint8_t *)gJoyButtonPrev) [idx] != 0); } void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return; } gJoyDeadZone[js] = deadZone; diff --git a/src/core/inputInternal.h b/src/core/inputInternal.h index 765bcc2..b5a3cef 100644 --- a/src/core/inputInternal.h +++ b/src/core/inputInternal.h @@ -12,19 +12,26 @@ #include "joey/input.h" #include "joey/types.h" -extern bool gKeyState[KEY_COUNT]; -extern bool gKeyPrev [KEY_COUNT]; +// Stored as uint8_t (not bool) because ORCA-C compiles _Bool as a +// 2-byte word (Symbol.pas: size := cgWordSize). The IIgs asm fast +// path (iigsInputSnapshot) walks these arrays one byte per element; +// a 2-byte bool would put element k at byte offset 2*k and the asm's +// per-byte clear would never reach the live half. uint8_t pins the +// storage to one byte per element on every port. Public predicates +// still return bool via implicit coercion. 
+extern uint8_t gKeyState[KEY_COUNT]; +extern uint8_t gKeyPrev [KEY_COUNT]; extern int16_t gMouseX; extern int16_t gMouseY; -extern bool gMouseButtonState[MOUSE_BUTTON_COUNT]; -extern bool gMouseButtonPrev [MOUSE_BUTTON_COUNT]; +extern uint8_t gMouseButtonState[MOUSE_BUTTON_COUNT]; +extern uint8_t gMouseButtonPrev [MOUSE_BUTTON_COUNT]; -extern bool gJoyConnected[JOYSTICK_COUNT]; -extern int8_t gJoyAxisX [JOYSTICK_COUNT]; -extern int8_t gJoyAxisY [JOYSTICK_COUNT]; -extern bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT]; -extern bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT]; +extern uint8_t gJoyConnected[JOYSTICK_COUNT]; +extern int8_t gJoyAxisX [JOYSTICK_COUNT]; +extern int8_t gJoyAxisY [JOYSTICK_COUNT]; +extern uint8_t gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT]; +extern uint8_t gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT]; // Per-stick analog calibration. Set by joeyJoystickReset on platforms // with analog paddles (IIgs); ignored on digital-stick platforms. diff --git a/src/core/palette.c b/src/core/palette.c index 5a85459..d94ddf7 100644 --- a/src/core/palette.c +++ b/src/core/palette.c @@ -10,24 +10,69 @@ #include "joey/palette.h" #include "surfaceInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") + +// Standard 16-color EGA palette in IIgs $0RGB format. Used as the +// per-surface default at allocation time (paletteInitDefault) so a +// program that draws without first calling paletteSet still gets +// recognizable colors instead of an all-black palette. EGA index 6 +// is the canonical "brown" hack ($0A50, half-green) so CGA monitors +// rendered the third primary as brown rather than dark yellow. 
+static const uint16_t kDefaultPaletteEga[SURFACE_COLORS_PER_PALETTE] = { + 0x0000, // 0: Black + 0x000A, // 1: Blue + 0x00A0, // 2: Green + 0x00AA, // 3: Cyan + 0x0A00, // 4: Red + 0x0A0A, // 5: Magenta + 0x0A50, // 6: Brown + 0x0AAA, // 7: Light Gray + 0x0555, // 8: Dark Gray + 0x055F, // 9: Light Blue + 0x05F5, // 10: Light Green + 0x05FF, // 11: Light Cyan + 0x0F55, // 12: Light Red + 0x0F5F, // 13: Light Magenta + 0x0FF5, // 14: Yellow + 0x0FFF // 15: White +}; + + +// ----- Internal API ----- + +void paletteInitDefault(SurfaceT *s) { + uint8_t i; + + if (s == NULL) { + return; + } + for (i = 0; i < SURFACE_PALETTE_COUNT; i++) { + paletteSet(s, i, kDefaultPaletteEga); + } +} + // ----- Public API (alphabetical) ----- void paletteGet(const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16) { + const uint16_t *row; + if (s == NULL || out16 == NULL) { return; } if (paletteIndex >= SURFACE_PALETTE_COUNT) { return; } - memcpy(out16, s->palette[paletteIndex], SURFACE_COLORS_PER_PALETTE * sizeof(uint16_t)); + /* Byte-pointer math + shift to skip the ~MUL4 helper -- see + * paletteSet for the reasoning. */ + row = (const uint16_t *)((const uint8_t *)s->palette + ((uint16_t)paletteIndex << 5)); + memcpy(out16, row, SURFACE_COLORS_PER_PALETTE * sizeof(uint16_t)); } void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) { - uint8_t i; + uint8_t i; + uint16_t *row; + const uint16_t *src; if (s == NULL || colors16 == NULL) { return; @@ -36,9 +81,18 @@ void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) { return; } - s->palette[paletteIndex][0] = 0x0000; + /* Compute the row pointer via byte-pointer math + a single shift + * (16 entries * 2 bytes = 32 = 1 << 5) so ORCA-C doesn't emit a + * ~MUL4 helper for the 2D-array indexing. Then walk both arrays + * with post-increment pointers so the inner loop avoids ~MUL4 + * for every `row[i]` / `colors16[i]` index multiply too. 
*/ + row = (uint16_t *)((uint8_t *)s->palette + ((uint16_t)paletteIndex << 5)); + src = colors16; + + *row++ = 0x0000; + src++; for (i = 1; i < SURFACE_COLORS_PER_PALETTE; i++) { - s->palette[paletteIndex][i] = colors16[i] & 0x0FFF; + *row++ = (uint16_t)(*src++ & 0x0FFF); } if (s == stageGet()) { gStagePaletteDirty = true; diff --git a/src/core/present.c b/src/core/present.c index 02468b2..1184f27 100644 --- a/src/core/present.c +++ b/src/core/present.c @@ -12,8 +12,6 @@ #include "hal.h" #include "surfaceInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") // ----- Public API (alphabetical) ----- diff --git a/src/core/scb.c b/src/core/scb.c index 6946c79..112ffdc 100644 --- a/src/core/scb.c +++ b/src/core/scb.c @@ -9,8 +9,6 @@ #include "joey/palette.h" #include "surfaceInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") // ----- Public API (alphabetical) ----- diff --git a/src/core/sprite.c b/src/core/sprite.c index 7daf805..80bf036 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -13,8 +13,6 @@ #include "spriteInternal.h" #include "surfaceInternal.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") // 8x8 tiles, 4bpp packed = 4 bytes/row * 8 rows = 32 bytes/tile. 
#define TILE_BYTES 32 @@ -180,6 +178,7 @@ SpriteT *spriteCreate(const uint8_t *tileData, uint8_t widthTiles, uint8_t heigh memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets)); memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank)); memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank)); + memset(sp->cachedSizeBytes, 0, sizeof(sp->cachedSizeBytes)); sp->flags = flags; return sp; } @@ -249,6 +248,7 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y, memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets)); memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank)); memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank)); + memset(sp->cachedSizeBytes, 0, sizeof(sp->cachedSizeBytes)); sp->flags = flags; return sp; } @@ -296,6 +296,63 @@ void spritePrewarm(SpriteT *sp) { } +// Combined save-then-draw fast path. Routes both ops through the +// compiled save/draw entry points after a single shared validation +// pass. Falls back to calling the public spriteSaveUnder + spriteDraw +// when the fast path isn't applicable -- semantically identical, just +// pays the dispatcher overhead twice. +void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { + uint16_t widthPx; + uint16_t heightPx; + uint8_t wTiles; + uint8_t hTiles; + ArenaSlotT *slot; + uint8_t shift; + + if (s == NULL || sp == NULL || backup == NULL) { + return; + } + backup->sprite = sp; + backup->sizeBytes = 0; + + wTiles = sp->widthTiles; + hTiles = sp->heightTiles; + slot = sp->slot; + + widthPx = (uint16_t)(wTiles * TILE_PIXELS); + heightPx = (uint16_t)(hTiles * TILE_PIXELS); + + // Fast path: compiled bytes available, fully on surface, backup + // buffer supplied. Save fills out backup->{x,y,width,height, + // sizeBytes}; draw reuses (x,y,widthPx,heightPx) for the dirty + // mark. One mark instead of two (save doesn't dirty -- it's a + // read; only draw dirties). 
+ if (slot != NULL && backup->bytes != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { + /* Byte-pointer arithmetic dodges ~MUL4 for 2D-array indexing. */ + uint16_t saveIdx; + uint16_t drawIdx; + uint8_t *offsetsBase; + shift = (uint8_t)(x & 1); + saveIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); + drawIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_DRAW); + offsetsBase = (uint8_t *)sp->routineOffsets; + if (*(uint16_t *)(offsetsBase + (saveIdx << 1)) != SPRITE_NOT_COMPILED && + *(uint16_t *)(offsetsBase + (drawIdx << 1)) != SPRITE_NOT_COMPILED) { + spriteCompiledSaveUnder(s, sp, x, y, backup); + spriteCompiledDraw (s, sp, x, y); + surfaceMarkDirtyRect (s, x, y, (int16_t)widthPx, (int16_t)heightPx); + return; + } + } + + // Fall back to the slow paths through the public API. These + // pay the full dispatcher chain twice but handle every edge + // case (interpreter, partial clip, no-backup-buffer modes). + spriteSaveUnder(s, sp, x, y, backup); + spriteDraw (s, sp, x, y); +} + + // .spr file format: // offset bytes field // ------ ----- -------------------------------------------- @@ -394,6 +451,7 @@ SpriteT *spriteFromCompiledMem(const uint8_t *data, uint32_t length, SpriteFlags sp->flags = flags; memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank)); memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank)); + memset(sp->cachedSizeBytes, 0, sizeof(sp->cachedSizeBytes)); return sp; } @@ -528,78 +586,82 @@ uint32_t spriteCodegenBytesUsed(void) { void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { - int16_t row; - int16_t byteStart; - int16_t copyBytes; - uint16_t spriteBytesPerRow; - uint8_t shift; - uint8_t *dstRow; + /* Fast-path locals only. Slow-path uses an inner block. 
*/ + int16_t bx; + int16_t by; + uint16_t bw; + uint16_t bh; SpriteT *sp; + uint16_t spriteBytesPerRow; + int16_t copyBytes; + uint8_t shift; - if (s == NULL || backup == NULL || backup->bytes == NULL) { + if (s == NULL || backup == NULL) { return; } - if (backup->width == 0 || backup->height == 0) { - return; - } - if (backup->x < 0 || backup->y < 0) { - return; - } - if (backup->x >= SURFACE_WIDTH || backup->y >= SURFACE_HEIGHT) { - return; - } - if (backup->x + backup->width > SURFACE_WIDTH) { - return; - } - if (backup->y + backup->height > SURFACE_HEIGHT) { - return; - } - // Saved region is byte-aligned; sub-byte boundaries can't be - // represented without losing the neighboring pixel under the byte. - if ((backup->x & 1) || (backup->width & 1)) { + bx = backup->x; + by = backup->y; + bw = backup->width; + bh = backup->height; + + /* Validate. Note: SURFACE_WIDTH - bx and SURFACE_HEIGHT - by stay + * in uint16_t range once bx >= 0 / by >= 0 has been checked, so + * the right-edge / bottom-edge tests don't need 32-bit arithmetic + * (which would invoke ORCA-C's ~GRTL helper, ~50 cyc per call). */ + if (backup->bytes == NULL || + bw == 0 || bh == 0 || + bx < 0 || by < 0 || + bx >= SURFACE_WIDTH || by >= SURFACE_HEIGHT || + bw > (uint16_t)(SURFACE_WIDTH - bx) || + bh > (uint16_t)(SURFACE_HEIGHT - by) || + (bx & 1) || (bw & 1)) { return; } sp = backup->sprite; - if (sp != NULL && sp->slot != NULL && backup->height == sp->heightTiles * TILE_PIXELS) { + if (sp != NULL && sp->slot != NULL && bh == sp->heightTiles * TILE_PIXELS) { + uint16_t routeIdx; + uint16_t routeOffset; spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); - copyBytes = (int16_t)(backup->width >> 1); + copyBytes = (int16_t)(bw >> 1); shift = (copyBytes == (int16_t)spriteBytesPerRow) ? 0 : 1; - if (sp->routineOffsets[shift][SPRITE_OP_RESTORE] != SPRITE_NOT_COMPILED) { + /* Byte-pointer arithmetic dodges ~MUL4 for 2D-array indexing. 
*/ + routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_RESTORE); + routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1)); + if (routeOffset != SPRITE_NOT_COMPILED) { spriteCompiledRestoreUnder(s, backup); - surfaceMarkDirtyRect(s, backup->x, backup->y, - (int16_t)backup->width, (int16_t)backup->height); + surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh); return; } } - byteStart = (int16_t)(backup->x >> 1); - copyBytes = (int16_t)(backup->width >> 1); - for (row = 0; row < backup->height; row++) { - dstRow = &s->pixels[(backup->y + row) * SURFACE_BYTES_PER_ROW]; - memcpy(&dstRow[byteStart], - &backup->bytes[(uint16_t)row * (uint16_t)copyBytes], - (size_t)copyBytes); + /* Slow / interpreted memcpy fallback. */ + { + int16_t row; + int16_t byteStart; + uint8_t *dstRow; + + byteStart = (int16_t)(bx >> 1); + copyBytes = (int16_t)(bw >> 1); + for (row = 0; row < (int16_t)bh; row++) { + dstRow = &s->pixels[(by + row) * SURFACE_BYTES_PER_ROW]; + memcpy(&dstRow[byteStart], + &backup->bytes[(uint16_t)row * (uint16_t)copyBytes], + (size_t)copyBytes); + } } - surfaceMarkDirtyRect(s, backup->x, backup->y, - (int16_t)backup->width, (int16_t)backup->height); + surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh); } void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { - int16_t dx; - int16_t dy; - int16_t sx; - int16_t sy; - int16_t w; - int16_t h; - int16_t row; - int16_t byteStart; - int16_t copyBytes; - int16_t clippedX; - int16_t clippedW; + /* Only fast-path locals here. Slow-path declarations live inside + * the slow-path block below so ORCA-C with -b doesn't reserve + * stack frame for them on every fast-path call. 
*/ + uint16_t widthPx; + uint16_t heightPx; + ArenaSlotT *slot; uint8_t shift; - const uint8_t *srcRow; if (s == NULL || sp == NULL || backup == NULL) { return; @@ -607,23 +669,50 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit backup->sprite = sp; backup->sizeBytes = 0; - dx = x; - dy = y; - w = (int16_t)(sp->widthTiles * TILE_PIXELS); - h = (int16_t)(sp->heightTiles * TILE_PIXELS); + slot = sp->slot; + widthPx = (uint16_t)(sp->widthTiles * TILE_PIXELS); + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); // Compiled fast path: fully on surface and the platform emitted // bytes for SAVE at this shift. The compiled routine assumes a // full-size, unclipped rectangle, so anything off-edge falls // through to the interpreted memcpy loop below. - if (backup->bytes != NULL && sp->slot != NULL && isFullyOnSurface(x, y, (uint16_t)w, (uint16_t)h)) { - shift = (uint8_t)(x & 1); - if (sp->routineOffsets[shift][SPRITE_OP_SAVE] != SPRITE_NOT_COMPILED) { + // + // The routineOffsets[shift][SPRITE_OP_SAVE] access is rewritten as + // explicit byte-pointer arithmetic to dodge ORCA-C 2.2.1's ~MUL4 + // helper that gets emitted for `uint16_t arr[N][M]` indexing. + if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { + uint16_t routeIdx; + uint16_t routeOffset; + shift = (uint8_t)(x & 1); + routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE); + routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1)); + if (routeOffset != SPRITE_NOT_COMPILED) { spriteCompiledSaveUnder(s, sp, x, y, backup); return; } } + /* Slow / fallback path: clipping + interpreted memcpy. 
*/ + { + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t w; + int16_t h; + int16_t row; + int16_t byteStart; + int16_t copyBytes; + int16_t clippedX; + int16_t clippedW; + const uint8_t *srcRow; + + dx = x; + dy = y; + w = (int16_t)widthPx; + h = (int16_t)heightPx; + if (!clipRect(&dx, &dy, &sx, &sy, &w, &h)) { backup->x = 0; backup->y = 0; @@ -661,4 +750,5 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit &srcRow[byteStart], (size_t)copyBytes); } + } /* end slow path */ } diff --git a/src/core/spriteInternal.h b/src/core/spriteInternal.h index fd68445..8e4733a 100644 --- a/src/core/spriteInternal.h +++ b/src/core/spriteInternal.h @@ -45,6 +45,12 @@ struct SpriteT { // 12 bytes per sprite. Unused on non-IIgs. uint8_t cachedDstBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT]; uint8_t cachedSrcBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT]; + + // Cached `copyBytes * heightPx` per shift for spriteCompiledSaveUnder's + // `backup->sizeBytes` field. uint16_t * uint16_t goes through ORCA-C + // 2.2.1's ~CUMUL2 helper (~30-50 cyc); cache hit dodges it. Filled + // lazily on first call (0 sentinel = uncached). + uint16_t cachedSizeBytes[JOEY_SPRITE_SHIFT_COUNT]; }; // Compiled entry points. Implemented alongside spriteCompile in diff --git a/src/core/surface.c b/src/core/surface.c index 013981d..486620f 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -10,13 +10,6 @@ #include "hal.h" #include "surfaceInternal.h" -// Hoist into a CORESYS load segment alongside the other small core -// files. Keeps _ROOT thin and stable so it stops reacting to per-file -// source changes -- _ROOT size flux was tripping ORCA-Linker bank -// packing in spriteEmitIigs.c (see feedback_orca_link_segment_count -// cases 2-4). 
-JOEYLIB_SEGMENT("CORESYS") - #ifdef JOEYLIB_PLATFORM_IIGS extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord); #endif @@ -91,6 +84,7 @@ SurfaceT *surfaceCreate(void) { free(s); return NULL; } + paletteInitDefault(s); return s; } @@ -207,8 +201,11 @@ void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, in if (w <= 0 || h <= 0) { return; } - minWord = (uint8_t)(x >> 2); - maxWord = (uint8_t)((x + w - 1) >> 2); + /* Clipped x/w are non-negative; cast to uint16_t before `>> 2` so + * ORCA-C lowers to a pair of LSRs instead of the ~SSHIFTRIGHT + * helper signed shifts emit. */ + minWord = (uint8_t)((uint16_t)x >> 2); + maxWord = (uint8_t)((uint16_t)(x + w - 1) >> 2); yEnd = y + h; #ifdef JOEYLIB_PLATFORM_IIGS iigsMarkDirtyRowsInner((uint16_t)y, (uint16_t)yEnd, @@ -239,6 +236,7 @@ bool stageAlloc(void) { } memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE); stageDirtyClearAll(); + paletteInitDefault(gStage); return true; } diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h index 175ec9f..45017a5 100644 --- a/src/core/surfaceInternal.h +++ b/src/core/surfaceInternal.h @@ -62,13 +62,19 @@ void stageDirtyClearAll(void); // y -> byte offset of row y in a SURFACE_BYTES_PER_ROW-strided buffer. // On IIgs this expands to a single indexed long-mode read against -// gRowOffsetLut (built once at halInit). On other ports it's the -// straight multiply -- those compilers (gcc, OpenWatcom) optimize the -// constant 160 to a shift+add chain that's already cheap. The point -// is to dodge ORCA-C's __mul16 JSL on every per-row pointer compute. +// gRowOffsetLut (built once at halInit). +// +// The explicit (y << 1) byte-pointer arithmetic dodges ORCA-C 2.2.1's +// `~MUL4` helper that gets emitted for `uint16_t arr[N]` indexing +// (the implicit *sizeof(uint16_t)). With the byte-cast + shift, the +// compiler emits a single ASL + indexed long-mode read. 
+// +// Other ports get the straight multiply -- gcc / OpenWatcom optimize +// the constant 160 to a shift+add chain. #ifdef JOEYLIB_PLATFORM_IIGS extern const uint16_t gRowOffsetLut[200]; -#define SURFACE_ROW_OFFSET(_y) ((uint16_t)gRowOffsetLut[(uint16_t)(_y)]) +#define SURFACE_ROW_OFFSET(_y) \ + (*((const uint16_t *)((const uint8_t *)gRowOffsetLut + ((uint16_t)(_y) << 1)))) #else #define SURFACE_ROW_OFFSET(_y) ((uint16_t)((uint16_t)(_y) * SURFACE_BYTES_PER_ROW)) #endif @@ -80,4 +86,10 @@ extern const uint16_t gRowOffsetLut[200]; bool stageAlloc(void); void stageFree(void); +// Fill all 16 of `s`'s palettes with the standard 16-color EGA +// palette. Called by stageAlloc and surfaceCreate so a program that +// draws without first calling paletteSet still gets recognizable +// colors instead of an all-black palette. +void paletteInitDefault(SurfaceT *s); + #endif diff --git a/src/core/tile.c b/src/core/tile.c index ce4ac1f..e451425 100644 --- a/src/core/tile.c +++ b/src/core/tile.c @@ -20,10 +20,6 @@ // without that the ORCA Linker hits "Expression too complex" on // the small-binary builds.) -// Hoist tile primitives into the DRAWPRIMS load segment. Asm -// dispatches go through halFast* hooks in src/port/iigs/hal.c so -// only one TU references the asm symbols (avoids the cumulative -// "Expression too complex" link failure). JOEYLIB_SEGMENT("DRAWPRIMS") // ----- Prototypes ----- diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index a802e1d..054605d 100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -509,6 +509,37 @@ void halWaitVBL(void) { } +// VPOSR ($DFF004) is a 16-bit register: bits 0..2 = vertical beam +// position bits 8..10 (bits 8..14 hold the Agnus chip ID). V8 falling +// from 1 -> 0 marks "vertical wrap" -- a fresh frame. Edge-detected +// per call so caller (UBER, etc.) just polls; no IRQ server needed. 
+#define AMIGA_VPOSR ((volatile uint16_t *)0xDFF004UL) + +static uint16_t gFrameCount = 0; +static uint8_t gPrevVbHi = 0; + +uint16_t halFrameCount(void) { + uint8_t now; + + /* Bit 0 of VPOSR = beam line bit 8 (V8). PAL frame is ~313 + * lines, NTSC ~263 -- both wrap V8 once per frame, which is + * what we want as the "frame edge" signal. */ + now = (uint8_t)(*AMIGA_VPOSR & 1u); + if (gPrevVbHi && !now) { + gFrameCount++; + } + gPrevVbHi = now; + return gFrameCount; +} + + +uint16_t halFrameHz(void) { + /* PAL by default. The toolchain doesn't currently switch modes + * at runtime; if we ever expose NTSC this returns 60. */ + return 50u; +} + + void halShutdown(void) { if (gScreen != NULL) { // CloseScreen should free attached UCopList, but be explicit diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 0ab4f8d..c9fd03c 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -562,6 +562,21 @@ void halWaitVBL(void) { } +// gFrameCount is already maintained by our VBL ISR; just narrow to +// uint16_t for the cross-port HAL contract. +uint16_t halFrameCount(void) { + return (uint16_t)gFrameCount; +} + + +uint16_t halFrameHz(void) { + /* PAL ST is 50 Hz; NTSC ST and SM124 mono are ~60 / ~70. We + * report 50 as the baseline -- close enough for ops/sec scaling, + * and the actual frame rate is still observable via iter counts. */ + return 50u; +} + + void halShutdown(void) { if (!gModeSet) { return; diff --git a/src/port/atarist/input.c b/src/port/atarist/input.c index 4c137e5..73144e6 100644 --- a/src/port/atarist/input.c +++ b/src/port/atarist/input.c @@ -150,7 +150,9 @@ static volatile uint8_t gPacketRemaining = 0; static volatile uint8_t gPacketKind = PKT_KIND_NONE; static volatile uint8_t gMousePacketByte = 0; // bytes consumed in current packet static bool gHooked = false; -static volatile bool gIsrState[KEY_COUNT]; +// uint8_t (not bool) so element size matches gKeyState's. 
See +// src/core/inputInternal.h for the full rationale. +static volatile uint8_t gIsrState[KEY_COUNT]; // Mouse delta accumulator. Each ACIA mouse packet adds dx/dy here; the // poll routine clamps the running absolute position into the surface diff --git a/src/port/dos/hal.c b/src/port/dos/hal.c index 70989cd..7e446c2 100644 --- a/src/port/dos/hal.c +++ b/src/port/dos/hal.c @@ -275,6 +275,32 @@ void halWaitVBL(void) { } +// Frame counter via $3DA bit 3 polling; rising edge marks the start +// of vertical retrace. Caller polls fast enough that no edge is +// missed (UBER's hot loop is far below 70 Hz period even on a 386). +static uint16_t gFrameCount = 0; +static uint8_t gPrevInVret = 0; + +uint16_t halFrameCount(void) { + uint8_t now; + + now = (uint8_t)(inportb(VGA_INPUT_STAT_1) & VGA_VRETRACE_BIT); + if (now && !gPrevInVret) { + gFrameCount++; + } + gPrevInVret = now; + return gFrameCount; +} + + +uint16_t halFrameHz(void) { + /* VGA mode 13h vertical refresh on a real CRT runs at ~70 Hz + * (70.086 to be exact). Reporting 70 keeps ops/sec scaling + * accurate within ~0.1%. */ + return 70u; +} + + void halShutdown(void) { __dpmi_regs regs; diff --git a/src/port/dos/input.c b/src/port/dos/input.c index 15269ac..c899b8b 100644 --- a/src/port/dos/input.c +++ b/src/port/dos/input.c @@ -152,7 +152,9 @@ static const uint8_t gScanToKey[SCAN_TABLE_SIZE] = { static _go32_dpmi_seginfo gOldHandler; static _go32_dpmi_seginfo gNewHandler; static bool gHooked = false; -static volatile bool gIsrState[KEY_COUNT]; +// uint8_t (not bool) so element size matches gKeyState's. See +// src/core/inputInternal.h for the full rationale. +static volatile uint8_t gIsrState[KEY_COUNT]; static bool gMousePresent = false; static bool gJoystickPresent = false; diff --git a/src/port/iigs/audio_full.c b/src/port/iigs/audio_full.c index 76da6b3..3809dfe 100644 --- a/src/port/iigs/audio_full.c +++ b/src/port/iigs/audio_full.c @@ -29,9 +29,7 @@ // _ROOT in every binary that includes this TU. 
(See ORCA/C ch. 30 // "segment statement". Reusing the same segment as draw.c / tile.c // rather than picking a unique name keeps the linker's symbol- -// resolution expressions flat -- per-name extras nest the -// expression and trip the "too complex" threshold on small -// binaries.) +// resolution expressions flat.) // // The 34 KB NTP replayer bytes are NOT in this segment -- ORCA/C's // `segment` statement only relocates functions, not data. They live @@ -99,6 +97,17 @@ static uint32_t gSfxBase = 0; static bool gNTPReady = false; static bool gNTPPlaying = false; +// Per-slot config cache. halAudioPlaySfx's biggest cost is the +// per-byte XOR-with-$80 loop over the entire sample (signed -> DOC's +// unsigned format), which on a 4 KB sample is ~120 k cyc / ~43 ms in +// ORCA-C. Most callers play the same SFX repeatedly into the same +// slot. Cache (sample ptr, length, rate) per slot; on cache hit +// (same sample re-triggered) skip the byte copy AND the struct +// rebuild, just re-fire NTPstreamsound. +static const uint8_t *gSfxSlotSample[JOEY_AUDIO_SFX_SLOTS] = { 0 }; +static uint32_t gSfxSlotLength[JOEY_AUDIO_SFX_SLOTS] = { 0 }; +static uint16_t gSfxSlotRateHz[JOEY_AUDIO_SFX_SLOTS] = { 0 }; + // SFX handle layout: stream structure first, sample bytes after. // Both end up at known 24-bit addresses, side-stepping the small // memory model's 16-bit pointer issue. @@ -244,6 +253,21 @@ void halAudioShutdown(void) { if (gNTPPlaying) { halAudioStopMod(); } + // Silence every SFX slot before disposing the handles. NTP's DOC + // IRQ vector points into the buffer we are about to free; if any + // oscillator finishes its sample after the dispose, the wave-done + // interrupt fires into freed memory and the IIgs reports + // "Unclaimed Sound Interrupt" plus a stuck high-pitched whine + // (whatever sample byte was last loaded into the DOC). 
+ { + uint8_t i; + for (i = 0; i < JOEY_AUDIO_SFX_SLOTS; i++) { + halAudioStopSfx(i); + gSfxSlotSample[i] = (const uint8_t *)0; + gSfxSlotLength[i] = 0; + gSfxSlotRateHz[i] = 0; + } + } if (gSfxHandle != NULL) { DisposeHandle(gSfxHandle); gSfxHandle = NULL; @@ -325,36 +349,51 @@ void halAudioPlaySfx(uint8_t slot, const uint8_t *sample, uint32_t length, uint1 structAddr = slotBase; sampleAddr = slotBase + SFX_SAMPLE_OFFSET; - // Copy the sample into this slot's fixed-bank region, converting - // signed 8-bit (public API contract) to unsigned 8-bit (DOC RAM - // format) by flipping the sign bit. - { - unsigned char *dst; - uint32_t i; + // Cache check: same sample, length, and rate as the prior trigger + // for this slot? Then the slot's DOC sample bytes and stream + // struct are already correct -- skip the 4 KB byte-XOR loop and + // the 15-byte struct rebuild, both of which together can run + // ~50 ms per call in ORCA-C. + if (sample == gSfxSlotSample[slot] && + length == gSfxSlotLength[slot] && + rateHz == gSfxSlotRateHz[slot]) { + // Cache hit -- jump straight to the NTPstreamsound trigger. + } else { + // Cache miss: copy the sample into this slot's fixed-bank + // region, converting signed 8-bit (public API contract) to + // unsigned 8-bit (DOC RAM format) by flipping the sign bit. + { + unsigned char *dst; + uint32_t i; - dst = (unsigned char *)sampleAddr; - for (i = 0; i < length; i++) { - dst[i] = (unsigned char)(sample[i] ^ 0x80); + dst = (unsigned char *)sampleAddr; + for (i = 0; i < length; i++) { + dst[i] = (unsigned char)(sample[i] ^ 0x80); + } } - } - // Build the stream structure in this slot's first 16 bytes. 
- sfx = (unsigned char *)slotBase; - sfx[0] = (unsigned char)(sampleAddr & 0xFFu); - sfx[1] = (unsigned char)((sampleAddr >> 8) & 0xFFu); - sfx[2] = (unsigned char)((sampleAddr >> 16) & 0xFFu); - sfx[3] = 0; - sfx[4] = (unsigned char)(length & 0xFFu); - sfx[5] = (unsigned char)((length >> 8) & 0xFFu); - sfx[6] = (unsigned char)((length >> 16) & 0xFFu); - sfx[7] = (unsigned char)((length >> 24) & 0xFFu); - sfx[8] = (unsigned char)(freqWord & 0xFFu); - sfx[9] = (unsigned char)((freqWord >> 8) & 0xFFu); - sfx[10] = (unsigned char)(SFX_BASE_DOC_PAGE + slot * SFX_DOC_PAGE_STEP); - sfx[11] = (unsigned char)(SFX_BASE_OSC + slot * SFX_OSCS_PER_SLOT); - sfx[12] = 1; // one playing osc - sfx[13] = SFX_VOLUME; - sfx[14] = SFX_CHANNEL_LEFT; + // Build the stream structure in this slot's first 16 bytes. + sfx = (unsigned char *)slotBase; + sfx[0] = (unsigned char)(sampleAddr & 0xFFu); + sfx[1] = (unsigned char)((sampleAddr >> 8) & 0xFFu); + sfx[2] = (unsigned char)((sampleAddr >> 16) & 0xFFu); + sfx[3] = 0; + sfx[4] = (unsigned char)(length & 0xFFu); + sfx[5] = (unsigned char)((length >> 8) & 0xFFu); + sfx[6] = (unsigned char)((length >> 16) & 0xFFu); + sfx[7] = (unsigned char)((length >> 24) & 0xFFu); + sfx[8] = (unsigned char)(freqWord & 0xFFu); + sfx[9] = (unsigned char)((freqWord >> 8) & 0xFFu); + sfx[10] = (unsigned char)(SFX_BASE_DOC_PAGE + slot * SFX_DOC_PAGE_STEP); + sfx[11] = (unsigned char)(SFX_BASE_OSC + slot * SFX_OSCS_PER_SLOT); + sfx[12] = 1; // one playing osc + sfx[13] = SFX_VOLUME; + sfx[14] = SFX_CHANNEL_LEFT; + + gSfxSlotSample[slot] = sample; + gSfxSlotLength[slot] = length; + gSfxSlotRateHz[slot] = rateHz; + } // NTPstreamsound(structPtr in X/Y). Same 24-bit address packing // pattern as NTPprepare: low 16 in X, bank in Y. diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index 6df76a8..3435f5e 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -95,6 +95,11 @@ extern void iigsInitRowLut(void); // subsequent rows are at srcOffset + 160, etc. 
~9 cyc/byte vs // ORCA-C memcpy's ~30 cyc/byte. extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft); +// PEI-slam variant of the per-row rect blit. ~3 cyc/byte vs MVN's +// ~9 cyc/byte. Constraints: copyBytes must be even and 2..80 +// (caller / dispatcher checks). For sprite-rect presents (typical +// 8 bytes wide x 16 rows) saves ~600 cyc/frame vs the MVN form. +extern void iigsBlitRectStageToShrPEI(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft); // Filled circle, scanline-style. fillWord low byte is the doubled // nibble (e.g., 0x33 for nibble 3). extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord); @@ -240,21 +245,26 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1 uploadScbAndPaletteIfNeeded(src); - // Pixel copy: byte-aligned runs per scanline. x is always even - // after API-level clipping for 4bpp packed if caller aligned it; - // otherwise we include the byte containing the leftmost pixel. - byteStart = x >> 1; - copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart); + // Pixel copy: byte-aligned runs per scanline. x is always >= 0 + // after API-level clipping. Use unsigned shifts to avoid + // ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t. + byteStart = (int16_t)((uint16_t)x >> 1); + copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart); if (copyBytes == 0 || h == 0) { return; } - // Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display - // at $E1:2000 (same offset within their banks). srcOffset is the - // byte offset of the first byte to copy on the first row. + // Pixel copy: prefer the PEI-slam variant when the rect satisfies + // its contract (copyBytes even, 2..80). Sprite-rect presents + // (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or + // odd-byte rects fall back to MVN, which has no width cap. 
srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart); - iigsBlitRectStageToShr(srcOffset, copyBytes, h); + if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) { + iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h); + } else { + iigsBlitRectStageToShr(srcOffset, copyBytes, h); + } } @@ -307,3 +317,27 @@ void halWaitVBL(void) { /* scanning: wait for next VBL */; } } + + +// Frame counter via $C019 polling. Edge-detected on each call: the +// caller (UBER, animation loops) polls fast enough that we never +// miss a VBL transition. No IRQ involvement; safe in the S16 takeover +// context where ToolBox interrupt setup would be intrusive. +static uint16_t gFrameCount = 0; +static uint8_t gPrevInVbl = 0; + +uint16_t halFrameCount(void) { + uint8_t now; + + now = (*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0; + if (now && !gPrevInVbl) { + gFrameCount++; + } + gPrevInVbl = now; + return gFrameCount; +} + + +uint16_t halFrameHz(void) { + return 60u; +} diff --git a/src/port/iigs/input.c b/src/port/iigs/input.c index 337821a..3cede63 100644 --- a/src/port/iigs/input.c +++ b/src/port/iigs/input.c @@ -14,10 +14,15 @@ // it is enough for feature parity with the other platforms on typical // "press a key, act on it" flows. // -// Held-key state is synthesized via a TTL counter: a fresh strobe on -// $C000 refreshes the TTL; each halInputPoll decays it; when TTL hits -// zero we assume the key was released. KEY_TTL is sized to cover the -// typematic initial delay so that a held key does not flicker. +// Release detection uses the IIe-inherited "any key currently down" +// live flag at $C010 bit 7 (set by the keyboard scanner independently +// of the strobe). Each halInputPoll drains pending strobe events to +// pick up presses, then samples $C010: bit 7 == 0 means no +// non-modifier key is physically held, and we wholesale-clear +// gKeyState. readModifierKeys then re-asserts the modifiers from +// $C025's live state, so shift/ctrl/option stay accurate. 
Avoids +// the inferred-release lag the old TTL-decay scheme had, and works +// on every IIgs (real or stealth) without ToolBox / ADB Tool init. // // Mouse: $C024 (delta data) and $C027 (status). Each $C024 read // returns one signed 7-bit delta; $C027 bit 1 indicates whether the @@ -37,8 +42,6 @@ #include "inputInternal.h" #include "joey/surface.h" -// CORESYS: hoisted out of _ROOT (see surface.c for rationale). -JOEYLIB_SEGMENT("CORESYS") // ----- Hardware registers ----- @@ -63,6 +66,18 @@ JOEYLIB_SEGMENT("CORESYS") #define KBD_STROBE_BIT 0x80 #define KBD_ASCII_MASK 0x7F +// $C010 RDKBDSTRB: reading clears the keyboard strobe at $C000 and +// returns the live "any key currently held" flag in bit 7 (set by +// the keyboard scanner / ADB MCU independently of the strobe). Used +// to drive immediate release detection without an inferred-release +// TTL counter. +#define KBD_ANY_KEY_DOWN_BIT 0x80 + +// Cap on the per-poll keyboard-FIFO drain. The IIgs ADB queue is +// small in practice; this is purely a defensive bound so a stuck +// strobe can't spin halInputPoll forever. +#define KBD_DRAIN_GUARD 32u + // $C025 layout (IIgs Hardware Reference): bit 0 = shift, bit 1 = ctrl, // bit 6 = option (Closed-Apple), bit 7 = command (Open-Apple). #define MOD_SHIFT 0x01 @@ -79,11 +94,6 @@ JOEYLIB_SEGMENT("CORESYS") #define MOUSE_DELTA_SIGN_BIT 0x40 #define MOUSE_BUTTON_INV 0x80 -// Polls a key stays "down" after the last observed strobe. Covers the -// typematic initial delay so a held key does not flicker off/on between -// repeats. -#define KEY_TTL 45 - #define ASCII_TABLE_SIZE 128 // Apple II arrow-key ASCII conventions. @@ -113,11 +123,6 @@ static int8_t thresholdPaddle(uint8_t v); // O(1) instead of a 40-plus-case switch. static uint8_t gAsciiToKey[ASCII_TABLE_SIZE]; -// Non-static so iigsInputSnapshot (joeyDraw.asm) can reference it via -// long-mode addressing through the linker. The C TTL-decrement loop -// that used to live in halInputPoll moved to that asm helper. 
-uint8_t gKeyTtl [KEY_COUNT]; - static int16_t gMouseAbsX = SURFACE_WIDTH / 2; static int16_t gMouseAbsY = SURFACE_HEIGHT / 2; @@ -246,14 +251,18 @@ static bool gJoyDisconnectLatched = false; // to the digital threshold mapping. gJoyRecalibrate is set by // halJoystickReset and cleared on the next successful poll, which // captures the new center. +// uint8_t (not bool) so the per-element stride is 1 byte. ORCA-C's +// _Bool is 2 bytes, which forces a ~MUL4 helper for every index +// multiply -- even when the index is a constant the compiler doesn't +// fold. Storage is still 0 or 1 either way. static uint8_t gJoyCenterX [JOYSTICK_COUNT]; static uint8_t gJoyCenterY [JOYSTICK_COUNT]; -static bool gJoyCenterValid [JOYSTICK_COUNT]; -static bool gJoyRecalibrate [JOYSTICK_COUNT]; +static uint8_t gJoyCenterValid [JOYSTICK_COUNT]; +static uint8_t gJoyRecalibrate [JOYSTICK_COUNT]; void halJoystickReset(JoeyJoystickE js) { - if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) { return; } // Re-enable polling and arm a fresh center capture for the next @@ -281,8 +290,14 @@ static void pollJoystick(void) { bool yResolved; // Buttons are I/O reads -- always cheap, do them every frame. - gJoyButtonState[JOYSTICK_0][JOY_BUTTON_0] = (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0; - gJoyButtonState[JOYSTICK_0][JOY_BUTTON_1] = (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0; + // ORCA-C 2.2.1 doesn't constant-fold the row-stride multiply for + // 2D arrays even when both indices are constants, so each + // gJoyButtonState[i][j] write emits a ~MUL4 helper. Indexing + // through a (uint8_t *) cast collapses to a literal byte offset. 
+ ((uint8_t *)gJoyButtonState)[JOYSTICK_0 * JOY_BUTTON_COUNT + JOY_BUTTON_0] + = (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0; + ((uint8_t *)gJoyButtonState)[JOYSTICK_0 * JOY_BUTTON_COUNT + JOY_BUTTON_1] + = (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0; gJoyConnected[JOYSTICK_1] = false; // Once the stick has been latched as disconnected, only buttons @@ -394,7 +409,6 @@ static void pollMouse(void) { void halInputInit(void) { memset(gKeyState, 0, sizeof(gKeyState)); memset(gKeyPrev, 0, sizeof(gKeyPrev)); - memset(gKeyTtl, 0, sizeof(gKeyTtl)); buildAsciiTable(); gMouseAbsX = SURFACE_WIDTH / 2; @@ -408,26 +422,53 @@ void halInputInit(void) { void halInputPoll(void) { - uint8_t kbd; - uint8_t ascii; - uint8_t key; + uint8_t kbd; + uint8_t ascii; + uint8_t key; + uint8_t kbdStrb; + uint16_t drainGuard; + bool strobeObserved; - // The KEY_COUNT TTL-decrement loop and the gKeyState/gKeyPrev/ - // gMouseButtonPrev/gJoyButtonPrev snapshots all happen earlier in - // joeyInputPoll's call to iigsInputSnapshot (asm). We just read - // the live hardware state here. + // The gKeyState/gKeyPrev/gMouseButtonPrev/gJoyButtonPrev snapshots + // all happen earlier in joeyInputPoll's call to iigsInputSnapshot + // (asm). We just read the live hardware state here. - kbd = *IIGS_KBD; - if (kbd & KBD_STROBE_BIT) { + // Drain the keyboard FIFO, not just the head. The IIgs ADB MCU + // queues press + autorepeat events; consuming only one per poll + // would leave queued events waiting to refresh state on later + // polls. KBD_DRAIN_GUARD bounds the loop in case a stuck strobe + // ever fails to clear. 
+ strobeObserved = false; + for (drainGuard = 0; drainGuard < KBD_DRAIN_GUARD; drainGuard++) { + kbd = *IIGS_KBD; + if ((kbd & KBD_STROBE_BIT) == 0) { + break; + } + strobeObserved = true; ascii = (uint8_t)(kbd & KBD_ASCII_MASK); key = gAsciiToKey[ascii]; if (key != KEY_NONE) { gKeyState[key] = true; - gKeyTtl[key] = KEY_TTL; } (void)*IIGS_KBDSTRB; } + // $C010 bit 7 is the live "any non-modifier key currently held" + // flag (IIe-inherited; updated by the keyboard scanner / ADB MCU + // independently of the strobe). When 0 we know all non-modifier + // keys are physically released, so wholesale-clear gKeyState and + // let readModifierKeys re-assert the modifiers from $C025 below. + // + // strobeObserved guard: a press that arrived AND was released + // between two polls would otherwise be set-then-cleared in a + // single poll, losing the rising edge that joeyKeyPressed needs. + // Holding the press for one poll preserves it; the next poll's + // bit-7 read will clear normally. + kbdStrb = *IIGS_KBDSTRB; + if (!strobeObserved && (kbdStrb & KBD_ANY_KEY_DOWN_BIT) == 0) { + memset(gKeyState, 0, sizeof(gKeyState)); + } + readModifierKeys(); pollMouse(); pollJoystick(); diff --git a/src/port/iigs/joeyDraw.asm b/src/port/iigs/joeyDraw.asm index 77ea229..9cc4669 100644 --- a/src/port/iigs/joeyDraw.asm +++ b/src/port/iigs/joeyDraw.asm @@ -2740,6 +2740,221 @@ brsBytesM1 data DRAWPRIMS end +**************************************************************** +* iigsBlitRectStageToShrPEI(srcOffset, copyBytes, rowsLeft) +* +* PEI-slam variant of iigsBlitRectStageToShr for partial-rect +* presents. Uses the SHR shadow trick + AUXWRITE/RAMRD stack hijack +* to push pixel words from $01:row to $E1:row at ~3 cyc/byte instead +* of MVN's ~9 cyc/byte (against $E1 wait states). For a 16x16 sprite +* present (16 rows x 8 bytes) that's ~640 cyc vs MVN's ~1300 cyc. +* +* Caller contract: +* - copyBytes must be even and >= 2 and <= 80. 
Caller (C wrapper) +* verifies; this asm assumes the contract holds. +* - srcOffset is the byte offset within bank $01 of the FIRST byte +* of the FIRST row to copy. Rows advance by 160. +* +* SEI window for the duration: copyBytes/2 PEIs * rowsLeft + setup +* per row. For a 16x16 sprite that's ~700 cyc = ~0.25 ms; safe for +* DOC IRQ. The C wrapper falls back to MVN only for odd or > 80-byte +* widths; rowsLeft is not capped, so the SEI window grows with height. +* +* Args after PHP+PHB+PHD (TCD = SP+8): +* srcOffset at D+0..1 +* copyBytes at D+2..3 +* rowsLeft at D+4..5 +**************************************************************** + +iigsBlitRectStageToShrPEI start RECTPEI +brpOff equ 0 +brpBytes equ 2 +brpRows equ 4 + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + tsc + clc + adc #8 + tcd + +* Save SP and shadow state for teardown. + tsc + sta >brpOrigSp + sep #$20 + LONGA OFF + lda >$00C035 + sta >brpOrigShadow + rep #$20 + LONGA ON + +* Stash inputs into long-mode globals so they survive TCD changes. +* Per-row code does TCD = rowBase, which means D-relative reads no +* longer reach the original args at D+0..5. Everything we still need +* per-row goes into a long-mode global below. + lda brpOff + sta >brpRowBase + lda brpRows + sta >brpRowsRem + lda brpBytes + dec a + sta >brpBytesM1Saved ; copyBytes - 1, for TCS = base + bytes - 1 + +* Compute jump entry into the unrolled PEI sequence. +* words = copyBytes / 2 +* entry = peiSeqEnd - words * 2 (each PEI dp is 2 bytes; sequence +* ends at peiSeqEnd with PEI $00 as +* the LAST entry; offsets descend so +* jumping `words*2` bytes BEFORE the +* end starts at PEI $(2*(words-1))).
+ lda brpBytes + lsr a ; A = words + asl a ; A = words * 2 (bytes of PEI to execute) + sta >brpJmpDelta + lda #peiSeqEnd + sec + sbc >brpJmpDelta + sta >brpJmpTarget+1 ; patch JMP abs operand + + sei + + sep #$20 + LONGA OFF + lda >brpOrigShadow + and #$F1 ; SHR shadow ON (clear bits 1,2,3) + sta >$00C035 + lda #0 + sta >$00C005 ; AUXWRITE on + sta >$00C003 ; RAMRD on + rep #$20 + LONGA ON + +brpRowLoop anop + lda >brpRowsRem + bne brpDoRow + brl brpExit +brpDoRow anop + +* Per-row: set DP = row base (so PEI dp pulls from the source row), +* set SP = row base + copyBytes - 1 (so PEIs decrement-push into the +* row in-place; bytes mirror to $E1 via SHR shadow). +* NB: brpBytes is at original D+2 -- after TCD = rowBase that read +* would land in pixel data. Use the long-mode brpBytesM1Saved instead. + lda >brpRowBase + clc + adc >brpBytesM1Saved + tcs ; SP = row base + copyBytes - 1 + lda >brpRowBase + tcd ; D = row base + +* Jump into the unrolled PEI sequence at the right offset. The full +* 16-bit operand was patched above by the 16-bit STA to brpJmpTarget+1. +brpJmpTarget anop + jmp peiSeqEnd ; 16-bit operand is patched per call + +* ----- Unrolled PEI sequence: 40 PEIs, walking DP offsets DOWN from +* $4E to $00 in 2-byte steps. JMP target lands at the right offset +* so only `words` PEIs execute. Each PEI: 6 cyc, pushes 2 bytes to +* SP (which mirrors to $E1 via shadow). Falls through to row +* advance after PEI $00. + pei $4E + pei $4C + pei $4A + pei $48 + pei $46 + pei $44 + pei $42 + pei $40 + pei $3E + pei $3C + pei $3A + pei $38 + pei $36 + pei $34 + pei $32 + pei $30 + pei $2E + pei $2C + pei $2A + pei $28 + pei $26 + pei $24 + pei $22 + pei $20 + pei $1E + pei $1C + pei $1A + pei $18 + pei $16 + pei $14 + pei $12 + pei $10 + pei $0E + pei $0C + pei $0A + pei $08 + pei $06 + pei $04 + pei $02 + pei $00 +peiSeqEnd anop + +* Advance row base by 160 and decrement rows-remaining.
+ lda >brpRowBase + clc + adc #160 + sta >brpRowBase + lda >brpRowsRem + dec a + sta >brpRowsRem + brl brpRowLoop + +brpExit anop + lda >brpOrigSp + tcs + sep #$20 + LONGA OFF + lda >brpOrigShadow + sta >$00C035 + lda #0 + sta >$00C004 ; AUXWRITE off + sta >$00C002 ; RAMRD off + rep #$20 + LONGA ON + + LONGA OFF + LONGI OFF + pld + plb + plp ; restores I (pre-SEI value) + rtl + end + + +brpOrigSp data RECTPEI + ds 2 + end +brpOrigShadow data RECTPEI + ds 1 + end +brpRowBase data RECTPEI + ds 2 + end +brpRowsRem data RECTPEI + ds 2 + end +brpJmpDelta data RECTPEI + ds 2 + end +brpBytesM1Saved data RECTPEI + ds 2 + end + + **************************************************************** * iigsMarkDirtyRowsInner(yStart, yEnd, minWord, maxWord) * @@ -2969,16 +3184,18 @@ gJoyOrigSpeed data DRAWPRIMS * iigsInputSnapshot(void) * * Per-frame input bookkeeping done in one tight asm pass instead of -* the three C memcpys + C TTL loop that joeyInputPoll used to do. -* Saves ~0.6 ms per frame in animated demos. +* three C memcpys. Saves ~0.5 ms per frame in animated demos. * -* Three combined operations: -* 1. Decrement gKeyTtl[i] for every key; on transition to zero, -* clear gKeyState[i] (key is now "released"). -* 2. Snapshot gKeyState -> gKeyPrev (KEY_COUNT bytes via long-mode +* Two combined operations: +* 1. Snapshot gKeyState -> gKeyPrev (KEY_COUNT bytes via long-mode * lda/sta loop, ~15 cyc/byte). -* 3. Snapshot gMouseButtonState/gJoyButtonState (4 bytes each) -* via 4 inline lda/sta pairs. +* 2. Snapshot gMouseButtonState/gJoyButtonState (4 bytes each) via +* 4 inline lda/sta pairs. +* +* The TTL-decay loop this used to run has been removed: the IIgs +* port now derives release directly from $C010 bit 7 in halInputPoll +* (the live "any key currently held" flag), so the inferred-release +* TTL mechanism is no longer needed. * * IMPORTANT: KEY_COUNT is hard-coded at 60 below. 
If you add or * remove a key in joey/input.h, bump the constant or the loop bounds @@ -2994,19 +3211,6 @@ iigsInputSnapshot start IIGSASM sep #$20 LONGA OFF -* TTL decrement + key-released detection. ~12 cyc / iter fast path. - ldx #59 ; KEY_COUNT - 1 -isnTtlLoop anop - lda >gKeyTtl,x - beq isnTtlNext ; ttl==0, nothing to do - dec a - sta >gKeyTtl,x - bne isnTtlNext ; not yet zero - sta >gKeyState,x ; A==0 -> mark released -isnTtlNext anop - dex - bpl isnTtlLoop - * Snapshot gKeyState -> gKeyPrev (60 bytes), long-mode loop. ldx #59 isnKeyLoop anop