Major IIgs improvements. Preparing to benchmark all ports.

This commit is contained in:
Scott Duensing 2026-05-01 00:50:56 -05:00
parent 20cbccaca5
commit 91fcd49f6f
44 changed files with 1482 additions and 455 deletions

View file

@ -120,8 +120,8 @@ int main(void) {
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());

View file

@ -246,8 +246,8 @@ int main(void) {
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());

View file

@ -12,8 +12,8 @@ int main(void) {
config.hostMode = HOST_MODE_OS;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());

View file

@ -218,8 +218,8 @@ int main(void) {
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());

View file

@ -225,8 +225,8 @@ int main(void) {
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());

View file

@ -108,8 +108,8 @@ int main(void) {
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());

View file

@ -113,8 +113,8 @@ int main(void) {
config.hostMode = HOST_MODE_TAKEOVER;
config.codegenBytes = 8 * 1024;
config.maxSurfaces = 4;
config.audioBytes = 64 * 1024;
config.assetBytes = 128 * 1024;
config.audioBytes = 64UL * 1024;
config.assetBytes = 128UL * 1024;
if (!joeyInit(&config)) {
fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());
@ -154,8 +154,7 @@ int main(void) {
vy = 1;
haveBackup = false;
spriteSaveUnder(screen, ball, x, y, &backup);
spriteDraw(screen, ball, x, y);
spriteSaveAndDraw(screen, ball, x, y, &backup);
stagePresentRect(backup.x, backup.y, backup.width, backup.height);
haveBackup = true;
@ -189,8 +188,7 @@ int main(void) {
if (y <= 0) { y = 0; vy = (int16_t)-vy; }
if (y >= SURFACE_HEIGHT - BALL_H) { y = SURFACE_HEIGHT - BALL_H; vy = (int16_t)-vy; }
spriteSaveUnder(screen, ball, x, y, &backup);
spriteDraw(screen, ball, x, y);
spriteSaveAndDraw(screen, ball, x, y, &backup);
// Bounding box of (old rect) U (new rect). For typical
// small-step motion the rects overlap heavily so the union

350
examples/uber/uber.c Normal file
View file

@ -0,0 +1,350 @@
// Uber demo: exercise every JoeyLib public API and measure throughput
// of the per-frame-hot ones. Results are written to joeylog.txt via
// joeyLogF. A green screen on exit means the run completed.
//
// Timing model: each test aligns to a VBL boundary via joeyWaitVBL,
// records the starting joeyFrameCount, then runs the op in a tight
// loop polling joeyFrameCount until UBER_FRAMES frames have elapsed.
// Reported metric is ops/sec, computed as iters * joeyFrameHz() /
// UBER_FRAMES so results are directly comparable across ports
// regardless of CPU speed or VBL rate.
//
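// joeyFrameCount is VBL-derived on every port (edge-detected inside
// the call on IIgs / DOS / Amiga, ISR-maintained on Atari ST); the
// per-iter poll adds ~10-30 cyc per op, which shows up as noise on
// the very fastest ops but is below ~5% even for ~500 cyc/op work.
// Caveat: on the edge-polled ports an op that runs longer than about
// half a frame can hide a VBL edge from the poll, which stretches the
// measurement window, so figures for the slowest ops (full-screen
// fills, full presents) may read high there.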
//
// One-shot ops (spriteCompile) get one call each, timed by frame
// delta -- coarser but representative.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <joey/joey.h>
// ----- Timing primitives -----
// 4-frame measurement window. Long enough that loop overhead doesn't
// dominate; short enough to keep the full demo run under ~10 sec.
#define UBER_FRAMES 4u
// Signature shared by every benchmark wrapper: no arguments, no
// result, so runForFrames can call any of them through one pointer.
typedef void (*OpFn)(void);
// Label of the test most recently handed to timeOp. Written there and
// not read anywhere else in this file.
static const char *gCurName = "(none)";
// Surface every op draws into; set from stageGet() in main.
static SurfaceT *gStage = NULL;
// Ball sprite used by the sprite ops; set from spriteCreate() in main.
static SpriteT *gSprite = NULL;
// Save-under record shared by the sprite save / restore wrappers.
static SpriteBackupT gBackup;
// Storage that main attaches to gBackup.bytes.
// NOTE(review): 256 looks sized with headroom for the 16x16 ball --
// confirm against what the save-under routines actually write.
static unsigned char gBackupBytes[256];
// Tile buffer handed to the tileSnap / tilePaste wrappers.
static TileT gTileScratch;
// Call `op` back-to-back, starting on a VBL boundary, until
// `targetFrames` ticks of joeyFrameCount have gone by. The frame
// counter is polled once before every call, and the elapsed-frame
// difference is taken in 16 bits so counter wrap-around is harmless.
// Returns the number of completed calls.
static unsigned long runForFrames(OpFn op, unsigned int targetFrames) {
    unsigned long iterations = 0UL;
    uint16_t      firstFrame;

    joeyWaitVBL();
    firstFrame = joeyFrameCount();
    for (;;) {
        if ((uint16_t)(joeyFrameCount() - firstFrame) >= targetFrames) {
            break;
        }
        op();
        iterations++;
    }
    return iterations;
}
// Benchmark a single op and write one line to the log. The line
// carries the raw iteration count for the UBER_FRAMES window plus the
// derived ops/sec (iters * joeyFrameHz() / UBER_FRAMES), which is the
// figure meant for comparing ports with different CPU speeds and
// refresh rates. A run that completes zero iterations gets its own
// message instead.
static void timeOp(const char *name, OpFn op) {
    unsigned long iterCount;
    unsigned long perSecond;

    gCurName  = name;
    iterCount = runForFrames(op, UBER_FRAMES);
    if (iterCount != 0UL) {
        perSecond = (iterCount * (unsigned long)joeyFrameHz()) / (unsigned long)UBER_FRAMES;
        joeyLogF("UBER: %s: %lu iters / %u frames = %lu ops/sec\n",
                 name, iterCount, UBER_FRAMES, perSecond);
        return;
    }
    joeyLogF("UBER: %s: 0 iters (op too slow?)\n", name);
}
// ----- Test ops -----
// Each wrapper below adapts one library call, with fixed arguments,
// to the argument-less OpFn shape that timeOp expects. All of them
// target gStage.

static void op_drawPixel(void) {
    drawPixel(gStage, 100, 100, 5);
}

static void op_drawLineH(void) {
    drawLine(gStage, 0, 50, 319, 50, 5);
}

static void op_drawLineV(void) {
    drawLine(gStage, 50, 0, 50, 199, 5);
}

static void op_drawLineDiag(void) {
    drawLine(gStage, 0, 0, 319, 199, 5);
}

static void op_drawRect(void) {
    drawRect(gStage, 10, 10, 100, 100, 5);
}

static void op_drawCircleSmall(void) {
    drawCircle(gStage, 160, 100, 16, 5);
}

static void op_drawCircleLarge(void) {
    drawCircle(gStage, 160, 100, 80, 5);
}

static void op_fillRectSmall(void) {
    fillRect(gStage, 20, 20, 16, 16, 7);
}

static void op_fillRectMid(void) {
    fillRect(gStage, 20, 20, 80, 80, 7);
}

static void op_fillRectFull(void) {
    fillRect(gStage, 0, 0, 320, 200, 7);
}

static void op_fillCircle(void) {
    fillCircle(gStage, 160, 100, 40, 7);
}

// The sampled value is discarded; only the call cost matters here.
static void op_samplePixel(void) {
    (void)samplePixel(gStage, 100, 100);
}

static void op_surfaceClear(void) {
    surfaceClear(gStage, 0);
}
// Load a fixed 16-entry colour table into palette 0 of gStage. The
// table is function-static so it is set up once rather than rebuilt
// on every timed call.
static void op_paletteSet(void) {
    static uint16_t benchPalette[16] = {
        0x000, 0xF00, 0x0F0, 0x00F,
        0xFF0, 0xF0F, 0x0FF, 0xFFF,
        0x800, 0x080, 0x008, 0x880,
        0x808, 0x088, 0x888, 0x444
    };

    paletteSet(gStage, 0, benchPalette);
}
// SCB wrapper: one scbSetRange call over lines 0..199.
static void op_scbSetRange(void) {
    scbSetRange(gStage, 0, 199, 0);
}

// Tile wrappers. Each touches a different tile cell of gStage;
// the copy variants use cell (5, 5) on the same surface as the other
// operand, and paste / snap go through gTileScratch.
static void op_tileFill(void) {
    tileFill(gStage, 5, 5, 7);
}

static void op_tileCopy(void) {
    tileCopy(gStage, 6, 6, gStage, 5, 5);
}

static void op_tileCopyMasked(void) {
    tileCopyMasked(gStage, 7, 7, gStage, 5, 5, 0);
}

static void op_tilePaste(void) {
    tilePaste(gStage, 8, 8, &gTileScratch);
}

static void op_tileSnap(void) {
    tileSnap(gStage, 5, 5, &gTileScratch);
}
// Fixed position at which every sprite wrapper places the ball.
static int16_t gSpriteX = 40;
static int16_t gSpriteY = 30;

// Sprite wrappers. Save, restore and save-and-draw all share gBackup,
// so the restore wrapper replays whatever the last save captured.
static void op_spriteSave(void) {
    spriteSaveUnder(gStage, gSprite, gSpriteX, gSpriteY, &gBackup);
}

static void op_spriteDraw(void) {
    spriteDraw(gStage, gSprite, gSpriteX, gSpriteY);
}

static void op_spriteRestore(void) {
    spriteRestoreUnder(gStage, &gBackup);
}

static void op_spriteSaveAndDraw(void) {
    spriteSaveAndDraw(gStage, gSprite, gSpriteX, gSpriteY, &gBackup);
}
// Present wrappers: whole stage, a 16x16 rect, and a 320x200 rect.
static void op_stagePresent(void) {
    stagePresent();
}

static void op_stagePresentRect8(void) {
    stagePresentRect(40, 30, 16, 16);
}

static void op_stagePresentRectF(void) {
    stagePresentRect(0, 0, 320, 200);
}

// Input wrappers. Query results are discarded; only call cost is
// being measured.
static void op_inputPoll(void) {
    joeyInputPoll();
}

static void op_keyDown(void) {
    (void)joeyKeyDown(KEY_A);
}

static void op_keyPressed(void) {
    (void)joeyKeyPressed(KEY_A);
}

static void op_mouseX(void) {
    (void)joeyMouseX();
}

static void op_joyConnected(void) {
    (void)joeyJoystickConnected(JOYSTICK_1);
}

// Audio wrappers.
static void op_audioFrameTick(void) {
    joeyAudioFrameTick();
}

static void op_audioIsPlaying(void) {
    (void)joeyAudioIsPlayingMod();
}

// Dirty-marking is exercised indirectly through a 32x32 fillRect
// (drawPixel already marks; use fill instead).
static void op_surfaceMarkDirty(void) {
    fillRect(gStage, 0, 0, 32, 32, 0);
}
// ----- Build the ball sprite procedurally -----
// The ball is a 2x2 grid of tiles. buildBallSprite fills 32 bytes per
// tile (8 rows of 4 bytes), so BALL_TILE_BYTES works out to 128.
#define BALL_TILES_X 2
#define BALL_TILES_Y 2
#define BALL_TILE_BYTES (BALL_TILES_X * BALL_TILES_Y * 32u)
// Ball image as authored: 16 rows of 8 bytes, stored row after row
// across the full sprite width. buildBallSprite re-slices it into
// per-tile order.
// NOTE(review): presumably two 4-bit pixels per byte (8 bytes = 16 px
// per row) -- confirm against the format spriteCreate expects.
static const uint8_t gBallAuthored[16 * 8] = {
0x00, 0x00, 0x22, 0x22, 0x22, 0x22, 0x00, 0x00,
0x00, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x00,
0x02, 0x22, 0x32, 0x22, 0x22, 0x22, 0x22, 0x20,
0x02, 0x23, 0x32, 0x22, 0x22, 0x22, 0x22, 0x20,
0x22, 0x33, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
0x02, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x20,
0x02, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x20,
0x00, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x00,
0x00, 0x00, 0x22, 0x22, 0x22, 0x22, 0x00, 0x00,
0x00, 0x00, 0x00, 0x22, 0x22, 0x00, 0x00, 0x00
};
// Tile-ordered copy of gBallAuthored, written by buildBallSprite and
// passed to spriteCreate in main.
static uint8_t gBallTiles[BALL_TILE_BYTES];
// Re-slice gBallAuthored (rows spanning the whole sprite) into
// gBallTiles (tile after tile, 8 rows of 4 bytes each). Tiles are
// visited left-to-right, then top-to-bottom, and every tile is
// written in full, so the output is filled strictly sequentially and
// a running write pointer is enough.
static void buildBallSprite(void) {
    uint16_t       tileRow;
    uint16_t       tileCol;
    uint16_t       line;
    uint16_t       col;
    const uint8_t *src;
    uint8_t       *out = gBallTiles;

    for (tileRow = 0; tileRow < BALL_TILES_Y; tileRow++) {
        for (tileCol = 0; tileCol < BALL_TILES_X; tileCol++) {
            for (line = 0; line < 8; line++) {
                // Start of this tile's 4-byte slice within the
                // authored 8-byte row.
                src = &gBallAuthored[((tileRow * 8) + line) * 8 + (tileCol * 4)];
                for (col = 0; col < 4; col++) {
                    *out++ = src[col];
                }
            }
        }
    }
}
// ----- Main -----
static void runAllTests(void) {
joeyLogF("UBER: ----- begin -----\n");
// Surface / palette / SCB.
timeOp("surfaceClear", op_surfaceClear);
timeOp("paletteSet", op_paletteSet);
timeOp("scbSetRange", op_scbSetRange);
// Drawing primitives.
timeOp("drawPixel", op_drawPixel);
timeOp("drawLine H", op_drawLineH);
timeOp("drawLine V", op_drawLineV);
timeOp("drawLine diag", op_drawLineDiag);
timeOp("drawRect 100x100", op_drawRect);
timeOp("drawCircle r=16", op_drawCircleSmall);
timeOp("drawCircle r=80", op_drawCircleLarge);
timeOp("fillRect 16x16", op_fillRectSmall);
timeOp("fillRect 80x80", op_fillRectMid);
timeOp("fillRect 320x200", op_fillRectFull);
timeOp("fillCircle r=40", op_fillCircle);
timeOp("samplePixel", op_samplePixel);
// Tiles. Seed scratch tile + dest cells with non-zero pixels first.
fillRect(gStage, 0, 0, 320, 64, 7);
tileSnap(gStage, 5, 5, &gTileScratch);
timeOp("tileFill", op_tileFill);
timeOp("tileCopy", op_tileCopy);
timeOp("tileCopyMasked", op_tileCopyMasked);
timeOp("tilePaste", op_tilePaste);
timeOp("tileSnap", op_tileSnap);
// Sprites. Background must be non-empty so save-under has work
// to do (otherwise it's a 4 KB memset of zeros, atypical).
surfaceClear(gStage, 4);
timeOp("spriteSaveUnder", op_spriteSave);
timeOp("spriteDraw", op_spriteDraw);
timeOp("spriteRestoreUnder", op_spriteRestore);
timeOp("spriteSaveAndDraw", op_spriteSaveAndDraw);
// Present.
timeOp("stagePresent full", op_stagePresent);
timeOp("stagePresentRect 8b",op_stagePresentRect8);
timeOp("stagePresentRect F", op_stagePresentRectF);
// Input.
timeOp("joeyInputPoll", op_inputPoll);
timeOp("joeyKeyDown", op_keyDown);
timeOp("joeyKeyPressed", op_keyPressed);
timeOp("joeyMouseX", op_mouseX);
timeOp("joeyJoyConnected", op_joyConnected);
// Audio.
timeOp("joeyAudioFrameTick", op_audioFrameTick);
timeOp("joeyAudioIsPlayingMod", op_audioIsPlaying);
// Surface mark dirty (via fillRect's mark step).
timeOp("surfaceMarkDirtyRect (via fillRect 32x32)", op_surfaceMarkDirty);
joeyLogF("UBER: ----- end -----\n");
}
// Program entry point: bring the library up, build and compile the
// ball sprite, run the timed suite, then show a green screen and wait
// for a key. Returns 0 after a completed run and 1 when start-up
// (joeyInit, stageGet or spriteCreate) fails.
int main(void) {
    JoeyConfigT config;
    uint16_t    pal[16];
    int         i;

    config.hostMode     = HOST_MODE_TAKEOVER;
    config.codegenBytes = 8 * 1024;
    config.maxSurfaces  = 4;
    config.audioBytes   = 64UL * 1024;
    config.assetBytes   = 128UL * 1024;
    if (!joeyInit(&config)) {
        // Report the reason the same way the other examples do rather
        // than exiting silently.
        fprintf(stderr, "joeyInit failed: %s\n", joeyLastError());
        return 1;
    }
    gStage = stageGet();
    if (gStage == NULL) {
        joeyShutdown();
        return 1;
    }

    // A simple visible palette so users see SOMETHING during the run.
    for (i = 0; i < 16; i++) {
        pal[i] = (uint16_t)((i << 8) | (i << 4) | i); // grey ramp
    }
    pal[ 0] = 0x000;
    pal[ 1] = 0x800; // dark red (running)
    pal[ 2] = 0x080; // green (done)
    pal[ 3] = 0x008; // blue
    pal[ 5] = 0xFF0; // yellow (test pixels)
    pal[ 7] = 0xFFF; // white (fills)
    pal[15] = 0xF00; // red
    paletteSet(gStage, 0, pal);
    scbSetRange(gStage, 0, 199, 0);

    // Indicate "running": red bar at top of screen.
    surfaceClear(gStage, 0);
    fillRect(gStage, 0, 0, 320, 8, 1);
    stagePresent();

    buildBallSprite();
    gSprite = spriteCreate(gBallTiles, BALL_TILES_X, BALL_TILES_Y, SPRITE_FLAGS_NONE);
    if (gSprite == NULL) {
        joeyLog("UBER: spriteCreate failed");
        joeyShutdown();
        return 1;
    }

    // spriteCompile is a one-shot. Time at frame resolution: sample
    // the frame counter immediately before and after the call and
    // report (after - before) + 1 as an upper bound, since the call
    // may have ended part-way through a frame.
    // NOTE(review): on ports where joeyFrameCount only advances when
    // polled, edges that pass during the compile itself cannot be
    // seen, so the bound is only meaningful up to one frame there.
    {
        uint16_t before;
        uint16_t after;
        int      compiledOk;

        joeyWaitVBL();
        before     = joeyFrameCount();
        compiledOk = spriteCompile(gSprite) ? 1 : 0;
        after      = joeyFrameCount();
        if (!compiledOk) {
            joeyLog("UBER: spriteCompile failed");
        }
        while (joeyFrameCount() == before) {
            /* wait for next VBL edge */
        }
        if (compiledOk) {
            // A timing line for a failed compile would be misleading,
            // so it is only written on success.
            joeyLogF("UBER: spriteCompile: 1 call in <= %u frames\n",
                     (unsigned int)(uint16_t)(after - before) + 1u);
        }
    }

    gBackup.bytes = gBackupBytes;

    // Audio: only init is exercised here. Triggering joeyAudioPlaySfx
    // without first calling joeyAudioPlayMod leaves NTP's engine in a
    // half-initialized state -- NTPstreamsound is designed to OVERLAY on
    // an already-running module. Without NTPprepare/NTPplay first, the
    // streamer oscillator is fired but no music tick ever advances or
    // silences it, and you get a stuck high-pitched scream. UBER doesn't
    // ship a MOD asset, so we skip the SFX exercise. The frame-tick and
    // isPlayingMod calls below still get timed (both are no-op fast
    // paths on IIgs).
    // NOTE(review): no explicit audio shutdown call is made before
    // joeyShutdown -- confirm joeyShutdown tears the audio side down.
    if (joeyAudioInit()) {
        joeyLogF("UBER: audioInit OK\n");
    } else {
        joeyLogF("UBER: audioInit failed (skipping audio)\n");
    }

    // Reset stage + run all per-frame timed tests.
    surfaceClear(gStage, 0);
    fillRect(gStage, 0, 0, 320, 8, 1);
    stagePresent();
    runAllTests();

    // Done. Green screen + waitForKey.
    surfaceClear(gStage, 2);
    stagePresent();
    joeyLogF("UBER: press any key to exit\n");
    joeyWaitForAnyKey();

    spriteDestroy(gSprite);
    joeyShutdown();
    return 0;
}

View file

@ -37,4 +37,17 @@ const char *joeyVersionString(void);
// always a hardware-level wait, not a software timer.
void joeyWaitVBL(void);
// Monotonic 16-bit frame counter. Polled by callers; ports detect
// the rising edge inside this call (IIgs $C019, DOS $3DA, Amiga
// VPOSR) or expose a counter maintained by a VBL ISR (Atari ST).
// Caller must poll faster than 2 * joeyFrameHz() so no edge is
// missed. Used by benchmarks and frame-rate-independent animation.
uint16_t joeyFrameCount(void);
// Nominal display frame rate in Hz: 50 (Amiga PAL), 60 (IIgs / ST
// NTSC default), 70 (VGA mode 13h). The actual VBL cadence may
// drift slightly; the value reported here is what benchmarks divide
// by to convert iters-per-N-frames to ops/sec.
uint16_t joeyFrameHz(void);
#endif

View file

@ -103,6 +103,20 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
// by other writes that overlapped its captured region.
void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup);
// Combined save-then-draw entry point. The common animation pattern
// captures the destination bytes about to be overwritten, then draws
// the sprite. Both ops share validation, the destination ptr is
// computed once, and a single dirty-rect mark covers both. Saves
// roughly one full dispatcher chain (~150 cyc on IIgs ORCA-C) per
// frame versus calling spriteSaveUnder + spriteDraw separately.
//
// Identical semantics to:
// spriteSaveUnder(s, sp, x, y, backup);
// spriteDraw(s, sp, x, y);
// modulo: the dirty rect is marked once for the union (which here is
// just the draw rect, since save doesn't write).
void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup);
// Snapshot an 8x8-aligned region of a SurfaceT into a new SpriteT.
// The captured pixel data is copied into a sprite-owned buffer so
// the source surface can be modified afterwards. Width and height

View file

@ -70,6 +70,8 @@ SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c
SPRITE_BIN := $(BINDIR)/Sprite
AUDIO_SRC := $(EXAMPLES)/audio/audio.c
AUDIO_BIN := $(BINDIR)/Audio
UBER_SRC := $(EXAMPLES)/uber/uber.c
UBER_BIN := $(BINDIR)/Uber
# Game data lives under bin/DATA/, ready to be copied into the
# scratch JOEYLIB hard-drive dir staged by scripts/run-amiga.sh.
@ -78,7 +80,7 @@ DATA_DIR := $(BINDIR)/DATA
DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx
.PHONY: all amiga clean-amiga
all amiga: $(LIB) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES)
all amiga: $(LIB) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(DATA_FILES)
$(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c
@mkdir -p $(dir $@)
@ -140,6 +142,10 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB)
@mkdir -p $(dir $@)
$(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS)
$(UBER_BIN): $(UBER_SRC) $(LIB)
@mkdir -p $(dir $@)
$(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS)
$(DATA_DIR)/test.mod: $(REPO_DIR)/assets/test.mod
@mkdir -p $(DATA_DIR)
cp $< $@

View file

@ -55,6 +55,8 @@ SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c
SPRITE_BIN := $(BINDIR)/SPRITE.PRG
AUDIO_SRC := $(EXAMPLES)/audio/audio.c
AUDIO_BIN := $(BINDIR)/AUDIO.PRG
UBER_SRC := $(EXAMPLES)/uber/uber.c
UBER_BIN := $(BINDIR)/UBER.PRG
# Game data lives under bin/DATA/, alongside the binaries Hatari picks
# up when bin/ is mounted as the GEMDOS C: drive. audio.c fopens
@ -63,7 +65,7 @@ DATA_DIR := $(BINDIR)/DATA
DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx
.PHONY: all atarist clean-atarist
all atarist: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES)
all atarist: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(DATA_FILES)
$(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c
@mkdir -p $(dir $@)
@ -132,6 +134,10 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB)
@mkdir -p $(dir $@)
$(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS)
$(UBER_BIN): $(UBER_SRC) $(LIB)
@mkdir -p $(dir $@)
$(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS)
$(DATA_DIR)/test.mod: $(REPO_DIR)/assets/test.mod
@mkdir -p $(DATA_DIR)
cp $< $@

View file

@ -49,6 +49,8 @@ SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c
SPRITE_BIN := $(BINDIR)/SPRITE.EXE
AUDIO_SRC := $(EXAMPLES)/audio/audio.c
AUDIO_BIN := $(BINDIR)/AUDIO.EXE
UBER_SRC := $(EXAMPLES)/uber/uber.c
UBER_BIN := $(BINDIR)/UBER.EXE
# Game data lives under bin/DATA/, alongside the binaries DOSBox picks
# up when bin/ is mounted as C:. audio.c fopens "DATA/test.mod" etc.
@ -56,7 +58,7 @@ DATA_DIR := $(BINDIR)/DATA
DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx
.PHONY: all dos clean-dos
all dos: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES)
all dos: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(DATA_FILES)
$(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c
@mkdir -p $(dir $@)
@ -121,6 +123,11 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB)
$(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@
$(DOS_EMBED_DPMI) $@
$(UBER_BIN): $(UBER_SRC) $(LIB)
@mkdir -p $(dir $@)
$(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@
$(DOS_EMBED_DPMI) $@
$(DATA_DIR)/test.mod: $(REPO_DIR)/assets/test.mod
@mkdir -p $(DATA_DIR)
cp $< $@

View file

@ -49,23 +49,13 @@ NTP_BIN := $(BUILD)/audio/ntpplayer.bin
NTP_ASM := $(BUILD)/audio/ntpdata.asm
IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32
# IMPORTANT: CODEGEN_SRCS (specifically spriteEmitIigs.c) MUST be the
# first entry after the main object in the link order. ORCA-Linker's
# bank assignment is order-sensitive: when spriteEmitIigs.c lands at
# any later position, the linker assigns SPRITECG to a bank where its
# intra-OMF-segment static-symbol relocations (emitMvnCopyRoutine,
# shiftedByteAt, writeLE16) can't be encoded -- you get cryptic
# "Addressing error" / "Unresolved reference Label: ..." failures
# whose root cause is bank packing, not source. Putting CODEGEN_SRCS
# first gives SPRITECG prime placement and the relocations resolve.
# This was the underlying cause of feedback_orca_link_segment_count
# cases 2-5 (we'd been working around it by managing _ROOT mass).
LIB_SRCS := $(CODEGEN_SRCS) $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM)
LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS)
HELLO_SRC := $(EXAMPLES)/hello/hello.c
HELLO_BIN := $(BINDIR)/HELLO
PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c
PATTERN_BIN := $(BINDIR)/PATTERN
# HELLO and PATTERN are intentionally omitted from this list. The UBER
# demo (below) exercises every public API, including what those two
# small examples covered, and the IIgs disk image was running out of
# room. Source for HELLO/PATTERN is still in examples/{hello,pattern}/
# for reference and for other ports that want them.
DRAW_SRC := $(EXAMPLES)/draw/draw.c
DRAW_BIN := $(BINDIR)/DRAW
KEYS_SRC := $(EXAMPLES)/keys/keys.c
@ -74,6 +64,8 @@ JOY_SRC := $(EXAMPLES)/joy/joy.c
JOY_BIN := $(BINDIR)/JOY
SPRITE_SRC := $(EXAMPLES)/sprite/sprite.c
SPRITE_BIN := $(BINDIR)/SPRITE
UBER_SRC := $(EXAMPLES)/uber/uber.c
UBER_BIN := $(BINDIR)/UBER
AUDIO_SRC := $(EXAMPLES)/audio/audio.c
AUDIO_BIN := $(BINDIR)/AUDIO
AUDIO_MOD := $(REPO_DIR)/assets/test.mod
@ -128,16 +120,6 @@ $(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh
# everywhere, so library asm can take SurfaceT* args via one
# consistent ABI (small-mm 16-bit pointers truncated bank bytes,
# which broke any asm that wanted to address bank-1 stage memory).
$(HELLO_BIN): $(HELLO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(HELLO_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
$(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS)
@ -158,6 +140,17 @@ $(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
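# UBER bumps user stack to 16 KB. ORCA-C's default user stack is small
# (~1 KB) and vfprintf's parsing buffer + the demo's own stack-local
# format buffers were spilling past it -- the symptom was a crash to
# monitor on the second varargs-style joeyLogF call. The hand-rolled
# decimal formatter in uber.c also uses larger stack-local buffers
# (line[96], num[16]) than typical demos. 16 KB is plenty of headroom.
# NOTE(review): uber.c as committed contains no hand-rolled decimal
# formatter or line[96] / num[16] buffers -- it logs only through
# joeyLogF -- so that part of the rationale looks stale; confirm
# whether the 16 KB stack is still required.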
$(UBER_BIN): $(UBER_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
@mkdir -p $(dir $@)
$(IIGS_BUILD) -b -s 16384 $(IIX_INCLUDES) -o $@ $(UBER_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
# Convert the cross-platform .MOD asset to NinjaTrackerPlus runtime
# format via joeymod (which shells out to ntpconverter.php). Without
# php-cli the conversion is skipped; in that case the IIgs disk just
@ -181,13 +174,13 @@ $(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD)
$(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS)
$(IIGS_IIX) chtyp -t S16 $@
# Assemble an 800KB ProDOS 2img containing the examples, ready to
# mount in GSplus alongside a GS/OS boot volume.
# Assemble a ProDOS 2img containing the examples, ready to mount in
# GSplus alongside a GS/OS boot volume.
iigs-disk: $(DISK_IMG)
$(DISK_IMG): $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
$(DISK_IMG): $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE)
@mkdir -p $(dir $@)
$(IIGS_PACKAGE) $@ $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) -- $(AUDIO_DATA_FILES)
$(IIGS_PACKAGE) $@ $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(UBER_BIN) -- $(AUDIO_DATA_FILES)
clean-iigs:
rm -rf $(BUILD)

View file

@ -1,29 +1,24 @@
#!/usr/bin/env bash
# Launch the built Apple IIgs examples in GSplus. GSplus is booted from
# a GS/OS 6.0.4 System disk (toolchains/emulators/support/gsos-system.po)
# with joey.2mg mounted as the data disk on slot 5 drive 2. The user
# navigates to the JOEYLIB volume in Finder and double-clicks the
# example to run it.
# with joey.2mg mounted as the data disk on slot 5 drive 2. GS/OS drops
# to Finder; the user navigates to the JOEYLIB volume and double-clicks
# whichever example they want to run.
#
# Unlike the other emulators, GS/OS does not auto-run on boot -- it
# drops to Finder. The argument just prints a reminder of which
# example to launch.
#
# scripts/run-iigs.sh # boots (Pattern hint)
# scripts/run-iigs.sh hello # boots, hints HELLO
# scripts/run-iigs.sh draw # boots, hints DRAW
#
# Argument is any built example name (case-insensitive); upper-case
# it for the Finder hint and existence-check.
# No argument: GSplus has no way to dispatch a specific binary on boot
# (Finder is interactive), so this script just stages the disk and
# launches the emulator. The post-run trap below extracts joeylog.txt
# from the data disk so demos that left a breadcrumb file are visible
# from the host shell after the emulator exits.
set -euo pipefail
if [[ $# -gt 1 ]]; then
echo "usage: $0 [example-name]" >&2
if [[ $# -ne 0 ]]; then
echo "usage: $0" >&2
echo " (no arguments -- launch GSplus, pick the demo in Finder)" >&2
exit 2
fi
prog=${1:-pattern}
repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
# profuse looks up its FST helpers under $GOLDEN_GATE / $ORCA_ROOT and
@ -38,18 +33,6 @@ sys_disk=$repo/toolchains/emulators/support/gsos-system.po
data_disk=$repo/build/iigs/bin/joey.2mg
null_c600=$repo/toolchains/emulators/support/iigs-null-c600.rom
target=${prog^^}
bin_dir=$repo/build/iigs/bin
if [[ ! -f "$bin_dir/$target" ]]; then
echo "$bin_dir/$target not built. Run 'make iigs' first." >&2
if compgen -G "$bin_dir/*" > /dev/null; then
echo "available examples in $bin_dir:" >&2
find "$bin_dir" -maxdepth 1 -type f -printf '%f\n' \
| grep -vE '\.2mg$|\.txt$' >&2 || true
fi
exit 1
fi
for f in "$gsplus" "$rom" "$sys_disk" "$data_disk" "$null_c600"; do
if [[ ! -f $f ]]; then
echo "missing: $f" >&2
@ -123,7 +106,7 @@ cat <<EOF
GSplus launching GS/OS 6.0.4.
Once Finder is up:
1. Open the JOEYLIB disk on the desktop.
2. Double-click $target to run.
2. Double-click whichever demo you want to run.
EOF
# GSplus auto-creates config.kegs in its cwd on first run. cd into

View file

@ -209,13 +209,9 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
{
uint8_t *destPtr;
uint8_t destBytes[4];
shift = (uint8_t)(x & 1);
destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
memcpy(destBytes, &destPtr, 4);
destAddr = (uint32_t)destBytes[0]
| ((uint32_t)destBytes[1] << 8)
| ((uint32_t)destBytes[2] << 16);
destAddr = (uint32_t)destPtr;
destOffset = (uint16_t)(destAddr & 0xFFFFu);
destBank = (uint8_t)((destAddr >> 16) & 0xFFu);
fnAddr = codegenArenaBaseAddr()
@ -248,9 +244,10 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
// fnAddr changes only on shift parity flips or sprite swaps.
if (fnAddr != gDrawStubLastFnAddr) {
gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu);
gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu);
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
gSpriteCallStub[ 9] = fnB_[0];
gSpriteCallStub[10] = fnB_[1];
gSpriteCallStub[11] = fnB_[2];
gDrawStubLastFnAddr = fnAddr;
}
@ -329,20 +326,41 @@ static void patchMvnBanks(uint8_t *routine, uint16_t heightPx, uint8_t dstBank,
}
// Common helper: dump a 24-bit pointer's raw bytes via memcpy
// (avoiding ORCA-C's lossy (uint32_t) pointer cast under memorymodel
// 1) and split into low 16 bits + bank.
static void splitPointer(const void *ptr, uint16_t *outLo, uint8_t *outBank) {
uint8_t bytes[4];
uint32_t addr;
memcpy(bytes, &ptr, 4);
addr = (uint32_t)bytes[0]
| ((uint32_t)bytes[1] << 8)
| ((uint32_t)bytes[2] << 16);
*outLo = (uint16_t)(addr & 0xFFFFu);
*outBank = (uint8_t)((addr >> 16) & 0xFFu);
}
// Split a 24-bit pointer into its low 16 bits + bank byte. The
// (uint32_t) cast works correctly in ORCA/C 2.2.1 (the 2.1.0 lossy-
// bank-byte bug is fixed). To avoid invoking the ~LSHR4 32-bit-shift
// helper for the `>> 16` to extract the bank byte, we cast to
// uint32_t and then byte-alias the storage -- gets the same bytes
// with simple loads.
//
//   _ptr     pointer expression to split; evaluated once.
//   _outLo   uint16_t * receiving bytes 0..1 of the address.
//   _outBank uint8_t  * receiving byte 2 of the address.
//
// Reading spB_[0] as the low byte and spB_[2] as the bank relies on
// the uint32_t being stored least-significant byte first.
#define SPLIT_POINTER(_ptr, _outLo, _outBank) \
do { \
uint32_t spAddr_ = (uint32_t)(_ptr); \
const uint8_t *spB_ = (const uint8_t *)&spAddr_; \
*(_outLo) = (uint16_t)(spB_[0] | ((uint16_t)spB_[1] << 8)); \
*(_outBank) = spB_[2]; \
} while (0)
// Backup-buffer pointer split cache. backup->bytes is a user-supplied
// buffer (e.g. a static array) and effectively never changes after
// the first call -- caching its split saves both Save and Restore the
// macro expansion per frame.
//
// This is a single-entry cache keyed on the pointer value: a call
// with a different buffer re-splits and replaces the entry. The
// initial state (null key, zero lo / bank) is what a null pointer
// would split to, assuming null is the all-zero address.
static const void *gLastBackupBytes = (const void *)0;
static uint16_t gLastBackupBytesLo = 0;
static uint8_t gLastBackupBytesBank = 0;
// Same outputs as SPLIT_POINTER, served from the cache when _bytes
// matches the last pointer seen. Unlike SPLIT_POINTER, the arguments
// are expanded more than once (_bytes up to three times, the output
// pointers twice), so pass expressions without side effects.
#define SPLIT_BACKUP_CACHED(_bytes, _outLo, _outBank) \
do { \
if ((const void *)(_bytes) == gLastBackupBytes) { \
*(_outLo) = gLastBackupBytesLo; \
*(_outBank) = gLastBackupBytesBank; \
} else { \
SPLIT_POINTER((_bytes), (_outLo), (_outBank)); \
gLastBackupBytes = (const void *)(_bytes); \
gLastBackupBytesLo = *(_outLo); \
gLastBackupBytesBank = *(_outBank); \
} \
} while (0)
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
@ -358,6 +376,10 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
uint32_t fnAddr;
uint8_t *routine;
uint8_t *screenPtr;
uint16_t cacheIdx; /* shift * SPRITE_OP_COUNT + SPRITE_OP_SAVE, computed once */
uint8_t *cachedDst; /* &sp->cachedDstBank[0][0] + cacheIdx */
uint8_t *cachedSrc; /* &sp->cachedSrcBank[0][0] + cacheIdx */
uint16_t routineOffset; /* sp->routineOffsets[shift][SPRITE_OP_SAVE], computed once */
shift = (uint8_t)(x & 1);
clippedX = (int16_t)(x & ~1);
@ -366,19 +388,39 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
screenPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)clippedX >> 1)];
splitPointer(screenPtr, &screenLo, &screenBank);
splitPointer(backup->bytes, &backupLo, &backupBank);
SPLIT_POINTER(screenPtr, &screenLo, &screenBank);
SPLIT_BACKUP_CACHED(backup->bytes, &backupLo, &backupBank);
backup->sprite = sp;
backup->x = clippedX;
backup->y = y;
backup->width = (uint16_t)(copyBytes << 1);
backup->height = heightPx;
backup->sizeBytes = (uint16_t)(copyBytes * heightPx);
/* sizeBytes is constant per (sprite, shift); cache to dodge the
* per-call ~CUMUL2 (uint16_t * uint16_t) helper. The byte-pointer
* arithmetic avoids reintroducing ~MUL4 for the uint16_t array
* indexing. */
{
uint16_t *sizeCachePtr = (uint16_t *)((uint8_t *)sp->cachedSizeBytes + ((uint16_t)shift << 1));
if (*sizeCachePtr == 0) {
*sizeCachePtr = (uint16_t)(copyBytes * heightPx);
}
backup->sizeBytes = *sizeCachePtr;
}
/* Compute the 1D index into the cached* / routineOffsets 2D arrays
* once. ORCA-C 2.2.1 lowers `shift * SPRITE_OP_COUNT` (where
* SPRITE_OP_COUNT==3) to a ~MUL4 helper call; (shift<<1)+shift
* compiles to two ASLs and an ADC, no helper. */
cacheIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
cachedDst = (uint8_t *)sp->cachedDstBank + cacheIdx;
cachedSrc = (uint8_t *)sp->cachedSrcBank + cacheIdx;
/* Same byte-pointer trick as SURFACE_ROW_OFFSET to dodge ~MUL4. */
routineOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (cacheIdx << 1));
fnAddr = codegenArenaBaseAddr()
+ sp->slot->offset
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_SAVE];
+ (uint32_t)routineOffset;
// Stub: X = screen (source), Y = backup (destination).
if (!gSaveStubInited) {
@ -401,22 +443,22 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
gSaveStubLastYLo = backupLo;
}
if (fnAddr != gSaveStubLastFnAddr) {
gSpriteSaveStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
gSpriteSaveStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteSaveStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
/* Byte-alias the uint32_t to grab its three address bytes in
* memory order (lo, hi, bank) without invoking ~LSHR4 for the >>16. */
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
gSpriteSaveStub[ 8] = fnB_[0];
gSpriteSaveStub[ 9] = fnB_[1];
gSpriteSaveStub[10] = fnB_[2];
gSaveStubLastFnAddr = fnAddr;
}
// Skip the 16+ MVN-bank rewrites if the dst/src bank pair is the
// same as last call. Screen and backup buffer banks are stable
// for essentially every frame past the first, so this short-
// circuits ~5000 cyc/frame on the ball demo.
if (sp->cachedDstBank[shift][SPRITE_OP_SAVE] != backupBank ||
sp->cachedSrcBank[shift][SPRITE_OP_SAVE] != screenBank) {
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE];
// same as last call.
if (*cachedDst != backupBank || *cachedSrc != screenBank) {
routine = codegenArenaBase() + sp->slot->offset + routineOffset;
patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank);
sp->cachedDstBank[shift][SPRITE_OP_SAVE] = backupBank;
sp->cachedSrcBank[shift][SPRITE_OP_SAVE] = screenBank;
*cachedDst = backupBank;
*cachedSrc = screenBank;
}
// MVN-based routine: needs M=16 / X=16; restore M=16 on exit
@ -442,6 +484,10 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
uint8_t *routine;
uint8_t *screenPtr;
SpriteT *sp;
uint16_t cacheIdx; /* shift * SPRITE_OP_COUNT + SPRITE_OP_RESTORE, computed once */
uint8_t *cachedDst;
uint8_t *cachedSrc;
uint16_t routineOffset;
sp = backup->sprite;
heightPx = backup->height;
@ -450,12 +496,19 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1;
screenPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)];
splitPointer(screenPtr, &screenLo, &screenBank);
splitPointer(backup->bytes, &backupLo, &backupBank);
SPLIT_POINTER(screenPtr, &screenLo, &screenBank);
SPLIT_BACKUP_CACHED(backup->bytes, &backupLo, &backupBank);
/* Hoist 2D-array indexing -- see save-side comment. */
cacheIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_RESTORE);
cachedDst = (uint8_t *)sp->cachedDstBank + cacheIdx;
cachedSrc = (uint8_t *)sp->cachedSrcBank + cacheIdx;
/* Same byte-pointer trick as SURFACE_ROW_OFFSET to dodge ~MUL4. */
routineOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (cacheIdx << 1));
fnAddr = codegenArenaBaseAddr()
+ sp->slot->offset
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_RESTORE];
+ (uint32_t)routineOffset;
// Stub: X = backup (source), Y = screen (destination).
if (!gRestoreStubInited) {
@ -478,20 +531,20 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
gRestoreStubLastYLo = screenLo;
}
if (fnAddr != gRestoreStubLastFnAddr) {
gSpriteRestoreStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
gSpriteRestoreStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteRestoreStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
gSpriteRestoreStub[ 8] = fnB_[0];
gSpriteRestoreStub[ 9] = fnB_[1];
gSpriteRestoreStub[10] = fnB_[2];
gRestoreStubLastFnAddr = fnAddr;
}
// Same short-circuit as save: only re-stamp the bank operands if
// they actually changed since last call.
if (sp->cachedDstBank[shift][SPRITE_OP_RESTORE] != screenBank ||
sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] != backupBank) {
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE];
if (*cachedDst != screenBank || *cachedSrc != backupBank) {
routine = codegenArenaBase() + sp->slot->offset + routineOffset;
patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank);
sp->cachedDstBank[shift][SPRITE_OP_RESTORE] = screenBank;
sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] = backupBank;
*cachedDst = screenBank;
*cachedSrc = backupBank;
}
asm {

View file

@ -31,16 +31,6 @@
#include "spriteEmitter.h"
#include "spriteInternal.h"
// Pin the IIgs sprite codegen statics into their own load segment
// instead of letting them ride in _ROOT. _ROOT also collects every
// other unsegmented .c (init.c, sprite.c, present.c, the example
// main, ...), so growth in any of those can shift the linker's
// per-bank packing and orphan intra-file static refs (we hit this
// when DRAWPRIMS grew with the chunked PEI-slam: PATTERN's link
// reported "Unresolved reference: emitMvnCopyRoutine" purely from
// _ROOT crowding). A dedicated load segment isolates this file.
JOEYLIB_SEGMENT("SPRITECG")
// ----- Constants -----

View file

@ -12,8 +12,6 @@
#include "joey/asset.h"
#include "joey/palette.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
#define JAS_HEADER_SIZE 44
#define JAS_PIXELS_OFFSET JAS_HEADER_SIZE

View file

@ -8,8 +8,6 @@
#include "joey/audio.h"
#include "hal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
static bool gAudioReady = false;
@ -79,5 +77,10 @@ void joeyAudioFrameTick(void) {
if (!gAudioReady) {
return;
}
#ifndef JOEYLIB_PLATFORM_IIGS
// IIgs: NTPstreamsound is fully DOC-IRQ-driven, halAudioFrameTick
// is an empty no-op there. Skip the wrapper JSL entirely on IIgs
// so per-frame audio cost stays at the gAudioReady branch above.
halAudioFrameTick();
#endif
}

View file

@ -23,22 +23,27 @@
#include "codegenArenaInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Module state -----
static uint8_t *gBase = NULL;
// gBaseAddr mirrors gBase as a 24-bit absolute address. ORCA-C's
// (uint32_t)pointer cast on the IIgs zeros the bank byte for some
// pointer expressions, so JSL targets read this field directly.
static uint32_t gBaseAddr = 0;
// gCodegenArenaBase / gCodegenArenaBaseAddr are non-static so spriteCompile.c can read them
// directly via extern instead of paying a JSL/RTL per access through
// the codegenArenaBase() / codegenArenaBaseAddr() wrappers. Both are
// set once at codegenArenaInit and never moved (the underlying
// Memory Manager handle is locked-in-place on IIgs). Callers MUST
// treat them as read-only.
uint8_t *gCodegenArenaBase = NULL;
// gCodegenArenaBaseAddr mirrors gCodegenArenaBase as a 24-bit
// absolute address. ORCA-C's (uint32_t)pointer cast on the IIgs
// zeros the bank byte for some pointer expressions, so JSL targets
// read this field directly.
uint32_t gCodegenArenaBaseAddr = 0;
static uint32_t gTotalBytes = 0;
static uint32_t gUsedBytes = 0;
static ArenaSlotT *gFirstSlot = NULL;
#if defined(JOEYLIB_PLATFORM_IIGS)
static Handle gBaseHandle = NULL;
static Handle gCodegenArenaBaseHandle = NULL;
#endif
@ -93,7 +98,7 @@ ArenaSlotT *codegenArenaAlloc(uint32_t bytes) {
ArenaSlotT *slot;
ArenaSlotT *remainder;
if (gBase == NULL || bytes == 0) {
if (gCodegenArenaBase == NULL || bytes == 0) {
return NULL;
}
for (slot = gFirstSlot; slot != NULL; slot = slot->next) {
@ -123,14 +128,11 @@ ArenaSlotT *codegenArenaAlloc(uint32_t bytes) {
}
uint8_t *codegenArenaBase(void) {
return gBase;
}
uint32_t codegenArenaBaseAddr(void) {
return gBaseAddr;
}
// codegenArenaBase() / codegenArenaBaseAddr() are now header-only
// macros that read gCodegenArenaBase / gCodegenArenaBaseAddr
// directly, so the C function bodies that used to live here are
// gone. The wrappers cost ~30 cyc per call on IIgs and were hit
// 3x per sprite frame.
uint32_t codegenArenaBytesTotal(void) {
@ -149,7 +151,7 @@ void codegenArenaCompact(void) {
ArenaSlotT *trailing;
uint32_t cursor;
if (gBase == NULL) {
if (gCodegenArenaBase == NULL) {
return;
}
cursor = 0;
@ -158,7 +160,7 @@ void codegenArenaCompact(void) {
next = slot->next;
if (slot->used) {
if (slot->offset != cursor) {
memmove(gBase + cursor, gBase + slot->offset, slot->size);
memmove(gCodegenArenaBase + cursor, gCodegenArenaBase + slot->offset, slot->size);
slot->offset = cursor;
}
cursor += slot->size;
@ -200,7 +202,7 @@ void codegenArenaCompact(void) {
void codegenArenaFree(ArenaSlotT *slot) {
if (slot == NULL || gBase == NULL) {
if (slot == NULL || gCodegenArenaBase == NULL) {
return;
}
if (!slot->used) {
@ -215,21 +217,21 @@ void codegenArenaFree(ArenaSlotT *slot) {
bool codegenArenaInit(uint32_t totalBytes) {
if (gBase != NULL) {
if (gCodegenArenaBase != NULL) {
return true;
}
if (totalBytes == 0) {
return false;
}
#if defined(JOEYLIB_PLATFORM_IIGS)
gBaseHandle = NewHandle(totalBytes, _ownerid,
gCodegenArenaBaseHandle = NewHandle(totalBytes, _ownerid,
attrFixed | attrLocked | attrPage | attrNoCross,
NULL);
if (gBaseHandle == NULL || _toolErr != 0) {
gBaseHandle = NULL;
if (gCodegenArenaBaseHandle == NULL || _toolErr != 0) {
gCodegenArenaBaseHandle = NULL;
return false;
}
HLock(gBaseHandle);
HLock(gCodegenArenaBaseHandle);
// Capture the 24-bit absolute address by copying the Pointer's
// raw bytes -- (uint32_t)pointer through a chain of expressions
// has been observed to drop the bank byte under ORCA-C's
@ -238,35 +240,35 @@ bool codegenArenaInit(uint32_t totalBytes) {
{
Pointer p;
uint8_t bytes[4];
p = *gBaseHandle;
gBase = (uint8_t *)p;
p = *gCodegenArenaBaseHandle;
gCodegenArenaBase = (uint8_t *)p;
memcpy(bytes, &p, 4);
gBaseAddr = (uint32_t)bytes[0]
gCodegenArenaBaseAddr = (uint32_t)bytes[0]
| ((uint32_t)bytes[1] << 8)
| ((uint32_t)bytes[2] << 16);
}
if (gBase == NULL) {
DisposeHandle(gBaseHandle);
gBaseHandle = NULL;
if (gCodegenArenaBase == NULL) {
DisposeHandle(gCodegenArenaBaseHandle);
gCodegenArenaBaseHandle = NULL;
return false;
}
#else
gBase = (uint8_t *)malloc(totalBytes);
if (gBase == NULL) {
gCodegenArenaBase = (uint8_t *)malloc(totalBytes);
if (gCodegenArenaBase == NULL) {
return false;
}
gBaseAddr = (uint32_t)gBase;
gCodegenArenaBaseAddr = (uint32_t)gCodegenArenaBase;
#endif
gFirstSlot = newSlot(0, totalBytes, false);
if (gFirstSlot == NULL) {
#if defined(JOEYLIB_PLATFORM_IIGS)
DisposeHandle(gBaseHandle);
gBaseHandle = NULL;
DisposeHandle(gCodegenArenaBaseHandle);
gCodegenArenaBaseHandle = NULL;
#else
free(gBase);
free(gCodegenArenaBase);
#endif
gBase = NULL;
gBaseAddr = 0;
gCodegenArenaBase = NULL;
gCodegenArenaBaseAddr = 0;
return false;
}
gTotalBytes = totalBytes;
@ -279,7 +281,7 @@ void codegenArenaShutdown(void) {
ArenaSlotT *slot;
ArenaSlotT *next;
if (gBase == NULL) {
if (gCodegenArenaBase == NULL) {
return;
}
for (slot = gFirstSlot; slot != NULL; slot = next) {
@ -287,13 +289,13 @@ void codegenArenaShutdown(void) {
free(slot);
}
#if defined(JOEYLIB_PLATFORM_IIGS)
DisposeHandle(gBaseHandle);
gBaseHandle = NULL;
DisposeHandle(gCodegenArenaBaseHandle);
gCodegenArenaBaseHandle = NULL;
#else
free(gBase);
free(gCodegenArenaBase);
#endif
gBase = NULL;
gBaseAddr = 0;
gCodegenArenaBase = NULL;
gCodegenArenaBaseAddr = 0;
gFirstSlot = NULL;
gTotalBytes = 0;
gUsedBytes = 0;

View file

@ -58,14 +58,15 @@ void codegenArenaCompact(void);
// Used for spriteDraw's address computation. The base pointer is
// stable for the lifetime of the arena; only slot->offset moves.
uint8_t *codegenArenaBase(void);
// Same address as codegenArenaBase() but returned as an integer. The
// IIgs JSL trampoline needs the 24-bit absolute address as a number
// it can split into bank/offset bytes; ORCA-C's pointer-to-uint32_t
// cast has dropped the bank byte in some expressions, so we expose
// the integer view directly.
uint32_t codegenArenaBaseAddr(void);
//
// Direct extern access (instead of a getter function) so per-frame
// hot paths in spriteCompile.c skip the JSL/PHB/RTL/PLB the wrapper
// would impose. Both globals are read-only after codegenArenaInit;
// the function-like macros below keep the old getter call syntax
// working as a back-compat shim.
extern uint8_t *gCodegenArenaBase;
extern uint32_t gCodegenArenaBaseAddr;
#define codegenArenaBase() ((uint8_t *)gCodegenArenaBase)
#define codegenArenaBaseAddr() ((uint32_t)gCodegenArenaBaseAddr)
// Public-API support: sum of live slot sizes, total arena size.
// Difference is free space (which may be fragmented across holes

View file

@ -13,8 +13,6 @@
#include "joey/platform.h"
#include "joey/debug.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
static const char *kLogPath = "joeylog.txt";

View file

@ -12,12 +12,6 @@
#include "hal.h"
#include "surfaceInternal.h"
// On IIgs, hoist all primitive functions out of _ROOT into a named
// DRAWPRIMS load segment. drawLine/drawCircle/fillCircle/floodFill/
// floodFillBounded together push past the 64 KB-per-bank budget for
// the simpler binaries (PATTERN was the first to fail). On other
// ports this macro vanishes.
JOEYLIB_SEGMENT("DRAWPRIMS")
// ----- Constants -----
@ -107,25 +101,28 @@ static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_
uint8_t nibble = colorIndex & 0x0F;
uint8_t doubled = (uint8_t)((nibble << 4) | nibble);
int16_t row;
int16_t pxStart;
int16_t pxEnd;
int16_t midBytes;
uint16_t pxStart;
uint16_t pxEnd;
uint16_t midBytes;
uint8_t *line;
/* px* and midBytes are uint16_t (clipped values are non-negative)
* so `>>1` lowers to a single LSR instead of ORCA-C's
* ~SSHIFTRIGHT helper. Same with `<<1` for midBytes. */
for (row = 0; row < h; row++) {
line = &s->pixels[SURFACE_ROW_OFFSET(y + row)];
pxStart = x;
pxEnd = x + w;
pxStart = (uint16_t)x;
pxEnd = (uint16_t)(x + w);
if (pxStart & 1) {
if (pxStart & 1u) {
line[pxStart >> 1] = (uint8_t)((line[pxStart >> 1] & 0xF0) | nibble);
pxStart++;
}
midBytes = (pxEnd - pxStart) >> 1;
if (midBytes > 0) {
midBytes = (uint16_t)((pxEnd - pxStart) >> 1);
if (midBytes > 0u) {
memset(&line[pxStart >> 1], doubled, (size_t)midBytes);
pxStart += midBytes << 1;
pxStart = (uint16_t)(pxStart + (midBytes << 1));
}
if (pxStart < pxEnd) {
@ -343,7 +340,10 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
static void dstPixel(uint8_t *row, int16_t x, uint8_t nibble) {
uint8_t *byte;
byte = &row[x >> 1];
/* `(uint16_t)x >> 1` instead of `x >> 1` -- caller has already
* range-checked x non-negative, and unsigned shift dodges the
* ~SSHIFTRIGHT helper ORCA-C emits for signed `>>`. */
byte = &row[(uint16_t)x >> 1];
if (x & 1) {
*byte = (uint8_t)((*byte & 0xF0) | nibble);
} else {
@ -355,7 +355,7 @@ static void dstPixel(uint8_t *row, int16_t x, uint8_t nibble) {
static uint8_t srcPixel(const uint8_t *row, int16_t x) {
uint8_t byte;
byte = row[x >> 1];
byte = row[(uint16_t)x >> 1];
if (x & 1) {
return (uint8_t)(byte & 0x0F);
}
@ -407,11 +407,13 @@ void drawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIn
drawPixel(s, (int16_t)(cx + y), (int16_t)(cy - x), colorIndex);
drawPixel(s, (int16_t)(cx - y), (int16_t)(cy - x), colorIndex);
y++;
/* Use `y + y + 1` instead of `2 * y + 1` so ORCA-C never emits
* the ~SMUL2 helper -- two ADDs are unconditionally cheaper. */
if (err <= 0) {
err = (int16_t)(err + 2 * y + 1);
err = (int16_t)(err + y + y + 1);
} else {
x--;
err = (int16_t)(err + 2 * (y - x) + 1);
err = (int16_t)(err + y + y - x - x + 1);
}
}
}
@ -502,7 +504,9 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) {
}
if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) {
byte = &s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)];
/* Cast to uint16_t before shift -- already validated x >= 0,
* so unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */
byte = &s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
nibble = colorIndex & 0x0F;
if (x & 1) {
*byte = (uint8_t)((*byte & 0xF0) | nibble);
@ -571,20 +575,26 @@ void fillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIn
// (y+1)^2 = y^2 + 2y + 1; (x-1)^2 = x^2 - 2x + 1. r is uint16_t
// so xx, yy, r2 fit in uint16_t for any r where x*x+y*y can equal
// r2 (i.e. r <= 255 -> r2 <= 65025).
/* Same `+ +` pattern as drawCircle so ORCA-C doesn't emit ~SMUL2 /
* ~CUMUL2 helpers for the `2 * ...` constants. spanWidth is hoisted
* because both fillRect calls in the body need it. */
xx = (uint16_t)(r * r);
r2 = xx;
yy = 0;
x = (int16_t)r;
for (y = 0; y <= (int16_t)r; y++) {
uint16_t spanWidth;
while (xx + yy > r2) {
xx = (uint16_t)(xx - (uint16_t)(2 * x - 1));
xx = (uint16_t)(xx - (uint16_t)((uint16_t)x + (uint16_t)x - 1u));
x--;
}
fillRect(s, (int16_t)(cx - x), (int16_t)(cy + y), (uint16_t)(2 * x + 1), 1, colorIndex);
spanWidth = (uint16_t)((uint16_t)x + (uint16_t)x + 1u);
fillRect(s, (int16_t)(cx - x), (int16_t)(cy + y), spanWidth, 1, colorIndex);
if (y > 0) {
fillRect(s, (int16_t)(cx - x), (int16_t)(cy - y), (uint16_t)(2 * x + 1), 1, colorIndex);
fillRect(s, (int16_t)(cx - x), (int16_t)(cy - y), spanWidth, 1, colorIndex);
}
yy = (uint16_t)(yy + (uint16_t)(2 * y + 1));
yy = (uint16_t)(yy + (uint16_t)((uint16_t)y + (uint16_t)y + 1u));
}
}
@ -668,11 +678,16 @@ uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) {
return 0;
}
byte = s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)];
/* Cast to uint16_t before shift -- already validated x >= 0,
* unsigned semantics match. Avoids ~SSHIFTRIGHT helper. */
byte = s->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
if (x & 1) {
return (uint8_t)(byte & 0x0F);
}
return (uint8_t)(byte >> 4);
/* `byte >> 4` is uint8_t but ORCA-C promotes to int (signed 16-bit)
* for the shift, then narrows -- triggers ~SSHIFTRIGHT. The
* mask-then-shift sidesteps the promotion path. */
return (uint8_t)((byte & 0xF0u) >> 4);
}

View file

@ -58,6 +58,19 @@ void halInputPoll(void);
// graphics.library WaitTOF, XBIOS Vsync, $C019 polling).
void halWaitVBL(void);
// Monotonic 16-bit frame counter. Caller polls; ports either detect
// the rising edge inside this call (IIgs $C019 / DOS $3DA / Amiga
// VPOSR) or return a counter maintained by a VBL ISR (ST). Required
// caller invariant: poll faster than 2 * halFrameHz() so no edge is
// missed. Used by benchmarks; cheap enough for animation cadence too.
uint16_t halFrameCount(void);
// Nominal display frame rate in Hz (50 PAL Amiga, 60 NTSC IIgs / ST,
// ~70 VGA mode 13h). Reported only -- no API contract that VBLs
// arrive at exactly this rate. Benchmarks divide by it to convert
// iters-per-N-frames to ops/sec.
uint16_t halFrameHz(void);
// Audio: per-port engine setup, module + SFX playback, teardown.
// halAudioInit returns true if the platform has a working engine.
// All entry points are safe to call when init failed -- they become
@ -278,11 +291,12 @@ extern uint16_t gFloodRightX;
// Tile primitives operate on caller-computed row pointers; just
// forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte
// offset within the surface.
// offset within the surface. Use SURFACE_ROW_OFFSET (LUT lookup) to
// dodge ORCA-C 2.2.1's ~CUMUL2 helper for the *160 multiply.
#undef halFastTileFill
#define halFastTileFill(_s, _bx, _by, _fw) \
(iigsTileFillInner(&(_s)->pixels[(uint16_t)(_by) * 8 * SURFACE_BYTES_PER_ROW \
+ (uint16_t)(_bx) * 4], \
(iigsTileFillInner(&(_s)->pixels[SURFACE_ROW_OFFSET((uint16_t)(_by) << 3) \
+ ((uint16_t)(_bx) << 2)], \
(_fw)), \
true)

View file

@ -12,8 +12,6 @@
#include "hal.h"
#include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// 8 KB fits the largest typical sprite working set (~3-4 KB per
// 32x32 sprite at all opaque) and keeps malloc requests small enough
@ -121,3 +119,13 @@ const char *joeyVersionString(void) {
void joeyWaitVBL(void) {
halWaitVBL();
}
// Public wrapper: forwards to the HAL's halFrameCount() and returns
// its 16-bit frame counter unchanged.
uint16_t joeyFrameCount(void) {
return halFrameCount();
}
// Public wrapper: forwards to the HAL's halFrameHz() and returns the
// nominal display frame rate (in Hz) that the port reports, unchanged.
uint16_t joeyFrameHz(void) {
return halFrameHz();
}

View file

@ -15,34 +15,39 @@
#include "hal.h"
#include "inputInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
bool gKeyState [KEY_COUNT];
bool gKeyPrev [KEY_COUNT];
// See inputInternal.h for why these are uint8_t and not bool.
uint8_t gKeyState [KEY_COUNT];
uint8_t gKeyPrev [KEY_COUNT];
int16_t gMouseX = 0;
int16_t gMouseY = 0;
bool gMouseButtonState[MOUSE_BUTTON_COUNT];
bool gMouseButtonPrev [MOUSE_BUTTON_COUNT];
uint8_t gMouseButtonState[MOUSE_BUTTON_COUNT];
uint8_t gMouseButtonPrev [MOUSE_BUTTON_COUNT];
bool gJoyConnected [JOYSTICK_COUNT];
uint8_t gJoyConnected [JOYSTICK_COUNT];
int8_t gJoyAxisX [JOYSTICK_COUNT];
int8_t gJoyAxisY [JOYSTICK_COUNT];
bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
uint8_t gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
uint8_t gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
uint8_t gJoyDeadZone [JOYSTICK_COUNT];
#ifdef JOEYLIB_PLATFORM_IIGS
extern void iigsInputSnapshot(void);
// Build-time check: iigsInputSnapshot's asm hard-codes KEY_COUNT=60
// and the small button counts. If a future change adds/removes keys
// or buttons the asm must be updated; this declares a zero-size
// array if the math no longer matches, which is a compile error.
// Build-time checks: iigsInputSnapshot's asm hard-codes KEY_COUNT=60
// and the small button counts, and walks every array one byte per
// element. If a future change adds/removes keys or buttons the asm
// must be updated; if anyone re-types the arrays back to bool the
// per-element size grows to ORCA-C's 2-byte _Bool and the asm reads
// the wrong bytes. Either condition makes one of the typedefs below
// declare a negative-size array, which is a compile error.
typedef int joey_keycount_check [(KEY_COUNT == 60) ? 1 : -1];
typedef int joey_mousebtn_check [(MOUSE_BUTTON_COUNT == 4) ? 1 : -1];
typedef int joey_joybtn_check [(JOYSTICK_COUNT * JOY_BUTTON_COUNT == 4) ? 1 : -1];
typedef int joey_keystate_size_check [(sizeof(gKeyState) == KEY_COUNT) ? 1 : -1];
typedef int joey_mousebtn_size_check [(sizeof(gMouseButtonState) == MOUSE_BUTTON_COUNT) ? 1 : -1];
typedef int joey_joybtn_size_check [(sizeof(gJoyButtonState) == JOYSTICK_COUNT * JOY_BUTTON_COUNT) ? 1 : -1];
#endif
void joeyInputPoll(void) {
@ -79,8 +84,14 @@ void joeyWaitForAnyKey(void) {
}
/* All six key/mouse predicates fold the lower-bound check (`<= NONE`)
* and upper-bound check (`>= COUNT`) into a single unsigned compare:
* negative values wrap to large unsigned ones and fail the `>= COUNT`
* test. Index 0 (KEY_NONE / MOUSE_BUTTON_NONE) now passes the range
* check, but it is a sentinel that no HAL ever writes, so reading
* gKeyState[0] / gMouseButtonState[0] always yields 0 -- the predicate
* result is unchanged, and ORCA-C compiles one branch per predicate
* instead of the compound `||`. */
bool joeyKeyDown(JoeyKeyE key) {
if (key <= KEY_NONE || key >= KEY_COUNT) {
if ((uint16_t)key >= (uint16_t)KEY_COUNT) {
return false;
}
return gKeyState[key];
@ -88,7 +99,7 @@ bool joeyKeyDown(JoeyKeyE key) {
bool joeyKeyPressed(JoeyKeyE key) {
if (key <= KEY_NONE || key >= KEY_COUNT) {
if ((uint16_t)key >= (uint16_t)KEY_COUNT) {
return false;
}
return gKeyState[key] && !gKeyPrev[key];
@ -96,7 +107,7 @@ bool joeyKeyPressed(JoeyKeyE key) {
bool joeyKeyReleased(JoeyKeyE key) {
if (key <= KEY_NONE || key >= KEY_COUNT) {
if ((uint16_t)key >= (uint16_t)KEY_COUNT) {
return false;
}
return !gKeyState[key] && gKeyPrev[key];
@ -104,7 +115,7 @@ bool joeyKeyReleased(JoeyKeyE key) {
bool joeyMouseDown(JoeyMouseButtonE button) {
if (button <= MOUSE_BUTTON_NONE || button >= MOUSE_BUTTON_COUNT) {
if ((uint16_t)button >= (uint16_t)MOUSE_BUTTON_COUNT) {
return false;
}
return gMouseButtonState[button];
@ -112,7 +123,7 @@ bool joeyMouseDown(JoeyMouseButtonE button) {
bool joeyMousePressed(JoeyMouseButtonE button) {
if (button <= MOUSE_BUTTON_NONE || button >= MOUSE_BUTTON_COUNT) {
if ((uint16_t)button >= (uint16_t)MOUSE_BUTTON_COUNT) {
return false;
}
return gMouseButtonState[button] && !gMouseButtonPrev[button];
@ -120,7 +131,7 @@ bool joeyMousePressed(JoeyMouseButtonE button) {
bool joeyMouseReleased(JoeyMouseButtonE button) {
if (button <= MOUSE_BUTTON_NONE || button >= MOUSE_BUTTON_COUNT) {
if ((uint16_t)button >= (uint16_t)MOUSE_BUTTON_COUNT) {
return false;
}
return !gMouseButtonState[button] && gMouseButtonPrev[button];
@ -138,7 +149,7 @@ int16_t joeyMouseY(void) {
bool joeyJoystickConnected(JoeyJoystickE js) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return false;
}
return gJoyConnected[js];
@ -146,7 +157,7 @@ bool joeyJoystickConnected(JoeyJoystickE js) {
int8_t joeyJoystickX(JoeyJoystickE js) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return 0;
}
return gJoyAxisX[js];
@ -154,48 +165,59 @@ int8_t joeyJoystickX(JoeyJoystickE js) {
int8_t joeyJoystickY(JoeyJoystickE js) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return 0;
}
return gJoyAxisY[js];
}
/* Joystick button predicates: ORCA-C 2.2.1 lowers `gJoyButtonState[js][button]`
* to a ~MUL4 helper per access. Compute the 1D byte index once and read
* via an explicit (uint8_t *) cast -- no helpers. */
bool joeyJoyDown(JoeyJoystickE js, JoeyJoyButtonE button) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
uint16_t idx;
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return false;
}
if ((int)button < 0 || (int)button >= JOY_BUTTON_COUNT) {
if ((uint16_t)button >= (uint16_t)JOY_BUTTON_COUNT) {
return false;
}
return gJoyButtonState[js][button];
idx = (uint16_t)((uint16_t)js * JOY_BUTTON_COUNT + (uint16_t)button);
return ((const uint8_t *)gJoyButtonState)[idx] != 0;
}
bool joeyJoyPressed(JoeyJoystickE js, JoeyJoyButtonE button) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
uint16_t idx;
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return false;
}
if ((int)button < 0 || (int)button >= JOY_BUTTON_COUNT) {
if ((uint16_t)button >= (uint16_t)JOY_BUTTON_COUNT) {
return false;
}
return gJoyButtonState[js][button] && !gJoyButtonPrev[js][button];
idx = (uint16_t)((uint16_t)js * JOY_BUTTON_COUNT + (uint16_t)button);
return (((const uint8_t *)gJoyButtonState)[idx] != 0) &&
(((const uint8_t *)gJoyButtonPrev) [idx] == 0);
}
bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
uint16_t idx;
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return false;
}
if ((int)button < 0 || (int)button >= JOY_BUTTON_COUNT) {
if ((uint16_t)button >= (uint16_t)JOY_BUTTON_COUNT) {
return false;
}
return !gJoyButtonState[js][button] && gJoyButtonPrev[js][button];
idx = (uint16_t)((uint16_t)js * JOY_BUTTON_COUNT + (uint16_t)button);
return (((const uint8_t *)gJoyButtonState)[idx] == 0) &&
(((const uint8_t *)gJoyButtonPrev) [idx] != 0);
}
void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return;
}
gJoyDeadZone[js] = deadZone;

View file

@ -12,19 +12,26 @@
#include "joey/input.h"
#include "joey/types.h"
extern bool gKeyState[KEY_COUNT];
extern bool gKeyPrev [KEY_COUNT];
// Stored as uint8_t (not bool) because ORCA-C compiles _Bool as a
// 2-byte word (Symbol.pas: size := cgWordSize). The IIgs asm fast
// path (iigsInputSnapshot) walks these arrays one byte per element;
// a 2-byte bool would put element k at byte offset 2*k and the asm's
// per-byte clear would never reach the live half. uint8_t pins the
// storage to one byte per element on every port. Public predicates
// still return bool via implicit coercion.
extern uint8_t gKeyState[KEY_COUNT];
extern uint8_t gKeyPrev [KEY_COUNT];
extern int16_t gMouseX;
extern int16_t gMouseY;
extern bool gMouseButtonState[MOUSE_BUTTON_COUNT];
extern bool gMouseButtonPrev [MOUSE_BUTTON_COUNT];
extern uint8_t gMouseButtonState[MOUSE_BUTTON_COUNT];
extern uint8_t gMouseButtonPrev [MOUSE_BUTTON_COUNT];
extern bool gJoyConnected[JOYSTICK_COUNT];
extern uint8_t gJoyConnected[JOYSTICK_COUNT];
extern int8_t gJoyAxisX [JOYSTICK_COUNT];
extern int8_t gJoyAxisY [JOYSTICK_COUNT];
extern bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
extern bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
extern uint8_t gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
extern uint8_t gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
// Per-stick analog calibration. Set by joeyJoystickReset on platforms
// with analog paddles (IIgs); ignored on digital-stick platforms.

View file

@ -10,24 +10,69 @@
#include "joey/palette.h"
#include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// Standard 16-color EGA palette in IIgs $0RGB format: the top nibble
// is zero and the remaining three nibbles hold red, green and blue
// (compare 0x0A00 Red, 0x00A0 Green, 0x000A Blue below). Used as the
// per-surface default at allocation time (paletteInitDefault) so a
// program that draws without first calling paletteSet still gets
// recognizable colors instead of an all-black palette.
//
// Entries 0-7 are the dim colors (channel level $A); entries 8-15 are
// their bright counterparts (channel levels $5 / $F). EGA index 6 is
// the canonical "brown" hack ($0A50, half-green), mirroring the way
// CGA monitors rendered this entry as brown rather than dark yellow.
static const uint16_t kDefaultPaletteEga[SURFACE_COLORS_PER_PALETTE] = {
0x0000, // 0: Black
0x000A, // 1: Blue
0x00A0, // 2: Green
0x00AA, // 3: Cyan
0x0A00, // 4: Red
0x0A0A, // 5: Magenta
0x0A50, // 6: Brown
0x0AAA, // 7: Light Gray
0x0555, // 8: Dark Gray
0x055F, // 9: Light Blue
0x05F5, // 10: Light Green
0x05FF, // 11: Light Cyan
0x0F55, // 12: Light Red
0x0F5F, // 13: Light Magenta
0x0FF5, // 14: Yellow
0x0FFF // 15: White
};
// ----- Internal API -----
// Load the default EGA color table into every palette slot of the
// given surface by calling paletteSet once per slot. A NULL surface
// is ignored.
void paletteInitDefault(SurfaceT *s) {
    uint8_t paletteIndex = 0;

    if (s != NULL) {
        // Every slot receives the same 16-entry table.
        while (paletteIndex < SURFACE_PALETTE_COUNT) {
            paletteSet(s, paletteIndex, kDefaultPaletteEga);
            paletteIndex++;
        }
    }
}
// ----- Public API (alphabetical) -----
void paletteGet(const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16) {
const uint16_t *row;
if (s == NULL || out16 == NULL) {
return;
}
if (paletteIndex >= SURFACE_PALETTE_COUNT) {
return;
}
memcpy(out16, s->palette[paletteIndex], SURFACE_COLORS_PER_PALETTE * sizeof(uint16_t));
/* Byte-pointer math + shift to skip the ~MUL4 helper -- see
* paletteSet for the reasoning. */
row = (const uint16_t *)((const uint8_t *)s->palette + ((uint16_t)paletteIndex << 5));
memcpy(out16, row, SURFACE_COLORS_PER_PALETTE * sizeof(uint16_t));
}
void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) {
uint8_t i;
uint16_t *row;
const uint16_t *src;
if (s == NULL || colors16 == NULL) {
return;
@ -36,9 +81,18 @@ void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) {
return;
}
s->palette[paletteIndex][0] = 0x0000;
/* Compute the row pointer via byte-pointer math + a single shift
* (16 entries * 2 bytes = 32 = 1 << 5) so ORCA-C doesn't emit a
* ~MUL4 helper for the 2D-array indexing. Then walk both arrays
* with post-increment pointers so the inner loop avoids ~MUL4
* for every `row[i]` / `colors16[i]` index multiply too. */
row = (uint16_t *)((uint8_t *)s->palette + ((uint16_t)paletteIndex << 5));
src = colors16;
*row++ = 0x0000;
src++;
for (i = 1; i < SURFACE_COLORS_PER_PALETTE; i++) {
s->palette[paletteIndex][i] = colors16[i] & 0x0FFF;
*row++ = (uint16_t)(*src++ & 0x0FFF);
}
if (s == stageGet()) {
gStagePaletteDirty = true;

View file

@ -12,8 +12,6 @@
#include "hal.h"
#include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Public API (alphabetical) -----

View file

@ -9,8 +9,6 @@
#include "joey/palette.h"
#include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Public API (alphabetical) -----

View file

@ -13,8 +13,6 @@
#include "spriteInternal.h"
#include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// 8x8 tiles, 4bpp packed = 4 bytes/row * 8 rows = 32 bytes/tile.
#define TILE_BYTES 32
@ -180,6 +178,7 @@ SpriteT *spriteCreate(const uint8_t *tileData, uint8_t widthTiles, uint8_t heigh
memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets));
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
memset(sp->cachedSizeBytes, 0, sizeof(sp->cachedSizeBytes));
sp->flags = flags;
return sp;
}
@ -249,6 +248,7 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y,
memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets));
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
memset(sp->cachedSizeBytes, 0, sizeof(sp->cachedSizeBytes));
sp->flags = flags;
return sp;
}
@ -296,6 +296,63 @@ void spritePrewarm(SpriteT *sp) {
}
// Save-under followed by draw, in one call.
//
// When the sprite has compiled SAVE and DRAW routines for the shift
// selected by x's low bit, the backup buffer exists, and the sprite
// rectangle lies completely on the surface, both compiled entry
// points are invoked directly after one shared validation pass and
// the rectangle is dirtied once (only the draw modifies pixels).
// In every other situation the call is forwarded to the public
// spriteSaveUnder + spriteDraw pair, which costs two trips through
// the dispatcher but covers the remaining cases.
//
// NULL s / sp / backup makes the call a no-op.
void spriteSaveAndDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    uint16_t pxWide;
    uint16_t pxHigh;

    if (s == NULL || sp == NULL || backup == NULL) {
        return;
    }

    /* Same bookkeeping spriteSaveUnder performs on entry. */
    backup->sprite    = sp;
    backup->sizeBytes = 0;

    pxWide = (uint16_t)(sp->widthTiles  * TILE_PIXELS);
    pxHigh = (uint16_t)(sp->heightTiles * TILE_PIXELS);

    if (sp->slot != NULL && backup->bytes != NULL && isFullyOnSurface(x, y, pxWide, pxHigh)) {
        uint16_t       rowIdx;
        const uint8_t *rowBytes;
        uint16_t       saveOffset;
        uint16_t       drawOffset;

        /* routineOffsets[shift][op] is addressed by hand: the row
         * start is shift * 3 entries (written as shift*2 + shift),
         * each entry is 2 bytes, and the op index is added as a
         * byte offset. This keeps ORCA-C from emitting its ~MUL4
         * helper for the 2D-array index. */
        rowIdx     = (uint16_t)(x & 1);
        rowIdx     = (uint16_t)((rowIdx << 1) + rowIdx);
        rowBytes   = (const uint8_t *)sp->routineOffsets + (rowIdx << 1);
        saveOffset = *(const uint16_t *)(rowBytes + (SPRITE_OP_SAVE << 1));
        drawOffset = *(const uint16_t *)(rowBytes + (SPRITE_OP_DRAW << 1));

        if (saveOffset != SPRITE_NOT_COMPILED && drawOffset != SPRITE_NOT_COMPILED) {
            spriteCompiledSaveUnder(s, sp, x, y, backup);
            spriteCompiledDraw(s, sp, x, y);
            surfaceMarkDirtyRect(s, x, y, (int16_t)pxWide, (int16_t)pxHigh);
            return;
        }
    }

    /* General route through the public API (interpreter, partial
     * clip, no-backup-buffer modes). */
    spriteSaveUnder(s, sp, x, y, backup);
    spriteDraw(s, sp, x, y);
}
// .spr file format:
// offset bytes field
// ------ ----- --------------------------------------------
@ -394,6 +451,7 @@ SpriteT *spriteFromCompiledMem(const uint8_t *data, uint32_t length, SpriteFlags
sp->flags = flags;
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
memset(sp->cachedSizeBytes, 0, sizeof(sp->cachedSizeBytes));
return sp;
}
@ -528,65 +586,115 @@ uint32_t spriteCodegenBytesUsed(void) {
void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) {
int16_t row;
int16_t byteStart;
int16_t copyBytes;
uint16_t spriteBytesPerRow;
uint8_t shift;
uint8_t *dstRow;
/* Fast-path locals only. Slow-path uses an inner block. */
int16_t bx;
int16_t by;
uint16_t bw;
uint16_t bh;
SpriteT *sp;
uint16_t spriteBytesPerRow;
int16_t copyBytes;
uint8_t shift;
if (s == NULL || backup == NULL || backup->bytes == NULL) {
if (s == NULL || backup == NULL) {
return;
}
if (backup->width == 0 || backup->height == 0) {
return;
}
if (backup->x < 0 || backup->y < 0) {
return;
}
if (backup->x >= SURFACE_WIDTH || backup->y >= SURFACE_HEIGHT) {
return;
}
if (backup->x + backup->width > SURFACE_WIDTH) {
return;
}
if (backup->y + backup->height > SURFACE_HEIGHT) {
return;
}
// Saved region is byte-aligned; sub-byte boundaries can't be
// represented without losing the neighboring pixel under the byte.
if ((backup->x & 1) || (backup->width & 1)) {
bx = backup->x;
by = backup->y;
bw = backup->width;
bh = backup->height;
/* Validate. Note: SURFACE_WIDTH - bx and SURFACE_HEIGHT - by stay
* in uint16_t range once bx >= 0 / by >= 0 has been checked, so
* the right-edge / bottom-edge tests don't need 32-bit arithmetic
* (which would invoke ORCA-C's ~GRTL helper, ~50 cyc per call). */
if (backup->bytes == NULL ||
bw == 0 || bh == 0 ||
bx < 0 || by < 0 ||
bx >= SURFACE_WIDTH || by >= SURFACE_HEIGHT ||
bw > (uint16_t)(SURFACE_WIDTH - bx) ||
bh > (uint16_t)(SURFACE_HEIGHT - by) ||
(bx & 1) || (bw & 1)) {
return;
}
sp = backup->sprite;
if (sp != NULL && sp->slot != NULL && backup->height == sp->heightTiles * TILE_PIXELS) {
if (sp != NULL && sp->slot != NULL && bh == sp->heightTiles * TILE_PIXELS) {
uint16_t routeIdx;
uint16_t routeOffset;
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
copyBytes = (int16_t)(backup->width >> 1);
copyBytes = (int16_t)(bw >> 1);
shift = (copyBytes == (int16_t)spriteBytesPerRow) ? 0 : 1;
if (sp->routineOffsets[shift][SPRITE_OP_RESTORE] != SPRITE_NOT_COMPILED) {
/* Byte-pointer arithmetic dodges ~MUL4 for 2D-array indexing. */
routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_RESTORE);
routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
if (routeOffset != SPRITE_NOT_COMPILED) {
spriteCompiledRestoreUnder(s, backup);
surfaceMarkDirtyRect(s, backup->x, backup->y,
(int16_t)backup->width, (int16_t)backup->height);
surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
return;
}
}
byteStart = (int16_t)(backup->x >> 1);
copyBytes = (int16_t)(backup->width >> 1);
for (row = 0; row < backup->height; row++) {
dstRow = &s->pixels[(backup->y + row) * SURFACE_BYTES_PER_ROW];
/* Slow / interpreted memcpy fallback. */
{
int16_t row;
int16_t byteStart;
uint8_t *dstRow;
byteStart = (int16_t)(bx >> 1);
copyBytes = (int16_t)(bw >> 1);
for (row = 0; row < (int16_t)bh; row++) {
dstRow = &s->pixels[(by + row) * SURFACE_BYTES_PER_ROW];
memcpy(&dstRow[byteStart],
&backup->bytes[(uint16_t)row * (uint16_t)copyBytes],
(size_t)copyBytes);
}
surfaceMarkDirtyRect(s, backup->x, backup->y,
(int16_t)backup->width, (int16_t)backup->height);
}
surfaceMarkDirtyRect(s, bx, by, (int16_t)bw, (int16_t)bh);
}
void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
/* Only fast-path locals here. Slow-path declarations live inside
* the slow-path block below so ORCA-C with -b doesn't reserve
* stack frame for them on every fast-path call. */
uint16_t widthPx;
uint16_t heightPx;
ArenaSlotT *slot;
uint8_t shift;
if (s == NULL || sp == NULL || backup == NULL) {
return;
}
backup->sprite = sp;
backup->sizeBytes = 0;
slot = sp->slot;
widthPx = (uint16_t)(sp->widthTiles * TILE_PIXELS);
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
// Compiled fast path: fully on surface and the platform emitted
// bytes for SAVE at this shift. The compiled routine assumes a
// full-size, unclipped rectangle, so anything off-edge falls
// through to the interpreted memcpy loop below.
//
// The routineOffsets[shift][SPRITE_OP_SAVE] access is rewritten as
// explicit byte-pointer arithmetic to dodge ORCA-C 2.2.1's ~MUL4
// helper that gets emitted for `uint16_t arr[N][M]` indexing.
if (backup->bytes != NULL && slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) {
uint16_t routeIdx;
uint16_t routeOffset;
shift = (uint8_t)(x & 1);
routeIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
routeOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (routeIdx << 1));
if (routeOffset != SPRITE_NOT_COMPILED) {
spriteCompiledSaveUnder(s, sp, x, y, backup);
return;
}
}
/* Slow / fallback path: clipping + interpreted memcpy. */
{
int16_t dx;
int16_t dy;
int16_t sx;
@ -598,31 +706,12 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
int16_t copyBytes;
int16_t clippedX;
int16_t clippedW;
uint8_t shift;
const uint8_t *srcRow;
if (s == NULL || sp == NULL || backup == NULL) {
return;
}
backup->sprite = sp;
backup->sizeBytes = 0;
dx = x;
dy = y;
w = (int16_t)(sp->widthTiles * TILE_PIXELS);
h = (int16_t)(sp->heightTiles * TILE_PIXELS);
// Compiled fast path: fully on surface and the platform emitted
// bytes for SAVE at this shift. The compiled routine assumes a
// full-size, unclipped rectangle, so anything off-edge falls
// through to the interpreted memcpy loop below.
if (backup->bytes != NULL && sp->slot != NULL && isFullyOnSurface(x, y, (uint16_t)w, (uint16_t)h)) {
shift = (uint8_t)(x & 1);
if (sp->routineOffsets[shift][SPRITE_OP_SAVE] != SPRITE_NOT_COMPILED) {
spriteCompiledSaveUnder(s, sp, x, y, backup);
return;
}
}
w = (int16_t)widthPx;
h = (int16_t)heightPx;
if (!clipRect(&dx, &dy, &sx, &sy, &w, &h)) {
backup->x = 0;
@ -661,4 +750,5 @@ void spriteSaveUnder(const SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, Sprit
&srcRow[byteStart],
(size_t)copyBytes);
}
} /* end slow path */
}

View file

@ -45,6 +45,12 @@ struct SpriteT {
// 12 bytes per sprite. Unused on non-IIgs.
uint8_t cachedDstBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
uint8_t cachedSrcBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
// Cached `copyBytes * heightPx` per shift for spriteCompiledSaveUnder's
// `backup->sizeBytes` field. uint16_t * uint16_t goes through ORCA-C
// 2.2.1's ~CUMUL2 helper (~30-50 cyc); cache hit dodges it. Filled
// lazily on first call (0 sentinel = uncached).
uint16_t cachedSizeBytes[JOEY_SPRITE_SHIFT_COUNT];
};
// Compiled entry points. Implemented alongside spriteCompile in

View file

@ -10,13 +10,6 @@
#include "hal.h"
#include "surfaceInternal.h"
// Hoist into a CORESYS load segment alongside the other small core
// files. Keeps _ROOT thin and stable so it stops reacting to per-file
// source changes -- _ROOT size flux was tripping ORCA-Linker bank
// packing in spriteEmitIigs.c (see feedback_orca_link_segment_count
// cases 2-4).
JOEYLIB_SEGMENT("CORESYS")
#ifdef JOEYLIB_PLATFORM_IIGS
extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord);
#endif
@ -91,6 +84,7 @@ SurfaceT *surfaceCreate(void) {
free(s);
return NULL;
}
paletteInitDefault(s);
return s;
}
@ -207,8 +201,11 @@ void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, in
if (w <= 0 || h <= 0) {
return;
}
minWord = (uint8_t)(x >> 2);
maxWord = (uint8_t)((x + w - 1) >> 2);
/* Clipped x/w are non-negative; cast to uint16_t before `>> 2` so
* ORCA-C lowers to a pair of LSRs instead of the ~SSHIFTRIGHT
* helper signed shifts emit. */
minWord = (uint8_t)((uint16_t)x >> 2);
maxWord = (uint8_t)((uint16_t)(x + w - 1) >> 2);
yEnd = y + h;
#ifdef JOEYLIB_PLATFORM_IIGS
iigsMarkDirtyRowsInner((uint16_t)y, (uint16_t)yEnd,
@ -239,6 +236,7 @@ bool stageAlloc(void) {
}
memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE);
stageDirtyClearAll();
paletteInitDefault(gStage);
return true;
}

View file

@ -62,13 +62,19 @@ void stageDirtyClearAll(void);
// y -> byte offset of row y in a SURFACE_BYTES_PER_ROW-strided buffer.
// On IIgs this expands to a single indexed long-mode read against
// gRowOffsetLut (built once at halInit). On other ports it's the
// straight multiply -- those compilers (gcc, OpenWatcom) optimize the
// constant 160 to a shift+add chain that's already cheap. The point
// is to dodge ORCA-C's __mul16 JSL on every per-row pointer compute.
// gRowOffsetLut (built once at halInit).
//
// The explicit (y << 1) byte-pointer arithmetic dodges ORCA-C 2.2.1's
// `~MUL4` helper that gets emitted for `uint16_t arr[N]` indexing
// (the implicit *sizeof(uint16_t)). With the byte-cast + shift, the
// compiler emits a single ASL + indexed long-mode read.
//
// Other ports get the straight multiply -- gcc / OpenWatcom optimize
// the constant 160 to a shift+add chain.
#ifdef JOEYLIB_PLATFORM_IIGS
extern const uint16_t gRowOffsetLut[200];
#define SURFACE_ROW_OFFSET(_y) ((uint16_t)gRowOffsetLut[(uint16_t)(_y)])
#define SURFACE_ROW_OFFSET(_y) \
(*((const uint16_t *)((const uint8_t *)gRowOffsetLut + ((uint16_t)(_y) << 1))))
#else
#define SURFACE_ROW_OFFSET(_y) ((uint16_t)((uint16_t)(_y) * SURFACE_BYTES_PER_ROW))
#endif
@ -80,4 +86,10 @@ extern const uint16_t gRowOffsetLut[200];
bool stageAlloc(void);
void stageFree(void);
// Fill all 16 of `s`'s palettes with the standard 16-color EGA
// palette. Called by stageAlloc and surfaceCreate so a program that
// draws without first calling paletteSet still gets recognizable
// colors instead of an all-black palette.
void paletteInitDefault(SurfaceT *s);
#endif

View file

@ -20,10 +20,6 @@
// without that the ORCA Linker hits "Expression too complex" on
// the small-binary builds.)
// Hoist tile primitives into the DRAWPRIMS load segment. Asm
// dispatches go through halFast* hooks in src/port/iigs/hal.c so
// only one TU references the asm symbols (avoids the cumulative
// "Expression too complex" link failure).
JOEYLIB_SEGMENT("DRAWPRIMS")
// ----- Prototypes -----

View file

@ -509,6 +509,37 @@ void halWaitVBL(void) {
}
// VPOSR ($DFF004) is a 16-bit register. Its UPPER byte holds the
// long-frame flag (bit 15) and the Agnus/Alice chip ID (bits 14..8),
// which never change while running. The high bits of the vertical
// beam position live in the LOW bits of the word: bit 0 = V8 (ECS/AGA
// add V9/V10 in bits 1..2). V8 going 1 -> 0 marks "vertical wrap" --
// a fresh frame. Edge-detected per call so the caller (UBER, etc.)
// just polls; no IRQ server needed.
#define AMIGA_VPOSR    ((volatile uint16_t *)0xDFF004UL)
#define AMIGA_VPOSR_V8 0x0001u
static uint16_t gFrameCount = 0;
static uint8_t gPrevVbHi = 0;
// Returns the number of frame wraps observed so far (wraps modulo
// 65536). The count only advances when a call sees V8 clear after a
// previous call saw it set, so the caller must poll at least once
// while V8 is set (scanline 256 to end of frame) and once after the
// wrap; frames in which that window is missed are not counted.
uint16_t halFrameCount(void) {
    uint8_t now;

    /* Bit 0 of VPOSR = scanline bit 8. PAL frame is ~313 lines,
     * NTSC ~263 -- both wrap bit 8 once per frame, which is what we
     * want as the "frame edge" signal. (Testing bit 8 of the word
     * instead would sample a constant chip-ID bit and the counter
     * would never move.) */
    now = (uint8_t)(*AMIGA_VPOSR & AMIGA_VPOSR_V8);
    if (gPrevVbHi && !now) {
        gFrameCount++;
    }
    gPrevVbHi = now;
    return gFrameCount;
}
// Nominal frame rate, in Hz, that halFrameCount ticks at on this
// port. Hard-wired to the PAL rate: nothing here switches video
// modes at runtime, and an NTSC build would have to report 60.
uint16_t halFrameHz(void) {
    const uint16_t palRefreshHz = 50u;

    return palRefreshHz;
}
void halShutdown(void) {
if (gScreen != NULL) {
// CloseScreen should free attached UCopList, but be explicit

View file

@ -562,6 +562,21 @@ void halWaitVBL(void) {
}
// Cross-port HAL frame counter. This port already keeps gFrameCount
// up to date from its VBL ISR, so all that is left to do here is
// truncate the running total to the uint16_t the HAL contract uses.
uint16_t halFrameCount(void) {
    uint16_t frames;

    frames = (uint16_t)gFrameCount;
    return frames;
}
// Nominal frame rate in Hz for ops/sec scaling. A PAL ST refreshes
// at 50 Hz, while NTSC and SM124 mono run at roughly 60 and 70; the
// PAL figure is reported unconditionally as a baseline, which is
// close enough for scaling since real throughput remains visible
// through iteration counts.
uint16_t halFrameHz(void) {
    const uint16_t baselineHz = 50u;

    return baselineHz;
}
void halShutdown(void) {
if (!gModeSet) {
return;

View file

@ -150,7 +150,9 @@ static volatile uint8_t gPacketRemaining = 0;
static volatile uint8_t gPacketKind = PKT_KIND_NONE;
static volatile uint8_t gMousePacketByte = 0; // bytes consumed in current packet
static bool gHooked = false;
static volatile bool gIsrState[KEY_COUNT];
// uint8_t (not bool) so element size matches gKeyState's. See
// src/core/inputInternal.h for the full rationale.
static volatile uint8_t gIsrState[KEY_COUNT];
// Mouse delta accumulator. Each ACIA mouse packet adds dx/dy here; the
// poll routine clamps the running absolute position into the surface

View file

@ -275,6 +275,32 @@ void halWaitVBL(void) {
}
// Frame counter via $3DA bit 3 polling; rising edge marks the start
// of vertical retrace. Caller polls fast enough that no edge is
// missed (UBER's hot loop is far below 70 Hz period even on a 386).
static uint16_t gFrameCount = 0;
static uint8_t gPrevInVret = 0;
// Polled frame counter. Samples the vertical-retrace bit of VGA
// input status register 1 and bumps gFrameCount whenever this call
// sees the bit set after the previous call saw it clear. Returns
// the running count; retraces that begin and end between two calls
// are not observed.
uint16_t halFrameCount(void) {
    uint8_t wasInVret = gPrevInVret;

    gPrevInVret = (uint8_t)(inportb(VGA_INPUT_STAT_1) & VGA_VRETRACE_BIT);
    if (gPrevInVret != 0 && wasInVret == 0) {
        /* clear -> set transition: a new retrace just started */
        gFrameCount++;
    }
    return gFrameCount;
}
// Nominal frame rate in Hz for ops/sec scaling. VGA mode 13h on a
// real CRT refreshes at about 70.086 Hz; rounding to 70 keeps the
// scaling error near 0.1%.
uint16_t halFrameHz(void) {
    const uint16_t mode13hRefreshHz = 70u;

    return mode13hRefreshHz;
}
void halShutdown(void) {
__dpmi_regs regs;

View file

@ -152,7 +152,9 @@ static const uint8_t gScanToKey[SCAN_TABLE_SIZE] = {
static _go32_dpmi_seginfo gOldHandler;
static _go32_dpmi_seginfo gNewHandler;
static bool gHooked = false;
static volatile bool gIsrState[KEY_COUNT];
// uint8_t (not bool) so element size matches gKeyState's. See
// src/core/inputInternal.h for the full rationale.
static volatile uint8_t gIsrState[KEY_COUNT];
static bool gMousePresent = false;
static bool gJoystickPresent = false;

View file

@ -29,9 +29,7 @@
// _ROOT in every binary that includes this TU. (See ORCA/C ch. 30
// "segment statement". Reusing the same segment as draw.c / tile.c
// rather than picking a unique name keeps the linker's symbol-
// resolution expressions flat -- per-name extras nest the
// expression and trip the "too complex" threshold on small
// binaries.)
// resolution expressions flat.)
//
// The 34 KB NTP replayer bytes are NOT in this segment -- ORCA/C's
// `segment` statement only relocates functions, not data. They live
@ -99,6 +97,17 @@ static uint32_t gSfxBase = 0;
static bool gNTPReady = false;
static bool gNTPPlaying = false;
// Per-slot config cache. halAudioPlaySfx's biggest cost is the
// per-byte XOR-with-$80 loop over the entire sample (signed -> DOC's
// unsigned format), which on a 4 KB sample is ~120 k cyc / ~43 ms in
// ORCA-C. Most callers play the same SFX repeatedly into the same
// slot. Cache (sample ptr, length, rate) per slot; on cache hit
// (same sample re-triggered) skip the byte copy AND the struct
// rebuild, just re-fire NTPstreamsound.
static const uint8_t *gSfxSlotSample[JOEY_AUDIO_SFX_SLOTS] = { 0 };
static uint32_t gSfxSlotLength[JOEY_AUDIO_SFX_SLOTS] = { 0 };
static uint16_t gSfxSlotRateHz[JOEY_AUDIO_SFX_SLOTS] = { 0 };
// SFX handle layout: stream structure first, sample bytes after.
// Both end up at known 24-bit addresses, side-stepping the small
// memory model's 16-bit pointer issue.
@ -244,6 +253,21 @@ void halAudioShutdown(void) {
if (gNTPPlaying) {
halAudioStopMod();
}
// Silence every SFX slot before disposing the handles. NTP's DOC
// IRQ vector points into the buffer we are about to free; if any
// oscillator finishes its sample after the dispose, the wave-done
// interrupt fires into freed memory and the IIgs reports
// "Unclaimed Sound Interrupt" plus a stuck high-pitched whine
// (whatever sample byte was last loaded into the DOC).
{
uint8_t i;
for (i = 0; i < JOEY_AUDIO_SFX_SLOTS; i++) {
halAudioStopSfx(i);
gSfxSlotSample[i] = (const uint8_t *)0;
gSfxSlotLength[i] = 0;
gSfxSlotRateHz[i] = 0;
}
}
if (gSfxHandle != NULL) {
DisposeHandle(gSfxHandle);
gSfxHandle = NULL;
@ -325,9 +349,19 @@ void halAudioPlaySfx(uint8_t slot, const uint8_t *sample, uint32_t length, uint1
structAddr = slotBase;
sampleAddr = slotBase + SFX_SAMPLE_OFFSET;
// Copy the sample into this slot's fixed-bank region, converting
// signed 8-bit (public API contract) to unsigned 8-bit (DOC RAM
// format) by flipping the sign bit.
// Cache check: same sample, length, and rate as the prior trigger
// for this slot? Then the slot's DOC sample bytes and stream
// struct are already correct -- skip the 4 KB byte-XOR loop and
// the 15-byte struct rebuild, both of which together can run
// ~50 ms per call in ORCA-C.
if (sample == gSfxSlotSample[slot] &&
length == gSfxSlotLength[slot] &&
rateHz == gSfxSlotRateHz[slot]) {
// Cache hit -- jump straight to the NTPstreamsound trigger.
} else {
// Cache miss: copy the sample into this slot's fixed-bank
// region, converting signed 8-bit (public API contract) to
// unsigned 8-bit (DOC RAM format) by flipping the sign bit.
{
unsigned char *dst;
uint32_t i;
@ -356,6 +390,11 @@ void halAudioPlaySfx(uint8_t slot, const uint8_t *sample, uint32_t length, uint1
sfx[13] = SFX_VOLUME;
sfx[14] = SFX_CHANNEL_LEFT;
gSfxSlotSample[slot] = sample;
gSfxSlotLength[slot] = length;
gSfxSlotRateHz[slot] = rateHz;
}
// NTPstreamsound(structPtr in X/Y). Same 24-bit address packing
// pattern as NTPprepare: low 16 in X, bank in Y.
buildCallStub(gNTPBase + 24,

View file

@ -95,6 +95,11 @@ extern void iigsInitRowLut(void);
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
// ORCA-C memcpy's ~30 cyc/byte.
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
// PEI-slam variant of the per-row rect blit. ~3 cyc/byte vs MVN's
// ~9 cyc/byte. Constraints: copyBytes must be even and 2..80
// (caller / dispatcher checks). For sprite-rect presents (typical
// 8 bytes wide x 16 rows) saves ~600 cyc/frame vs the MVN form.
extern void iigsBlitRectStageToShrPEI(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
// Filled circle, scanline-style. fillWord low byte is the doubled
// nibble (e.g., 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
@ -240,22 +245,27 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1
uploadScbAndPaletteIfNeeded(src);
// Pixel copy: byte-aligned runs per scanline. x is always even
// after API-level clipping for 4bpp packed if caller aligned it;
// otherwise we include the byte containing the leftmost pixel.
byteStart = x >> 1;
copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart);
// Pixel copy: byte-aligned runs per scanline. x is always >= 0
// after API-level clipping. Use unsigned shifts to avoid
// ~SSHIFTRIGHT helper for `x >> 1` on signed int16_t.
byteStart = (int16_t)((uint16_t)x >> 1);
copyBytes = (uint16_t)((((uint16_t)x + w + 1u) >> 1) - (uint16_t)byteStart);
if (copyBytes == 0 || h == 0) {
return;
}
// Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
// at $E1:2000 (same offset within their banks). srcOffset is the
// byte offset of the first byte to copy on the first row.
// Pixel copy: prefer the PEI-slam variant when the rect satisfies
// its contract (copyBytes even, 2..80). Sprite-rect presents
// (typical 8 bytes wide) hit this ~3x faster than MVN. Wider or
// odd-byte rects fall back to MVN, which has no width cap.
srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart);
if ((copyBytes & 1) == 0 && copyBytes >= 2 && copyBytes <= 80) {
iigsBlitRectStageToShrPEI(srcOffset, copyBytes, h);
} else {
iigsBlitRectStageToShr(srcOffset, copyBytes, h);
}
}
void halShutdown(void) {
@ -307,3 +317,27 @@ void halWaitVBL(void) {
/* scanning: wait for next VBL */;
}
}
// Frame counter via $C019 polling. Edge-detected on each call: the
// caller (UBER, animation loops) polls fast enough that we never
// miss a VBL transition. No IRQ involvement; safe in the S16 takeover
// context where ToolBox interrupt setup would be intrusive.
static uint16_t gFrameCount = 0;
static uint8_t gPrevInVbl = 0;
// Polled frame counter. Each call samples IIGS_VBL_STATUS and
// records whether VBL_BAR_BIT is clear; gFrameCount advances when
// that condition is true on this call and was false on the previous
// one. Returns the running count. Transitions that start and finish
// between two calls are not observed.
uint16_t halFrameCount(void) {
    uint8_t wasInVbl;

    wasInVbl   = gPrevInVbl;
    gPrevInVbl = ((*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0) ? 1 : 0;
    if (gPrevInVbl != 0 && wasInVbl == 0) {
        gFrameCount++;
    }
    return gFrameCount;
}
// Nominal frame rate, in Hz, reported for this port: a fixed 60.
uint16_t halFrameHz(void) {
    const uint16_t refreshHz = 60u;

    return refreshHz;
}

View file

@ -14,10 +14,15 @@
// it is enough for feature parity with the other platforms on typical
// "press a key, act on it" flows.
//
// Held-key state is synthesized via a TTL counter: a fresh strobe on
// $C000 refreshes the TTL; each halInputPoll decays it; when TTL hits
// zero we assume the key was released. KEY_TTL is sized to cover the
// typematic initial delay so that a held key does not flicker.
// Release detection uses the IIe-inherited "any key currently down"
// live flag at $C010 bit 7 (set by the keyboard scanner independently
// of the strobe). Each halInputPoll drains pending strobe events to
// pick up presses, then samples $C010: bit 7 == 0 means no
// non-modifier key is physically held, and we wholesale-clear
// gKeyState. readModifierKeys then re-asserts the modifiers from
// $C025's live state, so shift/ctrl/option stay accurate. Avoids
// the inferred-release lag the old TTL-decay scheme had, and works
// on every IIgs (real or stealth) without ToolBox / ADB Tool init.
//
// Mouse: $C024 (delta data) and $C027 (status). Each $C024 read
// returns one signed 7-bit delta; $C027 bit 1 indicates whether the
@ -37,8 +42,6 @@
#include "inputInternal.h"
#include "joey/surface.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Hardware registers -----
@ -63,6 +66,18 @@ JOEYLIB_SEGMENT("CORESYS")
#define KBD_STROBE_BIT 0x80
#define KBD_ASCII_MASK 0x7F
// $C010 RDKBDSTRB: reading clears the keyboard strobe at $C000 and
// returns the live "any key currently held" flag in bit 7 (set by
// the keyboard scanner / ADB MCU independently of the strobe). Used
// to drive immediate release detection without an inferred-release
// TTL counter.
#define KBD_ANY_KEY_DOWN_BIT 0x80
// Cap on the per-poll keyboard-FIFO drain. The IIgs ADB queue is
// small in practice; this is purely a defensive bound so a stuck
// strobe can't spin halInputPoll forever.
#define KBD_DRAIN_GUARD 32u
// $C025 layout (IIgs Hardware Reference): bit 0 = shift, bit 1 = ctrl,
// bit 6 = option (Closed-Apple), bit 7 = command (Open-Apple).
#define MOD_SHIFT 0x01
@ -79,11 +94,6 @@ JOEYLIB_SEGMENT("CORESYS")
#define MOUSE_DELTA_SIGN_BIT 0x40
#define MOUSE_BUTTON_INV 0x80
// Polls a key stays "down" after the last observed strobe. Covers the
// typematic initial delay so a held key does not flicker off/on between
// repeats.
#define KEY_TTL 45
#define ASCII_TABLE_SIZE 128
// Apple II arrow-key ASCII conventions.
@ -113,11 +123,6 @@ static int8_t thresholdPaddle(uint8_t v);
// O(1) instead of a 40-plus-case switch.
static uint8_t gAsciiToKey[ASCII_TABLE_SIZE];
// Non-static so iigsInputSnapshot (joeyDraw.asm) can reference it via
// long-mode addressing through the linker. The C TTL-decrement loop
// that used to live in halInputPoll moved to that asm helper.
uint8_t gKeyTtl [KEY_COUNT];
static int16_t gMouseAbsX = SURFACE_WIDTH / 2;
static int16_t gMouseAbsY = SURFACE_HEIGHT / 2;
@ -246,14 +251,18 @@ static bool gJoyDisconnectLatched = false;
// to the digital threshold mapping. gJoyRecalibrate is set by
// halJoystickReset and cleared on the next successful poll, which
// captures the new center.
// uint8_t (not bool) so the per-element stride is 1 byte. ORCA-C's
// _Bool is 2 bytes, which forces a ~MUL4 helper for every index
// multiply -- even when the index is a constant the compiler doesn't
// fold. Storage is still 0 or 1 either way.
static uint8_t gJoyCenterX [JOYSTICK_COUNT];
static uint8_t gJoyCenterY [JOYSTICK_COUNT];
static bool gJoyCenterValid [JOYSTICK_COUNT];
static bool gJoyRecalibrate [JOYSTICK_COUNT];
static uint8_t gJoyCenterValid [JOYSTICK_COUNT];
static uint8_t gJoyRecalibrate [JOYSTICK_COUNT];
void halJoystickReset(JoeyJoystickE js) {
if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) {
if ((uint16_t)js >= (uint16_t)JOYSTICK_COUNT) {
return;
}
// Re-enable polling and arm a fresh center capture for the next
@ -281,8 +290,14 @@ static void pollJoystick(void) {
bool yResolved;
// Buttons are I/O reads -- always cheap, do them every frame.
gJoyButtonState[JOYSTICK_0][JOY_BUTTON_0] = (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0;
gJoyButtonState[JOYSTICK_0][JOY_BUTTON_1] = (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0;
// ORCA-C 2.2.1 doesn't constant-fold the row-stride multiply for
// 2D arrays even when both indices are constants, so each
// gJoyButtonState[i][j] write emits a ~MUL4 helper. Indexing
// through a (uint8_t *) cast collapses to a literal byte offset.
((uint8_t *)gJoyButtonState)[JOYSTICK_0 * JOY_BUTTON_COUNT + JOY_BUTTON_0]
= (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0;
((uint8_t *)gJoyButtonState)[JOYSTICK_0 * JOY_BUTTON_COUNT + JOY_BUTTON_1]
= (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0;
gJoyConnected[JOYSTICK_1] = false;
// Once the stick has been latched as disconnected, only buttons
@ -394,7 +409,6 @@ static void pollMouse(void) {
void halInputInit(void) {
memset(gKeyState, 0, sizeof(gKeyState));
memset(gKeyPrev, 0, sizeof(gKeyPrev));
memset(gKeyTtl, 0, sizeof(gKeyTtl));
buildAsciiTable();
gMouseAbsX = SURFACE_WIDTH / 2;
@ -411,23 +425,50 @@ void halInputPoll(void) {
uint8_t kbd;
uint8_t ascii;
uint8_t key;
uint8_t kbdStrb;
uint16_t drainGuard;
bool strobeObserved;
// The KEY_COUNT TTL-decrement loop and the gKeyState/gKeyPrev/
// gMouseButtonPrev/gJoyButtonPrev snapshots all happen earlier in
// joeyInputPoll's call to iigsInputSnapshot (asm). We just read
// the live hardware state here.
// The gKeyState/gKeyPrev/gMouseButtonPrev/gJoyButtonPrev snapshots
// all happen earlier in joeyInputPoll's call to iigsInputSnapshot
// (asm). We just read the live hardware state here.
// Drain the keyboard FIFO, not just the head. The IIgs ADB MCU
// queues press + autorepeat events; consuming only one per poll
// would leave queued events waiting to refresh state on later
// polls. KBD_DRAIN_GUARD bounds the loop in case a stuck strobe
// ever fails to clear.
strobeObserved = false;
for (drainGuard = 0; drainGuard < KBD_DRAIN_GUARD; drainGuard++) {
kbd = *IIGS_KBD;
if (kbd & KBD_STROBE_BIT) {
if ((kbd & KBD_STROBE_BIT) == 0) {
break;
}
strobeObserved = true;
ascii = (uint8_t)(kbd & KBD_ASCII_MASK);
key = gAsciiToKey[ascii];
if (key != KEY_NONE) {
gKeyState[key] = true;
gKeyTtl[key] = KEY_TTL;
}
(void)*IIGS_KBDSTRB;
}
// $C010 bit 7 is the live "any non-modifier key currently held"
// flag (IIe-inherited; updated by the keyboard scanner / ADB MCU
// independently of the strobe). When 0 we know all non-modifier
// keys are physically released, so wholesale-clear gKeyState and
// let readModifierKeys re-assert the modifiers from $C025 below.
//
// strobeObserved guard: a press that arrived AND was released
// between two polls would otherwise be set-then-cleared in a
// single poll, losing the rising edge that joeyKeyPressed needs.
// Holding the press for one poll preserves it; the next poll's
// bit-7 read will clear normally.
kbdStrb = *IIGS_KBDSTRB;
if (!strobeObserved && (kbdStrb & KBD_ANY_KEY_DOWN_BIT) == 0) {
memset(gKeyState, 0, sizeof(gKeyState));
}
readModifierKeys();
pollMouse();
pollJoystick();

View file

@ -2740,6 +2740,221 @@ brsBytesM1 data DRAWPRIMS
end
****************************************************************
* iigsBlitRectStageToShrPEI(srcOffset, copyBytes, rowsLeft)
*
* PEI-slam variant of iigsBlitRectStageToShr for partial-rect
* presents. Uses the SHR shadow trick + AUXWRITE/RAMRD stack hijack
* to push pixel words from $01:row to $E1:row at ~3 cyc/byte instead
* of MVN's ~9 cyc/byte (against $E1 wait states). For a 16x16 sprite
* present (16 rows x 8 bytes) that's ~640 cyc vs MVN's ~1300 cyc.
*
* How it works: for each row, D is pointed at the row's first byte
* and SP at the row's last byte, then a run of PEI instructions
* re-pushes the row onto itself. The data does not change in place;
* the point is the write cycle, which the shadow hardware mirrors.
*
* Caller contract:
* - copyBytes must be even and >= 2 and <= 80. Caller (C wrapper)
* verifies; this asm assumes the contract holds. (80 is the
* ceiling because the unrolled sequence holds 40 PEIs.)
* - srcOffset is the byte offset within bank $01 of the FIRST byte
* of the FIRST row to copy. Rows advance by 160.
* - rowsLeft == 0 is handled: the row loop exits before any push.
* - Teardown switches AUXWRITE/RAMRD off unconditionally rather than
* restoring a saved state, so the caller is assumed to enter with
* both off.
* - Not reentrant: per-call state lives in the fixed brp* data
* segments and the routine patches the operand of its own JMP.
* - Returns via RTL without adjusting SP for the three arguments;
* the saved P, B and D are restored.
*
* SEI window for the duration: copyBytes/2 PEIs * rowsLeft + setup
* per row. For a 16x16 sprite that's ~700 cyc = ~0.25 ms; safe for
* DOC IRQ. For larger rects the C wrapper falls back to MVN to keep
* the SEI window tiny.
*
* Args after PHP+PHB+PHD (TCD = SP+8):
* srcOffset at D+0..1
* copyBytes at D+2..3
* rowsLeft at D+4..5
****************************************************************
iigsBlitRectStageToShrPEI start RECTPEI
brpOff equ 0
brpBytes equ 2
brpRows equ 4
php
phb
phd
rep #$30
LONGA ON
LONGI ON
* Point D at the first argument: 4 bytes just pushed (P, B, D) plus
* the 3-byte RTL return address, plus 1 because SP names the next
* free byte.
tsc
clc
adc #8
tcd
* Save SP and shadow state for teardown.
tsc
sta >brpOrigSp
sep #$20
LONGA OFF
lda >$00C035
sta >brpOrigShadow
rep #$20
LONGA ON
* Stash inputs into long-mode globals so they survive TCD changes.
* Per-row code does TCD = rowBase, which means D-relative reads no
* longer reach the original args at D+0..5. Everything we still need
* per-row goes into a long-mode global below.
lda brpOff
sta >brpRowBase
lda brpRows
sta >brpRowsRem
lda brpBytes
dec a
sta >brpBytesM1Saved ; copyBytes - 1, for TCS = base + bytes - 1
* Compute jump entry into the unrolled PEI sequence.
* words = copyBytes / 2
* entry = peiSeqEnd - words * 2
* Each PEI dp instruction is 2 bytes long and the sequence ends at
* peiSeqEnd with PEI $00 as the LAST entry. DP offsets descend, so
* jumping `words*2` bytes BEFORE the end starts at PEI $(2*(words-1))
* and exactly `words` PEIs execute before falling out at peiSeqEnd.
lda brpBytes
lsr a ; A = words
asl a ; A = words * 2 (bytes of PEI to execute)
sta >brpJmpDelta
lda #peiSeqEnd
sec
sbc >brpJmpDelta
* Self-modifying code: A is 16 bits wide here, so this store rewrites
* BOTH bytes of the JMP's absolute operand, not just the low byte.
sta >brpJmpTarget+1 ; patch JMP abs operand
* From here until PLP at the end, interrupts are masked: SP and D are
* about to point into pixel data, so nothing may push on the stack.
sei
sep #$20
LONGA OFF
lda >brpOrigShadow
and #$F1 ; SHR shadow ON (clear bits 1,2,3)
sta >$00C035
* The value stored to the two switches below is irrelevant as far as
* this routine is concerned; A just happens to hold 0.
lda #0
sta >$00C005 ; AUXWRITE on
sta >$00C003 ; RAMRD on
rep #$20
LONGA ON
brpRowLoop anop
* Loop test at the top so rowsLeft == 0 goes straight to teardown.
* BRL is used because brpExit lies beyond the unrolled PEI run.
lda >brpRowsRem
bne brpDoRow
brl brpExit
brpDoRow anop
* Per-row: set DP = row base (so PEI dp pulls from the source row),
* set SP = row base + copyBytes - 1 (so PEIs decrement-push into the
* row in-place; bytes mirror to $E1 via SHR shadow).
* NB: brpBytes is at original D+2 -- after TCD = rowBase that read
* would land in pixel data. Use the long-mode brpBytesM1Saved instead.
lda >brpRowBase
clc
adc >brpBytesM1Saved
tcs ; SP = row base + copyBytes - 1
lda >brpRowBase
tcd ; D = row base
* Jump into the unrolled PEI sequence at the right offset. The full
* 16-bit operand was patched once during setup (see the store to
* brpJmpTarget+1); it is the same for every row of this call. The
* assembled default target, peiSeqEnd, would execute zero PEIs.
brpJmpTarget anop
jmp peiSeqEnd ; operand (both bytes) is patched per call
* ----- Unrolled PEI sequence: 40 PEIs, walking DP offsets DOWN from
* $4E to $00 in 2-byte steps. JMP target lands at the right offset
* so only `words` PEIs execute. Each PEI pushes 2 bytes to SP (which
* mirrors to $E1 via shadow) and takes 6 cyc -- 7 cyc when the low
* byte of D is nonzero, per the 65816 direct-page timing rule, which
* applies whenever the row base is not page-aligned. Falls through
* to row advance after PEI $00.
pei $4E
pei $4C
pei $4A
pei $48
pei $46
pei $44
pei $42
pei $40
pei $3E
pei $3C
pei $3A
pei $38
pei $36
pei $34
pei $32
pei $30
pei $2E
pei $2C
pei $2A
pei $28
pei $26
pei $24
pei $22
pei $20
pei $1E
pei $1C
pei $1A
pei $18
pei $16
pei $14
pei $12
pei $10
pei $0E
pei $0C
pei $0A
pei $08
pei $06
pei $04
pei $02
pei $00
peiSeqEnd anop
* Advance row base by 160 and decrement rows-remaining.
lda >brpRowBase
clc
adc #160
sta >brpRowBase
lda >brpRowsRem
dec a
sta >brpRowsRem
brl brpRowLoop
brpExit anop
* Teardown: put the real stack back first, then undo the shadow and
* soft-switch changes. Interrupts stay masked until PLP.
lda >brpOrigSp
tcs
sep #$20
LONGA OFF
lda >brpOrigShadow
sta >$00C035
lda #0
sta >$00C004 ; AUXWRITE off
sta >$00C002 ; RAMRD off
rep #$20
LONGA ON
* The two OFF directives below only change assembler state; none of
* the remaining instructions takes an immediate operand.
* NOTE(review): at run time A and X/Y are still 16-bit here until PLP
* reloads the caller's P -- presumably the directives exist to reset
* the assembler's width tracking for whatever follows; confirm.
LONGA OFF
LONGI OFF
pld
plb
plp ; restores I (pre-SEI value)
rtl
end
* ----- Per-call scratch storage for iigsBlitRectStageToShrPEI.
* Each variable is its own data segment so the code can reach it by
* long address (>name) while D and SP are pointed at pixel rows.
* Being fixed storage, these make the routine non-reentrant.
*
* brpOrigSp: stack pointer captured after PHP/PHB/PHD at entry;
* reloaded at brpExit before the saved registers are pulled.
brpOrigSp data RECTPEI
ds 2
end
* brpOrigShadow: byte read from $00C035 at entry. A masked copy is
* written for the duration of the blit; this original is written
* back at brpExit.
brpOrigShadow data RECTPEI
ds 1
end
* brpRowBase: offset of the first byte of the row currently being
* pushed. Starts as srcOffset and is advanced by 160 after each row.
brpRowBase data RECTPEI
ds 2
end
* brpRowsRem: rows still to push. Starts as rowsLeft, decremented
* after each row; the loop ends when it reaches zero.
brpRowsRem data RECTPEI
ds 2
end
* brpJmpDelta: setup-only scratch holding (copyBytes/2)*2, the byte
* length of the PEI run to execute. It is subtracted from the address
* of peiSeqEnd to form the patched JMP target.
brpJmpDelta data RECTPEI
ds 2
end
* brpBytesM1Saved: copyBytes - 1, added to brpRowBase each row to
* place SP on the last byte of the row.
brpBytesM1Saved data RECTPEI
ds 2
end
****************************************************************
* iigsMarkDirtyRowsInner(yStart, yEnd, minWord, maxWord)
*
@ -2969,16 +3184,18 @@ gJoyOrigSpeed data DRAWPRIMS
* iigsInputSnapshot(void)
*
* Per-frame input bookkeeping done in one tight asm pass instead of
* the three C memcpys + C TTL loop that joeyInputPoll used to do.
* Saves ~0.6 ms per frame in animated demos.
* three C memcpys. Saves ~0.5 ms per frame in animated demos.
*
* Three combined operations:
* 1. Decrement gKeyTtl[i] for every key; on transition to zero,
* clear gKeyState[i] (key is now "released").
* 2. Snapshot gKeyState -> gKeyPrev (KEY_COUNT bytes via long-mode
* Two combined operations:
* 1. Snapshot gKeyState -> gKeyPrev (KEY_COUNT bytes via long-mode
* lda/sta loop, ~15 cyc/byte).
* 3. Snapshot gMouseButtonState/gJoyButtonState (4 bytes each)
* via 4 inline lda/sta pairs.
* 2. Snapshot gMouseButtonState/gJoyButtonState (4 bytes each) via
* 4 inline lda/sta pairs.
*
* The TTL-decay loop this used to run has been removed: the IIgs
* port now derives release directly from $C010 bit 7 in halInputPoll
* (the live "any key currently held" flag), so the inferred-release
* TTL mechanism is no longer needed.
*
* IMPORTANT: KEY_COUNT is hard-coded at 60 below. If you add or
* remove a key in joey/input.h, bump the constant or the loop bounds
@ -2994,19 +3211,6 @@ iigsInputSnapshot start IIGSASM
sep #$20
LONGA OFF
* TTL decrement + key-released detection. ~12 cyc / iter fast path.
ldx #59 ; KEY_COUNT - 1
isnTtlLoop anop
lda >gKeyTtl,x
beq isnTtlNext ; ttl==0, nothing to do
dec a
sta >gKeyTtl,x
bne isnTtlNext ; not yet zero
sta >gKeyState,x ; A==0 -> mark released
isnTtlNext anop
dex
bpl isnTtlLoop
* Snapshot gKeyState -> gKeyPrev (60 bytes), long-mode loop.
ldx #59
isnKeyLoop anop