From 04a95504212556a39e368eda3ab1036c75ddf6b8 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Thu, 30 Apr 2026 17:04:08 -0500 Subject: [PATCH] Mass ASM optimization on IIgs. --- examples/sprite/sprite.c | 7 +- include/joey/input.h | 18 + src/codegen/spriteCompile.c | 187 +++-- src/codegen/spriteEmitIigs.c | 10 + src/core/hal.h | 149 ++++ src/core/input.c | 28 + src/core/inputInternal.h | 8 + src/core/palette.c | 3 + src/core/scb.c | 15 +- src/core/sprite.c | 6 + src/core/spriteInternal.h | 10 + src/core/surface.c | 27 +- src/core/surfaceInternal.h | 7 + src/port/amiga/input.c | 6 + src/port/atarist/input.c | 6 + src/port/dos/input.c | 6 + src/port/iigs/hal.c | 343 ++------ src/port/iigs/input.c | 201 +++-- src/port/iigs/joeyDraw.asm | 1472 ++++++++++++++++++---------------- src/port/iigs/peislam.asm | 77 +- 20 files changed, 1432 insertions(+), 1154 deletions(-) diff --git a/examples/sprite/sprite.c b/examples/sprite/sprite.c index 8006afd..0abcbad 100644 --- a/examples/sprite/sprite.c +++ b/examples/sprite/sprite.c @@ -204,7 +204,12 @@ int main(void) { ? (oldY + oldH) : (backup.y + backup.height)); - joeyWaitVBL(); + // VBL wait removed -- the demo runs at the native compute speed + // of save+restore+draw+presentRect so we can SEE the sprite + // pipeline's actual throughput. Expect tearing on the ball + // since the present can land mid-scan; that's the cost of + // showing real frame rate. Add joeyWaitVBL() back here for + // tear-free 60 Hz motion. 
stagePresentRect(unionX, unionY, (uint16_t)(unionRight - unionX), (uint16_t)(unionBottom - unionY)); diff --git a/include/joey/input.h b/include/joey/input.h index c9414ba..ea003b2 100644 --- a/include/joey/input.h +++ b/include/joey/input.h @@ -112,4 +112,22 @@ bool joeyJoyDown(JoeyJoystickE js, JoeyJoyButtonE button); bool joeyJoyPressed(JoeyJoystickE js, JoeyJoyButtonE button); bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button); +// Re-enable joystick polling and recalibrate the resting (center) +// position. The IIgs port auto-disables polling after a short window +// of detecting no stick (saves ~3 ms/frame of busy-wait). It does NOT +// auto-re-probe -- the application must call this function to resume +// polling after plugging a stick in. +// +// The next poll after this call captures the stick's CURRENT raw +// position as the new center -- so the user must hold the stick +// centered when calling. Subsequent polls report position relative +// to that center; raw readings within `deadZone` units of the center +// clamp to 0 (use 0 to disable the dead zone). +// +// On platforms with truly digital sticks (Amiga / ST / DOS) the +// recalibration is a no-op -- those ports already report -1 / 0 / +1 +// directly -- and `deadZone` is ignored. The function still clears +// any auto-disconnect state so polling resumes. +void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone); + #endif diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index ce7b196..b8ebf98 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -18,6 +18,7 @@ #include "spriteInternal.h" #include "surfaceInternal.h" + // Largest scratch buffer needed for any single emit call. 16 KB // covers a 32x32 sprite even on 68k (the biggest mixed-RMW byte- // emit at 16 bytes/byte * (16*17 dest bytes per shift) ~= 4.5 KB, @@ -157,6 +158,11 @@ bool spriteCompile(SpriteT *sp) { #if defined(JOEYLIB_PLATFORM_IIGS) +// y*160 lookup. 
gRowOffsetLut is the 200-entry uint16_t table built +// once by iigsInitRowLut at halInit. Replaces ORCA-C's runtime +// multiply (a JSL into __mul16) with a single indexed long-mode read. +extern const uint16_t gRowOffsetLut[200]; + // IIgs uses inline asm + a self-modifying call stub instead of a C // function-pointer cast. The build uses ORCA-C large memory model // (-b for sprite demos) so pointers are 24-bit and JSL works @@ -182,7 +188,18 @@ bool spriteCompile(SpriteT *sp) { // Patched per call: byte 2 (destBank), bytes 6-7 (destOffset16), // bytes 9-11 (target 24-bit). The compiled routine assumes // M=8 / X=16 / Y=destOffset on entry; the stub arranges that. +// +// Stub bytes are split into two phases: +// 1. The 8 opcode bytes are written ONCE on first call (gDrawStubInited). +// 2. Of the 6 operand bytes, only those that actually changed since +// the previous call get re-stamped: destBank and fnAddr are cached +// and rarely change (per-shift / per-bank). destOffset is the only +// one that changes every call as the sprite moves. Net per-frame +// patching for the typical case drops from 14 stores to 2. 
static unsigned char gSpriteCallStub[14]; +static bool gDrawStubInited = false; +static uint8_t gDrawStubLastBank = 0xFF; +static uint32_t gDrawStubLastFnAddr = 0xFFFFFFFFul; void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { uint8_t shift; @@ -195,7 +212,7 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) uint8_t *destPtr; uint8_t destBytes[4]; shift = (uint8_t)(x & 1); - destPtr = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; + destPtr = &dst->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)x >> 1)]; memcpy(destBytes, &destPtr, 4); destAddr = (uint32_t)destBytes[0] | ((uint32_t)destBytes[1] << 8) @@ -208,20 +225,35 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) } (void)destAddr; - gSpriteCallStub[ 0] = 0x8B; - gSpriteCallStub[ 1] = 0xA9; - gSpriteCallStub[ 2] = destBank; - gSpriteCallStub[ 3] = 0x48; - gSpriteCallStub[ 4] = 0xAB; - gSpriteCallStub[ 5] = 0xA0; + if (!gDrawStubInited) { + gSpriteCallStub[ 0] = 0x8B; + gSpriteCallStub[ 1] = 0xA9; + gSpriteCallStub[ 3] = 0x48; + gSpriteCallStub[ 4] = 0xAB; + gSpriteCallStub[ 5] = 0xA0; + gSpriteCallStub[ 8] = 0x22; + gSpriteCallStub[12] = 0xAB; + gSpriteCallStub[13] = 0x6B; + gDrawStubInited = true; + } + + // destOffset always changes (sprite moves every frame). gSpriteCallStub[ 6] = (unsigned char)(destOffset & 0xFFu); gSpriteCallStub[ 7] = (unsigned char)((destOffset >> 8) & 0xFFu); - gSpriteCallStub[ 8] = 0x22; - gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu); - gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu); - gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu); - gSpriteCallStub[12] = 0xAB; - gSpriteCallStub[13] = 0x6B; + + // destBank only changes if the dst surface migrates banks (~never). 
+ if (destBank != gDrawStubLastBank) { + gSpriteCallStub[ 2] = destBank; + gDrawStubLastBank = destBank; + } + + // fnAddr changes only on shift parity flips or sprite swaps. + if (fnAddr != gDrawStubLastFnAddr) { + gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu); + gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu); + gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu); + gDrawStubLastFnAddr = fnAddr; + } // ORCA-C compiles this function under `longa on / longi on` // (M=16, X=16) and emits the function epilogue assuming those @@ -259,7 +291,26 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) // // For SAVE: X = screen lo, Y = backup lo // For RESTORE: X = backup lo, Y = screen lo -static unsigned char gSpriteCopyStub[13]; +// +// Two distinct stubs (one per op) instead of a shared one. Save and +// restore alternate every frame and they swap the X/Y meanings, so a +// shared stub forced a full re-stamp on every call. Per-op stubs let +// us cache: only the bytes that genuinely change frame-to-frame +// (typically just one of screenLo/backupLo as the sprite moves) get +// rewritten. Cuts per-call patching from 13 stores to 2 in the typical +// case (static backup buffer, stable shift parity). 
+static unsigned char gSpriteSaveStub[13]; +static unsigned char gSpriteRestoreStub[13]; + +static bool gSaveStubInited = false; +static uint16_t gSaveStubLastXLo = 0xFFFFu; +static uint16_t gSaveStubLastYLo = 0xFFFFu; +static uint32_t gSaveStubLastFnAddr = 0xFFFFFFFFul; + +static bool gRestoreStubInited = false; +static uint16_t gRestoreStubLastXLo = 0xFFFFu; +static uint16_t gRestoreStubLastYLo = 0xFFFFu; +static uint32_t gRestoreStubLastFnAddr= 0xFFFFFFFFul; // patchMvnBanks stamps the destination and source bank operand bytes @@ -315,7 +366,7 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_ heightPx = (uint16_t)(sp->heightTiles * 8); copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0)); - screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)]; + screenPtr = (uint8_t *)&src->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)clippedX >> 1)]; splitPointer(screenPtr, &screenLo, &screenBank); splitPointer(backup->bytes, &backupLo, &backupBank); @@ -331,28 +382,49 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_ + (uint32_t)sp->routineOffsets[shift][SPRITE_OP_SAVE]; // Stub: X = screen (source), Y = backup (destination). 
- gSpriteCopyStub[ 0] = 0x8B; - gSpriteCopyStub[ 1] = 0xA2; - gSpriteCopyStub[ 2] = (unsigned char)(screenLo & 0xFFu); - gSpriteCopyStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu); - gSpriteCopyStub[ 4] = 0xA0; - gSpriteCopyStub[ 5] = (unsigned char)(backupLo & 0xFFu); - gSpriteCopyStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu); - gSpriteCopyStub[ 7] = 0x22; - gSpriteCopyStub[ 8] = (unsigned char)(fnAddr & 0xFFu); - gSpriteCopyStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu); - gSpriteCopyStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu); - gSpriteCopyStub[11] = 0xAB; - gSpriteCopyStub[12] = 0x6B; + if (!gSaveStubInited) { + gSpriteSaveStub[ 0] = 0x8B; + gSpriteSaveStub[ 1] = 0xA2; + gSpriteSaveStub[ 4] = 0xA0; + gSpriteSaveStub[ 7] = 0x22; + gSpriteSaveStub[11] = 0xAB; + gSpriteSaveStub[12] = 0x6B; + gSaveStubInited = true; + } + if (screenLo != gSaveStubLastXLo) { + gSpriteSaveStub[ 2] = (unsigned char)(screenLo & 0xFFu); + gSpriteSaveStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu); + gSaveStubLastXLo = screenLo; + } + if (backupLo != gSaveStubLastYLo) { + gSpriteSaveStub[ 5] = (unsigned char)(backupLo & 0xFFu); + gSpriteSaveStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu); + gSaveStubLastYLo = backupLo; + } + if (fnAddr != gSaveStubLastFnAddr) { + gSpriteSaveStub[ 8] = (unsigned char)(fnAddr & 0xFFu); + gSpriteSaveStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu); + gSpriteSaveStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu); + gSaveStubLastFnAddr = fnAddr; + } - routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]; - patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank); + // Skip the 16+ MVN-bank rewrites if the dst/src bank pair is the + // same as last call. Screen and backup buffer banks are stable + // for essentially every frame past the first, so this short- + // circuits ~5000 cyc/frame on the ball demo. 
+ if (sp->cachedDstBank[shift][SPRITE_OP_SAVE] != backupBank || + sp->cachedSrcBank[shift][SPRITE_OP_SAVE] != screenBank) { + routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]; + patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank); + sp->cachedDstBank[shift][SPRITE_OP_SAVE] = backupBank; + sp->cachedSrcBank[shift][SPRITE_OP_SAVE] = screenBank; + } // MVN-based routine: needs M=16 / X=16; restore M=16 on exit // matches ORCA-C `longa on` epilogue expectations. asm { rep #0x30 - jsl gSpriteCopyStub + jsl gSpriteSaveStub rep #0x30 } } @@ -378,7 +450,7 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4); shift = (copyBytes == spriteBytesPerRow) ? 0 : 1; - screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)]; + screenPtr = (uint8_t *)&dst->pixels[gRowOffsetLut[(uint16_t)backup->y] + ((uint16_t)backup->x >> 1)]; splitPointer(screenPtr, &screenLo, &screenBank); splitPointer(backup->bytes, &backupLo, &backupBank); @@ -387,26 +459,45 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { + (uint32_t)sp->routineOffsets[shift][SPRITE_OP_RESTORE]; // Stub: X = backup (source), Y = screen (destination). 
- gSpriteCopyStub[ 0] = 0x8B; - gSpriteCopyStub[ 1] = 0xA2; - gSpriteCopyStub[ 2] = (unsigned char)(backupLo & 0xFFu); - gSpriteCopyStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu); - gSpriteCopyStub[ 4] = 0xA0; - gSpriteCopyStub[ 5] = (unsigned char)(screenLo & 0xFFu); - gSpriteCopyStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu); - gSpriteCopyStub[ 7] = 0x22; - gSpriteCopyStub[ 8] = (unsigned char)(fnAddr & 0xFFu); - gSpriteCopyStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu); - gSpriteCopyStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu); - gSpriteCopyStub[11] = 0xAB; - gSpriteCopyStub[12] = 0x6B; + if (!gRestoreStubInited) { + gSpriteRestoreStub[ 0] = 0x8B; + gSpriteRestoreStub[ 1] = 0xA2; + gSpriteRestoreStub[ 4] = 0xA0; + gSpriteRestoreStub[ 7] = 0x22; + gSpriteRestoreStub[11] = 0xAB; + gSpriteRestoreStub[12] = 0x6B; + gRestoreStubInited = true; + } + if (backupLo != gRestoreStubLastXLo) { + gSpriteRestoreStub[ 2] = (unsigned char)(backupLo & 0xFFu); + gSpriteRestoreStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu); + gRestoreStubLastXLo = backupLo; + } + if (screenLo != gRestoreStubLastYLo) { + gSpriteRestoreStub[ 5] = (unsigned char)(screenLo & 0xFFu); + gSpriteRestoreStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu); + gRestoreStubLastYLo = screenLo; + } + if (fnAddr != gRestoreStubLastFnAddr) { + gSpriteRestoreStub[ 8] = (unsigned char)(fnAddr & 0xFFu); + gSpriteRestoreStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu); + gSpriteRestoreStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu); + gRestoreStubLastFnAddr = fnAddr; + } - routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]; - patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank); + // Same short-circuit as save: only re-stamp the bank operands if + // they actually changed since last call. 
+ if (sp->cachedDstBank[shift][SPRITE_OP_RESTORE] != screenBank || + sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] != backupBank) { + routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]; + patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank); + sp->cachedDstBank[shift][SPRITE_OP_RESTORE] = screenBank; + sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] = backupBank; + } asm { rep #0x30 - jsl gSpriteCopyStub + jsl gSpriteRestoreStub rep #0x30 } } diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c index 0829ef6..01936d5 100644 --- a/src/codegen/spriteEmitIigs.c +++ b/src/codegen/spriteEmitIigs.c @@ -31,6 +31,16 @@ #include "spriteEmitter.h" #include "spriteInternal.h" +// Pin the IIgs sprite codegen statics into their own load segment +// instead of letting them ride in _ROOT. _ROOT also collects every +// other unsegmented .c (init.c, sprite.c, present.c, the example +// main, ...), so growth in any of those can shift the linker's +// per-bank packing and orphan intra-file static refs (we hit this +// when DRAWPRIMS grew with the chunked PEI-slam: PATTERN's link +// reported "Unresolved reference: emitMvnCopyRoutine" purely from +// _ROOT crowding). A dedicated load segment isolates this file. +JOEYLIB_SEGMENT("SPRITECG") + // ----- Constants ----- diff --git a/src/core/hal.h b/src/core/hal.h index 68d72d2..3a4665d 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -184,4 +184,153 @@ bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent); + +#ifdef JOEYLIB_PLATFORM_IIGS +// ===================================================================== +// IIgs direct-dispatch macros. +// +// The halFast* function declarations above are the cross-platform API. 
+// On IIgs, those wrappers were ~60-80 cyc/call of pure plumbing on top +// of the asm itself: wrapper prologue (PHB/PHD/TCD), redundant arg +// re-push for the inner JSL, then wrapper epilogue. The macros below +// take effect at preprocess time and inline the asm call at the call +// site, eliminating the wrapper layer entirely. +// +// Cross-platform code in src/core/*.c is unchanged -- it still calls +// halFastDrawPixel(...) etc. On IIgs the preprocessor swaps that for +// the macro expansion before ORCA-C compiles the file. The matching +// halFast* C definitions in src/port/iigs/hal.c are deleted, since +// nothing references them once the macros take effect. +// +// Macros use comma-expression form so they evaluate to a `bool` value +// (most halFast* return true on IIgs since the asm always succeeds). +// ===================================================================== + +extern void iigsDrawPixelInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble); +extern void iigsDrawLineInner (uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble); +extern void iigsDrawCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble); +extern void iigsFillCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord); +extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord); +extern void iigsTileFillInner (uint8_t *dstRow0, uint16_t fillWord); +extern void iigsTileCopyInner (uint8_t *dstRow0, const uint8_t *srcRow0); +extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent); +extern void iigsTilePasteInner (uint8_t *dstRow0, const uint8_t *srcTilePixels); +extern void iigsTileSnapInner (uint8_t *dstTilePixels, const uint8_t *srcRow0); +extern void iigsBlitRectInner (uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent); +extern 
void iigsFillRectInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble); +extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp); +extern uint16_t gFloodSeedMatch; +extern uint16_t gFloodLeftX; +extern uint16_t gFloodRightX; + +#undef halFastDrawPixel +#define halFastDrawPixel(_s, _x, _y, _c) \ + (iigsDrawPixelInner((_s)->pixels, (uint16_t)(_x), (uint16_t)(_y), \ + (uint16_t)((_c) & 0x0F)), \ + true) + +#undef halFastDrawLine +#define halFastDrawLine(_s, _x0, _y0, _x1, _y1, _c) \ + (iigsDrawLineInner((_s)->pixels, (uint16_t)(_x0), (uint16_t)(_y0), \ + (uint16_t)(_x1), (uint16_t)(_y1), \ + (uint16_t)((_c) & 0x0F)), \ + true) + +#undef halFastDrawCircle +#define halFastDrawCircle(_s, _cx, _cy, _r, _c) \ + (iigsDrawCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy), \ + (_r), (uint16_t)((_c) & 0x0F)), \ + true) + +// fillWord = doubled byte * $0101 = (nib*$11) * $101 = nib * $1111. +// Compile-time arithmetic when caller passes a constant; at most a +// single multiply when the nibble is variable (still cheaper than +// the wrapper's three sequential ORs / shifts). +#undef halFastFillCircle +#define halFastFillCircle(_s, _cx, _cy, _r, _c) \ + ((_s) == stageGet() \ + ? (iigsFillCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy), \ + (_r), (uint16_t)(((_c) & 0x0F) * 0x1111)), \ + true) \ + : false) + +#undef halFastSurfaceClear +#define halFastSurfaceClear(_s, _d) \ + ((_s) == stageGet() \ + ? (iigsSurfaceClearInner((_s)->pixels, \ + (uint16_t)((uint16_t)(_d) | ((uint16_t)(_d) << 8))), \ + true) \ + : false) + +// halFastFillRect stays as a real C wrapper -- removing it triggered +// an unrelated ORCA linker bank-placement failure (same mode as the +// peislam.asm deletion: `Unresolved reference Label: +// emitMvnCopyRoutine` in sprite codegen). 
The wrapper now just +// forwards to iigsFillRectInner (asm does partial+middle); we lose +// the call-site macro inlining for fillRect specifically but keep +// the rest of the macros AND the new asm helper. Per-call wrapper +// overhead for halFastFillRect is back (~80 cyc) but at least the +// per-row partial-byte logic happens in asm now. + +// Tile primitives operate on caller-computed row pointers; just +// forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte +// offset within the surface. +#undef halFastTileFill +#define halFastTileFill(_s, _bx, _by, _fw) \ + (iigsTileFillInner(&(_s)->pixels[(uint16_t)(_by) * 8 * SURFACE_BYTES_PER_ROW \ + + (uint16_t)(_bx) * 4], \ + (_fw)), \ + true) + +#undef halFastTileCopy +#define halFastTileCopy(_d, _s) (iigsTileCopyInner((_d), (_s)), true) + +#undef halFastTileCopyMasked +#define halFastTileCopyMasked(_d, _s, _t) \ + (iigsTileCopyMaskedInner((_d), (_s), (uint16_t)(_t)), true) + +#undef halFastTilePaste +#define halFastTilePaste(_d, _s) (iigsTilePasteInner((_d), (_s)), true) + +#undef halFastTileSnap +#define halFastTileSnap(_d, _s) (iigsTileSnapInner((_d), (_s)), true) + +#undef halFastBlitRect +#define halFastBlitRect(_dr, _dx, _sr, _sx, _w, _h, _ss, _t) \ + (iigsBlitRectInner((_dr), (uint16_t)(_dx), (_sr), (uint16_t)(_sx), \ + (uint16_t)(_w), (uint16_t)(_h), \ + (uint16_t)(_ss), (_t)), \ + true) + +// Tier 2/3 flood fallbacks always returned false on IIgs (the asm +// impls were deleted as unreachable). Macros to constant false so +// ORCA-C dead-code-eliminates the never-taken fallback branches in +// floodFillInternal. +#undef halFastFloodWalk +#define halFastFloodWalk(_row, _sx, _mc, _nc, _me, _sm, _lx, _rx) (false) + +#undef halFastFloodScanRow +#define halFastFloodScanRow(_row, _lx, _rx, _mc, _nc, _me, _mb) (false) + +#undef halFastFloodScanAndPush +#define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false) + +// Tier-1 flood: multi-output. 
Asm sets gFloodSeedMatch / gFloodLeftX / +// gFloodRightX; macro reads those into the caller's out-ptrs. +#undef halFastFloodWalkAndScans +#define halFastFloodWalkAndScans(_pix, _x, _y, _mc, _nc, _me, _sx, _sy, _sp, _ms, _smOut, _lxOut, _rxOut) \ + (iigsFloodWalkAndScansInner((_pix), (uint16_t)(_x), (uint16_t)(_y), \ + (uint16_t)((_mc) & 0x0F), \ + (uint16_t)((_nc) & 0x0F), \ + (uint16_t)((_me) ? 1 : 0), \ + (_sx), (_sy), \ + (uint16_t *)(_sp), \ + (uint16_t)(_ms)), \ + *(_smOut) = (gFloodSeedMatch != 0), \ + *(_lxOut) = (int16_t)gFloodLeftX, \ + *(_rxOut) = (int16_t)gFloodRightX, \ + true) + +#endif /* JOEYLIB_PLATFORM_IIGS */ + #endif diff --git a/src/core/input.c b/src/core/input.c index ad7cacf..8347179 100644 --- a/src/core/input.c +++ b/src/core/input.c @@ -28,12 +28,31 @@ int8_t gJoyAxisX [JOYSTICK_COUNT]; int8_t gJoyAxisY [JOYSTICK_COUNT]; bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT]; bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT]; +uint8_t gJoyDeadZone [JOYSTICK_COUNT]; +#ifdef JOEYLIB_PLATFORM_IIGS +extern void iigsInputSnapshot(void); +// Build-time check: iigsInputSnapshot's asm hard-codes KEY_COUNT=60 +// and the small button counts. If a future change adds/removes keys +// or buttons the asm must be updated; this declares a negative-size +// array if the math no longer matches, which is a compile error. +typedef int joey_keycount_check[(KEY_COUNT == 60) ? 1 : -1]; +typedef int joey_mousebtn_check[(MOUSE_BUTTON_COUNT == 4) ? 1 : -1]; +typedef int joey_joybtn_check[(JOYSTICK_COUNT * JOY_BUTTON_COUNT == 4) ? 1 : -1]; +#endif + void joeyInputPoll(void) { +#ifdef JOEYLIB_PLATFORM_IIGS + // One asm pass for: TTL decrement + key snapshot + mouse/joy + // button snapshots. Replaces 3 ORCA-C memcpys + the C TTL loop + // that used to live in halInputPoll. ~0.6 ms saved per frame. 
+ iigsInputSnapshot(); +#else memcpy(gKeyPrev, gKeyState, sizeof(gKeyState)); memcpy(gMouseButtonPrev, gMouseButtonState, sizeof(gMouseButtonState)); memcpy(gJoyButtonPrev, gJoyButtonState, sizeof(gJoyButtonState)); +#endif halInputPoll(); } @@ -170,3 +189,12 @@ bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button) { } return !gJoyButtonState[js][button] && gJoyButtonPrev[js][button]; } + + +void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone) { + if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + return; + } + gJoyDeadZone[js] = deadZone; + halJoystickReset(js); +} diff --git a/src/core/inputInternal.h b/src/core/inputInternal.h index 5b94418..765bcc2 100644 --- a/src/core/inputInternal.h +++ b/src/core/inputInternal.h @@ -26,4 +26,12 @@ extern int8_t gJoyAxisY [JOYSTICK_COUNT]; extern bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT]; extern bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT]; +// Per-stick analog calibration. Set by joeyJoystickReset on platforms +// with analog paddles (IIgs); ignored on digital-stick platforms. +extern uint8_t gJoyDeadZone [JOYSTICK_COUNT]; + +// Per-port hook: called from joeyJoystickReset to clear any auto- +// disconnect state and arm a fresh center capture on the next poll. +void halJoystickReset(JoeyJoystickE js); + #endif diff --git a/src/core/palette.c b/src/core/palette.c index 94f2e2f..812fb94 100644 --- a/src/core/palette.c +++ b/src/core/palette.c @@ -37,4 +37,7 @@ void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) { for (i = 1; i < SURFACE_COLORS_PER_PALETTE; i++) { s->palette[paletteIndex][i] = colors16[i] & 0x0FFF; } + if (s == stageGet()) { + gStagePaletteDirty = true; + } } diff --git a/src/core/scb.c b/src/core/scb.c index cba68d9..59d44dd 100644 --- a/src/core/scb.c +++ b/src/core/scb.c @@ -4,6 +4,7 @@ // which of the 16 palettes that scanline uses at display time. 
#include +#include #include "joey/palette.h" #include "surfaceInternal.h" @@ -26,6 +27,9 @@ void scbSet(SurfaceT *s, uint16_t line, uint8_t paletteIndex) { return; } s->scb[line] = paletteIndex; + if (s == stageGet()) { + gStageScbDirty = true; + } } @@ -51,7 +55,14 @@ void scbSetRange(SurfaceT *s, uint16_t firstLine, uint16_t lastLine, uint8_t pal return; } - for (line = firstLine; line <= last; line++) { - s->scb[line] = paletteIndex; + // memset is far cheaper than the per-iter loop on ORCA-C with -b + // (scb is uint8_t, sizeof(uint8_t)==1, so the call form below is + // exact). On IIgs ORCA-C lowers small fixed-size memsets to MVP / + // PEI tricks; on Amiga/ST/DOS it uses libc memset which is + // already vectorized. Either way, much tighter than the C loop. + (void)line; + memset(&s->scb[firstLine], paletteIndex, (size_t)(last - firstLine + 1)); + if (s == stageGet()) { + gStageScbDirty = true; } } diff --git a/src/core/sprite.c b/src/core/sprite.c index d505c90..e2b6da8 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -175,6 +175,8 @@ SpriteT *spriteCreate(const uint8_t *tileData, uint8_t widthTiles, uint8_t heigh sp->ownsTileData = false; sp->slot = NULL; memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets)); + memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank)); + memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank)); sp->flags = flags; return sp; } @@ -242,6 +244,8 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y, sp->ownsTileData = true; sp->slot = NULL; memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets)); + memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank)); + memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank)); sp->flags = flags; return sp; } @@ -385,6 +389,8 @@ SpriteT *spriteFromCompiledMem(const uint8_t *data, uint32_t length, SpriteFlags sp->ownsTileData = true; sp->slot = slot; sp->flags = flags; + memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank)); + 
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank)); return sp; } diff --git a/src/core/spriteInternal.h b/src/core/spriteInternal.h index d9971b7..fd68445 100644 --- a/src/core/spriteInternal.h +++ b/src/core/spriteInternal.h @@ -35,6 +35,16 @@ struct SpriteT { uint16_t routineOffsets[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT]; SpriteFlagsE flags; + + // Per-shift, per-op MVN bank-patch cache for IIgs save/restore. + // patchMvnBanks rewrites 16+ MVN bank operands every call, but the + // banks themselves rarely change frame-to-frame (screen surface + // is fixed; backup buffer is allocated once). After the first + // patch, subsequent calls compare requested banks to the cache + // and skip the re-stamp loop. 0xFF means "never patched yet". + // 12 bytes per sprite. Unused on non-IIgs. + uint8_t cachedDstBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT]; + uint8_t cachedSrcBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT]; }; // Compiled entry points. Implemented alongside spriteCompile in diff --git a/src/core/surface.c b/src/core/surface.c index 0468ff9..9035797 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -10,6 +10,10 @@ #include "hal.h" #include "surfaceInternal.h" +#ifdef JOEYLIB_PLATFORM_IIGS +extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord); +#endif + #define SURFACE_PALETTE_BYTES (SURFACE_PALETTE_ENTRIES * (uint32_t)sizeof(uint16_t)) #define SURFACE_FILE_BYTES (SURFACE_PIXELS_SIZE + SURFACE_HEIGHT + SURFACE_PALETTE_BYTES) @@ -25,8 +29,21 @@ static SurfaceT *gStage = NULL; uint8_t gStageMinWord[SURFACE_HEIGHT]; uint8_t gStageMaxWord[SURFACE_HEIGHT]; +// "Stage SCB / palette has changed since last present-side upload." +// Cheap flag check at present time replaces the 200+512 byte memcmps +// the IIgs port used to run every frame in halPresentRect's +// uploadScbAndPaletteIfNeeded -- ~7 ms / frame saved on demos that +// don't churn palette/SCB (i.e., almost all demos). 
+// +// Initially true so the first present uploads. scbSet*/paletteSet +// re-mark dirty when the stage's data changes; per-port present code +// clears the flag after consuming. +bool gStageScbDirty = true; +bool gStagePaletteDirty = true; + // ----- Internal helpers (alphabetical) ----- +#ifndef JOEYLIB_PLATFORM_IIGS static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) { if (minWord < gStageMinWord[y]) { gStageMinWord[y] = minWord; @@ -35,6 +52,7 @@ static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) { gStageMaxWord[y] = maxWord; } } +#endif // ----- Public API (alphabetical) ----- @@ -169,10 +187,12 @@ void surfaceMarkDirtyAll(const SurfaceT *s) { // the call is a no-op so primitives can call unconditionally without // branching themselves. void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h) { - int16_t row; int16_t yEnd; uint8_t minWord; uint8_t maxWord; +#ifndef JOEYLIB_PLATFORM_IIGS + int16_t row; +#endif if (s != gStage) { return; @@ -183,9 +203,14 @@ void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, in minWord = (uint8_t)(x >> 2); maxWord = (uint8_t)((x + w - 1) >> 2); yEnd = y + h; +#ifdef JOEYLIB_PLATFORM_IIGS + iigsMarkDirtyRowsInner((uint16_t)y, (uint16_t)yEnd, + (uint16_t)minWord, (uint16_t)maxWord); +#else for (row = y; row < yEnd; row++) { widenRow(row, minWord, maxWord); } +#endif } diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h index aa006ee..0f1d9d7 100644 --- a/src/core/surfaceInternal.h +++ b/src/core/surfaceInternal.h @@ -38,6 +38,13 @@ struct SurfaceT { extern uint8_t gStageMinWord[SURFACE_HEIGHT]; extern uint8_t gStageMaxWord[SURFACE_HEIGHT]; +// Stage SCB / palette dirty flags. scbSet* and paletteSet set them +// true when the stage's data is modified; the per-port present code +// checks the flags and clears after upload. Replaces a per-frame +// 712-byte memcmp pair the IIgs port used to run unconditionally. 
+extern bool gStageScbDirty; +extern bool gStagePaletteDirty; + // Drawing primitives call this with their already-clipped destination // rect. If `s` is the stage, the affected rows' [minWord, maxWord] // bands are widened to cover the rect. If `s` is any other surface, diff --git a/src/port/amiga/input.c b/src/port/amiga/input.c index ee28cc3..1bccc81 100644 --- a/src/port/amiga/input.c +++ b/src/port/amiga/input.c @@ -226,6 +226,12 @@ static void pollJoysticks(void) { // ----- HAL API (alphabetical) ----- +void halJoystickReset(JoeyJoystickE js) { + // Amiga sticks are digital -- no calibration to do. + (void)js; +} + + void halInputInit(void) { memset(gKeyState, 0, sizeof(gKeyState)); memset(gKeyPrev, 0, sizeof(gKeyPrev)); diff --git a/src/port/atarist/input.c b/src/port/atarist/input.c index 7411dd5..4c137e5 100644 --- a/src/port/atarist/input.c +++ b/src/port/atarist/input.c @@ -281,6 +281,12 @@ static long restoreIkbdVector(void) { // ----- HAL API (alphabetical) ----- +void halJoystickReset(JoeyJoystickE js) { + // Atari ST sticks are digital -- no calibration to do. + (void)js; +} + + void halInputInit(void) { memset(gKeyState, 0, sizeof(gKeyState)); memset(gKeyPrev, 0, sizeof(gKeyPrev)); diff --git a/src/port/dos/input.c b/src/port/dos/input.c index 201880f..15269ac 100644 --- a/src/port/dos/input.c +++ b/src/port/dos/input.c @@ -305,6 +305,12 @@ static void mousePoll(void) { // ----- HAL API (alphabetical) ----- +void halJoystickReset(JoeyJoystickE js) { + // DOS sticks are digital -- no calibration to do. + (void)js; +} + + void halInputInit(void) { memset(gKeyState, 0, sizeof(gKeyState)); memset(gKeyPrev, 0, sizeof(gKeyPrev)); diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index a198ff0..db4bda1 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -42,10 +42,9 @@ JOEYLIB_SEGMENT("DRAWPRIMS") // 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen. 
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord); -// PEI-slam fill of `bytesPerRow` doubled bytes per row across `rows` -// rows, advancing 160 bytes per row. firstRow must be in bank $01. -// Caller handles partial-nibble edges in C; bytesPerRow is even. -extern void iigsFillRectStageInner(uint8_t *firstRow, uint16_t bytesPerRow, uint16_t rows, uint16_t fillWord); +// Full-fill asm helper (partial leading byte + middle MVN + partial +// trailing byte). Called by halFastFillRect below. +extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble); // 16 STA abs,X stores at fixed offsets along a 160-byte stride. // ~120 cyc per call. extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord); @@ -72,26 +71,15 @@ extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint1 // Replaces ORCA-C's memcpy path which silently fails when called // from halPresent (DBR-state quirk after prior asm primitives). extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr); -// floodFill row walk: tests seed pixel and walks left/right to find -// the matching run. Writes results to gFloodSeedMatch / gFloodLeftX / -// gFloodRightX (DRAWPRIMS globals). -extern void iigsFloodWalkInner(uint8_t *row, uint16_t startX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual); +// floodFill walk results: written by iigsFloodWalkAndScansInner, +// read back by halFastFloodWalkAndScans. extern uint16_t gFloodSeedMatch; extern uint16_t gFloodLeftX; extern uint16_t gFloodRightX; -// Per-pixel match scan over [leftX..rightX] of `row`. Writes 1/0 to -// markBuf[i] for each pixel. matchEqual selects boundary vs equal mode -// (see C srcPixel match logic). -extern void iigsFloodScanRowInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint8_t *markBuf); // Per-pixel rect blit (src->dst). 
transparent == $FFFF means opaque // (always copy); else pixels with src nibble == (transparent & $0F) // are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW). extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent); -// Combined scan + push: matches each pixel, tracks run state, pushes -// (x, scanY) to the (stackX, stackY) arrays at *spInOut on every -// falling edge and at the end of the row if still in a run. *spInOut -// is read on entry and updated with the new top-of-stack on return. -extern void iigsFloodScanAndPushInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint16_t scanY, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp); // Single-call per-popped-seed worker: seed test + walk-left + walk-right // + scan-above + scan-below + push, all sharing cached row addr and // match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX. @@ -101,6 +89,12 @@ extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, // every asm primitive that needs row offset can do `lda >lut,x` instead // of the 7-instruction shift-add. extern void iigsInitRowLut(void); +// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial- +// screen presents (halPresentRect). srcOffset is the byte offset +// within bank $01 of the FIRST byte to copy on the FIRST row; +// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs +// ORCA-C memcpy's ~30 cyc/byte. +extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft); // Filled circle, scanline-style. fillWord low byte is the doubled // nibble (e.g., 0x33 for nibble 3). 
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord); @@ -155,14 +149,9 @@ static uint8_t gPreviousBorder = 0; static uint8_t gPreviousShadow = 0; static bool gModeSet = false; -// Last-uploaded SCB and palette. Both registers live in bank $E1; on a -// 2.8 MHz 65816 the 200+512-byte memcpy across the bank boundary is a -// real cost when it runs every present. Caching here lets the typical -// game loop (which mutates pixels but rarely SCB/palette) skip the -// upload entirely on clean frames. -static uint8_t gCachedScb [SURFACE_HEIGHT]; -static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; -static bool gCacheValid = false; +// SCB / palette upload skipping is now driven by gStageScbDirty / +// gStagePaletteDirty (core/surface.c). The old per-frame memcmp- +// against-cached-copy approach was costing ~7 ms / frame on ORCA-C. // PEI slam scratch. File-scope non-static so the asm can `ext` them; // all accesses inside the slam use long-mode `>` addressing so they @@ -171,30 +160,30 @@ volatile uint16_t gPeiOrigSp; volatile uint8_t gPeiOrigShadow; volatile uint16_t gPeiTempRowBase; volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked) +volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage) -// Defined in src/port/iigs/peislam.asm, in its own load segment -// (DRAWPRIMS) so the GS/OS loader places it in a different bank from -// AUDIO's _ROOT. PEI-slams the full 80 words of stage row `y` into -// the matching $E1 SHR row, ~530 cyc/row vs ~1120 cyc for memcpy/MVN. -extern void peiSlamFullRow(int16_t y); +// peislam.asm's per-row peiSlamFullRow helper is no longer wired in; +// the present pipeline now does its own PEI-slam loop inside +// iigsBlitStageToShr above (with dirty-row skip). -// Upload SCB and palette into bank-$E1 SHR memory only when they have -// changed since the last call. 
paletteOrScbChanged returns false when -// the cache is already in sync, in which case both memcpys to $E1 are -// skipped. +// Upload SCB / palette into bank-$E1 SHR memory only when the +// matching dirty flag is set. Replaces a per-frame 712-byte memcmp +// pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check. +// gStageScbDirty / gStagePaletteDirty live in core/surface.c; they +// start true (forces the very first present to upload), get set true +// again whenever scbSet* / paletteSet mutate the stage's data, and +// get cleared here after upload. static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) { - if (gCacheValid - && memcmp(gCachedScb, src->scb, sizeof(gCachedScb)) == 0 - && memcmp(gCachedPalette, src->palette, sizeof(gCachedPalette)) == 0) { - return; + if (gStageScbDirty) { + memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT); + gStageScbDirty = false; + } + if (gStagePaletteDirty) { + memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette)); + gStagePaletteDirty = false; } - memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT); - memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette)); - memcpy(gCachedScb, src->scb, sizeof(gCachedScb)); - memcpy(gCachedPalette, src->palette, sizeof(gCachedPalette)); - gCacheValid = true; } @@ -241,10 +230,9 @@ void halPresent(const SurfaceT *src) { void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - int16_t py; - int16_t yEnd; uint16_t copyBytes; int16_t byteStart; + uint16_t srcOffset; if (src == NULL) { return; @@ -257,13 +245,16 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1 // otherwise we include the byte containing the leftmost pixel. 
byteStart = x >> 1; copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart); - yEnd = y + (int16_t)h; - for (py = y; py < yEnd; py++) { - memcpy(&IIGS_SHR_PIXELS[py * SURFACE_BYTES_PER_ROW + byteStart], - &src->pixels[py * SURFACE_BYTES_PER_ROW + byteStart], - copyBytes); + if (copyBytes == 0 || h == 0) { + return; } + + // Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display + // at $E1:2000 (same offset within their banks). srcOffset is the + // byte offset of the first byte to copy on the first row. + srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart); + iigsBlitRectStageToShr(srcOffset, copyBytes, h); } @@ -277,249 +268,35 @@ void halShutdown(void) { } -bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { - uint16_t fillWord; - - if (s == NULL) { - return false; - } - if (s != stageGet()) { - return false; - } - fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)); - iigsSurfaceClearInner(s->pixels, fillWord); - return true; -} +// halFastSurfaceClear / halFastDrawLine / halFastDrawCircle / +// halFastFillCircle / halFastTileCopy / halFastTileCopyMasked / +// halFastTilePaste / halFastTileSnap / halFastTileFill / +// halFastBlitRect / halFastFloodWalk[AndScans] / +// halFastFloodScanRow / halFastFloodScanAndPush all dispatch via +// macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block). +// Only halFastFillRect remains a real function below; the reason (a +// linker bank-placement workaround) is given in the next paragraph. +// halFastFillRect: thin wrapper around iigsFillRectInner. The asm +// helper now handles the partial-byte (nibble-edge) logic that used +// to live here, so this function is just a stage-check + forward. +// (It's not macro-dispatched like the others because removing it +// from the C side triggers an unrelated ORCA-linker bank-placement +// failure -- the binary needs enough mass in _ROOT to keep sprite +// codegen's static symbols at addresses the linker can resolve.)
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - int16_t pxStart; - int16_t pxEnd; - int16_t midStart; - int16_t midBytes; - int16_t trailingByte; - int16_t leadingByte; - bool hasLeading; - bool hasTrailing; - int16_t row; - uint8_t *line; - uint16_t fillWord; - uint8_t nibble; - uint8_t doubled; - - if (s == NULL) { + if (s == NULL || s != stageGet()) { return false; } - if (s != stageGet()) { - return false; - } - - pxStart = x; - pxEnd = (int16_t)(x + (int16_t)w); - leadingByte = (int16_t)(pxStart >> 1); - hasLeading = (pxStart & 1) != 0; - if (hasLeading) { - pxStart++; - } - midStart = (int16_t)(pxStart >> 1); - midBytes = (int16_t)((pxEnd - pxStart) >> 1); - hasTrailing = ((pxEnd - pxStart) & 1) != 0; - trailingByte = (int16_t)(midStart + midBytes); - - if (midBytes <= 0) { - return false; - } - - nibble = (uint8_t)(colorIndex & 0x0F); - doubled = (uint8_t)((nibble << 4) | nibble); - - if (hasLeading || hasTrailing) { - for (row = 0; row < (int16_t)h; row++) { - line = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; - if (hasLeading) { - line[leadingByte] = (uint8_t)((line[leadingByte] & 0xF0) | nibble); - } - if (hasTrailing) { - line[trailingByte] = (uint8_t)((line[trailingByte] & 0x0F) | (nibble << 4)); - } - } - } - - fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)); - line = &s->pixels[y * SURFACE_BYTES_PER_ROW + midStart]; - iigsFillRectStageInner(line, (uint16_t)midBytes, h, fillWord); - return true; -} - - -bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { - iigsTileCopyInner(dstRow0, srcRow0); - return true; -} - - -bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { - iigsTileCopyMaskedInner(dstRow0, srcRow0, (uint16_t)transparent); - return true; -} - - -bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { - iigsTilePasteInner(dstRow0, srcTilePixels); - return true; -} - - -bool halFastTileSnap(uint8_t 
*dstTilePixels, const uint8_t *srcRow0) { - iigsTileSnapInner(dstTilePixels, srcRow0); - return true; -} - -bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { - if (s == NULL) { - return false; - } - iigsDrawPixelInner(s->pixels, x, y, (uint16_t)(colorIndex & 0x0F)); - return true; -} - - -bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { - if (s == NULL) { - return false; - } - iigsDrawLineInner(s->pixels, - (uint16_t)x0, (uint16_t)y0, - (uint16_t)x1, (uint16_t)y1, + iigsFillRectInner(s->pixels, + (uint16_t)x, (uint16_t)y, + (uint16_t)w, (uint16_t)h, (uint16_t)(colorIndex & 0x0F)); return true; } -bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { - if (s == NULL) { - return false; - } - iigsDrawCircleInner(s->pixels, - (uint16_t)cx, (uint16_t)cy, r, - (uint16_t)(colorIndex & 0x0F)); - return true; -} - - -bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { - uint16_t fillWord; - uint8_t nibble; - uint8_t doubled; - if (s == NULL) { - return false; - } - if (s != stageGet()) { - return false; - } - nibble = (uint8_t)(colorIndex & 0x0F); - doubled = (uint8_t)((nibble << 4) | nibble); - fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)); - iigsFillCircleInner(s->pixels, (uint16_t)cx, (uint16_t)cy, r, fillWord); - return true; -} - - -bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { - if (row == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) { - return false; - } - iigsFloodWalkInner(row, (uint16_t)startX, - (uint16_t)(matchColor & 0x0F), - (uint16_t)(newColor & 0x0F), - (uint16_t)(matchEqual ? 
1 : 0)); - *seedMatched = (gFloodSeedMatch != 0); - if (*seedMatched) { - *leftXOut = (int16_t)gFloodLeftX; - *rightXOut = (int16_t)gFloodRightX; - } - return true; -} - - -bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { - if (row == NULL || markBuf == NULL) { - return false; - } - iigsFloodScanRowInner(row, (uint16_t)leftX, (uint16_t)rightX, - (uint16_t)(matchColor & 0x0F), - (uint16_t)(newColor & 0x0F), - (uint16_t)(matchEqual ? 1 : 0), - markBuf); - return true; -} - - -bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { - if (row == NULL || stackX == NULL || stackY == NULL || spInOut == NULL) { - return false; - } - iigsFloodScanAndPushInner(row, - (uint16_t)leftX, (uint16_t)rightX, - (uint16_t)(matchColor & 0x0F), - (uint16_t)(newColor & 0x0F), - (uint16_t)(matchEqual ? 1 : 0), - (uint16_t)scanY, - stackX, stackY, - (uint16_t *)spInOut, - (uint16_t)maxSp); - return true; -} - - -bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { - if (pixels == NULL || stackX == NULL || stackY == NULL || spInOut == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) { - return false; - } - iigsFloodWalkAndScansInner(pixels, - (uint16_t)x, (uint16_t)y, - (uint16_t)(matchColor & 0x0F), - (uint16_t)(newColor & 0x0F), - (uint16_t)(matchEqual ? 
1 : 0), - stackX, stackY, - (uint16_t *)spInOut, - (uint16_t)maxSp); - *seedMatched = (gFloodSeedMatch != 0); - *leftXOut = (int16_t)gFloodLeftX; - *rightXOut = (int16_t)gFloodRightX; - return true; -} - - -bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { - if (dstRow0 == NULL || srcRow0 == NULL || copyW <= 0 || copyH <= 0) { - return false; - } - iigsBlitRectInner(dstRow0, (uint16_t)dstX, - srcRow0, (uint16_t)srcX, - (uint16_t)copyW, (uint16_t)copyH, - (uint16_t)srcRowBytes, - transparent); - return true; -} - - -bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { - uint8_t *row; - uint16_t pixelX; - uint16_t pixelY; - - if (s == NULL) { - return false; - } - pixelX = (uint16_t)((uint16_t)bx * 8u); - pixelY = (uint16_t)((uint16_t)by * 8u); - row = &s->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; - iigsTileFillInner(row, fillWord); - return true; -} - - uint8_t *halStageAllocPixels(void) { return IIGS_STAGE_PIXELS; } diff --git a/src/port/iigs/input.c b/src/port/iigs/input.c index b54554b..f4ede3d 100644 --- a/src/port/iigs/input.c +++ b/src/port/iigs/input.c @@ -109,7 +109,11 @@ static int8_t thresholdPaddle(uint8_t v); // does not accept designated initializers; runtime fill keeps lookup // O(1) instead of a 40-plus-case switch. static uint8_t gAsciiToKey[ASCII_TABLE_SIZE]; -static uint8_t gKeyTtl [KEY_COUNT]; + +// Non-static so iigsInputSnapshot (joeyDraw.asm) can reference it via +// long-mode addressing through the linker. The C TTL-decrement loop +// that used to live in halInputPoll moved to that asm helper. 
+uint8_t gKeyTtl [KEY_COUNT]; static int16_t gMouseAbsX = SURFACE_WIDTH / 2; static int16_t gMouseAbsY = SURFACE_HEIGHT / 2; @@ -166,9 +170,38 @@ static int8_t signExtend7(uint8_t raw) { } +// Map a raw 0..255 paddle reading to JOYSTICK_AXIS_MIN..MAX, using the +// stick's calibrated center (captured by joeyJoystickReset) and a +// dead-zone band around it. Returns 0 if reading is within deadZone of +// the center; otherwise the offset from center, clamped to int8_t. +static int8_t analogPaddle(uint8_t v, uint8_t center, uint8_t deadZone) { + int16_t delta; + + delta = (int16_t)v - (int16_t)center; + if (delta < 0) { + if ((-delta) <= (int16_t)deadZone) { + return 0; + } + if (delta < (int16_t)JOYSTICK_AXIS_MIN) { + return JOYSTICK_AXIS_MIN; + } + } else { + if (delta <= (int16_t)deadZone) { + return 0; + } + if (delta > (int16_t)JOYSTICK_AXIS_MAX) { + return JOYSTICK_AXIS_MAX; + } + } + return (int8_t)delta; +} + + // Threshold a 0..255 paddle reading into a digital direction so the // IIgs analog stick presents the same axis semantics as the digital -// sticks on ST/Amiga/DOS. Center range is treated as zero. +// sticks on ST/Amiga/DOS. Center range is treated as zero. Used +// before joeyJoystickReset has been called -- once the app calibrates, +// we switch to analogPaddle for finer control. static int8_t thresholdPaddle(uint8_t v) { if (v < PADDLE_LO_THRESHOLD) { return JOYSTICK_AXIS_MIN; @@ -191,53 +224,122 @@ static int8_t thresholdPaddle(uint8_t v) { // approximates the paddle's 0..255 position (the Apple firmware // PREAD routine works the same way). The two reads are inlined here // rather than factored into a helper because ORCA/C 2.1 trips over -// `volatile uint8_t *` function parameters. +// Auto-disconnect tracking. 
The paddle one-shot timer takes ~3 ms to +// charge at full deflection; if NO joystick is wired up, the BUSY bit +// stays set forever and the busy-wait runs the full PADDLE_TIMEOUT +// every frame -- ~3 ms wasted per frame on a stick that isn't there. +// +// After JOY_DISCONNECT_THRESHOLD consecutive timeouts we latch the +// stick as absent and stop polling entirely. The app calls +// joeyJoystickReset to clear the latch and resume polling. +#define JOY_DISCONNECT_THRESHOLD 60u + +static uint16_t gJoyConsecutiveTimeouts = 0; +static bool gJoyDisconnectLatched = false; + +// Analog calibration: gJoyCenterX/Y hold the raw paddle reading we +// captured the last time the user called joeyJoystickReset. Until +// that's called, gJoyCenterValid is false and pollJoystick falls back +// to the digital threshold mapping. gJoyRecalibrate is set by +// halJoystickReset and cleared on the next successful poll, which +// captures the new center. +static uint8_t gJoyCenterX [JOYSTICK_COUNT]; +static uint8_t gJoyCenterY [JOYSTICK_COUNT]; +static bool gJoyCenterValid [JOYSTICK_COUNT]; +static bool gJoyRecalibrate [JOYSTICK_COUNT]; + + +void halJoystickReset(JoeyJoystickE js) { + if ((int)js < 0 || (int)js >= JOYSTICK_COUNT) { + return; + } + // Re-enable polling and arm a fresh center capture for the next + // poll. The dead-zone value lives in core's gJoyDeadZone[js]. + gJoyConsecutiveTimeouts = 0; + gJoyDisconnectLatched = false; + gJoyRecalibrate[js] = true; +} + +// Asm paddle reader (joeyDraw.asm). Switches CPU to 1 MHz for the +// duration of the poll so paddle counts match what every other +// IIgs/Apple II joystick game produces (the C busy-wait at 2.8 MHz +// inflated counts). Returns results via gJoy* DRAWPRIMS scratch. 
+extern void iigsPollJoystickInner(void); + +extern volatile uint8_t gJoyPx; +extern volatile uint8_t gJoyPy; +extern volatile uint8_t gJoyResolved; // bit0: pdl0 fired; bit1: pdl1 fired + static void pollJoystick(void) { - uint16_t count; uint8_t px; uint8_t py; - uint8_t byte; + uint8_t resolvedFlags; bool xResolved; bool yResolved; - // One PTRIG read starts BOTH paddle timers simultaneously per the - // IIgs Hardware Reference. Polling them in parallel halves the - // wall-clock time vs. polling each serially after its own trigger. - byte = *IIGS_PTRIG; - px = 0; - py = 0; - xResolved = false; - yResolved = false; - for (count = 0; count < PADDLE_TIMEOUT; count++) { - if (!xResolved) { - byte = *IIGS_PADDLE0; - if ((byte & IIGS_PADDLE_BUSY) == 0) { - px = (uint8_t)count; - xResolved = true; - } - } - if (!yResolved) { - byte = *IIGS_PADDLE1; - if ((byte & IIGS_PADDLE_BUSY) == 0) { - py = (uint8_t)count; - yResolved = true; - } - } - if (xResolved && yResolved) { - break; - } - } - - // Timed-out paddles default to centered axis. Without an explicit - // resolved flag we couldn't distinguish "no joystick" from "stick - // hard right" -- both would yield px=255 and report AXIS_MAX. - gJoyAxisX[JOYSTICK_0] = xResolved ? thresholdPaddle(px) : 0; - gJoyAxisY[JOYSTICK_0] = yResolved ? thresholdPaddle(py) : 0; + // Buttons are I/O reads -- always cheap, do them every frame. gJoyButtonState[JOYSTICK_0][JOY_BUTTON_0] = (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0; gJoyButtonState[JOYSTICK_0][JOY_BUTTON_1] = (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0; - - gJoyConnected[JOYSTICK_0] = true; gJoyConnected[JOYSTICK_1] = false; + + // Once the stick has been latched as disconnected, only buttons + // get polled. The app must call joeyJoystickReset to resume axis + // polling (e.g., when the user has just plugged in a stick). 
+ if (gJoyDisconnectLatched) { + gJoyAxisX[JOYSTICK_0] = 0; + gJoyAxisY[JOYSTICK_0] = 0; + gJoyConnected[JOYSTICK_0] = false; + return; + } + + // Asm read at 1 MHz -- accurate paddle counts. + iigsPollJoystickInner(); + px = gJoyPx; + py = gJoyPy; + resolvedFlags = gJoyResolved; + xResolved = (resolvedFlags & 0x01) != 0; + yResolved = (resolvedFlags & 0x02) != 0; + + gJoyConnected[JOYSTICK_0] = xResolved || yResolved; + + // Update auto-disconnect counter. Both axes failing => probably no + // stick. One resolves => stick is present, reset the counter. + if (!xResolved && !yResolved) { + if (gJoyConsecutiveTimeouts < 0xFFFFu) { + gJoyConsecutiveTimeouts++; + } + if (gJoyConsecutiveTimeouts >= JOY_DISCONNECT_THRESHOLD) { + gJoyDisconnectLatched = true; + } + gJoyAxisX[JOYSTICK_0] = 0; + gJoyAxisY[JOYSTICK_0] = 0; + return; + } + + gJoyConsecutiveTimeouts = 0; + + // Capture the resting position on recalibrate (one-shot). + if (gJoyRecalibrate[JOYSTICK_0]) { + gJoyCenterX [JOYSTICK_0] = px; + gJoyCenterY [JOYSTICK_0] = py; + gJoyCenterValid[JOYSTICK_0] = true; + gJoyRecalibrate[JOYSTICK_0] = false; + } + + // Calibrated => analog axis report (offset from center, dead-zone + // clamped). Uncalibrated => the legacy 3-state digital threshold, + // matching how the stick behaved before joeyJoystickReset existed. 
+ if (gJoyCenterValid[JOYSTICK_0]) { + gJoyAxisX[JOYSTICK_0] = analogPaddle(px, + gJoyCenterX[JOYSTICK_0], + gJoyDeadZone[JOYSTICK_0]); + gJoyAxisY[JOYSTICK_0] = analogPaddle(py, + gJoyCenterY[JOYSTICK_0], + gJoyDeadZone[JOYSTICK_0]); + } else { + gJoyAxisX[JOYSTICK_0] = thresholdPaddle(px); + gJoyAxisY[JOYSTICK_0] = thresholdPaddle(py); + } } @@ -303,19 +405,14 @@ void halInputInit(void) { void halInputPoll(void) { - uint8_t kbd; - uint8_t ascii; - uint8_t key; - uint16_t i; + uint8_t kbd; + uint8_t ascii; + uint8_t key; - for (i = 0; i < KEY_COUNT; i++) { - if (gKeyTtl[i] > 0) { - gKeyTtl[i]--; - if (gKeyTtl[i] == 0) { - gKeyState[i] = false; - } - } - } + // The KEY_COUNT TTL-decrement loop and the gKeyState/gKeyPrev/ + // gMouseButtonPrev/gJoyButtonPrev snapshots all happen earlier in + // joeyInputPoll's call to iigsInputSnapshot (asm). We just read + // the live hardware state here. kbd = *IIGS_KBD; if (kbd & KBD_STROBE_BIT) { diff --git a/src/port/iigs/joeyDraw.asm b/src/port/iigs/joeyDraw.asm index 7a74fb0..77ea229 100644 --- a/src/port/iigs/joeyDraw.asm +++ b/src/port/iigs/joeyDraw.asm @@ -422,6 +422,257 @@ fillrDone anop end +**************************************************************** +* iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, +* uint16_t w, uint16_t h, uint16_t nibble) +* +* Full fillRect, including the leading- and trailing-partial-byte +* nibble handling that used to live in halFastFillRect's C wrapper. +* Called through the thin halFastFillRect wrapper in +* src/port/iigs/hal.c, which only does the stage check and forwards. +* +* Stage-only: hard-coded MVN bank $01 in the middle-bytes loop. +* Caller (halFastFillRect) checks `s == stageGet()` before invoking. +* +* Per-row work: leading partial-byte RMW (if x odd) + middle MVN +* (seed + MVN copies seed forward) + trailing partial-byte RMW +* (if (x+w) odd). All long-mode stores so DBR doesn't matter.
+* +* Args after PHP+PHB+PHD (TCD makes D = SP+8): +* pixels D+0..3 (long ptr; unused -- the routine never reads it +* and hard-codes the stage base $01:2000 instead; +* it is passed only for call symmetry) +* x D+4..5 +* y D+6..7 +* w D+8..9 +* h D+10..11 +* nibble D+12..13 (low byte = 0..15) +**************************************************************** + +iigsFillRectInner start DRAWPRIMS +fri_pix equ 0 +fri_x equ 4 +fri_y equ 6 +fri_w equ 8 +fri_h equ 10 +fri_nib equ 12 + + php + sei + phb + phd + rep #$30 + LONGA ON + LONGI ON + tsc + clc + adc #8 + tcd + +* pxStart = x; pxEnd = x + w. + lda fri_x + sta >friPxStart + clc + adc fri_w + sta >friPxEnd + +* leadingByte = x >> 1; hasLeading = x & 1. + lda fri_x + lsr a + sta >friLeadingByte + lda fri_x + and #1 + sta >friHasLeading + beq friStartReady + lda >friPxStart + inc a + sta >friPxStart +friStartReady anop + +* midStart = pxStart >> 1. + lda >friPxStart + lsr a + sta >friMidStart + +* spanLen = pxEnd - pxStart; hasTrailing = spanLen & 1; midBytes = spanLen >> 1. + lda >friPxEnd + sec + sbc >friPxStart + sta >friMidBytes ; temporary hold spanLen + and #1 + sta >friHasTrailing + lda >friMidBytes + lsr a + sta >friMidBytes + +* trailingByte = midStart + midBytes. + lda >friMidStart + clc + adc >friMidBytes + sta >friTrailingByte + +* nibble decoding: friNibLo = nib (low half), friNibHi = nib<<4 +* (high half), friDoubled = (nib<<4)|nib (full byte fill). + sep #$20 + LONGA OFF + lda fri_nib + and #$0F + sta >friNibLo + asl a + asl a + asl a + asl a + sta >friNibHi + ora >friNibLo + sta >friDoubled + rep #$20 + LONGA ON + +* curRow = $2000 + y*160 (LUT-driven). + lda fri_y + asl a + tax + lda >gRowOffsetLut,x + clc + adc #$2000 + sta >friCurRow + + lda fri_h + sta >friRowsLeft + +friRowLoop anop + lda >friRowsLeft + bne friDoRow + brl friExit +friDoRow anop + +* Leading partial-byte RMW.
+ lda >friHasLeading + beq friNoLead + lda >friCurRow + clc + adc >friLeadingByte + tax + sep #$20 + LONGA OFF + lda >$010000,x + and #$F0 + ora >friNibLo + sta >$010000,x + rep #$20 + LONGA ON +friNoLead anop + +* Middle bytes: if midBytes == 0, skip; if midBytes == 1, just seed +* one byte; if midBytes >= 2, seed first byte then MVN propagates the +* seed across the rest of the row. + lda >friMidBytes + bne friMidNonZero + brl friNoMid +friMidNonZero anop + lda >friCurRow + clc + adc >friMidStart + tax + sep #$20 + LONGA OFF + lda >friDoubled + sta >$010000,x + rep #$20 + LONGA ON + lda >friMidBytes + cmp #2 + bcc friNoMid + lda >friCurRow + clc + adc >friMidStart + tax ; src offset + inc a + tay ; dst offset = src+1 + lda >friMidBytes + dec a + dec a ; count = midBytes-2 (MVN copies count+1) + mvn $010000,$010000 +friNoMid anop + +* Trailing partial-byte RMW. + lda >friHasTrailing + beq friNoTrail + lda >friCurRow + clc + adc >friTrailingByte + tax + sep #$20 + LONGA OFF + lda >$010000,x + and #$0F + ora >friNibHi + sta >$010000,x + rep #$20 + LONGA ON +friNoTrail anop + +* Advance to next row. 
+ lda >friCurRow + clc + adc #160 + sta >friCurRow + lda >friRowsLeft + dec a + sta >friRowsLeft + brl friRowLoop + +friExit anop + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + end + + +friPxStart data DRAWPRIMS + ds 2 + end +friPxEnd data DRAWPRIMS + ds 2 + end +friLeadingByte data DRAWPRIMS + ds 2 + end +friHasLeading data DRAWPRIMS + ds 2 + end +friMidStart data DRAWPRIMS + ds 2 + end +friMidBytes data DRAWPRIMS + ds 2 + end +friHasTrailing data DRAWPRIMS + ds 2 + end +friTrailingByte data DRAWPRIMS + ds 2 + end +friNibLo data DRAWPRIMS + ds 2 + end +friNibHi data DRAWPRIMS + ds 2 + end +friDoubled data DRAWPRIMS + ds 2 + end +friCurRow data DRAWPRIMS + ds 2 + end +friRowsLeft data DRAWPRIMS + ds 2 + end + + fillrSeed data DRAWPRIMS ds 2 ; 2 bytes -- read as 16-bit fillWord end @@ -2110,189 +2361,13 @@ fcFillHi data DRAWPRIMS **************************************************************** -* iigsFloodWalkInner(uint8_t *row, uint16_t startX, -* uint8_t matchColor, uint8_t newColor, -* uint8_t matchEqual) -* -* Combined seed test + walk-left + walk-right for floodFill. -* Tests pixel at startX against the match criterion. If it matches, -* walks left and right finding the contiguous run of matching pixels. -* -* Match criterion: -* matchEqual != 0: pix == matchColor (used by floodFill) -* matchEqual == 0: pix != matchColor && pix != newColor -* (used by floodFillBounded) -* -* Outputs (DRAWPRIMS globals): -* gFloodSeedMatch -- 1 if seed pixel matches, 0 otherwise -* gFloodLeftX -- leftmost matching column -* gFloodRightX -- rightmost matching column -* -* If seed doesn't match, leftX/rightX are not meaningful; caller -* should bail out on gFloodSeedMatch == 0. 
-* -* Args after PHP+PHB+PHD (TCD makes D = SP+8): -* row at D+0..3 (long ptr to row's first byte) -* startX at D+4..5 -* matchColor at D+6..7 (low byte) -* newColor at D+8..9 (low byte) -* matchEqual at D+10..11 (low byte: 0 = bounded mode) +* iigsFloodWalkInner / fwTest deleted -- the C fallback path that +* called halFastFloodWalk (which delegated here) is unreachable on +* IIgs because halFastFloodWalkAndScans always succeeds first. The +* gFloodSeedMatch / gFloodLeftX / gFloodRightX globals below are +* still used by iigsFloodWalkAndScansInner and stay. **************************************************************** -iigsFloodWalkInner start IIGSASM -fwrow equ 0 -fwstart equ 4 -fwmatch equ 6 -fwnew equ 8 -fweq equ 10 - - php - phb - phd - rep #$30 - LONGA ON - LONGI ON - - tsc - clc - adc #8 - tcd ; D = SP+8 - -* Cache match nibble + new nibble + matchEqual flag for fast -* comparisons. Use M=16 stores (high byte = 0 since values are -* small). - lda fwmatch - and #$00FF - sta >fwMatchNib - lda fwnew - and #$00FF - sta >fwNewNib - lda fweq - and #$00FF - sta >fwEqFlag - -* Seed test at startX. - lda fwstart - tax - jsr fwTest ; A = 1 if match, 0 if not - sta >gFloodSeedMatch - cmp #0 - bne fwSeedOk - brl fwExit ; seed doesn't match -fwSeedOk anop - -* Walk left: leftX = startX; while leftX > 0 and pix(leftX-1) match: leftX--. - lda fwstart - sta >gFloodLeftX -fwLeftLoop anop - lda >gFloodLeftX - beq fwLeftDone ; leftX == 0 - dec a - tax ; X = leftX - 1 - jsr fwTest - cmp #0 - beq fwLeftDone ; mismatch - lda >gFloodLeftX - dec a - sta >gFloodLeftX - brl fwLeftLoop -fwLeftDone anop - -* Walk right: rightX = startX; while rightX < 319 and pix(rightX+1) match: rightX++. 
- lda fwstart - sta >gFloodRightX -fwRightLoop anop - lda >gFloodRightX - cmp #319 - bcs fwRightDone ; rightX >= 319 - inc a - tax ; X = rightX + 1 - jsr fwTest - cmp #0 - beq fwRightDone - lda >gFloodRightX - inc a - sta >gFloodRightX - brl fwRightLoop -fwRightDone anop - -fwExit anop - LONGA OFF - LONGI OFF - pld - plb - plp - rtl - -**************************************************************** -* fwTest: test pixel at column X against match criterion. -* Input: X = column (M=16 X=16 on entry) -* Output: A = 1 if pixel matches, 0 if not (M=16 on exit) -* Trashes: A, Y, P. Preserves X, D, B. -**************************************************************** -fwTest anop - LONGA ON - LONGI ON -* byteOffset = X >> 1, parity = X & 1 - txa - lsr a ; A = X>>1, C = X & 1 - tay ; Y = byte offset - bcs fwTestOdd -* Even X -- high nibble. - sep #$20 - LONGA OFF - lda [fwrow],y - lsr a - lsr a - lsr a - lsr a ; A = high nibble - bra fwTestGotPix -fwTestOdd anop - sep #$20 - LONGA OFF - lda [fwrow],y - and #$0F ; A = low nibble -fwTestGotPix anop -* A holds nibble (M=8). Compare against match criterion. - cmp >fwMatchNib - bne fwNotMatchColor -* pix == matchColor. - lda >fwEqFlag - bne fwTestMatch ; matchEqual=1 + pix==matchColor -> match - bra fwTestNoMatch ; matchEqual=0 + pix==matchColor -> NOT match -fwNotMatchColor anop -* pix != matchColor. - lda >fwEqFlag - bne fwTestNoMatch ; matchEqual=1 + pix!=matchColor -> NOT match -* matchEqual=0 path: also need pix != newColor. Re-extract the nibble. - txa - and #$01 ; bit 0 = parity - bne fwNibLoCheck - lda [fwrow],y - lsr a - lsr a - lsr a - lsr a - bra fwGotNibForNew -fwNibLoCheck anop - lda [fwrow],y - and #$0F -fwGotNibForNew anop - cmp >fwNewNib - beq fwTestNoMatch - bra fwTestMatch -fwTestMatch anop - rep #$20 - LONGA ON - lda #1 - rts -fwTestNoMatch anop - rep #$20 - LONGA ON - lda #0 - rts - end - **************************************************************** * Globals for floodFill asm walk results. 
Live in DRAWPRIMS so * both asm and C see them at the same long address. @@ -2307,15 +2382,6 @@ gFloodLeftX data DRAWPRIMS gFloodRightX data DRAWPRIMS ds 2 end -fwMatchNib data DRAWPRIMS - ds 2 - end -fwNewNib data DRAWPRIMS - ds 2 - end -fwEqFlag data DRAWPRIMS - ds 2 - end **************************************************************** @@ -2379,18 +2445,30 @@ mvnScbInst mvn $000000,$E10000 lda #511 mvnPalInst mvn $000000,$E10000 -* 3. Pixel blit via PEI-slam, with per-row dirty skip. +* 3. Pixel blit via PEI-slam, with per-row dirty skip and chunked SEI. * PEI-slam: SP hijacked into the SHR shadow region of bank $01, AUXWRITE * + RAMRD remap bank-$00 stack pushes to bank $01, SHR shadow mirrors * bank-$01 writes to $E1. Result: PEI dp pushes from DP=$01:row_start * land at $E1:row_start (160 bytes / row at ~6 cyc per 2 bytes). * ~480 cyc/row vs MVN's ~1120 cyc/row -- 2.3x faster per row. -* SEI for the duration: soft-switch state and stack hijack would -* corrupt any IRQ handler that touches bank-0 globals. ~38 ms SEI -* total for a full 200-row slam; chunk later if audio glitches. +* +* Chunked SEI: the slam runs in 5 chunks of up to 40 rows each, with +* CLI between chunks so audio (DOC IRQ) and other periodic interrupts +* aren't starved. Each chunk holds SEI for ~7.6 ms worst case (40 +* dirty rows). Per-chunk teardown restores the original soft-switch +* state, then the next chunk re-arms it. Original SP / shadow are +* captured once before the loop, so the per-chunk reset just reloads +* them. +* * Dirty skip: rows where gStageMinWord[y] > gStageMaxWord[y] are * clean and not slammed. Saves big on sparse-update demos; for * full-screen presents (DRAW), every row slams. +* +* Register usage during the chunk loop: +* X = absolute row counter (0..200) +* Y = in-chunk row counter (0..40) +* Both are clobbered by the per-row slam (TCD/TCS) so we save through +* gPeiCurRow / gPeiChunkRow long-mode scratch around each slam.
tsc sta >gPeiOrigSp @@ -2401,6 +2479,11 @@ mvnPalInst mvn $000000,$E10000 rep #$20 LONGA ON + ldx #0 ; X = absolute row counter + +peiChunkBegin anop + ldy #0 ; Y = in-chunk row counter + sei sep #$20 @@ -2414,15 +2497,15 @@ mvnPalInst mvn $000000,$E10000 rep #$20 LONGA ON - ldx #0 ; X = row counter (need X because -* long-abs,Y doesn't exist on 65816 -- -* only long-abs,X does, so the dirty- -* check `lda >gStageMinWord,x` works.) peiRowLoop anop cpx #200 - bcc peiRowCheck - brl peiRowsDone -peiRowCheck anop + bcc peiNotAllDone ; X < 200 -> still rows to do + brl peiChunkEnd ; 8-bit BCS can't reach across the 80-PEI body +peiNotAllDone anop + cpy #40 + bcc peiCheckDirty ; Y < 40 -> chunk has room + brl peiChunkEnd +peiCheckDirty anop sep #$20 LONGA OFF lda >gStageMinWord,x @@ -2432,12 +2515,17 @@ peiRowCheck anop bcc peiSlamRow ; min < max -> dirty beq peiSlamRow ; min == max -> 1-word dirty inx ; clean row, skip + iny brl peiRowLoop peiSlamRow anop -* Save X into long-mode scratch (stack is hijacked into $E1, can't PHX). +* Save X and Y into long-mode scratch (stack is hijacked into $E1, +* can't PHX/PHY without polluting the pixel data we're slamming). txa sta >gPeiCurRow + tya + sta >gPeiChunkRow + lda >gPeiCurRow asl a ; A = y*2 (LUT byte offset) tax lda >gRowOffsetLut,x ; A = y*160 @@ -2536,10 +2624,15 @@ peiSlamRow anop lda >gPeiCurRow tax inx + lda >gPeiChunkRow + tay + iny brl peiRowLoop -peiRowsDone anop -* Restore SP, soft-switches. +peiChunkEnd anop +* Per-chunk teardown: restore SP, shadow, AUXWRITE/RAMRD, then CLI to +* let pending IRQs run. If more rows remain, the next iteration of +* peiChunkBegin re-arms everything. 
lda >gPeiOrigSp tcs sep #$20 @@ -2551,7 +2644,13 @@ peiRowsDone anop sta >$00C002 ; RAMRD off rep #$20 LONGA ON + cli + cpx #200 + bcs peiAllDone + brl peiChunkBegin + +peiAllDone anop LONGA OFF LONGI OFF pld @@ -2561,41 +2660,30 @@ peiRowsDone anop end - **************************************************************** -* iigsFloodScanRowInner(uint8_t *row, uint16_t leftX, -* uint16_t rightX, uint8_t matchColor, -* uint8_t newColor, uint8_t matchEqual, -* uint8_t *markBuf) +* iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, +* uint16_t rowsLeft) * -* Walk pixels [leftX..rightX] (inclusive) of `row`. For each, write -* 1 to markBuf if the pixel "qualifies for flood": -* matchEqual != 0: pix == matchColor -* matchEqual == 0: pix != matchColor && pix != newColor -* Else write 0. +* Per-row MVN blit of `copyBytes` bytes from $01:srcOffset to +* $E1:srcOffset, advancing srcOffset by 160 each row for `rowsLeft` +* rows. Used by halPresentRect for partial-screen presents (e.g., +* sprite blits where only the union of old/new ball rect is dirty). * -* C side then walks markBuf for run-edge transitions (no per-pixel -* nibble extract / function call) -- much faster than the C loop -* with srcPixel. +* MVN at ~9 cyc/byte (with $E1 slow-RAM wait states) beats ORCA-C's +* per-byte memcpy loop (~30 cyc/byte). For a 16x16 sprite rect (16 +* rows of ~8 bytes) the savings are ~2K cycles per present (~0.7 ms), +* which adds up across an animation loop running at full tilt. * * Args after PHP+PHB+PHD (TCD makes D = SP+8): -* row at D+0..3 (long ptr) -* leftX at D+4..5 -* rightX at D+6..7 -* matchColor at D+8..9 (low byte) -* newColor at D+10..11 (low byte) -* matchEqual at D+12..13 (low byte) -* markBuf at D+14..17 (long ptr) +* srcOffset at D+0..1 (within bank-$01, e.g.
$2000..$9C60) +* copyBytes at D+2..3 (>= 1) +* rowsLeft at D+4..5 (>= 1) **************************************************************** -iigsFloodScanRowInner start IIGSASM -fsRow equ 0 -fsLeft equ 4 -fsRight equ 6 -fsMatch equ 8 -fsNew equ 10 -fsEq equ 12 -fsMark equ 14 +iigsBlitRectStageToShr start DRAWPRIMS +brsOff equ 0 +brsBytes equ 2 +brsRows equ 4 php phb @@ -2608,96 +2696,36 @@ fsMark equ 14 adc #8 tcd -* spanLen = rightX - leftX + 1; bail if rightX < leftX. - lda fsRight - sec - sbc fsLeft - bpl fsSpanOk - brl fsExit -fsSpanOk anop - inc a - sta >fsSpanLen +* MVN expects (count - 1) in A. Cache once. + lda brsBytes + dec a + sta >brsBytesM1 -* Cache 8-bit constants used in the inner loop. - sep #$20 - LONGA OFF - lda fsMatch - and #$0F - sta >fsMatchByte - lda fsNew - and #$0F - sta >fsNewByte - lda fsEq - sta >fsEqByte - rep #$20 - LONGA ON +brsRowLoop anop + lda brsRows + bne brsRowGo + brl brsExit +brsRowGo anop - ldy #0 ; Y = i = 0..spanLen-1 - -fsLoop anop - tya - cmp >fsSpanLen - bcc fsBody - brl fsExit -fsBody anop -* curX = leftX + Y; byteIdx = curX >> 1; use Y for [fsRow],y read. - phy - clc - adc fsLeft - sta >fsCurXTemp - lsr a +* X = src offset in bank $01, Y = dst offset in bank $E1 (same value). + lda brsOff + tax tay - sep #$20 - LONGA OFF - lda [fsRow],y - sta >fsByte - rep #$20 - LONGA ON - ply + lda >brsBytesM1 + mvn $010000,$E10000 -* M=8 for the rest of the inner body. - sep #$20 - LONGA OFF - lda >fsCurXTemp ; A = low byte of curX - lsr a ; carry = curX & 1 - bcs fsOddPix - lda >fsByte - lsr a - lsr a - lsr a - lsr a - bra fsHaveNib -fsOddPix anop - lda >fsByte - and #$0F -fsHaveNib anop - sta >fsPixNib +* Advance offset by SURFACE_BYTES_PER_ROW (160). 
+ clc + lda brsOff + adc #160 + sta brsOff - cmp >fsMatchByte - bne fsNotMatch - lda >fsEqByte - bne fsOne - bra fsZero -fsNotMatch anop - lda >fsEqByte - bne fsZero - lda >fsPixNib - cmp >fsNewByte - beq fsZero -fsOne anop - lda #1 - sta [fsMark],y - bra fsNextIter -fsZero anop - lda #0 - sta [fsMark],y -fsNextIter anop - rep #$20 - LONGA ON - iny - brl fsLoop + lda brsRows + dec a + sta brsRows + brl brsRowLoop -fsExit anop +brsExit anop LONGA OFF LONGI OFF pld @@ -2707,68 +2735,40 @@ fsExit anop end -fsSpanLen data DRAWPRIMS - ds 2 - end -fsCurXTemp data DRAWPRIMS - ds 2 - end -fsMatchByte data DRAWPRIMS - ds 2 - end -fsNewByte data DRAWPRIMS - ds 2 - end -fsEqByte data DRAWPRIMS - ds 2 - end -fsByte data DRAWPRIMS - ds 2 - end -fsPixNib data DRAWPRIMS +brsBytesM1 data DRAWPRIMS ds 2 end **************************************************************** -* iigsFloodScanAndPushInner(uint8_t *row, uint16_t leftX, -* uint16_t rightX, uint16_t matchColor, -* uint16_t newColor, uint16_t matchEqual, -* uint16_t scanY, -* int16_t *stackX, int16_t *stackY, -* uint16_t *spInOut, uint16_t maxSp) +* iigsMarkDirtyRowsInner(yStart, yEnd, minWord, maxWord) * -* Combined per-pixel match scan + run-edge walk + seed push. Replaces -* the C loop that follows iigsFloodScanRowInner and walks markBuf for -* push edges. Doing it all in one asm call eliminates the per-pixel C -* loop overhead (call setup + array indexing under -b 4-byte ptrs). +* Tight inline of surfaceMarkDirtyRect's per-row widenRow loop. +* For each row in [yStart, yEnd) (yEnd exclusive), widen +* gStageMinWord[row] DOWN to minWord (if smaller) and +* gStageMaxWord[row] UP to maxWord (if larger). 
* -* Args after PHP+PHB+PHD (TCD makes D = SP+8): -* row D+0..3 (long ptr) -* leftX D+4..5 -* rightX D+6..7 -* matchColor D+8..9 (low byte) -* newColor D+10..11 -* matchEqual D+12..13 -* scanY D+14..15 -* stackX D+16..19 (long ptr) -* stackY D+20..23 (long ptr) -* spInOut D+24..27 (long ptr to int16_t sp; read+write) -* maxSp D+28..29 +* Replaces the ORCA-C version's per-row JSL/PHB/RTL/PLB chain +* (~150 cyc/row) with a ~35 cyc/row asm body. Two calls per +* sprite frame (one for restore, one for draw) on a 16-row +* sprite saves ~3500 cyc/frame; cumulative savings across all +* draw primitives that mark dirty are larger. +* +* Caller is responsible for clipping yStart/yEnd to [0, 200); +* matches the original C contract (no internal bounds check). +* +* Args after PHP+PHB+PHD (TCD = SP+8): +* yStart at D+0..1 +* yEnd at D+2..3 (exclusive) +* minWord at D+4..5 (low byte used; high byte ignored) +* maxWord at D+6..7 (low byte used; high byte ignored) **************************************************************** -iigsFloodScanAndPushInner start IIGSASM -fpRow equ 0 -fpLeft equ 4 -fpRight equ 6 -fpMatch equ 8 -fpNew equ 10 -fpEq equ 12 -fpScanY equ 14 -fpStackX equ 16 -fpStackY equ 20 -fpSpInOut equ 24 -fpMaxSp equ 28 +iigsMarkDirtyRowsInner start MARKDIRTY +mdrYStart equ 0 +mdrYEnd equ 2 +mdrMinWord equ 4 +mdrMaxWord equ 6 php phb @@ -2781,270 +2781,266 @@ fpMaxSp equ 28 adc #8 tcd -* spanLen = rightX - leftX + 1; bail if rightX < leftX. - lda fpRight - sec - sbc fpLeft - bpl fpSpanOk - brl fpExit -fpSpanOk anop - inc a - sta >fpSpanLen - -* Load sp from *spInOut into scratch. - ldy #0 - lda [fpSpInOut],y - sta >fpSp - -* Cache maxSp. - lda fpMaxSp - sta >fpMaxSpCache - -* Cache 8-bit constants and clear prevHit. 
- sep #$20 - LONGA OFF - lda fpMatch - and #$0F - sta >fpMatchByte - lda fpNew - and #$0F - sta >fpNewByte - lda #0 - sta >fpPrevHit - rep #$20 - LONGA ON - -* Branch on matchEqual ONCE: two specialized inner loops avoid the -* per-pixel test-and-branch on fpEqByte. - lda fpEq - and #$00FF - bne fpEqEntry - brl fpBoundEntry - -***** EQUAL MODE LOOP: hit = (pix == matchColor) ***** -fpEqEntry anop - ldy #0 ; Y = i - -fpEqLoop anop - tya - cmp >fpSpanLen - bcc fpEqBody - brl fpAfterLoop -fpEqBody anop - phy - clc - adc fpLeft - sta >fpCurX - lsr a - tay - sep #$20 - LONGA OFF - lda [fpRow],y - sta >fpByte - rep #$20 - LONGA ON - ply + ldx mdrYStart + cpx mdrYEnd + bcs mdrExit ; empty range -> nothing to do sep #$20 LONGA OFF - lda >fpCurX - lsr a - bcs fpEqOdd - lda >fpByte - lsr a - lsr a - lsr a - lsr a - bra fpEqHaveNib -fpEqOdd anop - lda >fpByte - and #$0F -fpEqHaveNib anop - cmp >fpMatchByte - beq fpEqHit - lda #0 - bra fpEqStoreHit -fpEqHit anop - lda #1 -fpEqStoreHit anop - sta >fpCurHit -* Falling edge: prevHit=1, curHit=0 -> push (curX - 1, scanY). 
- bne fpEqNoFall - lda >fpPrevHit - beq fpEqNoFall +mdrLoop anop + lda mdrMinWord + cmp >gStageMinWord,x + bcs mdrNoMin ; A >= existing -> existing already <= minWord + sta >gStageMinWord,x ; widen down +mdrNoMin anop + lda mdrMaxWord + cmp >gStageMaxWord,x + bcc mdrNoMax ; A < existing -> existing already >= maxWord + beq mdrNoMax + sta >gStageMaxWord,x ; widen up +mdrNoMax anop + inx + cpx mdrYEnd + bcc mdrLoop + rep #$20 LONGA ON - lda >fpCurX - dec a - sta >fpPushX - jsr fpPushXY - sep #$20 - LONGA OFF -fpEqNoFall anop - lda >fpCurHit - sta >fpPrevHit - rep #$20 - LONGA ON - iny - brl fpEqLoop -***** BOUNDARY MODE LOOP: hit = (pix != matchColor && pix != newColor) ***** -fpBoundEntry anop - ldy #0 - -fpBoundLoop anop - tya - cmp >fpSpanLen - bcc fpBoundBody - brl fpAfterLoop -fpBoundBody anop - phy - clc - adc fpLeft - sta >fpCurX - lsr a - tay - sep #$20 - LONGA OFF - lda [fpRow],y - sta >fpByte - rep #$20 - LONGA ON - ply - - sep #$20 - LONGA OFF - lda >fpCurX - lsr a - bcs fpBoundOdd - lda >fpByte - lsr a - lsr a - lsr a - lsr a - bra fpBoundHaveNib -fpBoundOdd anop - lda >fpByte - and #$0F -fpBoundHaveNib anop - cmp >fpMatchByte - beq fpBoundMiss - cmp >fpNewByte - beq fpBoundMiss - lda #1 - bra fpBoundStoreHit -fpBoundMiss anop - lda #0 -fpBoundStoreHit anop - sta >fpCurHit - - bne fpBoundNoFall - lda >fpPrevHit - beq fpBoundNoFall - rep #$20 - LONGA ON - lda >fpCurX - dec a - sta >fpPushX - jsr fpPushXY - sep #$20 - LONGA OFF -fpBoundNoFall anop - lda >fpCurHit - sta >fpPrevHit - rep #$20 - LONGA ON - iny - brl fpBoundLoop - -fpAfterLoop anop -* Trailing run: if prevHit, push (rightX, scanY). 
- sep #$20 - LONGA OFF - lda >fpPrevHit - beq fpStoreSp - rep #$20 - LONGA ON - lda fpRight - sta >fpPushX - jsr fpPushXY - bra fpStoreSpDone -fpStoreSp anop - rep #$20 - LONGA ON -fpStoreSpDone anop - ldy #0 - lda >fpSp - sta [fpSpInOut],y - -fpExit anop +mdrExit anop LONGA OFF LONGI OFF pld plb plp rtl - -* fpPushXY: push (fpPushX, fpScanY) onto stackX/stackY at sp. -* No-op if sp >= maxSp. M=16, X=16 on entry; preserves all caller regs. -fpPushXY anop - pha - phy - lda >fpSp - cmp >fpMaxSpCache - bcs fpPushSkip - asl a ; sp*2 = byte offset - tay - lda >fpPushX - sta [fpStackX],y - lda fpScanY - sta [fpStackY],y - lda >fpSp - inc a - sta >fpSp -fpPushSkip anop - ply - pla - rts end -fpSpanLen data DRAWPRIMS +**************************************************************** +* iigsPollJoystickInner(void) +* +* Reads the IIgs paddle ports (PDL0/PDL1) at the period-correct 1 MHz +* CPU speed, ported from the old joeylib's jIIgs.asm. The polling rate +* and the paddle one-shot timer rate must match for the count value to +* mean what every other IIgs/Apple II joystick game expects. +* +* Algorithm: +* 1. PHP / SEI / save DBR / DBR=0 (so abs $C0xx I/O reads work). +* 2. Save current $C036 CYAREG; clear bit 7 to force 1 MHz. +* 3. PTRIG ($C070) to start both paddle timers in parallel. +* 4. Tight interleaved poll loop: +* - Both busy: lda PDL0; if done -> pdl1-only loop. Else inx; +* inx; lda PDL1; if done -> pdl0-only loop. Else iny; loop. +* - X overflow (128 iter, +2 each) marks pdl0 unresolved (timeout). +* - Y overflow (256 iter * 1) marks pdl1 unresolved. +* 5. Y steps by 1 per pass where X steps by 2 (mirrors old jIIgs.asm), so +* the trailing `asl a` doubles it back to the same scale. +* 6. Restore $C036, restore DBR, restore P (incl I).
+* +* Outputs (DRAWPRIMS scratch): +* gJoyPx -- pdl0 count, 0..254 (unconnected/timeout: 0) +* gJoyPy -- pdl1 count, 0..254 +* gJoyResolved -- bit 0 set iff pdl0 fired before timeout; +* bit 1 set iff pdl1 fired +**************************************************************** + +iigsPollJoystickInner start IIGSASM + php + sei + + rep #$30 + LONGA ON + LONGI ON + +* Save caller's DBR; force DBR=0 so abs $C0xx hits the I/O page. + phb + pea $0000 + plb ; pull low byte of pushed word -> DBR=0 + plb ; pull high byte (discard, was 0) + + sep #$30 ; M=8, X=8 (tight inner loop) + LONGA OFF + LONGI OFF + +* Save CYAREG and force 1 MHz (clear bit 7). + lda $C036 + sta >gJoyOrigSpeed + and #$7F + sta $C036 + + ldx #0 ; pdl0 count (increments by 2) + ldy #0 ; pdl1 count (increments by 1; asl at end) + lda #0 + sta >gJoyResolved + +* PTRIG: start both paddle one-shot timers. + lda $C070 + +* Both-busy poll loop. ~25 cyc / iter; X reaches ~240 at full +* deflection, X wraps (8-bit) at ~3.2 ms (paddle timer max). +joyChkPdl0 anop + lda $C064 ; PDL0 + bpl joyGotPdl0 ; bit7=0 -> pdl0 done + inx + inx + beq joyTimeoutX ; X wrapped -> pdl0 never fired + lda $C065 ; PDL1 + bmi joyNoGots ; bit7=1 -> pdl1 still busy +* pdl1 just fired. Mark resolved, switch to pdl0-only loop. + lda #$02 + sta >gJoyResolved +joyPdl0Only anop + lda $C064 + bpl joyAllDone + inx + inx + bne joyPdl0Only + bra joyTimeoutX +joyNoGots anop + iny + bne joyChkPdl0 + bra joyTimeoutY + +joyGotPdl0 anop + lda #$01 + sta >gJoyResolved +joyPdl1Only anop + lda $C065 + bpl joyAllDone + iny + bne joyPdl1Only + bra joyTimeoutY + +joyAllDone anop + lda >gJoyResolved + ora #$03 + sta >gJoyResolved + bra joyExit + +joyTimeoutX anop + lda >gJoyResolved + and #$02 ; clear bit 0 (pdl0 unresolved) + sta >gJoyResolved + bra joyExit + +joyTimeoutY anop + lda >gJoyResolved + and #$01 ; clear bit 1 (pdl1 unresolved) + sta >gJoyResolved + +joyExit anop +* STX/STY have no long-abs form. Stash via TXA/TYA. 
+ txa + sta >gJoyPx + tya + asl a ; scale Y from 0..127 to 0..254 + sta >gJoyPy + +* Restore CYAREG (other bits preserved by saving full byte). + lda >gJoyOrigSpeed + sta $C036 + + rep #$30 + LONGA ON + LONGI ON + plb ; restore caller DBR + + plp ; restores I from pre-SEI value + rtl + end + + +gJoyPx data DRAWPRIMS ds 2 end -fpSp data DRAWPRIMS +gJoyPy data DRAWPRIMS ds 2 end -fpMaxSpCache data DRAWPRIMS +gJoyResolved data DRAWPRIMS ds 2 end -fpMatchByte data DRAWPRIMS +gJoyOrigSpeed data DRAWPRIMS ds 2 end -fpNewByte data DRAWPRIMS - ds 2 - end -fpEqByte data DRAWPRIMS - ds 2 - end -fpPrevHit data DRAWPRIMS - ds 2 - end -fpCurHit data DRAWPRIMS - ds 2 - end -fpCurX data DRAWPRIMS - ds 2 - end -fpByte data DRAWPRIMS - ds 2 - end -fpPixNib data DRAWPRIMS - ds 2 - end -fpPushX data DRAWPRIMS - ds 2 + + +**************************************************************** +* iigsInputSnapshot(void) +* +* Per-frame input bookkeeping done in one tight asm pass instead of +* the three C memcpys + C TTL loop that joeyInputPoll used to do. +* Saves ~0.6 ms per frame in animated demos. +* +* Three combined operations: +* 1. Decrement gKeyTtl[i] for every key; on transition to zero, +* clear gKeyState[i] (key is now "released"). +* 2. Snapshot gKeyState -> gKeyPrev (KEY_COUNT bytes via long-mode +* lda/sta loop, ~15 cyc/byte). +* 3. Snapshot gMouseButtonState/gJoyButtonState (4 bytes each) +* via 4 inline lda/sta pairs. +* +* IMPORTANT: KEY_COUNT is hard-coded at 60 below. If you add or +* remove a key in joey/input.h, bump the constant or the loop bounds +* will silently miss / spill. The C side has a STATIC_ASSERT-ish +* check via a build-time array dimension trick (see input.c). +**************************************************************** + +iigsInputSnapshot start IIGSASM + php + rep #$30 + LONGA ON + LONGI ON + sep #$20 + LONGA OFF + +* TTL decrement + key-released detection. ~12 cyc / iter fast path. 
+ ldx #59 ; KEY_COUNT - 1 +isnTtlLoop anop + lda >gKeyTtl,x + beq isnTtlNext ; ttl==0, nothing to do + dec a + sta >gKeyTtl,x + bne isnTtlNext ; not yet zero + sta >gKeyState,x ; A==0 -> mark released +isnTtlNext anop + dex + bpl isnTtlLoop + +* Snapshot gKeyState -> gKeyPrev (60 bytes), long-mode loop. + ldx #59 +isnKeyLoop anop + lda >gKeyState,x + sta >gKeyPrev,x + dex + bpl isnKeyLoop + +* Snapshot gMouseButtonState -> gMouseButtonPrev (4 bytes inline). + lda >gMouseButtonState + sta >gMouseButtonPrev + lda >gMouseButtonState+1 + sta >gMouseButtonPrev+1 + lda >gMouseButtonState+2 + sta >gMouseButtonPrev+2 + lda >gMouseButtonState+3 + sta >gMouseButtonPrev+3 + +* Snapshot gJoyButtonState -> gJoyButtonPrev (4 bytes inline, +* JOYSTICK_COUNT * JOY_BUTTON_COUNT = 2*2). + lda >gJoyButtonState + sta >gJoyButtonPrev + lda >gJoyButtonState+1 + sta >gJoyButtonPrev+1 + lda >gJoyButtonState+2 + sta >gJoyButtonPrev+2 + lda >gJoyButtonState+3 + sta >gJoyButtonPrev+3 + + rep #$30 + LONGA ON + LONGI ON + plp + rtl end @@ -4114,160 +4110,237 @@ brTrans equ 18 adc #8 tcd -* rowsLeft = copyH. +* Cache rowsLeft, transparent low byte. Branch on transparent==$FF +* (opaque mode) once at top so per-pixel loop has no transparency +* test in the hot path. lda brH sta >brRowsLeft + sep #$20 + LONGA OFF + lda brTrans + sta >brTransByte + cmp #$FF + rep #$20 + LONGA ON + beq brOpaqueRowLoop + brl brMaskedRowLoop -brRowLoop anop +***** OPAQUE MODE: every src pixel writes its dst nibble unconditionally ***** +brOpaqueRowLoop anop lda >brRowsLeft - bne brRowGo + bne brOpRowGo brl brExit -brRowGo anop - -* For this row: walk col 0..copyW-1. +brOpRowGo anop lda #0 sta >brCol - -brColLoop anop +brOpColLoop anop lda >brCol cmp brW - bcc brColGo - brl brColDone -brColGo anop + bcc brOpColGo + brl brOpRowAdv +brOpColGo anop -* Compute src x = srcX + col, src byte addr offset = sx >> 1, parity = sx & 1. +* Src nib extract. 
lda brSrcX clc adc >brCol sta >brSx lsr a - tay ; Y = src byte offset - lda >brSx - and #$0001 - sta >brSxParity - + tay sep #$20 LONGA OFF lda [brSrc],y sta >brSrcByte - rep #$20 - LONGA ON - lda >brSxParity - bne brSrcOdd -* Even src x -> high nibble. - sep #$20 - LONGA OFF + lda >brSx + lsr a + bcs brOpSrcOdd lda >brSrcByte lsr a lsr a lsr a lsr a - bra brSrcGotNib -brSrcOdd anop - sep #$20 - LONGA OFF + bra brOpSrcDone +brOpSrcOdd anop lda >brSrcByte and #$0F -brSrcGotNib anop - sta >brNib ; M=8 store low byte +brOpSrcDone anop + sta >brNib -* Transparency check: if low byte of brTrans == nib, skip write. +* Dst RMW. rep #$20 LONGA ON - lda brTrans - and #$00FF - cmp #$00FF - beq brOpaque ; $FF means no transparency (opaque mode) - lda brTrans - and #$00FF - cmp >brNib - bne brOpaque ; not transparent, keep going - brl brColAdvance ; transparent, skip dst write -brOpaque anop - -* Compute dst x = dstX + col, dst byte addr offset = dx >> 1, parity = dx & 1. lda brDstX clc adc >brCol sta >brDx lsr a - tay ; Y = dst byte offset + tay + sep #$20 + LONGA OFF lda >brDx - and #$0001 - sta >brDxParity - - sep #$20 - LONGA OFF + lsr a + bcs brOpDstOdd + lda >brNib + asl a + asl a + asl a + asl a + sta >brDstNibPart lda [brDst],y - sta >brDstByte - rep #$20 - LONGA ON - lda >brDxParity - bne brDstOdd -* Even dst x -> high nibble. dst = (dst & 0x0F) | (nib << 4). - sep #$20 - LONGA OFF - lda >brNib - asl a - asl a - asl a - asl a - sta >brDstNibPart - lda >brDstByte and #$0F ora >brDstNibPart sta [brDst],y - rep #$20 - LONGA ON - bra brColAdvance -brDstOdd anop - sep #$20 - LONGA OFF - lda >brNib - and #$0F - sta >brDstNibPart - lda >brDstByte + bra brOpDstDone +brOpDstOdd anop + lda [brDst],y and #$F0 - ora >brDstNibPart + ora >brNib sta [brDst],y +brOpDstDone anop rep #$20 LONGA ON -brColAdvance anop lda >brCol inc a sta >brCol - brl brColLoop + brl brOpColLoop -brColDone anop -* Advance srcRow ptr by srcRowBytes. 
+brOpRowAdv anop clc - lda brSrc ; low 16 of srcRow + lda brSrc adc brSrcStride sta brSrc - bcc brSrcNoCarry - lda brSrc+2 ; bank/pad + bcc brOpSrcNC + lda brSrc+2 clc adc #1 sta brSrc+2 -brSrcNoCarry anop - -* Advance dstRow ptr by 160. +brOpSrcNC anop clc lda brDst adc #160 sta brDst - bcc brDstNoCarry + bcc brOpDstNC lda brDst+2 clc adc #1 sta brDst+2 -brDstNoCarry anop - +brOpDstNC anop lda >brRowsLeft dec a sta >brRowsLeft - brl brRowLoop + brl brOpaqueRowLoop + +***** MASKED MODE: skip dst write when src nibble equals brTransByte ***** +brMaskedRowLoop anop + lda >brRowsLeft + bne brMkRowGo + brl brExit +brMkRowGo anop + lda #0 + sta >brCol +brMkColLoop anop + lda >brCol + cmp brW + bcc brMkColGo + brl brMkRowAdv +brMkColGo anop + +* Src nib extract. + lda brSrcX + clc + adc >brCol + sta >brSx + lsr a + tay + sep #$20 + LONGA OFF + lda [brSrc],y + sta >brSrcByte + lda >brSx + lsr a + bcs brMkSrcOdd + lda >brSrcByte + lsr a + lsr a + lsr a + lsr a + bra brMkSrcDone +brMkSrcOdd anop + lda >brSrcByte + and #$0F +brMkSrcDone anop + sta >brNib + +* Transparency check (M=8): skip dst write if nib == brTransByte. + cmp >brTransByte + beq brMkSkipDst + +* Dst RMW. 
+ rep #$20 + LONGA ON + lda brDstX + clc + adc >brCol + sta >brDx + lsr a + tay + sep #$20 + LONGA OFF + lda >brDx + lsr a + bcs brMkDstOdd + lda >brNib + asl a + asl a + asl a + asl a + sta >brDstNibPart + lda [brDst],y + and #$0F + ora >brDstNibPart + sta [brDst],y + bra brMkDstDone +brMkDstOdd anop + lda [brDst],y + and #$F0 + ora >brNib + sta [brDst],y +brMkDstDone anop +brMkSkipDst anop + rep #$20 + LONGA ON + + lda >brCol + inc a + sta >brCol + brl brMkColLoop + +brMkRowAdv anop + clc + lda brSrc + adc brSrcStride + sta brSrc + bcc brMkSrcNC + lda brSrc+2 + clc + adc #1 + sta brSrc+2 +brMkSrcNC anop + clc + lda brDst + adc #160 + sta brDst + bcc brMkDstNC + lda brDst+2 + clc + adc #1 + sta brDst+2 +brMkDstNC anop + lda >brRowsLeft + dec a + sta >brRowsLeft + brl brMaskedRowLoop brExit anop LONGA OFF @@ -4309,3 +4382,6 @@ brDstByte data DRAWPRIMS brDstNibPart data DRAWPRIMS ds 2 end +brTransByte data DRAWPRIMS + ds 2 + end diff --git a/src/port/iigs/peislam.asm b/src/port/iigs/peislam.asm index 75a92ad..74fa672 100644 --- a/src/port/iigs/peislam.asm +++ b/src/port/iigs/peislam.asm @@ -1,76 +1,15 @@ -* peislam.asm - PEI-slam stage row to bank-$E1 SHR. +* peislam.asm - placeholder. * -* Implements the //e AUXWRITE + RAMRD + SHR-shadow trick that lets -* 65816 stack pushes (which are bank-$00-implicit) end up in bank -* $E1 SHR display memory: -* -* - SHR shadow temporarily ENABLED (clear $C035 bit 3) so writes -* to bank-$01 in $2000-$9FFF mirror to $E1 SHR. -* - AUXWRITE on (any write to $C005) so bank-$00 stack writes -* redirect to bank $01, then mirror to $E1 via shadow. -* - RAMRD on (any write to $C003) so PEI dp's bank-$00-implicit -* reads redirect to bank $01 = the stage source. -* - SEI for the duration: stack pointer is hijacked to point at -* $E1-mapped stack space, soft-switch state would corrupt any -* C code that tried to access bank-$00 globals. 
-* -* All scratch reads/writes within the slam use long-mode `>name` -* addressing (24-bit, explicit bank) so they bypass RAMRD redirect -* and reach the actual bank-$00 global storage. -* -* Calling convention: ORCA-C memory model 1 (large model, JSL/RTL). -* void peiSlamFullRow(int16_t y); -* - Caller PHAs y (2 bytes) before JSL. -* - JSL pushes 3-byte return address. -* - On entry: y_LO at SP+4, y_HI at SP+5 (SP points one below PCL). -* - Function preserves DBR; returns via RTL with original SP. -* - Caller pops the y arg after RTL. -* -* Per call: ~50 cyc bracket + 80 PEIs * 6 cyc = ~530 cyc, vs the -* memcpy/MVN fallback's 7 cyc/byte * 160 bytes = ~1120 cyc. +* The original PEI-slam-per-row helper was removed; its functionality +* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam +* with per-row dirty skip). This stub remains so the build's +* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load +* segment and the linker keeps the same segment-bank layout it had +* when peislam.asm was a real translation unit. keep PEISLAM case on -* The operand to START names the LOAD segment this object segment -* belongs to (per ORCA/M for IIgs manual, ch. 6 "Load Segments"). -* Object segments without an operand land in the unnamed "blank -* segment" -- which on AUDIO is _ROOT, the very segment whose 64 KB -* budget peislam.asm was busting. Naming a load segment forces the -* linker to put us in our own segment, which the GS/OS loader then -* allocates in its own bank. -peiSlamFullRow start IIGSASM -* MVN-based row copy. Replaces the PEI-stack-slam approach (which -* needs RAMRD/AUXWRITE/SHADOW soft-switches and is sensitive to -* DRAWDATA bank placement). MVN copies 160 bytes from the bank-$01 -* stage row to the matching bank-$E1 SHR row at ~7 cyc/byte; that's -* slower than PEI-slam but rock-solid. -* -* Args after PHP: y (int16) at SP+5..6. Compute rowOffset = $2000 -* + y*160. 
MVN $01,$E1 with X=Y=rowOffset, A=159 copies 160 bytes -* from $01:rowOffset to $E1:rowOffset. - php - rep #$30 ; M=16, X=16 - - lda 5,s ; y - asl a - asl a - asl a - asl a - asl a ; A = y << 5 = y*32 - sta >gPeiTempRowBase - asl a - asl a ; A = y << 7 = y*128 - clc - adc >gPeiTempRowBase ; A = y*160 - clc - adc #$2000 ; A = $2000 + y*160 = row offset - - tax ; X = source offset (bank $01) - tay ; Y = dest offset (bank $E1) - lda #159 ; count - 1 (MVN copies count+1 = 160 bytes) - mvn $01,$E1 - - plp +peislamStub start IIGSASM rtl end