Mass ASM optimization on IIgs.

This commit is contained in:
Scott Duensing 2026-04-30 17:04:08 -05:00
parent 065be89bff
commit 04a9550421
20 changed files with 1432 additions and 1154 deletions

View file

@ -204,7 +204,12 @@ int main(void) {
? (oldY + oldH)
: (backup.y + backup.height));
joeyWaitVBL();
// VBL wait removed -- the demo runs at the native compute speed
// of save+restore+draw+presentRect so we can SEE the sprite
// pipeline's actual throughput. Expect tearing on the ball
// since the present can land mid-scan; that's the cost of
// showing real frame rate. Add joeyWaitVBL() back here for
// tear-free 60 Hz motion.
stagePresentRect(unionX, unionY,
(uint16_t)(unionRight - unionX),
(uint16_t)(unionBottom - unionY));

View file

@ -112,4 +112,22 @@ bool joeyJoyDown(JoeyJoystickE js, JoeyJoyButtonE button);
bool joeyJoyPressed(JoeyJoystickE js, JoeyJoyButtonE button);
bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button);
// Re-enable joystick polling and recalibrate the resting (center)
// position. The IIgs port auto-disables polling after a short window
// of detecting no stick (saves ~3 ms/frame of busy-wait). It does NOT
// auto-re-probe -- the application must call this function to resume
// polling after plugging a stick in.
//
// The next poll after this call captures the stick's CURRENT raw
// position as the new center -- so the user must hold the stick
// centered when calling. Subsequent polls report position relative
// to that center; raw readings within `deadZone` units of the center
// clamp to 0 (use 0 to disable the dead zone).
//
// On platforms with truly digital sticks (Amiga / ST / DOS) the
// recalibration is a no-op -- those ports already report -1 / 0 / +1
// directly -- and `deadZone` is recorded but otherwise ignored. The
// per-port reset hooks on those platforms are empty, so the call is
// always safe there but has no further effect.
void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone);
#endif

View file

@ -18,6 +18,7 @@
#include "spriteInternal.h"
#include "surfaceInternal.h"
// Largest scratch buffer needed for any single emit call. 16 KB
// covers a 32x32 sprite even on 68k (the biggest mixed-RMW byte-
// emit at 16 bytes/byte * (16*17 dest bytes per shift) ~= 4.5 KB,
@ -157,6 +158,11 @@ bool spriteCompile(SpriteT *sp) {
#if defined(JOEYLIB_PLATFORM_IIGS)
// y*160 lookup. gRowOffsetLut is the 200-entry uint16_t table built
// once by iigsInitRowLut at halInit. Replaces ORCA-C's runtime
// multiply (a JSL into __mul16) with a single indexed long-mode read.
extern const uint16_t gRowOffsetLut[200];
// IIgs uses inline asm + a self-modifying call stub instead of a C
// function-pointer cast. The build uses ORCA-C large memory model
// (-b for sprite demos) so pointers are 24-bit and JSL works
@ -182,7 +188,18 @@ bool spriteCompile(SpriteT *sp) {
// Patched per call: byte 2 (destBank), bytes 6-7 (destOffset16),
// bytes 9-11 (target 24-bit). The compiled routine assumes
// M=8 / X=16 / Y=destOffset on entry; the stub arranges that.
//
// Stub bytes are split into two phases:
// 1. The 8 opcode bytes are written ONCE on first call (gDrawStubInited).
// 2. Of the 6 operand bytes, only those that actually changed since
// the previous call get re-stamped: destBank and fnAddr are cached
// and rarely change (per-shift / per-bank). destOffset is the only
// one that changes every call as the sprite moves. Net per-frame
// patching for the typical case drops from 14 stores to 2.
static unsigned char gSpriteCallStub[14];
static bool gDrawStubInited = false;
static uint8_t gDrawStubLastBank = 0xFF;
static uint32_t gDrawStubLastFnAddr = 0xFFFFFFFFul;
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
uint8_t shift;
@ -195,7 +212,7 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
uint8_t *destPtr;
uint8_t destBytes[4];
shift = (uint8_t)(x & 1);
destPtr = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
destPtr = &dst->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)x >> 1)];
memcpy(destBytes, &destPtr, 4);
destAddr = (uint32_t)destBytes[0]
| ((uint32_t)destBytes[1] << 8)
@ -208,20 +225,35 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
}
(void)destAddr;
gSpriteCallStub[ 0] = 0x8B;
gSpriteCallStub[ 1] = 0xA9;
gSpriteCallStub[ 2] = destBank;
gSpriteCallStub[ 3] = 0x48;
gSpriteCallStub[ 4] = 0xAB;
gSpriteCallStub[ 5] = 0xA0;
if (!gDrawStubInited) {
gSpriteCallStub[ 0] = 0x8B;
gSpriteCallStub[ 1] = 0xA9;
gSpriteCallStub[ 3] = 0x48;
gSpriteCallStub[ 4] = 0xAB;
gSpriteCallStub[ 5] = 0xA0;
gSpriteCallStub[ 8] = 0x22;
gSpriteCallStub[12] = 0xAB;
gSpriteCallStub[13] = 0x6B;
gDrawStubInited = true;
}
// destOffset always changes (sprite moves every frame).
gSpriteCallStub[ 6] = (unsigned char)(destOffset & 0xFFu);
gSpriteCallStub[ 7] = (unsigned char)((destOffset >> 8) & 0xFFu);
gSpriteCallStub[ 8] = 0x22;
gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu);
gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu);
gSpriteCallStub[12] = 0xAB;
gSpriteCallStub[13] = 0x6B;
// destBank only changes if the dst surface migrates banks (~never).
if (destBank != gDrawStubLastBank) {
gSpriteCallStub[ 2] = destBank;
gDrawStubLastBank = destBank;
}
// fnAddr changes only on shift parity flips or sprite swaps.
if (fnAddr != gDrawStubLastFnAddr) {
gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu);
gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu);
gDrawStubLastFnAddr = fnAddr;
}
// ORCA-C compiles this function under `longa on / longi on`
// (M=16, X=16) and emits the function epilogue assuming those
@ -259,7 +291,26 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
//
// For SAVE: X = screen lo, Y = backup lo
// For RESTORE: X = backup lo, Y = screen lo
static unsigned char gSpriteCopyStub[13];
//
// Two distinct stubs (one per op) instead of a shared one. Save and
// restore alternate every frame and they swap the X/Y meanings, so a
// shared stub forced a full re-stamp on every call. Per-op stubs let
// us cache: only the bytes that genuinely change frame-to-frame
// (typically just one of screenLo/backupLo as the sprite moves) get
// rewritten. Cuts per-call patching from 13 stores to 2 in the typical
// case (static backup buffer, stable shift parity).
static unsigned char gSpriteSaveStub[13];
static unsigned char gSpriteRestoreStub[13];
static bool gSaveStubInited = false;
static uint16_t gSaveStubLastXLo = 0xFFFFu;
static uint16_t gSaveStubLastYLo = 0xFFFFu;
static uint32_t gSaveStubLastFnAddr = 0xFFFFFFFFul;
static bool gRestoreStubInited = false;
static uint16_t gRestoreStubLastXLo = 0xFFFFu;
static uint16_t gRestoreStubLastYLo = 0xFFFFu;
static uint32_t gRestoreStubLastFnAddr= 0xFFFFFFFFul;
// patchMvnBanks stamps the destination and source bank operand bytes
@ -315,7 +366,7 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
heightPx = (uint16_t)(sp->heightTiles * 8);
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)];
screenPtr = (uint8_t *)&src->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)clippedX >> 1)];
splitPointer(screenPtr, &screenLo, &screenBank);
splitPointer(backup->bytes, &backupLo, &backupBank);
@ -331,28 +382,49 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_SAVE];
// Stub: X = screen (source), Y = backup (destination).
gSpriteCopyStub[ 0] = 0x8B;
gSpriteCopyStub[ 1] = 0xA2;
gSpriteCopyStub[ 2] = (unsigned char)(screenLo & 0xFFu);
gSpriteCopyStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu);
gSpriteCopyStub[ 4] = 0xA0;
gSpriteCopyStub[ 5] = (unsigned char)(backupLo & 0xFFu);
gSpriteCopyStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu);
gSpriteCopyStub[ 7] = 0x22;
gSpriteCopyStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
gSpriteCopyStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteCopyStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
gSpriteCopyStub[11] = 0xAB;
gSpriteCopyStub[12] = 0x6B;
if (!gSaveStubInited) {
gSpriteSaveStub[ 0] = 0x8B;
gSpriteSaveStub[ 1] = 0xA2;
gSpriteSaveStub[ 4] = 0xA0;
gSpriteSaveStub[ 7] = 0x22;
gSpriteSaveStub[11] = 0xAB;
gSpriteSaveStub[12] = 0x6B;
gSaveStubInited = true;
}
if (screenLo != gSaveStubLastXLo) {
gSpriteSaveStub[ 2] = (unsigned char)(screenLo & 0xFFu);
gSpriteSaveStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu);
gSaveStubLastXLo = screenLo;
}
if (backupLo != gSaveStubLastYLo) {
gSpriteSaveStub[ 5] = (unsigned char)(backupLo & 0xFFu);
gSpriteSaveStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu);
gSaveStubLastYLo = backupLo;
}
if (fnAddr != gSaveStubLastFnAddr) {
gSpriteSaveStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
gSpriteSaveStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteSaveStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
gSaveStubLastFnAddr = fnAddr;
}
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE];
patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank);
// Skip the 16+ MVN-bank rewrites if the dst/src bank pair is the
// same as last call. Screen and backup buffer banks are stable
// for essentially every frame past the first, so this short-
// circuits ~5000 cyc/frame on the ball demo.
if (sp->cachedDstBank[shift][SPRITE_OP_SAVE] != backupBank ||
sp->cachedSrcBank[shift][SPRITE_OP_SAVE] != screenBank) {
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE];
patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank);
sp->cachedDstBank[shift][SPRITE_OP_SAVE] = backupBank;
sp->cachedSrcBank[shift][SPRITE_OP_SAVE] = screenBank;
}
// MVN-based routine: needs M=16 / X=16; restore M=16 on exit
// matches ORCA-C `longa on` epilogue expectations.
asm {
rep #0x30
jsl gSpriteCopyStub
jsl gSpriteSaveStub
rep #0x30
}
}
@ -378,7 +450,7 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1;
screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];
screenPtr = (uint8_t *)&dst->pixels[gRowOffsetLut[(uint16_t)backup->y] + ((uint16_t)backup->x >> 1)];
splitPointer(screenPtr, &screenLo, &screenBank);
splitPointer(backup->bytes, &backupLo, &backupBank);
@ -387,26 +459,45 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_RESTORE];
// Stub: X = backup (source), Y = screen (destination).
gSpriteCopyStub[ 0] = 0x8B;
gSpriteCopyStub[ 1] = 0xA2;
gSpriteCopyStub[ 2] = (unsigned char)(backupLo & 0xFFu);
gSpriteCopyStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu);
gSpriteCopyStub[ 4] = 0xA0;
gSpriteCopyStub[ 5] = (unsigned char)(screenLo & 0xFFu);
gSpriteCopyStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu);
gSpriteCopyStub[ 7] = 0x22;
gSpriteCopyStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
gSpriteCopyStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteCopyStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
gSpriteCopyStub[11] = 0xAB;
gSpriteCopyStub[12] = 0x6B;
if (!gRestoreStubInited) {
gSpriteRestoreStub[ 0] = 0x8B;
gSpriteRestoreStub[ 1] = 0xA2;
gSpriteRestoreStub[ 4] = 0xA0;
gSpriteRestoreStub[ 7] = 0x22;
gSpriteRestoreStub[11] = 0xAB;
gSpriteRestoreStub[12] = 0x6B;
gRestoreStubInited = true;
}
if (backupLo != gRestoreStubLastXLo) {
gSpriteRestoreStub[ 2] = (unsigned char)(backupLo & 0xFFu);
gSpriteRestoreStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu);
gRestoreStubLastXLo = backupLo;
}
if (screenLo != gRestoreStubLastYLo) {
gSpriteRestoreStub[ 5] = (unsigned char)(screenLo & 0xFFu);
gSpriteRestoreStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu);
gRestoreStubLastYLo = screenLo;
}
if (fnAddr != gRestoreStubLastFnAddr) {
gSpriteRestoreStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
gSpriteRestoreStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
gSpriteRestoreStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
gRestoreStubLastFnAddr = fnAddr;
}
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE];
patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank);
// Same short-circuit as save: only re-stamp the bank operands if
// they actually changed since last call.
if (sp->cachedDstBank[shift][SPRITE_OP_RESTORE] != screenBank ||
sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] != backupBank) {
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE];
patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank);
sp->cachedDstBank[shift][SPRITE_OP_RESTORE] = screenBank;
sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] = backupBank;
}
asm {
rep #0x30
jsl gSpriteCopyStub
jsl gSpriteRestoreStub
rep #0x30
}
}

View file

@ -31,6 +31,16 @@
#include "spriteEmitter.h"
#include "spriteInternal.h"
// Pin the IIgs sprite codegen statics into their own load segment
// instead of letting them ride in _ROOT. _ROOT also collects every
// other unsegmented .c (init.c, sprite.c, present.c, the example
// main, ...), so growth in any of those can shift the linker's
// per-bank packing and orphan intra-file static refs (we hit this
// when DRAWPRIMS grew with the chunked PEI-slam: PATTERN's link
// reported "Unresolved reference: emitMvnCopyRoutine" purely from
// _ROOT crowding). A dedicated load segment isolates this file.
JOEYLIB_SEGMENT("SPRITECG")
// ----- Constants -----

View file

@ -184,4 +184,153 @@ bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX,
int16_t copyW, int16_t copyH, int16_t srcRowBytes,
uint16_t transparent);
#ifdef JOEYLIB_PLATFORM_IIGS
// =====================================================================
// IIgs direct-dispatch macros.
//
// The halFast* function declarations above are the cross-platform API.
// On IIgs, those wrappers were ~60-80 cyc/call of pure plumbing on top
// of the asm itself: wrapper prologue (PHB/PHD/TCD), redundant arg
// re-push for the inner JSL, then wrapper epilogue. The macros below
// take effect at preprocess time and inline the asm call at the call
// site, eliminating the wrapper layer entirely.
//
// Cross-platform code in src/core/*.c is unchanged -- it still calls
// halFastDrawPixel(...) etc. On IIgs the preprocessor swaps that for
// the macro expansion before ORCA-C compiles the file. The matching
// halFast* C definitions in src/port/iigs/hal.c are deleted, since
// nothing references them once the macros take effect.
//
// Macros use comma-expression form so they evaluate to a `bool` value
// (most halFast* return true on IIgs since the asm always succeeds).
// =====================================================================
extern void iigsDrawPixelInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble);
extern void iigsDrawLineInner (uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble);
extern void iigsDrawCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble);
extern void iigsFillCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
extern void iigsTileFillInner (uint8_t *dstRow0, uint16_t fillWord);
extern void iigsTileCopyInner (uint8_t *dstRow0, const uint8_t *srcRow0);
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent);
extern void iigsTilePasteInner (uint8_t *dstRow0, const uint8_t *srcTilePixels);
extern void iigsTileSnapInner (uint8_t *dstTilePixels, const uint8_t *srcRow0);
extern void iigsBlitRectInner (uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
extern void iigsFillRectInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;
// IIgs replacement for halFastDrawPixel: calls iigsDrawPixelInner
// directly on the surface's pixel buffer. x/y are cast to uint16_t and
// the color is masked to its low nibble. The expression always
// evaluates to true.
#undef halFastDrawPixel
#define halFastDrawPixel(_s, _x, _y, _c) \
(iigsDrawPixelInner((_s)->pixels, (uint16_t)(_x), (uint16_t)(_y), \
(uint16_t)((_c) & 0x0F)), \
true)
// IIgs replacement for halFastDrawLine: forwards both endpoints (cast
// to uint16_t) and the low nibble of the color to iigsDrawLineInner.
// The expression always evaluates to true.
#undef halFastDrawLine
#define halFastDrawLine(_s, _x0, _y0, _x1, _y1, _c) \
(iigsDrawLineInner((_s)->pixels, (uint16_t)(_x0), (uint16_t)(_y0), \
(uint16_t)(_x1), (uint16_t)(_y1), \
(uint16_t)((_c) & 0x0F)), \
true)
// IIgs replacement for halFastDrawCircle: forwards center (cast to
// uint16_t), radius and the low nibble of the color to
// iigsDrawCircleInner. Unlike the center coordinates, `_r` is passed
// without an explicit cast, so it relies on the prototype's implicit
// conversion to uint16_t. The expression always evaluates to true.
#undef halFastDrawCircle
#define halFastDrawCircle(_s, _cx, _cy, _r, _c) \
(iigsDrawCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy), \
(_r), (uint16_t)((_c) & 0x0F)), \
true)
// fillWord = doubled byte * $0101 = (nib*$11) * $101 = nib * $1111.
// Compile-time arithmetic when caller passes a constant; at most a
// single multiply when the nibble is variable (still cheaper than
// the wrapper's three sequential ORs / shifts).
//
// The asm path is taken only when `_s` is the stage surface
// (stageGet()); for any other surface the macro evaluates to false
// without drawing. `_s` appears twice in the expansion, so an argument
// with side effects would be evaluated twice.
#undef halFastFillCircle
#define halFastFillCircle(_s, _cx, _cy, _r, _c) \
((_s) == stageGet() \
? (iigsFillCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy), \
(_r), (uint16_t)(((_c) & 0x0F) * 0x1111)), \
true) \
: false)
// IIgs replacement for halFastSurfaceClear. Only the stage surface
// (stageGet()) takes the asm path; any other surface yields false.
// The fill word is `_d` replicated into both bytes of a uint16_t.
// `_s` and `_d` each appear twice in the expansion, so arguments with
// side effects would be evaluated twice.
#undef halFastSurfaceClear
#define halFastSurfaceClear(_s, _d) \
((_s) == stageGet() \
? (iigsSurfaceClearInner((_s)->pixels, \
(uint16_t)((uint16_t)(_d) | ((uint16_t)(_d) << 8))), \
true) \
: false)
// halFastFillRect stays as a real C wrapper -- removing it triggered
// an unrelated ORCA linker bank-placement failure (same mode as the
// peislam.asm deletion: `Unresolved reference Label:
// emitMvnCopyRoutine` in sprite codegen). The wrapper now just
// forwards to iigsFillRectInner (asm does partial+middle); we lose
// the call-site macro inlining for fillRect specifically but keep
// the rest of the macros AND the new asm helper. Per-call wrapper
// overhead for halFastFillRect is back (~80 cyc) but at least the
// per-row partial-byte logic happens in asm now.
// Most tile primitives below take caller-computed row pointers and
// just forward their args. halFastTileFill is the exception: it takes
// tile coords (bx, by) and computes the byte offset within the
// surface itself as bx*4 + by*8*160.
// IIgs replacement for halFastTileFill: takes tile coordinates
// (_bx, _by), computes the address of the tile's first row inside
// (_s)->pixels as _by * 8 * SURFACE_BYTES_PER_ROW + _bx * 4, and passes
// it with the fill word (forwarded uncast) to iigsTileFillInner.
// The expression always evaluates to true.
#undef halFastTileFill
#define halFastTileFill(_s, _bx, _by, _fw) \
(iigsTileFillInner(&(_s)->pixels[(uint16_t)(_by) * 8 * SURFACE_BYTES_PER_ROW \
+ (uint16_t)(_bx) * 4], \
(_fw)), \
true)
// Pointer-forwarding tile macros: each passes its destination and
// source pointers straight to the matching iigs*Inner routine and
// evaluates to true. halFastTileCopyMasked additionally casts the
// transparent value to uint16_t.
#undef halFastTileCopy
#define halFastTileCopy(_d, _s) (iigsTileCopyInner((_d), (_s)), true)
#undef halFastTileCopyMasked
#define halFastTileCopyMasked(_d, _s, _t) \
(iigsTileCopyMaskedInner((_d), (_s), (uint16_t)(_t)), true)
#undef halFastTilePaste
#define halFastTilePaste(_d, _s) (iigsTilePasteInner((_d), (_s)), true)
#undef halFastTileSnap
#define halFastTileSnap(_d, _s) (iigsTileSnapInner((_d), (_s)), true)
// IIgs replacement for halFastBlitRect: forwards the row pointers
// unchanged and casts the x offsets, width, height and source stride
// to uint16_t before calling iigsBlitRectInner. `_t` (transparent) is
// forwarded without an explicit cast. The expression always evaluates
// to true.
#undef halFastBlitRect
#define halFastBlitRect(_dr, _dx, _sr, _sx, _w, _h, _ss, _t) \
(iigsBlitRectInner((_dr), (uint16_t)(_dx), (_sr), (uint16_t)(_sx), \
(uint16_t)(_w), (uint16_t)(_h), \
(uint16_t)(_ss), (_t)), \
true)
// Tier 2/3 flood fallbacks always returned false on IIgs (the asm
// impls were deleted as unreachable). Macros to constant false so
// ORCA-C dead-code-eliminates the never-taken fallback branches in
// floodFillInternal.
//
// Note: the expansions drop every argument, so argument expressions
// are not evaluated at all on IIgs -- callers must not rely on side
// effects inside them.
#undef halFastFloodWalk
#define halFastFloodWalk(_row, _sx, _mc, _nc, _me, _sm, _lx, _rx) (false)
#undef halFastFloodScanRow
#define halFastFloodScanRow(_row, _lx, _rx, _mc, _nc, _me, _mb) (false)
#undef halFastFloodScanAndPush
#define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false)
// Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX /
// gFloodRightX; macro reads those into the caller's out-ptrs.
//
// Evaluation order is fixed by the comma operators: the asm call runs
// first, then *_smOut receives (gFloodSeedMatch != 0), *_lxOut and
// *_rxOut receive the left/right globals cast to int16_t, and the whole
// expression evaluates to true. Colors are masked to their low nibble
// and `_me` is normalized to 0/1 before the call.
#undef halFastFloodWalkAndScans
#define halFastFloodWalkAndScans(_pix, _x, _y, _mc, _nc, _me, _sx, _sy, _sp, _ms, _smOut, _lxOut, _rxOut) \
(iigsFloodWalkAndScansInner((_pix), (uint16_t)(_x), (uint16_t)(_y), \
(uint16_t)((_mc) & 0x0F), \
(uint16_t)((_nc) & 0x0F), \
(uint16_t)((_me) ? 1 : 0), \
(_sx), (_sy), \
(uint16_t *)(_sp), \
(uint16_t)(_ms)), \
*(_smOut) = (gFloodSeedMatch != 0), \
*(_lxOut) = (int16_t)gFloodLeftX, \
*(_rxOut) = (int16_t)gFloodRightX, \
true)
#endif /* JOEYLIB_PLATFORM_IIGS */
#endif

View file

@ -28,12 +28,31 @@ int8_t gJoyAxisX [JOYSTICK_COUNT];
int8_t gJoyAxisY [JOYSTICK_COUNT];
bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
uint8_t gJoyDeadZone [JOYSTICK_COUNT];
#ifdef JOEYLIB_PLATFORM_IIGS
extern void iigsInputSnapshot(void);
// Build-time check: iigsInputSnapshot's asm hard-codes KEY_COUNT=60
// and the small button counts. If a future change adds/removes keys
// or buttons the asm must be updated; each typedef below declares a
// negative-size (-1) array when its count no longer matches, which is
// a compile error.
typedef int joey_keycount_check[(KEY_COUNT == 60) ? 1 : -1];
typedef int joey_mousebtn_check[(MOUSE_BUTTON_COUNT == 4) ? 1 : -1];
typedef int joey_joybtn_check[(JOYSTICK_COUNT * JOY_BUTTON_COUNT == 4) ? 1 : -1];
#endif
// Per-frame input update. First preserves the previous frame's
// key / mouse-button / joystick-button state (so pressed/released edge
// queries can compare current vs. previous), then asks the port layer
// to sample fresh state via halInputPoll(). The snapshot must happen
// before halInputPoll() overwrites the current-state arrays.
void joeyInputPoll(void) {
#ifdef JOEYLIB_PLATFORM_IIGS
// One asm pass for: TTL decrement + key snapshot + mouse/joy
// button snapshots. Replaces 3 ORCA-C memcpys + the C TTL loop
// that used to live in halInputPoll. ~0.6 ms saved per frame.
iigsInputSnapshot();
#else
// Portable path: copy each current-state array into its *Prev twin.
memcpy(gKeyPrev, gKeyState, sizeof(gKeyState));
memcpy(gMouseButtonPrev, gMouseButtonState, sizeof(gMouseButtonState));
memcpy(gJoyButtonPrev, gJoyButtonState, sizeof(gJoyButtonState));
#endif
halInputPoll();
}
@ -170,3 +189,12 @@ bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button) {
}
return !gJoyButtonState[js][button] && gJoyButtonPrev[js][button];
}
// Records `deadZone` for stick `js` and then invokes the per-port
// reset hook for that stick. A stick index outside
// [0, JOYSTICK_COUNT) is silently ignored.
void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone) {
    int stick;

    stick = (int)js;
    if (stick >= 0 && stick < JOYSTICK_COUNT) {
        gJoyDeadZone[js] = deadZone;
        halJoystickReset(js);
    }
}

View file

@ -26,4 +26,12 @@ extern int8_t gJoyAxisY [JOYSTICK_COUNT];
extern bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
extern bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
// Per-stick analog calibration. Set by joeyJoystickReset on platforms
// with analog paddles (IIgs); ignored on digital-stick platforms.
extern uint8_t gJoyDeadZone [JOYSTICK_COUNT];
// Per-port hook: called from joeyJoystickReset to clear any auto-
// disconnect state and arm a fresh center capture on the next poll.
void halJoystickReset(JoeyJoystickE js);
#endif

View file

@ -37,4 +37,7 @@ void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) {
for (i = 1; i < SURFACE_COLORS_PER_PALETTE; i++) {
s->palette[paletteIndex][i] = colors16[i] & 0x0FFF;
}
if (s == stageGet()) {
gStagePaletteDirty = true;
}
}

View file

@ -4,6 +4,7 @@
// which of the 16 palettes that scanline uses at display time.
#include <stddef.h>
#include <string.h>
#include "joey/palette.h"
#include "surfaceInternal.h"
@ -26,6 +27,9 @@ void scbSet(SurfaceT *s, uint16_t line, uint8_t paletteIndex) {
return;
}
s->scb[line] = paletteIndex;
if (s == stageGet()) {
gStageScbDirty = true;
}
}
@ -51,7 +55,14 @@ void scbSetRange(SurfaceT *s, uint16_t firstLine, uint16_t lastLine, uint8_t pal
return;
}
for (line = firstLine; line <= last; line++) {
s->scb[line] = paletteIndex;
// memset is far cheaper than the per-iter loop on ORCA-C with -b
// (scb is uint8_t, sizeof(uint8_t)==1, so the call form below is
// exact). On IIgs ORCA-C lowers small fixed-size memsets to MVP /
// PEI tricks; on Amiga/ST/DOS it uses libc memset which is
// already vectorized. Either way, much tighter than the C loop.
(void)line;
memset(&s->scb[firstLine], paletteIndex, (size_t)(last - firstLine + 1));
if (s == stageGet()) {
gStageScbDirty = true;
}
}

View file

@ -175,6 +175,8 @@ SpriteT *spriteCreate(const uint8_t *tileData, uint8_t widthTiles, uint8_t heigh
sp->ownsTileData = false;
sp->slot = NULL;
memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets));
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
sp->flags = flags;
return sp;
}
@ -242,6 +244,8 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y,
sp->ownsTileData = true;
sp->slot = NULL;
memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets));
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
sp->flags = flags;
return sp;
}
@ -385,6 +389,8 @@ SpriteT *spriteFromCompiledMem(const uint8_t *data, uint32_t length, SpriteFlags
sp->ownsTileData = true;
sp->slot = slot;
sp->flags = flags;
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
return sp;
}

View file

@ -35,6 +35,16 @@ struct SpriteT {
uint16_t routineOffsets[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
SpriteFlagsE flags;
// Per-shift, per-op MVN bank-patch cache for IIgs save/restore.
// patchMvnBanks rewrites 16+ MVN bank operands each time it runs, but
// the banks themselves rarely change frame-to-frame (screen surface
// is fixed; backup buffer is allocated once). After the first
// patch, subsequent calls compare requested banks to the cache
// and skip the re-stamp loop. 0xFF means "never patched yet".
// 12 bytes per sprite. Unused on non-IIgs.
// NOTE(review): the cache describes the bytes of the emitted routine,
// so any path that re-emits or replaces that routine presumably has to
// reset these entries to 0xFF -- confirm against spriteCompile.
uint8_t cachedDstBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
uint8_t cachedSrcBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
};
// Compiled entry points. Implemented alongside spriteCompile in

View file

@ -10,6 +10,10 @@
#include "hal.h"
#include "surfaceInternal.h"
#ifdef JOEYLIB_PLATFORM_IIGS
extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord);
#endif
#define SURFACE_PALETTE_BYTES (SURFACE_PALETTE_ENTRIES * (uint32_t)sizeof(uint16_t))
#define SURFACE_FILE_BYTES (SURFACE_PIXELS_SIZE + SURFACE_HEIGHT + SURFACE_PALETTE_BYTES)
@ -25,8 +29,21 @@ static SurfaceT *gStage = NULL;
uint8_t gStageMinWord[SURFACE_HEIGHT];
uint8_t gStageMaxWord[SURFACE_HEIGHT];
// "Stage SCB / palette has changed since last present-side upload."
// Cheap flag check at present time replaces the 200+512 byte memcmps
// the IIgs port used to run every frame in halPresentRect's
// uploadScbAndPaletteIfNeeded -- ~7 ms / frame saved on demos that
// don't churn palette/SCB (i.e., almost all demos).
//
// Initially true so the first present uploads. scbSet*/paletteSet
// re-mark dirty when the stage's data changes; per-port present code
// clears the flag after consuming.
bool gStageScbDirty = true;
bool gStagePaletteDirty = true;
// ----- Internal helpers (alphabetical) -----
#ifndef JOEYLIB_PLATFORM_IIGS
static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) {
if (minWord < gStageMinWord[y]) {
gStageMinWord[y] = minWord;
@ -35,6 +52,7 @@ static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) {
gStageMaxWord[y] = maxWord;
}
}
#endif
// ----- Public API (alphabetical) -----
@ -169,10 +187,12 @@ void surfaceMarkDirtyAll(const SurfaceT *s) {
// the call is a no-op so primitives can call unconditionally without
// branching themselves.
void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h) {
int16_t row;
int16_t yEnd;
uint8_t minWord;
uint8_t maxWord;
#ifndef JOEYLIB_PLATFORM_IIGS
int16_t row;
#endif
if (s != gStage) {
return;
@ -183,9 +203,14 @@ void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, in
minWord = (uint8_t)(x >> 2);
maxWord = (uint8_t)((x + w - 1) >> 2);
yEnd = y + h;
#ifdef JOEYLIB_PLATFORM_IIGS
iigsMarkDirtyRowsInner((uint16_t)y, (uint16_t)yEnd,
(uint16_t)minWord, (uint16_t)maxWord);
#else
for (row = y; row < yEnd; row++) {
widenRow(row, minWord, maxWord);
}
#endif
}

View file

@ -38,6 +38,13 @@ struct SurfaceT {
extern uint8_t gStageMinWord[SURFACE_HEIGHT];
extern uint8_t gStageMaxWord[SURFACE_HEIGHT];
// Stage SCB / palette dirty flags. scbSet* and paletteSet set them
// true when the stage's data is modified; the per-port present code
// checks the flags and clears after upload. Replaces a per-frame
// 712-byte memcmp pair the IIgs port used to run unconditionally.
extern bool gStageScbDirty;
extern bool gStagePaletteDirty;
// Drawing primitives call this with their already-clipped destination
// rect. If `s` is the stage, the affected rows' [minWord, maxWord]
// bands are widened to cover the rect. If `s` is any other surface,

View file

@ -226,6 +226,12 @@ static void pollJoysticks(void) {
// ----- HAL API (alphabetical) -----
// Amiga implementation of the per-port joystick reset hook. Does
// nothing: the stick index is deliberately unused.
void halJoystickReset(JoeyJoystickE js) {
// Amiga sticks are digital -- no calibration to do.
(void)js;
}
void halInputInit(void) {
memset(gKeyState, 0, sizeof(gKeyState));
memset(gKeyPrev, 0, sizeof(gKeyPrev));

View file

@ -281,6 +281,12 @@ static long restoreIkbdVector(void) {
// ----- HAL API (alphabetical) -----
// Atari ST implementation of the per-port joystick reset hook. Does
// nothing: the stick index is deliberately unused.
void halJoystickReset(JoeyJoystickE js) {
// Atari ST sticks are digital -- no calibration to do.
(void)js;
}
void halInputInit(void) {
memset(gKeyState, 0, sizeof(gKeyState));
memset(gKeyPrev, 0, sizeof(gKeyPrev));

View file

@ -305,6 +305,12 @@ static void mousePoll(void) {
// ----- HAL API (alphabetical) -----
// DOS implementation of the per-port joystick reset hook. Does
// nothing: the stick index is deliberately unused.
// NOTE(review): PC game-port sticks are analog in hardware; confirm
// this port really reports digital -1 / 0 / +1 values before relying
// on "no calibration" here.
void halJoystickReset(JoeyJoystickE js) {
// DOS sticks are digital -- no calibration to do.
(void)js;
}
void halInputInit(void) {
memset(gKeyState, 0, sizeof(gKeyState));
memset(gKeyPrev, 0, sizeof(gKeyPrev));

View file

@ -42,10 +42,9 @@ JOEYLIB_SEGMENT("DRAWPRIMS")
// 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen.
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
// PEI-slam fill of `bytesPerRow` doubled bytes per row across `rows`
// rows, advancing 160 bytes per row. firstRow must be in bank $01.
// Caller handles partial-nibble edges in C; bytesPerRow is even.
extern void iigsFillRectStageInner(uint8_t *firstRow, uint16_t bytesPerRow, uint16_t rows, uint16_t fillWord);
// Full-fill asm helper (partial leading byte + middle MVN + partial
// trailing byte). Called by halFastFillRect below.
extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
// 16 STA abs,X stores at fixed offsets along a 160-byte stride.
// ~120 cyc per call.
extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord);
@ -72,26 +71,15 @@ extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint1
// Replaces ORCA-C's memcpy path which silently fails when called
// from halPresent (DBR-state quirk after prior asm primitives).
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);
// floodFill row walk: tests seed pixel and walks left/right to find
// the matching run. Writes results to gFloodSeedMatch / gFloodLeftX /
// gFloodRightX (DRAWPRIMS globals).
extern void iigsFloodWalkInner(uint8_t *row, uint16_t startX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual);
// floodFill walk results: written by iigsFloodWalkAndScansInner,
// read back by halFastFloodWalkAndScans.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;
// Per-pixel match scan over [leftX..rightX] of `row`. Writes 1/0 to
// markBuf[i] for each pixel. matchEqual selects boundary vs equal mode
// (see C srcPixel match logic).
extern void iigsFloodScanRowInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint8_t *markBuf);
// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque
// (always copy); else pixels with src nibble == (transparent & $0F)
// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW).
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
// Combined scan + push: matches each pixel, tracks run state, pushes
// (x, scanY) to the (stackX, stackY) arrays at *spInOut on every
// falling edge and at the end of the row if still in a run. *spInOut
// is read on entry and updated with the new top-of-stack on return.
extern void iigsFloodScanAndPushInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint16_t scanY, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
// Single-call per-popped-seed worker: seed test + walk-left + walk-right
// + scan-above + scan-below + push, all sharing cached row addr and
// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX.
@ -101,6 +89,12 @@ extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y,
// every asm primitive that needs row offset can do `lda >lut,x` instead
// of the 7-instruction shift-add.
extern void iigsInitRowLut(void);
// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial-
// screen presents (halPresentRect). srcOffset is the byte offset
// within bank $01 of the FIRST byte to copy on the FIRST row;
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
// ORCA-C memcpy's ~30 cyc/byte.
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
// Filled circle, scanline-style. fillWord low byte is the doubled
// nibble (e.g., 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
@ -155,14 +149,9 @@ static uint8_t gPreviousBorder = 0;
static uint8_t gPreviousShadow = 0;
static bool gModeSet = false;
// Last-uploaded SCB and palette. Both registers live in bank $E1; on a
// 2.8 MHz 65816 the 200+512-byte memcpy across the bank boundary is a
// real cost when it runs every present. Caching here lets the typical
// game loop (which mutates pixels but rarely SCB/palette) skip the
// upload entirely on clean frames.
static uint8_t gCachedScb [SURFACE_HEIGHT];
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
static bool gCacheValid = false;
// SCB / palette upload skipping is now driven by gStageScbDirty /
// gStagePaletteDirty (core/surface.c). The old per-frame memcmp-
// against-cached-copy approach was costing ~7 ms / frame on ORCA-C.
// PEI slam scratch. File-scope non-static so the asm can `ext` them;
// all accesses inside the slam use long-mode `>` addressing so they
@ -171,30 +160,30 @@ volatile uint16_t gPeiOrigSp;
volatile uint8_t gPeiOrigShadow;
volatile uint16_t gPeiTempRowBase;
volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked)
volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage)
// Defined in src/port/iigs/peislam.asm, in its own load segment
// (DRAWPRIMS) so the GS/OS loader places it in a different bank from
// AUDIO's _ROOT. PEI-slams the full 80 words of stage row `y` into
// the matching $E1 SHR row, ~530 cyc/row vs ~1120 cyc for memcpy/MVN.
extern void peiSlamFullRow(int16_t y);
// peislam.asm's per-row peiSlamFullRow helper is no longer wired in;
// the present pipeline now does its own PEI-slam loop inside
// iigsBlitStageToShr above (with dirty-row skip).
// Upload SCB and palette into bank-$E1 SHR memory only when they have
// changed since the last call. paletteOrScbChanged returns false when
// the cache is already in sync, in which case both memcpys to $E1 are
// skipped.
// Upload SCB / palette into bank-$E1 SHR memory only when the
// matching dirty flag is set. Replaces a per-frame 712-byte memcmp
// pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check.
// gStageScbDirty / gStagePaletteDirty live in core/surface.c; they
// start true (forces the very first present to upload), get set true
// again whenever scbSet* / paletteSet mutate the stage's data, and
// get cleared here after upload.
static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) {
if (gCacheValid
&& memcmp(gCachedScb, src->scb, sizeof(gCachedScb)) == 0
&& memcmp(gCachedPalette, src->palette, sizeof(gCachedPalette)) == 0) {
return;
if (gStageScbDirty) {
memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT);
gStageScbDirty = false;
}
if (gStagePaletteDirty) {
memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette));
gStagePaletteDirty = false;
}
memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT);
memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette));
memcpy(gCachedScb, src->scb, sizeof(gCachedScb));
memcpy(gCachedPalette, src->palette, sizeof(gCachedPalette));
gCacheValid = true;
}
@ -241,10 +230,9 @@ void halPresent(const SurfaceT *src) {
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
int16_t py;
int16_t yEnd;
uint16_t copyBytes;
int16_t byteStart;
uint16_t srcOffset;
if (src == NULL) {
return;
@ -257,13 +245,16 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1
// otherwise we include the byte containing the leftmost pixel.
byteStart = x >> 1;
copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart);
yEnd = y + (int16_t)h;
for (py = y; py < yEnd; py++) {
memcpy(&IIGS_SHR_PIXELS[py * SURFACE_BYTES_PER_ROW + byteStart],
&src->pixels[py * SURFACE_BYTES_PER_ROW + byteStart],
copyBytes);
if (copyBytes == 0 || h == 0) {
return;
}
// Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
// at $E1:2000 (same offset within their banks). srcOffset is the
// byte offset of the first byte to copy on the first row.
srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart);
iigsBlitRectStageToShr(srcOffset, copyBytes, h);
}
@ -277,249 +268,35 @@ void halShutdown(void) {
}
// Whole-surface clear fast path. Only the stage surface is eligible:
// a NULL surface or any surface other than stageGet() returns false.
// `doubled` is replicated into both bytes of a 16-bit fill word,
// which is handed to the asm helper along with the pixel buffer.
bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
    uint16_t pattern;

    // Short-circuit keeps stageGet() from being called for NULL.
    if (s == NULL || s != stageGet()) {
        return false;
    }

    pattern = (uint16_t)(((uint16_t)doubled << 8) | (uint16_t)doubled);
    iigsSurfaceClearInner(s->pixels, pattern);
    return true;
}
// halFastSurfaceClear / halFastDrawLine / halFastDrawCircle /
// halFastFillCircle / halFastTileCopy / halFastTileCopyMasked /
// halFastTilePaste / halFastTileSnap / halFastTileFill /
// halFastBlitRect / halFastFloodWalk[AndScans] /
// halFastFloodScanRow / halFastFloodScanAndPush all dispatch via
// macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block).
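// Only halFastFillRect remains a real function below -- not because
// of its partial-byte (nibble-edge) handling, which now lives in
// iigsFillRectInner, but to keep enough code mass in _ROOT for the
// ORCA linker (see the comment directly above the function).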
// halFastFillRect: thin wrapper around iigsFillRectInner. The asm
// helper now handles the partial-byte (nibble-edge) logic that used
// to live here, so this function is just a stage-check + forward.
// (It's not macro-dispatched like the others because removing it
// from the C side triggers an unrelated ORCA-linker bank-placement
// failure -- the binary needs enough mass in _ROOT to keep sprite
// codegen's static symbols at addresses the linker can resolve.)
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
int16_t pxStart;
int16_t pxEnd;
int16_t midStart;
int16_t midBytes;
int16_t trailingByte;
int16_t leadingByte;
bool hasLeading;
bool hasTrailing;
int16_t row;
uint8_t *line;
uint16_t fillWord;
uint8_t nibble;
uint8_t doubled;
if (s == NULL) {
if (s == NULL || s != stageGet()) {
return false;
}
if (s != stageGet()) {
return false;
}
pxStart = x;
pxEnd = (int16_t)(x + (int16_t)w);
leadingByte = (int16_t)(pxStart >> 1);
hasLeading = (pxStart & 1) != 0;
if (hasLeading) {
pxStart++;
}
midStart = (int16_t)(pxStart >> 1);
midBytes = (int16_t)((pxEnd - pxStart) >> 1);
hasTrailing = ((pxEnd - pxStart) & 1) != 0;
trailingByte = (int16_t)(midStart + midBytes);
if (midBytes <= 0) {
return false;
}
nibble = (uint8_t)(colorIndex & 0x0F);
doubled = (uint8_t)((nibble << 4) | nibble);
if (hasLeading || hasTrailing) {
for (row = 0; row < (int16_t)h; row++) {
line = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW];
if (hasLeading) {
line[leadingByte] = (uint8_t)((line[leadingByte] & 0xF0) | nibble);
}
if (hasTrailing) {
line[trailingByte] = (uint8_t)((line[trailingByte] & 0x0F) | (nibble << 4));
}
}
}
fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8));
line = &s->pixels[y * SURFACE_BYTES_PER_ROW + midStart];
iigsFillRectStageInner(line, (uint16_t)midBytes, h, fillWord);
return true;
}
// Tile copy fast path: hands both row pointers straight to the asm
// helper iigsTileCopyInner. No argument validation happens here and
// the function always reports success.
bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) {
iigsTileCopyInner(dstRow0, srcRow0);
return true;
}
// Masked tile copy fast path: forwards to iigsTileCopyMaskedInner,
// widening `transparent` to 16 bits for the asm helper. No argument
// validation happens here and the function always reports success.
bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) {
iigsTileCopyMaskedInner(dstRow0, srcRow0, (uint16_t)transparent);
return true;
}
// Tile paste fast path: forwards the destination row pointer and the
// source tile pixel pointer to iigsTilePasteInner. No argument
// validation happens here and the function always reports success.
bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) {
iigsTilePasteInner(dstRow0, srcTilePixels);
return true;
}
// Tile snapshot fast path: forwards the destination tile pixel
// pointer and the source row pointer to iigsTileSnapInner. No
// argument validation happens here and the function always reports
// success.
bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) {
iigsTileSnapInner(dstTilePixels, srcRow0);
return true;
}
// Single-pixel plot fast path. Returns false for a NULL surface;
// otherwise masks colorIndex to its low 4 bits and forwards the
// surface's pixel buffer plus x/y to iigsDrawPixelInner, returning
// true. x and y are passed through unchecked, and there is no
// stageGet() comparison in this function.
bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) {
if (s == NULL) {
return false;
}
iigsDrawPixelInner(s->pixels, x, y, (uint16_t)(colorIndex & 0x0F));
return true;
}
bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) {
if (s == NULL) {
return false;
}
iigsDrawLineInner(s->pixels,
(uint16_t)x0, (uint16_t)y0,
(uint16_t)x1, (uint16_t)y1,
iigsFillRectInner(s->pixels,
(uint16_t)x, (uint16_t)y,
(uint16_t)w, (uint16_t)h,
(uint16_t)(colorIndex & 0x0F));
return true;
}
// Circle outline fast path. Returns false for a NULL surface;
// otherwise forwards to iigsDrawCircleInner and returns true.
// The signed center coordinates are reinterpreted as uint16_t for the
// asm helper, and colorIndex is masked to its low 4 bits.
bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
if (s == NULL) {
return false;
}
iigsDrawCircleInner(s->pixels,
(uint16_t)cx, (uint16_t)cy, r,
(uint16_t)(colorIndex & 0x0F));
return true;
}
// Filled-circle fast path. Only the stage surface is eligible: NULL
// or a surface other than stageGet() returns false. The low nibble of
// colorIndex is duplicated into both nibbles of a byte, and that byte
// into both halves of the 16-bit fill word passed to the asm helper.
bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    uint8_t  lowNibble;
    uint8_t  pixelPair;
    uint16_t pattern;

    // Short-circuit keeps stageGet() from being called for NULL.
    if (s == NULL || s != stageGet()) {
        return false;
    }

    lowNibble = (uint8_t)(colorIndex & 0x0F);
    pixelPair = (uint8_t)(lowNibble | (lowNibble << 4));
    pattern   = (uint16_t)(((uint16_t)pixelPair << 8) | (uint16_t)pixelPair);

    iigsFillCircleInner(s->pixels, (uint16_t)cx, (uint16_t)cy, r, pattern);
    return true;
}
// Flood-fill row walk fast path. Returns false if any pointer
// argument is NULL. Otherwise calls iigsFloodWalkInner (colors masked
// to 4 bits, matchEqual normalized to 1/0) and reads the results back
// from gFloodSeedMatch / gFloodLeftX / gFloodRightX. *leftXOut and
// *rightXOut are written only when the seed matched.
bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
    uint16_t equalFlag;
    bool     hit;

    if (row == NULL) {
        return false;
    }
    if (seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) {
        return false;
    }

    equalFlag = (uint16_t)(matchEqual ? 1 : 0);
    iigsFloodWalkInner(row, (uint16_t)startX,
                       (uint16_t)(matchColor & 0x0F),
                       (uint16_t)(newColor & 0x0F),
                       equalFlag);

    hit = (gFloodSeedMatch != 0);
    *seedMatched = hit;
    if (hit) {
        *leftXOut  = (int16_t)gFloodLeftX;
        *rightXOut = (int16_t)gFloodRightX;
    }
    return true;
}
// Flood-fill row scan fast path. Returns false if `row` or `markBuf`
// is NULL; otherwise forwards to iigsFloodScanRowInner and returns
// true. Colors are masked to their low 4 bits, matchEqual is
// normalized to 1/0, and leftX/rightX are reinterpreted as uint16_t.
bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
if (row == NULL || markBuf == NULL) {
return false;
}
iigsFloodScanRowInner(row, (uint16_t)leftX, (uint16_t)rightX,
(uint16_t)(matchColor & 0x0F),
(uint16_t)(newColor & 0x0F),
(uint16_t)(matchEqual ? 1 : 0),
markBuf);
return true;
}
// Flood-fill combined scan + push fast path. Returns false if `row`,
// `stackX`, `stackY` or `spInOut` is NULL; otherwise forwards to
// iigsFloodScanAndPushInner and returns true. Colors are masked to
// their low 4 bits and matchEqual is normalized to 1/0.
bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) {
if (row == NULL || stackX == NULL || stackY == NULL || spInOut == NULL) {
return false;
}
iigsFloodScanAndPushInner(row,
(uint16_t)leftX, (uint16_t)rightX,
(uint16_t)(matchColor & 0x0F),
(uint16_t)(newColor & 0x0F),
(uint16_t)(matchEqual ? 1 : 0),
(uint16_t)scanY,
stackX, stackY,
// The asm helper takes the stack pointer slot as uint16_t *; the
// caller's int16_t storage is passed through the cast unchanged.
(uint16_t *)spInOut,
(uint16_t)maxSp);
return true;
}
// Flood-fill single-call worker fast path. Returns false if any
// pointer argument is NULL; otherwise calls
// iigsFloodWalkAndScansInner and copies its results out of
// gFloodSeedMatch / gFloodLeftX / gFloodRightX, returning true.
// *leftXOut and *rightXOut are written unconditionally, whether or
// not the seed matched.
bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
if (pixels == NULL || stackX == NULL || stackY == NULL || spInOut == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) {
return false;
}
iigsFloodWalkAndScansInner(pixels,
(uint16_t)x, (uint16_t)y,
(uint16_t)(matchColor & 0x0F),
(uint16_t)(newColor & 0x0F),
(uint16_t)(matchEqual ? 1 : 0),
stackX, stackY,
(uint16_t *)spInOut,
(uint16_t)maxSp);
*seedMatched = (gFloodSeedMatch != 0);
*leftXOut = (int16_t)gFloodLeftX;
*rightXOut = (int16_t)gFloodRightX;
return true;
}
// Rect blit fast path. Returns false when either row pointer is NULL
// or when copyW / copyH is zero or negative; otherwise forwards to
// iigsBlitRectInner and returns true. The signed coordinates, sizes
// and source stride are reinterpreted as uint16_t for the asm helper;
// `transparent` is passed through as-is.
bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
if (dstRow0 == NULL || srcRow0 == NULL || copyW <= 0 || copyH <= 0) {
return false;
}
iigsBlitRectInner(dstRow0, (uint16_t)dstX,
srcRow0, (uint16_t)srcX,
(uint16_t)copyW, (uint16_t)copyH,
(uint16_t)srcRowBytes,
transparent);
return true;
}
// Tile fill fast path. (bx, by) are tile coordinates; each tile is
// 8 pixels on a side, and the byte column is half the pixel column.
// Returns false for a NULL surface; otherwise hands the address of
// the tile's first row to iigsTileFillInner and returns true. The
// tile coordinates are not range-checked here.
bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) {
    uint16_t leftPixel;
    uint16_t topPixel;
    uint8_t *tileTop;

    if (s == NULL) {
        return false;
    }

    topPixel  = (uint16_t)((uint16_t)by * 8u);
    leftPixel = (uint16_t)((uint16_t)bx * 8u);
    tileTop   = &s->pixels[topPixel * SURFACE_BYTES_PER_ROW + (leftPixel >> 1)];

    iigsTileFillInner(tileTop, fillWord);
    return true;
}
// Returns the stage pixel buffer for this port. Nothing is allocated
// here: the function simply hands back IIGS_STAGE_PIXELS.
uint8_t *halStageAllocPixels(void) {
return IIGS_STAGE_PIXELS;
}

View file

@ -109,7 +109,11 @@ static int8_t thresholdPaddle(uint8_t v);
// does not accept designated initializers; runtime fill keeps lookup
// O(1) instead of a 40-plus-case switch.
static uint8_t gAsciiToKey[ASCII_TABLE_SIZE];
static uint8_t gKeyTtl [KEY_COUNT];
// Non-static so iigsInputSnapshot (joeyDraw.asm) can reference it via
// long-mode addressing through the linker. The C TTL-decrement loop
// that used to live in halInputPoll moved to that asm helper.
uint8_t gKeyTtl [KEY_COUNT];
static int16_t gMouseAbsX = SURFACE_WIDTH / 2;
static int16_t gMouseAbsY = SURFACE_HEIGHT / 2;
@ -166,9 +170,38 @@ static int8_t signExtend7(uint8_t raw) {
}
// Convert a raw 0..255 paddle reading into a signed axis value
// relative to `center` (the resting position captured after
// joeyJoystickReset). Readings whose distance from center is at most
// `deadZone` report 0; anything further reports the signed offset,
// limited to JOYSTICK_AXIS_MIN..JOYSTICK_AXIS_MAX.
// (Assumes JOYSTICK_AXIS_MIN <= 0 <= JOYSTICK_AXIS_MAX.)
static int8_t analogPaddle(uint8_t v, uint8_t center, uint8_t deadZone) {
    int16_t offset;
    int16_t distance;

    // Both operands are 0..255, so offset is in -255..255.
    offset   = (int16_t)v - (int16_t)center;
    distance = (offset < 0) ? (int16_t)(-offset) : offset;

    if (distance <= (int16_t)deadZone) {
        return 0;
    }
    if (offset < (int16_t)JOYSTICK_AXIS_MIN) {
        return JOYSTICK_AXIS_MIN;
    }
    if (offset > (int16_t)JOYSTICK_AXIS_MAX) {
        return JOYSTICK_AXIS_MAX;
    }
    return (int8_t)offset;
}
// Threshold a 0..255 paddle reading into a digital direction so the
// IIgs analog stick presents the same axis semantics as the digital
// sticks on ST/Amiga/DOS. Center range is treated as zero.
// sticks on ST/Amiga/DOS. Center range is treated as zero. Used
// before joeyJoystickReset has been called -- once the app calibrates,
// we switch to analogPaddle for finer control.
static int8_t thresholdPaddle(uint8_t v) {
if (v < PADDLE_LO_THRESHOLD) {
return JOYSTICK_AXIS_MIN;
@ -191,53 +224,122 @@ static int8_t thresholdPaddle(uint8_t v) {
// approximates the paddle's 0..255 position (the Apple firmware
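// PREAD routine works the same way). The timed reads themselves now
// live in iigsPollJoystickInner (joeyDraw.asm); pollJoystick below
// only consumes the results. (The earlier C version inlined the two
// reads because ORCA/C 2.1 trips over `volatile uint8_t *` function
// parameters.)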
// Auto-disconnect tracking. The paddle one-shot timer takes ~3 ms to
// charge at full deflection; if NO joystick is wired up, the BUSY bit
// stays set forever and the busy-wait runs the full PADDLE_TIMEOUT
// every frame -- ~3 ms wasted per frame on a stick that isn't there.
//
// After JOY_DISCONNECT_THRESHOLD consecutive timeouts we latch the
// stick as absent and stop polling entirely. The app calls
// joeyJoystickReset to clear the latch and resume polling.
#define JOY_DISCONNECT_THRESHOLD 60u
static uint16_t gJoyConsecutiveTimeouts = 0;
static bool gJoyDisconnectLatched = false;
// Analog calibration: gJoyCenterX/Y hold the raw paddle reading we
// captured the last time the user called joeyJoystickReset. Until
// that's called, gJoyCenterValid is false and pollJoystick falls back
// to the digital threshold mapping. gJoyRecalibrate is set by
// halJoystickReset and cleared on the next successful poll, which
// captures the new center.
static uint8_t gJoyCenterX [JOYSTICK_COUNT];
static uint8_t gJoyCenterY [JOYSTICK_COUNT];
static bool gJoyCenterValid [JOYSTICK_COUNT];
static bool gJoyRecalibrate [JOYSTICK_COUNT];
// Port-side half of joeyJoystickReset. For a valid stick index this
// clears the auto-disconnect counter and latch (so axis polling
// resumes) and arms a one-shot center capture for that stick, which
// the next poll that sees a resolved paddle will perform. Out-of-range
// stick values are ignored. The dead-zone value is not stored here;
// it lives in core's gJoyDeadZone[js].
void halJoystickReset(JoeyJoystickE js) {
    int stick;

    stick = (int)js;
    if (stick >= 0 && stick < JOYSTICK_COUNT) {
        gJoyConsecutiveTimeouts = 0;
        gJoyDisconnectLatched   = false;
        gJoyRecalibrate[stick]  = true;
    }
}
// Asm paddle reader (joeyDraw.asm). Switches CPU to 1 MHz for the
// duration of the poll so paddle counts match what every other
// IIgs/Apple II joystick game produces (the C busy-wait at 2.8 MHz
// inflated counts). Returns results via gJoy* DRAWPRIMS scratch.
extern void iigsPollJoystickInner(void);
extern volatile uint8_t gJoyPx;
extern volatile uint8_t gJoyPy;
extern volatile uint8_t gJoyResolved; // bit0: pdl0 fired; bit1: pdl1 fired
// Poll joystick 0. Buttons are sampled on every call. The axes come
// from the asm paddle reader (iigsPollJoystickInner) unless the stick
// has been latched as disconnected, in which case they are reported
// as centered and the paddle poll is skipped entirely.
//
// Results are written to gJoyButtonState / gJoyConnected / gJoyAxisX /
// gJoyAxisY. JOYSTICK_1 is always reported as not connected.
static void pollJoystick(void) {
    uint8_t px;
    uint8_t py;
    uint8_t resolvedFlags;
    bool    xResolved;
    bool    yResolved;

    // Buttons are I/O reads -- always cheap, do them every frame.
    gJoyButtonState[JOYSTICK_0][JOY_BUTTON_0] = (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0;
    gJoyButtonState[JOYSTICK_0][JOY_BUTTON_1] = (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0;
    gJoyConnected[JOYSTICK_1] = false;

    // Once the stick has been latched as disconnected, only buttons
    // get polled. The app must call joeyJoystickReset to resume axis
    // polling (e.g., when the user has just plugged in a stick).
    if (gJoyDisconnectLatched) {
        gJoyAxisX[JOYSTICK_0]     = 0;
        gJoyAxisY[JOYSTICK_0]     = 0;
        gJoyConnected[JOYSTICK_0] = false;
        return;
    }

    // Asm read at 1 MHz -- accurate paddle counts. The legacy C
    // busy-wait is intentionally gone: running it as well would spend
    // the full timeout the asm path exists to avoid, and its results
    // would be overwritten below anyway.
    iigsPollJoystickInner();
    px            = gJoyPx;
    py            = gJoyPy;
    resolvedFlags = gJoyResolved;
    xResolved     = (resolvedFlags & 0x01) != 0;   // bit0: pdl0 fired
    yResolved     = (resolvedFlags & 0x02) != 0;   // bit1: pdl1 fired
    gJoyConnected[JOYSTICK_0] = xResolved || yResolved;

    // Update auto-disconnect counter. Both axes failing => probably no
    // stick. One resolves => stick is present, reset the counter.
    if (!xResolved && !yResolved) {
        if (gJoyConsecutiveTimeouts < 0xFFFFu) {
            gJoyConsecutiveTimeouts++;
        }
        if (gJoyConsecutiveTimeouts >= JOY_DISCONNECT_THRESHOLD) {
            gJoyDisconnectLatched = true;
        }
        gJoyAxisX[JOYSTICK_0] = 0;
        gJoyAxisY[JOYSTICK_0] = 0;
        return;
    }
    gJoyConsecutiveTimeouts = 0;

    // Capture the resting position on recalibrate (one-shot).
    // NOTE(review): the capture is not gated per axis, so if only one
    // paddle resolved on this poll the other center is whatever the
    // asm reader left in gJoyPx / gJoyPy. That axis is reported as 0
    // below for as long as it stays unresolved; confirm whether the
    // capture should wait until both paddles resolve.
    if (gJoyRecalibrate[JOYSTICK_0]) {
        gJoyCenterX    [JOYSTICK_0] = px;
        gJoyCenterY    [JOYSTICK_0] = py;
        gJoyCenterValid[JOYSTICK_0] = true;
        gJoyRecalibrate[JOYSTICK_0] = false;
    }

    // Calibrated => analog axis report (offset from center, dead-zone
    // clamped). Uncalibrated => the legacy 3-state digital threshold,
    // matching how the stick behaved before joeyJoystickReset existed.
    //
    // Either way, a paddle that timed out reports a centered axis:
    // its raw count carries no position information, and mapping it
    // would turn "axis not wired" into a deflection.
    if (gJoyCenterValid[JOYSTICK_0]) {
        gJoyAxisX[JOYSTICK_0] = xResolved
            ? analogPaddle(px, gJoyCenterX[JOYSTICK_0], gJoyDeadZone[JOYSTICK_0])
            : 0;
        gJoyAxisY[JOYSTICK_0] = yResolved
            ? analogPaddle(py, gJoyCenterY[JOYSTICK_0], gJoyDeadZone[JOYSTICK_0])
            : 0;
    } else {
        gJoyAxisX[JOYSTICK_0] = xResolved ? thresholdPaddle(px) : 0;
        gJoyAxisY[JOYSTICK_0] = yResolved ? thresholdPaddle(py) : 0;
    }
}
@ -303,19 +405,14 @@ void halInputInit(void) {
void halInputPoll(void) {
uint8_t kbd;
uint8_t ascii;
uint8_t key;
uint16_t i;
uint8_t kbd;
uint8_t ascii;
uint8_t key;
for (i = 0; i < KEY_COUNT; i++) {
if (gKeyTtl[i] > 0) {
gKeyTtl[i]--;
if (gKeyTtl[i] == 0) {
gKeyState[i] = false;
}
}
}
// The KEY_COUNT TTL-decrement loop and the gKeyState/gKeyPrev/
// gMouseButtonPrev/gJoyButtonPrev snapshots all happen earlier in
// joeyInputPoll's call to iigsInputSnapshot (asm). We just read
// the live hardware state here.
kbd = *IIGS_KBD;
if (kbd & KBD_STROBE_BIT) {

File diff suppressed because it is too large Load diff

View file

@ -1,76 +1,15 @@
* peislam.asm - PEI-slam stage row to bank-$E1 SHR.
* peislam.asm - placeholder.
*
* Implements the //e AUXWRITE + RAMRD + SHR-shadow trick that lets
* 65816 stack pushes (which are bank-$00-implicit) end up in bank
* $E1 SHR display memory:
*
* - SHR shadow temporarily ENABLED (clear $C035 bit 3) so writes
* to bank-$01 in $2000-$9FFF mirror to $E1 SHR.
* - AUXWRITE on (any write to $C005) so bank-$00 stack writes
* redirect to bank $01, then mirror to $E1 via shadow.
* - RAMRD on (any write to $C003) so PEI dp's bank-$00-implicit
* reads redirect to bank $01 = the stage source.
* - SEI for the duration: stack pointer is hijacked to point at
* $E1-mapped stack space, soft-switch state would corrupt any
* C code that tried to access bank-$00 globals.
*
* All scratch reads/writes within the slam use long-mode `>name`
* addressing (24-bit, explicit bank) so they bypass RAMRD redirect
* and reach the actual bank-$00 global storage.
*
* Calling convention: ORCA-C memory model 1 (large model, JSL/RTL).
* void peiSlamFullRow(int16_t y);
* - Caller PHAs y (2 bytes) before JSL.
* - JSL pushes 3-byte return address.
* - On entry: y_LO at SP+4, y_HI at SP+5 (SP points one below PCL).
* - Function preserves DBR; returns via RTL with original SP.
* - Caller pops the y arg after RTL.
*
* Per call: ~50 cyc bracket + 80 PEIs * 6 cyc = ~530 cyc, vs the
* memcpy/MVN fallback's 7 cyc/byte * 160 bytes = ~1120 cyc.
* The original PEI-slam-per-row helper was removed; its functionality
* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam
* with per-row dirty skip). This stub remains so the build's
* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load
* segment and the linker keeps the same segment-bank layout it had
* when peislam.asm was a real translation unit.
keep PEISLAM
case on
* The operand to START names the LOAD segment this object segment
* belongs to (per ORCA/M for IIgs manual, ch. 6 "Load Segments").
* Object segments without an operand land in the unnamed "blank
* segment" -- which on AUDIO is _ROOT, the very segment whose 64 KB
* budget peislam.asm was busting. Naming a load segment forces the
* linker to put us in our own segment, which the GS/OS loader then
* allocates in its own bank.
peiSlamFullRow start IIGSASM
* MVN-based row copy. Replaces the PEI-stack-slam approach (which
* needs RAMRD/AUXWRITE/SHADOW soft-switches and is sensitive to
* DRAWDATA bank placement). MVN copies 160 bytes from the bank-$01
* stage row to the matching bank-$E1 SHR row at ~7 cyc/byte; that's
* slower than PEI-slam but rock-solid.
*
* Args after PHP: y (int16) at SP+5..6. Compute rowOffset = $2000
* + y*160. MVN $01,$E1 with X=Y=rowOffset, A=159 copies 160 bytes
* from $01:rowOffset to $E1:rowOffset.
php
rep #$30 ; M=16, X=16
lda 5,s ; y
asl a
asl a
asl a
asl a
asl a ; A = y << 5 = y*32
sta >gPeiTempRowBase
asl a
asl a ; A = y << 7 = y*128
clc
adc >gPeiTempRowBase ; A = y*160
clc
adc #$2000 ; A = $2000 + y*160 = row offset
tax ; X = source offset (bank $01)
tay ; Y = dest offset (bank $E1)
lda #159 ; count - 1 (MVN copies count+1 = 160 bytes)
mvn $01,$E1
plp
peislamStub start IIGSASM
rtl
end