Mass ASM optimization on IIgs.
This commit is contained in:
parent
065be89bff
commit
04a9550421
20 changed files with 1432 additions and 1154 deletions
|
|
@ -204,7 +204,12 @@ int main(void) {
|
|||
? (oldY + oldH)
|
||||
: (backup.y + backup.height));
|
||||
|
||||
joeyWaitVBL();
|
||||
// VBL wait removed -- the demo runs at the native compute speed
|
||||
// of save+restore+draw+presentRect so we can SEE the sprite
|
||||
// pipeline's actual throughput. Expect tearing on the ball
|
||||
// since the present can land mid-scan; that's the cost of
|
||||
// showing real frame rate. Add joeyWaitVBL() back here for
|
||||
// tear-free 60 Hz motion.
|
||||
stagePresentRect(unionX, unionY,
|
||||
(uint16_t)(unionRight - unionX),
|
||||
(uint16_t)(unionBottom - unionY));
|
||||
|
|
|
|||
|
|
@ -112,4 +112,22 @@ bool joeyJoyDown(JoeyJoystickE js, JoeyJoyButtonE button);
|
|||
bool joeyJoyPressed(JoeyJoystickE js, JoeyJoyButtonE button);
|
||||
bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button);
|
||||
|
||||
// Re-enable joystick polling and recalibrate the resting (center)
|
||||
// position. The IIgs port auto-disables polling after a short window
|
||||
// of detecting no stick (saves ~3 ms/frame of busy-wait). It does NOT
|
||||
// auto-re-probe -- the application must call this function to resume
|
||||
// polling after plugging a stick in.
|
||||
//
|
||||
// The next poll after this call captures the stick's CURRENT raw
|
||||
// position as the new center -- so the user must hold the stick
|
||||
// centered when calling. Subsequent polls report position relative
|
||||
// to that center; raw readings within `deadZone` units of the center
|
||||
// clamp to 0 (use 0 to disable the dead zone).
|
||||
//
|
||||
// On platforms with truly digital sticks (Amiga / ST / DOS) the
|
||||
// recalibration is a no-op -- those ports already report -1 / 0 / +1
|
||||
// directly -- and `deadZone` is ignored. The function still clears
|
||||
// any auto-disconnect state so polling resumes.
|
||||
void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@
|
|||
#include "spriteInternal.h"
|
||||
#include "surfaceInternal.h"
|
||||
|
||||
|
||||
// Largest scratch buffer needed for any single emit call. 16 KB
|
||||
// covers a 32x32 sprite even on 68k (the biggest mixed-RMW byte-
|
||||
// emit at 16 bytes/byte * (16*17 dest bytes per shift) ~= 4.5 KB,
|
||||
|
|
@ -157,6 +158,11 @@ bool spriteCompile(SpriteT *sp) {
|
|||
|
||||
#if defined(JOEYLIB_PLATFORM_IIGS)
|
||||
|
||||
// y*160 lookup. gRowOffsetLut is the 200-entry uint16_t table built
|
||||
// once by iigsInitRowLut at halInit. Replaces ORCA-C's runtime
|
||||
// multiply (a JSL into __mul16) with a single indexed long-mode read.
|
||||
extern const uint16_t gRowOffsetLut[200];
|
||||
|
||||
// IIgs uses inline asm + a self-modifying call stub instead of a C
|
||||
// function-pointer cast. The build uses ORCA-C large memory model
|
||||
// (-b for sprite demos) so pointers are 24-bit and JSL works
|
||||
|
|
@ -182,7 +188,18 @@ bool spriteCompile(SpriteT *sp) {
|
|||
// Patched per call: byte 2 (destBank), bytes 6-7 (destOffset16),
|
||||
// bytes 9-11 (target 24-bit). The compiled routine assumes
|
||||
// M=8 / X=16 / Y=destOffset on entry; the stub arranges that.
|
||||
//
|
||||
// Stub bytes are split into two phases:
|
||||
// 1. The 8 opcode bytes are written ONCE on first call (gDrawStubInited).
|
||||
// 2. Of the 6 operand bytes, only those that actually changed since
|
||||
// the previous call get re-stamped: destBank and fnAddr are cached
|
||||
// and rarely change (per-shift / per-bank). destOffset is the only
|
||||
// one that changes every call as the sprite moves. Net per-frame
|
||||
// patching for the typical case drops from 14 stores to 2.
|
||||
static unsigned char gSpriteCallStub[14];
|
||||
static bool gDrawStubInited = false;
|
||||
static uint8_t gDrawStubLastBank = 0xFF;
|
||||
static uint32_t gDrawStubLastFnAddr = 0xFFFFFFFFul;
|
||||
|
||||
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
|
||||
uint8_t shift;
|
||||
|
|
@ -195,7 +212,7 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
|
|||
uint8_t *destPtr;
|
||||
uint8_t destBytes[4];
|
||||
shift = (uint8_t)(x & 1);
|
||||
destPtr = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
|
||||
destPtr = &dst->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)x >> 1)];
|
||||
memcpy(destBytes, &destPtr, 4);
|
||||
destAddr = (uint32_t)destBytes[0]
|
||||
| ((uint32_t)destBytes[1] << 8)
|
||||
|
|
@ -208,20 +225,35 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
|
|||
}
|
||||
(void)destAddr;
|
||||
|
||||
gSpriteCallStub[ 0] = 0x8B;
|
||||
gSpriteCallStub[ 1] = 0xA9;
|
||||
gSpriteCallStub[ 2] = destBank;
|
||||
gSpriteCallStub[ 3] = 0x48;
|
||||
gSpriteCallStub[ 4] = 0xAB;
|
||||
gSpriteCallStub[ 5] = 0xA0;
|
||||
if (!gDrawStubInited) {
|
||||
gSpriteCallStub[ 0] = 0x8B;
|
||||
gSpriteCallStub[ 1] = 0xA9;
|
||||
gSpriteCallStub[ 3] = 0x48;
|
||||
gSpriteCallStub[ 4] = 0xAB;
|
||||
gSpriteCallStub[ 5] = 0xA0;
|
||||
gSpriteCallStub[ 8] = 0x22;
|
||||
gSpriteCallStub[12] = 0xAB;
|
||||
gSpriteCallStub[13] = 0x6B;
|
||||
gDrawStubInited = true;
|
||||
}
|
||||
|
||||
// destOffset always changes (sprite moves every frame).
|
||||
gSpriteCallStub[ 6] = (unsigned char)(destOffset & 0xFFu);
|
||||
gSpriteCallStub[ 7] = (unsigned char)((destOffset >> 8) & 0xFFu);
|
||||
gSpriteCallStub[ 8] = 0x22;
|
||||
gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu);
|
||||
gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu);
|
||||
gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu);
|
||||
gSpriteCallStub[12] = 0xAB;
|
||||
gSpriteCallStub[13] = 0x6B;
|
||||
|
||||
// destBank only changes if the dst surface migrates banks (~never).
|
||||
if (destBank != gDrawStubLastBank) {
|
||||
gSpriteCallStub[ 2] = destBank;
|
||||
gDrawStubLastBank = destBank;
|
||||
}
|
||||
|
||||
// fnAddr changes only on shift parity flips or sprite swaps.
|
||||
if (fnAddr != gDrawStubLastFnAddr) {
|
||||
gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu);
|
||||
gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu);
|
||||
gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu);
|
||||
gDrawStubLastFnAddr = fnAddr;
|
||||
}
|
||||
|
||||
// ORCA-C compiles this function under `longa on / longi on`
|
||||
// (M=16, X=16) and emits the function epilogue assuming those
|
||||
|
|
@ -259,7 +291,26 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
|
|||
//
|
||||
// For SAVE: X = screen lo, Y = backup lo
|
||||
// For RESTORE: X = backup lo, Y = screen lo
|
||||
static unsigned char gSpriteCopyStub[13];
|
||||
//
|
||||
// Two distinct stubs (one per op) instead of a shared one. Save and
|
||||
// restore alternate every frame and they swap the X/Y meanings, so a
|
||||
// shared stub forced a full re-stamp on every call. Per-op stubs let
|
||||
// us cache: only the bytes that genuinely change frame-to-frame
|
||||
// (typically just one of screenLo/backupLo as the sprite moves) get
|
||||
// rewritten. Cuts per-call patching from 13 stores to 2 in the typical
|
||||
// case (static backup buffer, stable shift parity).
|
||||
static unsigned char gSpriteSaveStub[13];
|
||||
static unsigned char gSpriteRestoreStub[13];
|
||||
|
||||
static bool gSaveStubInited = false;
|
||||
static uint16_t gSaveStubLastXLo = 0xFFFFu;
|
||||
static uint16_t gSaveStubLastYLo = 0xFFFFu;
|
||||
static uint32_t gSaveStubLastFnAddr = 0xFFFFFFFFul;
|
||||
|
||||
static bool gRestoreStubInited = false;
|
||||
static uint16_t gRestoreStubLastXLo = 0xFFFFu;
|
||||
static uint16_t gRestoreStubLastYLo = 0xFFFFu;
|
||||
static uint32_t gRestoreStubLastFnAddr= 0xFFFFFFFFul;
|
||||
|
||||
|
||||
// patchMvnBanks stamps the destination and source bank operand bytes
|
||||
|
|
@ -315,7 +366,7 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
|
|||
heightPx = (uint16_t)(sp->heightTiles * 8);
|
||||
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
|
||||
|
||||
screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)];
|
||||
screenPtr = (uint8_t *)&src->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)clippedX >> 1)];
|
||||
splitPointer(screenPtr, &screenLo, &screenBank);
|
||||
splitPointer(backup->bytes, &backupLo, &backupBank);
|
||||
|
||||
|
|
@ -331,28 +382,49 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
|
|||
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_SAVE];
|
||||
|
||||
// Stub: X = screen (source), Y = backup (destination).
|
||||
gSpriteCopyStub[ 0] = 0x8B;
|
||||
gSpriteCopyStub[ 1] = 0xA2;
|
||||
gSpriteCopyStub[ 2] = (unsigned char)(screenLo & 0xFFu);
|
||||
gSpriteCopyStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu);
|
||||
gSpriteCopyStub[ 4] = 0xA0;
|
||||
gSpriteCopyStub[ 5] = (unsigned char)(backupLo & 0xFFu);
|
||||
gSpriteCopyStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu);
|
||||
gSpriteCopyStub[ 7] = 0x22;
|
||||
gSpriteCopyStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
|
||||
gSpriteCopyStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
|
||||
gSpriteCopyStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
|
||||
gSpriteCopyStub[11] = 0xAB;
|
||||
gSpriteCopyStub[12] = 0x6B;
|
||||
if (!gSaveStubInited) {
|
||||
gSpriteSaveStub[ 0] = 0x8B;
|
||||
gSpriteSaveStub[ 1] = 0xA2;
|
||||
gSpriteSaveStub[ 4] = 0xA0;
|
||||
gSpriteSaveStub[ 7] = 0x22;
|
||||
gSpriteSaveStub[11] = 0xAB;
|
||||
gSpriteSaveStub[12] = 0x6B;
|
||||
gSaveStubInited = true;
|
||||
}
|
||||
if (screenLo != gSaveStubLastXLo) {
|
||||
gSpriteSaveStub[ 2] = (unsigned char)(screenLo & 0xFFu);
|
||||
gSpriteSaveStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu);
|
||||
gSaveStubLastXLo = screenLo;
|
||||
}
|
||||
if (backupLo != gSaveStubLastYLo) {
|
||||
gSpriteSaveStub[ 5] = (unsigned char)(backupLo & 0xFFu);
|
||||
gSpriteSaveStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu);
|
||||
gSaveStubLastYLo = backupLo;
|
||||
}
|
||||
if (fnAddr != gSaveStubLastFnAddr) {
|
||||
gSpriteSaveStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
|
||||
gSpriteSaveStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
|
||||
gSpriteSaveStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
|
||||
gSaveStubLastFnAddr = fnAddr;
|
||||
}
|
||||
|
||||
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE];
|
||||
patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank);
|
||||
// Skip the 16+ MVN-bank rewrites if the dst/src bank pair is the
|
||||
// same as last call. Screen and backup buffer banks are stable
|
||||
// for essentially every frame past the first, so this short-
|
||||
// circuits ~5000 cyc/frame on the ball demo.
|
||||
if (sp->cachedDstBank[shift][SPRITE_OP_SAVE] != backupBank ||
|
||||
sp->cachedSrcBank[shift][SPRITE_OP_SAVE] != screenBank) {
|
||||
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE];
|
||||
patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank);
|
||||
sp->cachedDstBank[shift][SPRITE_OP_SAVE] = backupBank;
|
||||
sp->cachedSrcBank[shift][SPRITE_OP_SAVE] = screenBank;
|
||||
}
|
||||
|
||||
// MVN-based routine: needs M=16 / X=16; restore M=16 on exit
|
||||
// matches ORCA-C `longa on` epilogue expectations.
|
||||
asm {
|
||||
rep #0x30
|
||||
jsl gSpriteCopyStub
|
||||
jsl gSpriteSaveStub
|
||||
rep #0x30
|
||||
}
|
||||
}
|
||||
|
|
@ -378,7 +450,7 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
|
|||
spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
|
||||
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1;
|
||||
|
||||
screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];
|
||||
screenPtr = (uint8_t *)&dst->pixels[gRowOffsetLut[(uint16_t)backup->y] + ((uint16_t)backup->x >> 1)];
|
||||
splitPointer(screenPtr, &screenLo, &screenBank);
|
||||
splitPointer(backup->bytes, &backupLo, &backupBank);
|
||||
|
||||
|
|
@ -387,26 +459,45 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
|
|||
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_RESTORE];
|
||||
|
||||
// Stub: X = backup (source), Y = screen (destination).
|
||||
gSpriteCopyStub[ 0] = 0x8B;
|
||||
gSpriteCopyStub[ 1] = 0xA2;
|
||||
gSpriteCopyStub[ 2] = (unsigned char)(backupLo & 0xFFu);
|
||||
gSpriteCopyStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu);
|
||||
gSpriteCopyStub[ 4] = 0xA0;
|
||||
gSpriteCopyStub[ 5] = (unsigned char)(screenLo & 0xFFu);
|
||||
gSpriteCopyStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu);
|
||||
gSpriteCopyStub[ 7] = 0x22;
|
||||
gSpriteCopyStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
|
||||
gSpriteCopyStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
|
||||
gSpriteCopyStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
|
||||
gSpriteCopyStub[11] = 0xAB;
|
||||
gSpriteCopyStub[12] = 0x6B;
|
||||
if (!gRestoreStubInited) {
|
||||
gSpriteRestoreStub[ 0] = 0x8B;
|
||||
gSpriteRestoreStub[ 1] = 0xA2;
|
||||
gSpriteRestoreStub[ 4] = 0xA0;
|
||||
gSpriteRestoreStub[ 7] = 0x22;
|
||||
gSpriteRestoreStub[11] = 0xAB;
|
||||
gSpriteRestoreStub[12] = 0x6B;
|
||||
gRestoreStubInited = true;
|
||||
}
|
||||
if (backupLo != gRestoreStubLastXLo) {
|
||||
gSpriteRestoreStub[ 2] = (unsigned char)(backupLo & 0xFFu);
|
||||
gSpriteRestoreStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu);
|
||||
gRestoreStubLastXLo = backupLo;
|
||||
}
|
||||
if (screenLo != gRestoreStubLastYLo) {
|
||||
gSpriteRestoreStub[ 5] = (unsigned char)(screenLo & 0xFFu);
|
||||
gSpriteRestoreStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu);
|
||||
gRestoreStubLastYLo = screenLo;
|
||||
}
|
||||
if (fnAddr != gRestoreStubLastFnAddr) {
|
||||
gSpriteRestoreStub[ 8] = (unsigned char)(fnAddr & 0xFFu);
|
||||
gSpriteRestoreStub[ 9] = (unsigned char)((fnAddr >> 8) & 0xFFu);
|
||||
gSpriteRestoreStub[10] = (unsigned char)((fnAddr >> 16) & 0xFFu);
|
||||
gRestoreStubLastFnAddr = fnAddr;
|
||||
}
|
||||
|
||||
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE];
|
||||
patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank);
|
||||
// Same short-circuit as save: only re-stamp the bank operands if
|
||||
// they actually changed since last call.
|
||||
if (sp->cachedDstBank[shift][SPRITE_OP_RESTORE] != screenBank ||
|
||||
sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] != backupBank) {
|
||||
routine = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE];
|
||||
patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank);
|
||||
sp->cachedDstBank[shift][SPRITE_OP_RESTORE] = screenBank;
|
||||
sp->cachedSrcBank[shift][SPRITE_OP_RESTORE] = backupBank;
|
||||
}
|
||||
|
||||
asm {
|
||||
rep #0x30
|
||||
jsl gSpriteCopyStub
|
||||
jsl gSpriteRestoreStub
|
||||
rep #0x30
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,6 +31,16 @@
|
|||
#include "spriteEmitter.h"
|
||||
#include "spriteInternal.h"
|
||||
|
||||
// Pin the IIgs sprite codegen statics into their own load segment
|
||||
// instead of letting them ride in _ROOT. _ROOT also collects every
|
||||
// other unsegmented .c (init.c, sprite.c, present.c, the example
|
||||
// main, ...), so growth in any of those can shift the linker's
|
||||
// per-bank packing and orphan intra-file static refs (we hit this
|
||||
// when DRAWPRIMS grew with the chunked PEI-slam: PATTERN's link
|
||||
// reported "Unresolved reference: emitMvnCopyRoutine" purely from
|
||||
// _ROOT crowding). A dedicated load segment isolates this file.
|
||||
JOEYLIB_SEGMENT("SPRITECG")
|
||||
|
||||
|
||||
// ----- Constants -----
|
||||
|
||||
|
|
|
|||
149
src/core/hal.h
149
src/core/hal.h
|
|
@ -184,4 +184,153 @@ bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX,
|
|||
int16_t copyW, int16_t copyH, int16_t srcRowBytes,
|
||||
uint16_t transparent);
|
||||
|
||||
|
||||
#ifdef JOEYLIB_PLATFORM_IIGS
|
||||
// =====================================================================
|
||||
// IIgs direct-dispatch macros.
|
||||
//
|
||||
// The halFast* function declarations above are the cross-platform API.
|
||||
// On IIgs, those wrappers were ~60-80 cyc/call of pure plumbing on top
|
||||
// of the asm itself: wrapper prologue (PHB/PHD/TCD), redundant arg
|
||||
// re-push for the inner JSL, then wrapper epilogue. The macros below
|
||||
// take effect at preprocess time and inline the asm call at the call
|
||||
// site, eliminating the wrapper layer entirely.
|
||||
//
|
||||
// Cross-platform code in src/core/*.c is unchanged -- it still calls
|
||||
// halFastDrawPixel(...) etc. On IIgs the preprocessor swaps that for
|
||||
// the macro expansion before ORCA-C compiles the file. The matching
|
||||
// halFast* C definitions in src/port/iigs/hal.c are deleted, since
|
||||
// nothing references them once the macros take effect.
|
||||
//
|
||||
// Macros use comma-expression form so they evaluate to a `bool` value
|
||||
// (most halFast* return true on IIgs since the asm always succeeds).
|
||||
// =====================================================================
|
||||
|
||||
extern void iigsDrawPixelInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble);
|
||||
extern void iigsDrawLineInner (uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble);
|
||||
extern void iigsDrawCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble);
|
||||
extern void iigsFillCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
|
||||
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
|
||||
extern void iigsTileFillInner (uint8_t *dstRow0, uint16_t fillWord);
|
||||
extern void iigsTileCopyInner (uint8_t *dstRow0, const uint8_t *srcRow0);
|
||||
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent);
|
||||
extern void iigsTilePasteInner (uint8_t *dstRow0, const uint8_t *srcTilePixels);
|
||||
extern void iigsTileSnapInner (uint8_t *dstTilePixels, const uint8_t *srcRow0);
|
||||
extern void iigsBlitRectInner (uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
|
||||
extern void iigsFillRectInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
|
||||
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
|
||||
extern uint16_t gFloodSeedMatch;
|
||||
extern uint16_t gFloodLeftX;
|
||||
extern uint16_t gFloodRightX;
|
||||
|
||||
#undef halFastDrawPixel
|
||||
#define halFastDrawPixel(_s, _x, _y, _c) \
|
||||
(iigsDrawPixelInner((_s)->pixels, (uint16_t)(_x), (uint16_t)(_y), \
|
||||
(uint16_t)((_c) & 0x0F)), \
|
||||
true)
|
||||
|
||||
#undef halFastDrawLine
|
||||
#define halFastDrawLine(_s, _x0, _y0, _x1, _y1, _c) \
|
||||
(iigsDrawLineInner((_s)->pixels, (uint16_t)(_x0), (uint16_t)(_y0), \
|
||||
(uint16_t)(_x1), (uint16_t)(_y1), \
|
||||
(uint16_t)((_c) & 0x0F)), \
|
||||
true)
|
||||
|
||||
#undef halFastDrawCircle
|
||||
#define halFastDrawCircle(_s, _cx, _cy, _r, _c) \
|
||||
(iigsDrawCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy), \
|
||||
(_r), (uint16_t)((_c) & 0x0F)), \
|
||||
true)
|
||||
|
||||
// halFastFillCircle: IIgs call-site replacement for the filled-circle
// fast path. When _s is the stage surface (as returned by stageGet())
// it calls iigsFillCircleInner and evaluates to true; for any other
// surface it draws nothing and evaluates to false.
//
// fillWord = doubled byte * $0101 = (nib*$11) * $101 = nib * $1111.
// Compile-time arithmetic when caller passes a constant; at most a
// single multiply when the nibble is variable (still cheaper than
// the wrapper's three sequential ORs / shifts).
//
// The multiplier is written 0x1111u so the product is formed in
// unsigned arithmetic. With a plain int constant and a 16-bit int,
// nibbles >= 8 produce a value above INT_MAX, which is signed
// overflow (undefined behavior) even though the result is cast to
// uint16_t afterwards.
//
// NOTE: _s is expanded twice, so it must be free of side effects.
#undef halFastFillCircle
#define halFastFillCircle(_s, _cx, _cy, _r, _c)                                 \
    ((_s) == stageGet()                                                         \
        ? (iigsFillCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy),  \
                               (uint16_t)(_r),                                  \
                               (uint16_t)(((_c) & 0x0F) * 0x1111u)),            \
           true)                                                                \
        : false)
|
||||
|
||||
#undef halFastSurfaceClear
|
||||
#define halFastSurfaceClear(_s, _d) \
|
||||
((_s) == stageGet() \
|
||||
? (iigsSurfaceClearInner((_s)->pixels, \
|
||||
(uint16_t)((uint16_t)(_d) | ((uint16_t)(_d) << 8))), \
|
||||
true) \
|
||||
: false)
|
||||
|
||||
// halFastFillRect stays as a real C wrapper -- removing it triggered
|
||||
// an unrelated ORCA linker bank-placement failure (same mode as the
|
||||
// peislam.asm deletion: `Unresolved reference Label:
|
||||
// emitMvnCopyRoutine` in sprite codegen). The wrapper now just
|
||||
// forwards to iigsFillRectInner (asm does partial+middle); we lose
|
||||
// the call-site macro inlining for fillRect specifically but keep
|
||||
// the rest of the macros AND the new asm helper. Per-call wrapper
|
||||
// overhead for halFastFillRect is back (~80 cyc) but at least the
|
||||
// per-row partial-byte logic happens in asm now.
|
||||
|
||||
// Tile primitives operate on caller-computed row pointers; just
|
||||
// forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte
|
||||
// offset within the surface.
|
||||
#undef halFastTileFill
|
||||
#define halFastTileFill(_s, _bx, _by, _fw) \
|
||||
(iigsTileFillInner(&(_s)->pixels[(uint16_t)(_by) * 8 * SURFACE_BYTES_PER_ROW \
|
||||
+ (uint16_t)(_bx) * 4], \
|
||||
(_fw)), \
|
||||
true)
|
||||
|
||||
#undef halFastTileCopy
|
||||
#define halFastTileCopy(_d, _s) (iigsTileCopyInner((_d), (_s)), true)
|
||||
|
||||
#undef halFastTileCopyMasked
|
||||
#define halFastTileCopyMasked(_d, _s, _t) \
|
||||
(iigsTileCopyMaskedInner((_d), (_s), (uint16_t)(_t)), true)
|
||||
|
||||
#undef halFastTilePaste
|
||||
#define halFastTilePaste(_d, _s) (iigsTilePasteInner((_d), (_s)), true)
|
||||
|
||||
#undef halFastTileSnap
|
||||
#define halFastTileSnap(_d, _s) (iigsTileSnapInner((_d), (_s)), true)
|
||||
|
||||
#undef halFastBlitRect
|
||||
#define halFastBlitRect(_dr, _dx, _sr, _sx, _w, _h, _ss, _t) \
|
||||
(iigsBlitRectInner((_dr), (uint16_t)(_dx), (_sr), (uint16_t)(_sx), \
|
||||
(uint16_t)(_w), (uint16_t)(_h), \
|
||||
(uint16_t)(_ss), (_t)), \
|
||||
true)
|
||||
|
||||
// Tier 2/3 flood fallbacks always returned false on IIgs (the asm
|
||||
// impls were deleted as unreachable). Macros to constant false so
|
||||
// ORCA-C dead-code-eliminates the never-taken fallback branches in
|
||||
// floodFillInternal.
|
||||
#undef halFastFloodWalk
|
||||
#define halFastFloodWalk(_row, _sx, _mc, _nc, _me, _sm, _lx, _rx) (false)
|
||||
|
||||
#undef halFastFloodScanRow
|
||||
#define halFastFloodScanRow(_row, _lx, _rx, _mc, _nc, _me, _mb) (false)
|
||||
|
||||
#undef halFastFloodScanAndPush
|
||||
#define halFastFloodScanAndPush(_row, _lx, _rx, _mc, _nc, _me, _sy, _sx, _syA, _sp, _ms) (false)
|
||||
|
||||
// Tier-1 flood: multi-output. Asm sets gFloodSeedMatch / gFloodLeftX /
|
||||
// gFloodRightX; macro reads those into the caller's out-ptrs.
|
||||
#undef halFastFloodWalkAndScans
|
||||
#define halFastFloodWalkAndScans(_pix, _x, _y, _mc, _nc, _me, _sx, _sy, _sp, _ms, _smOut, _lxOut, _rxOut) \
|
||||
(iigsFloodWalkAndScansInner((_pix), (uint16_t)(_x), (uint16_t)(_y), \
|
||||
(uint16_t)((_mc) & 0x0F), \
|
||||
(uint16_t)((_nc) & 0x0F), \
|
||||
(uint16_t)((_me) ? 1 : 0), \
|
||||
(_sx), (_sy), \
|
||||
(uint16_t *)(_sp), \
|
||||
(uint16_t)(_ms)), \
|
||||
*(_smOut) = (gFloodSeedMatch != 0), \
|
||||
*(_lxOut) = (int16_t)gFloodLeftX, \
|
||||
*(_rxOut) = (int16_t)gFloodRightX, \
|
||||
true)
|
||||
|
||||
#endif /* JOEYLIB_PLATFORM_IIGS */
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -28,12 +28,31 @@ int8_t gJoyAxisX [JOYSTICK_COUNT];
|
|||
int8_t gJoyAxisY [JOYSTICK_COUNT];
|
||||
bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
|
||||
bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
|
||||
uint8_t gJoyDeadZone [JOYSTICK_COUNT];
|
||||
|
||||
|
||||
#ifdef JOEYLIB_PLATFORM_IIGS
|
||||
extern void iigsInputSnapshot(void);
|
||||
// Build-time check: iigsInputSnapshot's asm hard-codes KEY_COUNT=60
|
||||
// and the small button counts. If a future change adds/removes keys
|
||||
// or buttons the asm must be updated; this declares a negative-size
|
||||
// array if the math no longer matches, which is a compile error.
|
||||
typedef int joey_keycount_check[(KEY_COUNT == 60) ? 1 : -1];
|
||||
typedef int joey_mousebtn_check[(MOUSE_BUTTON_COUNT == 4) ? 1 : -1];
|
||||
typedef int joey_joybtn_check[(JOYSTICK_COUNT * JOY_BUTTON_COUNT == 4) ? 1 : -1];
|
||||
#endif
|
||||
|
||||
void joeyInputPoll(void) {
|
||||
#ifdef JOEYLIB_PLATFORM_IIGS
|
||||
// One asm pass for: TTL decrement + key snapshot + mouse/joy
|
||||
// button snapshots. Replaces 3 ORCA-C memcpys + the C TTL loop
|
||||
// that used to live in halInputPoll. ~0.6 ms saved per frame.
|
||||
iigsInputSnapshot();
|
||||
#else
|
||||
memcpy(gKeyPrev, gKeyState, sizeof(gKeyState));
|
||||
memcpy(gMouseButtonPrev, gMouseButtonState, sizeof(gMouseButtonState));
|
||||
memcpy(gJoyButtonPrev, gJoyButtonState, sizeof(gJoyButtonState));
|
||||
#endif
|
||||
halInputPoll();
|
||||
}
|
||||
|
||||
|
|
@ -170,3 +189,12 @@ bool joeyJoyReleased(JoeyJoystickE js, JoeyJoyButtonE button) {
|
|||
}
|
||||
return !gJoyButtonState[js][button] && gJoyButtonPrev[js][button];
|
||||
}
|
||||
|
||||
|
||||
// Store the dead zone for stick `js` and invoke the per-port reset
// hook. An out-of-range `js` is ignored: nothing is stored and the
// hook is not called.
void joeyJoystickReset(JoeyJoystickE js, uint8_t deadZone) {
    int stick = (int)js;

    if (stick >= 0 && stick < JOYSTICK_COUNT) {
        gJoyDeadZone[js] = deadZone;
        halJoystickReset(js);
    }
}
|
||||
|
|
|
|||
|
|
@ -26,4 +26,12 @@ extern int8_t gJoyAxisY [JOYSTICK_COUNT];
|
|||
extern bool gJoyButtonState[JOYSTICK_COUNT][JOY_BUTTON_COUNT];
|
||||
extern bool gJoyButtonPrev [JOYSTICK_COUNT][JOY_BUTTON_COUNT];
|
||||
|
||||
// Per-stick analog calibration. Set by joeyJoystickReset on platforms
|
||||
// with analog paddles (IIgs); ignored on digital-stick platforms.
|
||||
extern uint8_t gJoyDeadZone [JOYSTICK_COUNT];
|
||||
|
||||
// Per-port hook: called from joeyJoystickReset to clear any auto-
|
||||
// disconnect state and arm a fresh center capture on the next poll.
|
||||
void halJoystickReset(JoeyJoystickE js);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -37,4 +37,7 @@ void paletteSet(SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16) {
|
|||
for (i = 1; i < SURFACE_COLORS_PER_PALETTE; i++) {
|
||||
s->palette[paletteIndex][i] = colors16[i] & 0x0FFF;
|
||||
}
|
||||
if (s == stageGet()) {
|
||||
gStagePaletteDirty = true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
// which of the 16 palettes that scanline uses at display time.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "joey/palette.h"
|
||||
#include "surfaceInternal.h"
|
||||
|
|
@ -26,6 +27,9 @@ void scbSet(SurfaceT *s, uint16_t line, uint8_t paletteIndex) {
|
|||
return;
|
||||
}
|
||||
s->scb[line] = paletteIndex;
|
||||
if (s == stageGet()) {
|
||||
gStageScbDirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -51,7 +55,14 @@ void scbSetRange(SurfaceT *s, uint16_t firstLine, uint16_t lastLine, uint8_t pal
|
|||
return;
|
||||
}
|
||||
|
||||
for (line = firstLine; line <= last; line++) {
|
||||
s->scb[line] = paletteIndex;
|
||||
// memset is far cheaper than the per-iter loop on ORCA-C with -b
|
||||
// (scb is uint8_t, sizeof(uint8_t)==1, so the call form below is
|
||||
// exact). On IIgs ORCA-C lowers small fixed-size memsets to MVP /
|
||||
// PEI tricks; on Amiga/ST/DOS it uses libc memset which is
|
||||
// already vectorized. Either way, much tighter than the C loop.
|
||||
(void)line;
|
||||
memset(&s->scb[firstLine], paletteIndex, (size_t)(last - firstLine + 1));
|
||||
if (s == stageGet()) {
|
||||
gStageScbDirty = true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -175,6 +175,8 @@ SpriteT *spriteCreate(const uint8_t *tileData, uint8_t widthTiles, uint8_t heigh
|
|||
sp->ownsTileData = false;
|
||||
sp->slot = NULL;
|
||||
memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets));
|
||||
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
|
||||
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
|
||||
sp->flags = flags;
|
||||
return sp;
|
||||
}
|
||||
|
|
@ -242,6 +244,8 @@ SpriteT *spriteCreateFromSurface(const SurfaceT *src, int16_t x, int16_t y,
|
|||
sp->ownsTileData = true;
|
||||
sp->slot = NULL;
|
||||
memset(sp->routineOffsets, 0, sizeof(sp->routineOffsets));
|
||||
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
|
||||
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
|
||||
sp->flags = flags;
|
||||
return sp;
|
||||
}
|
||||
|
|
@ -385,6 +389,8 @@ SpriteT *spriteFromCompiledMem(const uint8_t *data, uint32_t length, SpriteFlags
|
|||
sp->ownsTileData = true;
|
||||
sp->slot = slot;
|
||||
sp->flags = flags;
|
||||
memset(sp->cachedDstBank, 0xFF, sizeof(sp->cachedDstBank));
|
||||
memset(sp->cachedSrcBank, 0xFF, sizeof(sp->cachedSrcBank));
|
||||
return sp;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,16 @@ struct SpriteT {
|
|||
uint16_t routineOffsets[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
|
||||
|
||||
SpriteFlagsE flags;
|
||||
|
||||
// Per-shift, per-op MVN bank-patch cache for IIgs save/restore.
|
||||
// patchMvnBanks rewrites 16+ MVN bank operands every call, but the
|
||||
// banks themselves rarely change frame-to-frame (screen surface
|
||||
// is fixed; backup buffer is allocated once). After the first
|
||||
// patch, subsequent calls compare requested banks to the cache
|
||||
// and skip the re-stamp loop. 0xFF means "never patched yet".
|
||||
// 12 bytes per sprite. Unused on non-IIgs.
|
||||
uint8_t cachedDstBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
|
||||
uint8_t cachedSrcBank[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
|
||||
};
|
||||
|
||||
// Compiled entry points. Implemented alongside spriteCompile in
|
||||
|
|
|
|||
|
|
@ -10,6 +10,10 @@
|
|||
#include "hal.h"
|
||||
#include "surfaceInternal.h"
|
||||
|
||||
#ifdef JOEYLIB_PLATFORM_IIGS
|
||||
extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord);
|
||||
#endif
|
||||
|
||||
#define SURFACE_PALETTE_BYTES (SURFACE_PALETTE_ENTRIES * (uint32_t)sizeof(uint16_t))
|
||||
#define SURFACE_FILE_BYTES (SURFACE_PIXELS_SIZE + SURFACE_HEIGHT + SURFACE_PALETTE_BYTES)
|
||||
|
||||
|
|
@ -25,8 +29,21 @@ static SurfaceT *gStage = NULL;
|
|||
uint8_t gStageMinWord[SURFACE_HEIGHT];
|
||||
uint8_t gStageMaxWord[SURFACE_HEIGHT];
|
||||
|
||||
// "Stage SCB / palette has changed since last present-side upload."
|
||||
// Cheap flag check at present time replaces the 200+512 byte memcmps
|
||||
// the IIgs port used to run every frame in halPresentRect's
|
||||
// uploadScbAndPaletteIfNeeded -- ~7 ms / frame saved on demos that
|
||||
// don't churn palette/SCB (i.e., almost all demos).
|
||||
//
|
||||
// Initially true so the first present uploads. scbSet*/paletteSet
|
||||
// re-mark dirty when the stage's data changes; per-port present code
|
||||
// clears the flag after consuming.
|
||||
bool gStageScbDirty = true;
|
||||
bool gStagePaletteDirty = true;
|
||||
|
||||
// ----- Internal helpers (alphabetical) -----
|
||||
|
||||
#ifndef JOEYLIB_PLATFORM_IIGS
|
||||
static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) {
|
||||
if (minWord < gStageMinWord[y]) {
|
||||
gStageMinWord[y] = minWord;
|
||||
|
|
@ -35,6 +52,7 @@ static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) {
|
|||
gStageMaxWord[y] = maxWord;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// ----- Public API (alphabetical) -----
|
||||
|
||||
|
|
@ -169,10 +187,12 @@ void surfaceMarkDirtyAll(const SurfaceT *s) {
|
|||
// the call is a no-op so primitives can call unconditionally without
|
||||
// branching themselves.
|
||||
void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h) {
|
||||
int16_t row;
|
||||
int16_t yEnd;
|
||||
uint8_t minWord;
|
||||
uint8_t maxWord;
|
||||
#ifndef JOEYLIB_PLATFORM_IIGS
|
||||
int16_t row;
|
||||
#endif
|
||||
|
||||
if (s != gStage) {
|
||||
return;
|
||||
|
|
@ -183,9 +203,14 @@ void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, in
|
|||
minWord = (uint8_t)(x >> 2);
|
||||
maxWord = (uint8_t)((x + w - 1) >> 2);
|
||||
yEnd = y + h;
|
||||
#ifdef JOEYLIB_PLATFORM_IIGS
|
||||
iigsMarkDirtyRowsInner((uint16_t)y, (uint16_t)yEnd,
|
||||
(uint16_t)minWord, (uint16_t)maxWord);
|
||||
#else
|
||||
for (row = y; row < yEnd; row++) {
|
||||
widenRow(row, minWord, maxWord);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -38,6 +38,13 @@ struct SurfaceT {
|
|||
extern uint8_t gStageMinWord[SURFACE_HEIGHT];
|
||||
extern uint8_t gStageMaxWord[SURFACE_HEIGHT];
|
||||
|
||||
// Stage SCB / palette dirty flags. scbSet* and paletteSet set them
|
||||
// true when the stage's data is modified; the per-port present code
|
||||
// checks the flags and clears after upload. Replaces a per-frame
|
||||
// 712-byte memcmp pair the IIgs port used to run unconditionally.
|
||||
extern bool gStageScbDirty;
|
||||
extern bool gStagePaletteDirty;
|
||||
|
||||
// Drawing primitives call this with their already-clipped destination
|
||||
// rect. If `s` is the stage, the affected rows' [minWord, maxWord]
|
||||
// bands are widened to cover the rect. If `s` is any other surface,
|
||||
|
|
|
|||
|
|
@ -226,6 +226,12 @@ static void pollJoysticks(void) {
|
|||
|
||||
// ----- HAL API (alphabetical) -----
|
||||
|
||||
// HAL hook for resetting / recalibrating joystick `js`. Deliberately
// empty on this port: the function keeps no calibration state, so it
// only exists to satisfy the shared HAL interface.
void halJoystickReset(JoeyJoystickE js) {
|
||||
// Amiga sticks are digital -- no calibration to do.
|
||||
(void)js;  // parameter intentionally unused
|
||||
}
|
||||
|
||||
|
||||
void halInputInit(void) {
|
||||
memset(gKeyState, 0, sizeof(gKeyState));
|
||||
memset(gKeyPrev, 0, sizeof(gKeyPrev));
|
||||
|
|
|
|||
|
|
@ -281,6 +281,12 @@ static long restoreIkbdVector(void) {
|
|||
|
||||
// ----- HAL API (alphabetical) -----
|
||||
|
||||
// HAL hook for resetting / recalibrating joystick `js`. Deliberately
// empty on this port: the function keeps no calibration state, so it
// only exists to satisfy the shared HAL interface.
void halJoystickReset(JoeyJoystickE js) {
|
||||
// Atari ST sticks are digital -- no calibration to do.
|
||||
(void)js;  // parameter intentionally unused
|
||||
}
|
||||
|
||||
|
||||
void halInputInit(void) {
|
||||
memset(gKeyState, 0, sizeof(gKeyState));
|
||||
memset(gKeyPrev, 0, sizeof(gKeyPrev));
|
||||
|
|
|
|||
|
|
@ -305,6 +305,12 @@ static void mousePoll(void) {
|
|||
|
||||
// ----- HAL API (alphabetical) -----
|
||||
|
||||
// HAL hook for resetting / recalibrating joystick `js`. Deliberately
// empty on this port: the function keeps no calibration state, so it
// only exists to satisfy the shared HAL interface.
// NOTE(review): PC game-port sticks are analog at the hardware level;
// presumably this port already reduces them to -1 / 0 / +1 elsewhere,
// which is why nothing is recalibrated here -- confirm.
void halJoystickReset(JoeyJoystickE js) {
|
||||
// DOS sticks are digital -- no calibration to do.
|
||||
(void)js;  // parameter intentionally unused
|
||||
}
|
||||
|
||||
|
||||
void halInputInit(void) {
|
||||
memset(gKeyState, 0, sizeof(gKeyState));
|
||||
memset(gKeyPrev, 0, sizeof(gKeyPrev));
|
||||
|
|
|
|||
|
|
@ -42,10 +42,9 @@ JOEYLIB_SEGMENT("DRAWPRIMS")
|
|||
|
||||
// 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen.
|
||||
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
|
||||
// PEI-slam fill of `bytesPerRow` doubled bytes per row across `rows`
|
||||
// rows, advancing 160 bytes per row. firstRow must be in bank $01.
|
||||
// Caller handles partial-nibble edges in C; bytesPerRow is even.
|
||||
extern void iigsFillRectStageInner(uint8_t *firstRow, uint16_t bytesPerRow, uint16_t rows, uint16_t fillWord);
|
||||
// Full-fill asm helper (partial leading byte + middle MVN + partial
|
||||
// trailing byte). Called by halFastFillRect below.
|
||||
extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
|
||||
// 16 STA abs,X stores at fixed offsets along a 160-byte stride.
|
||||
// ~120 cyc per call.
|
||||
extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord);
|
||||
|
|
@ -72,26 +71,15 @@ extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint1
|
|||
// Replaces ORCA-C's memcpy path which silently fails when called
|
||||
// from halPresent (DBR-state quirk after prior asm primitives).
|
||||
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);
|
||||
// floodFill row walk: tests seed pixel and walks left/right to find
|
||||
// the matching run. Writes results to gFloodSeedMatch / gFloodLeftX /
|
||||
// gFloodRightX (DRAWPRIMS globals).
|
||||
extern void iigsFloodWalkInner(uint8_t *row, uint16_t startX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual);
|
||||
// floodFill walk results: written by iigsFloodWalkAndScansInner,
|
||||
// read back by halFastFloodWalkAndScans.
|
||||
extern uint16_t gFloodSeedMatch;
|
||||
extern uint16_t gFloodLeftX;
|
||||
extern uint16_t gFloodRightX;
|
||||
// Per-pixel match scan over [leftX..rightX] of `row`. Writes 1/0 to
|
||||
// markBuf[i] for each pixel. matchEqual selects boundary vs equal mode
|
||||
// (see C srcPixel match logic).
|
||||
extern void iigsFloodScanRowInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint8_t *markBuf);
|
||||
// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque
|
||||
// (always copy); else pixels with src nibble == (transparent & $0F)
|
||||
// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW).
|
||||
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
|
||||
// Combined scan + push: matches each pixel, tracks run state, pushes
|
||||
// (x, scanY) to the (stackX, stackY) arrays at *spInOut on every
|
||||
// falling edge and at the end of the row if still in a run. *spInOut
|
||||
// is read on entry and updated with the new top-of-stack on return.
|
||||
extern void iigsFloodScanAndPushInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint16_t scanY, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
|
||||
// Single-call per-popped-seed worker: seed test + walk-left + walk-right
|
||||
// + scan-above + scan-below + push, all sharing cached row addr and
|
||||
// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX.
|
||||
|
|
@ -101,6 +89,12 @@ extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y,
|
|||
// every asm primitive that needs row offset can do `lda >lut,x` instead
|
||||
// of the 7-instruction shift-add.
|
||||
extern void iigsInitRowLut(void);
|
||||
// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial-
|
||||
// screen presents (halPresentRect). srcOffset is the byte offset
|
||||
// within bank $01 of the FIRST byte to copy on the FIRST row;
|
||||
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
|
||||
// ORCA-C memcpy's ~30 cyc/byte.
|
||||
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
|
||||
// Filled circle, scanline-style. fillWord low byte is the doubled
|
||||
// nibble (e.g., 0x33 for nibble 3).
|
||||
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
|
||||
|
|
@ -155,14 +149,9 @@ static uint8_t gPreviousBorder = 0;
|
|||
static uint8_t gPreviousShadow = 0;
|
||||
static bool gModeSet = false;
|
||||
|
||||
// Last-uploaded SCB and palette. Both registers live in bank $E1; on a
|
||||
// 2.8 MHz 65816 the 200+512-byte memcpy across the bank boundary is a
|
||||
// real cost when it runs every present. Caching here lets the typical
|
||||
// game loop (which mutates pixels but rarely SCB/palette) skip the
|
||||
// upload entirely on clean frames.
|
||||
static uint8_t gCachedScb [SURFACE_HEIGHT];
|
||||
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
|
||||
static bool gCacheValid = false;
|
||||
// SCB / palette upload skipping is now driven by gStageScbDirty /
|
||||
// gStagePaletteDirty (core/surface.c). The old per-frame memcmp-
|
||||
// against-cached-copy approach was costing ~7 ms / frame on ORCA-C.
|
||||
|
||||
// PEI slam scratch. File-scope non-static so the asm can `ext` them;
|
||||
// all accesses inside the slam use long-mode `>` addressing so they
|
||||
|
|
@ -171,30 +160,30 @@ volatile uint16_t gPeiOrigSp;
|
|||
volatile uint8_t gPeiOrigShadow;
|
||||
volatile uint16_t gPeiTempRowBase;
|
||||
volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked)
|
||||
volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage)
|
||||
|
||||
// Defined in src/port/iigs/peislam.asm, in its own load segment
|
||||
// (DRAWPRIMS) so the GS/OS loader places it in a different bank from
|
||||
// AUDIO's _ROOT. PEI-slams the full 80 words of stage row `y` into
|
||||
// the matching $E1 SHR row, ~530 cyc/row vs ~1120 cyc for memcpy/MVN.
|
||||
extern void peiSlamFullRow(int16_t y);
|
||||
// peislam.asm's per-row peiSlamFullRow helper is no longer wired in;
|
||||
// the present pipeline now does its own PEI-slam loop inside
|
||||
// iigsBlitStageToShr above (with dirty-row skip).
|
||||
|
||||
|
||||
|
||||
// Upload SCB and palette into bank-$E1 SHR memory only when they have
|
||||
// changed since the last call. paletteOrScbChanged returns false when
|
||||
// the cache is already in sync, in which case both memcpys to $E1 are
|
||||
// skipped.
|
||||
// Upload SCB / palette into bank-$E1 SHR memory only when the
|
||||
// matching dirty flag is set. Replaces a per-frame 712-byte memcmp
|
||||
// pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check.
|
||||
// gStageScbDirty / gStagePaletteDirty live in core/surface.c; they
|
||||
// start true (forces the very first present to upload), get set true
|
||||
// again whenever scbSet* / paletteSet mutate the stage's data, and
|
||||
// get cleared here after upload.
|
||||
static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) {
|
||||
if (gCacheValid
|
||||
&& memcmp(gCachedScb, src->scb, sizeof(gCachedScb)) == 0
|
||||
&& memcmp(gCachedPalette, src->palette, sizeof(gCachedPalette)) == 0) {
|
||||
return;
|
||||
if (gStageScbDirty) {
|
||||
memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT);
|
||||
gStageScbDirty = false;
|
||||
}
|
||||
if (gStagePaletteDirty) {
|
||||
memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette));
|
||||
gStagePaletteDirty = false;
|
||||
}
|
||||
memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT);
|
||||
memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette));
|
||||
memcpy(gCachedScb, src->scb, sizeof(gCachedScb));
|
||||
memcpy(gCachedPalette, src->palette, sizeof(gCachedPalette));
|
||||
gCacheValid = true;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -241,10 +230,9 @@ void halPresent(const SurfaceT *src) {
|
|||
|
||||
|
||||
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
|
||||
int16_t py;
|
||||
int16_t yEnd;
|
||||
uint16_t copyBytes;
|
||||
int16_t byteStart;
|
||||
uint16_t srcOffset;
|
||||
|
||||
if (src == NULL) {
|
||||
return;
|
||||
|
|
@ -257,13 +245,16 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1
|
|||
// otherwise we include the byte containing the leftmost pixel.
|
||||
byteStart = x >> 1;
|
||||
copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart);
|
||||
yEnd = y + (int16_t)h;
|
||||
|
||||
for (py = y; py < yEnd; py++) {
|
||||
memcpy(&IIGS_SHR_PIXELS[py * SURFACE_BYTES_PER_ROW + byteStart],
|
||||
&src->pixels[py * SURFACE_BYTES_PER_ROW + byteStart],
|
||||
copyBytes);
|
||||
if (copyBytes == 0 || h == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
|
||||
// at $E1:2000 (same offset within their banks). srcOffset is the
|
||||
// byte offset of the first byte to copy on the first row.
|
||||
srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart);
|
||||
iigsBlitRectStageToShr(srcOffset, copyBytes, h);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -277,249 +268,35 @@ void halShutdown(void) {
|
|||
}
|
||||
|
||||
|
||||
// Fast full-surface clear. Only the stage surface is handled: returns
// false (caller must fall back to its generic path) when `s` is NULL
// or is not the surface returned by stageGet(). Otherwise replicates
// the byte `doubled` into both halves of a 16-bit fill word, hands it
// to the asm helper iigsSurfaceClearInner, and returns true.
bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) {
    uint16_t pattern;

    // Short-circuit keeps stageGet() from being called for a NULL s.
    if (s == NULL || s != stageGet()) {
        return false;
    }

    pattern = (uint16_t)doubled;
    pattern = (uint16_t)(pattern | (uint16_t)(pattern << 8));
    iigsSurfaceClearInner(s->pixels, pattern);
    return true;
}
|
||||
// halFastSurfaceClear / halFastDrawLine / halFastDrawCircle /
|
||||
// halFastFillCircle / halFastTileCopy / halFastTileCopyMasked /
|
||||
// halFastTilePaste / halFastTileSnap / halFastTileFill /
|
||||
// halFastBlitRect / halFastFloodWalk[AndScans] /
|
||||
// halFastFloodScanRow / halFastFloodScanAndPush all dispatch via
|
||||
// macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block).
|
||||
// Only halFastFillRect remains a real function below. Its partial-byte
|
||||
// (nibble-edge) handling now lives in iigsFillRectInner; the C wrapper
|
||||
// is kept only for the linker-layout reason given in its own comment.
|
||||
|
||||
|
||||
// halFastFillRect: thin wrapper around iigsFillRectInner. The asm
|
||||
// helper now handles the partial-byte (nibble-edge) logic that used
|
||||
// to live here, so this function is just a stage-check + forward.
|
||||
// (It's not macro-dispatched like the others because removing it
|
||||
// from the C side triggers an unrelated ORCA-linker bank-placement
|
||||
// failure -- the binary needs enough mass in _ROOT to keep sprite
|
||||
// codegen's static symbols at addresses the linker can resolve.)
|
||||
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
|
||||
int16_t pxStart;
|
||||
int16_t pxEnd;
|
||||
int16_t midStart;
|
||||
int16_t midBytes;
|
||||
int16_t trailingByte;
|
||||
int16_t leadingByte;
|
||||
bool hasLeading;
|
||||
bool hasTrailing;
|
||||
int16_t row;
|
||||
uint8_t *line;
|
||||
uint16_t fillWord;
|
||||
uint8_t nibble;
|
||||
uint8_t doubled;
|
||||
|
||||
if (s == NULL) {
|
||||
if (s == NULL || s != stageGet()) {
|
||||
return false;
|
||||
}
|
||||
if (s != stageGet()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
pxStart = x;
|
||||
pxEnd = (int16_t)(x + (int16_t)w);
|
||||
leadingByte = (int16_t)(pxStart >> 1);
|
||||
hasLeading = (pxStart & 1) != 0;
|
||||
if (hasLeading) {
|
||||
pxStart++;
|
||||
}
|
||||
midStart = (int16_t)(pxStart >> 1);
|
||||
midBytes = (int16_t)((pxEnd - pxStart) >> 1);
|
||||
hasTrailing = ((pxEnd - pxStart) & 1) != 0;
|
||||
trailingByte = (int16_t)(midStart + midBytes);
|
||||
|
||||
if (midBytes <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
nibble = (uint8_t)(colorIndex & 0x0F);
|
||||
doubled = (uint8_t)((nibble << 4) | nibble);
|
||||
|
||||
if (hasLeading || hasTrailing) {
|
||||
for (row = 0; row < (int16_t)h; row++) {
|
||||
line = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW];
|
||||
if (hasLeading) {
|
||||
line[leadingByte] = (uint8_t)((line[leadingByte] & 0xF0) | nibble);
|
||||
}
|
||||
if (hasTrailing) {
|
||||
line[trailingByte] = (uint8_t)((line[trailingByte] & 0x0F) | (nibble << 4));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8));
|
||||
line = &s->pixels[y * SURFACE_BYTES_PER_ROW + midStart];
|
||||
iigsFillRectStageInner(line, (uint16_t)midBytes, h, fillWord);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path tile copy. Forwards both row pointers unchanged to the asm
// helper iigsTileCopyInner and always returns true; no argument
// validation is done here, so callers must pass valid pointers.
bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) {
|
||||
iigsTileCopyInner(dstRow0, srcRow0);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path masked tile copy. Forwards to the asm helper
// iigsTileCopyMaskedInner with `transparent` widened to 16 bits, and
// always returns true. No argument validation is done here.
bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) {
|
||||
iigsTileCopyMaskedInner(dstRow0, srcRow0, (uint16_t)transparent);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path tile paste. Forwards the destination row pointer and the
// source tile pixel pointer unchanged to the asm helper
// iigsTilePasteInner and always returns true. No argument validation
// is done here.
bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) {
|
||||
iigsTilePasteInner(dstRow0, srcTilePixels);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path tile snapshot. Forwards the destination tile pixel pointer
// and the source row pointer unchanged to the asm helper
// iigsTileSnapInner and always returns true. No argument validation
// is done here.
bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) {
|
||||
iigsTileSnapInner(dstTilePixels, srcRow0);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Fast-path single-pixel plot. Returns false when `s` is NULL;
// otherwise passes the surface's pixel buffer, the coordinates, and
// the low 4 bits of `colorIndex` to the asm helper iigsDrawPixelInner
// and returns true. Coordinates are forwarded as-is (no clipping here).
bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) {
    uint16_t nibble;

    if (s != NULL) {
        nibble = (uint16_t)(colorIndex & 0x0F);
        iigsDrawPixelInner(s->pixels, x, y, nibble);
        return true;
    }
    return false;
}
|
||||
|
||||
|
||||
bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) {
|
||||
if (s == NULL) {
|
||||
return false;
|
||||
}
|
||||
iigsDrawLineInner(s->pixels,
|
||||
(uint16_t)x0, (uint16_t)y0,
|
||||
(uint16_t)x1, (uint16_t)y1,
|
||||
iigsFillRectInner(s->pixels,
|
||||
(uint16_t)x, (uint16_t)y,
|
||||
(uint16_t)w, (uint16_t)h,
|
||||
(uint16_t)(colorIndex & 0x0F));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path circle outline. Returns false when `s` is NULL; otherwise
// forwards the pixel buffer, the center (reinterpreted as unsigned
// 16-bit values), the radius, and the low 4 bits of `colorIndex` to
// the asm helper iigsDrawCircleInner and returns true.
bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    uint16_t centerX;
    uint16_t centerY;
    uint16_t nibble;

    if (s != NULL) {
        centerX = (uint16_t)cx;
        centerY = (uint16_t)cy;
        nibble  = (uint16_t)(colorIndex & 0x0F);
        iigsDrawCircleInner(s->pixels, centerX, centerY, r, nibble);
        return true;
    }
    return false;
}
|
||||
|
||||
|
||||
// Fast-path filled circle. Only the stage surface is handled: returns
// false when `s` is NULL or is not the surface returned by stageGet().
// Otherwise builds a 16-bit fill word holding the low nibble of
// `colorIndex` in all four nibble positions, forwards it with the
// center and radius to the asm helper iigsFillCircleInner, and
// returns true.
bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) {
    uint16_t pattern;

    // Short-circuit keeps stageGet() from being called for a NULL s.
    if (s == NULL || s != stageGet()) {
        return false;
    }

    pattern = (uint16_t)(colorIndex & 0x0F);                  // 0x000N
    pattern = (uint16_t)(pattern | (uint16_t)(pattern << 4)); // 0x00NN
    pattern = (uint16_t)(pattern | (uint16_t)(pattern << 8)); // 0xNNNN
    iigsFillCircleInner(s->pixels, (uint16_t)cx, (uint16_t)cy, r, pattern);
    return true;
}
|
||||
|
||||
|
||||
bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
|
||||
if (row == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) {
|
||||
return false;
|
||||
}
|
||||
iigsFloodWalkInner(row, (uint16_t)startX,
|
||||
(uint16_t)(matchColor & 0x0F),
|
||||
(uint16_t)(newColor & 0x0F),
|
||||
(uint16_t)(matchEqual ? 1 : 0));
|
||||
*seedMatched = (gFloodSeedMatch != 0);
|
||||
if (*seedMatched) {
|
||||
*leftXOut = (int16_t)gFloodLeftX;
|
||||
*rightXOut = (int16_t)gFloodRightX;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path flood-fill row scan. Returns false if `row` or `markBuf`
// is NULL. Otherwise forwards the span [leftX..rightX], both colors
// masked to 4 bits, and `matchEqual` normalized to 1/0 to the asm
// helper iigsFloodScanRowInner, which fills `markBuf`; returns true.
bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) {
    uint16_t matchNibble;
    uint16_t newNibble;
    uint16_t equalFlag;

    if (row != NULL && markBuf != NULL) {
        matchNibble = (uint16_t)(matchColor & 0x0F);
        newNibble   = (uint16_t)(newColor & 0x0F);
        equalFlag   = (uint16_t)(matchEqual ? 1 : 0);
        iigsFloodScanRowInner(row, (uint16_t)leftX, (uint16_t)rightX,
                              matchNibble, newNibble, equalFlag, markBuf);
        return true;
    }
    return false;
}
|
||||
|
||||
|
||||
// Fast-path combined flood-fill scan + seed push. Returns false if
// `row`, `stackX`, `stackY` or `spInOut` is NULL. Otherwise forwards
// everything to the asm helper iigsFloodScanAndPushInner (colors
// masked to 4 bits, `matchEqual` normalized to 1/0, signed values
// reinterpreted as unsigned 16-bit) and returns true. The helper
// updates the seed stack and *spInOut.
bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) {
    uint16_t matchNibble;
    uint16_t newNibble;
    uint16_t equalFlag;

    if (row == NULL || spInOut == NULL) {
        return false;
    }
    if (stackX == NULL || stackY == NULL) {
        return false;
    }

    matchNibble = (uint16_t)(matchColor & 0x0F);
    newNibble   = (uint16_t)(newColor & 0x0F);
    equalFlag   = (uint16_t)(matchEqual ? 1 : 0);
    iigsFloodScanAndPushInner(row,
                              (uint16_t)leftX, (uint16_t)rightX,
                              matchNibble, newNibble, equalFlag,
                              (uint16_t)scanY,
                              stackX, stackY,
                              (uint16_t *)spInOut,
                              (uint16_t)maxSp);
    return true;
}
|
||||
|
||||
|
||||
bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) {
|
||||
if (pixels == NULL || stackX == NULL || stackY == NULL || spInOut == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) {
|
||||
return false;
|
||||
}
|
||||
iigsFloodWalkAndScansInner(pixels,
|
||||
(uint16_t)x, (uint16_t)y,
|
||||
(uint16_t)(matchColor & 0x0F),
|
||||
(uint16_t)(newColor & 0x0F),
|
||||
(uint16_t)(matchEqual ? 1 : 0),
|
||||
stackX, stackY,
|
||||
(uint16_t *)spInOut,
|
||||
(uint16_t)maxSp);
|
||||
*seedMatched = (gFloodSeedMatch != 0);
|
||||
*leftXOut = (int16_t)gFloodLeftX;
|
||||
*rightXOut = (int16_t)gFloodRightX;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fast-path rectangular blit. Returns false when either row pointer is
// NULL or the copy size is not strictly positive in both dimensions.
// Otherwise forwards all arguments (signed values reinterpreted as
// unsigned 16-bit, `transparent` passed through untouched) to the asm
// helper iigsBlitRectInner and returns true.
bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    if (dstRow0 != NULL && srcRow0 != NULL && copyW > 0 && copyH > 0) {
        iigsBlitRectInner(dstRow0, (uint16_t)dstX,
                          srcRow0, (uint16_t)srcX,
                          (uint16_t)copyW, (uint16_t)copyH,
                          (uint16_t)srcRowBytes,
                          transparent);
        return true;
    }
    return false;
}
|
||||
|
||||
|
||||
// Fast-path solid tile fill. Returns false when `s` is NULL. Otherwise
// converts tile coordinates (bx, by) into the address of the tile's
// top-left byte -- tiles are 8 pixels on a side and two pixels share
// a byte, so the column offset is bx * 4 bytes -- and hands that
// address plus `fillWord` to the asm helper iigsTileFillInner.
// Returns true after the call.
bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) {
    uint16_t topRow;
    uint16_t byteColumn;
    uint8_t  *dst;

    if (s != NULL) {
        topRow     = (uint16_t)((uint16_t)by * 8u);
        byteColumn = (uint16_t)((uint16_t)bx * 4u);  // (bx * 8) >> 1
        dst        = s->pixels + (topRow * SURFACE_BYTES_PER_ROW + byteColumn);
        iigsTileFillInner(dst, fillWord);
        return true;
    }
    return false;
}
|
||||
|
||||
|
||||
// Returns the pixel buffer to use for the stage surface. Despite the
// "Alloc" in the name nothing is allocated here: the function always
// returns the constant IIGS_STAGE_PIXELS.
// NOTE(review): presumably the fixed bank-$01 stage buffer described in
// halPresentRect's comments -- confirm against the macro's definition.
uint8_t *halStageAllocPixels(void) {
|
||||
return IIGS_STAGE_PIXELS;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -109,7 +109,11 @@ static int8_t thresholdPaddle(uint8_t v);
|
|||
// does not accept designated initializers; runtime fill keeps lookup
|
||||
// O(1) instead of a 40-plus-case switch.
|
||||
static uint8_t gAsciiToKey[ASCII_TABLE_SIZE];
|
||||
static uint8_t gKeyTtl [KEY_COUNT];
|
||||
|
||||
// Non-static so iigsInputSnapshot (joeyDraw.asm) can reference it via
|
||||
// long-mode addressing through the linker. The C TTL-decrement loop
|
||||
// that used to live in halInputPoll moved to that asm helper.
|
||||
uint8_t gKeyTtl [KEY_COUNT];
|
||||
|
||||
static int16_t gMouseAbsX = SURFACE_WIDTH / 2;
|
||||
static int16_t gMouseAbsY = SURFACE_HEIGHT / 2;
|
||||
|
|
@ -166,9 +170,38 @@ static int8_t signExtend7(uint8_t raw) {
|
|||
}
|
||||
|
||||
|
||||
// Map a raw 0..255 paddle reading to JOYSTICK_AXIS_MIN..MAX, using the
|
||||
// stick's calibrated center (captured by joeyJoystickReset) and a
|
||||
// dead-zone band around it. Returns 0 if reading is within deadZone of
|
||||
// the center; otherwise the offset from center, clamped to int8_t.
|
||||
static int8_t analogPaddle(uint8_t v, uint8_t center, uint8_t deadZone) {
|
||||
int16_t delta;
|
||||
|
||||
delta = (int16_t)v - (int16_t)center;
|
||||
if (delta < 0) {
|
||||
if ((-delta) <= (int16_t)deadZone) {
|
||||
return 0;
|
||||
}
|
||||
if (delta < (int16_t)JOYSTICK_AXIS_MIN) {
|
||||
return JOYSTICK_AXIS_MIN;
|
||||
}
|
||||
} else {
|
||||
if (delta <= (int16_t)deadZone) {
|
||||
return 0;
|
||||
}
|
||||
if (delta > (int16_t)JOYSTICK_AXIS_MAX) {
|
||||
return JOYSTICK_AXIS_MAX;
|
||||
}
|
||||
}
|
||||
return (int8_t)delta;
|
||||
}
|
||||
|
||||
|
||||
// Threshold a 0..255 paddle reading into a digital direction so the
|
||||
// IIgs analog stick presents the same axis semantics as the digital
|
||||
// sticks on ST/Amiga/DOS. Center range is treated as zero.
|
||||
// sticks on ST/Amiga/DOS. Center range is treated as zero. Used
|
||||
// before joeyJoystickReset has been called -- once the app calibrates,
|
||||
// we switch to analogPaddle for finer control.
|
||||
static int8_t thresholdPaddle(uint8_t v) {
|
||||
if (v < PADDLE_LO_THRESHOLD) {
|
||||
return JOYSTICK_AXIS_MIN;
|
||||
|
|
@ -191,53 +224,122 @@ static int8_t thresholdPaddle(uint8_t v) {
|
|||
// approximates the paddle's 0..255 position (the Apple firmware
|
||||
// PREAD routine works the same way). The two reads are inlined here
|
||||
// rather than factored into a helper because ORCA/C 2.1 trips over
|
||||
// `volatile uint8_t *` function parameters.
|
||||
// Auto-disconnect tracking. The paddle one-shot timer takes ~3 ms to
|
||||
// charge at full deflection; if NO joystick is wired up, the BUSY bit
|
||||
// stays set forever and the busy-wait runs the full PADDLE_TIMEOUT
|
||||
// every frame -- ~3 ms wasted per frame on a stick that isn't there.
|
||||
//
|
||||
// After JOY_DISCONNECT_THRESHOLD consecutive timeouts we latch the
|
||||
// stick as absent and stop polling entirely. The app calls
|
||||
// joeyJoystickReset to clear the latch and resume polling.
|
||||
#define JOY_DISCONNECT_THRESHOLD 60u
|
||||
|
||||
static uint16_t gJoyConsecutiveTimeouts = 0;
|
||||
static bool gJoyDisconnectLatched = false;
|
||||
|
||||
// Analog calibration: gJoyCenterX/Y hold the raw paddle reading we
|
||||
// captured the last time the user called joeyJoystickReset. Until
|
||||
// that's called, gJoyCenterValid is false and pollJoystick falls back
|
||||
// to the digital threshold mapping. gJoyRecalibrate is set by
|
||||
// halJoystickReset and cleared on the next successful poll, which
|
||||
// captures the new center.
|
||||
static uint8_t gJoyCenterX [JOYSTICK_COUNT];
|
||||
static uint8_t gJoyCenterY [JOYSTICK_COUNT];
|
||||
static bool gJoyCenterValid [JOYSTICK_COUNT];
|
||||
static bool gJoyRecalibrate [JOYSTICK_COUNT];
|
||||
|
||||
|
||||
// IIgs implementation of the joystick reset hook. Out-of-range `js`
// values are ignored. For a valid stick it:
//   - arms gJoyRecalibrate[js], so the next successful poll captures
//     the stick's current raw position as the new center;
//   - clears the auto-disconnect latch and its timeout counter so
//     axis polling resumes. These two are single globals, not
//     per-stick state.
// The dead-zone value is not handled here; it lives in core's
// gJoyDeadZone[js].
void halJoystickReset(JoeyJoystickE js) {
    int index;

    index = (int)js;
    if (index >= 0 && index < JOYSTICK_COUNT) {
        gJoyRecalibrate[js]     = true;
        gJoyDisconnectLatched   = false;
        gJoyConsecutiveTimeouts = 0;
    }
}
|
||||
|
||||
// Asm paddle reader (joeyDraw.asm). Switches CPU to 1 MHz for the
|
||||
// duration of the poll so paddle counts match what every other
|
||||
// IIgs/Apple II joystick game produces (the C busy-wait at 2.8 MHz
|
||||
// inflated counts). Returns results via gJoy* DRAWPRIMS scratch.
|
||||
extern void iigsPollJoystickInner(void);
|
||||
|
||||
extern volatile uint8_t gJoyPx;
|
||||
extern volatile uint8_t gJoyPy;
|
||||
extern volatile uint8_t gJoyResolved; // bit0: pdl0 fired; bit1: pdl1 fired
|
||||
|
||||
static void pollJoystick(void) {
|
||||
uint16_t count;
|
||||
uint8_t px;
|
||||
uint8_t py;
|
||||
uint8_t byte;
|
||||
uint8_t resolvedFlags;
|
||||
bool xResolved;
|
||||
bool yResolved;
|
||||
|
||||
// One PTRIG read starts BOTH paddle timers simultaneously per the
|
||||
// IIgs Hardware Reference. Polling them in parallel halves the
|
||||
// wall-clock time vs. polling each serially after its own trigger.
|
||||
byte = *IIGS_PTRIG;
|
||||
px = 0;
|
||||
py = 0;
|
||||
xResolved = false;
|
||||
yResolved = false;
|
||||
for (count = 0; count < PADDLE_TIMEOUT; count++) {
|
||||
if (!xResolved) {
|
||||
byte = *IIGS_PADDLE0;
|
||||
if ((byte & IIGS_PADDLE_BUSY) == 0) {
|
||||
px = (uint8_t)count;
|
||||
xResolved = true;
|
||||
}
|
||||
}
|
||||
if (!yResolved) {
|
||||
byte = *IIGS_PADDLE1;
|
||||
if ((byte & IIGS_PADDLE_BUSY) == 0) {
|
||||
py = (uint8_t)count;
|
||||
yResolved = true;
|
||||
}
|
||||
}
|
||||
if (xResolved && yResolved) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Timed-out paddles default to centered axis. Without an explicit
|
||||
// resolved flag we couldn't distinguish "no joystick" from "stick
|
||||
// hard right" -- both would yield px=255 and report AXIS_MAX.
|
||||
gJoyAxisX[JOYSTICK_0] = xResolved ? thresholdPaddle(px) : 0;
|
||||
gJoyAxisY[JOYSTICK_0] = yResolved ? thresholdPaddle(py) : 0;
|
||||
// Buttons are I/O reads -- always cheap, do them every frame.
|
||||
gJoyButtonState[JOYSTICK_0][JOY_BUTTON_0] = (*IIGS_BTN0 & IIGS_BUTTON_BIT) != 0;
|
||||
gJoyButtonState[JOYSTICK_0][JOY_BUTTON_1] = (*IIGS_BTN1 & IIGS_BUTTON_BIT) != 0;
|
||||
|
||||
gJoyConnected[JOYSTICK_0] = true;
|
||||
gJoyConnected[JOYSTICK_1] = false;
|
||||
|
||||
// Once the stick has been latched as disconnected, only buttons
|
||||
// get polled. The app must call joeyJoystickReset to resume axis
|
||||
// polling (e.g., when the user has just plugged in a stick).
|
||||
if (gJoyDisconnectLatched) {
|
||||
gJoyAxisX[JOYSTICK_0] = 0;
|
||||
gJoyAxisY[JOYSTICK_0] = 0;
|
||||
gJoyConnected[JOYSTICK_0] = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Asm read at 1 MHz -- accurate paddle counts.
|
||||
iigsPollJoystickInner();
|
||||
px = gJoyPx;
|
||||
py = gJoyPy;
|
||||
resolvedFlags = gJoyResolved;
|
||||
xResolved = (resolvedFlags & 0x01) != 0;
|
||||
yResolved = (resolvedFlags & 0x02) != 0;
|
||||
|
||||
gJoyConnected[JOYSTICK_0] = xResolved || yResolved;
|
||||
|
||||
// Update auto-disconnect counter. Both axes timing out => probably no
|
||||
// stick. Either axis resolving => stick is present, reset the counter.
|
||||
if (!xResolved && !yResolved) {
|
||||
if (gJoyConsecutiveTimeouts < 0xFFFFu) {
|
||||
gJoyConsecutiveTimeouts++;
|
||||
}
|
||||
if (gJoyConsecutiveTimeouts >= JOY_DISCONNECT_THRESHOLD) {
|
||||
gJoyDisconnectLatched = true;
|
||||
}
|
||||
gJoyAxisX[JOYSTICK_0] = 0;
|
||||
gJoyAxisY[JOYSTICK_0] = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
gJoyConsecutiveTimeouts = 0;
|
||||
|
||||
// Capture the resting position on recalibrate (one-shot).
|
||||
if (gJoyRecalibrate[JOYSTICK_0]) {
|
||||
gJoyCenterX [JOYSTICK_0] = px;
|
||||
gJoyCenterY [JOYSTICK_0] = py;
|
||||
gJoyCenterValid[JOYSTICK_0] = true;
|
||||
gJoyRecalibrate[JOYSTICK_0] = false;
|
||||
}
|
||||
|
||||
// Calibrated => analog axis report (offset from center, dead-zone
|
||||
// clamped). Uncalibrated => the legacy 3-state digital threshold,
|
||||
// matching how the stick behaved before joeyJoystickReset existed.
|
||||
if (gJoyCenterValid[JOYSTICK_0]) {
|
||||
gJoyAxisX[JOYSTICK_0] = analogPaddle(px,
|
||||
gJoyCenterX[JOYSTICK_0],
|
||||
gJoyDeadZone[JOYSTICK_0]);
|
||||
gJoyAxisY[JOYSTICK_0] = analogPaddle(py,
|
||||
gJoyCenterY[JOYSTICK_0],
|
||||
gJoyDeadZone[JOYSTICK_0]);
|
||||
} else {
|
||||
gJoyAxisX[JOYSTICK_0] = thresholdPaddle(px);
|
||||
gJoyAxisY[JOYSTICK_0] = thresholdPaddle(py);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -303,19 +405,14 @@ void halInputInit(void) {
|
|||
|
||||
|
||||
void halInputPoll(void) {
|
||||
uint8_t kbd;
|
||||
uint8_t ascii;
|
||||
uint8_t key;
|
||||
uint16_t i;
|
||||
uint8_t kbd;
|
||||
uint8_t ascii;
|
||||
uint8_t key;
|
||||
|
||||
for (i = 0; i < KEY_COUNT; i++) {
|
||||
if (gKeyTtl[i] > 0) {
|
||||
gKeyTtl[i]--;
|
||||
if (gKeyTtl[i] == 0) {
|
||||
gKeyState[i] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
// The KEY_COUNT TTL-decrement loop and the gKeyState/gKeyPrev/
|
||||
// gMouseButtonPrev/gJoyButtonPrev snapshots all happen earlier in
|
||||
// joeyInputPoll's call to iigsInputSnapshot (asm). We just read
|
||||
// the live hardware state here.
|
||||
|
||||
kbd = *IIGS_KBD;
|
||||
if (kbd & KBD_STROBE_BIT) {
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,76 +1,15 @@
|
|||
* peislam.asm - PEI-slam stage row to bank-$E1 SHR.
|
||||
* peislam.asm - placeholder.
|
||||
*
|
||||
* Implements the //e AUXWRITE + RAMRD + SHR-shadow trick that lets
|
||||
* 65816 stack pushes (which are bank-$00-implicit) end up in bank
|
||||
* $E1 SHR display memory:
|
||||
*
|
||||
* - SHR shadow temporarily ENABLED (clear $C035 bit 3) so writes
|
||||
* to bank-$01 in $2000-$9FFF mirror to $E1 SHR.
|
||||
* - AUXWRITE on (any write to $C005) so bank-$00 stack writes
|
||||
* redirect to bank $01, then mirror to $E1 via shadow.
|
||||
* - RAMRD on (any write to $C003) so PEI dp's bank-$00-implicit
|
||||
* reads redirect to bank $01 = the stage source.
|
||||
* - SEI for the duration: stack pointer is hijacked to point at
|
||||
* $E1-mapped stack space, soft-switch state would corrupt any
|
||||
* C code that tried to access bank-$00 globals.
|
||||
*
|
||||
* All scratch reads/writes within the slam use long-mode `>name`
|
||||
* addressing (24-bit, explicit bank) so they bypass RAMRD redirect
|
||||
* and reach the actual bank-$00 global storage.
|
||||
*
|
||||
* Calling convention: ORCA-C memory model 1 (large model, JSL/RTL).
|
||||
* void peiSlamFullRow(int16_t y);
|
||||
* - Caller PHAs y (2 bytes) before JSL.
|
||||
* - JSL pushes 3-byte return address.
|
||||
* - On entry: y_LO at SP+4, y_HI at SP+5 (SP points one below PCL).
|
||||
* - Function preserves DBR; returns via RTL with original SP.
|
||||
* - Caller pops the y arg after RTL.
|
||||
*
|
||||
* Per call: ~50 cyc bracket + 80 PEIs * 6 cyc = ~530 cyc, vs the
|
||||
* memcpy/MVN fallback's 7 cyc/byte * 160 bytes = ~1120 cyc.
|
||||
* The original PEI-slam-per-row helper was removed; its functionality
|
||||
* was rolled into iigsBlitStageToShr in joeyDraw.asm (full PEI-slam
|
||||
* with per-row dirty skip). This stub remains so the build's
|
||||
* PORT_ASM_SRCS_ALL wildcard pulls in a file with a recognized load
|
||||
* segment and the linker keeps the same segment-bank layout it had
|
||||
* when peislam.asm was a real translation unit.
|
||||
|
||||
keep PEISLAM
|
||||
case on
|
||||
|
||||
* The operand to START names the LOAD segment this object segment
|
||||
* belongs to (per ORCA/M for IIgs manual, ch. 6 "Load Segments").
|
||||
* Object segments without an operand land in the unnamed "blank
|
||||
* segment" -- which on AUDIO is _ROOT, the very segment whose 64 KB
|
||||
* budget peislam.asm was busting. Naming a load segment forces the
|
||||
* linker to put us in our own segment, which the GS/OS loader then
|
||||
* allocates in its own bank.
|
||||
peiSlamFullRow start IIGSASM
|
||||
* MVN-based row copy. Replaces the PEI-stack-slam approach (which
|
||||
* needs RAMRD/AUXWRITE/SHADOW soft-switches and is sensitive to
|
||||
* DRAWDATA bank placement). MVN copies 160 bytes from the bank-$01
|
||||
* stage row to the matching bank-$E1 SHR row at ~7 cyc/byte; that's
|
||||
* slower than PEI-slam but rock-solid.
|
||||
*
|
||||
* Args after PHP: y (int16) at SP+5..6. Compute rowOffset = $2000
|
||||
* + y*160. MVN $01,$E1 with X=Y=rowOffset, A=159 copies 160 bytes
|
||||
* from $01:rowOffset to $E1:rowOffset.
|
||||
php
|
||||
rep #$30 ; M=16, X=16
|
||||
|
||||
lda 5,s ; y
|
||||
asl a
|
||||
asl a
|
||||
asl a
|
||||
asl a
|
||||
asl a ; A = y << 5 = y*32
|
||||
sta >gPeiTempRowBase
|
||||
asl a
|
||||
asl a ; A = y << 7 = y*128
|
||||
clc
|
||||
adc >gPeiTempRowBase ; A = y*160
|
||||
clc
|
||||
adc #$2000 ; A = $2000 + y*160 = row offset
|
||||
|
||||
tax ; X = source offset (bank $01)
|
||||
tay ; Y = dest offset (bank $E1)
|
||||
lda #159 ; A = byte count - 1 (MVN copies A+1 = 160 bytes)
|
||||
mvn $01,$E1
|
||||
|
||||
plp
|
||||
peislamStub start IIGSASM
|
||||
rtl
|
||||
end
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue