joeylib2/src/codegen/spriteCompile.c

760 lines
29 KiB
C

// Cross-platform sprite codegen runtime: spriteCompile uses the
// per-CPU emit functions selected at compile time, allocates a slot
// in the codegen arena, emits the routine bytes into it, and
// populates sp->slot + sp->routineOffsets. On the Amiga and the
// generic (x86 / 68k) builds the spriteCompiled* dispatchers cast
// the routine address to a function pointer and call it through
// cdecl; on IIgs they JSL through a small self-patched call stub.
//
// Each per-CPU emitter (src/codegen/spriteEmit{X86,68k,Planar68k,Iigs}.c)
// just produces bytes; this file is the only consumer of the
// codegen arena from the sprite side.
#include <stdlib.h>
#include <string.h>
#include "joey/sprite.h"
#include "joey/surface.h"
#include "codegenArenaInternal.h"
#include "hal.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
#include "surfaceInternal.h"
// Largest scratch buffer needed for any single emit call. 16 KB
// covers a 32x32 sprite even on 68k (the biggest mixed-RMW byte-
// emit at 16 bytes/byte * (16*17 dest bytes per shift) ~= 4.5 KB,
// times shift count 2). Round up generously.
// NOTE(review): the emit functions called from this file take only
// (out, sp, shift) -- no capacity argument -- so nothing here
// enforces this bound; it relies on the estimate above holding for
// every sprite size the library accepts.
#define SPRITE_EMIT_SCRATCH_BYTES (16u * 1024u)
// DRAW emitter dispatch. Only one src/codegen/spriteEmit*.c is built
// per platform; routing through this wrapper keeps spriteCompile and
// the sizing pass platform-neutral. A build with no JOEYLIB_PLATFORM_*
// macro defined is rejected at compile time because every sprite
// needs a draw routine.
//   out   - buffer the routine bytes are written to
//   sp    - sprite being compiled
//   shift - shift variant to emit
// Returns whatever the selected emitter returns; callers in this
// file treat it as a byte count, with 0 meaning "nothing emitted".
static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;

#if defined(JOEYLIB_PLATFORM_DOS)
    emitted = spriteEmitDrawX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA)
    emitted = spriteEmitDrawPlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
    emitted = spriteEmitDraw68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
    emitted = spriteEmitDrawIigs(out, sp, shift);
#else
# error "spriteCompile: no emitter selected for this platform"
#endif
    return emitted;
}
// SAVE (save-under) emitter dispatch. The per-CPU emitter produces
// row-by-row copy bytes; the runtime dispatch in src/core/sprite.c
// gates on routineOffsets[shift][SPRITE_OP_SAVE] != SPRITE_NOT_COMPILED
// and falls back to the interpreted memcpy path otherwise.
// Unlike DRAW, an unknown platform is not a build error: the
// function reports 0 bytes so the op is simply left uncompiled.
//   out   - buffer the routine bytes are written to
//   sp    - sprite being compiled
//   shift - shift variant to emit
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;

#if defined(JOEYLIB_PLATFORM_DOS)
    emitted = spriteEmitSaveX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA)
    emitted = spriteEmitSavePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
    emitted = spriteEmitSave68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
    emitted = spriteEmitSaveIigs(out, sp, shift);
#else
    (void)out;
    (void)sp;
    (void)shift;
    emitted = 0;
#endif
    return emitted;
}
// RESTORE (restore-under) emitter dispatch; mirror of
// emitSaveForTarget. Reports 0 bytes on a platform with no restore
// emitter so the op is left uncompiled rather than failing the build.
//   out   - buffer the routine bytes are written to
//   sp    - sprite being compiled
//   shift - shift variant to emit
static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;

#if defined(JOEYLIB_PLATFORM_DOS)
    emitted = spriteEmitRestoreX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA)
    emitted = spriteEmitRestorePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
    emitted = spriteEmitRestore68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
    emitted = spriteEmitRestoreIigs(out, sp, shift);
#else
    (void)out;
    (void)sp;
    (void)shift;
    emitted = 0;
#endif
    return emitted;
}
// Sizing pass. Runs every emitter (DRAW, SAVE, RESTORE) for every
// shift into the same scratch buffer -- each call overwrites the
// previous one's output -- and sums the byte counts they report.
// Ops with no emitter on this platform contribute 0, so the total
// covers only routines that will actually land in the arena.
//   scratch - throwaway output buffer, reused for every emit call
//   sp      - sprite being measured
// Returns the total number of bytes the real emit pass will produce.
static uint32_t emitTotalSize(uint8_t *scratch, const SpriteT *sp) {
    uint32_t bytesNeeded = 0;
    uint8_t  shiftIdx    = 0;

    while (shiftIdx < JOEY_SPRITE_SHIFT_COUNT) {
        bytesNeeded += emitDrawForTarget(scratch, sp, shiftIdx);
        bytesNeeded += emitSaveForTarget(scratch, sp, shiftIdx);
        bytesNeeded += emitRestoreForTarget(scratch, sp, shiftIdx);
        shiftIdx++;
    }
    return bytesNeeded;
}
// Compile a sprite into native routines inside the codegen arena.
//
// The emitters run twice: once into a heap scratch buffer purely to
// measure the total size, then again straight into the newly
// allocated arena slot. Each routine's start offset within the slot
// is recorded in sp->routineOffsets[shift][op]; an op whose emitter
// produced no bytes is marked SPRITE_NOT_COMPILED.
//
// Returns true when sp->slot is populated (including when it already
// was on entry). Returns false for a NULL sprite, missing tileData,
// scratch malloc failure, a measured total above 0xFFFF bytes
// (routine offsets are 16-bit), or codegenArenaAlloc returning NULL.
//
// Amiga (post-Phase 9) uses spriteEmitPlanar68k.c, which writes
// directly to bitplanes. DRAW emits a unique pre-shifted variant per
// shift in 0..7 (smooth horizontal motion at any pixel x); SAVE and
// RESTORE emit only shift 0 and shift 1, because shifted variants
// 1..7 share identical bytes (plain memcpy of widthTiles+1 plane
// bytes per row). The post-emit pass below points shifts 2..7 at
// shift 1's save/restore routines.
//
// NOTE(review): the IIgs dispatchers in this file keep per-sprite
// caches (sp->cachedDstBank / cachedSrcBank / cachedSizeBytes) that
// are not reset here. Confirm they are cleared wherever a sprite's
// slot is released; otherwise a recompiled routine's MVN bank
// operands could be left unpatched by the bank-change short-circuit.
bool spriteCompile(SpriteT *sp) {
    uint8_t    *sizingBuf;
    uint32_t    bytesNeeded;
    ArenaSlotT *arenaSlot;
    uint8_t    *arenaDst;
    uint16_t    cursor;
    uint16_t    emitted;
    uint8_t     shiftIdx;
    uint8_t     opIdx;
    bool        compiled;

    if (sp == NULL) {
        return false;
    }
    if (sp->slot != NULL) {
        /* Already compiled; nothing to do. */
        return true;
    }
    if (sp->tileData == NULL) {
        return false;
    }

    sizingBuf = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES);
    if (sizingBuf == NULL) {
        return false;
    }

    compiled    = false;
    bytesNeeded = emitTotalSize(sizingBuf, sp);
    if (bytesNeeded <= 0xFFFFu) {
        arenaSlot = codegenArenaAlloc(bytesNeeded);
        if (arenaSlot != NULL) {
            arenaDst = codegenArenaBase() + arenaSlot->offset;
            cursor   = 0;
            for (shiftIdx = 0; shiftIdx < JOEY_SPRITE_SHIFT_COUNT; shiftIdx++) {
                for (opIdx = 0; opIdx < SPRITE_OP_COUNT; opIdx++) {
                    uint8_t *emitAt = arenaDst + cursor;

                    if (opIdx == SPRITE_OP_DRAW) {
                        emitted = emitDrawForTarget(emitAt, sp, shiftIdx);
                    } else if (opIdx == SPRITE_OP_SAVE) {
                        emitted = emitSaveForTarget(emitAt, sp, shiftIdx);
                    } else if (opIdx == SPRITE_OP_RESTORE) {
                        emitted = emitRestoreForTarget(emitAt, sp, shiftIdx);
                    } else {
                        emitted = 0;
                    }

                    if (emitted != 0) {
                        sp->routineOffsets[shiftIdx][opIdx] = cursor;
                        cursor = (uint16_t)(cursor + emitted);
                    } else {
                        sp->routineOffsets[shiftIdx][opIdx] = SPRITE_NOT_COMPILED;
                    }
                }
            }
#if defined(JOEYLIB_PLATFORM_AMIGA)
            /* Save/restore bytes for any non-zero shift are identical,
             * and the emitter produces them once, for shift 1. Point
             * shifts 2..7 at those routines so the dispatcher gate in
             * sprite.c sees them as compiled. */
            for (shiftIdx = 2; shiftIdx < JOEY_SPRITE_SHIFT_COUNT; shiftIdx++) {
                sp->routineOffsets[shiftIdx][SPRITE_OP_SAVE]    = sp->routineOffsets[1][SPRITE_OP_SAVE];
                sp->routineOffsets[shiftIdx][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE];
            }
#endif
            sp->slot = arenaSlot;
            compiled = true;
        }
    }

    free(sizingBuf);
    return compiled;
}
#if defined(JOEYLIB_PLATFORM_IIGS)
// SURFACE_ROW_OFFSET dispatches to the gRowOffsetLut lookup on IIgs;
// declared in surfaceInternal.h. Replaces ORCA-C's __mul16 JSL with a
// single indexed long-mode read.
// IIgs uses inline asm + a self-modifying call stub instead of a C
// function-pointer cast. The build uses ORCA-C large memory model
// (-b for sprite demos) so pointers are 24-bit and JSL works
// cross-bank.
//
// `sta abs,Y` on 65816 uses the data bank register (DBR) for the
// high byte of the effective address, so we need DBR = dst's bank
// during the body. malloc under -b can return memory in any bank,
// so we don't trust DBR to already match -- the stub explicitly
// sets DBR from the dst pointer's bank byte and restores it before
// returning to C.
//
// Stub layout (14 bytes):
// 00: 8B PHB ; save caller DBR
// 01: A9 bk LDA #destBank ; A = dst bank (8-bit M)
// 03: 48 PHA
// 04: AB PLB ; DBR = dst bank
// 05: A0 lo hi LDY #destOffset ; Y = low 16 of dst (X=16)
// 08: 22 lo mid bk JSL routine
// 0C: AB PLB ; restore caller DBR
// 0D: 6B RTL
//
// Patched per call: byte 2 (destBank), bytes 6-7 (destOffset16),
// bytes 9-11 (target 24-bit). The compiled routine assumes
// M=8 / X=16 / Y=destOffset on entry; the stub arranges that.
//
// Stub bytes are split into two phases:
// 1. The 8 opcode bytes are written ONCE on first call (gDrawStubInited).
// 2. Of the 6 operand bytes, only those that actually changed since
// the previous call get re-stamped: destBank and fnAddr are cached
// and rarely change (per-shift / per-bank). destOffset is the only
// one that changes every call as the sprite moves. Net per-frame
// patching for the typical case drops from 14 stores to 2.
// Draw call stub (14 bytes of 65816 code, patched in place) plus the
// bookkeeping that lets spriteCompiledDraw skip re-stamping operands
// that have not changed since the previous call.
static unsigned char gSpriteCallStub[14];
// Set once the 8 fixed opcode bytes have been written.
static bool gDrawStubInited = false;
// Last destination bank / routine address stamped into the stub.
// NOTE(review): the stub's operand bytes start as zero (static
// storage) while these sentinels start as 0xFF / 0xFFFFFFFF, so a
// first call whose destBank is 0xFF would skip the stamp. Confirm a
// surface can never live in bank $FF.
static uint8_t gDrawStubLastBank = 0xFF;
static uint32_t gDrawStubLastFnAddr = 0xFFFFFFFFul;
// IIgs compiled draw: patches the call stub with the destination
// address and the address of this sprite's DRAW routine for the
// current x parity, then JSLs through the stub.
//   dst  - surface whose pixels receive the sprite
//   sp   - sprite; sp->slot and routineOffsets[shift][SPRITE_OP_DRAW]
//          are read without validation here
//   x, y - destination position; x & 1 selects the shift variant and
//          x >> 1 is the byte column within row y
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
uint8_t shift;
uint32_t destAddr;
uint16_t destOffset;
uint8_t destBank;
uint32_t fnAddr;
{
uint8_t *destPtr;
shift = (uint8_t)(x & 1);
destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
// Split the destination pointer into the 16-bit offset (loaded
// into Y by the stub) and the bank byte (loaded into DBR).
destAddr = (uint32_t)destPtr;
destOffset = (uint16_t)(destAddr & 0xFFFFu);
destBank = (uint8_t)((destAddr >> 16) & 0xFFu);
// Routine address = arena base + this sprite's slot + per-shift
// DRAW offset.
fnAddr = codegenArenaBaseAddr()
+ sp->slot->offset
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_DRAW];
}
(void)destAddr;
// One-time write of the fixed opcode bytes: PHB, LDA #, PHA, PLB,
// LDY #, JSL, PLB, RTL.
if (!gDrawStubInited) {
gSpriteCallStub[ 0] = 0x8B;
gSpriteCallStub[ 1] = 0xA9;
gSpriteCallStub[ 3] = 0x48;
gSpriteCallStub[ 4] = 0xAB;
gSpriteCallStub[ 5] = 0xA0;
gSpriteCallStub[ 8] = 0x22;
gSpriteCallStub[12] = 0xAB;
gSpriteCallStub[13] = 0x6B;
gDrawStubInited = true;
}
// destOffset always changes (sprite moves every frame).
gSpriteCallStub[ 6] = (unsigned char)(destOffset & 0xFFu);
gSpriteCallStub[ 7] = (unsigned char)((destOffset >> 8) & 0xFFu);
// destBank only changes if the dst surface migrates banks (~never).
if (destBank != gDrawStubLastBank) {
gSpriteCallStub[ 2] = destBank;
gDrawStubLastBank = destBank;
}
// fnAddr changes only on shift parity flips or sprite swaps.
if (fnAddr != gDrawStubLastFnAddr) {
// Byte-alias the 32-bit address to pick out its low three bytes
// (lo, mid, bank) for the JSL operand.
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
gSpriteCallStub[ 9] = fnB_[0];
gSpriteCallStub[10] = fnB_[1];
gSpriteCallStub[11] = fnB_[2];
gDrawStubLastFnAddr = fnAddr;
}
// ORCA-C compiles this function under `longa on / longi on`
// (M=16, X=16) and emits the function epilogue assuming those
// widths at exit -- the deallocation ADC takes a 2-byte immediate
// and any LDX/LDY use 2-byte immediates. The byte writes to
// gSpriteCallStub above leave M=8, and an earlier PHP/PLP-only
// wrapper let the asm block exit in the wrong M state. The
// epilogue's `ADC #imm; TCS` then decoded as a wider ADC that
// swallowed the TCS, S was never adjusted, RTL popped wrong
// bytes, control fell into BSS, and the IIgs hit BRK on a zero
// byte. Force M=16/X=16 before returning to compiled C.
//
// Entry sequence: rep #0x30 then sep #0x20 gives the M=8 / X=16
// state the stub and compiled routine expect; the trailing
// rep #0x30 restores M=16 / X=16 for the C epilogue.
asm {
rep #0x30
sep #0x20
jsl gSpriteCallStub
rep #0x30
}
}
// Save/Restore call stub. The compiled MVN-row routines need
// M=16 / X=16 and expect index registers preset to source/dest
// offsets within their respective banks. MVN's own bank operands
// are in the routine bytes (patched per call), so this stub doesn't
// need to load DBR -- it just sets X and Y, JSLs, and restores DBR
// (MVN itself sets DBR to its destination bank as a side effect).
//
// Stub layout (13 bytes):
// 00: 8B PHB ; save caller DBR
// 01: A2 lo hi LDX #srcOffset
// 04: A0 lo hi LDY #dstOffset
// 07: 22 lo mid bk JSL routine
// 0B: AB PLB ; restore caller DBR
// 0C: 6B RTL
//
// For SAVE: X = screen lo, Y = backup lo
// For RESTORE: X = backup lo, Y = screen lo
//
// Two distinct stubs (one per op) instead of a shared one. Save and
// restore alternate every frame and they swap the X/Y meanings, so a
// shared stub forced a full re-stamp on every call. Per-op stubs let
// us cache: only the bytes that genuinely change frame-to-frame
// (typically just one of screenLo/backupLo as the sprite moves) get
// rewritten. Cuts per-call patching from 13 stores to 2 in the typical
// case (static backup buffer, stable shift parity).
// Per-op 13-byte call stubs (PHB / LDX # / LDY # / JSL / PLB / RTL),
// patched in place by spriteCompiledSaveUnder and
// spriteCompiledRestoreUnder respectively.
static unsigned char gSpriteSaveStub[13];
static unsigned char gSpriteRestoreStub[13];
// Save stub bookkeeping: whether the fixed opcode bytes have been
// written, and the last X operand (screen offset), Y operand
// (backup offset) and routine address stamped into the stub.
// NOTE(review): the stub operand bytes start as zero while these
// sentinels start as 0xFFFF, so a first call whose offset is exactly
// 0xFFFF would skip its stamp -- presumably unreachable in practice;
// confirm.
static bool gSaveStubInited = false;
static uint16_t gSaveStubLastXLo = 0xFFFFu;
static uint16_t gSaveStubLastYLo = 0xFFFFu;
static uint32_t gSaveStubLastFnAddr = 0xFFFFFFFFul;
// Restore stub bookkeeping; same roles, with X = backup offset and
// Y = screen offset.
static bool gRestoreStubInited = false;
static uint16_t gRestoreStubLastXLo = 0xFFFFu;
static uint16_t gRestoreStubLastYLo = 0xFFFFu;
static uint32_t gRestoreStubLastFnAddr= 0xFFFFFFFFul;
// patchMvnBanks writes the destination and source bank operands into
// every MVN of an emitted save/restore routine. Routine layout, from
// spriteEmitIigs.c::emitMvnCopyRoutine:
//   row 0 (6 bytes):          A9 lo hi 54 db sb
//   row R (12 bytes, R>=1):   8A/98 18 69 lo hi AA/A8 A9 lo hi 54 db sb
//   end (1 byte):             6B
// so row R's MVN dstbk sits at offset 12*R + 4 and srcbk at 12*R + 5.
//   routine  - first byte of the emitted routine (modified in place)
//   heightPx - number of rows, i.e. number of MVNs to patch
//   dstBank  - bank byte stored as each MVN's destination operand
//   srcBank  - bank byte stored as each MVN's source operand
static void patchMvnBanks(uint8_t *routine, uint16_t heightPx, uint8_t dstBank, uint8_t srcBank) {
    /* Offset of the current row's dstbk operand; consecutive rows'
     * operands are 12 bytes apart. */
    unsigned operandAt = 4u;
    uint16_t rowsLeft;

    for (rowsLeft = heightPx; rowsLeft != 0; rowsLeft--) {
        routine[operandAt]      = dstBank;
        routine[operandAt + 1u] = srcBank;
        operandAt += 12u;
    }
}
// Split a 24-bit pointer into its low 16 bits + bank byte. The
// (uint32_t) cast works correctly in ORCA/C 2.2.1 (the 2.1.0 lossy-
// bank-byte bug is fixed). To avoid invoking the ~LSHR4 32-bit-shift
// helper for the `>> 16` to extract the bank byte, we cast to
// uint32_t and then byte-alias the storage -- gets the same bytes
// with simple loads.
//
//   _ptr     - pointer to split (evaluated once)
//   _outLo   - uint16_t * receiving address bits 0..15
//   _outBank - uint8_t * receiving address bits 16..23
//
// The byte aliasing reads byte 0 as the low byte and byte 2 as the
// bank, so it depends on the uint32_t being stored least-significant
// byte first.
#define SPLIT_POINTER(_ptr, _outLo, _outBank) \
do { \
uint32_t spAddr_ = (uint32_t)(_ptr); \
const uint8_t *spB_ = (const uint8_t *)&spAddr_; \
*(_outLo) = (uint16_t)(spB_[0] | ((uint16_t)spB_[1] << 8)); \
*(_outBank) = spB_[2]; \
} while (0)
// Backup-buffer pointer split cache. backup->bytes is a user-supplied
// buffer (e.g. a static array) and effectively never changes after
// the first call -- caching its split saves both Save and Restore the
// macro expansion per frame.
//
// This is a single-entry cache keyed on the pointer value and shared
// by Save and Restore: gLastBackupBytes is the pointer last split,
// and the Lo / Bank variables hold its split result.
static const void *gLastBackupBytes = (const void *)0;
static uint16_t gLastBackupBytesLo = 0;
static uint8_t gLastBackupBytesBank = 0;
// SPLIT_BACKUP_CACHED(_bytes, _outLo, _outBank): same outputs as
// SPLIT_POINTER. When _bytes equals the cached pointer the stored
// halves are copied out; otherwise the pointer is split afresh and
// the cache entry is replaced. Note that _bytes is expanded more
// than once, so it should be a side-effect-free expression.
#define SPLIT_BACKUP_CACHED(_bytes, _outLo, _outBank) \
do { \
if ((const void *)(_bytes) == gLastBackupBytes) { \
*(_outLo) = gLastBackupBytesLo; \
*(_outBank) = gLastBackupBytesBank; \
} else { \
SPLIT_POINTER((_bytes), (_outLo), (_outBank)); \
gLastBackupBytes = (const void *)(_bytes); \
gLastBackupBytesLo = *(_outLo); \
gLastBackupBytesBank = *(_outBank); \
} \
} while (0)
// IIgs compiled save-under: records the saved rectangle in *backup,
// patches the save stub and (when the banks changed) the MVN bank
// operands of this sprite's SAVE routine, then JSLs through the stub
// with X = screen offset (source) and Y = backup offset (destination).
//   src    - surface the pixels are read from
//   sp     - sprite; sp->slot, routineOffsets and the cached* arrays
//            are used without validation here
//   x, y   - sprite position; x is rounded down to an even pixel and
//            x & 1 selects the shift variant
//   backup - receives sprite, x, y, width, height and sizeBytes;
//            backup->bytes is the destination buffer
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
uint8_t shift;
int16_t clippedX;
uint16_t widthPx;
uint16_t heightPx;
uint16_t copyBytes;
uint16_t screenLo;
uint16_t backupLo;
uint8_t screenBank;
uint8_t backupBank;
uint32_t fnAddr;
uint8_t *routine;
uint8_t *screenPtr;
uint16_t cacheIdx; /* shift * SPRITE_OP_COUNT + SPRITE_OP_SAVE, computed once */
uint8_t *cachedDst; /* &sp->cachedDstBank[0][0] + cacheIdx */
uint8_t *cachedSrc; /* &sp->cachedSrcBank[0][0] + cacheIdx */
uint16_t routineOffset; /* sp->routineOffsets[shift][SPRITE_OP_SAVE], computed once */
shift = (uint8_t)(x & 1);
clippedX = (int16_t)(x & ~1);
widthPx = (uint16_t)(sp->widthTiles * 8);
heightPx = (uint16_t)(sp->heightTiles * 8);
// Bytes per row: half the pixel width, plus one extra byte when the
// sprite starts on an odd pixel.
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
screenPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)clippedX >> 1)];
SPLIT_POINTER(screenPtr, &screenLo, &screenBank);
SPLIT_BACKUP_CACHED(backup->bytes, &backupLo, &backupBank);
backup->sprite = sp;
backup->x = clippedX;
backup->y = y;
// Width is stored as copyBytes * 2; the restore side halves it again
// to recover the row byte count and from that the shift.
backup->width = (uint16_t)(copyBytes << 1);
backup->height = heightPx;
/* sizeBytes is constant per (sprite, shift); cache to dodge the
* per-call ~CUMUL2 (uint16_t * uint16_t) helper. The byte-pointer
* arithmetic avoids reintroducing ~MUL4 for the uint16_t array
* indexing. A zero cache entry is treated as "not computed yet". */
{
uint16_t *sizeCachePtr = (uint16_t *)((uint8_t *)sp->cachedSizeBytes + ((uint16_t)shift << 1));
if (*sizeCachePtr == 0) {
*sizeCachePtr = (uint16_t)(copyBytes * heightPx);
}
backup->sizeBytes = *sizeCachePtr;
}
/* Compute the 1D index into the cached* / routineOffsets 2D arrays
* once. ORCA-C 2.2.1 lowers `shift * SPRITE_OP_COUNT` (where
* SPRITE_OP_COUNT==3) to a ~MUL4 helper call; (shift<<1)+shift
* compiles to two ASLs and an ADC, no helper. */
cacheIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_SAVE);
cachedDst = (uint8_t *)sp->cachedDstBank + cacheIdx;
cachedSrc = (uint8_t *)sp->cachedSrcBank + cacheIdx;
/* Same byte-pointer trick as SURFACE_ROW_OFFSET to dodge ~MUL4. */
routineOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (cacheIdx << 1));
fnAddr = codegenArenaBaseAddr()
+ sp->slot->offset
+ (uint32_t)routineOffset;
// Stub: X = screen (source), Y = backup (destination).
// Fixed opcode bytes (PHB, LDX #, LDY #, JSL, PLB, RTL) are written
// on the first call only.
if (!gSaveStubInited) {
gSpriteSaveStub[ 0] = 0x8B;
gSpriteSaveStub[ 1] = 0xA2;
gSpriteSaveStub[ 4] = 0xA0;
gSpriteSaveStub[ 7] = 0x22;
gSpriteSaveStub[11] = 0xAB;
gSpriteSaveStub[12] = 0x6B;
gSaveStubInited = true;
}
// Re-stamp each operand only when it differs from the last value
// written into the stub.
if (screenLo != gSaveStubLastXLo) {
gSpriteSaveStub[ 2] = (unsigned char)(screenLo & 0xFFu);
gSpriteSaveStub[ 3] = (unsigned char)((screenLo >> 8) & 0xFFu);
gSaveStubLastXLo = screenLo;
}
if (backupLo != gSaveStubLastYLo) {
gSpriteSaveStub[ 5] = (unsigned char)(backupLo & 0xFFu);
gSpriteSaveStub[ 6] = (unsigned char)((backupLo >> 8) & 0xFFu);
gSaveStubLastYLo = backupLo;
}
if (fnAddr != gSaveStubLastFnAddr) {
/* Byte-alias the uint32_t to grab the 3 bank/lo/hi bytes
* without invoking ~LSHR4 for the >>16. */
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
gSpriteSaveStub[ 8] = fnB_[0];
gSpriteSaveStub[ 9] = fnB_[1];
gSpriteSaveStub[10] = fnB_[2];
gSaveStubLastFnAddr = fnAddr;
}
// Skip the 16+ MVN-bank rewrites if the dst/src bank pair is the
// same as last call. The pair is remembered per sprite, per
// (shift, op), in sp->cachedDstBank / sp->cachedSrcBank.
if (*cachedDst != backupBank || *cachedSrc != screenBank) {
routine = codegenArenaBase() + sp->slot->offset + routineOffset;
patchMvnBanks(routine, heightPx, /*dst*/backupBank, /*src*/screenBank);
*cachedDst = backupBank;
*cachedSrc = screenBank;
}
// MVN-based routine: needs M=16 / X=16; restore M=16 on exit
// matches ORCA-C `longa on` epilogue expectations.
asm {
rep #0x30
jsl gSpriteSaveStub
rep #0x30
}
}
// IIgs compiled restore-under: copies the rectangle described by
// *backup from backup->bytes back onto the surface, using the
// sprite's RESTORE routine with X = backup offset (source) and
// Y = screen offset (destination).
//   dst    - surface the saved pixels are written back to
//   backup - record filled in by spriteCompiledSaveUnder; its sprite,
//            x, y, width, height and bytes fields are read here
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
uint8_t shift;
uint16_t heightPx;
uint16_t copyBytes;
uint16_t spriteBytesPerRow;
uint16_t screenLo;
uint16_t backupLo;
uint8_t screenBank;
uint8_t backupBank;
uint32_t fnAddr;
uint8_t *routine;
uint8_t *screenPtr;
SpriteT *sp;
uint16_t cacheIdx; /* shift * SPRITE_OP_COUNT + SPRITE_OP_RESTORE, computed once */
uint8_t *cachedDst;
uint8_t *cachedSrc;
uint16_t routineOffset;
sp = backup->sprite;
heightPx = backup->height;
// backup->x was rounded down to an even pixel at save time, so the
// shift is recovered from the saved row size instead: a row of
// exactly widthTiles * 4 bytes means shift 0, anything else shift 1.
copyBytes = (uint16_t)(backup->width >> 1);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1;
screenPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)];
SPLIT_POINTER(screenPtr, &screenLo, &screenBank);
SPLIT_BACKUP_CACHED(backup->bytes, &backupLo, &backupBank);
/* Hoist 2D-array indexing -- see save-side comment. */
cacheIdx = (uint16_t)(((uint16_t)shift << 1) + (uint16_t)shift + SPRITE_OP_RESTORE);
cachedDst = (uint8_t *)sp->cachedDstBank + cacheIdx;
cachedSrc = (uint8_t *)sp->cachedSrcBank + cacheIdx;
/* Same byte-pointer trick as SURFACE_ROW_OFFSET to dodge ~MUL4. */
routineOffset = *(uint16_t *)((uint8_t *)sp->routineOffsets + (cacheIdx << 1));
fnAddr = codegenArenaBaseAddr()
+ sp->slot->offset
+ (uint32_t)routineOffset;
// Stub: X = backup (source), Y = screen (destination).
// Fixed opcode bytes (PHB, LDX #, LDY #, JSL, PLB, RTL) are written
// on the first call only.
if (!gRestoreStubInited) {
gSpriteRestoreStub[ 0] = 0x8B;
gSpriteRestoreStub[ 1] = 0xA2;
gSpriteRestoreStub[ 4] = 0xA0;
gSpriteRestoreStub[ 7] = 0x22;
gSpriteRestoreStub[11] = 0xAB;
gSpriteRestoreStub[12] = 0x6B;
gRestoreStubInited = true;
}
// Re-stamp each operand only when it differs from the last value
// written into the stub.
if (backupLo != gRestoreStubLastXLo) {
gSpriteRestoreStub[ 2] = (unsigned char)(backupLo & 0xFFu);
gSpriteRestoreStub[ 3] = (unsigned char)((backupLo >> 8) & 0xFFu);
gRestoreStubLastXLo = backupLo;
}
if (screenLo != gRestoreStubLastYLo) {
gSpriteRestoreStub[ 5] = (unsigned char)(screenLo & 0xFFu);
gSpriteRestoreStub[ 6] = (unsigned char)((screenLo >> 8) & 0xFFu);
gRestoreStubLastYLo = screenLo;
}
if (fnAddr != gRestoreStubLastFnAddr) {
// Byte-alias the 32-bit address to pick out its low three bytes
// for the JSL operand.
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
gSpriteRestoreStub[ 8] = fnB_[0];
gSpriteRestoreStub[ 9] = fnB_[1];
gSpriteRestoreStub[10] = fnB_[2];
gRestoreStubLastFnAddr = fnAddr;
}
// Same short-circuit as save: only re-stamp the bank operands if
// they actually changed since last call. Note the roles are swapped
// relative to save: the screen is the MVN destination here.
if (*cachedDst != screenBank || *cachedSrc != backupBank) {
routine = codegenArenaBase() + sp->slot->offset + routineOffset;
patchMvnBanks(routine, heightPx, /*dst*/screenBank, /*src*/backupBank);
*cachedDst = screenBank;
*cachedSrc = backupBank;
}
// As on the save side: enter with M=16 / X=16 for the MVN routine
// and force M=16 / X=16 again before returning to compiled C.
asm {
rep #0x30
jsl gSpriteRestoreStub
rep #0x30
}
}
#elif defined(JOEYLIB_PLATFORM_AMIGA)
/* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with
 * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to
 * bitplanes. Compute byteOff = y*40 + x/8 and pass plane[i]+byteOff
 * as the 4 plane args. shift = x % 8 selects the variant: DRAW has
 * one pre-shifted routine per shift 0..7, while SAVE/RESTORE are
 * emitted for shift 0 and shift 1 only and spriteCompile aliases
 * shifts 2..7 to shift 1's bytes.
 *
 * These dispatchers do not check routineOffsets themselves; callers
 * should already have gated on routineOffsets[shift][op] !=
 * SPRITE_NOT_COMPILED. When that gate fails, the dispatcher in
 * src/core/sprite.c (spriteDraw / spriteSaveUnder / spriteRestoreUnder)
 * falls back to the interpreter, which handles arbitrary x via
 * halSpriteDrawPlanes / halSpriteSavePlanes / halSpriteRestorePlanes. */
/* Bytes per bitplane row used for the y * stride term below. */
#define AMIGA_BYTES_PER_ROW_LOCAL 40

/* Amiga compiled draw: looks up the four bitplane pointers of dst,
 * offsets each by the byte position of (x, y), and calls the DRAW
 * routine emitted for shift x & 7.
 *   dst  - destination surface; nothing is drawn if plane 0 is NULL
 *   sp   - compiled sprite; sp->slot and routineOffsets are used
 *          without validation here
 *   x, y - destination position in pixels */
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*DrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
    const uint8_t  variant  = (uint8_t)(x & 7);
    const uint16_t planeOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)x >> 3));
    uint8_t       *plane[4];
    uint8_t       *code;

    plane[0] = halSurfacePlanePtr(dst, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(dst, 1);
    plane[2] = halSurfacePlanePtr(dst, 2);
    plane[3] = halSurfacePlanePtr(dst, 3);

    code = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_DRAW];
    ((DrawFn)code)(plane[0] + planeOff, plane[1] + planeOff, plane[2] + planeOff, plane[3] + planeOff);
}
/* Amiga compiled save-under: fills in *backup with the rectangle
 * being saved, then calls the SAVE routine for shift x & 7 to copy
 * the four bitplanes' bytes into backup->bytes.
 *   src    - surface the pixels are read from
 *   sp     - compiled sprite (slot / routineOffsets not validated)
 *   x, y   - sprite position; x is rounded down to a multiple of 8
 *   backup - receives sprite, x, y, width, height, sizeBytes; these
 *            are written even when plane 0 is NULL and nothing is
 *            copied */
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    typedef void (*SaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *backup);
    const uint8_t  variant  = (uint8_t)(x & 7);
    const int16_t  alignedX = (int16_t)(x & ~7);
    const uint16_t rows     = (uint16_t)(sp->heightTiles * 8);
    const uint16_t planeOff = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)alignedX >> 3));
    uint16_t       spanPx   = (uint16_t)(sp->widthTiles * 8);
    uint8_t       *plane[4];
    uint8_t       *code;

    /* A non-zero shift spills into one extra plane byte per row,
     * i.e. 8 more pixels of saved width. */
    if (variant != 0u) {
        spanPx = (uint16_t)(spanPx + 8u);
    }

    backup->sprite = sp;
    backup->x      = alignedX;
    backup->y      = y;
    backup->width  = spanPx;
    backup->height = rows;
    /* 4 planes * rows * (spanPx / 8) bytes == rows * spanPx / 2. */
    backup->sizeBytes = (uint16_t)((uint16_t)rows * (spanPx >> 1));

    plane[0] = halSurfacePlanePtr(src, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(src, 1);
    plane[2] = halSurfacePlanePtr(src, 2);
    plane[3] = halSurfacePlanePtr(src, 3);

    code = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_SAVE];
    ((SaveFn)code)(plane[0] + planeOff, plane[1] + planeOff, plane[2] + planeOff, plane[3] + planeOff, backup->bytes);
}
/* Amiga compiled restore-under: writes the bytes held in
 * backup->bytes back into the four bitplanes of dst at the position
 * recorded in *backup.
 *   dst    - surface to restore into; nothing happens if plane 0 is NULL
 *   backup - record produced by spriteCompiledSaveUnder */
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    typedef void (*RestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *backup);
    SpriteT        *owner    = backup->sprite;
    const uint16_t  planeOff = (uint16_t)((uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)backup->x >> 3));
    uint8_t         variant;
    uint8_t        *plane[4];
    uint8_t        *code;

    /* backup->x was aligned to 8 px at save time, so it cannot tell
     * us the original shift. The saved width can: exactly
     * widthTiles*8 means shift 0, anything wider means a shifted
     * save. All non-zero shifts share the same restore bytes, so
     * slot 1 stands in for any of them. */
    if (backup->width > (uint16_t)(owner->widthTiles * 8)) {
        variant = (uint8_t)1u;
    } else {
        variant = (uint8_t)0u;
    }

    plane[0] = halSurfacePlanePtr(dst, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(dst, 1);
    plane[2] = halSurfacePlanePtr(dst, 2);
    plane[3] = halSurfacePlanePtr(dst, 3);

    code = codegenArenaBase() + owner->slot->offset + owner->routineOffsets[variant][SPRITE_OP_RESTORE];
    ((RestoreFn)code)(plane[0] + planeOff, plane[1] + planeOff, plane[2] + planeOff, plane[3] + planeOff, backup->bytes);
}
#else
// Generic (non-IIgs, non-Amiga) compiled draw: the emitted routine
// is called as a cdecl function taking the address of the first
// destination byte.
//   dst  - destination surface
//   sp   - compiled sprite; sp->slot and routineOffsets are used
//          without validation here
//   x, y - destination position; x & 1 selects the shift variant and
//          x >> 1 is the byte column within row y
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*DrawFn)(uint8_t *destRow);
    const uint8_t  parity = (uint8_t)(x & 1);
    uint8_t       *target = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
    uint8_t       *code   = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[parity][SPRITE_OP_DRAW];

    ((DrawFn)code)(target);
}
// x86 / 68k compiled save-under. The emitted bytes are a cdecl
//     void copy(const uint8_t *src, uint8_t *dst)
// that walks heightPx rows of copyBytes each, reading from the
// screen (stride SURFACE_BYTES_PER_ROW) and writing into the
// contiguous backup buffer.
//   src    - surface the pixels are read from
//   sp     - compiled sprite (slot / routineOffsets not validated)
//   x, y   - sprite position; x is rounded down to an even pixel and
//            x & 1 selects the shift variant
//   backup - receives sprite, x, y, width, height, sizeBytes;
//            backup->bytes is the copy destination
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst);
    const uint8_t  parity   = (uint8_t)(x & 1);
    const int16_t  evenX    = (int16_t)(x & ~1);
    const uint16_t spritePx = (uint16_t)(sp->widthTiles * 8);
    const uint16_t rows     = (uint16_t)(sp->heightTiles * 8);
    uint16_t       rowBytes = (uint16_t)(spritePx >> 1);
    uint8_t       *screen;
    uint8_t       *code;

    /* An odd start pixel straddles one extra byte per row. */
    if (parity == 1) {
        rowBytes = (uint16_t)(rowBytes + 1);
    }

    screen = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)evenX >> 1)];

    backup->sprite    = sp;
    backup->x         = evenX;
    backup->y         = y;
    backup->width     = (uint16_t)(rowBytes << 1);
    backup->height    = rows;
    backup->sizeBytes = (uint16_t)(rowBytes * rows);

    code = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[parity][SPRITE_OP_SAVE];
    ((CopyFn)code)(screen, backup->bytes);
}
// x86 / 68k compiled restore-under; mirror of save with the argument
// order swapped so the copy runs backup -> screen. The screen-side
// stride is baked into the emitted bytes, which is why RESTORE has
// its own routine (stride applied to dst rather than src).
//   dst    - surface the saved pixels are written back to
//   backup - record produced by spriteCompiledSaveUnder
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst);
    SpriteT        *owner     = backup->sprite;
    const uint16_t  savedRow  = (uint16_t)(backup->width >> 1);
    const uint16_t  spriteRow = (uint16_t)(owner->widthTiles * 4);
    uint8_t         parity;
    uint8_t        *screen;
    uint8_t        *code;

    /* The saved row is exactly the sprite's own row size only for an
     * even-x (shift 0) save; any other size means shift 1. */
    if (savedRow == spriteRow) {
        parity = 0;
    } else {
        parity = 1;
    }

    screen = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];
    code   = codegenArenaBase() + owner->slot->offset + owner->routineOffsets[parity][SPRITE_OP_RESTORE];
    ((CopyFn)code)(backup->bytes, screen);
}
#endif