760 lines
29 KiB
C
760 lines
29 KiB
C
// Cross-platform sprite codegen runtime. spriteCompile uses the
// per-CPU emit functions selected at compile time: it runs a sizing
// pass into a scratch buffer, allocates a slot of that size in the
// codegen arena, emits the routines directly into the slot, and
// populates sp->slot + sp->routineOffsets. On x86 / 68k the
// spriteCompiled* entry points cast the routine address to a function
// pointer and call it through cdecl; on IIgs they JSL through a small
// self-patched call stub instead.
//
// Each per-CPU emitter (src/codegen/spriteEmit{X86,68k,Planar68k,Iigs}.c)
// just produces bytes; this file is the only consumer of the
// codegen arena from the sprite side.
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "joey/sprite.h"
|
|
#include "joey/surface.h"
|
|
#include "codegenArenaInternal.h"
|
|
#include "hal.h"
|
|
#include "spriteEmitter.h"
|
|
#include "spriteInternal.h"
|
|
#include "surfaceInternal.h"
|
|
|
|
|
|
// Largest scratch buffer needed for any single emit call. 16 KB
|
|
// covers a 32x32 sprite even on 68k (the biggest mixed-RMW byte-
|
|
// emit at 16 bytes/byte * (16*17 dest bytes per shift) ~= 4.5 KB,
|
|
// times shift count 2). Round up generously.
|
|
#define SPRITE_EMIT_SCRATCH_BYTES (16u * 1024u)
|
|
|
|
|
|
// Compile-time selection of the per-CPU emitter. One src/codegen/
|
|
// spriteEmit*.c file is built per platform, but the dispatch lives
|
|
// in this file so spriteCompile + spriteCompiledDraw aren't
|
|
// duplicated three times.
|
|
// Emits the DRAW routine for one shift variant of `sp` into `out` by
// forwarding to the emitter built for the current platform.
//
//   out   - destination buffer for the emitted machine code
//   sp    - sprite being compiled
//   shift - shift variant to emit
//
// Returns whatever the per-CPU emitter returns; spriteCompile treats
// it as the byte count written (0 == nothing emitted for this shift).
// A build with no recognised platform macro is a hard compile error.
static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;

#if defined(JOEYLIB_PLATFORM_DOS)
    emitted = spriteEmitDrawX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA)
    emitted = spriteEmitDrawPlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
    emitted = spriteEmitDraw68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
    emitted = spriteEmitDrawIigs(out, sp, shift);
#else
# error "spriteCompile: no emitter selected for this platform"
#endif

    return emitted;
}
|
|
|
|
|
|
// Save-under and restore-under emit dispatch. Each per-CPU pair
|
|
// produces row-by-row copy bytes; the runtime dispatch in
|
|
// src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_SAVE]
|
|
// != SPRITE_NOT_COMPILED and falls back to the interpreted memcpy
|
|
// path otherwise.
|
|
// Emits the SAVE (screen -> backup) routine for one shift variant of
// `sp` into `out` via the current platform's emitter.
//
// Returns the emitter's result, which spriteCompile treats as the
// number of bytes written. On a platform with no save emitter the
// arguments are ignored and 0 is returned, which spriteCompile records
// as SPRITE_NOT_COMPILED.
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;

#if defined(JOEYLIB_PLATFORM_DOS)
    emitted = spriteEmitSaveX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA)
    emitted = spriteEmitSavePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
    emitted = spriteEmitSave68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
    emitted = spriteEmitSaveIigs(out, sp, shift);
#else
    (void)out;
    (void)sp;
    (void)shift;
    emitted = 0;
#endif

    return emitted;
}
|
|
|
|
|
|
// Emits the RESTORE (backup -> screen) routine for one shift variant
// of `sp` into `out` via the current platform's emitter.
//
// Returns the emitter's result, which spriteCompile treats as the
// number of bytes written. On a platform with no restore emitter the
// arguments are ignored and 0 is returned, which spriteCompile records
// as SPRITE_NOT_COMPILED.
static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t emitted;

#if defined(JOEYLIB_PLATFORM_DOS)
    emitted = spriteEmitRestoreX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA)
    emitted = spriteEmitRestorePlanar68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_ATARIST)
    emitted = spriteEmitRestore68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
    emitted = spriteEmitRestoreIigs(out, sp, shift);
#else
    (void)out;
    (void)sp;
    (void)shift;
    emitted = 0;
#endif

    return emitted;
}
|
|
|
|
|
|
// Sizing pass: returns total bytes the emitters will produce for
|
|
// this sprite's DRAW (per shift) + SAVE (per shift) + RESTORE (per
|
|
// shift). Emitters that aren't implemented for the current platform
|
|
// return 0 here, so totalSize tracks only the ops that will actually
|
|
// land in the arena.
|
|
static uint32_t emitTotalSize(uint8_t *scratch, const SpriteT *sp) {
|
|
uint32_t total;
|
|
uint8_t shift;
|
|
|
|
total = 0;
|
|
for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
|
|
total += emitDrawForTarget(scratch, sp, shift);
|
|
total += emitSaveForTarget(scratch, sp, shift);
|
|
total += emitRestoreForTarget(scratch, sp, shift);
|
|
}
|
|
return total;
|
|
}
|
|
|
|
|
|
// Compiles `sp` into native routines stored in the codegen arena.
//
// Steps:
//   1. Sizing pass: every emitter runs once into a temporary scratch
//      buffer so the exact arena footprint is known.
//   2. A slot of that size is allocated from the codegen arena.
//   3. Emit pass: the emitters run again, this time writing directly
//      into the slot, and each routine's offset within the slot is
//      recorded in sp->routineOffsets[shift][op]. Ops that emit zero
//      bytes are recorded as SPRITE_NOT_COMPILED.
//
// Returns true if the sprite is compiled (including the case where it
// already had a slot), false if sp is NULL, has no tile data, the
// scratch allocation fails, the routines would not fit in 16-bit
// offsets, or the arena has no room. On failure sp is left untouched.
bool spriteCompile(SpriteT *sp) {
    uint8_t    *scratch;
    uint32_t    totalSize;
    uint8_t     shift;
    uint8_t     op;
    ArenaSlotT *slot;
    uint8_t    *dst;
    uint16_t    written;
    uint16_t    offset;

    if (sp == NULL) {
        return false;
    }
    if (sp->slot != NULL) {
        /* Already compiled; nothing to do. */
        return true;
    }
    if (sp->tileData == NULL) {
        return false;
    }

    /* Amiga (post-Phase 9) uses spriteEmitPlanar68k.c which writes
     * directly to bitplanes. DRAW emits a unique pre-shifted variant
     * per shift in 0..7 (smooth horizontal motion at any pixel x);
     * SAVE/RESTORE emit only shift 0 and shift 1 since shifted variants
     * 1..7 share identical bytes (plain memcpy of widthTiles+1 plane
     * bytes per row). The post-emit pass below aliases slots 2..7
     * for save/restore to slot 1's bytes. */

    scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES);
    if (scratch == NULL) {
        return false;
    }
    totalSize = emitTotalSize(scratch, sp);
    /* Only the sizing pass reads the scratch buffer; the emit pass
     * below writes straight into the arena. Release it now so it is
     * not held across the arena allocation and so no later exit path
     * has to remember to free it. */
    free(scratch);

    /* Routine offsets are stored as uint16_t, so the whole slot must
     * be addressable with 16 bits. */
    if (totalSize > 0xFFFFu) {
        return false;
    }

    slot = codegenArenaAlloc(totalSize);
    if (slot == NULL) {
        return false;
    }

    dst    = codegenArenaBase() + slot->offset;
    offset = 0;
    for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
        for (op = 0; op < SPRITE_OP_COUNT; op++) {
            switch (op) {
                case SPRITE_OP_DRAW:    written = emitDrawForTarget   (dst + offset, sp, shift); break;
                case SPRITE_OP_SAVE:    written = emitSaveForTarget   (dst + offset, sp, shift); break;
                case SPRITE_OP_RESTORE: written = emitRestoreForTarget(dst + offset, sp, shift); break;
                default:                written = 0;                                             break;
            }
            if (written == 0) {
                sp->routineOffsets[shift][op] = SPRITE_NOT_COMPILED;
            } else {
                sp->routineOffsets[shift][op] = offset;
                offset = (uint16_t)(offset + written);
            }
        }
    }

#if defined(JOEYLIB_PLATFORM_AMIGA)
    /* Save/restore bytes for any non-zero shift are identical (plain
     * memcpy of widthTiles+1 plane bytes per row). The emitter emits
     * them once at slot 1; alias slots 2..7 here so the dispatcher
     * gate (sprite.c) sees them as compiled. */
    for (shift = 2; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
        sp->routineOffsets[shift][SPRITE_OP_SAVE]    = sp->routineOffsets[1][SPRITE_OP_SAVE];
        sp->routineOffsets[shift][SPRITE_OP_RESTORE] = sp->routineOffsets[1][SPRITE_OP_RESTORE];
    }
#endif

    /* Publish the slot last: sp->slot != NULL is the "compiled" flag
     * tested at the top of this function. */
    sp->slot = slot;
    return true;
}
|
|
|
|
|
|
#if defined(JOEYLIB_PLATFORM_IIGS)
|
|
|
|
// SURFACE_ROW_OFFSET dispatches to the gRowOffsetLut lookup on IIgs;
|
|
// declared in surfaceInternal.h. Replaces ORCA-C's __mul16 JSL with a
|
|
// single indexed long-mode read.
|
|
|
|
// IIgs uses inline asm + a self-modifying call stub instead of a C
|
|
// function-pointer cast. The build uses ORCA-C large memory model
|
|
// (-b for sprite demos) so pointers are 24-bit and JSL works
|
|
// cross-bank.
|
|
//
|
|
// `sta abs,Y` on 65816 uses the data bank register (DBR) for the
|
|
// high byte of the effective address, so we need DBR = dst's bank
|
|
// during the body. malloc under -b can return memory in any bank,
|
|
// so we don't trust DBR to already match -- the stub explicitly
|
|
// sets DBR from the dst pointer's bank byte and restores it before
|
|
// returning to C.
|
|
//
|
|
// Stub layout (14 bytes):
|
|
// 00: 8B PHB ; save caller DBR
|
|
// 01: A9 bk LDA #destBank ; A = dst bank (8-bit M)
|
|
// 03: 48 PHA
|
|
// 04: AB PLB ; DBR = dst bank
|
|
// 05: A0 lo hi LDY #destOffset ; Y = low 16 of dst (X=16)
|
|
// 08: 22 lo mid bk JSL routine
|
|
// 0C: AB PLB ; restore caller DBR
|
|
// 0D: 6B RTL
|
|
//
|
|
// Patched per call: byte 2 (destBank), bytes 6-7 (destOffset16),
|
|
// bytes 9-11 (target 24-bit). The compiled routine assumes
|
|
// M=8 / X=16 / Y=destOffset on entry; the stub arranges that.
|
|
//
|
|
// Stub bytes are split into two phases:
|
|
// 1. The 8 opcode bytes are written ONCE on first call (gDrawStubInited).
|
|
// 2. Of the 6 operand bytes, only those that actually changed since
|
|
// the previous call get re-stamped: destBank and fnAddr are cached
|
|
// and rarely change (per-shift / per-bank). destOffset is the only
|
|
// one that changes every call as the sprite moves. Net per-frame
|
|
// patching for the typical case drops from 14 stores to 2.
|
|
static unsigned char gSpriteCallStub[14];
|
|
static bool gDrawStubInited = false;
|
|
static uint8_t gDrawStubLastBank = 0xFF;
|
|
static uint32_t gDrawStubLastFnAddr = 0xFFFFFFFFul;
|
|
|
|
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
|
|
uint8_t shift;
|
|
uint32_t destAddr;
|
|
uint16_t destOffset;
|
|
uint8_t destBank;
|
|
uint32_t fnAddr;
|
|
|
|
{
|
|
uint8_t *destPtr;
|
|
shift = (uint8_t)(x & 1);
|
|
destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
|
|
destAddr = (uint32_t)destPtr;
|
|
destOffset = (uint16_t)(destAddr & 0xFFFFu);
|
|
destBank = (uint8_t)((destAddr >> 16) & 0xFFu);
|
|
fnAddr = codegenArenaBaseAddr()
|
|
+ sp->slot->offset
|
|
+ (uint32_t)sp->routineOffsets[shift][SPRITE_OP_DRAW];
|
|
}
|
|
(void)destAddr;
|
|
|
|
if (!gDrawStubInited) {
|
|
gSpriteCallStub[ 0] = 0x8B;
|
|
gSpriteCallStub[ 1] = 0xA9;
|
|
gSpriteCallStub[ 3] = 0x48;
|
|
gSpriteCallStub[ 4] = 0xAB;
|
|
gSpriteCallStub[ 5] = 0xA0;
|
|
gSpriteCallStub[ 8] = 0x22;
|
|
gSpriteCallStub[12] = 0xAB;
|
|
gSpriteCallStub[13] = 0x6B;
|
|
gDrawStubInited = true;
|
|
}
|
|
|
|
// destOffset always changes (sprite moves every frame).
|
|
gSpriteCallStub[ 6] = (unsigned char)(destOffset & 0xFFu);
|
|
gSpriteCallStub[ 7] = (unsigned char)((destOffset >> 8) & 0xFFu);
|
|
|
|
// destBank only changes if the dst surface migrates banks (~never).
|
|
if (destBank != gDrawStubLastBank) {
|
|
gSpriteCallStub[ 2] = destBank;
|
|
gDrawStubLastBank = destBank;
|
|
}
|
|
|
|
// fnAddr changes only on shift parity flips or sprite swaps.
|
|
if (fnAddr != gDrawStubLastFnAddr) {
|
|
const uint8_t *fnB_ = (const uint8_t *)&fnAddr;
|
|
gSpriteCallStub[ 9] = fnB_[0];
|
|
gSpriteCallStub[10] = fnB_[1];
|
|
gSpriteCallStub[11] = fnB_[2];
|
|
gDrawStubLastFnAddr = fnAddr;
|
|
}
|
|
|
|
// ORCA-C compiles this function under `longa on / longi on`
|
|
// (M=16, X=16) and emits the function epilogue assuming those
|
|
// widths at exit -- the deallocation ADC takes a 2-byte immediate
|
|
// and any LDX/LDY use 2-byte immediates. The byte writes to
|
|
// gSpriteCallStub above leave M=8, and an earlier PHP/PLP-only
|
|
// wrapper let the asm block exit in the wrong M state. The
|
|
// epilogue's `ADC #imm; TCS` then decoded as a wider ADC that
|
|
// swallowed the TCS, S was never adjusted, RTL popped wrong
|
|
// bytes, control fell into BSS, and the IIgs hit BRK on a zero
|
|
// byte. Force M=16/X=16 before returning to compiled C.
|
|
asm {
|
|
rep #0x30
|
|
sep #0x20
|
|
jsl gSpriteCallStub
|
|
rep #0x30
|
|
}
|
|
}
|
|
|
|
|
|
// Save/Restore call stub. The compiled MVN-row routines need
|
|
// M=16 / X=16 and expect index registers preset to source/dest
|
|
// offsets within their respective banks. MVN's own bank operands
|
|
// are in the routine bytes (patched per call), so this stub doesn't
|
|
// need to load DBR -- it just sets X and Y, JSLs, and restores DBR
|
|
// (MVN itself sets DBR to its destination bank as a side effect).
|
|
//
|
|
// Stub layout (13 bytes):
|
|
// 00: 8B PHB ; save caller DBR
|
|
// 01: A2 lo hi LDX #srcOffset
|
|
// 04: A0 lo hi LDY #dstOffset
|
|
// 07: 22 lo mid bk JSL routine
|
|
// 0B: AB PLB ; restore caller DBR
|
|
// 0C: 6B RTL
|
|
//
|
|
// For SAVE: X = screen lo, Y = backup lo
|
|
// For RESTORE: X = backup lo, Y = screen lo
|
|
//
|
|
// Two distinct stubs (one per op) instead of a shared one. Save and
|
|
// restore alternate every frame and they swap the X/Y meanings, so a
|
|
// shared stub forced a full re-stamp on every call. Per-op stubs let
|
|
// us cache: only the bytes that genuinely change frame-to-frame
|
|
// (typically just one of screenLo/backupLo as the sprite moves) get
|
|
// rewritten. Cuts per-call patching from 13 stores to 2 in the typical
|
|
// case (static backup buffer, stable shift parity).
|
|
// SAVE stub: 13 code bytes plus the operand values currently stamped
// into it (X operand, Y operand, JSL target), so unchanged operands
// are not rewritten.
static unsigned char gSpriteSaveStub[13];
static bool          gSaveStubInited     = false;
static uint16_t      gSaveStubLastXLo    = 0xFFFFu;
static uint16_t      gSaveStubLastYLo    = 0xFFFFu;
static uint32_t      gSaveStubLastFnAddr = 0xFFFFFFFFul;

// RESTORE stub: same shape and same caching scheme as the SAVE stub,
// kept separate because the X/Y roles are swapped between the two ops.
static unsigned char gSpriteRestoreStub[13];
static bool          gRestoreStubInited     = false;
static uint16_t      gRestoreStubLastXLo    = 0xFFFFu;
static uint16_t      gRestoreStubLastYLo    = 0xFFFFu;
static uint32_t      gRestoreStubLastFnAddr = 0xFFFFFFFFul;
|
|
|
|
|
|
// patchMvnBanks stamps the destination and source bank operand bytes
|
|
// into each MVN inside an emitted save/restore routine. Layout from
|
|
// spriteEmitIigs.c::emitMvnCopyRoutine:
|
|
// row 0 (6 bytes): A9 lo hi 54 db sb
|
|
// row R (12 bytes, R>=1): 8A/98 18 69 lo hi AA/A8 A9 lo hi 54 db sb
|
|
// end (1 byte): 6B
|
|
// MVN dstbk is at offset (12*R + 4); srcbk at (12*R + 5).
|
|
// Writes the MVN bank operands of every row in an emitted save/restore
// routine. Rows are 12 bytes apart and the operand pair sits at bytes
// 4 (destination bank) and 5 (source bank) of each row, so the writes
// land at 4/5, 16/17, 28/29, ...
//
//   routine  - first byte of the emitted routine
//   heightPx - number of rows (one MVN per row); 0 writes nothing
//   dstBank  - bank byte stored as the MVN destination operand
//   srcBank  - bank byte stored as the MVN source operand
static void patchMvnBanks(uint8_t *routine, uint16_t heightPx, uint8_t dstBank, uint8_t srcBank) {
    unsigned operandAt = 4u;    /* index of the current row's dst-bank byte */
    uint16_t rowsLeft;

    for (rowsLeft = heightPx; rowsLeft != 0u; rowsLeft--) {
        routine[operandAt]      = dstBank;
        routine[operandAt + 1u] = srcBank;
        operandAt += 12u;
    }
}
|
|
|
|
|
|
|
|
// Split a 24-bit pointer into its low 16 bits + bank byte. The
|
|
// (uint32_t) cast works correctly in ORCA/C 2.2.1 (the 2.1.0 lossy-
|
|
// bank-byte bug is fixed). To avoid invoking the ~LSHR4 32-bit-shift
|
|
// helper for the `>> 16` to extract the bank byte, we cast to
|
|
// uint32_t and then byte-alias the storage -- gets the same bytes
|
|
// with simple loads.
|
|
// SPLIT_POINTER(ptr, &lo, &bank): converts `ptr` to a uint32_t and
// reads that value back byte by byte, storing bytes 0-1 (as a
// little-endian 16-bit value) in *lo and byte 2 in *bank. Reading the
// bytes directly avoids a 32-bit shift to reach the bank byte.
#define SPLIT_POINTER(_ptr, _outLo, _outBank)                               \
    do {                                                                    \
        uint32_t       splitAddr_  = (uint32_t)(_ptr);                      \
        const uint8_t *splitBytes_ = (const uint8_t *)&splitAddr_;          \
        uint16_t       splitLo_    = (uint16_t)splitBytes_[1];              \
        splitLo_    = (uint16_t)((splitLo_ << 8) | splitBytes_[0]);         \
        *(_outLo)   = splitLo_;                                             \
        *(_outBank) = splitBytes_[2];                                       \
    } while (0)

// One-entry cache of the most recently split backup-buffer pointer:
// the pointer itself plus its offset/bank halves. Shared by the save
// and restore paths.
static const void *gLastBackupBytes     = (const void *)0;
static uint16_t    gLastBackupBytesLo   = 0;
static uint8_t     gLastBackupBytesBank = 0;

// SPLIT_BACKUP_CACHED(bytes, &lo, &bank): same result as SPLIT_POINTER,
// but the split is only recomputed when `bytes` differs from the
// pointer remembered in gLastBackupBytes; otherwise the remembered
// halves are returned. `bytes` may be evaluated more than once.
#define SPLIT_BACKUP_CACHED(_bytes, _outLo, _outBank)                       \
    do {                                                                    \
        if ((const void *)(_bytes) != gLastBackupBytes) {                   \
            SPLIT_POINTER((_bytes), &gLastBackupBytesLo,                    \
                          &gLastBackupBytesBank);                           \
            gLastBackupBytes = (const void *)(_bytes);                      \
        }                                                                   \
        *(_outLo)   = gLastBackupBytesLo;                                   \
        *(_outBank) = gLastBackupBytesBank;                                 \
    } while (0)
|
|
|
|
|
|
// IIgs compiled save-under: copies the screen rectangle the sprite is
// about to cover into backup->bytes using the sprite's compiled MVN
// routine, and fills in the bookkeeping fields of `backup`.
//
//   src    - surface to read from
//   sp     - compiled sprite (sp->slot must be valid)
//   x, y   - sprite position; x is rounded down to an even pixel and
//            bit 0 selects the shift variant (odd x copies one extra
//            byte per row)
//   backup - receives the saved bytes and the rectangle description
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    uint8_t   parity;
    int16_t   evenX;
    uint16_t  pixelsWide;
    uint16_t  rows;
    uint16_t  rowBytes;
    uint16_t  fromLo;        /* screen offset within its bank */
    uint16_t  toLo;          /* backup offset within its bank */
    uint8_t   fromBank;
    uint8_t   toBank;
    uint8_t  *fromPtr;
    uint16_t *sizeSlot;
    uint16_t  flatIdx;       /* shift * SPRITE_OP_COUNT + SPRITE_OP_SAVE */
    uint8_t  *dstBankSlot;
    uint8_t  *srcBankSlot;
    uint16_t  codeOffset;
    uint32_t  routineAddr;

    parity     = (uint8_t)(x & 1);
    evenX      = (int16_t)(x & ~1);
    pixelsWide = (uint16_t)(sp->widthTiles * 8);
    rows       = (uint16_t)(sp->heightTiles * 8);
    rowBytes   = (uint16_t)(pixelsWide >> 1);
    if (parity == 1) {
        rowBytes = (uint16_t)(rowBytes + 1);
    }

    fromPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)evenX >> 1)];
    SPLIT_POINTER(fromPtr, &fromLo, &fromBank);
    SPLIT_BACKUP_CACHED(backup->bytes, &toLo, &toBank);

    backup->sprite = sp;
    backup->x      = evenX;
    backup->y      = y;
    backup->width  = (uint16_t)(rowBytes << 1);
    backup->height = rows;

    /* sizeBytes depends only on (sprite, shift), so it is computed
     * once and kept in sp->cachedSizeBytes (0 == not yet computed).
     * The slot is addressed with byte arithmetic to keep ORCA-C from
     * calling a multiply helper for the array index. */
    sizeSlot = (uint16_t *)((uint8_t *)sp->cachedSizeBytes + ((uint16_t)parity << 1));
    if (*sizeSlot == 0) {
        *sizeSlot = (uint16_t)(rowBytes * rows);
    }
    backup->sizeBytes = *sizeSlot;

    /* Flat index into the [shift][op] tables, written as
     * (shift << 1) + shift so no multiply helper is needed for
     * SPRITE_OP_COUNT == 3. */
    flatIdx     = (uint16_t)(((uint16_t)parity << 1) + (uint16_t)parity + SPRITE_OP_SAVE);
    dstBankSlot = (uint8_t *)sp->cachedDstBank + flatIdx;
    srcBankSlot = (uint8_t *)sp->cachedSrcBank + flatIdx;
    codeOffset  = *(uint16_t *)((uint8_t *)sp->routineOffsets + (flatIdx << 1));

    routineAddr = codegenArenaBaseAddr()
                + sp->slot->offset
                + (uint32_t)codeOffset;

    /* Stub operands for SAVE: X = screen (source), Y = backup (dest). */
    if (!gSaveStubInited) {
        gSpriteSaveStub[ 0] = 0x8B;     /* PHB         */
        gSpriteSaveStub[ 1] = 0xA2;     /* LDX #src    */
        gSpriteSaveStub[ 4] = 0xA0;     /* LDY #dst    */
        gSpriteSaveStub[ 7] = 0x22;     /* JSL routine */
        gSpriteSaveStub[11] = 0xAB;     /* PLB         */
        gSpriteSaveStub[12] = 0x6B;     /* RTL         */
        gSaveStubInited = true;
    }
    if (gSaveStubLastXLo != fromLo) {
        gSpriteSaveStub[ 2] = (unsigned char)(fromLo & 0xFFu);
        gSpriteSaveStub[ 3] = (unsigned char)((fromLo >> 8) & 0xFFu);
        gSaveStubLastXLo    = fromLo;
    }
    if (gSaveStubLastYLo != toLo) {
        gSpriteSaveStub[ 5] = (unsigned char)(toLo & 0xFFu);
        gSpriteSaveStub[ 6] = (unsigned char)((toLo >> 8) & 0xFFu);
        gSaveStubLastYLo    = toLo;
    }
    if (gSaveStubLastFnAddr != routineAddr) {
        /* Low three bytes of the address, read through a byte pointer
         * rather than shifted out. */
        const uint8_t *addrBytes = (const uint8_t *)&routineAddr;

        gSpriteSaveStub[ 8] = addrBytes[0];
        gSpriteSaveStub[ 9] = addrBytes[1];
        gSpriteSaveStub[10] = addrBytes[2];
        gSaveStubLastFnAddr = routineAddr;
    }

    /* Re-stamp the per-row MVN bank operands only when the bank pair
     * differs from what this routine was last patched with. */
    if (!(*dstBankSlot == toBank && *srcBankSlot == fromBank)) {
        patchMvnBanks(codegenArenaBase() + sp->slot->offset + codeOffset,
                      rows, /*dst*/toBank, /*src*/fromBank);
        *dstBankSlot = toBank;
        *srcBankSlot = fromBank;
    }

    // The MVN routine runs with M=16 / X=16; the trailing REP puts the
    // CPU back in the width state ORCA-C's `longa on` epilogue expects.
    asm {
        rep #0x30
        jsl gSpriteSaveStub
        rep #0x30
    }
}
|
|
|
|
|
|
// IIgs compiled restore-under: copies the bytes held in `backup` back
// onto the surface at the position recorded by the matching save,
// using the sprite's compiled MVN routine.
//
//   dst    - surface to write to
//   backup - record filled in by spriteCompiledSaveUnder
//
// The shift variant is recovered from backup->width: a row exactly as
// wide as the sprite means shift 0, anything else means shift 1.
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    SpriteT  *sprite;
    uint8_t   parity;
    uint16_t  rows;
    uint16_t  rowBytes;
    uint16_t  unshiftedRowBytes;
    uint16_t  toLo;          /* screen offset within its bank */
    uint16_t  fromLo;        /* backup offset within its bank */
    uint8_t   toBank;
    uint8_t   fromBank;
    uint8_t  *toPtr;
    uint16_t  flatIdx;       /* shift * SPRITE_OP_COUNT + SPRITE_OP_RESTORE */
    uint8_t  *dstBankSlot;
    uint8_t  *srcBankSlot;
    uint16_t  codeOffset;
    uint32_t  routineAddr;

    sprite            = backup->sprite;
    rows              = backup->height;
    rowBytes          = (uint16_t)(backup->width >> 1);
    unshiftedRowBytes = (uint16_t)(sprite->widthTiles * 4);
    if (rowBytes == unshiftedRowBytes) {
        parity = 0;
    } else {
        parity = 1;
    }

    toPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)];
    SPLIT_POINTER(toPtr, &toLo, &toBank);
    SPLIT_BACKUP_CACHED(backup->bytes, &fromLo, &fromBank);

    /* Flat index into the [shift][op] tables, written as
     * (shift << 1) + shift so no multiply helper is needed; the
     * tables are then addressed with byte arithmetic for the same
     * reason. */
    flatIdx     = (uint16_t)(((uint16_t)parity << 1) + (uint16_t)parity + SPRITE_OP_RESTORE);
    dstBankSlot = (uint8_t *)sprite->cachedDstBank + flatIdx;
    srcBankSlot = (uint8_t *)sprite->cachedSrcBank + flatIdx;
    codeOffset  = *(uint16_t *)((uint8_t *)sprite->routineOffsets + (flatIdx << 1));

    routineAddr = codegenArenaBaseAddr()
                + sprite->slot->offset
                + (uint32_t)codeOffset;

    /* Stub operands for RESTORE: X = backup (source), Y = screen (dest). */
    if (!gRestoreStubInited) {
        gSpriteRestoreStub[ 0] = 0x8B;  /* PHB         */
        gSpriteRestoreStub[ 1] = 0xA2;  /* LDX #src    */
        gSpriteRestoreStub[ 4] = 0xA0;  /* LDY #dst    */
        gSpriteRestoreStub[ 7] = 0x22;  /* JSL routine */
        gSpriteRestoreStub[11] = 0xAB;  /* PLB         */
        gSpriteRestoreStub[12] = 0x6B;  /* RTL         */
        gRestoreStubInited = true;
    }
    if (gRestoreStubLastXLo != fromLo) {
        gSpriteRestoreStub[ 2] = (unsigned char)(fromLo & 0xFFu);
        gSpriteRestoreStub[ 3] = (unsigned char)((fromLo >> 8) & 0xFFu);
        gRestoreStubLastXLo    = fromLo;
    }
    if (gRestoreStubLastYLo != toLo) {
        gSpriteRestoreStub[ 5] = (unsigned char)(toLo & 0xFFu);
        gSpriteRestoreStub[ 6] = (unsigned char)((toLo >> 8) & 0xFFu);
        gRestoreStubLastYLo    = toLo;
    }
    if (gRestoreStubLastFnAddr != routineAddr) {
        /* Low three bytes of the address, read through a byte pointer
         * rather than shifted out. */
        const uint8_t *addrBytes = (const uint8_t *)&routineAddr;

        gSpriteRestoreStub[ 8] = addrBytes[0];
        gSpriteRestoreStub[ 9] = addrBytes[1];
        gSpriteRestoreStub[10] = addrBytes[2];
        gRestoreStubLastFnAddr = routineAddr;
    }

    /* Re-stamp the per-row MVN bank operands only when the bank pair
     * differs from what this routine was last patched with. */
    if (!(*dstBankSlot == toBank && *srcBankSlot == fromBank)) {
        patchMvnBanks(codegenArenaBase() + sprite->slot->offset + codeOffset,
                      rows, /*dst*/toBank, /*src*/fromBank);
        *dstBankSlot = toBank;
        *srcBankSlot = fromBank;
    }

    // M=16 / X=16 for the MVN routine, and again on the way out for the
    // compiler-generated epilogue.
    asm {
        rep #0x30
        jsl gSpriteRestoreStub
        rep #0x30
    }
}
|
|
|
|
#elif defined(JOEYLIB_PLATFORM_AMIGA)
|
|
|
|
/* Amiga planar dispatchers. spriteEmitPlanar68k.c emits routines with
 * cdecl(p0, p1, p2, p3[, backup]) signatures that write directly to
 * bitplanes. Compute byteOff = y*40 + x/8 and pass plane[i]+byteOff
 * as the 4 plane args. shift = x % 8 selects the variant: DRAW has a
 * distinct pre-shifted routine per shift 0..7, while SAVE/RESTORE use
 * one routine for shift 0 and a shared routine for shifts 1..7
 * (spriteCompile aliases slots 2..7 to slot 1).
 *
 * Callers should already have gated on
 * routineOffsets[shift][op] != SPRITE_NOT_COMPILED. When an op is not
 * compiled for a shift, the dispatcher in src/core/sprite.c
 * (spriteDraw / spriteSaveUnder / spriteRestoreUnder) falls back to
 * the interpreter, which handles arbitrary x via halSpriteDrawPlanes /
 * halSpriteSavePlanes / halSpriteRestorePlanes. */
|
|
|
|
#define AMIGA_BYTES_PER_ROW_LOCAL 40
|
|
|
|
// Amiga compiled draw. Looks up the four bitplane pointers of `dst`,
// offsets each by the byte position of (x, y), and calls the sprite's
// compiled DRAW routine for shift x & 7. Does nothing if plane 0 is
// unavailable.
//
//   dst  - surface to draw into
//   sp   - compiled sprite (sp->slot must be valid)
//   x, y - destination position in pixels
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*PlanarDrawFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3);
    uint8_t      *plane[4];
    uint8_t       variant;
    uint16_t      rowStart;
    PlanarDrawFn  routine;

    variant  = (uint8_t)(x & 7);
    rowStart = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)x >> 3));

    plane[0] = halSurfacePlanePtr(dst, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(dst, 1);
    plane[2] = halSurfacePlanePtr(dst, 2);
    plane[3] = halSurfacePlanePtr(dst, 3);

    routine = (PlanarDrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_DRAW]);
    routine(plane[0] + rowStart, plane[1] + rowStart, plane[2] + rowStart, plane[3] + rowStart);
}
|
|
|
|
|
|
// Amiga compiled save-under. Records the rectangle being saved in
// `backup`, then calls the sprite's compiled SAVE routine with the
// four offset bitplane pointers and backup->bytes.
//
//   src    - surface to read from
//   sp     - compiled sprite (sp->slot must be valid)
//   x, y   - sprite position; x is rounded down to a multiple of 8 and
//            x & 7 selects the shift variant
//   backup - receives the saved bytes and the rectangle description
//
// The backup fields are filled in before the plane lookup, so they are
// set even when plane 0 is unavailable and nothing is copied.
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    typedef void (*PlanarSaveFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, uint8_t *saved);
    uint8_t      *plane[4];
    uint8_t       variant;
    int16_t       alignedX;
    uint16_t      spanPx;
    uint16_t      rows;
    uint16_t      rowStart;
    PlanarSaveFn  routine;

    variant  = (uint8_t)(x & 7);
    alignedX = (int16_t)(x & ~7);
    rows     = (uint16_t)(sp->heightTiles * 8);
    spanPx   = (uint16_t)(sp->widthTiles * 8);
    if (variant != 0u) {
        /* A shifted sprite spills into one more plane byte per row,
         * i.e. 8 more pixels. */
        spanPx = (uint16_t)(spanPx + 8u);
    }
    rowStart = (uint16_t)((uint16_t)y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)alignedX >> 3));

    backup->sprite    = sp;
    backup->x         = alignedX;
    backup->y         = y;
    backup->width     = spanPx;
    backup->height    = rows;
    /* 4 planes * rows * (spanPx / 8) bytes == rows * spanPx / 2. */
    backup->sizeBytes = (uint16_t)((uint16_t)rows * (spanPx >> 1));

    plane[0] = halSurfacePlanePtr(src, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(src, 1);
    plane[2] = halSurfacePlanePtr(src, 2);
    plane[3] = halSurfacePlanePtr(src, 3);

    routine = (PlanarSaveFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[variant][SPRITE_OP_SAVE]);
    routine(plane[0] + rowStart, plane[1] + rowStart, plane[2] + rowStart, plane[3] + rowStart, backup->bytes);
}
|
|
|
|
|
|
// Amiga compiled restore-under. Calls the sprite's compiled RESTORE
// routine to copy backup->bytes back to the four bitplanes at the
// position recorded in `backup`. Does nothing if plane 0 is
// unavailable.
//
//   dst    - surface to write to
//   backup - record filled in by spriteCompiledSaveUnder
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    typedef void (*PlanarRestoreFn)(uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, const uint8_t *saved);
    uint8_t         *plane[4];
    SpriteT         *sprite;
    uint8_t          variant;
    uint16_t         rowStart;
    PlanarRestoreFn  routine;

    sprite = backup->sprite;

    /* backup->x was aligned to 8 px by the save, so it no longer
     * carries the shift. The width does: equal to widthTiles*8 means
     * shift 0, wider means the sprite was shifted. All shifted
     * variants use the same restore bytes, so slot 1 serves for any
     * non-zero shift. */
    if (backup->width > (uint16_t)(sprite->widthTiles * 8)) {
        variant = (uint8_t)1u;
    } else {
        variant = (uint8_t)0u;
    }
    rowStart = (uint16_t)((uint16_t)backup->y * AMIGA_BYTES_PER_ROW_LOCAL + ((uint16_t)backup->x >> 3));

    plane[0] = halSurfacePlanePtr(dst, 0);
    if (plane[0] == NULL) {
        return;
    }
    plane[1] = halSurfacePlanePtr(dst, 1);
    plane[2] = halSurfacePlanePtr(dst, 2);
    plane[3] = halSurfacePlanePtr(dst, 3);

    routine = (PlanarRestoreFn)(codegenArenaBase() + sprite->slot->offset + sprite->routineOffsets[variant][SPRITE_OP_RESTORE]);
    routine(plane[0] + rowStart, plane[1] + rowStart, plane[2] + rowStart, plane[3] + rowStart, backup->bytes);
}
|
|
|
|
#else
|
|
|
|
// x86 / 68k compiled draw. Calls the sprite's compiled DRAW routine
// for shift x & 1, passing the address of the destination byte that
// holds pixel (x, y).
//
//   dst  - surface to draw into
//   sp   - compiled sprite (sp->slot must be valid)
//   x, y - destination position in pixels
void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) {
    typedef void (*ChunkyDrawFn)(uint8_t *destRow);
    uint8_t      *code;
    uint8_t       parity;
    uint8_t      *firstByte;

    parity    = (uint8_t)(x & 1);
    firstByte = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
    code      = codegenArenaBase() + sp->slot->offset + sp->routineOffsets[parity][SPRITE_OP_DRAW];

    ((ChunkyDrawFn)code)(firstByte);
}
|
|
|
|
|
|
// x86 / 68k compiled save: bytes are a cdecl
|
|
// void copy(const uint8_t *src, uint8_t *dst)
|
|
// that walks heightPx rows of copyBytes from screen (stride
|
|
// SURFACE_BYTES_PER_ROW) into the contiguous backup buffer.
|
|
// x86 / 68k compiled save-under. Describes the rectangle in `backup`
// and calls the sprite's compiled SAVE routine, a cdecl
//     void copy(const uint8_t *src, uint8_t *dst)
// with src = first screen byte of the rectangle, dst = backup->bytes.
//
//   src    - surface to read from
//   sp     - compiled sprite (sp->slot must be valid)
//   x, y   - sprite position; x is rounded down to an even pixel and
//            bit 0 selects the shift variant (odd x copies one extra
//            byte per row)
//   backup - receives the saved bytes and the rectangle description
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
    typedef void (*RowCopyFn)(const uint8_t *src, uint8_t *dst);
    uint8_t    parity;
    int16_t    evenX;
    uint16_t   pixelsWide;
    uint16_t   rows;
    uint16_t   rowBytes;
    uint8_t   *from;
    RowCopyFn  routine;

    parity     = (uint8_t)(x & 1);
    evenX      = (int16_t)(x & ~1);
    pixelsWide = (uint16_t)(sp->widthTiles * 8);
    rows       = (uint16_t)(sp->heightTiles * 8);
    rowBytes   = (uint16_t)(pixelsWide >> 1);
    if (parity == 1) {
        rowBytes = (uint16_t)(rowBytes + 1);
    }

    from = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)evenX >> 1)];

    backup->sprite    = sp;
    backup->x         = evenX;
    backup->y         = y;
    backup->width     = (uint16_t)(rowBytes << 1);
    backup->height    = rows;
    backup->sizeBytes = (uint16_t)(rowBytes * rows);

    routine = (RowCopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[parity][SPRITE_OP_SAVE]);
    routine(from, backup->bytes);
}
|
|
|
|
|
|
// Mirror of save: caller swaps arg order so the same emitted shape
|
|
// drives backup -> screen. The screen-side stride lives inside the
|
|
// emitted bytes, so RESTORE has its own routine bytes (stride is
|
|
// applied to dst instead of src).
|
|
// x86 / 68k compiled restore-under. Calls the sprite's compiled
// RESTORE routine, the same cdecl copy shape as SAVE with the
// arguments swapped: src = backup->bytes, dst = first screen byte of
// the saved rectangle.
//
//   dst    - surface to write to
//   backup - record filled in by spriteCompiledSaveUnder
//
// The shift variant is recovered from backup->width: a row exactly as
// wide as the sprite means shift 0, anything else means shift 1.
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
    typedef void (*RowCopyFn)(const uint8_t *src, uint8_t *dst);
    SpriteT   *sprite;
    uint8_t    parity;
    uint16_t   rowBytes;
    uint16_t   unshiftedRowBytes;
    uint8_t   *to;
    RowCopyFn  routine;

    sprite            = backup->sprite;
    rowBytes          = (uint16_t)(backup->width >> 1);
    unshiftedRowBytes = (uint16_t)(sprite->widthTiles * 4);
    if (rowBytes == unshiftedRowBytes) {
        parity = 0;
    } else {
        parity = 1;
    }

    to = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];

    routine = (RowCopyFn)(codegenArenaBase() + sprite->slot->offset + sprite->routineOffsets[parity][SPRITE_OP_RESTORE]);
    routine(backup->bytes, to);
}
|
|
|
|
#endif
|