Sprite save/restore for x86 and 68k.
This commit is contained in:
parent
6f37b126b8
commit
c4ee37941a
7 changed files with 409 additions and 98 deletions
|
|
@ -42,11 +42,17 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Save-under and restore-under emitters are IIgs-only at the moment;
|
// Save-under and restore-under emit dispatch. Each per-CPU pair
|
||||||
// other CPUs return 0, the runtime treats that as "not compiled" and
|
// produces row-by-row copy bytes; the runtime dispatch in
|
||||||
// falls back to spriteSaveUnderInterpreted / spriteRestoreUnderInterpreted.
|
// src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_SAVE]
|
||||||
|
// != SPRITE_NOT_COMPILED and falls back to the interpreted memcpy
|
||||||
|
// path otherwise.
|
||||||
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
#if defined(JOEYLIB_PLATFORM_IIGS)
|
#if defined(JOEYLIB_PLATFORM_DOS)
|
||||||
|
return spriteEmitSaveX86(out, sp, shift);
|
||||||
|
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
|
||||||
|
return spriteEmitSave68k(out, sp, shift);
|
||||||
|
#elif defined(JOEYLIB_PLATFORM_IIGS)
|
||||||
return spriteEmitSaveIigs(out, sp, shift);
|
return spriteEmitSaveIigs(out, sp, shift);
|
||||||
#else
|
#else
|
||||||
(void)out; (void)sp; (void)shift;
|
(void)out; (void)sp; (void)shift;
|
||||||
|
|
@ -56,7 +62,11 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
|
||||||
|
|
||||||
|
|
||||||
static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
#if defined(JOEYLIB_PLATFORM_IIGS)
|
#if defined(JOEYLIB_PLATFORM_DOS)
|
||||||
|
return spriteEmitRestoreX86(out, sp, shift);
|
||||||
|
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
|
||||||
|
return spriteEmitRestore68k(out, sp, shift);
|
||||||
|
#elif defined(JOEYLIB_PLATFORM_IIGS)
|
||||||
return spriteEmitRestoreIigs(out, sp, shift);
|
return spriteEmitRestoreIigs(out, sp, shift);
|
||||||
#else
|
#else
|
||||||
(void)out; (void)sp; (void)shift;
|
(void)out; (void)sp; (void)shift;
|
||||||
|
|
@ -416,18 +426,62 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Non-IIgs platforms have no compiled save/restore yet. The dispatch
|
// x86 / 68k compiled save: bytes are a cdecl
|
||||||
// in src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_*] !=
|
// void copy(const uint8_t *src, uint8_t *dst)
|
||||||
// SPRITE_NOT_COMPILED, so these stubs should never actually run on
|
// that walks heightPx rows of copyBytes from screen (stride
|
||||||
// those platforms; they exist so spriteInternal.h's prototypes stay
|
// SURFACE_BYTES_PER_ROW) into the contiguous backup buffer.
|
||||||
// resolved at link time.
|
|
||||||
// Runs sp's compiled SAVE routine to copy the screen bytes under (x, y) into backup->bytes and fills in backup's bookkeeping fields.
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
|
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
|
||||||
(void)src; (void)sp; (void)x; (void)y; (void)backup;
|
typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst); // shape of the emitted routine: copy(src, dst)
|
||||||
|
uint8_t shift;
|
||||||
|
int16_t clippedX;
|
||||||
|
uint16_t widthPx;
|
||||||
|
uint16_t heightPx;
|
||||||
|
uint16_t copyBytes;
|
||||||
|
uint8_t *screenPtr;
|
||||||
|
CopyFn fn;
|
||||||
|
|
||||||
|
shift = (uint8_t)(x & 1); // 1 when x is odd, 0 when even
|
||||||
|
clippedX = (int16_t)(x & ~1); // x rounded down to an even pixel
|
||||||
|
widthPx = (uint16_t)(sp->widthTiles * 8); // NOTE(review): 8 presumably equals TILE_PIXELS used by the emitters -- confirm
|
||||||
|
heightPx = (uint16_t)(sp->heightTiles * 8);
|
||||||
|
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0)); // widthPx / 2 bytes per row, plus one byte when shift is 1
|
||||||
|
|
||||||
|
screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)]; // row y, byte clippedX / 2; no bounds check is done here
|
||||||
|
|
||||||
|
backup->sprite = sp;
|
||||||
|
backup->x = clippedX;
|
||||||
|
backup->y = y;
|
||||||
|
backup->width = (uint16_t)(copyBytes << 1); // stored as 2 * copyBytes; spriteCompiledRestoreUnder halves it again
|
||||||
|
backup->height = heightPx;
|
||||||
|
backup->sizeBytes = (uint16_t)(copyBytes * heightPx);
|
||||||
|
|
||||||
|
fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]); // arena base + slot offset + per-shift SAVE routine offset
|
||||||
|
fn(screenPtr, backup->bytes); // src = screen, dst = backup buffer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Mirror of save: caller swaps arg order so the same emitted shape
|
||||||
|
// drives backup -> screen. The screen-side stride lives inside the
|
||||||
|
// emitted bytes, so RESTORE has its own routine bytes (stride is
|
||||||
|
// applied to dst instead of src).
|
||||||
// Runs the compiled RESTORE routine of backup->sprite to copy backup->bytes back to the screen position recorded in backup.
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
|
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
|
||||||
(void)dst; (void)backup;
|
typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst); // shape of the emitted routine: copy(src, dst)
|
||||||
|
SpriteT *sp;
|
||||||
|
uint8_t shift;
|
||||||
|
uint16_t copyBytes;
|
||||||
|
uint16_t spriteBytesPerRow;
|
||||||
|
uint8_t *screenPtr;
|
||||||
|
CopyFn fn;
|
||||||
|
|
||||||
|
sp = backup->sprite;
|
||||||
|
copyBytes = (uint16_t)(backup->width >> 1); // undoes the << 1 applied when the backup was saved
|
||||||
|
spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4); // NOTE(review): 4 presumably equals TILE_BYTES_PER_ROW used by the emitters -- confirm
|
||||||
|
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1; // exact sprite row width means shift 0; otherwise the save added the extra byte (shift 1)
|
||||||
|
|
||||||
|
screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)]; // row backup->y, byte backup->x / 2
|
||||||
|
|
||||||
|
fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]); // arena base + slot offset + per-shift RESTORE routine offset
|
||||||
|
fn(backup->bytes, screenPtr); // src = backup buffer, dst = screen
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,12 @@
|
||||||
// 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl-
|
// 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl-
|
||||||
// callable PIC draw routines that write 4bpp packed surface bytes
|
// callable PIC draw / save / restore routines that read or write
|
||||||
// via d16(a0) chains. Same shape as the x86 emitter; only the
|
// 4bpp packed surface bytes (draw: d16(a0) chains; save/restore: (a0)+ -> (a1)+ byte copies). Same shape as the
|
||||||
// instruction encoding differs.
|
// x86 emitter; only the instruction encoding differs.
|
||||||
//
|
//
|
||||||
// Calling convention (m68k gcc / mintlib):
|
// Calling convention (m68k gcc / mintlib):
|
||||||
// void draw(uint8_t *dst); -- arg in 4(sp); a0/a1/d0/d1 caller-saved.
|
// void draw(uint8_t *dst); -- arg in 4(sp)
|
||||||
|
// void save/restore(const uint8_t *src, uint8_t *dst); -- args in 4(sp)/8(sp)
|
||||||
|
// a0/a1/d0/d1 are caller-saved.
|
||||||
//
|
//
|
||||||
// Per-byte emit (no run coalescing yet):
|
// Per-byte emit (no run coalescing yet):
|
||||||
// - all-transparent: skip
|
// - all-transparent: skip
|
||||||
|
|
@ -38,13 +40,46 @@
|
||||||
|
|
||||||
// ----- Prototypes -----
|
// ----- Prototypes -----
|
||||||
|
|
||||||
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
|
static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
|
||||||
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
|
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
|
||||||
|
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
|
||||||
static uint16_t writeBE16(uint8_t *out, uint16_t value);
|
static uint16_t writeBE16(uint8_t *out, uint16_t value);
|
||||||
|
|
||||||
|
|
||||||
// ----- Emit helpers (alphabetical) -----
|
// ----- Emit helpers (alphabetical) -----
|
||||||
|
|
||||||
|
// Shared body for save/restore. Walks heightPx rows of copyBytes
|
||||||
|
// using `move.b (a0)+, (a1)+` byte-wise (safe regardless of pointer
|
||||||
|
// alignment, since the screen-side x can land on an odd byte). After
|
||||||
|
// each row except the last, advances either a0 (SAVE: src=screen) or
|
||||||
|
// a1 (RESTORE: dst=screen) by (SURFACE_BYTES_PER_ROW - copyBytes) so
|
||||||
|
// the strided side lines up with the next scanline; the contiguous
|
||||||
|
// side advances naturally via the post-increment.
|
||||||
|
//
|
||||||
|
// strideOnSrc=true -> source has the screen stride (SAVE)
|
||||||
|
// strideOnSrc=false -> destination has the screen stride (RESTORE)
|
||||||
|
static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
|
||||||
|
uint16_t row;
|
||||||
|
uint16_t col;
|
||||||
|
uint16_t advance;
|
||||||
|
|
||||||
|
advance = (uint16_t)(SURFACE_BYTES_PER_ROW - copyBytes);
|
||||||
|
|
||||||
|
for (row = 0; row < heightPx; row++) {
|
||||||
|
// Unrolled: move.b (a0)+, (a1)+ -- 0x12D8.
|
||||||
|
for (col = 0; col < copyBytes; col++) {
|
||||||
|
cursor += writeBE16(out + cursor, 0x12D8u);
|
||||||
|
}
|
||||||
|
if (row + 1u < heightPx) {
|
||||||
|
// adda.w #advance, a0 (0xD0FC) for SAVE
|
||||||
|
// adda.w #advance, a1 (0xD2FC) for RESTORE
|
||||||
|
cursor += writeBE16(out + cursor, strideOnSrc ? 0xD0FCu : 0xD2FCu);
|
||||||
|
cursor += writeBE16(out + cursor, advance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
// Same logic as the x86 shiftedByteAt -- per-byte transparency
|
// Same logic as the x86 shiftedByteAt -- per-byte transparency
|
||||||
// decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if
|
// decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if
|
||||||
// dest high nibble is opaque, 0x0F if low is opaque.
|
// dest high nibble is opaque, 0x0F if low is opaque.
|
||||||
|
|
@ -184,3 +219,46 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// RESTORE: copy backup -> screen. Destination has the screen stride.
|
||||||
|
uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
|
uint16_t cursor;
|
||||||
|
uint16_t heightPx;
|
||||||
|
uint16_t copyBytes;
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
||||||
|
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
|
||||||
|
|
||||||
|
// Prologue: movea.l 4(sp), a0 (src); movea.l 8(sp), a1 (dst).
|
||||||
|
cursor += writeBE16(out + cursor, 0x206Fu);
|
||||||
|
cursor += writeBE16(out + cursor, 0x0004u);
|
||||||
|
cursor += writeBE16(out + cursor, 0x226Fu);
|
||||||
|
cursor += writeBE16(out + cursor, 0x0008u);
|
||||||
|
|
||||||
|
cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, false);
|
||||||
|
|
||||||
|
cursor += writeBE16(out + cursor, 0x4E75u);
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// SAVE: copy screen -> backup. Source has the screen stride.
|
||||||
|
uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
|
uint16_t cursor;
|
||||||
|
uint16_t heightPx;
|
||||||
|
uint16_t copyBytes;
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
||||||
|
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
|
||||||
|
|
||||||
|
cursor += writeBE16(out + cursor, 0x206Fu);
|
||||||
|
cursor += writeBE16(out + cursor, 0x0004u);
|
||||||
|
cursor += writeBE16(out + cursor, 0x226Fu);
|
||||||
|
cursor += writeBE16(out + cursor, 0x0008u);
|
||||||
|
|
||||||
|
cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, true);
|
||||||
|
|
||||||
|
cursor += writeBE16(out + cursor, 0x4E75u);
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -217,6 +217,36 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
|
|
||||||
|
|
||||||
// 65816 draw emit. Returns bytes written.
|
// 65816 draw emit. Returns bytes written.
|
||||||
|
//
|
||||||
|
// Two emission paths share the body:
|
||||||
|
//
|
||||||
|
// * M=8 byte path (default; matches the stub-set entry mode):
|
||||||
|
// opaque: A9 vv LDA #vv ; 2c
|
||||||
|
// 99 lo hi STA abs,Y ; 5c
|
||||||
|
// ; 7c / 5 bytes per byte
|
||||||
|
// mixed: B9 lo hi LDA abs,Y ; 5c
|
||||||
|
// 29 mm AND #~mask ; 2c
|
||||||
|
// 09 vv ORA #val ; 2c
|
||||||
|
// 99 lo hi STA abs,Y ; 5c
|
||||||
|
// ;14c / 9 bytes per byte
|
||||||
|
//
|
||||||
|
// * M=16 word path (entered around runs of >= 2 consecutive
|
||||||
|
// fully-opaque bytes). Each word write covers 2 dest bytes:
|
||||||
|
// prologue: C2 20 REP #$20 ; 3c
|
||||||
|
// per pair: A9 lo hi LDA #imm16 ; 3c
|
||||||
|
// 99 lo hi STA abs,Y ; 6c
|
||||||
|
// ; 9c / 6 bytes per pair
|
||||||
|
// epilogue: E2 20 SEP #$20 ; 3c
|
||||||
|
//
|
||||||
|
// vs. M=8 path doing the same 2 bytes: 14c / 10 bytes. Per-pair
|
||||||
|
// savings are 5c / 4 bytes; the 6c/4-byte REP+SEP transition is
|
||||||
|
// amortized once per opaque run, so the path is profitable for
|
||||||
|
// runs of 2 pairs (4 consecutive opaque bytes) or longer. For
|
||||||
|
// isolated pairs we still take the M=16 path -- the 1-cycle loss
|
||||||
|
// vs. M=8 is dwarfed by the typical-sprite opaque-run length.
|
||||||
|
//
|
||||||
|
// Mixed bytes always run on the M=8 path because the AND/ORA in
|
||||||
|
// M=16 would clobber the adjacent byte.
|
||||||
uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
uint16_t cursor;
|
uint16_t cursor;
|
||||||
uint16_t row;
|
uint16_t row;
|
||||||
|
|
@ -227,11 +257,15 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
uint16_t absOffset;
|
uint16_t absOffset;
|
||||||
uint8_t value;
|
uint8_t value;
|
||||||
uint8_t opaqueMask;
|
uint8_t opaqueMask;
|
||||||
|
uint8_t nextValue;
|
||||||
|
uint8_t nextOpaqueMask;
|
||||||
|
bool wide;
|
||||||
|
|
||||||
cursor = 0;
|
cursor = 0;
|
||||||
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
||||||
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
|
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
|
||||||
destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
|
destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
|
||||||
|
wide = false;
|
||||||
|
|
||||||
// No prologue: caller (the inline-asm stub in spriteCompile.c)
|
// No prologue: caller (the inline-asm stub in spriteCompile.c)
|
||||||
// sets M=8/X=16/Y=destRow before JSL'ing here.
|
// sets M=8/X=16/Y=destRow before JSL'ing here.
|
||||||
|
|
@ -243,18 +277,42 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col);
|
absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col);
|
||||||
|
|
||||||
|
if (opaqueMask == 0xFFu && (col + 1) < destBytesPerRow) {
|
||||||
|
// Look ahead: if (col, col+1) are both fully opaque
|
||||||
|
// we can pair them as a single M=16 word write.
|
||||||
|
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift,
|
||||||
|
spriteBytesPerRow, &nextValue, &nextOpaqueMask);
|
||||||
|
if (nextOpaqueMask == 0xFFu) {
|
||||||
|
if (!wide) {
|
||||||
|
out[cursor++] = 0xC2; // REP #$20 -- M=16
|
||||||
|
out[cursor++] = 0x20;
|
||||||
|
wide = true;
|
||||||
|
}
|
||||||
|
out[cursor++] = 0xA9; // LDA #imm16
|
||||||
|
cursor += writeLE16(out + cursor,
|
||||||
|
(uint16_t)(((uint16_t)nextValue << 8) | value));
|
||||||
|
out[cursor++] = 0x99; // STA abs,Y
|
||||||
|
cursor += writeLE16(out + cursor, absOffset);
|
||||||
|
col++; // consumed col+1
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Falls through here for: isolated opaque (no pair), mixed,
|
||||||
|
// or the trailing odd byte at the right edge. All on M=8.
|
||||||
|
if (wide) {
|
||||||
|
out[cursor++] = 0xE2; // SEP #$20 -- back to M=8
|
||||||
|
out[cursor++] = 0x20;
|
||||||
|
wide = false;
|
||||||
|
}
|
||||||
|
|
||||||
if (opaqueMask == 0xFFu) {
|
if (opaqueMask == 0xFFu) {
|
||||||
// lda #imm A9 ii
|
|
||||||
// sta abs,Y 99 lo hi
|
|
||||||
out[cursor++] = 0xA9;
|
out[cursor++] = 0xA9;
|
||||||
out[cursor++] = value;
|
out[cursor++] = value;
|
||||||
out[cursor++] = 0x99;
|
out[cursor++] = 0x99;
|
||||||
cursor += writeLE16(out + cursor, absOffset);
|
cursor += writeLE16(out + cursor, absOffset);
|
||||||
} else {
|
} else {
|
||||||
// lda abs,Y B9 lo hi
|
|
||||||
// and #mask 29 mm
|
|
||||||
// ora #val 09 vv
|
|
||||||
// sta abs,Y 99 lo hi
|
|
||||||
out[cursor++] = 0xB9;
|
out[cursor++] = 0xB9;
|
||||||
cursor += writeLE16(out + cursor, absOffset);
|
cursor += writeLE16(out + cursor, absOffset);
|
||||||
out[cursor++] = 0x29;
|
out[cursor++] = 0x29;
|
||||||
|
|
@ -267,6 +325,16 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Routine exits in M=8: the JSL stub assumes M=8 throughout (the
|
||||||
|
// stub itself only ever ran with M=8 and doesn't restore M). The
|
||||||
|
// asm wrapper after the JSL forces M=16 again, but be defensive
|
||||||
|
// and ensure we leave M=8 here so the stub's PLB/RTL run as
|
||||||
|
// expected even if the wrapper convention changes.
|
||||||
|
if (wide) {
|
||||||
|
out[cursor++] = 0xE2;
|
||||||
|
out[cursor++] = 0x20;
|
||||||
|
}
|
||||||
|
|
||||||
// Epilogue: rtl (large memory model -b uses JSL/RTL).
|
// Epilogue: rtl (large memory model -b uses JSL/RTL).
|
||||||
out[cursor++] = 0x6B;
|
out[cursor++] = 0x6B;
|
||||||
return cursor;
|
return cursor;
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,11 @@
|
||||||
// x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
|
// x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
|
||||||
// draw routines that write 4bpp packed surface bytes via
|
// draw / save / restore routines that read or write 4bpp packed
|
||||||
// [esi+disp8] chains. The C side calls them through a function
|
// surface bytes (draw: [esi+disp8] chains; save/restore: rep movsd / rep movsb). The C side calls them
|
||||||
// pointer cast.
|
// through a function pointer cast.
|
||||||
//
|
//
|
||||||
// Calling convention:
|
// Calling convention:
|
||||||
// draw(uint8_t *dst) -- esi advances row by row
|
// draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi
|
||||||
//
|
// save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save
|
||||||
// Save and restore are not compiled -- they're uniform memcpy-
|
|
||||||
// shaped operations and the C interpreter handles them at memcpy
|
|
||||||
// speed via the standard library.
|
|
||||||
//
|
//
|
||||||
// Per-byte emit (no run coalescing yet):
|
// Per-byte emit (no run coalescing yet):
|
||||||
// - all-transparent (both nibbles 0): skip, no instruction
|
// - all-transparent (both nibbles 0): skip, no instruction
|
||||||
|
|
@ -45,12 +42,69 @@
|
||||||
|
|
||||||
// ----- Prototypes -----
|
// ----- Prototypes -----
|
||||||
|
|
||||||
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
|
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
|
||||||
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
|
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
|
||||||
|
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
|
||||||
|
|
||||||
|
|
||||||
// ----- Emit helpers (alphabetical) -----
|
// ----- Emit helpers (alphabetical) -----
|
||||||
|
|
||||||
|
// Shared body for save/restore. Walks heightPx rows of copyBytes
|
||||||
|
// using rep movsd for the dword-aligned bulk and rep movsb for the
|
||||||
|
// byte tail. After each row except the last, advances either esi
|
||||||
|
// or edi by (SURFACE_BYTES_PER_ROW - copyBytes) so the strided side
|
||||||
|
// (screen) lines up with the next scanline; the contiguous side
|
||||||
|
// (backup) advances naturally because rep movs* leaves the index
|
||||||
|
// register one past the last byte copied.
|
||||||
|
//
|
||||||
|
// strideOnSrc=true -> source has the screen stride (SAVE)
|
||||||
|
// strideOnSrc=false -> destination has the screen stride (RESTORE)
|
||||||
|
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
|
||||||
|
uint16_t row;
|
||||||
|
uint16_t dwords;
|
||||||
|
uint16_t tail;
|
||||||
|
int32_t advance;
|
||||||
|
|
||||||
|
dwords = (uint16_t)(copyBytes >> 2);
|
||||||
|
tail = (uint16_t)(copyBytes & 0x3u);
|
||||||
|
advance = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes;
|
||||||
|
|
||||||
|
for (row = 0; row < heightPx; row++) {
|
||||||
|
if (dwords > 0) {
|
||||||
|
// mov ecx, dwords (B9 imm32); rep movsd (F3 A5)
|
||||||
|
out[cursor++] = 0xB9;
|
||||||
|
out[cursor++] = (uint8_t)(dwords & 0xFFu);
|
||||||
|
out[cursor++] = (uint8_t)((dwords >> 8) & 0xFFu);
|
||||||
|
out[cursor++] = 0;
|
||||||
|
out[cursor++] = 0;
|
||||||
|
out[cursor++] = 0xF3;
|
||||||
|
out[cursor++] = 0xA5;
|
||||||
|
}
|
||||||
|
if (tail > 0) {
|
||||||
|
// mov ecx, tail (B9 imm32); rep movsb (F3 A4)
|
||||||
|
out[cursor++] = 0xB9;
|
||||||
|
out[cursor++] = (uint8_t)(tail & 0xFFu);
|
||||||
|
out[cursor++] = 0;
|
||||||
|
out[cursor++] = 0;
|
||||||
|
out[cursor++] = 0;
|
||||||
|
out[cursor++] = 0xF3;
|
||||||
|
out[cursor++] = 0xA4;
|
||||||
|
}
|
||||||
|
if (row + 1u < heightPx) {
|
||||||
|
// SAVE: add esi, advance (81 C6 imm32)
|
||||||
|
// RESTORE: add edi, advance (81 C7 imm32)
|
||||||
|
out[cursor++] = 0x81;
|
||||||
|
out[cursor++] = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u);
|
||||||
|
out[cursor++] = (uint8_t)(advance & 0xFFu);
|
||||||
|
out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu);
|
||||||
|
out[cursor++] = (uint8_t)((advance >> 16) & 0xFFu);
|
||||||
|
out[cursor++] = (uint8_t)((advance >> 24) & 0xFFu);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Decompose a destination byte's contribution from the sprite into
|
// Decompose a destination byte's contribution from the sprite into
|
||||||
// (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble
|
// (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble
|
||||||
// 0xF0 means high dest nibble is opaque; 0x0F means low is opaque;
|
// 0xF0 means high dest nibble is opaque; 0x0F means low is opaque;
|
||||||
|
|
@ -189,3 +243,51 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// RESTORE: copy backup -> screen. Destination has the screen stride.
|
||||||
|
uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
|
uint16_t cursor;
|
||||||
|
uint16_t heightPx;
|
||||||
|
uint16_t copyBytes;
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
||||||
|
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
|
||||||
|
|
||||||
|
// Prologue: push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16]
|
||||||
|
out[cursor++] = 0x56;
|
||||||
|
out[cursor++] = 0x57;
|
||||||
|
out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
|
||||||
|
out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
|
||||||
|
|
||||||
|
cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, false);
|
||||||
|
|
||||||
|
// Epilogue: pop edi; pop esi; ret
|
||||||
|
out[cursor++] = 0x5F;
|
||||||
|
out[cursor++] = 0x5E;
|
||||||
|
out[cursor++] = 0xC3;
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// SAVE: copy screen -> backup. Source has the screen stride.
|
||||||
|
uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
||||||
|
uint16_t cursor;
|
||||||
|
uint16_t heightPx;
|
||||||
|
uint16_t copyBytes;
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
||||||
|
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
|
||||||
|
|
||||||
|
out[cursor++] = 0x56;
|
||||||
|
out[cursor++] = 0x57;
|
||||||
|
out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
|
||||||
|
out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
|
||||||
|
|
||||||
|
cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, true);
|
||||||
|
|
||||||
|
out[cursor++] = 0x5F;
|
||||||
|
out[cursor++] = 0x5E;
|
||||||
|
out[cursor++] = 0xC3;
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -27,11 +27,19 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
// rectangle between the destination surface and a backup buffer. The
|
// rectangle between the destination surface and a backup buffer. The
|
||||||
// rectangle's width and start position depend on the shift: for
|
// rectangle's width and start position depend on the shift: for
|
||||||
// shift=0 (even x) it covers exactly the sprite's bytes per row;
|
// shift=0 (even x) it covers exactly the sprite's bytes per row;
|
||||||
// for shift=1 (odd x) it covers one extra byte on each side, rounded
|
// for shift=1 (odd x) it covers one extra byte (left edge nibble).
|
||||||
// up to even. Per-CPU emitters return 0 to mean "not implemented" --
|
// Per-CPU emitters return 0 to mean "not implemented" -- the runtime
|
||||||
// the runtime dispatch falls back to the interpreted path in that
|
// dispatch falls back to the interpreted path in that case.
|
||||||
// case.
|
//
|
||||||
|
// IIgs uses a self-modifying MVN-stub on top of these bytes; x86 and
|
||||||
|
// 68k use a plain cdecl `void copy(const uint8_t *src, uint8_t *dst)`
|
||||||
|
// where the caller swaps args between SAVE (screen->backup) and
|
||||||
|
// RESTORE (backup->screen).
|
||||||
uint16_t spriteEmitSaveIigs (uint8_t *out, const SpriteT *sp, uint8_t shift);
|
uint16_t spriteEmitSaveIigs (uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
|
uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
|
uint16_t spriteEmitSaveX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
|
uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
|
uint16_t spriteEmitSave68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
|
uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,10 @@
|
||||||
| Amiga chunky-to-planar conversion -- 68000 hand-rolled.
|
| Amiga chunky-to-planar conversion -- 68000 hand-rolled.
|
||||||
|
|
|
|
||||||
| Drop-in replacement for hal.c's old c2pRange C inner loop. The C
|
| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
|
||||||
| version walked every pixel and OR'd individual bits into 4 plane
|
| 4 KB lookup table built once at HAL init: each (sourceByte, position,
|
||||||
| accumulators -- ~1.5 s for a full 320x200 frame on a 7 MHz 68000
|
| plane) tuple maps to the plane-byte bit contribution that source
|
||||||
| (the GCC m68k codegen is poor for tight bit-twiddling). This rewrite
|
| byte makes when it sits at that position within a 4-byte (8-pixel)
|
||||||
| uses a 4 KB lookup table built once at HAL init: each (sourceByte,
|
| planar group going to that plane.
|
||||||
| bytePosition, plane) tuple maps to the plane-byte-bit contribution
|
|
||||||
| that source byte makes when it sits at that position within a
|
|
||||||
| 4-byte (= 8-pixel) planar group.
|
|
||||||
|
|
|
|
||||||
| Calling convention: m68k-amigaos-gcc cdecl.
|
| Calling convention: m68k-amigaos-gcc cdecl.
|
||||||
| Args on stack at 4(sp), 8(sp), ...
|
| Args on stack at 4(sp), 8(sp), ...
|
||||||
|
|
@ -22,12 +19,17 @@
|
||||||
| uint16_t n, ; 24(sp) - planar byte count (low word)
|
| uint16_t n, ; 24(sp) - planar byte count (low word)
|
||||||
| const uint8_t *lut); ; 28(sp) - 4 KB LUT base
|
| const uint8_t *lut); ; 28(sp) - 4 KB LUT base
|
||||||
|
|
|
|
||||||
| LUT layout: lut[pos*1024 + plane*256 + src] = 1-byte plane contribution
|
| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
|
||||||
| for source byte `src` sitting at byte-position `pos` within its
|
| for source byte `src` sitting at byte-position `pos` (0..3) within
|
||||||
| 4-byte planar group, going to plane `plane`. Byte-position 0 is the
|
| its 4-byte planar group, going to plane `plane` (0..3). All 16
|
||||||
| leftmost (its two pixels land in plane-byte bits 7 and 6); position
|
| (pos, plane) entries for one src byte are contiguous, so the inner
|
||||||
| 3 is the rightmost (bits 1 and 0). Built once by chunkyToPlanarInit
|
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
|
||||||
| (in hal.c) at HAL boot.
|
| (0..15) and never has to advance an index register.
|
||||||
|
|
|
||||||
|
| Per planar byte we consume 4 source bytes (positions 0..3 of the
|
||||||
|
| 8-pixel group). For each we compute d4 = src*16 with four add.w's
|
||||||
|
| (16 cycles on a 68000; note a single lsl.w #4 is 14) and OR the four plane contributions
|
||||||
|
| into d0..d3 with byte-displaced (a5,d4.w) reads.
|
||||||
|
|
|
|
||||||
| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
|
| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
|
||||||
| gcc driver.
|
| gcc driver.
|
||||||
|
|
@ -65,54 +67,52 @@ _chunkyToPlanarRow:
|
||||||
moveq #0,%d3 | plane 3 acc
|
moveq #0,%d3 | plane 3 acc
|
||||||
|
|
||||||
| ----- Source byte position 0 -----
|
| ----- Source byte position 0 -----
|
||||||
| a5 points to start of LUT. Plane 0/1/2/3 sub-tables
|
|
||||||
| for position 0 are at offsets 0/256/512/768.
|
|
||||||
moveq #0,%d4
|
moveq #0,%d4
|
||||||
move.b (%a0)+,%d4 | src[0]
|
move.b (%a0)+,%d4 | src[0]
|
||||||
move.l %a5,%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d0 | +0 = pos0 plane 0
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d1 | +256 = pos0 plane 1
|
add.w %d4,%d4 | d4 = src * 16
|
||||||
lea 256(%a6),%a6
|
or.b 0(%a5,%d4.w),%d0 | pos0 plane0
|
||||||
or.b (%a6,%d4.w),%d2 | +512 = pos0 plane 2
|
or.b 1(%a5,%d4.w),%d1 | pos0 plane1
|
||||||
lea 256(%a6),%a6
|
or.b 2(%a5,%d4.w),%d2 | pos0 plane2
|
||||||
or.b (%a6,%d4.w),%d3 | +768 = pos0 plane 3
|
or.b 3(%a5,%d4.w),%d3 | pos0 plane3
|
||||||
|
|
||||||
| ----- Source byte position 1 -----
|
| ----- Source byte position 1 -----
|
||||||
lea 256(%a6),%a6 | advance to pos1 plane 0
|
|
||||||
moveq #0,%d4
|
moveq #0,%d4
|
||||||
move.b (%a0)+,%d4
|
move.b (%a0)+,%d4 | src[1]
|
||||||
or.b (%a6,%d4.w),%d0
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d1
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d2
|
or.b 4(%a5,%d4.w),%d0 | pos1 plane0
|
||||||
lea 256(%a6),%a6
|
or.b 5(%a5,%d4.w),%d1 | pos1 plane1
|
||||||
or.b (%a6,%d4.w),%d3
|
or.b 6(%a5,%d4.w),%d2 | pos1 plane2
|
||||||
|
or.b 7(%a5,%d4.w),%d3 | pos1 plane3
|
||||||
|
|
||||||
| ----- Source byte position 2 -----
|
| ----- Source byte position 2 -----
|
||||||
lea 256(%a6),%a6
|
|
||||||
moveq #0,%d4
|
moveq #0,%d4
|
||||||
move.b (%a0)+,%d4
|
move.b (%a0)+,%d4 | src[2]
|
||||||
or.b (%a6,%d4.w),%d0
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d1
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d2
|
or.b 8(%a5,%d4.w),%d0 | pos2 plane0
|
||||||
lea 256(%a6),%a6
|
or.b 9(%a5,%d4.w),%d1 | pos2 plane1
|
||||||
or.b (%a6,%d4.w),%d3
|
or.b 10(%a5,%d4.w),%d2 | pos2 plane2
|
||||||
|
or.b 11(%a5,%d4.w),%d3 | pos2 plane3
|
||||||
|
|
||||||
| ----- Source byte position 3 -----
|
| ----- Source byte position 3 -----
|
||||||
lea 256(%a6),%a6
|
|
||||||
moveq #0,%d4
|
moveq #0,%d4
|
||||||
move.b (%a0)+,%d4
|
move.b (%a0)+,%d4 | src[3]
|
||||||
or.b (%a6,%d4.w),%d0
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d1
|
add.w %d4,%d4
|
||||||
lea 256(%a6),%a6
|
add.w %d4,%d4
|
||||||
or.b (%a6,%d4.w),%d2
|
or.b 12(%a5,%d4.w),%d0 | pos3 plane0
|
||||||
lea 256(%a6),%a6
|
or.b 13(%a5,%d4.w),%d1 | pos3 plane1
|
||||||
or.b (%a6,%d4.w),%d3
|
or.b 14(%a5,%d4.w),%d2 | pos3 plane2
|
||||||
|
or.b 15(%a5,%d4.w),%d3 | pos3 plane3
|
||||||
|
|
||||||
| ----- Store plane bytes -----
|
| ----- Store plane bytes -----
|
||||||
move.b %d0,(%a1)+
|
move.b %d0,(%a1)+
|
||||||
|
|
|
||||||
|
|
@ -77,11 +77,12 @@ static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE
|
||||||
static bool gCacheValid = false;
|
static bool gCacheValid = false;
|
||||||
|
|
||||||
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
|
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
|
||||||
// (src/port/amiga/c2p.s). Layout: gC2pLut[pos*1024 + plane*256 + src]
|
// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
|
||||||
// = the plane-byte bit contribution that source byte `src` makes when
|
// the plane-byte bit contribution that source byte `src` makes to
|
||||||
// it sits at byte-position `pos` within a 4-byte (8-pixel) planar
|
// plane `plane` when it sits at byte-position `pos` within a 4-byte
|
||||||
// group, going to plane `plane`. Built once by initC2pLut on the
|
// (8-pixel) planar group. The src-major layout lets the asm inner
|
||||||
// first halPresent call.
|
// loop reach all 16 (pos, plane) entries for a single src byte via
|
||||||
|
// 8-bit displacements off (a5, d4.w) without any LEA between reads.
|
||||||
static uint8_t gC2pLut[4 * 1024];
|
static uint8_t gC2pLut[4 * 1024];
|
||||||
static bool gC2pLutReady = false;
|
static bool gC2pLutReady = false;
|
||||||
|
|
||||||
|
|
@ -116,14 +117,14 @@ static void initC2pLut(void) {
|
||||||
if (gC2pLutReady) {
|
if (gC2pLutReady) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
for (src = 0; src < 256; src++) {
|
||||||
for (pos = 0; pos < 4; pos++) {
|
for (pos = 0; pos < 4; pos++) {
|
||||||
highShift = (uint8_t)(7 - 2 * pos);
|
highShift = (uint8_t)(7 - 2 * pos);
|
||||||
lowShift = (uint8_t)(6 - 2 * pos);
|
lowShift = (uint8_t)(6 - 2 * pos);
|
||||||
for (plane = 0; plane < 4; plane++) {
|
for (plane = 0; plane < 4; plane++) {
|
||||||
for (src = 0; src < 256; src++) {
|
|
||||||
highBit = (uint8_t)(((src >> 4) >> plane) & 1);
|
highBit = (uint8_t)(((src >> 4) >> plane) & 1);
|
||||||
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
|
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
|
||||||
gC2pLut[pos * 1024 + plane * 256 + src] =
|
gC2pLut[src * 16 + pos * 4 + plane] =
|
||||||
(uint8_t)((highBit << highShift) | (lowBit << lowShift));
|
(uint8_t)((highBit << highShift) | (lowBit << lowShift));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue