From c4ee37941a8f91552b44870e65a201cbcdf2ca89 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Sun, 26 Apr 2026 21:42:43 -0500 Subject: [PATCH] Sprite save/restore for x86 and 68k. --- src/codegen/spriteCompile.c | 78 +++++++++++++++++++---- src/codegen/spriteEmit68k.c | 88 +++++++++++++++++++++++-- src/codegen/spriteEmitIigs.c | 80 +++++++++++++++++++++-- src/codegen/spriteEmitX86.c | 120 ++++++++++++++++++++++++++++++++--- src/codegen/spriteEmitter.h | 16 +++-- src/port/amiga/c2p.s | 102 ++++++++++++++--------------- src/port/amiga/hal.c | 23 +++---- 7 files changed, 409 insertions(+), 98 deletions(-) diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index d87bf2a..ce7b196 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -42,11 +42,17 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift } -// Save-under and restore-under emitters are IIgs-only at the moment; -// other CPUs return 0, the runtime treats that as "not compiled" and -// falls back to spriteSaveUnderInterpreted / spriteRestoreUnderInterpreted. +// Save-under and restore-under emit dispatch. Each per-CPU pair +// produces row-by-row copy bytes; the runtime dispatch in +// src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_SAVE] +// != SPRITE_NOT_COMPILED and falls back to the interpreted memcpy +// path otherwise. 
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { -#if defined(JOEYLIB_PLATFORM_IIGS) +#if defined(JOEYLIB_PLATFORM_DOS) + return spriteEmitSaveX86(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) + return spriteEmitSave68k(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitSaveIigs(out, sp, shift); #else (void)out; (void)sp; (void)shift; @@ -56,7 +62,11 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { -#if defined(JOEYLIB_PLATFORM_IIGS) +#if defined(JOEYLIB_PLATFORM_DOS) + return spriteEmitRestoreX86(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) + return spriteEmitRestore68k(out, sp, shift); +#elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitRestoreIigs(out, sp, shift); #else (void)out; (void)sp; (void)shift; @@ -416,18 +426,62 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) } -// Non-IIgs platforms have no compiled save/restore yet. The dispatch -// in src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_*] != -// SPRITE_NOT_COMPILED, so these stubs should never actually run on -// those platforms; they exist so spriteInternal.h's prototypes stay -// resolved at link time. +// x86 / 68k compiled save: bytes are a cdecl +// void copy(const uint8_t *src, uint8_t *dst) +// that walks heightPx rows of copyBytes from screen (stride +// SURFACE_BYTES_PER_ROW) into the contiguous backup buffer. 
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { - (void)src; (void)sp; (void)x; (void)y; (void)backup; + typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst); + uint8_t shift; + int16_t clippedX; + uint16_t widthPx; + uint16_t heightPx; + uint16_t copyBytes; + uint8_t *screenPtr; + CopyFn fn; + + shift = (uint8_t)(x & 1); + clippedX = (int16_t)(x & ~1); + widthPx = (uint16_t)(sp->widthTiles * 8); + heightPx = (uint16_t)(sp->heightTiles * 8); + copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0)); + + screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)]; + + backup->sprite = sp; + backup->x = clippedX; + backup->y = y; + backup->width = (uint16_t)(copyBytes << 1); + backup->height = heightPx; + backup->sizeBytes = (uint16_t)(copyBytes * heightPx); + + fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]); + fn(screenPtr, backup->bytes); } +// Mirror of save: caller swaps arg order so the same emitted shape +// drives backup -> screen. The screen-side stride lives inside the +// emitted bytes, so RESTORE has its own routine bytes (stride is +// applied to dst instead of src). void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { - (void)dst; (void)backup; + typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst); + SpriteT *sp; + uint8_t shift; + uint16_t copyBytes; + uint16_t spriteBytesPerRow; + uint8_t *screenPtr; + CopyFn fn; + + sp = backup->sprite; + copyBytes = (uint16_t)(backup->width >> 1); + spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4); + shift = (copyBytes == spriteBytesPerRow) ? 
0 : 1; + + screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)]; + + fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]); + fn(backup->bytes, screenPtr); } #endif diff --git a/src/codegen/spriteEmit68k.c b/src/codegen/spriteEmit68k.c index 5147cec..b86851c 100644 --- a/src/codegen/spriteEmit68k.c +++ b/src/codegen/spriteEmit68k.c @@ -1,10 +1,12 @@ // 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl- -// callable PIC draw routines that write 4bpp packed surface bytes -// via d16(a0) chains. Same shape as the x86 emitter; only the -// instruction encoding differs. +// callable PIC draw / save / restore routines that read or write +// 4bpp packed surface bytes via d16(a0) chains. Same shape as the +// x86 emitter; only the instruction encoding differs. // // Calling convention (m68k gcc / mintlib): -// void draw(uint8_t *dst); -- arg in 4(sp); a0/a1/d0/d1 caller-saved. +// void draw(uint8_t *dst); -- arg in 4(sp) +// void save/restore(const uint8_t *src, uint8_t *dst); -- args in 4(sp)/8(sp) +// a0/a1/d0/d1 are caller-saved. // // Per-byte emit (no run coalescing yet): // - all-transparent: skip @@ -38,13 +40,46 @@ // ----- Prototypes ----- -static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); +static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc); static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask); +static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); static uint16_t writeBE16(uint8_t *out, uint16_t value); // ----- Emit helpers (alphabetical) ----- +// Shared body for save/restore. 
Walks heightPx rows of copyBytes +// using `move.b (a0)+, (a1)+` byte-wise (safe regardless of pointer +// alignment, since the screen-side x can land on an odd byte). After +// each row except the last, advances either a0 (SAVE: src=screen) or +// a1 (RESTORE: dst=screen) by (SURFACE_BYTES_PER_ROW - copyBytes) so +// the strided side lines up with the next scanline; the contiguous +// side advances naturally via the post-increment. +// +// strideOnSrc=true -> source has the screen stride (SAVE) +// strideOnSrc=false -> destination has the screen stride (RESTORE) +static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) { + uint16_t row; + uint16_t col; + uint16_t advance; + + advance = (uint16_t)(SURFACE_BYTES_PER_ROW - copyBytes); + + for (row = 0; row < heightPx; row++) { + // Unrolled: move.b (a0)+, (a1)+ -- 0x12D8. + for (col = 0; col < copyBytes; col++) { + cursor += writeBE16(out + cursor, 0x12D8u); + } + if (row + 1u < heightPx) { + // adda.w #advance, a0 (0xD0FC) for SAVE + // adda.w #advance, a1 (0xD2FC) for RESTORE + cursor += writeBE16(out + cursor, strideOnSrc ? 0xD0FCu : 0xD2FCu); + cursor += writeBE16(out + cursor, advance); + } + } + return cursor; +} + // Same logic as the x86 shiftedByteAt -- per-byte transparency // decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if // dest high nibble is opaque, 0x0F if low is opaque. @@ -184,3 +219,46 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { } +// RESTORE: copy backup -> screen. Destination has the screen stride. +uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t heightPx; + uint16_t copyBytes; + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); + + // Prologue: movea.l 4(sp), a0 (src); movea.l 8(sp), a1 (dst). 
+ cursor += writeBE16(out + cursor, 0x206Fu); + cursor += writeBE16(out + cursor, 0x0004u); + cursor += writeBE16(out + cursor, 0x226Fu); + cursor += writeBE16(out + cursor, 0x0008u); + + cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, false); + + cursor += writeBE16(out + cursor, 0x4E75u); + return cursor; +} + + +// SAVE: copy screen -> backup. Source has the screen stride. +uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t heightPx; + uint16_t copyBytes; + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); + + cursor += writeBE16(out + cursor, 0x206Fu); + cursor += writeBE16(out + cursor, 0x0004u); + cursor += writeBE16(out + cursor, 0x226Fu); + cursor += writeBE16(out + cursor, 0x0008u); + + cursor = emitCopyBody68k(out, cursor, heightPx, copyBytes, true); + + cursor += writeBE16(out + cursor, 0x4E75u); + return cursor; +} diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c index 6b1d93f..0829ef6 100644 --- a/src/codegen/spriteEmitIigs.c +++ b/src/codegen/spriteEmitIigs.c @@ -217,6 +217,36 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { // 65816 draw emit. Returns bytes written. +// +// Two emission paths share the body: +// +// * M=8 byte path (default; matches the stub-set entry mode): +// opaque: A9 vv LDA #vv ; 2c +// 99 lo hi STA abs,Y ; 5c +// ; 7c / 5 bytes per byte +// mixed: B9 lo hi LDA abs,Y ; 5c +// 29 mm AND #~mask ; 2c +// 09 vv ORA #val ; 2c +// 99 lo hi STA abs,Y ; 5c +// ;14c / 9 bytes per byte +// +// * M=16 word path (entered around runs of >= 2 consecutive +// fully-opaque bytes). Each word write covers 2 dest bytes: +// prologue: C2 20 REP #$20 ; 3c +// per pair: A9 lo hi LDA #imm16 ; 3c +// 99 lo hi STA abs,Y ; 6c +// ; 9c / 6 bytes per pair +// epilogue: E2 20 SEP #$20 ; 3c +// +// vs. 
M=8 path doing the same 2 bytes: 14c / 10 bytes. Per-pair +// savings are 5c / 4 bytes; the 6c/4-byte REP+SEP transition is +// amortized once per opaque run, so the path is profitable for +// runs of 2 pairs (4 consecutive opaque bytes) or longer. For +// isolated pairs we still take the M=16 path -- the 1-cycle loss +// vs. M=8 is dwarfed by the typical-sprite opaque-run length. +// +// Mixed bytes always run on the M=8 path because the AND/ORA in +// M=16 would clobber the adjacent byte. uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; @@ -227,11 +257,15 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t absOffset; uint8_t value; uint8_t opaqueMask; + uint8_t nextValue; + uint8_t nextOpaqueMask; + bool wide; cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); + wide = false; // No prologue: caller (the inline-asm stub in spriteCompile.c) // sets M=8/X=16/Y=destRow before JSL'ing here. @@ -243,18 +277,42 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { continue; } absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col); + + if (opaqueMask == 0xFFu && (col + 1) < destBytesPerRow) { + // Look ahead: if (col, col+1) are both fully opaque + // we can pair them as a single M=16 word write. 
+ shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, + spriteBytesPerRow, &nextValue, &nextOpaqueMask); + if (nextOpaqueMask == 0xFFu) { + if (!wide) { + out[cursor++] = 0xC2; // REP #$20 -- M=16 + out[cursor++] = 0x20; + wide = true; + } + out[cursor++] = 0xA9; // LDA #imm16 + cursor += writeLE16(out + cursor, + (uint16_t)(((uint16_t)nextValue << 8) | value)); + out[cursor++] = 0x99; // STA abs,Y + cursor += writeLE16(out + cursor, absOffset); + col++; // consumed col+1 + continue; + } + } + + // Falls through here for: isolated opaque (no pair), mixed, + // or the trailing odd byte at the right edge. All on M=8. + if (wide) { + out[cursor++] = 0xE2; // SEP #$20 -- back to M=8 + out[cursor++] = 0x20; + wide = false; + } + if (opaqueMask == 0xFFu) { - // lda #imm A9 ii - // sta abs,Y 99 lo hi out[cursor++] = 0xA9; out[cursor++] = value; out[cursor++] = 0x99; cursor += writeLE16(out + cursor, absOffset); } else { - // lda abs,Y B9 lo hi - // and #mask 29 mm - // ora #val 09 vv - // sta abs,Y 99 lo hi out[cursor++] = 0xB9; cursor += writeLE16(out + cursor, absOffset); out[cursor++] = 0x29; @@ -267,6 +325,16 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { } } + // Routine exits in M=8: the JSL stub assumes M=8 throughout (the + // stub itself only ever ran with M=8 and doesn't restore M). The + // asm wrapper after the JSL forces M=16 again, but be defensive + // and ensure we leave M=8 here so the stub's PLB/RTL run as + // expected even if the wrapper convention changes. + if (wide) { + out[cursor++] = 0xE2; + out[cursor++] = 0x20; + } + // Epilogue: rtl (large memory model -b uses JSL/RTL). out[cursor++] = 0x6B; return cursor; diff --git a/src/codegen/spriteEmitX86.c b/src/codegen/spriteEmitX86.c index 8247811..8d66228 100644 --- a/src/codegen/spriteEmitX86.c +++ b/src/codegen/spriteEmitX86.c @@ -1,14 +1,11 @@ // x86 sprite codegen (DOS port). 
Emits 32-bit cdecl-callable PIC -// draw routines that write 4bpp packed surface bytes via -// [esi+disp8] chains. The C side calls them through a function -// pointer cast. +// draw / save / restore routines that read or write 4bpp packed +// surface bytes via [esi+disp8] chains. The C side calls them +// through a function pointer cast. // // Calling convention: -// draw(uint8_t *dst) -- esi advances row by row -// -// Save and restore are not compiled -- they're uniform memcpy- -// shaped operations and the C interpreter handles them at memcpy -// speed via the standard library. +// draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi +// save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save // // Per-byte emit (no run coalescing yet): // - all-transparent (both nibbles 0): skip, no instruction @@ -45,12 +42,69 @@ // ----- Prototypes ----- -static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); +static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc); static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask); +static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); // ----- Emit helpers (alphabetical) ----- +// Shared body for save/restore. Walks heightPx rows of copyBytes +// using rep movsd for the whole-dword bulk (copyBytes / 4; no +// alignment is assumed) and rep movsb for the (copyBytes % 4) byte +// tail. After each row except the last, advances either esi +// or edi by (SURFACE_BYTES_PER_ROW - copyBytes) so the strided side +// (screen) lines up with the next scanline; the contiguous side +// (backup) advances naturally because rep movs* leaves the index +// register one past the last byte copied. 
+// +// strideOnSrc=true -> source has the screen stride (SAVE) +// strideOnSrc=false -> destination has the screen stride (RESTORE) +static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) { + uint16_t row; + uint16_t dwords; + uint16_t tail; + int32_t advance; + + dwords = (uint16_t)(copyBytes >> 2); + tail = (uint16_t)(copyBytes & 0x3u); + advance = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes; + + for (row = 0; row < heightPx; row++) { + if (dwords > 0) { + // mov ecx, dwords (B9 imm32); rep movsd (F3 A5) + out[cursor++] = 0xB9; + out[cursor++] = (uint8_t)(dwords & 0xFFu); + out[cursor++] = (uint8_t)((dwords >> 8) & 0xFFu); + out[cursor++] = 0; + out[cursor++] = 0; + out[cursor++] = 0xF3; + out[cursor++] = 0xA5; + } + if (tail > 0) { + // mov ecx, tail (B9 imm32); rep movsb (F3 A4) + out[cursor++] = 0xB9; + out[cursor++] = (uint8_t)(tail & 0xFFu); + out[cursor++] = 0; + out[cursor++] = 0; + out[cursor++] = 0; + out[cursor++] = 0xF3; + out[cursor++] = 0xA4; + } + if (row + 1u < heightPx) { + // SAVE: add esi, advance (81 C6 imm32) + // RESTORE: add edi, advance (81 C7 imm32) + out[cursor++] = 0x81; + out[cursor++] = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u); + out[cursor++] = (uint8_t)(advance & 0xFFu); + out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu); + out[cursor++] = (uint8_t)((advance >> 16) & 0xFFu); + out[cursor++] = (uint8_t)((advance >> 24) & 0xFFu); + } + } + return cursor; +} + + // Decompose a destination byte's contribution from the sprite into // (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble // 0xF0 means high dest nibble is opaque; 0x0F means low is opaque; @@ -189,3 +243,51 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { } +// RESTORE: copy backup -> screen. Destination has the screen stride. 
+uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t heightPx; + uint16_t copyBytes; + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); + + // Prologue: push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16] + out[cursor++] = 0x56; + out[cursor++] = 0x57; + out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C; + out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10; + + cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, false); + + // Epilogue: pop edi; pop esi; ret + out[cursor++] = 0x5F; + out[cursor++] = 0x5E; + out[cursor++] = 0xC3; + return cursor; +} + + +// SAVE: copy screen -> backup. Source has the screen stride. +uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { + uint16_t cursor; + uint16_t heightPx; + uint16_t copyBytes; + + cursor = 0; + heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); + copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); + + out[cursor++] = 0x56; + out[cursor++] = 0x57; + out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C; + out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10; + + cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, true); + + out[cursor++] = 0x5F; + out[cursor++] = 0x5E; + out[cursor++] = 0xC3; + return cursor; +} diff --git a/src/codegen/spriteEmitter.h b/src/codegen/spriteEmitter.h index 8a8955f..8fbe359 100644 --- a/src/codegen/spriteEmitter.h +++ b/src/codegen/spriteEmitter.h @@ -27,11 +27,19 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift); // rectangle between the destination surface and a backup buffer. 
The // rectangle's width and start position depend on the shift: for // shift=0 (even x) it covers exactly the sprite's bytes per row; -// for shift=1 (odd x) it covers one extra byte on each side, rounded -// up to even. Per-CPU emitters return 0 to mean "not implemented" -- -// the runtime dispatch falls back to the interpreted path in that -// case. +// for shift=1 (odd x) it covers one extra byte (left edge nibble). +// Per-CPU emitters return 0 to mean "not implemented" -- the runtime +// dispatch falls back to the interpreted path in that case. +// +// IIgs uses a self-modifying MVN-stub on top of these bytes; x86 and +// 68k use a plain cdecl `void copy(const uint8_t *src, uint8_t *dst)` +// where the caller swaps args between SAVE (screen->backup) and +// RESTORE (backup->screen). uint16_t spriteEmitSaveIigs (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitSaveX86 (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitSave68k (uint8_t *out, const SpriteT *sp, uint8_t shift); +uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift); #endif diff --git a/src/port/amiga/c2p.s b/src/port/amiga/c2p.s index 9a7c1f3..25554fa 100644 --- a/src/port/amiga/c2p.s +++ b/src/port/amiga/c2p.s @@ -1,13 +1,10 @@ | Amiga chunky-to-planar conversion -- 68000 hand-rolled. | -| Drop-in replacement for hal.c's old c2pRange C inner loop. The C -| version walked every pixel and OR'd individual bits into 4 plane -| accumulators -- ~1.5 s for a full 320x200 frame on a 7 MHz 68000 -| (the GCC m68k codegen is poor for tight bit-twiddling). 
This rewrite -| uses a 4 KB lookup table built once at HAL init: each (sourceByte, -| bytePosition, plane) tuple maps to the plane-byte-bit contribution -| that source byte makes when it sits at that position within a -| 4-byte (= 8-pixel) planar group. +| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a +| 4 KB lookup table built once at HAL init: each (sourceByte, position, +| plane) tuple maps to the plane-byte bit contribution that source +| byte makes when it sits at that position within a 4-byte (8-pixel) +| planar group going to that plane. | | Calling convention: m68k-amigaos-gcc cdecl. | Args on stack at 4(sp), 8(sp), ... @@ -22,12 +19,17 @@ | uint16_t n, ; 24(sp) - planar byte count (low word) | const uint8_t *lut); ; 28(sp) - 4 KB LUT base | -| LUT layout: lut[pos*1024 + plane*256 + src] = 1-byte plane contribution -| for source byte `src` sitting at byte-position `pos` within its -| 4-byte planar group, going to plane `plane`. Byte-position 0 is the -| leftmost (its two pixels land in plane-byte bits 7 and 6); position -| 3 is the rightmost (bits 1 and 0). Built once by chunkyToPlanarInit -| (in hal.c) at HAL boot. +| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution +| for source byte `src` sitting at byte-position `pos` (0..3) within +| its 4-byte planar group, going to plane `plane` (0..3). All 16 +| (pos, plane) entries for one src byte are contiguous, so the inner +| loop reaches every entry off (a5, d4.w) with an 8-bit displacement +| (0..15) and never has to advance an index register. +| +| Per planar byte we consume 4 source bytes (positions 0..3 of the +| 8-pixel group). For each we compute d4 = src*16 with four add.w's +| (4 cycles each, 16 total on a 68000; note a single asl.w #4,%d4 +| is 6+2n = 14 cycles) and OR the four plane contributions +| into d0..d3 with byte-displaced (a5,d4.w) reads. | | GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the | gcc driver. 
@@ -65,54 +67,52 @@ _chunkyToPlanarRow: moveq #0,%d3 | plane 3 acc | ----- Source byte position 0 ----- - | a5 points to start of LUT. Plane 0/1/2/3 sub-tables - | for position 0 are at offsets 0/256/512/768. moveq #0,%d4 move.b (%a0)+,%d4 | src[0] - move.l %a5,%a6 - or.b (%a6,%d4.w),%d0 | +0 = pos0 plane 0 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d1 | +256 = pos0 plane 1 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d2 | +512 = pos0 plane 2 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d3 | +768 = pos0 plane 3 + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 | d4 = src * 16 + or.b 0(%a5,%d4.w),%d0 | pos0 plane0 + or.b 1(%a5,%d4.w),%d1 | pos0 plane1 + or.b 2(%a5,%d4.w),%d2 | pos0 plane2 + or.b 3(%a5,%d4.w),%d3 | pos0 plane3 | ----- Source byte position 1 ----- - lea 256(%a6),%a6 | advance to pos1 plane 0 moveq #0,%d4 - move.b (%a0)+,%d4 - or.b (%a6,%d4.w),%d0 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d1 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d2 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d3 + move.b (%a0)+,%d4 | src[1] + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + or.b 4(%a5,%d4.w),%d0 | pos1 plane0 + or.b 5(%a5,%d4.w),%d1 | pos1 plane1 + or.b 6(%a5,%d4.w),%d2 | pos1 plane2 + or.b 7(%a5,%d4.w),%d3 | pos1 plane3 | ----- Source byte position 2 ----- - lea 256(%a6),%a6 moveq #0,%d4 - move.b (%a0)+,%d4 - or.b (%a6,%d4.w),%d0 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d1 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d2 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d3 + move.b (%a0)+,%d4 | src[2] + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + or.b 8(%a5,%d4.w),%d0 | pos2 plane0 + or.b 9(%a5,%d4.w),%d1 | pos2 plane1 + or.b 10(%a5,%d4.w),%d2 | pos2 plane2 + or.b 11(%a5,%d4.w),%d3 | pos2 plane3 | ----- Source byte position 3 ----- - lea 256(%a6),%a6 moveq #0,%d4 - move.b (%a0)+,%d4 - or.b (%a6,%d4.w),%d0 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d1 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d2 - lea 256(%a6),%a6 - or.b (%a6,%d4.w),%d3 + move.b (%a0)+,%d4 | src[3] + 
add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + add.w %d4,%d4 + or.b 12(%a5,%d4.w),%d0 | pos3 plane0 + or.b 13(%a5,%d4.w),%d1 | pos3 plane1 + or.b 14(%a5,%d4.w),%d2 | pos3 plane2 + or.b 15(%a5,%d4.w),%d3 | pos3 plane3 | ----- Store plane bytes ----- move.b %d0,(%a1)+ diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index 174cdee..9c8bb20 100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -77,11 +77,12 @@ static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE static bool gCacheValid = false; // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow -// (src/port/amiga/c2p.s). Layout: gC2pLut[pos*1024 + plane*256 + src] -// = the plane-byte bit contribution that source byte `src` makes when -// it sits at byte-position `pos` within a 4-byte (8-pixel) planar -// group, going to plane `plane`. Built once by initC2pLut on the -// first halPresent call. +// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] = +// the plane-byte bit contribution that source byte `src` makes to +// plane `plane` when it sits at byte-position `pos` within a 4-byte +// (8-pixel) planar group. The src-major layout lets the asm inner +// loop reach all 16 (pos, plane) entries for a single src byte via +// 8-bit displacements off (a5, d4.w) without any LEA between reads. 
static uint8_t gC2pLut[4 * 1024]; static bool gC2pLutReady = false; @@ -116,14 +117,14 @@ static void initC2pLut(void) { if (gC2pLutReady) { return; } - for (pos = 0; pos < 4; pos++) { - highShift = (uint8_t)(7 - 2 * pos); - lowShift = (uint8_t)(6 - 2 * pos); - for (plane = 0; plane < 4; plane++) { - for (src = 0; src < 256; src++) { + for (src = 0; src < 256; src++) { + for (pos = 0; pos < 4; pos++) { + highShift = (uint8_t)(7 - 2 * pos); + lowShift = (uint8_t)(6 - 2 * pos); + for (plane = 0; plane < 4; plane++) { highBit = (uint8_t)(((src >> 4) >> plane) & 1); lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1); - gC2pLut[pos * 1024 + plane * 256 + src] = + gC2pLut[src * 16 + pos * 4 + plane] = (uint8_t)((highBit << highShift) | (lowBit << lowShift)); } }