Compare commits

..

No commits in common. "ea1e853d5d51d9a949f0f8e9c256ef488de90fbf" and "6f37b126b882b5aef04cc84ee3420306ccce4e59" have entirely different histories.

10 changed files with 199 additions and 1140 deletions

View file

@ -42,17 +42,11 @@ static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
} }
// Save-under and restore-under emit dispatch. Each per-CPU pair // Save-under and restore-under emitters are IIgs-only at the moment;
// produces row-by-row copy bytes; the runtime dispatch in // other CPUs return 0, the runtime treats that as "not compiled" and
// src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_SAVE] // falls back to spriteSaveUnderInterpreted / spriteRestoreUnderInterpreted.
// != SPRITE_NOT_COMPILED and falls back to the interpreted memcpy
// path otherwise.
static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
#if defined(JOEYLIB_PLATFORM_DOS) #if defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitSaveX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitSave68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitSaveIigs(out, sp, shift); return spriteEmitSaveIigs(out, sp, shift);
#else #else
(void)out; (void)sp; (void)shift; (void)out; (void)sp; (void)shift;
@ -62,11 +56,7 @@ static uint16_t emitSaveForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift
static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { static uint16_t emitRestoreForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) {
#if defined(JOEYLIB_PLATFORM_DOS) #if defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitRestoreX86(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST)
return spriteEmitRestore68k(out, sp, shift);
#elif defined(JOEYLIB_PLATFORM_IIGS)
return spriteEmitRestoreIigs(out, sp, shift); return spriteEmitRestoreIigs(out, sp, shift);
#else #else
(void)out; (void)sp; (void)shift; (void)out; (void)sp; (void)shift;
@ -426,62 +416,18 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
} }
// x86 / 68k compiled save: bytes are a cdecl // Non-IIgs platforms have no compiled save/restore yet. The dispatch
// void copy(const uint8_t *src, uint8_t *dst) // in src/core/sprite.c gates on routineOffsets[shift][SPRITE_OP_*] !=
// that walks heightPx rows of copyBytes from screen (stride // SPRITE_NOT_COMPILED, so these stubs should never actually run on
// SURFACE_BYTES_PER_ROW) into the contiguous backup buffer. // those platforms; they exist so spriteInternal.h's prototypes stay
// resolved at link time.
void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) { void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_t y, SpriteBackupT *backup) {
typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst); (void)src; (void)sp; (void)x; (void)y; (void)backup;
uint8_t shift;
int16_t clippedX;
uint16_t widthPx;
uint16_t heightPx;
uint16_t copyBytes;
uint8_t *screenPtr;
CopyFn fn;
shift = (uint8_t)(x & 1);
clippedX = (int16_t)(x & ~1);
widthPx = (uint16_t)(sp->widthTiles * 8);
heightPx = (uint16_t)(sp->heightTiles * 8);
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
screenPtr = (uint8_t *)&src->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)clippedX >> 1)];
backup->sprite = sp;
backup->x = clippedX;
backup->y = y;
backup->width = (uint16_t)(copyBytes << 1);
backup->height = heightPx;
backup->sizeBytes = (uint16_t)(copyBytes * heightPx);
fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_SAVE]);
fn(screenPtr, backup->bytes);
} }
// Mirror of save: caller swaps arg order so the same emitted shape
// drives backup -> screen. The screen-side stride lives inside the
// emitted bytes, so RESTORE has its own routine bytes (stride is
// applied to dst instead of src).
void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
typedef void (*CopyFn)(const uint8_t *src, uint8_t *dst); (void)dst; (void)backup;
SpriteT *sp;
uint8_t shift;
uint16_t copyBytes;
uint16_t spriteBytesPerRow;
uint8_t *screenPtr;
CopyFn fn;
sp = backup->sprite;
copyBytes = (uint16_t)(backup->width >> 1);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1;
screenPtr = (uint8_t *)&dst->pixels[(uint16_t)backup->y * SURFACE_BYTES_PER_ROW + ((uint16_t)backup->x >> 1)];
fn = (CopyFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_RESTORE]);
fn(backup->bytes, screenPtr);
} }
#endif #endif

View file

@ -1,12 +1,10 @@
// 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl- // 68k sprite codegen (Amiga + Atari ST). Emits SysV-ish cdecl-
// callable PIC draw / save / restore routines that read or write // callable PIC draw routines that write 4bpp packed surface bytes
// 4bpp packed surface bytes via d16(a0) chains. Same shape as the // via d16(a0) chains. Same shape as the x86 emitter; only the
// x86 emitter; only the instruction encoding differs. // instruction encoding differs.
// //
// Calling convention (m68k gcc / mintlib): // Calling convention (m68k gcc / mintlib):
// void draw(uint8_t *dst); -- arg in 4(sp) // void draw(uint8_t *dst); -- arg in 4(sp); a0/a1/d0/d1 caller-saved.
// void save/restore(const uint8_t *src, uint8_t *dst); -- args in 4(sp)/8(sp)
// a0/a1/d0/d1 are caller-saved.
// //
// Per-byte emit (no run coalescing yet): // Per-byte emit (no run coalescing yet):
// - all-transparent: skip // - all-transparent: skip
@ -40,46 +38,13 @@
// ----- Prototypes ----- // ----- Prototypes -----
static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
static uint16_t writeBE16(uint8_t *out, uint16_t value); static uint16_t writeBE16(uint8_t *out, uint16_t value);
// ----- Emit helpers (alphabetical) ----- // ----- Emit helpers (alphabetical) -----
// Emits the row-copy body shared by the 68k SAVE and RESTORE routines.
//
// For each of heightPx rows the emitted code is copyBytes unrolled
// `move.b (a0)+, (a1)+` instructions. Byte moves are used so the copy
// works at any pointer alignment. Between rows (never after the last
// one) an `adda.w #gap, aN` skips the rest of the scanline on the
// screen side, where gap = SURFACE_BYTES_PER_ROW - copyBytes. The
// backup side is contiguous and needs no adjustment because the
// post-increment already left it on the next byte.
//
//   strideOnSrc == true   -> a0 (source) is the screen: SAVE
//   strideOnSrc == false  -> a1 (destination) is the screen: RESTORE
//
// `cursor` is the write offset into `out` on entry; the offset just
// past the last emitted word is returned.
static uint16_t emitCopyBody68k(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
    const uint16_t rowGap     = (uint16_t)(SURFACE_BYTES_PER_ROW - copyBytes);
    // adda.w #imm, a0 = 0xD0FC; adda.w #imm, a1 = 0xD2FC.
    const uint16_t addaOpcode = (uint16_t)(strideOnSrc ? 0xD0FCu : 0xD2FCu);
    uint16_t       rowsLeft;
    uint16_t       bytesLeft;

    for (rowsLeft = heightPx; rowsLeft > 0; rowsLeft--) {
        // One move.b (a0)+, (a1)+ (0x12D8) per byte of the row.
        for (bytesLeft = copyBytes; bytesLeft > 0; bytesLeft--) {
            cursor += writeBE16(out + cursor, 0x12D8u);
        }
        // Step the strided pointer to the next scanline, except after
        // the final row.
        if (rowsLeft > 1) {
            cursor += writeBE16(out + cursor, addaOpcode);
            cursor += writeBE16(out + cursor, rowGap);
        }
    }
    return cursor;
}
// Same logic as the x86 shiftedByteAt -- per-byte transparency // Same logic as the x86 shiftedByteAt -- per-byte transparency
// decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if // decomposition for shift in {0,1}. opaqueMask high nibble 0xF0 if
// dest high nibble is opaque, 0x0F if low is opaque. // dest high nibble is opaque, 0x0F if low is opaque.
@ -219,46 +184,3 @@ uint16_t spriteEmitDraw68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
} }
// RESTORE emitter: generates a routine that copies the contiguous
// backup buffer back onto the screen, so the destination (a1) is the
// side that carries the scanline stride.
//
// Emitted layout: prologue loading a0/a1 from 4(sp)/8(sp), the shared
// copy body, then rts. Returns the number of bytes written to `out`.
uint16_t spriteEmitRestore68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    // movea.l 4(sp), a0 (src); movea.l 8(sp), a1 (dst).
    static const uint16_t prologueWords[4] = { 0x206Fu, 0x0004u, 0x226Fu, 0x0008u };
    uint16_t pos      = 0;
    uint16_t rows     = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    // An odd-x placement (shift == 1) touches one extra byte per row.
    uint16_t edgeByte = (uint16_t)((shift == 1u) ? 1u : 0u);
    uint16_t rowBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + edgeByte);
    uint16_t i;

    for (i = 0; i < 4; i++) {
        pos += writeBE16(out + pos, prologueWords[i]);
    }

    pos = emitCopyBody68k(out, pos, rows, rowBytes, false);

    // rts
    pos += writeBE16(out + pos, 0x4E75u);
    return pos;
}
// SAVE emitter: generates a routine that copies the screen rectangle
// under the sprite into the contiguous backup buffer, so the source
// (a0) is the side that carries the scanline stride.
//
// Emitted layout: prologue loading a0/a1 from 4(sp)/8(sp), the shared
// copy body, then rts. Returns the number of bytes written to `out`.
uint16_t spriteEmitSave68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    // movea.l 4(sp), a0 (src); movea.l 8(sp), a1 (dst).
    static const uint16_t prologueWords[4] = { 0x206Fu, 0x0004u, 0x226Fu, 0x0008u };
    uint16_t pos      = 0;
    uint16_t rows     = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    // An odd-x placement (shift == 1) touches one extra byte per row.
    uint16_t edgeByte = (uint16_t)((shift == 1u) ? 1u : 0u);
    uint16_t rowBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + edgeByte);
    uint16_t i;

    for (i = 0; i < 4; i++) {
        pos += writeBE16(out + pos, prologueWords[i]);
    }

    pos = emitCopyBody68k(out, pos, rows, rowBytes, true);

    // rts
    pos += writeBE16(out + pos, 0x4E75u);
    return pos;
}

View file

@ -217,36 +217,6 @@ uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
// 65816 draw emit. Returns bytes written. // 65816 draw emit. Returns bytes written.
//
// Two emission paths share the body:
//
// * M=8 byte path (default; matches the stub-set entry mode):
// opaque: A9 vv LDA #vv ; 2c
// 99 lo hi STA abs,Y ; 5c
// ; 7c / 5 bytes per byte
// mixed: B9 lo hi LDA abs,Y ; 5c
// 29 mm AND #~mask ; 2c
// 09 vv ORA #val ; 2c
// 99 lo hi STA abs,Y ; 5c
// ;14c / 9 bytes per byte
//
// * M=16 word path (entered around runs of >= 2 consecutive
// fully-opaque bytes). Each word write covers 2 dest bytes:
// prologue: C2 20 REP #$20 ; 3c
// per pair: A9 lo hi LDA #imm16 ; 3c
// 99 lo hi STA abs,Y ; 6c
// ; 9c / 6 bytes per pair
// epilogue: E2 20 SEP #$20 ; 3c
//
// vs. M=8 path doing the same 2 bytes: 14c / 10 bytes. Per-pair
// savings are 5c / 4 bytes; the 6c/4-byte REP+SEP transition is
// amortized once per opaque run, so the path is profitable for
// runs of 2 pairs (4 consecutive opaque bytes) or longer. For
// isolated pairs we still take the M=16 path -- the 1-cycle loss
// vs. M=8 is dwarfed by the typical-sprite opaque-run length.
//
// Mixed bytes always run on the M=8 path because the AND/ORA in
// M=16 would clobber the adjacent byte.
uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t cursor; uint16_t cursor;
uint16_t row; uint16_t row;
@ -257,15 +227,11 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t absOffset; uint16_t absOffset;
uint8_t value; uint8_t value;
uint8_t opaqueMask; uint8_t opaqueMask;
uint8_t nextValue;
uint8_t nextOpaqueMask;
bool wide;
cursor = 0; cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
wide = false;
// No prologue: caller (the inline-asm stub in spriteCompile.c) // No prologue: caller (the inline-asm stub in spriteCompile.c)
// sets M=8/X=16/Y=destRow before JSL'ing here. // sets M=8/X=16/Y=destRow before JSL'ing here.
@ -277,42 +243,18 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
continue; continue;
} }
absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col); absOffset = (uint16_t)(row * SURFACE_BYTES_PER_ROW + col);
if (opaqueMask == 0xFFu && (col + 1) < destBytesPerRow) {
// Look ahead: if (col, col+1) are both fully opaque
// we can pair them as a single M=16 word write.
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift,
spriteBytesPerRow, &nextValue, &nextOpaqueMask);
if (nextOpaqueMask == 0xFFu) {
if (!wide) {
out[cursor++] = 0xC2; // REP #$20 -- M=16
out[cursor++] = 0x20;
wide = true;
}
out[cursor++] = 0xA9; // LDA #imm16
cursor += writeLE16(out + cursor,
(uint16_t)(((uint16_t)nextValue << 8) | value));
out[cursor++] = 0x99; // STA abs,Y
cursor += writeLE16(out + cursor, absOffset);
col++; // consumed col+1
continue;
}
}
// Falls through here for: isolated opaque (no pair), mixed,
// or the trailing odd byte at the right edge. All on M=8.
if (wide) {
out[cursor++] = 0xE2; // SEP #$20 -- back to M=8
out[cursor++] = 0x20;
wide = false;
}
if (opaqueMask == 0xFFu) { if (opaqueMask == 0xFFu) {
// lda #imm A9 ii
// sta abs,Y 99 lo hi
out[cursor++] = 0xA9; out[cursor++] = 0xA9;
out[cursor++] = value; out[cursor++] = value;
out[cursor++] = 0x99; out[cursor++] = 0x99;
cursor += writeLE16(out + cursor, absOffset); cursor += writeLE16(out + cursor, absOffset);
} else { } else {
// lda abs,Y B9 lo hi
// and #mask 29 mm
// ora #val 09 vv
// sta abs,Y 99 lo hi
out[cursor++] = 0xB9; out[cursor++] = 0xB9;
cursor += writeLE16(out + cursor, absOffset); cursor += writeLE16(out + cursor, absOffset);
out[cursor++] = 0x29; out[cursor++] = 0x29;
@ -325,16 +267,6 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
} }
} }
// Routine exits in M=8: the JSL stub assumes M=8 throughout (the
// stub itself only ever ran with M=8 and doesn't restore M). The
// asm wrapper after the JSL forces M=16 again, but be defensive
// and ensure we leave M=8 here so the stub's PLB/RTL run as
// expected even if the wrapper convention changes.
if (wide) {
out[cursor++] = 0xE2;
out[cursor++] = 0x20;
}
// Epilogue: rtl (large memory model -b uses JSL/RTL). // Epilogue: rtl (large memory model -b uses JSL/RTL).
out[cursor++] = 0x6B; out[cursor++] = 0x6B;
return cursor; return cursor;

View file

@ -1,23 +1,20 @@
// x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC // x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
// draw / save / restore routines that read or write 4bpp packed // draw routines that write 4bpp packed surface bytes via
// surface bytes via [esi+disp8] chains. The C side calls them // [esi+disp8] chains. The C side calls them through a function
// through a function pointer cast. // pointer cast.
// //
// Calling convention: // Calling convention:
// draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi // draw(uint8_t *dst) -- esi advances row by row
// save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save
// //
// Per-byte emit, with opaque-run coalescing for the draw path: // Save and restore are not compiled -- they're uniform memcpy-
// shaped operations and the C interpreter handles them at memcpy
// speed via the standard library.
//
// Per-byte emit (no run coalescing yet):
// - all-transparent (both nibbles 0): skip, no instruction // - all-transparent (both nibbles 0): skip, no instruction
// - all-opaque: mov byte [esi+col], imm8 (4 bytes encoded)
// - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al // - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
// (3 + 2 + 2 + 3 = 10 bytes) // (3 + 2 + 2 + 3 = 10 bytes)
// - run of N consecutive fully-opaque bytes: emit largest chunks
// while N >= 4: mov dword [esi+col], imm32 (7 bytes, 1 store)
// if N >= 2: mov word [esi+col], imm16 (6 bytes, 1 store)
// if N == 1: mov byte [esi+col], imm8 (4 bytes, 1 store)
// A run of 4 opaque bytes is therefore one 7-byte store instead of
// four 4-byte stores (16 bytes / 4 stores). Unaligned access is
// fine on 386+.
// Per row: // Per row:
// add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded) // add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded)
// Prologue: // Prologue:
@ -48,69 +45,12 @@
// ----- Prototypes ----- // ----- Prototypes -----
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
// ----- Emit helpers (alphabetical) ----- // ----- Emit helpers (alphabetical) -----
// Emits the row-copy body shared by the x86 SAVE and RESTORE routines.
//
// Each of the heightPx rows is copied with `rep movsd` for the whole
// dwords in copyBytes followed by `rep movsb` for the 0..3 leftover
// bytes; either piece is omitted when its count is zero. Between rows
// (never after the last one) an `add esi/edi, imm32` skips the rest
// of the scanline on the screen side, where the immediate is
// SURFACE_BYTES_PER_ROW - copyBytes. The backup side is contiguous
// and needs no fix-up: rep movs* leaves its index register just past
// the last byte copied.
//
//   strideOnSrc == true   -> esi (source) is the screen: SAVE
//   strideOnSrc == false  -> edi (destination) is the screen: RESTORE
//
// `cursor` is the write offset into `out` on entry; the offset just
// past the last emitted byte is returned.
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
    const uint16_t dwordCount = (uint16_t)(copyBytes >> 2);
    const uint16_t byteTail   = (uint16_t)(copyBytes & 0x3u);
    const int32_t  rowGap     = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes;
    // ModRM byte selecting the register for `add r32, imm32` (81 /0):
    // 0xC6 = esi, 0xC7 = edi.
    const uint8_t  addModRm   = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u);
    uint16_t       rowsLeft;

    for (rowsLeft = heightPx; rowsLeft > 0; rowsLeft--) {
        if (dwordCount != 0) {
            // mov ecx, dwordCount (B9 imm32)
            out[cursor++] = 0xB9;
            out[cursor++] = (uint8_t)(dwordCount & 0xFFu);
            out[cursor++] = (uint8_t)((dwordCount >> 8) & 0xFFu);
            out[cursor++] = 0;
            out[cursor++] = 0;
            // rep movsd (F3 A5)
            out[cursor++] = 0xF3;
            out[cursor++] = 0xA5;
        }
        if (byteTail != 0) {
            // mov ecx, byteTail (B9 imm32)
            out[cursor++] = 0xB9;
            out[cursor++] = (uint8_t)(byteTail & 0xFFu);
            out[cursor++] = 0;
            out[cursor++] = 0;
            out[cursor++] = 0;
            // rep movsb (F3 A4)
            out[cursor++] = 0xF3;
            out[cursor++] = 0xA4;
        }
        // Step the strided pointer to the next scanline, except after
        // the final row: add esi|edi, rowGap (81 C6|C7 imm32).
        if (rowsLeft > 1) {
            out[cursor++] = 0x81;
            out[cursor++] = addModRm;
            out[cursor++] = (uint8_t)(rowGap & 0xFFu);
            out[cursor++] = (uint8_t)((rowGap >> 8) & 0xFFu);
            out[cursor++] = (uint8_t)((rowGap >> 16) & 0xFFu);
            out[cursor++] = (uint8_t)((rowGap >> 24) & 0xFFu);
        }
    }
    return cursor;
}
// Decompose a destination byte's contribution from the sprite into // Decompose a destination byte's contribution from the sprite into
// (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble // (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble
// 0xF0 means high dest nibble is opaque; 0x0F means low is opaque; // 0xF0 means high dest nibble is opaque; 0x0F means low is opaque;
@ -188,17 +128,11 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t cursor; uint16_t cursor;
uint16_t row; uint16_t row;
uint16_t col; uint16_t col;
uint16_t runEnd;
uint16_t runLen;
uint16_t heightPx; uint16_t heightPx;
uint16_t spriteBytesPerRow; uint16_t spriteBytesPerRow;
uint16_t destBytesPerRow; uint16_t destBytesPerRow;
uint8_t value; uint8_t value;
uint8_t opaqueMask; uint8_t opaqueMask;
uint8_t v1;
uint8_t v2;
uint8_t v3;
uint8_t m;
cursor = 0; cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
@ -210,7 +144,7 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x8B; out[cursor++] = 0x74;
out[cursor++] = 0x24; out[cursor++] = 0x08; out[cursor++] = 0x24; out[cursor++] = 0x08;
// Body: per row, scan dest bytes coalescing fully-opaque runs. // Body: per row, per dest byte.
for (row = 0; row < heightPx; row++) { for (row = 0; row < heightPx; row++) {
if (row > 0) { if (row > 0) {
// add esi, SURFACE_BYTES_PER_ROW (32-bit imm) // add esi, SURFACE_BYTES_PER_ROW (32-bit imm)
@ -220,14 +154,17 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = 0x00; out[cursor++] = 0x00;
out[cursor++] = 0x00; out[cursor++] = 0x00;
} }
col = 0; for (col = 0; col < destBytesPerRow; col++) {
while (col < destBytesPerRow) {
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
if (opaqueMask == 0x00) { if (opaqueMask == 0x00) {
col++; continue; // both nibbles transparent
continue;
} }
if (opaqueMask != 0xFFu) { if (opaqueMask == 0xFFu) {
// mov byte [esi+col], imm8 (C6 46 cc ii)
out[cursor++] = 0xC6; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
} else {
// Mixed: read-modify-write. // Mixed: read-modify-write.
// mov al, [esi+col] (8A 46 cc) // mov al, [esi+col] (8A 46 cc)
// and al, ~opaqueMask (24 mm) // and al, ~opaqueMask (24 mm)
@ -241,61 +178,6 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = value; out[cursor++] = value;
out[cursor++] = 0x88; out[cursor++] = 0x46; out[cursor++] = 0x88; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu); out[cursor++] = (uint8_t)(col & 0xFFu);
col++;
continue;
}
// Fully opaque at col -- find the end of the run.
runEnd = (uint16_t)(col + 1);
while (runEnd < destBytesPerRow) {
shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m);
if (m != 0xFFu) {
break;
}
runEnd++;
}
runLen = (uint16_t)(runEnd - col);
// Emit dword stores while >= 4 bytes remain, then a word
// store if >= 2, then a single byte. shiftedByteAt is cheap
// enough that re-reading per chunk beats threading a
// fixed-size buffer through.
while (runLen >= 4) {
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m);
shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m);
// mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii)
out[cursor++] = 0xC7; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
out[cursor++] = v1;
out[cursor++] = v2;
out[cursor++] = v3;
col = (uint16_t)(col + 4);
runLen = (uint16_t)(runLen - 4);
if (runLen > 0) {
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
}
}
if (runLen >= 2) {
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
// mov word [esi+col], imm16 (66 C7 46 cc ii ii)
out[cursor++] = 0x66;
out[cursor++] = 0xC7; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
out[cursor++] = v1;
col = (uint16_t)(col + 2);
runLen = (uint16_t)(runLen - 2);
if (runLen > 0) {
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
}
}
if (runLen == 1) {
// mov byte [esi+col], imm8 (C6 46 cc ii)
out[cursor++] = 0xC6; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
col++;
} }
} }
} }
@ -307,51 +189,3 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
} }
// RESTORE emitter: generates a routine that copies the contiguous
// backup buffer back onto the screen, so the destination (edi) is the
// side that carries the scanline stride.
//
// Emitted layout: prologue saving esi/edi and loading them from the
// two stack arguments, the shared copy body, then an epilogue that
// restores both registers and returns. Returns the number of bytes
// written to `out`.
uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    // push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16]
    static const uint8_t prologueBytes[10] = {
        0x56,
        0x57,
        0x8B, 0x74, 0x24, 0x0C,
        0x8B, 0x7C, 0x24, 0x10
    };
    // pop edi; pop esi; ret
    static const uint8_t epilogueBytes[3] = { 0x5F, 0x5E, 0xC3 };
    uint16_t pos      = 0;
    uint16_t rows     = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    // An odd-x placement (shift == 1) touches one extra byte per row.
    uint16_t edgeByte = (uint16_t)((shift == 1u) ? 1u : 0u);
    uint16_t rowBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + edgeByte);
    uint16_t i;

    for (i = 0; i < 10; i++) {
        out[pos++] = prologueBytes[i];
    }

    pos = emitCopyBodyX86(out, pos, rows, rowBytes, false);

    for (i = 0; i < 3; i++) {
        out[pos++] = epilogueBytes[i];
    }
    return pos;
}
// SAVE emitter: generates a routine that copies the screen rectangle
// under the sprite into the contiguous backup buffer, so the source
// (esi) is the side that carries the scanline stride.
//
// Emitted layout: prologue saving esi/edi and loading them from the
// two stack arguments, the shared copy body, then an epilogue that
// restores both registers and returns. Returns the number of bytes
// written to `out`.
uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    // push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16]
    static const uint8_t prologueBytes[10] = {
        0x56,
        0x57,
        0x8B, 0x74, 0x24, 0x0C,
        0x8B, 0x7C, 0x24, 0x10
    };
    // pop edi; pop esi; ret
    static const uint8_t epilogueBytes[3] = { 0x5F, 0x5E, 0xC3 };
    uint16_t pos      = 0;
    uint16_t rows     = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    // An odd-x placement (shift == 1) touches one extra byte per row.
    uint16_t edgeByte = (uint16_t)((shift == 1u) ? 1u : 0u);
    uint16_t rowBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + edgeByte);
    uint16_t i;

    for (i = 0; i < 10; i++) {
        out[pos++] = prologueBytes[i];
    }

    pos = emitCopyBodyX86(out, pos, rows, rowBytes, true);

    for (i = 0; i < 3; i++) {
        out[pos++] = epilogueBytes[i];
    }
    return pos;
}

View file

@ -27,19 +27,11 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
// rectangle between the destination surface and a backup buffer. The // rectangle between the destination surface and a backup buffer. The
// rectangle's width and start position depend on the shift: for // rectangle's width and start position depend on the shift: for
// shift=0 (even x) it covers exactly the sprite's bytes per row; // shift=0 (even x) it covers exactly the sprite's bytes per row;
// for shift=1 (odd x) it covers one extra byte (left edge nibble). // for shift=1 (odd x) it covers one extra byte on each side, rounded
// Per-CPU emitters return 0 to mean "not implemented" -- the runtime // up to even. Per-CPU emitters return 0 to mean "not implemented" --
// dispatch falls back to the interpreted path in that case. // the runtime dispatch falls back to the interpreted path in that
// // case.
// IIgs uses a self-modifying MVN-stub on top of these bytes; x86 and
// 68k use a plain cdecl `void copy(const uint8_t *src, uint8_t *dst)`
// where the caller swaps args between SAVE (screen->backup) and
// RESTORE (backup->screen).
uint16_t spriteEmitSaveIigs (uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitSaveIigs (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift); uint16_t spriteEmitRestoreIigs(uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitSaveX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestoreX86 (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitSave68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
uint16_t spriteEmitRestore68k (uint8_t *out, const SpriteT *sp, uint8_t shift);
#endif #endif

View file

@ -1,10 +1,13 @@
| Amiga chunky-to-planar conversion -- 68000 hand-rolled. | Amiga chunky-to-planar conversion -- 68000 hand-rolled.
| |
| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a | Drop-in replacement for hal.c's old c2pRange C inner loop. The C
| 4 KB lookup table built once at HAL init: each (sourceByte, position, | version walked every pixel and OR'd individual bits into 4 plane
| plane) tuple maps to the plane-byte bit contribution that source | accumulators -- ~1.5 s for a full 320x200 frame on a 7 MHz 68000
| byte makes when it sits at that position within a 4-byte (8-pixel) | (the GCC m68k codegen is poor for tight bit-twiddling). This rewrite
| planar group going to that plane. | uses a 4 KB lookup table built once at HAL init: each (sourceByte,
| bytePosition, plane) tuple maps to the plane-byte-bit contribution
| that source byte makes when it sits at that position within a
| 4-byte (= 8-pixel) planar group.
| |
| Calling convention: m68k-amigaos-gcc cdecl. | Calling convention: m68k-amigaos-gcc cdecl.
| Args on stack at 4(sp), 8(sp), ... | Args on stack at 4(sp), 8(sp), ...
@ -19,17 +22,12 @@
| uint16_t n, ; 24(sp) - planar byte count (low word) | uint16_t n, ; 24(sp) - planar byte count (low word)
| const uint8_t *lut); ; 28(sp) - 4 KB LUT base | const uint8_t *lut); ; 28(sp) - 4 KB LUT base
| |
| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution | LUT layout: lut[pos*1024 + plane*256 + src] = 1-byte plane contribution
| for source byte `src` sitting at byte-position `pos` (0..3) within | for source byte `src` sitting at byte-position `pos` within its
| its 4-byte planar group, going to plane `plane` (0..3). All 16 | 4-byte planar group, going to plane `plane`. Byte-position 0 is the
| (pos, plane) entries for one src byte are contiguous, so the inner | leftmost (its two pixels land in plane-byte bits 7 and 6); position
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement | 3 is the rightmost (bits 1 and 0). Built once by chunkyToPlanarInit
| (0..15) and never has to advance an index register. | (in hal.c) at HAL boot.
|
| Per planar byte we consume 4 source bytes (positions 0..3 of the
| 8-pixel group). For each we compute d4 = src*16 with four add.w's
| (faster than asl.w on 68000) and OR the four plane contributions
| into d0..d3 with byte-displaced (a5,d4.w) reads.
| |
| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the | GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
| gcc driver. | gcc driver.
@ -67,52 +65,54 @@ _chunkyToPlanarRow:
moveq #0,%d3 | plane 3 acc moveq #0,%d3 | plane 3 acc
| ----- Source byte position 0 ----- | ----- Source byte position 0 -----
| a5 points to start of LUT. Plane 0/1/2/3 sub-tables
| for position 0 are at offsets 0/256/512/768.
moveq #0,%d4 moveq #0,%d4
move.b (%a0)+,%d4 | src[0] move.b (%a0)+,%d4 | src[0]
add.w %d4,%d4 move.l %a5,%a6
add.w %d4,%d4 or.b (%a6,%d4.w),%d0 | +0 = pos0 plane 0
add.w %d4,%d4 lea 256(%a6),%a6
add.w %d4,%d4 | d4 = src * 16 or.b (%a6,%d4.w),%d1 | +256 = pos0 plane 1
or.b 0(%a5,%d4.w),%d0 | pos0 plane0 lea 256(%a6),%a6
or.b 1(%a5,%d4.w),%d1 | pos0 plane1 or.b (%a6,%d4.w),%d2 | +512 = pos0 plane 2
or.b 2(%a5,%d4.w),%d2 | pos0 plane2 lea 256(%a6),%a6
or.b 3(%a5,%d4.w),%d3 | pos0 plane3 or.b (%a6,%d4.w),%d3 | +768 = pos0 plane 3
| ----- Source byte position 1 ----- | ----- Source byte position 1 -----
lea 256(%a6),%a6 | advance to pos1 plane 0
moveq #0,%d4 moveq #0,%d4
move.b (%a0)+,%d4 | src[1] move.b (%a0)+,%d4
add.w %d4,%d4 or.b (%a6,%d4.w),%d0
add.w %d4,%d4 lea 256(%a6),%a6
add.w %d4,%d4 or.b (%a6,%d4.w),%d1
add.w %d4,%d4 lea 256(%a6),%a6
or.b 4(%a5,%d4.w),%d0 | pos1 plane0 or.b (%a6,%d4.w),%d2
or.b 5(%a5,%d4.w),%d1 | pos1 plane1 lea 256(%a6),%a6
or.b 6(%a5,%d4.w),%d2 | pos1 plane2 or.b (%a6,%d4.w),%d3
or.b 7(%a5,%d4.w),%d3 | pos1 plane3
| ----- Source byte position 2 ----- | ----- Source byte position 2 -----
lea 256(%a6),%a6
moveq #0,%d4 moveq #0,%d4
move.b (%a0)+,%d4 | src[2] move.b (%a0)+,%d4
add.w %d4,%d4 or.b (%a6,%d4.w),%d0
add.w %d4,%d4 lea 256(%a6),%a6
add.w %d4,%d4 or.b (%a6,%d4.w),%d1
add.w %d4,%d4 lea 256(%a6),%a6
or.b 8(%a5,%d4.w),%d0 | pos2 plane0 or.b (%a6,%d4.w),%d2
or.b 9(%a5,%d4.w),%d1 | pos2 plane1 lea 256(%a6),%a6
or.b 10(%a5,%d4.w),%d2 | pos2 plane2 or.b (%a6,%d4.w),%d3
or.b 11(%a5,%d4.w),%d3 | pos2 plane3
| ----- Source byte position 3 ----- | ----- Source byte position 3 -----
lea 256(%a6),%a6
moveq #0,%d4 moveq #0,%d4
move.b (%a0)+,%d4 | src[3] move.b (%a0)+,%d4
add.w %d4,%d4 or.b (%a6,%d4.w),%d0
add.w %d4,%d4 lea 256(%a6),%a6
add.w %d4,%d4 or.b (%a6,%d4.w),%d1
add.w %d4,%d4 lea 256(%a6),%a6
or.b 12(%a5,%d4.w),%d0 | pos3 plane0 or.b (%a6,%d4.w),%d2
or.b 13(%a5,%d4.w),%d1 | pos3 plane1 lea 256(%a6),%a6
or.b 14(%a5,%d4.w),%d2 | pos3 plane2 or.b (%a6,%d4.w),%d3
or.b 15(%a5,%d4.w),%d3 | pos3 plane3
| ----- Store plane bytes ----- | ----- Store plane bytes -----
move.b %d0,(%a1)+ move.b %d0,(%a1)+

View file

@ -77,12 +77,11 @@ static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE
static bool gCacheValid = false; static bool gCacheValid = false;
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow // 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] = // (src/port/amiga/c2p.s). Layout: gC2pLut[pos*1024 + plane*256 + src]
// the plane-byte bit contribution that source byte `src` makes to // = the plane-byte bit contribution that source byte `src` makes when
// plane `plane` when it sits at byte-position `pos` within a 4-byte // it sits at byte-position `pos` within a 4-byte (8-pixel) planar
// (8-pixel) planar group. The src-major layout lets the asm inner // group, going to plane `plane`. Built once by initC2pLut on the
// loop reach all 16 (pos, plane) entries for a single src byte via // first halPresent call.
// 8-bit displacements off (a5, d4.w) without any LEA between reads.
static uint8_t gC2pLut[4 * 1024]; static uint8_t gC2pLut[4 * 1024];
static bool gC2pLutReady = false; static bool gC2pLutReady = false;
@ -117,14 +116,14 @@ static void initC2pLut(void) {
if (gC2pLutReady) { if (gC2pLutReady) {
return; return;
} }
for (src = 0; src < 256; src++) {
for (pos = 0; pos < 4; pos++) { for (pos = 0; pos < 4; pos++) {
highShift = (uint8_t)(7 - 2 * pos); highShift = (uint8_t)(7 - 2 * pos);
lowShift = (uint8_t)(6 - 2 * pos); lowShift = (uint8_t)(6 - 2 * pos);
for (plane = 0; plane < 4; plane++) { for (plane = 0; plane < 4; plane++) {
for (src = 0; src < 256; src++) {
highBit = (uint8_t)(((src >> 4) >> plane) & 1); highBit = (uint8_t)(((src >> 4) >> plane) & 1);
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1); lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
gC2pLut[src * 16 + pos * 4 + plane] = gC2pLut[pos * 1024 + plane * 256 + src] =
(uint8_t)((highBit << highShift) | (lowBit << lowShift)); (uint8_t)((highBit << highShift) | (lowBit << lowShift));
} }
} }

View file

@ -1,188 +0,0 @@
| Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
|
| Drop-in replacement for hal.c's old c2pRow C inner loop. The C
| version walked every pixel and built each plane word with a
| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
| codegen overhead. This rewrite uses a 4 KB lookup table built once
| at HAL init: same layout as the Amiga c2p LUT, so the
| (sourceByte, position, plane) -> 2-bit contribution mapping is
| identical, but the routine packs results into ST word-interleaved
| planar (4 plane words per 16-pixel group) instead of 4 separate
| plane bytes.
|
| Each ST group is 8 source bytes -> 4 plane words. Source byte
| positions 0..3 contribute to the HIGH byte of each plane word
| (bits 15..8); positions 4..7 contribute to the LOW byte (bits
| 7..0). Within a byte, the LUT for (src, bp%4, plane) already
| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
| entries for both halves -- we just shift d0..d3 left by 8 between
| the halves to move the high-half bits up before the low half ORs
| into the now-empty low byte.
|
| Calling convention: m68k-atari-mint-gcc cdecl.
| Args on stack at 4(sp), 8(sp), ...
| d2-d7, a2-a6 are callee-save.
| No return value.
|
| void chunkyToPlanarRowSt(const uint8_t *src, ; 4(sp) - 4bpp packed source row
| uint16_t *dst, ; 8(sp) - planar dest row (uint16_t*)
| uint16_t groupStart, ; 12(sp) - first group index (low word)
| uint16_t groupEnd, ; 16(sp) - one-past-last group index (low word)
| const uint8_t *lut); ; 20(sp) - 4 KB LUT base
|
| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
| contribution for source byte `src` at byte-position `pos` (0..3
| within a 4-byte chunk) going to plane `plane` (0..3). All 16
| (pos, plane) entries for one src byte are contiguous, so the inner
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
| (0..15) without LEA between reads.
|
| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
| the gcc driver.
.text
.globl _chunkyToPlanarRowSt
| MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes.
.equ SAVED_REGS_SIZE, 44
_chunkyToPlanarRowSt:
| Convert groups [groupStart, groupEnd) of one 4bpp packed row into ST
| word-interleaved planar words using the caller-supplied LUT.
|
| Register use inside the routine:
|   a0 = source read cursor          a1 = destination write cursor
|   a5 = LUT base                    d4 = scratch (current src byte * 16)
|   d0-d3 = plane 0..3 accumulators for the group being built
|   d6 = groupStart                  d7 = group loop counter (DBRA)
| d5, a2-a4 and a6 are saved/restored by the MOVEM pair but are never
| referenced in between.
movem.l %d2-%d7/%a2-%a6,-(%sp)
| Arguments sit above the MOVEM frame, hence the SAVED_REGS_SIZE bias.
move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src row base
move.l 8+SAVED_REGS_SIZE(%sp),%a1 | dst (uint16_t*)
| Both groupStart and groupEnd are uint16_t but GCC
| promotes them to int and pushes 4 bytes each; the
| low word lives at +2 in big-endian layout.
move.w 12+SAVED_REGS_SIZE+2(%sp),%d6 | groupStart
move.w 16+SAVED_REGS_SIZE+2(%sp),%d7 | groupEnd
move.l 20+SAVED_REGS_SIZE(%sp),%a5 | LUT base
| Advance src and dst to the first group's data.
| Each group consumes 8 source bytes and produces 4
| dest words (8 bytes), so both pointers advance by
| groupStart * 8.
move.w %d6,%d4
lsl.w #3,%d4 | d4 = groupStart * 8 (16-bit result)
| NOTE(review): a word-sized add to an address register sign-extends
| the source on 68000, so this presumably relies on groupStart * 8
| staying below 32768 -- confirm against the largest row width.
add.w %d4,%a0
add.w %d4,%a1
sub.w %d6,%d7 | groupCount = end - start
subq.w #1,%d7 | DBRA bias
| Negative after the bias means groupCount was zero (or end < start):
| nothing to convert.
bmi .Ldone
.LgroupLoop:
| One iteration = one group: 8 source bytes in, 4 plane words out.
moveq #0,%d0 | plane 0 acc
moveq #0,%d1 | plane 1 acc
moveq #0,%d2 | plane 2 acc
moveq #0,%d3 | plane 3 acc
| ===== Source bytes 0..3 -> high byte of each plane word =====
| Byte 0 of the half: LUT entries 0..3 (pos 0, planes 0..3).
moveq #0,%d4 | clear upper bits so the byte load is zero-extended
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4 | d4 = src * 16
or.b 0(%a5,%d4.w),%d0
or.b 1(%a5,%d4.w),%d1
or.b 2(%a5,%d4.w),%d2
or.b 3(%a5,%d4.w),%d3
| Byte 1 of the half: LUT entries 4..7 (pos 1, planes 0..3).
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 4(%a5,%d4.w),%d0
or.b 5(%a5,%d4.w),%d1
or.b 6(%a5,%d4.w),%d2
or.b 7(%a5,%d4.w),%d3
| Byte 2 of the half: LUT entries 8..11 (pos 2, planes 0..3).
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 8(%a5,%d4.w),%d0
or.b 9(%a5,%d4.w),%d1
or.b 10(%a5,%d4.w),%d2
or.b 11(%a5,%d4.w),%d3
| Byte 3 of the half: LUT entries 12..15 (pos 3, planes 0..3).
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 12(%a5,%d4.w),%d0
or.b 13(%a5,%d4.w),%d1
or.b 14(%a5,%d4.w),%d2
or.b 15(%a5,%d4.w),%d3
| Move accumulated bits into the HIGH byte of each word.
| The low byte becomes zero, ready for the second half's byte ORs.
lsl.w #8,%d0
lsl.w #8,%d1
lsl.w #8,%d2
lsl.w #8,%d3
| ===== Source bytes 4..7 -> low byte of each plane word =====
| Same LUT displacements as the first half; the byte-sized ORs only
| touch bits 7..0, leaving the shifted high byte intact.
| Byte 4 of the group: LUT entries 0..3.
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 0(%a5,%d4.w),%d0
or.b 1(%a5,%d4.w),%d1
or.b 2(%a5,%d4.w),%d2
or.b 3(%a5,%d4.w),%d3
| Byte 5 of the group: LUT entries 4..7.
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 4(%a5,%d4.w),%d0
or.b 5(%a5,%d4.w),%d1
or.b 6(%a5,%d4.w),%d2
or.b 7(%a5,%d4.w),%d3
| Byte 6 of the group: LUT entries 8..11.
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 8(%a5,%d4.w),%d0
or.b 9(%a5,%d4.w),%d1
or.b 10(%a5,%d4.w),%d2
or.b 11(%a5,%d4.w),%d3
| Byte 7 of the group: LUT entries 12..15.
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 12(%a5,%d4.w),%d0
or.b 13(%a5,%d4.w),%d1
or.b 14(%a5,%d4.w),%d2
or.b 15(%a5,%d4.w),%d3
| Store 4 plane words.
| Written in plane order 0,1,2,3; a1 ends up at the next group's slot.
move.w %d0,(%a1)+
move.w %d1,(%a1)+
move.w %d2,(%a1)+
move.w %d3,(%a1)+
dbra %d7,.LgroupLoop
.Ldone:
| Restore the callee-save registers pushed on entry and return.
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts

View file

@ -64,19 +64,12 @@
// ----- Prototypes ----- // ----- Prototypes -----
static uint16_t quantizeColorToSt(uint16_t orgb); static uint16_t quantizeColorToSt(uint16_t orgb);
static void c2pRow(const uint8_t *src, uint16_t *dst, uint16_t groupStart, uint16_t groupEnd);
static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd); static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd);
static void flattenScbPalettes(const SurfaceT *src); static void flattenScbPalettes(const SurfaceT *src);
static void initC2pLut(void);
static void writeDiagnostics(void); static void writeDiagnostics(void);
static long writePrevPaletteRegs(void); static long writePrevPaletteRegs(void);
// Provided by src/port/atarist/c2p.s.
extern void chunkyToPlanarRowSt(const uint8_t *src,
uint16_t *dst,
uint16_t groupStart,
uint16_t groupEnd,
const uint8_t *lut);
static __attribute__((interrupt_handler)) void timerBIsr(void); static __attribute__((interrupt_handler)) void timerBIsr(void);
static __attribute__((interrupt_handler)) void vblIsr(void); static __attribute__((interrupt_handler)) void vblIsr(void);
static void buildTransitions(const SurfaceT *src); static void buildTransitions(const SurfaceT *src);
@ -136,31 +129,55 @@ static uint8_t gCachedScb [SURFACE_HEIGHT];
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
static bool gCacheValid = false; static bool gCacheValid = false;
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
// = the 2-bit plane-byte contribution for source byte `src` at
// byte-position `pos` (0..3 within a 4-byte chunk) going to plane
// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
// the same table feeds both halves of an ST plane word: positions
// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
// byte. Built once by initC2pLut on the first halPresent call.
static uint8_t gC2pLut[4 * 1024];
static bool gC2pLutReady = false;
// ----- Internal helpers (alphabetical) ----- // ----- Internal helpers (alphabetical) -----
// Chunky-to-planar for one row in ST word-interleaved layout. Each
// group is 8 packed source bytes (16 pixels at 4bpp, high nibble =
// left pixel) and becomes 4 consecutive plane words in dst, with the
// group's leftmost pixel in bit 15 of every word. Only groups in
// [groupStart, groupEnd) are read and written, so halPresentRect can
// leave unchanged groups alone.
static void c2pRow(const uint8_t *src, uint16_t *dst, uint16_t groupStart, uint16_t groupEnd) {
    const uint8_t *in;
    uint16_t      *out;
    uint16_t       acc[4];
    uint16_t       leftMask;
    uint16_t       rightMask;
    uint16_t       g;
    uint8_t        i;
    uint8_t        p;
    uint8_t        pair;

    for (g = groupStart; g < groupEnd; g++) {
        in  = &src[g * 8];
        out = &dst[g * 4];

        acc[0] = 0;
        acc[1] = 0;
        acc[2] = 0;
        acc[3] = 0;

        // leftMask selects the word bit of the high-nibble pixel of the
        // current byte; the low-nibble pixel sits one bit to its right.
        leftMask = 0x8000u;
        for (i = 0; i < 8; i++) {
            pair      = in[i];
            rightMask = (uint16_t)(leftMask >> 1);
            for (p = 0; p < 4; p++) {
                if ((pair >> (4 + p)) & 1) {
                    acc[p] = (uint16_t)(acc[p] | leftMask);
                }
                if ((pair >> p) & 1) {
                    acc[p] = (uint16_t)(acc[p] | rightMask);
                }
            }
            leftMask = (uint16_t)(leftMask >> 2);
        }

        out[0] = acc[0];
        out[1] = acc[1];
        out[2] = acc[2];
        out[3] = acc[3];
    }
}
static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd) { static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd) {
int16_t y; int16_t y;
const uint8_t *srcLine; const uint8_t *srcLine;
uint16_t *dstLine; uint16_t *dstLine;
if (!gC2pLutReady) {
initC2pLut();
}
for (y = y0; y < y1; y++) { for (y = y0; y < y1; y++) {
srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW]; srcLine = &src->pixels[y * SURFACE_BYTES_PER_ROW];
dstLine = (uint16_t *)&gScreenBase[y * ST_BYTES_PER_ROW]; dstLine = (uint16_t *)&gScreenBase[y * ST_BYTES_PER_ROW];
chunkyToPlanarRowSt(srcLine, dstLine, groupStart, groupEnd, gC2pLut); c2pRow(srcLine, dstLine, groupStart, groupEnd);
} }
} }
@ -246,37 +263,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
} }
// Fill gC2pLut, the 4 KB chunky-to-planar table that
// chunkyToPlanarRowSt reads (addressing math lives in
// src/port/atarist/c2p.s). Entry [src*16 + pos*4 + plane] holds the
// two bits that source byte `src` contributes to plane `plane` at
// byte-position `pos`: the high-nibble pixel lands in bit 7-2*pos and
// the low-nibble pixel in bit 6-2*pos. Runs once; later calls return
// immediately.
static void initC2pLut(void) {
    uint8_t *entry;
    uint16_t src;
    uint16_t pos;
    uint16_t plane;
    uint8_t  hiNibble;
    uint8_t  loNibble;
    uint8_t  hiShift;
    uint8_t  loShift;

    if (gC2pLutReady) {
        return;
    }

    // The loops nest src -> pos -> plane, which is exactly the table's
    // storage order, so entries can be written back to back.
    entry = gC2pLut;
    for (src = 0; src < 256; src++) {
        hiNibble = (uint8_t)(src >> 4);
        loNibble = (uint8_t)(src & 0x0F);
        for (pos = 0; pos < 4; pos++) {
            hiShift = (uint8_t)(7 - 2 * pos);
            loShift = (uint8_t)(6 - 2 * pos);
            for (plane = 0; plane < 4; plane++) {
                *entry = (uint8_t)((((hiNibble >> plane) & 1) << hiShift) |
                                   (((loNibble >> plane) & 1) << loShift));
                entry++;
            }
        }
    }

    gC2pLutReady = true;
}
// 12-bit $0RGB to STF 9-bit palette register (drops the low bit of // 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
// each 4-bit channel). // each 4-bit channel).
static uint16_t quantizeColorToSt(uint16_t orgb) { static uint16_t quantizeColorToSt(uint16_t orgb) {

View file

@ -1,36 +1,17 @@
// joeysprite: host-side compiler that turns sprite art into a `.spr` // joeysprite: host-side compiler that turns raw tile data into a
// file ready to be loaded at runtime by spriteLoadFile. // `.spr` file ready to be loaded at runtime by spriteLoadFile.
// //
// Usage: // Usage:
// joeysprite --target {iigs,amiga,atarist,dos} // joeysprite --target {iigs,amiga,atarist,dos}
// [--width-tiles N --height-tiles M] // --width-tiles N --height-tiles M
// INPUT OUTPUT.spr // input.tiles output.spr
// //
// Two input formats are accepted; the first 2 bytes select the path: // `input.tiles` is widthTiles * heightTiles * 32 bytes, laid out
//
// PPM (P6) -- 8-bit-per-channel raster from any pixel-art tool that
// exports PPM (GIMP, ImageMagick `convert`, paint.net, etc.). Image
// dimensions must be multiples of 8 in both axes; widthTiles /
// heightTiles are auto-derived as W/8 and H/8 (CLI overrides are
// optional and must match). Each input RGB is reduced to a 12-bit
// $0RGB color (high nibble of each channel); the input must use
// no more than 16 distinct $0RGB colors after that reduction. The
// FIRST color encountered (typically the top-left pixel) is bound
// to palette index 0, which the runtime treats as transparent --
// so paint your sprite background with that pixel's color.
//
// Raw `.tiles` -- widthTiles * heightTiles * 32 bytes, laid out
// tile-major as the runtime SpriteT.tileData expects: tile (0,0) // tile-major as the runtime SpriteT.tileData expects: tile (0,0)
// first 32 bytes, tile (1,0) next 32, ... tile (widthTiles-1, 0), // first 32 bytes, tile (1,0) next 32, ... tile (widthTiles-1, 0),
// then tile (0,1), and so on. Inside each tile, rows are stored // then tile (0,1), and so on. Inside each tile, rows are stored
// top-to-bottom and each row is 4 bytes (8 pixels at 4bpp packed, // top-to-bottom and each row is 4 bytes (8 pixels at 4bpp packed,
// high nibble = left pixel). --width-tiles / --height-tiles are // high nibble = left pixel).
// required for this path since the file carries no header.
//
// The .spr output carries indices only -- the palette mapping is the
// application's responsibility (typical pattern: ship a separate
// .jas built from the same PPM via joeyasset, or hand-author the
// palette in code).
// //
// Output `.spr` format (target-native byte order for code; see // Output `.spr` format (target-native byte order for code; see
// DESIGN.md §12). Mirrors src/core/sprite.c's reader: // DESIGN.md §12). Mirrors src/core/sprite.c's reader:
@ -40,19 +21,14 @@
// bytes 4-5 tileBytes (LE16) = widthTiles*heightTiles*32 // bytes 4-5 tileBytes (LE16) = widthTiles*heightTiles*32
// ... offsets (JOEY_SPRITE_SHIFT_COUNT * SPRITE_OP_COUNT * // ... offsets (JOEY_SPRITE_SHIFT_COUNT * SPRITE_OP_COUNT *
// uint16_t LE): [draw_s0, save_s0, restore_s0, // uint16_t LE): [draw_s0, save_s0, restore_s0,
// draw_s1, save_s1, restore_s1]. Each entry is the // draw_s1, save_s1, restore_s1]. Save/restore offsets
// byte offset of that routine within the compiled-code // are 0 here -- the runtime keeps the memcpy-based
// region, or 0xFFFF (SPRITE_NOT_COMPILED) if the per-CPU // interpreter for those ops.
// emitter returned 0 bytes for that op -- the runtime
// then falls back to the interpreted memcpy/RMW path.
// ... compiled code (codeSize bytes) // ... compiled code (codeSize bytes)
// ... raw tile data (tileBytes bytes; same layout as the // ... raw tile data (tileBytes bytes; same layout as the
// input file, lets the runtime interpreter handle // input file, lets the runtime interpreter handle
// clipped draws without decoding the compiled bytes). // clipped draws without decoding the compiled bytes).
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -75,26 +51,18 @@ typedef enum {
// ----- Constants ----- // ----- Constants -----
#define MAX_SCRATCH_BYTES (16u * 1024u) #define MAX_SCRATCH_BYTES (16u * 1024u)
// Pixel art conventions for sprite work. #define SPR_HEADER_SIZE 6
#define TILE_PIXELS 8 // Save/restore offsets are reserved (0) for now -- the runtime
#define TILE_BYTES 32 // memcpy interpreter handles them.
#define TILE_BYTES_PER_ROW 4 #define SHIFT_OPS 3
#define MAX_PALETTE_ENTRIES 16 #define OFFSET_TABLE_BYTES (JOEY_SPRITE_SHIFT_COUNT * SHIFT_OPS * 2u)
#define PPM_TOKEN_MAX 64
// ----- Prototypes ----- // ----- Prototypes -----
static int buildPalette(const uint8_t *rgb, int width, int height, uint8_t *outIndices);
static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath); static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath);
static uint16_t emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, uint8_t op, TargetE target); static uint16_t emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, TargetE target);
static bool fileIsPpm(const char *path);
static int loadPpm(const char *path, int *outWidth, int *outHeight, uint8_t **outPixels);
static int loadPpmAsTiles(const char *path, long *widthTiles, long *heightTiles, uint8_t **outTiles, uint32_t *outSize);
static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize); static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize);
static void packIndicesToTiles(const uint8_t *indices, int width, int height, uint8_t *outTiles);
static int parsePpmToken(FILE *fp, char *out, int outLen);
static TargetE parseTarget(const char *name); static TargetE parseTarget(const char *name);
static int usage(const char *prog); static int usage(const char *prog);
static int writeLE16(FILE *fp, uint16_t v); static int writeLE16(FILE *fp, uint16_t v);
@ -102,68 +70,16 @@ static int writeLE16(FILE *fp, uint16_t v);
// ----- Internal helpers (alphabetical) ----- // ----- Internal helpers (alphabetical) -----
// Quantize each RGB triple to 12-bit $0RGB (top nibble of every
// channel) and number the distinct colors in the order they first
// appear, so the first pixel of the buffer always becomes index 0.
// The runtime treats index 0 as transparent, which is why the
// top-left pixel must carry the sprite's background color.
//
// outIndices receives one palette index per pixel (width * height
// entries). Returns the number of distinct colors, or -1 as soon as
// a color beyond MAX_PALETTE_ENTRIES shows up. The $0RGB values
// themselves are discarded; only the indices are reported.
static int buildPalette(const uint8_t *rgb, int width, int height, uint8_t *outIndices) {
    uint16_t       known[MAX_PALETTE_ENTRIES];
    const uint8_t *px;
    uint16_t       key;
    int            used;
    int            pixelCount;
    int            n;
    int            slot;

    used       = 0;
    pixelCount = width * height;
    px         = rgb;

    for (n = 0; n < pixelCount; n++, px += 3) {
        key = (uint16_t)(((px[0] >> 4) << 8) | ((px[1] >> 4) << 4) | (px[2] >> 4));

        // Linear search: at most MAX_PALETTE_ENTRIES candidates.
        slot = 0;
        while (slot < used && known[slot] != key) {
            slot++;
        }

        if (slot == used) {
            if (used >= MAX_PALETTE_ENTRIES) {
                return -1;
            }
            known[used] = key;
            used++;
        }
        outIndices[n] = (uint8_t)slot;
    }

    return used;
}
// Two-pass: pass 1 sizes every (shift, op) routine into shiftOpSizes;
// pass 2 stamps them into the code buffer at their cumulative offsets.
// Routines that return 0 bytes (the per-CPU emitter doesn't implement
// that op) get SPRITE_NOT_COMPILED in their offset slot so the runtime
// dispatch falls back to the interpreted path.
static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath) { static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath) {
uint8_t *scratch; uint8_t *scratch;
uint8_t *codeBuf; uint8_t *codeBuf;
uint16_t routineSizes[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT]; uint16_t shiftLengths[JOEY_SPRITE_SHIFT_COUNT];
uint16_t routineOffsets[JOEY_SPRITE_SHIFT_COUNT][SPRITE_OP_COUNT];
uint32_t totalCodeSize; uint32_t totalCodeSize;
uint8_t shift; uint8_t shift;
uint8_t op; uint8_t op;
uint16_t written; uint16_t written;
uint16_t cursor; uint16_t cursor;
uint16_t value; uint16_t offset;
FILE *fp; FILE *fp;
int rc; int rc;
@ -175,17 +91,10 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
totalCodeSize = 0; totalCodeSize = 0;
for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
for (op = 0; op < SPRITE_OP_COUNT; op++) { written = emitForTarget(scratch, sp, shift, target);
written = emitForTarget(scratch, sp, shift, op, target); shiftLengths[shift] = written;
routineSizes[shift][op] = written;
if (written == 0) {
routineOffsets[shift][op] = SPRITE_NOT_COMPILED;
} else {
routineOffsets[shift][op] = (uint16_t)totalCodeSize;
totalCodeSize += written; totalCodeSize += written;
} }
}
}
if (totalCodeSize > 0xFFFFu) { if (totalCodeSize > 0xFFFFu) {
fprintf(stderr, "joeysprite: emitted %u code bytes; max is 65535\n", fprintf(stderr, "joeysprite: emitted %u code bytes; max is 65535\n",
(unsigned)totalCodeSize); (unsigned)totalCodeSize);
@ -193,7 +102,7 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
return 2; return 2;
} }
codeBuf = (uint8_t *)malloc(totalCodeSize > 0 ? totalCodeSize : 1); codeBuf = (uint8_t *)malloc(totalCodeSize);
if (codeBuf == NULL) { if (codeBuf == NULL) {
fprintf(stderr, "joeysprite: out of memory for code buffer\n"); fprintf(stderr, "joeysprite: out of memory for code buffer\n");
free(scratch); free(scratch);
@ -202,14 +111,9 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
cursor = 0; cursor = 0;
for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
for (op = 0; op < SPRITE_OP_COUNT; op++) { written = emitForTarget(codeBuf + cursor, sp, shift, target);
if (routineSizes[shift][op] == 0) {
continue;
}
written = emitForTarget(codeBuf + cursor, sp, shift, op, target);
cursor = (uint16_t)(cursor + written); cursor = (uint16_t)(cursor + written);
} }
}
fp = fopen(outPath, "wb"); fp = fopen(outPath, "wb");
if (fp == NULL) { if (fp == NULL) {
@ -225,17 +129,25 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
if (rc == 0 && writeLE16(fp, (uint16_t)totalCodeSize) != 0) rc = 2; if (rc == 0 && writeLE16(fp, (uint16_t)totalCodeSize) != 0) rc = 2;
if (rc == 0 && writeLE16(fp, (uint16_t)(sp->widthTiles * sp->heightTiles * 32u)) != 0) rc = 2; if (rc == 0 && writeLE16(fp, (uint16_t)(sp->widthTiles * sp->heightTiles * 32u)) != 0) rc = 2;
// Offset table: cumulative draw offsets + zeros for save/restore.
offset = 0;
for (shift = 0; rc == 0 && shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { for (shift = 0; rc == 0 && shift < JOEY_SPRITE_SHIFT_COUNT; shift++) {
for (op = 0; op < SPRITE_OP_COUNT; op++) { for (op = 0; op < SHIFT_OPS; op++) {
value = routineOffsets[shift][op]; uint16_t value;
if (op == SPRITE_OP_DRAW) {
value = offset;
} else {
value = 0;
}
if (writeLE16(fp, value) != 0) { if (writeLE16(fp, value) != 0) {
rc = 2; rc = 2;
break; break;
} }
} }
offset = (uint16_t)(offset + shiftLengths[shift]);
} }
if (rc == 0 && totalCodeSize > 0) { if (rc == 0) {
if (fwrite(codeBuf, 1, totalCodeSize, fp) != totalCodeSize) { if (fwrite(codeBuf, 1, totalCodeSize, fp) != totalCodeSize) {
rc = 2; rc = 2;
} }
@ -267,207 +179,21 @@ static int compileToSpr(const SpriteT *sp, TargetE target, const char *outPath)
} }
static uint16_t emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, uint8_t op, TargetE target) { static uint16_t emitForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift, TargetE target) {
switch (target) { switch (target) {
case TARGET_DOS: case TARGET_DOS:
switch (op) { return spriteEmitDrawX86(out, sp, shift);
case SPRITE_OP_DRAW: return spriteEmitDrawX86 (out, sp, shift);
case SPRITE_OP_SAVE: return spriteEmitSaveX86 (out, sp, shift);
case SPRITE_OP_RESTORE: return spriteEmitRestoreX86(out, sp, shift);
default: return 0;
}
case TARGET_AMIGA: case TARGET_AMIGA:
case TARGET_ATARIST: case TARGET_ATARIST:
switch (op) { return spriteEmitDraw68k(out, sp, shift);
case SPRITE_OP_DRAW: return spriteEmitDraw68k (out, sp, shift);
case SPRITE_OP_SAVE: return spriteEmitSave68k (out, sp, shift);
case SPRITE_OP_RESTORE: return spriteEmitRestore68k(out, sp, shift);
default: return 0;
}
case TARGET_IIGS: case TARGET_IIGS:
switch (op) { return spriteEmitDrawIigs(out, sp, shift);
case SPRITE_OP_DRAW: return spriteEmitDrawIigs (out, sp, shift);
case SPRITE_OP_SAVE: return spriteEmitSaveIigs (out, sp, shift);
case SPRITE_OP_RESTORE: return spriteEmitRestoreIigs(out, sp, shift);
default: return 0;
}
default: default:
return 0; return 0;
} }
} }
// Report whether the file at `path` starts with the two-byte PPM
// magic "P6". Any failure (file cannot be opened, fewer than two
// bytes available) yields false; the caller then hands the path to
// the raw .tiles loader, which reports its own error if the bytes
// are not valid tile data either.
static bool fileIsPpm(const char *path) {
    unsigned char magic[2];
    size_t        got;
    FILE         *in;

    in = fopen(path, "rb");
    if (in == NULL) {
        return false;
    }
    got = fread(magic, 1, sizeof(magic), in);
    fclose(in);

    if (got != sizeof(magic)) {
        return false;
    }
    return (magic[0] == 'P' && magic[1] == '6');
}
// Read the next PPM header token and parse it as a plain decimal
// integer. `field` names the header field for diagnostics. Prints a
// message and returns non-zero when the header ends early or the
// token is not a complete, in-range decimal number (atoi would
// silently accept "12abc" and is undefined on overflow).
static int readPpmHeaderInt(FILE *fp, const char *path, const char *field, int *outValue) {
    char  tok[PPM_TOKEN_MAX];
    char *end;
    long  value;

    if (parsePpmToken(fp, tok, sizeof(tok)) != 0) {
        fprintf(stderr, "joeysprite: %s: header ends before %s\n", path, field);
        return 2;
    }
    errno = 0;
    value = strtol(tok, &end, 10);
    // The int round-trip rejects values that fit in long but not int.
    if (end == tok || *end != '\0' || errno == ERANGE || (long)(int)value != value) {
        fprintf(stderr, "joeysprite: %s: %s '%s' is not a valid integer\n", path, field, tok);
        return 2;
    }
    *outValue = (int)value;
    return 0;
}


// Read a PPM (P6) raster into a freshly allocated 8-bit RGB buffer
// (3 bytes per pixel, row-major).
//
//   path       file to read
//   outWidth   receives the image width in pixels
//   outHeight  receives the image height in pixels
//   outPixels  receives the malloc'd RGB buffer; caller frees it
//
// Returns 0 on success. On any failure a diagnostic is written to
// stderr, 2 is returned and none of the out-parameters are touched.
// Only maxval 255 is accepted.
static int loadPpm(const char *path, int *outWidth, int *outHeight, uint8_t **outPixels) {
    FILE    *fp;
    char     tok[PPM_TOKEN_MAX];
    int      width;
    int      height;
    int      maxval;
    size_t   pixelBytes;
    size_t   got;
    uint8_t *buf;

    fp = fopen(path, "rb");
    if (fp == NULL) {
        fprintf(stderr, "joeysprite: cannot open %s: %s\n", path, strerror(errno));
        return 2;
    }
    if (parsePpmToken(fp, tok, sizeof(tok)) != 0 || strcmp(tok, "P6") != 0) {
        fprintf(stderr, "joeysprite: %s is not a PPM (P6) file\n", path);
        fclose(fp);
        return 2;
    }
    // Header order is fixed: width, height, maxval. Each helper call
    // prints its own diagnostic, so a truncated header is never silent.
    if (readPpmHeaderInt(fp, path, "width", &width) != 0 ||
        readPpmHeaderInt(fp, path, "height", &height) != 0 ||
        readPpmHeaderInt(fp, path, "maxval", &maxval) != 0) {
        fclose(fp);
        return 2;
    }
    if (width <= 0 || height <= 0) {
        fprintf(stderr, "joeysprite: %s has non-positive dimensions\n", path);
        fclose(fp);
        return 2;
    }
    if (maxval != 255) {
        fprintf(stderr, "joeysprite: %s maxval %d unsupported (must be 255)\n", path, maxval);
        fclose(fp);
        return 2;
    }

    // Guard width * height * 3 against size_t wrap-around before
    // trusting it as an allocation size.
    if ((size_t)width > SIZE_MAX / 3u / (size_t)height) {
        fprintf(stderr, "joeysprite: %s is too large (%dx%d)\n", path, width, height);
        fclose(fp);
        return 2;
    }
    pixelBytes = (size_t)width * (size_t)height * 3u;
    buf = (uint8_t *)malloc(pixelBytes);
    if (buf == NULL) {
        fprintf(stderr, "joeysprite: out of memory (%zu bytes)\n", pixelBytes);
        fclose(fp);
        return 2;
    }
    got = fread(buf, 1, pixelBytes, fp);
    fclose(fp);
    if (got != pixelBytes) {
        fprintf(stderr, "joeysprite: short raster in %s (got %zu, need %zu)\n",
                path, got, pixelBytes);
        free(buf);
        return 2;
    }

    *outWidth  = width;
    *outHeight = height;
    *outPixels = buf;
    return 0;
}
// End-to-end PPM -> tile-major 4bpp packed tile data.
//
//   path         PPM (P6) file to convert
//   widthTiles   in: 0 if --width-tiles was not given, else the user's
//                value; out: the tile count derived from the image
//   heightTiles  same contract as widthTiles, for the vertical axis
//   outTiles     receives the malloc'd tile buffer; caller frees it
//   outSize      receives the tile buffer size in bytes
//
// A non-zero user-supplied tile count must match the image. Derived
// tile counts are held to the same 255-per-axis ceiling that main
// applies to the command-line values, so an oversized image is
// rejected here instead of producing tile counts the rest of the
// tool was never asked to handle. Returns 0 on success; on failure
// prints a diagnostic, returns non-zero and leaves every
// out-parameter untouched.
static int loadPpmAsTiles(const char *path, long *widthTiles, long *heightTiles, uint8_t **outTiles, uint32_t *outSize) {
    enum { MAX_TILES_PER_AXIS = 255 };
    uint8_t  *rgb;
    uint8_t  *indices;
    uint8_t  *tiles;
    int       width;
    int       height;
    long      wTiles;
    long      hTiles;
    uint32_t  tileBytes;
    int       paletteCount;
    int       rc;

    rc = loadPpm(path, &width, &height, &rgb);
    if (rc != 0) {
        return rc;
    }
    if ((width % TILE_PIXELS) != 0 || (height % TILE_PIXELS) != 0) {
        fprintf(stderr,
                "joeysprite: %s is %dx%d -- both dimensions must be multiples of %d\n",
                path, width, height, TILE_PIXELS);
        free(rgb);
        return 2;
    }
    wTiles = width / TILE_PIXELS;
    hTiles = height / TILE_PIXELS;
    if (wTiles > MAX_TILES_PER_AXIS || hTiles > MAX_TILES_PER_AXIS) {
        fprintf(stderr,
                "joeysprite: %s is %ldx%ld tiles -- at most %d tiles per axis are supported\n",
                path, wTiles, hTiles, (int)MAX_TILES_PER_AXIS);
        free(rgb);
        return 2;
    }

    // A zero on entry means "derive from the image"; anything else is
    // a user assertion that has to agree with the image.
    if (*widthTiles != 0 && *widthTiles != wTiles) {
        fprintf(stderr,
                "joeysprite: --width-tiles %ld disagrees with image width %d (%ld tiles)\n",
                *widthTiles, width, wTiles);
        free(rgb);
        return 2;
    }
    if (*heightTiles != 0 && *heightTiles != hTiles) {
        fprintf(stderr,
                "joeysprite: --height-tiles %ld disagrees with image height %d (%ld tiles)\n",
                *heightTiles, height, hTiles);
        free(rgb);
        return 2;
    }

    indices = (uint8_t *)malloc((size_t)width * (size_t)height);
    if (indices == NULL) {
        fprintf(stderr, "joeysprite: out of memory for index buffer\n");
        free(rgb);
        return 2;
    }
    paletteCount = buildPalette(rgb, width, height, indices);
    free(rgb);
    if (paletteCount < 0) {
        fprintf(stderr,
                "joeysprite: %s has more than 16 distinct $0RGB colors after\n"
                " 4-bit-per-channel quantization. Reduce the input palette and\n"
                " retry (e.g. pngquant --nofs 16, or GIMP -> Image -> Mode ->\n"
                " Indexed... with 16 colors and no dithering).\n", path);
        free(indices);
        return 2;
    }

    tileBytes = (uint32_t)wTiles * (uint32_t)hTiles * TILE_BYTES;
    tiles = (uint8_t *)malloc(tileBytes);
    if (tiles == NULL) {
        fprintf(stderr, "joeysprite: out of memory for tile buffer\n");
        free(indices);
        return 2;
    }
    packIndicesToTiles(indices, width, height, tiles);
    free(indices);

    // Publish results only once every step has succeeded.
    *widthTiles  = wTiles;
    *heightTiles = hTiles;
    *outTiles    = tiles;
    *outSize     = tileBytes;
    return 0;
}
static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize) { static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize) {
FILE *fp; FILE *fp;
long fileSize; long fileSize;
@ -510,76 +236,6 @@ static int loadTileData(const char *path, uint8_t **outBytes, uint32_t *outSize)
} }
// Convert a row-major array of palette indices (one byte per pixel,
// `width` pixels per row) into tile-major 4bpp packed tile data, the
// layout the runtime SpriteT.tileData expects. Tile (tx,ty) occupies
// TILE_BYTES contiguous bytes starting at
// outTiles[(ty*widthTiles + tx) * TILE_BYTES]; inside a tile each
// pixel row is TILE_BYTES_PER_ROW packed bytes with the left pixel in
// the high nibble. Only the low 4 bits of every index are kept.
static void packIndicesToTiles(const uint8_t *indices, int width, int height, uint8_t *outTiles) {
    const uint8_t *rowPx;
    uint8_t       *tileBase;
    uint8_t       *dst;
    uint8_t        left;
    uint8_t        right;
    int            tilesAcross;
    int            tilesDown;
    int            tileX;
    int            tileY;
    int            line;
    int            pair;

    tilesAcross = width / TILE_PIXELS;
    tilesDown   = height / TILE_PIXELS;

    for (tileY = 0; tileY < tilesDown; tileY++) {
        for (tileX = 0; tileX < tilesAcross; tileX++) {
            tileBase = &outTiles[(tileY * tilesAcross + tileX) * TILE_BYTES];
            for (line = 0; line < TILE_PIXELS; line++) {
                // First pixel of this tile row inside the source image.
                rowPx = &indices[(tileY * TILE_PIXELS + line) * width + tileX * TILE_PIXELS];
                dst   = &tileBase[line * TILE_BYTES_PER_ROW];
                for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) {
                    left  = (uint8_t)(rowPx[pair * 2] & 0x0Fu);
                    right = (uint8_t)(rowPx[pair * 2 + 1] & 0x0Fu);
                    dst[pair] = (uint8_t)((left << 4) | right);
                }
            }
        }
    }
}
// Read one whitespace-separated token from a PPM header into `out`
// (NUL-terminated, truncated to outLen - 1 characters). Leading
// whitespace and `#` comments (through end of line) are skipped.
//
// The token ends at whitespace, `#` or EOF. A terminating whitespace
// character is consumed -- for the final header token that is the
// single delimiter in front of the raster. A terminating `#` is
// pushed back instead, so the next call sees it and skips the
// comment; without that, "8#note" would hand "note" back as the
// following token.
//
// Returns 0 when a token was read, -1 if EOF was hit before any
// token character (in which case `out` is not written).
static int parsePpmToken(FILE *fp, char *out, int outLen) {
    int c;
    int pos;

    // Skip whitespace and whole comments in front of the token.
    for (;;) {
        c = fgetc(fp);
        if (c == EOF) {
            return -1;
        }
        if (isspace(c)) {
            continue;
        }
        if (c != '#') {
            break;
        }
        do {
            c = fgetc(fp);
        } while (c != EOF && c != '\n');
    }

    pos = 0;
    while (c != EOF && !isspace(c) && c != '#') {
        if (pos < outLen - 1) {
            out[pos++] = (char)c;
        }
        c = fgetc(fp);
    }
    out[pos] = 0;

    if (c == '#') {
        // Leave the comment for the next call to skip.
        ungetc(c, fp);
    }
    return 0;
}
}
static TargetE parseTarget(const char *name) { static TargetE parseTarget(const char *name) {
if (strcmp(name, "iigs") == 0) return TARGET_IIGS; if (strcmp(name, "iigs") == 0) return TARGET_IIGS;
if (strcmp(name, "amiga") == 0) return TARGET_AMIGA; if (strcmp(name, "amiga") == 0) return TARGET_AMIGA;
@ -592,11 +248,8 @@ static TargetE parseTarget(const char *name) {
static int usage(const char *prog) { static int usage(const char *prog) {
fprintf(stderr, fprintf(stderr,
"usage: %s --target {iigs,amiga,atarist,dos} \\\n" "usage: %s --target {iigs,amiga,atarist,dos} \\\n"
" [--width-tiles N --height-tiles M] \\\n" " --width-tiles N --height-tiles M \\\n"
" INPUT OUTPUT.spr\n" " input.tiles output.spr\n", prog);
" INPUT is a PPM (P6) file (auto-derives tile dims from W/8, H/8)\n"
" or a raw .tiles byte stream (requires --width-tiles/--height-tiles).\n",
prog);
return 2; return 2;
} }
@ -648,11 +301,9 @@ int main(int argc, char **argv) {
return usage(argv[0]); return usage(argv[0]);
} }
} }
if (targetName == NULL || inPath == NULL || outPath == NULL) { if (targetName == NULL || widthTiles <= 0 || widthTiles > 255 ||
return usage(argv[0]); heightTiles <= 0 || heightTiles > 255 ||
} inPath == NULL || outPath == NULL) {
if (widthTiles < 0 || widthTiles > 255 ||
heightTiles < 0 || heightTiles > 255) {
return usage(argv[0]); return usage(argv[0]);
} }
@ -662,25 +313,10 @@ int main(int argc, char **argv) {
return usage(argv[0]); return usage(argv[0]);
} }
if (fileIsPpm(inPath)) {
// PPM path: tile dims auto-derive (or validate against CLI).
rc = loadPpmAsTiles(inPath, &widthTiles, &heightTiles, &tileBytes, &tileSize);
if (rc != 0) {
return rc;
}
} else {
// Raw .tiles path: tile dims required.
if (widthTiles <= 0 || heightTiles <= 0) {
fprintf(stderr,
"joeysprite: %s is not a PPM; --width-tiles and --height-tiles are required\n",
inPath);
return usage(argv[0]);
}
rc = loadTileData(inPath, &tileBytes, &tileSize); rc = loadTileData(inPath, &tileBytes, &tileSize);
if (rc != 0) { if (rc != 0) {
return rc; return rc;
} }
}
expectedTileSize = (uint32_t)(widthTiles * heightTiles * 32); expectedTileSize = (uint32_t)(widthTiles * heightTiles * 32);
if (tileSize != expectedTileSize) { if (tileSize != expectedTileSize) {