joeylib2/src/codegen/spriteEmitX86.c

369 lines
14 KiB
C

// x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
// draw / save / restore routines that read or write 4bpp packed
// surface bytes via [esi+disp8] chains. The C side calls them
// through a function pointer cast.
//
// Calling convention:
// draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi
// save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save
//
// Per-byte emit, with opaque-run coalescing for the draw path:
// - all-transparent (both nibbles 0): skip, no instruction
// - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
// (3 + 2 + 2 + 3 = 10 bytes)
// - run of N consecutive fully-opaque bytes: emit largest chunks
// while N >= 4: mov dword [esi+col], imm32 (7 bytes, 1 store)
// if N >= 2: mov word [esi+col], imm16 (6 bytes, 1 store)
// if N == 1: mov byte [esi+col], imm8 (4 bytes, 1 store)
// A run of 4 opaque bytes is therefore one 7-byte store instead of
// four 4-byte stores (16 bytes / 4 stores). Unaligned access is
// fine on 386+.
// Per row:
// add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded)
// Prologue:
// push esi; mov esi, [esp+8] (1 + 4 = 5 bytes)
// Epilogue:
// pop esi; ret (1 + 1 = 2 bytes)
#include "joey/sprite.h"
#include "joey/surface.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
// ----- Constants -----
#define TILE_PIXELS 8
#define TILE_BYTES 32
#define TILE_BYTES_PER_ROW 4
#define TRANSPARENT_NIBBLE 0
// Worst-case bytes per emitted routine, used to size the scratch
// buffer. A 32x32 sprite is 32 rows * (16 dest bytes + 1 for shift1)
// = 544 dest-byte slots, each up to 10 bytes mixed = 5440; plus
// per-row `add esi` advances 32*6=192; plus prologue/epilogue 8.
// Round up generously.
#define MAX_ROUTINE_BYTES 8192
// ----- Prototypes -----
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
// ----- Emit helpers (alphabetical) -----
// Emits the copy sequence shared by the SAVE and RESTORE routines,
// fully unrolled: one group of instructions per sprite row.
//
// Each row copies copyBytes bytes from [esi] to [edi]: the whole
// dwords with `mov ecx, n; rep movsd`, then the 0..3 leftover bytes
// with `mov ecx, n; rep movsb`. rep movs* leaves both index registers
// just past the bytes copied, so the contiguous side (backup) needs
// no adjustment; the strided side (screen) gets
// `add reg, SURFACE_BYTES_PER_ROW - copyBytes` to reach the next
// scanline. No add is emitted after the final row.
//
//   out         - code buffer being filled (no bounds checking)
//   cursor      - index in out at which to start writing
//   heightPx    - number of rows to emit
//   copyBytes   - bytes copied per row
//   strideOnSrc - true:  the add targets esi (SAVE, source is screen)
//                 false: the add targets edi (RESTORE, dest is screen)
//
// Returns the index one past the last byte written.
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
    const uint16_t dwordCount = (uint16_t)(copyBytes / 4u);
    const uint16_t byteCount  = (uint16_t)(copyBytes % 4u);
    const int32_t  rowGap     = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes;
    // ModRM for `add r32, imm32` (opcode 81 /0): C6 = esi, C7 = edi.
    const uint8_t  addModRm   = strideOnSrc ? (uint8_t)0xC6u : (uint8_t)0xC7u;
    uint16_t       rowsLeft;
    int            bitPos;

    for (rowsLeft = heightPx; rowsLeft > 0; rowsLeft--) {
        if (dwordCount != 0) {
            out[cursor++] = 0xB9;                                   // mov ecx, imm32
            out[cursor++] = (uint8_t)(dwordCount & 0xFFu);
            out[cursor++] = (uint8_t)((dwordCount >> 8) & 0xFFu);
            out[cursor++] = 0x00;
            out[cursor++] = 0x00;
            out[cursor++] = 0xF3;                                   // rep
            out[cursor++] = 0xA5;                                   // movsd
        }

        if (byteCount != 0) {
            out[cursor++] = 0xB9;                                   // mov ecx, imm32
            out[cursor++] = (uint8_t)(byteCount & 0xFFu);
            out[cursor++] = 0x00;
            out[cursor++] = 0x00;
            out[cursor++] = 0x00;
            out[cursor++] = 0xF3;                                   // rep
            out[cursor++] = 0xA4;                                   // movsb
        }

        // Every row but the last: step the strided pointer to the
        // start of the next scanline.
        if (rowsLeft > 1) {
            out[cursor++] = 0x81;                                   // add r32, imm32
            out[cursor++] = addModRm;
            for (bitPos = 0; bitPos < 32; bitPos += 8) {            // imm32, little-endian
                out[cursor++] = (uint8_t)((rowGap >> bitPos) & 0xFFu);
            }
        }
    }

    return cursor;
}
// Computes what the sprite contributes to destination byte `col` of
// row `row` for a given horizontal shift.
//
// shift == 0: destination byte col is sprite byte col, nibble for
//             nibble.
// otherwise : the sprite sits one nibble further right, so the
//             destination HIGH nibble comes from the LOW nibble of
//             sprite byte col-1 and the destination LOW nibble comes
//             from the HIGH nibble of sprite byte col.
// Sprite bytes outside 0..spriteBytesPerRow-1 contribute nothing.
//
// *outValue      receives the opaque nibbles in their destination
//                positions; transparent nibbles are 0.
// *outOpaqueMask receives 0xF0 and/or 0x0F for each opaque nibble,
//                so 0x00 = nothing to draw and 0xFF = fully opaque.
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask) {
    uint8_t destHi = TRANSPARENT_NIBBLE;   // nibble landing in bits 7..4
    uint8_t destLo = TRANSPARENT_NIBBLE;   // nibble landing in bits 3..0
    uint8_t value  = 0;
    uint8_t mask   = 0;
    uint8_t src;

    if (shift == 0) {
        if (col < spriteBytesPerRow) {
            src    = spriteSourceByte(sp, row, col);
            destHi = (uint8_t)((src >> 4) & 0x0Fu);
            destLo = (uint8_t)(src & 0x0Fu);
        }
    } else {
        // Left neighbour supplies the high destination nibble.
        if ((col >= 1) && ((uint16_t)(col - 1) < spriteBytesPerRow)) {
            src    = spriteSourceByte(sp, row, (uint16_t)(col - 1));
            destHi = (uint8_t)(src & 0x0Fu);
        }
        // Same-column sprite byte supplies the low destination nibble.
        if (col < spriteBytesPerRow) {
            src    = spriteSourceByte(sp, row, col);
            destLo = (uint8_t)((src >> 4) & 0x0Fu);
        }
    }

    if (destHi != TRANSPARENT_NIBBLE) {
        value |= (uint8_t)(destHi << 4);
        mask  |= 0xF0u;
    }
    if (destLo != TRANSPARENT_NIBBLE) {
        value |= destLo;
        mask  |= 0x0Fu;
    }

    *outValue      = value;
    *outOpaqueMask = mask;
}
// Fetches one byte of sprite pixel data at (row, col), where row is a
// pixel row and col is a byte column (0..spriteBytesPerRow-1).
//
// As indexed here, sp->tileData holds whole tiles of TILE_BYTES each,
// ordered row-major by tile with sp->widthTiles tiles per tile row;
// inside a tile, pixel rows are TILE_BYTES_PER_ROW bytes apart.
// No bounds checking is performed.
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col) {
    const uint16_t tileCol   = (uint16_t)(col / TILE_BYTES_PER_ROW);
    const uint16_t tileRow   = (uint16_t)(row / TILE_PIXELS);
    const uint16_t byteInRow = (uint16_t)(col & (TILE_BYTES_PER_ROW - 1));
    const uint16_t rowInTile = (uint16_t)(row & (TILE_PIXELS - 1));
    const uint8_t *tileBase  = sp->tileData + ((uint32_t)(tileRow * sp->widthTiles + tileCol)) * TILE_BYTES;

    return tileBase[rowInTile * TILE_BYTES_PER_ROW + byteInRow];
}
// Emit a draw routine for one shift variant. Returns bytes written,
// or 0 if no routine was emitted.
// Routine signature: void f(uint8_t *dst).
//
//   out   - buffer receiving the machine code, written from index 0.
//           No bounds checking is performed on it.
//   sp    - sprite whose pixel data is baked into the routine as
//           immediates.
//   shift - 0: sprite bytes map 1:1 onto destination bytes.
//           1: sprite is moved right by one nibble, which makes each
//              row one destination byte wider.
//
// Returns 0 without writing anything when shift > 1, or when a row is
// wider than 128 destination bytes: every access is encoded as
// [esi+disp8], and disp8 is sign-extended, so a column of 128 or more
// would address memory *before* dst.
//
// Emitted code per destination byte:
//   - both nibbles transparent: nothing
//   - one nibble opaque: mov al,[esi+c]; and al,~mask; or al,val;
//                        mov [esi+c],al
//   - run of fully opaque bytes: immediate stores, largest first
//     (dword while >= 4 remain, then a word if >= 2, then a byte)
// Between rows: add esi, SURFACE_BYTES_PER_ROW.
uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t runEnd;
    uint16_t runLen;
    uint16_t chunk;
    uint16_t i;
    uint16_t heightPx;
    uint16_t spriteBytesPerRow;
    uint16_t destBytesPerRow;
    uint8_t  value;
    uint8_t  opaqueMask;

    if (shift > 1u) {
        return 0u;
    }

    heightPx          = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
    destBytesPerRow   = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));

    // Largest column is destBytesPerRow-1 and must fit in a
    // non-negative signed 8-bit displacement (0..127).
    if (destBytesPerRow > 128u) {
        return 0u;
    }

    cursor = 0;

    // Prologue: push esi; mov esi, [esp+8]
    out[cursor++] = 0x56;                       // push esi
    out[cursor++] = 0x8B; out[cursor++] = 0x74; // mov esi, [esp+8]
    out[cursor++] = 0x24; out[cursor++] = 0x08;

    // Body: per row, scan dest bytes coalescing fully-opaque runs.
    for (row = 0; row < heightPx; row++) {
        if (row > 0) {
            // add esi, SURFACE_BYTES_PER_ROW (81 C6 imm32)
            out[cursor++] = 0x81; out[cursor++] = 0xC6;
            out[cursor++] = (uint8_t)(SURFACE_BYTES_PER_ROW & 0xFFu);
            out[cursor++] = (uint8_t)((SURFACE_BYTES_PER_ROW >> 8) & 0xFFu);
            out[cursor++] = 0x00;
            out[cursor++] = 0x00;
        }

        col = 0;
        while (col < destBytesPerRow) {
            shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);

            if (opaqueMask == 0x00) {
                // Fully transparent: leave the destination untouched.
                col++;
                continue;
            }

            if (opaqueMask != 0xFFu) {
                // Mixed: read-modify-write.
                //   mov al, [esi+col]   (8A 46 cc)
                //   and al, ~opaqueMask (24 mm)
                //   or  al, value       (0C vv)
                //   mov [esi+col], al   (88 46 cc)
                out[cursor++] = 0x8A; out[cursor++] = 0x46;
                out[cursor++] = (uint8_t)(col & 0xFFu);
                out[cursor++] = 0x24;
                out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu);
                out[cursor++] = 0x0C;
                out[cursor++] = value;
                out[cursor++] = 0x88; out[cursor++] = 0x46;
                out[cursor++] = (uint8_t)(col & 0xFFu);
                col++;
                continue;
            }

            // Fully opaque at col -- find the end of the run. value
            // and opaqueMask are only scratch here; the emit loop
            // below re-reads every byte it stores.
            runEnd = (uint16_t)(col + 1);
            while (runEnd < destBytesPerRow) {
                shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &value, &opaqueMask);
                if (opaqueMask != 0xFFu) {
                    break;
                }
                runEnd++;
            }
            runLen = (uint16_t)(runEnd - col);

            // Store the run with the widest immediates available:
            //   4 bytes: mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii)
            //   2 bytes: mov word  [esi+col], imm16 (66 C7 46 cc ii ii)
            //   1 byte : mov byte  [esi+col], imm8  (C6 46 cc ii)
            // shiftedByteAt is cheap enough that re-reading per chunk
            // beats threading a fixed-size buffer through.
            while (runLen > 0) {
                if (runLen >= 4) {
                    chunk = 4;
                } else if (runLen >= 2) {
                    chunk = 2;
                } else {
                    chunk = 1;
                }

                if (chunk == 2) {
                    out[cursor++] = 0x66;       // operand-size prefix -> 16-bit store
                }
                out[cursor++] = (uint8_t)(chunk == 1 ? 0xC6u : 0xC7u);
                out[cursor++] = 0x46;           // ModRM: [esi+disp8], /0
                out[cursor++] = (uint8_t)(col & 0xFFu);

                // Immediate is little-endian: lowest column first.
                for (i = 0; i < chunk; i++) {
                    shiftedByteAt(sp, row, (uint16_t)(col + i), shift, spriteBytesPerRow, &value, &opaqueMask);
                    out[cursor++] = value;
                }

                col    = (uint16_t)(col + chunk);
                runLen = (uint16_t)(runLen - chunk);
            }
        }
    }

    // Epilogue: pop esi; ret
    out[cursor++] = 0x5E;
    out[cursor++] = 0xC3;
    return cursor;
}
// RESTORE: emits a routine that copies backup -> screen, so the
// destination is the side with the screen stride.
// Routine signature: void f(const uint8_t *src, uint8_t *dst).
//
//   out   - buffer receiving the machine code, written from index 0
//           with no bounds checking.
//   sp    - sprite supplying the size: heightTiles * TILE_PIXELS rows
//           of widthTiles * TILE_BYTES_PER_ROW bytes, plus one extra
//           byte per row when shift == 1.
//   shift - 0 or 1.
//
// Returns the number of bytes written, or 0 if shift > 1.
uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    static const uint8_t prologue[] = {
        0x56,                       // push esi
        0x57,                       // push edi
        0x8B, 0x74, 0x24, 0x0C,     // mov esi, [esp+12]
        0x8B, 0x7C, 0x24, 0x10      // mov edi, [esp+16]
    };
    static const uint8_t epilogue[] = {
        0x5F,                       // pop edi
        0x5E,                       // pop esi
        0xC3                        // ret
    };
    uint16_t cursor = 0;
    uint16_t rows;
    uint16_t rowBytes;
    uint16_t i;

    if (shift > 1u) {
        return 0u;
    }

    rows     = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    rowBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));

    for (i = 0; i < (uint16_t)sizeof(prologue); i++) {
        out[cursor++] = prologue[i];
    }

    // strideOnSrc = false: edi (screen) gets the per-row stride add.
    cursor = emitCopyBodyX86(out, cursor, rows, rowBytes, false);

    for (i = 0; i < (uint16_t)sizeof(epilogue); i++) {
        out[cursor++] = epilogue[i];
    }
    return cursor;
}
// SAVE: emits a routine that copies screen -> backup, so the source
// is the side with the screen stride.
// Routine signature: void f(const uint8_t *src, uint8_t *dst).
//
//   out   - buffer receiving the machine code, written from index 0
//           with no bounds checking.
//   sp    - sprite supplying the size: heightTiles * TILE_PIXELS rows
//           of widthTiles * TILE_BYTES_PER_ROW bytes, plus one extra
//           byte per row when shift == 1.
//   shift - 0 or 1.
//
// Returns the number of bytes written, or 0 if shift > 1.
uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    static const uint8_t prologue[] = {
        0x56,                       // push esi
        0x57,                       // push edi
        0x8B, 0x74, 0x24, 0x0C,     // mov esi, [esp+12]
        0x8B, 0x7C, 0x24, 0x10      // mov edi, [esp+16]
    };
    static const uint8_t epilogue[] = {
        0x5F,                       // pop edi
        0x5E,                       // pop esi
        0xC3                        // ret
    };
    uint16_t cursor = 0;
    uint16_t rows;
    uint16_t rowBytes;
    uint16_t i;

    if (shift > 1u) {
        return 0u;
    }

    rows     = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    rowBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));

    for (i = 0; i < (uint16_t)sizeof(prologue); i++) {
        out[cursor++] = prologue[i];
    }

    // strideOnSrc = true: esi (screen) gets the per-row stride add.
    cursor = emitCopyBodyX86(out, cursor, rows, rowBytes, true);

    for (i = 0; i < (uint16_t)sizeof(epilogue); i++) {
        out[cursor++] = epilogue[i];
    }
    return cursor;
}