369 lines
14 KiB
C
369 lines
14 KiB
C
// x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
// draw / save / restore routines for 4bpp packed surface bytes. Draw
// routines read and write the surface through [esi+disp8] chains;
// save / restore routines copy whole rows with rep movsd / rep movsb.
// The C side calls them through a function pointer cast.
//
// Calling convention:
//   draw(uint8_t *dst)
//       -- arg in [esp+8] after prologue saves esi
//   save/restore(const uint8_t *src, uint8_t *dst)
//       -- args in [esp+12]/[esp+16] after esi+edi save
//
// Per-byte emit, with opaque-run coalescing for the draw path:
//   - all-transparent (both nibbles 0): skip, no instruction
//   - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
//            (3 + 2 + 2 + 3 = 10 bytes)
//   - run of N consecutive fully-opaque bytes: emit largest chunks
//       while N >= 4: mov dword [esi+col], imm32   (7 bytes, 1 store)
//       if    N >= 2: mov word  [esi+col], imm16   (6 bytes, 1 store)
//       if    N == 1: mov byte  [esi+col], imm8    (4 bytes, 1 store)
//     A run of 4 opaque bytes is therefore one 7-byte store instead of
//     four 4-byte stores (16 bytes / 4 stores). Unaligned access is
//     fine on 386+.
// Per row (draw routine, every row after the first):
//   add esi, SURFACE_BYTES_PER_ROW                 (6 bytes encoded)
// Prologue (draw routine):
//   push esi; mov esi, [esp+8]                     (1 + 4 = 5 bytes)
// Epilogue (draw routine):
//   pop esi; ret                                   (1 + 1 = 2 bytes)
|
|
|
|
#include "joey/sprite.h"
|
|
#include "joey/surface.h"
|
|
#include "spriteEmitter.h"
|
|
#include "spriteInternal.h"
|
|
|
|
|
|
// ----- Constants -----

#define TILE_PIXELS 8           // tile edge in pixels; also rows per tile
#define TILE_BYTES 32           // bytes per tile: TILE_PIXELS rows * TILE_BYTES_PER_ROW
#define TILE_BYTES_PER_ROW 4    // bytes per tile row: 8 pixels at 4bpp
#define TRANSPARENT_NIBBLE 0    // nibble value that is treated as transparent

// Worst-case bytes per emitted routine, used to size the scratch
// buffer. A 32x32 sprite is 32 rows * (16 dest bytes + 1 for shift1)
// = 544 dest-byte slots, each up to 10 bytes mixed = 5440; plus one
// 6-byte "add esi" between rows, 31*6=186; plus prologue/epilogue 7.
// That is 5633 bytes, so 8192 rounds up generously.
#define MAX_ROUTINE_BYTES 8192
|
|
|
|
|
|
// ----- Prototypes -----
|
|
|
|
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc);
|
|
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
|
|
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
|
|
|
|
|
|
// ----- Emit helpers (alphabetical) -----
|
|
|
|
// Shared body for save/restore. Walks heightPx rows of copyBytes
|
|
// using rep movsd for the dword-aligned bulk and rep movsb for the
|
|
// byte tail. After each row except the last, advances either esi
|
|
// or edi by (SURFACE_BYTES_PER_ROW - copyBytes) so the strided side
|
|
// (screen) lines up with the next scanline; the contiguous side
|
|
// (backup) advances naturally because rep movs* leaves the index
|
|
// register one past the last byte copied.
|
|
//
|
|
// strideOnSrc=true -> source has the screen stride (SAVE)
|
|
// strideOnSrc=false -> destination has the screen stride (RESTORE)
|
|
static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) {
|
|
uint16_t row;
|
|
uint16_t dwords;
|
|
uint16_t tail;
|
|
int32_t advance;
|
|
|
|
dwords = (uint16_t)(copyBytes >> 2);
|
|
tail = (uint16_t)(copyBytes & 0x3u);
|
|
advance = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes;
|
|
|
|
for (row = 0; row < heightPx; row++) {
|
|
if (dwords > 0) {
|
|
// mov ecx, dwords (B9 imm32); rep movsd (F3 A5)
|
|
out[cursor++] = 0xB9;
|
|
out[cursor++] = (uint8_t)(dwords & 0xFFu);
|
|
out[cursor++] = (uint8_t)((dwords >> 8) & 0xFFu);
|
|
out[cursor++] = 0;
|
|
out[cursor++] = 0;
|
|
out[cursor++] = 0xF3;
|
|
out[cursor++] = 0xA5;
|
|
}
|
|
if (tail > 0) {
|
|
// mov ecx, tail (B9 imm32); rep movsb (F3 A4)
|
|
out[cursor++] = 0xB9;
|
|
out[cursor++] = (uint8_t)(tail & 0xFFu);
|
|
out[cursor++] = 0;
|
|
out[cursor++] = 0;
|
|
out[cursor++] = 0;
|
|
out[cursor++] = 0xF3;
|
|
out[cursor++] = 0xA4;
|
|
}
|
|
if (row + 1u < heightPx) {
|
|
// SAVE: add esi, advance (81 C6 imm32)
|
|
// RESTORE: add edi, advance (81 C7 imm32)
|
|
out[cursor++] = 0x81;
|
|
out[cursor++] = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u);
|
|
out[cursor++] = (uint8_t)(advance & 0xFFu);
|
|
out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu);
|
|
out[cursor++] = (uint8_t)((advance >> 16) & 0xFFu);
|
|
out[cursor++] = (uint8_t)((advance >> 24) & 0xFFu);
|
|
}
|
|
}
|
|
return cursor;
|
|
}
|
|
|
|
|
|
// Decompose a destination byte's contribution from the sprite into
|
|
// (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble
|
|
// 0xF0 means high dest nibble is opaque; 0x0F means low is opaque;
|
|
// 0x00 means both transparent. value's transparent nibbles are 0.
|
|
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask) {
|
|
uint8_t srcByte;
|
|
uint8_t hi;
|
|
uint8_t lo;
|
|
bool hasLeft;
|
|
bool hasRight;
|
|
|
|
*outValue = 0;
|
|
*outOpaqueMask = 0;
|
|
|
|
if (shift == 0) {
|
|
if (col >= spriteBytesPerRow) {
|
|
return;
|
|
}
|
|
srcByte = spriteSourceByte(sp, row, col);
|
|
hi = (uint8_t)((srcByte >> 4) & 0x0Fu);
|
|
lo = (uint8_t)(srcByte & 0x0Fu);
|
|
if (hi != TRANSPARENT_NIBBLE) {
|
|
*outValue |= (uint8_t)(hi << 4);
|
|
*outOpaqueMask |= 0xF0u;
|
|
}
|
|
if (lo != TRANSPARENT_NIBBLE) {
|
|
*outValue |= lo;
|
|
*outOpaqueMask |= 0x0Fu;
|
|
}
|
|
return;
|
|
}
|
|
// shift = 1
|
|
hasLeft = (col >= 1) && ((uint16_t)(col - 1) < spriteBytesPerRow);
|
|
hasRight = (col < spriteBytesPerRow);
|
|
if (hasLeft) {
|
|
srcByte = spriteSourceByte(sp, row, (uint16_t)(col - 1));
|
|
hi = (uint8_t)(srcByte & 0x0Fu); // sprite byte's LOW nibble
|
|
if (hi != TRANSPARENT_NIBBLE) {
|
|
*outValue |= (uint8_t)(hi << 4);
|
|
*outOpaqueMask |= 0xF0u;
|
|
}
|
|
}
|
|
if (hasRight) {
|
|
srcByte = spriteSourceByte(sp, row, col);
|
|
lo = (uint8_t)((srcByte >> 4) & 0x0Fu); // sprite byte's HIGH nibble
|
|
if (lo != TRANSPARENT_NIBBLE) {
|
|
*outValue |= lo;
|
|
*outOpaqueMask |= 0x0Fu;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Sample a sprite tile-data byte at (row, col) where col is in
|
|
// sprite-byte coordinates (0..spriteBytesPerRow-1).
|
|
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col) {
|
|
uint16_t tileX;
|
|
uint16_t tileY;
|
|
uint16_t inTileX;
|
|
uint16_t inTileY;
|
|
const uint8_t *tile;
|
|
|
|
tileX = (uint16_t)(col / TILE_BYTES_PER_ROW);
|
|
tileY = (uint16_t)(row / TILE_PIXELS);
|
|
inTileX = (uint16_t)(col & (TILE_BYTES_PER_ROW - 1));
|
|
inTileY = (uint16_t)(row & (TILE_PIXELS - 1));
|
|
tile = sp->tileData + ((uint32_t)(tileY * sp->widthTiles + tileX)) * TILE_BYTES;
|
|
return tile[inTileY * TILE_BYTES_PER_ROW + inTileX];
|
|
}
|
|
|
|
|
|
// Emit a draw routine for one shift variant. Returns bytes written.
|
|
// Routine signature: void f(uint8_t *dst).
|
|
uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|
uint16_t cursor;
|
|
uint16_t row;
|
|
uint16_t col;
|
|
uint16_t runEnd;
|
|
uint16_t runLen;
|
|
uint16_t heightPx;
|
|
uint16_t spriteBytesPerRow;
|
|
uint16_t destBytesPerRow;
|
|
uint8_t value;
|
|
uint8_t opaqueMask;
|
|
uint8_t v1;
|
|
uint8_t v2;
|
|
uint8_t v3;
|
|
uint8_t m;
|
|
|
|
if (shift > 1u) {
|
|
return 0u;
|
|
}
|
|
|
|
cursor = 0;
|
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
|
spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW);
|
|
destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0));
|
|
|
|
// Prologue: push esi; mov esi, [esp+8]
|
|
out[cursor++] = 0x56; // push esi
|
|
out[cursor++] = 0x8B; out[cursor++] = 0x74;
|
|
out[cursor++] = 0x24; out[cursor++] = 0x08;
|
|
|
|
// Body: per row, scan dest bytes coalescing fully-opaque runs.
|
|
for (row = 0; row < heightPx; row++) {
|
|
if (row > 0) {
|
|
// add esi, SURFACE_BYTES_PER_ROW (32-bit imm)
|
|
out[cursor++] = 0x81; out[cursor++] = 0xC6;
|
|
out[cursor++] = (uint8_t)(SURFACE_BYTES_PER_ROW & 0xFFu);
|
|
out[cursor++] = (uint8_t)((SURFACE_BYTES_PER_ROW >> 8) & 0xFFu);
|
|
out[cursor++] = 0x00;
|
|
out[cursor++] = 0x00;
|
|
}
|
|
col = 0;
|
|
while (col < destBytesPerRow) {
|
|
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
|
|
if (opaqueMask == 0x00) {
|
|
col++;
|
|
continue;
|
|
}
|
|
if (opaqueMask != 0xFFu) {
|
|
// Mixed: read-modify-write.
|
|
// mov al, [esi+col] (8A 46 cc)
|
|
// and al, ~opaqueMask (24 mm)
|
|
// or al, value (0C vv)
|
|
// mov [esi+col], al (88 46 cc)
|
|
out[cursor++] = 0x8A; out[cursor++] = 0x46;
|
|
out[cursor++] = (uint8_t)(col & 0xFFu);
|
|
out[cursor++] = 0x24;
|
|
out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu);
|
|
out[cursor++] = 0x0C;
|
|
out[cursor++] = value;
|
|
out[cursor++] = 0x88; out[cursor++] = 0x46;
|
|
out[cursor++] = (uint8_t)(col & 0xFFu);
|
|
col++;
|
|
continue;
|
|
}
|
|
// Fully opaque at col -- find the end of the run.
|
|
runEnd = (uint16_t)(col + 1);
|
|
while (runEnd < destBytesPerRow) {
|
|
shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m);
|
|
if (m != 0xFFu) {
|
|
break;
|
|
}
|
|
runEnd++;
|
|
}
|
|
runLen = (uint16_t)(runEnd - col);
|
|
|
|
// Emit dword stores while >= 4 bytes remain, then a word
|
|
// store if >= 2, then a single byte. shiftedByteAt is cheap
|
|
// enough that re-reading per chunk beats threading a
|
|
// fixed-size buffer through.
|
|
while (runLen >= 4) {
|
|
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
|
|
shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m);
|
|
shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m);
|
|
// mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii)
|
|
out[cursor++] = 0xC7; out[cursor++] = 0x46;
|
|
out[cursor++] = (uint8_t)(col & 0xFFu);
|
|
out[cursor++] = value;
|
|
out[cursor++] = v1;
|
|
out[cursor++] = v2;
|
|
out[cursor++] = v3;
|
|
col = (uint16_t)(col + 4);
|
|
runLen = (uint16_t)(runLen - 4);
|
|
if (runLen > 0) {
|
|
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
|
|
}
|
|
}
|
|
if (runLen >= 2) {
|
|
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
|
|
// mov word [esi+col], imm16 (66 C7 46 cc ii ii)
|
|
out[cursor++] = 0x66;
|
|
out[cursor++] = 0xC7; out[cursor++] = 0x46;
|
|
out[cursor++] = (uint8_t)(col & 0xFFu);
|
|
out[cursor++] = value;
|
|
out[cursor++] = v1;
|
|
col = (uint16_t)(col + 2);
|
|
runLen = (uint16_t)(runLen - 2);
|
|
if (runLen > 0) {
|
|
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
|
|
}
|
|
}
|
|
if (runLen == 1) {
|
|
// mov byte [esi+col], imm8 (C6 46 cc ii)
|
|
out[cursor++] = 0xC6; out[cursor++] = 0x46;
|
|
out[cursor++] = (uint8_t)(col & 0xFFu);
|
|
out[cursor++] = value;
|
|
col++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Epilogue: pop esi; ret
|
|
out[cursor++] = 0x5E;
|
|
out[cursor++] = 0xC3;
|
|
return cursor;
|
|
}
|
|
|
|
|
|
// RESTORE: copy backup -> screen. Destination has the screen stride.
|
|
uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|
uint16_t cursor;
|
|
uint16_t heightPx;
|
|
uint16_t copyBytes;
|
|
|
|
if (shift > 1u) {
|
|
return 0u;
|
|
}
|
|
|
|
cursor = 0;
|
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
|
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
|
|
|
|
// Prologue: push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16]
|
|
out[cursor++] = 0x56;
|
|
out[cursor++] = 0x57;
|
|
out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
|
|
out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
|
|
|
|
cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, false);
|
|
|
|
// Epilogue: pop edi; pop esi; ret
|
|
out[cursor++] = 0x5F;
|
|
out[cursor++] = 0x5E;
|
|
out[cursor++] = 0xC3;
|
|
return cursor;
|
|
}
|
|
|
|
|
|
// SAVE: copy screen -> backup. Source has the screen stride.
|
|
uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|
uint16_t cursor;
|
|
uint16_t heightPx;
|
|
uint16_t copyBytes;
|
|
|
|
if (shift > 1u) {
|
|
return 0u;
|
|
}
|
|
|
|
cursor = 0;
|
|
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
|
copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u));
|
|
|
|
out[cursor++] = 0x56;
|
|
out[cursor++] = 0x57;
|
|
out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C;
|
|
out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10;
|
|
|
|
cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, true);
|
|
|
|
out[cursor++] = 0x5F;
|
|
out[cursor++] = 0x5E;
|
|
out[cursor++] = 0xC3;
|
|
return cursor;
|
|
}
|