Sprite run coalescing added to x86.

This commit is contained in:
Scott Duensing 2026-04-26 21:54:04 -05:00
parent c4ee37941a
commit 6dd2266f13

View file

@ -7,11 +7,17 @@
// draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi // draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi
// save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save // save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save
// //
// Per-byte emit (no run coalescing yet): // Per-byte emit, with opaque-run coalescing for the draw path:
// - all-transparent (both nibbles 0): skip, no instruction // - all-transparent (both nibbles 0): skip, no instruction
// - all-opaque: mov byte [esi+col], imm8 (4 bytes encoded)
// - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al // - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
// (3 + 2 + 2 + 3 = 10 bytes) // (3 + 2 + 2 + 3 = 10 bytes)
// - run of N consecutive fully-opaque bytes: emit largest chunks
// while N >= 4: mov dword [esi+col], imm32 (7 bytes, 1 store)
// if N >= 2: mov word [esi+col], imm16 (6 bytes, 1 store)
// if N == 1: mov byte [esi+col], imm8 (4 bytes, 1 store)
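// A run of 4 opaque bytes is therefore one 7-byte store instead of
// four 4-byte stores (16 bytes / 4 stores). Coalesced stores land at
// arbitrary column offsets, so they are generally unaligned; 386+
// permits unaligned word/dword writes (possibly slower, but valid).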
// Per row: // Per row:
// add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded) // add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded)
// Prologue: // Prologue:
@ -182,11 +188,17 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
uint16_t cursor; uint16_t cursor;
uint16_t row; uint16_t row;
uint16_t col; uint16_t col;
uint16_t runEnd;
uint16_t runLen;
uint16_t heightPx; uint16_t heightPx;
uint16_t spriteBytesPerRow; uint16_t spriteBytesPerRow;
uint16_t destBytesPerRow; uint16_t destBytesPerRow;
uint8_t value; uint8_t value;
uint8_t opaqueMask; uint8_t opaqueMask;
uint8_t v1;
uint8_t v2;
uint8_t v3;
uint8_t m;
cursor = 0; cursor = 0;
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
@ -198,7 +210,7 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x8B; out[cursor++] = 0x74;
out[cursor++] = 0x24; out[cursor++] = 0x08; out[cursor++] = 0x24; out[cursor++] = 0x08;
// Body: per row, per dest byte. // Body: per row, scan dest bytes coalescing fully-opaque runs.
for (row = 0; row < heightPx; row++) { for (row = 0; row < heightPx; row++) {
if (row > 0) { if (row > 0) {
// add esi, SURFACE_BYTES_PER_ROW (32-bit imm) // add esi, SURFACE_BYTES_PER_ROW (32-bit imm)
@ -208,17 +220,14 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = 0x00; out[cursor++] = 0x00;
out[cursor++] = 0x00; out[cursor++] = 0x00;
} }
for (col = 0; col < destBytesPerRow; col++) { col = 0;
while (col < destBytesPerRow) {
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
if (opaqueMask == 0x00) { if (opaqueMask == 0x00) {
continue; // both nibbles transparent col++;
continue;
} }
if (opaqueMask == 0xFFu) { if (opaqueMask != 0xFFu) {
// mov byte [esi+col], imm8 (C6 46 cc ii)
out[cursor++] = 0xC6; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
} else {
// Mixed: read-modify-write. // Mixed: read-modify-write.
// mov al, [esi+col] (8A 46 cc) // mov al, [esi+col] (8A 46 cc)
// and al, ~opaqueMask (24 mm) // and al, ~opaqueMask (24 mm)
@ -232,6 +241,61 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = value; out[cursor++] = value;
out[cursor++] = 0x88; out[cursor++] = 0x46; out[cursor++] = 0x88; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu); out[cursor++] = (uint8_t)(col & 0xFFu);
col++;
continue;
}
// Fully opaque at col -- find the end of the run.
runEnd = (uint16_t)(col + 1);
while (runEnd < destBytesPerRow) {
shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m);
if (m != 0xFFu) {
break;
}
runEnd++;
}
runLen = (uint16_t)(runEnd - col);
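// Emit dword stores while >= 4 bytes remain, then at most one word
// store (2-3 bytes left), then at most one byte store. `value`
// always holds the byte at `col`; the bytes after it are re-read
// with shiftedByteAt per chunk, which is cheap enough that it beats
// threading a fixed-size buffer through.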
while (runLen >= 4) {
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m);
shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m);
// mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii)
out[cursor++] = 0xC7; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
out[cursor++] = v1;
out[cursor++] = v2;
out[cursor++] = v3;
col = (uint16_t)(col + 4);
runLen = (uint16_t)(runLen - 4);
if (runLen > 0) {
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
}
}
if (runLen >= 2) {
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
// mov word [esi+col], imm16 (66 C7 46 cc ii ii)
out[cursor++] = 0x66;
out[cursor++] = 0xC7; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
out[cursor++] = v1;
col = (uint16_t)(col + 2);
runLen = (uint16_t)(runLen - 2);
if (runLen > 0) {
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
}
}
if (runLen == 1) {
// mov byte [esi+col], imm8 (C6 46 cc ii)
out[cursor++] = 0xC6; out[cursor++] = 0x46;
out[cursor++] = (uint8_t)(col & 0xFFu);
out[cursor++] = value;
col++;
} }
} }
} }