diff --git a/src/codegen/spriteEmitX86.c b/src/codegen/spriteEmitX86.c index 8d66228..b0c1bbf 100644 --- a/src/codegen/spriteEmitX86.c +++ b/src/codegen/spriteEmitX86.c @@ -7,11 +7,17 @@ // draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi // save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save // -// Per-byte emit (no run coalescing yet): +// Per-byte emit, with opaque-run coalescing for the draw path: // - all-transparent (both nibbles 0): skip, no instruction -// - all-opaque: mov byte [esi+col], imm8 (4 bytes encoded) // - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al // (3 + 2 + 2 + 3 = 10 bytes) +// - run of N consecutive fully-opaque bytes: emit largest chunks +// while N >= 4: mov dword [esi+col], imm32 (7 bytes, 1 store) +// if N >= 2: mov word [esi+col], imm16 (6 bytes, 1 store) +// if N == 1: mov byte [esi+col], imm8 (4 bytes, 1 store) +// A run of 4 opaque bytes is therefore one 7-byte store instead of +// four 4-byte stores (16 bytes / 4 stores). Unaligned access is +// fine on 386+. // Per row: // add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded) // Prologue: @@ -182,11 +188,17 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; + uint16_t runEnd; + uint16_t runLen; uint16_t heightPx; uint16_t spriteBytesPerRow; uint16_t destBytesPerRow; uint8_t value; uint8_t opaqueMask; + uint8_t v1; + uint8_t v2; + uint8_t v3; + uint8_t m; cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); @@ -198,7 +210,7 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x08; - // Body: per row, per dest byte. + // Body: per row, scan dest bytes coalescing fully-opaque runs. for (row = 0; row < heightPx; row++) { if (row > 0) { // add esi, SURFACE_BYTES_PER_ROW (32-bit imm) @@ -208,17 +220,14 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { out[cursor++] = 0x00; out[cursor++] = 0x00; } - for (col = 0; col < destBytesPerRow; col++) { + col = 0; + while (col < destBytesPerRow) { shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); if (opaqueMask == 0x00) { - continue; // both nibbles transparent + col++; + continue; } - if (opaqueMask == 0xFFu) { - // mov byte [esi+col], imm8 (C6 46 cc ii) - out[cursor++] = 0xC6; out[cursor++] = 0x46; - out[cursor++] = (uint8_t)(col & 0xFFu); - out[cursor++] = value; - } else { + if (opaqueMask != 0xFFu) { // Mixed: read-modify-write. // mov al, [esi+col] (8A 46 cc) // and al, ~opaqueMask (24 mm) @@ -232,6 +241,61 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { out[cursor++] = value; out[cursor++] = 0x88; out[cursor++] = 0x46; out[cursor++] = (uint8_t)(col & 0xFFu); + col++; + continue; + } + // Fully opaque at col -- find the end of the run. + runEnd = (uint16_t)(col + 1); + while (runEnd < destBytesPerRow) { + shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m); + if (m != 0xFFu) { + break; + } + runEnd++; + } + runLen = (uint16_t)(runEnd - col); + + // Emit dword stores while >= 4 bytes remain, then a word + // store if >= 2, then a single byte. shiftedByteAt is cheap + // enough that re-reading per chunk beats threading a + // fixed-size buffer through. + while (runLen >= 4) { + shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m); + shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m); + shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m); + // mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii) + out[cursor++] = 0xC7; out[cursor++] = 0x46; + out[cursor++] = (uint8_t)(col & 0xFFu); + out[cursor++] = value; + out[cursor++] = v1; + out[cursor++] = v2; + out[cursor++] = v3; + col = (uint16_t)(col + 4); + runLen = (uint16_t)(runLen - 4); + if (runLen > 0) { + shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); + } + } + if (runLen >= 2) { + shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m); + // mov word [esi+col], imm16 (66 C7 46 cc ii ii) + out[cursor++] = 0x66; + out[cursor++] = 0xC7; out[cursor++] = 0x46; + out[cursor++] = (uint8_t)(col & 0xFFu); + out[cursor++] = value; + out[cursor++] = v1; + col = (uint16_t)(col + 2); + runLen = (uint16_t)(runLen - 2); + if (runLen > 0) { + shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); + } + } + if (runLen == 1) { + // mov byte [esi+col], imm8 (C6 46 cc ii) + out[cursor++] = 0xC6; out[cursor++] = 0x46; + out[cursor++] = (uint8_t)(col & 0xFFu); + out[cursor++] = value; + col++; } } }