Sprite opaque-run coalescing added to the x86 draw emitter.

2026-04-26 21:54:04 -05:00 · 2026-04-26 21:54:04 -05:00 · 6dd2266f13
commit 6dd2266f13
parent c4ee37941a
1 changed file with 75 additions and 11 deletions
--- a/src/codegen/spriteEmitX86.c
+++ b/src/codegen/spriteEmitX86.c
@@ -7,11 +7,17 @@
 //   draw(uint8_t *dst)                              -- arg in [esp+8] after prologue saves esi
 //   save/restore(const uint8_t *src, uint8_t *dst)  -- args in [esp+12]/[esp+16] after esi+edi save
 //
-// Per-byte emit (no run coalescing yet):
+// Per-byte emit, with opaque-run coalescing for the draw path:
 //   - all-transparent (both nibbles 0): skip, no instruction
-//   - all-opaque: mov byte [esi+col], imm8       (4 bytes encoded)
 //   - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
 //                                                 (3 + 2 + 2 + 3 = 10 bytes)
+//   - run of N consecutive fully-opaque bytes: emit largest chunks
+//     while N >= 4: mov dword [esi+col], imm32   (7 bytes,  1 store)
+//     if   N >= 2: mov word  [esi+col], imm16   (6 bytes,  1 store)
+//     if   N == 1: mov byte  [esi+col], imm8    (4 bytes,  1 store)
+//     A run of 4 opaque bytes is therefore one 7-byte store instead of
+//     four 4-byte stores (16 bytes / 4 stores). Unaligned access is
+//     fine on 386+.
 // Per row:
 //   add esi, SURFACE_BYTES_PER_ROW                (6 bytes encoded)
 // Prologue:
@@ -182,11 +188,17 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t  cursor;
    uint16_t  row;
    uint16_t  col;
+    uint16_t  runEnd;
+    uint16_t  runLen;
    uint16_t  heightPx;
    uint16_t  spriteBytesPerRow;
    uint16_t  destBytesPerRow;
    uint8_t   value;
    uint8_t   opaqueMask;
+    uint8_t   v1;
+    uint8_t   v2;
+    uint8_t   v3;
+    uint8_t   m;

    cursor             = 0;
    heightPx           = (uint16_t)(sp->heightTiles * TILE_PIXELS);
@@ -198,7 +210,7 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    out[cursor++] = 0x8B; out[cursor++] = 0x74;
    out[cursor++] = 0x24; out[cursor++] = 0x08;

-    // Body: per row, per dest byte.
+    // Body: per row, scan dest bytes coalescing fully-opaque runs.
    for (row = 0; row < heightPx; row++) {
        if (row > 0) {
            // add esi, SURFACE_BYTES_PER_ROW (32-bit imm)
@@ -208,17 +220,14 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
            out[cursor++] = 0x00;
            out[cursor++] = 0x00;
        }
-        for (col = 0; col < destBytesPerRow; col++) {
+        col = 0;
+        while (col < destBytesPerRow) {
            shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
            if (opaqueMask == 0x00) {
-                continue;       // both nibbles transparent
+                col++;
+                continue;
            }
-            if (opaqueMask == 0xFFu) {
-                // mov byte [esi+col], imm8       (C6 46 cc ii)
-                out[cursor++] = 0xC6; out[cursor++] = 0x46;
-                out[cursor++] = (uint8_t)(col & 0xFFu);
-                out[cursor++] = value;
-            } else {
+            if (opaqueMask != 0xFFu) {
                // Mixed: read-modify-write.
                //   mov al, [esi+col]            (8A 46 cc)
                //   and al, ~opaqueMask          (24 mm)
@@ -232,6 +241,61 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
                out[cursor++] = value;
                out[cursor++] = 0x88; out[cursor++] = 0x46;
                out[cursor++] = (uint8_t)(col & 0xFFu);
+                col++;
+                continue;
+            }
+            // Fully opaque at col -- find the end of the run.
+            runEnd = (uint16_t)(col + 1);
+            while (runEnd < destBytesPerRow) {
+                shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m);
+                if (m != 0xFFu) {
+                    break;
+                }
+                runEnd++;
+            }
+            runLen = (uint16_t)(runEnd - col);
+
+            // Emit dword stores while >= 4 bytes remain, then a word
+            // store if >= 2, then a single byte. shiftedByteAt is cheap
+            // enough that re-reading per chunk beats threading a
+            // fixed-size buffer through.
+            while (runLen >= 4) {
+                shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
+                shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m);
+                shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m);
+                // mov dword [esi+col], imm32   (C7 46 cc ii ii ii ii)
+                out[cursor++] = 0xC7; out[cursor++] = 0x46;
+                out[cursor++] = (uint8_t)(col & 0xFFu);
+                out[cursor++] = value;
+                out[cursor++] = v1;
+                out[cursor++] = v2;
+                out[cursor++] = v3;
+                col    = (uint16_t)(col + 4);
+                runLen = (uint16_t)(runLen - 4);
+                if (runLen > 0) {
+                    shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
+                }
+            }
+            if (runLen >= 2) {
+                shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
+                // mov word [esi+col], imm16    (66 C7 46 cc ii ii)
+                out[cursor++] = 0x66;
+                out[cursor++] = 0xC7; out[cursor++] = 0x46;
+                out[cursor++] = (uint8_t)(col & 0xFFu);
+                out[cursor++] = value;
+                out[cursor++] = v1;
+                col    = (uint16_t)(col + 2);
+                runLen = (uint16_t)(runLen - 2);
+                if (runLen > 0) {
+                    shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
+                }
+            }
+            if (runLen == 1) {
+                // mov byte [esi+col], imm8     (C6 46 cc ii)
+                out[cursor++] = 0xC6; out[cursor++] = 0x46;
+                out[cursor++] = (uint8_t)(col & 0xFFu);
+                out[cursor++] = value;
+                col++;
            }
        }
    }