// x86 sprite codegen (DOS port). Emits 32-bit cdecl-callable PIC
// draw / save / restore routines that read or write 4bpp packed
// surface bytes via [esi+disp8] chains. The C side calls them
// through a function pointer cast.
//
// Calling convention:
//   draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi
//   save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save
//
// Per-byte emit, with opaque-run coalescing for the draw path:
//   - all-transparent (both nibbles 0): skip, no instruction
//   - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
//     (3 + 2 + 2 + 3 = 10 bytes)
//   - run of N consecutive fully-opaque bytes: emit largest chunks
//       while N >= 4: mov dword [esi+col], imm32 (7 bytes, 1 store)
//       if N >= 2: mov word [esi+col], imm16 (6 bytes, 1 store)
//       if N == 1: mov byte [esi+col], imm8 (4 bytes, 1 store)
//     A run of 4 opaque bytes is therefore one 7-byte store instead of
//     four 4-byte stores (16 bytes / 4 stores). Unaligned access is
//     fine on 386+.
// Per row:
//   add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded)
// Prologue:
//   push esi; mov esi, [esp+8] (1 + 4 = 5 bytes)
// Epilogue:
//   pop esi; ret (1 + 1 = 2 bytes)

#include "joey/sprite.h"
#include "joey/surface.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"


// ----- Constants -----

#define TILE_PIXELS 8
#define TILE_BYTES 32
#define TILE_BYTES_PER_ROW 4
#define TRANSPARENT_NIBBLE 0

// Worst-case bytes per emitted routine, used to size the scratch
// buffer. A 32x32 sprite is 32 rows * (16 dest bytes + 1 for shift1)
// = 544 dest-byte slots, each up to 10 bytes mixed = 5440; plus
// per-row advances (every row after the first) 31*6=186; plus
// prologue/epilogue 5+2=7. That totals 5633 bytes. Round up
// generously.
#define MAX_ROUTINE_BYTES 8192 // ----- Prototypes ----- static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc); static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask); static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); // ----- Emit helpers (alphabetical) ----- // Shared body for save/restore. Walks heightPx rows of copyBytes // using rep movsd for the dword-aligned bulk and rep movsb for the // byte tail. After each row except the last, advances either esi // or edi by (SURFACE_BYTES_PER_ROW - copyBytes) so the strided side // (screen) lines up with the next scanline; the contiguous side // (backup) advances naturally because rep movs* leaves the index // register one past the last byte copied. // // strideOnSrc=true -> source has the screen stride (SAVE) // strideOnSrc=false -> destination has the screen stride (RESTORE) static uint16_t emitCopyBodyX86(uint8_t *out, uint16_t cursor, uint16_t heightPx, uint16_t copyBytes, bool strideOnSrc) { uint16_t row; uint16_t dwords; uint16_t tail; int32_t advance; dwords = (uint16_t)(copyBytes >> 2); tail = (uint16_t)(copyBytes & 0x3u); advance = (int32_t)SURFACE_BYTES_PER_ROW - (int32_t)copyBytes; for (row = 0; row < heightPx; row++) { if (dwords > 0) { // mov ecx, dwords (B9 imm32); rep movsd (F3 A5) out[cursor++] = 0xB9; out[cursor++] = (uint8_t)(dwords & 0xFFu); out[cursor++] = (uint8_t)((dwords >> 8) & 0xFFu); out[cursor++] = 0; out[cursor++] = 0; out[cursor++] = 0xF3; out[cursor++] = 0xA5; } if (tail > 0) { // mov ecx, tail (B9 imm32); rep movsb (F3 A4) out[cursor++] = 0xB9; out[cursor++] = (uint8_t)(tail & 0xFFu); out[cursor++] = 0; out[cursor++] = 0; out[cursor++] = 0; out[cursor++] = 0xF3; out[cursor++] = 0xA4; } if (row + 1u < heightPx) { // SAVE: add esi, advance (81 C6 imm32) // RESTORE: add edi, advance (81 C7 imm32) 
out[cursor++] = 0x81; out[cursor++] = (uint8_t)(strideOnSrc ? 0xC6u : 0xC7u); out[cursor++] = (uint8_t)(advance & 0xFFu); out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu); out[cursor++] = (uint8_t)((advance >> 16) & 0xFFu); out[cursor++] = (uint8_t)((advance >> 24) & 0xFFu); } } return cursor; } // Decompose a destination byte's contribution from the sprite into // (value, opaqueMask) for shift in {0, 1}. opaqueMask high nibble // 0xF0 means high dest nibble is opaque; 0x0F means low is opaque; // 0x00 means both transparent. value's transparent nibbles are 0. static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask) { uint8_t srcByte; uint8_t hi; uint8_t lo; bool hasLeft; bool hasRight; *outValue = 0; *outOpaqueMask = 0; if (shift == 0) { if (col >= spriteBytesPerRow) { return; } srcByte = spriteSourceByte(sp, row, col); hi = (uint8_t)((srcByte >> 4) & 0x0Fu); lo = (uint8_t)(srcByte & 0x0Fu); if (hi != TRANSPARENT_NIBBLE) { *outValue |= (uint8_t)(hi << 4); *outOpaqueMask |= 0xF0u; } if (lo != TRANSPARENT_NIBBLE) { *outValue |= lo; *outOpaqueMask |= 0x0Fu; } return; } // shift = 1 hasLeft = (col >= 1) && ((uint16_t)(col - 1) < spriteBytesPerRow); hasRight = (col < spriteBytesPerRow); if (hasLeft) { srcByte = spriteSourceByte(sp, row, (uint16_t)(col - 1)); hi = (uint8_t)(srcByte & 0x0Fu); // sprite byte's LOW nibble if (hi != TRANSPARENT_NIBBLE) { *outValue |= (uint8_t)(hi << 4); *outOpaqueMask |= 0xF0u; } } if (hasRight) { srcByte = spriteSourceByte(sp, row, col); lo = (uint8_t)((srcByte >> 4) & 0x0Fu); // sprite byte's HIGH nibble if (lo != TRANSPARENT_NIBBLE) { *outValue |= lo; *outOpaqueMask |= 0x0Fu; } } } // Sample a sprite tile-data byte at (row, col) where col is in // sprite-byte coordinates (0..spriteBytesPerRow-1). 
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col) { uint16_t tileX; uint16_t tileY; uint16_t inTileX; uint16_t inTileY; const uint8_t *tile; tileX = (uint16_t)(col / TILE_BYTES_PER_ROW); tileY = (uint16_t)(row / TILE_PIXELS); inTileX = (uint16_t)(col & (TILE_BYTES_PER_ROW - 1)); inTileY = (uint16_t)(row & (TILE_PIXELS - 1)); tile = sp->tileData + ((uint32_t)(tileY * sp->widthTiles + tileX)) * TILE_BYTES; return tile[inTileY * TILE_BYTES_PER_ROW + inTileX]; } // Emit a draw routine for one shift variant. Returns bytes written. // Routine signature: void f(uint8_t *dst). uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t runEnd; uint16_t runLen; uint16_t heightPx; uint16_t spriteBytesPerRow; uint16_t destBytesPerRow; uint8_t value; uint8_t opaqueMask; uint8_t v1; uint8_t v2; uint8_t v3; uint8_t m; if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); spriteBytesPerRow = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW); destBytesPerRow = (uint16_t)(spriteBytesPerRow + (shift == 1 ? 1 : 0)); // Prologue: push esi; mov esi, [esp+8] out[cursor++] = 0x56; // push esi out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x08; // Body: per row, scan dest bytes coalescing fully-opaque runs. for (row = 0; row < heightPx; row++) { if (row > 0) { // add esi, SURFACE_BYTES_PER_ROW (32-bit imm) out[cursor++] = 0x81; out[cursor++] = 0xC6; out[cursor++] = (uint8_t)(SURFACE_BYTES_PER_ROW & 0xFFu); out[cursor++] = (uint8_t)((SURFACE_BYTES_PER_ROW >> 8) & 0xFFu); out[cursor++] = 0x00; out[cursor++] = 0x00; } col = 0; while (col < destBytesPerRow) { shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); if (opaqueMask == 0x00) { col++; continue; } if (opaqueMask != 0xFFu) { // Mixed: read-modify-write. 
// mov al, [esi+col] (8A 46 cc) // and al, ~opaqueMask (24 mm) // or al, value (0C vv) // mov [esi+col], al (88 46 cc) out[cursor++] = 0x8A; out[cursor++] = 0x46; out[cursor++] = (uint8_t)(col & 0xFFu); out[cursor++] = 0x24; out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu); out[cursor++] = 0x0C; out[cursor++] = value; out[cursor++] = 0x88; out[cursor++] = 0x46; out[cursor++] = (uint8_t)(col & 0xFFu); col++; continue; } // Fully opaque at col -- find the end of the run. runEnd = (uint16_t)(col + 1); while (runEnd < destBytesPerRow) { shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m); if (m != 0xFFu) { break; } runEnd++; } runLen = (uint16_t)(runEnd - col); // Emit dword stores while >= 4 bytes remain, then a word // store if >= 2, then a single byte. shiftedByteAt is cheap // enough that re-reading per chunk beats threading a // fixed-size buffer through. while (runLen >= 4) { shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m); shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m); shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m); // mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii) out[cursor++] = 0xC7; out[cursor++] = 0x46; out[cursor++] = (uint8_t)(col & 0xFFu); out[cursor++] = value; out[cursor++] = v1; out[cursor++] = v2; out[cursor++] = v3; col = (uint16_t)(col + 4); runLen = (uint16_t)(runLen - 4); if (runLen > 0) { shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); } } if (runLen >= 2) { shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m); // mov word [esi+col], imm16 (66 C7 46 cc ii ii) out[cursor++] = 0x66; out[cursor++] = 0xC7; out[cursor++] = 0x46; out[cursor++] = (uint8_t)(col & 0xFFu); out[cursor++] = value; out[cursor++] = v1; col = (uint16_t)(col + 2); runLen = (uint16_t)(runLen - 2); if (runLen > 0) { shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask); } } if (runLen == 1) { // 
mov byte [esi+col], imm8 (C6 46 cc ii) out[cursor++] = 0xC6; out[cursor++] = 0x46; out[cursor++] = (uint8_t)(col & 0xFFu); out[cursor++] = value; col++; } } } // Epilogue: pop esi; ret out[cursor++] = 0x5E; out[cursor++] = 0xC3; return cursor; } // RESTORE: copy backup -> screen. Destination has the screen stride. uint16_t spriteEmitRestoreX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t heightPx; uint16_t copyBytes; if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); // Prologue: push esi; push edi; mov esi,[esp+12]; mov edi,[esp+16] out[cursor++] = 0x56; out[cursor++] = 0x57; out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C; out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10; cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, false); // Epilogue: pop edi; pop esi; ret out[cursor++] = 0x5F; out[cursor++] = 0x5E; out[cursor++] = 0xC3; return cursor; } // SAVE: copy screen -> backup. Source has the screen stride. uint16_t spriteEmitSaveX86(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t heightPx; uint16_t copyBytes; if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); copyBytes = (uint16_t)(sp->widthTiles * TILE_BYTES_PER_ROW + (shift == 1u ? 1u : 0u)); out[cursor++] = 0x56; out[cursor++] = 0x57; out[cursor++] = 0x8B; out[cursor++] = 0x74; out[cursor++] = 0x24; out[cursor++] = 0x0C; out[cursor++] = 0x8B; out[cursor++] = 0x7C; out[cursor++] = 0x24; out[cursor++] = 0x10; cursor = emitCopyBodyX86(out, cursor, heightPx, copyBytes, true); out[cursor++] = 0x5F; out[cursor++] = 0x5E; out[cursor++] = 0xC3; return cursor; }