Sprite run coalescing added to x86.
This commit is contained in:
parent
c4ee37941a
commit
6dd2266f13
1 changed files with 75 additions and 11 deletions
|
|
@ -7,11 +7,17 @@
|
|||
// draw(uint8_t *dst) -- arg in [esp+8] after prologue saves esi
|
||||
// save/restore(const uint8_t *src, uint8_t *dst) -- args in [esp+12]/[esp+16] after esi+edi save
|
||||
//
|
||||
// Per-byte emit (no run coalescing yet):
|
||||
// Per-byte emit, with opaque-run coalescing for the draw path:
|
||||
// - all-transparent (both nibbles 0): skip, no instruction
|
||||
// - all-opaque: mov byte [esi+col], imm8 (4 bytes encoded)
|
||||
// - mixed: mov al,[esi+col]; and al,mask; or al,val; mov [esi+col],al
|
||||
// (3 + 2 + 2 + 3 = 10 bytes)
|
||||
// - run of N consecutive fully-opaque bytes: emit largest chunks first (N below = bytes still left in the run)
|
||||
// while N >= 4: mov dword [esi+col], imm32 (7 bytes, 1 store)
|
||||
// if N >= 2: mov word [esi+col], imm16 (6 bytes, 1 store)
|
||||
// if N == 1: mov byte [esi+col], imm8 (4 bytes, 1 store)
|
||||
// A run of 4 opaque bytes is therefore one 7-byte store instead of
|
||||
// four 4-byte stores (16 bytes / 4 stores). Unaligned access is
|
||||
// fine on 386+.
|
||||
// Per row:
|
||||
// add esi, SURFACE_BYTES_PER_ROW (6 bytes encoded)
|
||||
// Prologue:
|
||||
|
|
@ -182,11 +188,17 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|||
uint16_t cursor;
|
||||
uint16_t row;
|
||||
uint16_t col;
|
||||
uint16_t runEnd;
|
||||
uint16_t runLen;
|
||||
uint16_t heightPx;
|
||||
uint16_t spriteBytesPerRow;
|
||||
uint16_t destBytesPerRow;
|
||||
uint8_t value;
|
||||
uint8_t opaqueMask;
|
||||
uint8_t v1;
|
||||
uint8_t v2;
|
||||
uint8_t v3;
|
||||
uint8_t m;
|
||||
|
||||
cursor = 0;
|
||||
heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
|
||||
|
|
@ -198,7 +210,7 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|||
out[cursor++] = 0x8B; out[cursor++] = 0x74;
|
||||
out[cursor++] = 0x24; out[cursor++] = 0x08;
|
||||
|
||||
// Body: per row, per dest byte.
|
||||
// Body: per row, scan dest bytes coalescing fully-opaque runs.
|
||||
for (row = 0; row < heightPx; row++) {
|
||||
if (row > 0) {
|
||||
// add esi, SURFACE_BYTES_PER_ROW (32-bit imm)
|
||||
|
|
@ -208,17 +220,14 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|||
out[cursor++] = 0x00;
|
||||
out[cursor++] = 0x00;
|
||||
}
|
||||
for (col = 0; col < destBytesPerRow; col++) {
|
||||
col = 0;
|
||||
while (col < destBytesPerRow) {
|
||||
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
|
||||
if (opaqueMask == 0x00) {
|
||||
continue; // both nibbles transparent
|
||||
col++;
|
||||
continue;
|
||||
}
|
||||
if (opaqueMask == 0xFFu) {
|
||||
// mov byte [esi+col], imm8 (C6 46 cc ii)
|
||||
out[cursor++] = 0xC6; out[cursor++] = 0x46;
|
||||
out[cursor++] = (uint8_t)(col & 0xFFu);
|
||||
out[cursor++] = value;
|
||||
} else {
|
||||
if (opaqueMask != 0xFFu) {
|
||||
// Mixed: read-modify-write.
|
||||
// mov al, [esi+col] (8A 46 cc)
|
||||
// and al, ~opaqueMask (24 mm)
|
||||
|
|
@ -232,6 +241,61 @@ uint16_t spriteEmitDrawX86(uint8_t *out, const SpriteT *sp, uint8_t shift) {
|
|||
out[cursor++] = value;
|
||||
out[cursor++] = 0x88; out[cursor++] = 0x46;
|
||||
out[cursor++] = (uint8_t)(col & 0xFFu);
|
||||
col++;
|
||||
continue;
|
||||
}
|
||||
// Fully opaque at col -- find the end of the run.
|
||||
runEnd = (uint16_t)(col + 1);
|
||||
while (runEnd < destBytesPerRow) {
|
||||
shiftedByteAt(sp, row, runEnd, shift, spriteBytesPerRow, &v1, &m);
|
||||
if (m != 0xFFu) {
|
||||
break;
|
||||
}
|
||||
runEnd++;
|
||||
}
|
||||
runLen = (uint16_t)(runEnd - col);
|
||||
|
||||
// Emit dword stores while >= 4 bytes remain, then a word
|
||||
// store if >= 2, then a single byte. shiftedByteAt is cheap
|
||||
// enough that re-reading per chunk beats threading a
|
||||
// fixed-size buffer through.
|
||||
while (runLen >= 4) {
|
||||
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
|
||||
shiftedByteAt(sp, row, (uint16_t)(col + 2), shift, spriteBytesPerRow, &v2, &m);
|
||||
shiftedByteAt(sp, row, (uint16_t)(col + 3), shift, spriteBytesPerRow, &v3, &m);
|
||||
// mov dword [esi+col], imm32 (C7 46 cc ii ii ii ii)
|
||||
out[cursor++] = 0xC7; out[cursor++] = 0x46;
|
||||
out[cursor++] = (uint8_t)(col & 0xFFu);
|
||||
out[cursor++] = value;
|
||||
out[cursor++] = v1;
|
||||
out[cursor++] = v2;
|
||||
out[cursor++] = v3;
|
||||
col = (uint16_t)(col + 4);
|
||||
runLen = (uint16_t)(runLen - 4);
|
||||
if (runLen > 0) {
|
||||
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
|
||||
}
|
||||
}
|
||||
if (runLen >= 2) {
|
||||
shiftedByteAt(sp, row, (uint16_t)(col + 1), shift, spriteBytesPerRow, &v1, &m);
|
||||
// mov word [esi+col], imm16 (66 C7 46 cc ii ii)
|
||||
out[cursor++] = 0x66;
|
||||
out[cursor++] = 0xC7; out[cursor++] = 0x46;
|
||||
out[cursor++] = (uint8_t)(col & 0xFFu);
|
||||
out[cursor++] = value;
|
||||
out[cursor++] = v1;
|
||||
col = (uint16_t)(col + 2);
|
||||
runLen = (uint16_t)(runLen - 2);
|
||||
if (runLen > 0) {
|
||||
shiftedByteAt(sp, row, col, shift, spriteBytesPerRow, &value, &opaqueMask);
|
||||
}
|
||||
}
|
||||
if (runLen == 1) {
|
||||
// mov byte [esi+col], imm8 (C6 46 cc ii)
|
||||
out[cursor++] = 0xC6; out[cursor++] = 0x46;
|
||||
out[cursor++] = (uint8_t)(col & 0xFFu);
|
||||
out[cursor++] = value;
|
||||
col++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue