/* joeylib2/src/codegen/spriteEmitPlanar68k.c */

// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow).
//
// Emits PIC routines that write directly to the four bitplanes via 4
// address-register pointers (a0..a3 = plane[0..3] base + byteOff,
// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this).
//
// Calling convention (cdecl on m68k-amigaos-gcc):
// draw(p0, p1, p2, p3):
// args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane.
// loaded into a0..a3 by the prologue.
// save(p0, p1, p2, p3, backup):
// 5 args; backup at 20(sp), loaded into a4.
// restore(p0, p1, p2, p3, backup):
// same as save but reads backup, writes planes.
//
// Per-byte plane write encoding decisions:
// - all-transparent (mask=0): skip the byte entirely
// - all-opaque (mask=0xFF): move.b #imm, d16(an) (6 bytes)
// - mixed (0<mask<0xFF): move.b d16(an), d0;
// andi.b #~mask, d0;
// ori.b #imm, d0;
// move.b d0, d16(an) (4+6+6+4 = 20 bytes)
//
// Per row advance: 4 plane pointers each get adda.w #SURFACE_WIDTH/8
// = adda.w #40, an (4 bytes encoded each, 16 bytes total per row).
// We omit the advance after the last row.
//
// Shift handling: shifts 0..7 are pre-baked. The dispatcher selects
// the variant via x % 8 and pre-computes byteOff = y*40 + (x & ~7)/8
// (i.e. round x DOWN to 8-pixel boundary). The variant for shift s
// then emits to (widthTiles + 1) plane bytes per row when s != 0
// (the rightmost shift bits spill into one extra plane byte) and to
// widthTiles plane bytes per row when s == 0.
//
// The emitter assumes sprite width is a multiple of 8 (= a multiple
// of one tile = a multiple of 8 pixels) so plane bytes per row are
// integer. JoeyLib sprites are always tile-multiple by API contract.
#include "joey/sprite.h"
#include "joey/surface.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
// ----- Constants -----
#define TILE_PIXELS 8          /* tile edge length in pixels */
#define TILE_BYTES 32          /* chunky bytes per 8x8 tile (8 rows x 4 bytes) */
#define TILE_BYTES_PER_ROW 4   /* chunky bytes per tile row (two 4-bit pixels per byte) */
#define TRANSPARENT_NIBBLE 0   /* JoeyLib convention: pixel value 0 is transparent */
#define AMIGA_BITPLANES 4      /* 16-color planar screen = 4 bitplanes */
#define AMIGA_BYTES_PER_ROW 40 /* per-plane row stride: 320 px / 8 bits per byte */
// ----- Instruction encoding helpers -----
/* Store `value` at `out` in big-endian (68k) byte order.
 * Returns the number of bytes written (always 2) so callers can
 * advance their emit cursor with `cursor += writeBE16(...)`. */
static uint16_t writeBE16(uint8_t *out, uint16_t value) {
    out[0] = (uint8_t)(value >> 8);
    out[1] = (uint8_t)(value & 0xFFu);
    return 2u;
}
// movea.l <d16,SP>, an -- load arg at SP+disp into An.
// Encoding: 0010 nnn 001 101 111 + disp16
//   = 0x206F + (n << 9), where n is the destination An
//   (size=long; dst mode=001 (An); src mode=101 (d16,An); src reg=111 (SP)).
// a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F.
static const uint16_t kMoveaSpToAn[] = {
    0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu
};
// adda.w #imm, an -- adds 16-bit signed imm to An (sign-extended to 32
// bits by the CPU; this is what lets a "negative" imm rewind a pointer).
// Encoding: 1101 nnn 011 111 100 + imm
//   = 0xD0FC + (n << 9)  (size=word; src mode=111 reg=100 = immediate).
static const uint16_t kAddaWImmToAn[] = {
    0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu
};
// ANDI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0010 00 000 000 (size=byte, mode=Dn, reg=D0)
#define ANDI_B_IMM_D0 0x0200u
// ORI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0000 00 000 000 (ORI.B to D0 happens to encode as zero)
#define ORI_B_IMM_D0 0x0000u
// MOVE.B d16(An), D0 -- 4 bytes (opcode + disp).
// Encoding: 0001 000 000 mode reg
//   = size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn),
//     src mode=101 (d16,An), src reg=An.
//   = 0001 000 000 101 nnn = 0x1028 + An.
static const uint16_t kMoveBD16AnToD0[] = {
    0x1028u, 0x1029u, 0x102Au, 0x102Bu
};
// MOVE.B D0, d16(An) -- 4 bytes (opcode + disp).
// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9)
//   (dst reg=An, dst mode=101 (d16,An); src mode/reg = D0).
static const uint16_t kMoveBD0ToD16An[] = {
    0x1140u, 0x1340u, 0x1540u, 0x1740u
};
// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm word + disp word; the
// source-EA extension word (imm) precedes the destination-EA extension
// word (disp), per the 68000 extension-word ordering rules).
// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9).
// NOTE(history): an earlier revision used 0x113C -- src mode=100 is
// -(An) predec, which has no extension word, so the emitted stream
// fell out of sync and everything after decoded as garbage. 0x117C
// (src mode=111 reg=100 = immediate) is correct.
static const uint16_t kMoveBImmToD16An[] = {
    0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu
};
// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp); used by restore
// (backup -> planes, backup pointer held in a4).
// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9)
//   (src mode=011 (postinc), src reg=100 (A4)).
static const uint16_t kMoveBA4PostincToD16An[] = {
    0x115Cu, 0x135Cu, 0x155Cu, 0x175Cu
};
// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp); used by save
// (planes -> backup, backup pointer held in a4).
// Encoding: 0001 100 011 101 nnn = 0x18E8 + An
//   (dst reg=100 (A4), dst mode=011 (postinc);
//    src mode=101 (d16,An), src reg=An).
static const uint16_t kMoveBD16AnToA4Postinc[] = {
    0x18E8u, 0x18E9u, 0x18EAu, 0x18EBu
};
// MOVEM.L reglist, -(SP) -- 4 bytes (opcode + reglist mask).
// Opcode 0x48E7. The predecrement mask is REVERSED vs all other modes:
// bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2,
// bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7.
#define MOVEM_L_PUSH_OPCODE 0x48E7u
#define MOVEM_L_MASK_A2_A3 0x0030u /* bits 5,4 = A2,A3 (predec order) */
#define MOVEM_L_MASK_A2_A3_A4 0x0038u /* bits 5,4,3 = A2,A3,A4 */
// MOVEM.L (SP)+, reglist -- 4 bytes (opcode + reglist mask).
// Opcode 0x4CDF. The postincrement mask follows the standard layout:
// bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7.
#define MOVEM_L_POP_OPCODE 0x4CDFu
#define MOVEM_L_MASK_POP_A2_A3 0x0C00u /* bits 11,10 = A3,A2 */
#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u /* bits 12,11,10 = A4,A3,A2 */
// RTS opcode (return from subroutine).
#define OPCODE_RTS 0x4E75u
// ----- Emit helpers -----
// For shift 0 (byte-aligned x), the sprite's chunky tile data converts
// directly to plane bytes without any sub-byte shifting. For each
// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky
// bytes (= 8 pixels) and produce one plane byte; we also produce a
// mask byte indicating which pixel positions are non-transparent
// (any plane bit != 0 in the source means non-transparent if
// transparent index is 0, the JoeyLib convention).
//
// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows
// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the
// sprite. For plane-byte column `c` of row `r`:
// tileX = c (since each plane byte covers exactly one tile column)
// tileY = r / 8
// inTileY = r % 8
// chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3
//
// `col` must be in [0, widthTiles); callers handle out-of-range cols
// (used when computing shifted variants that span widthTiles+1 output
// bytes per row) by passing a sentinel and checking against widthTiles
// before invoking this helper.
static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col,
uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
{
uint16_t tileX;
uint16_t tileY;
uint16_t inTileY;
const uint8_t *tile;
const uint8_t *chunky;
uint8_t nibbles[8];
uint8_t b0, b1, b2, b3;
uint16_t p;
uint8_t bitMask;
uint8_t pix;
tileX = col;
tileY = row >> 3;
inTileY = row & 7u;
tile = sp->tileData + (uint32_t)((tileY * sp->widthTiles + tileX) * 32u);
chunky = tile + inTileY * 4u;
nibbles[0] = (uint8_t)(chunky[0] >> 4);
nibbles[1] = (uint8_t)(chunky[0] & 0x0Fu);
nibbles[2] = (uint8_t)(chunky[1] >> 4);
nibbles[3] = (uint8_t)(chunky[1] & 0x0Fu);
nibbles[4] = (uint8_t)(chunky[2] >> 4);
nibbles[5] = (uint8_t)(chunky[2] & 0x0Fu);
nibbles[6] = (uint8_t)(chunky[3] >> 4);
nibbles[7] = (uint8_t)(chunky[3] & 0x0Fu);
b0 = 0u; b1 = 0u; b2 = 0u; b3 = 0u;
*maskByte = 0u;
for (p = 0; p < 8u; p++) {
pix = nibbles[p];
if (pix == TRANSPARENT_NIBBLE) {
continue;
}
bitMask = (uint8_t)(0x80u >> p);
*maskByte = (uint8_t)(*maskByte | bitMask);
if (pix & 1u) b0 = (uint8_t)(b0 | bitMask);
if (pix & 2u) b1 = (uint8_t)(b1 | bitMask);
if (pix & 4u) b2 = (uint8_t)(b2 | bitMask);
if (pix & 8u) b3 = (uint8_t)(b3 | bitMask);
}
planeBytes[0] = b0;
planeBytes[1] = b1;
planeBytes[2] = b2;
planeBytes[3] = b3;
}
// Shifted variant: 4 plane bytes + 1 mask byte for output column
// `outCol` (0..widthTiles inclusive) of `row` when the sprite is
// shifted right by `shift` pixels (1..7). For shift 0 callers use
// planeByteAndMaskAt directly (no spill byte, one fewer lookup).
//
// Each output byte blends bits from up to two source plane bytes:
//   leftPart  = src[outCol-1] << (8 - shift)   (top `shift` bits)
//   rightPart = src[outCol]   >> shift         (bottom 8-shift bits)
// where src[-1] and src[widthTiles] are treated as all-transparent.
// The mask byte is the identically-shifted union of the source masks.
static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol,
                                    uint8_t shift, uint16_t widthTiles,
                                    uint8_t *planeBytes /*[4]*/, uint8_t *maskByte)
{
    uint8_t srcL[AMIGA_BITPLANES] = {0u, 0u, 0u, 0u};
    uint8_t srcR[AMIGA_BITPLANES] = {0u, 0u, 0u, 0u};
    uint8_t maskL = 0u;
    uint8_t maskR = 0u;
    uint8_t p;

    /* Left contributor: source column outCol-1, when it exists. */
    if (outCol != 0u && (uint16_t)(outCol - 1u) < widthTiles) {
        planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), srcL, &maskL);
    }
    /* Right contributor: source column outCol, when it exists. */
    if (outCol < widthTiles) {
        planeByteAndMaskAt(sp, row, outCol, srcR, &maskR);
    }
    *maskByte = (uint8_t)((uint8_t)(maskL << (8u - shift)) |
                          (uint8_t)(maskR >> shift));
    for (p = 0; p < AMIGA_BITPLANES; p++) {
        planeBytes[p] = (uint8_t)((uint8_t)(srcL[p] << (8u - shift)) |
                                  (uint8_t)(srcR[p] >> shift));
    }
}
// Emit code that blends one plane byte into memory at disp(an).
// Three encodings, chosen by mask density (this is the main code-size
// optimization -- sprite interiors are usually fully opaque):
//   mask == 0x00: fully transparent -- emit nothing;
//   mask == 0xFF: fully opaque -- move.b #imm, d16(an)    (6 bytes);
//   otherwise:    read-modify-write through d0:
//                 move.b d16(an),d0 / andi.b #~mask,d0 /
//                 ori.b #src,d0 / move.b d0,d16(an)       (20 bytes).
// Returns the updated emit cursor.
static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor,
                                     uint8_t an, uint8_t disp,
                                     uint8_t maskByte, uint8_t srcByte)
{
    switch (maskByte) {
    case 0x00u:
        /* Nothing visible in this byte: skip entirely. */
        break;
    case 0xFFu:
        /* All-opaque shortcut: plain immediate store. */
        cursor += writeBE16(out + cursor, kMoveBImmToD16An[an]);
        cursor += writeBE16(out + cursor, (uint16_t)srcByte);
        cursor += writeBE16(out + cursor, (uint16_t)disp);
        break;
    default:
        /* Partial coverage: load the framebuffer byte, clear the
         * covered bits, OR in the sprite bits, store back. */
        cursor += writeBE16(out + cursor, kMoveBD16AnToD0[an]);
        cursor += writeBE16(out + cursor, (uint16_t)disp);
        cursor += writeBE16(out + cursor, ANDI_B_IMM_D0);
        cursor += writeBE16(out + cursor, (uint16_t)(uint8_t)~maskByte);
        cursor += writeBE16(out + cursor, ORI_B_IMM_D0);
        cursor += writeBE16(out + cursor, (uint16_t)srcByte);
        cursor += writeBE16(out + cursor, kMoveBD0ToD16An[an]);
        cursor += writeBE16(out + cursor, (uint16_t)disp);
        break;
    }
    return cursor;
}
// ----- Public API -----
// Emit the draw routine for one pre-shifted variant (shift 0..7).
// Unrolled completely: one merge sequence per (row, column, plane),
// with an adda.w #40 per plane between rows (omitted after the last
// row). Returns the number of code bytes emitted, or 0 for an
// invalid shift.
uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t pc = 0;           /* emit cursor */
    uint16_t rows;             /* sprite height in pixels */
    uint16_t wTiles;           /* sprite width in tiles (= plane bytes) */
    uint16_t outCols;          /* plane bytes written per row */
    uint16_t r;
    uint16_t c;
    uint8_t plane;
    uint8_t bytes[AMIGA_BITPLANES];
    uint8_t mask;

    if (shift > 7u) {
        return 0u; /* only 8 pre-baked shift variants exist */
    }
    rows = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    wTiles = (uint16_t)sp->widthTiles;
    /* Non-zero shifts spill into one extra plane byte per row. */
    outCols = (uint16_t)(shift != 0u ? wTiles + 1u : wTiles);

    /* Prologue: a2/a3 are callee-saved under m68k cdecl and we use
     * them for plane pointers, so push them; the two pushed longs
     * move every stack-arg displacement up by 8. Then load the four
     * plane pointers (args 1..4) into a0..a3. */
    pc += writeBE16(out + pc, MOVEM_L_PUSH_OPCODE);
    pc += writeBE16(out + pc, MOVEM_L_MASK_A2_A3);
    for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
        pc += writeBE16(out + pc, kMoveaSpToAn[plane]);
        pc += writeBE16(out + pc, (uint16_t)(12u + plane * 4u));
    }

    for (r = 0; r < rows; r++) {
        for (c = 0; c < outCols; c++) {
            if (shift != 0u) {
                planeByteAndMaskShifted(sp, r, c, shift, wTiles, bytes, &mask);
            } else {
                planeByteAndMaskAt(sp, r, c, bytes, &mask);
            }
            for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
                pc = emitMergeByteToD16An(out, pc, plane, (uint8_t)c,
                                          mask, bytes[plane]);
            }
        }
        /* Advance every plane pointer one scanline, except after the
         * final row (the routine returns right after it). */
        if (r + 1u < rows) {
            for (plane = 0; plane < AMIGA_BITPLANES; plane++) {
                pc += writeBE16(out + pc, kAddaWImmToAn[plane]);
                pc += writeBE16(out + pc, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }

    /* Epilogue: restore a2/a3, return. */
    pc += writeBE16(out + pc, MOVEM_L_POP_OPCODE);
    pc += writeBE16(out + pc, MOVEM_L_MASK_POP_A2_A3);
    pc += writeBE16(out + pc, OPCODE_RTS);
    return pc;
}
// SAVE: planes -> backup. backup is one contiguous 4*H*bytesPerRow
// buffer laid out as 4 sequential plane stripes (all rows of plane 0,
// then plane 1, ...), matching halSpriteSavePlanes' format so the
// save buffer stays interchangeable across platforms.
//
// Per plane: walk all rows, copying bytesPerRow bytes from d16(an)
// to (a4)+; advance an by 40 after every row but the last. Each
// plane lives in its own address register (a0..a3) loaded once in
// the prologue, so finishing plane i leaves a(i+1) untouched at its
// base -- no rewind instruction is needed between planes. (An
// earlier revision emitted a dead adda.w to reset a(i) here; a(i)
// is never read again before the movem pop restores it, so those
// bytes were pure waste and have been removed.)
//
// Returns the number of code bytes emitted; shifts 2..7 return 0
// because the spriteCompile post-emit pass aliases their
// routineOffsets to slot 1 (all non-zero shifts copy widthTiles+1
// bytes per row, so the routines would be byte-identical).
uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t heightPx;     /* sprite height in pixels */
    uint16_t bytesPerRow;  /* per plane, per row (+1 spill byte if shifted) */
    uint8_t i;

    if (shift > 1u) {
        return 0u; /* aliased to the shift-1 routine by the compiler pass */
    }
    cursor = 0;
    heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));

    /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load the 4
     * plane pointers (args 1..4) into a0..a3 and the backup pointer
     * (arg 5) into a4. The three pushed longs shift every stack-arg
     * displacement by +12. */
    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
    }
    /* a4 = backup. */
    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));

    /* Plane-major copy loop (matches halSpriteSavePlanes layout). */
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        for (row = 0; row < heightPx; row++) {
            for (col = 0; col < bytesPerRow; col++) {
                cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]);
                cursor += writeBE16(out + cursor, (uint16_t)col);
            }
            if (row + 1u < heightPx) {
                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }

    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
    cursor += writeBE16(out + cursor, OPCODE_RTS);
    return cursor;
}
// RESTORE: backup -> planes. Exact mirror of save: plane-major walk
// using MOVE.B (a4)+, d16(an) to copy the backup stripes back into
// the four bitplanes. As in save, each plane has its own address
// register loaded once in the prologue, so no rewind instruction is
// needed between planes (the dead adda.w an earlier revision emitted
// there has been removed). Returns code bytes emitted; shifts 2..7
// return 0 and are aliased to slot 1 by the spriteCompile pass.
uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) {
    uint16_t cursor;
    uint16_t row;
    uint16_t col;
    uint16_t heightPx;     /* sprite height in pixels */
    uint16_t bytesPerRow;  /* per plane, per row (+1 spill byte if shifted) */
    uint8_t i;

    if (shift > 1u) {
        return 0u; /* aliased to the shift-1 routine by the compiler pass */
    }
    cursor = 0;
    heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS);
    bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u));

    /* Prologue: callee-save a2/a3/a4; arg displacements shift by +12.
     * a0..a3 = plane pointers, a4 = backup. */
    cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4);
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        cursor += writeBE16(out + cursor, kMoveaSpToAn[i]);
        cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u));
    }
    cursor += writeBE16(out + cursor, kMoveaSpToAn[4]);
    cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u));

    /* Plane-major copy loop (matches the save routine's layout). */
    for (i = 0; i < AMIGA_BITPLANES; i++) {
        for (row = 0; row < heightPx; row++) {
            for (col = 0; col < bytesPerRow; col++) {
                cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]);
                cursor += writeBE16(out + cursor, (uint16_t)col);
            }
            if (row + 1u < heightPx) {
                cursor += writeBE16(out + cursor, kAddaWImmToAn[i]);
                cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW);
            }
        }
    }

    cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE);
    cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4);
    cursor += writeBE16(out + cursor, OPCODE_RTS);
    return cursor;
}