// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow).
//
// Emits PIC routines that write directly to the four bitplanes via 4
// address-register pointers (a0..a3 = plane[0..3] base + byteOff,
// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this).
//
// Calling convention (cdecl on m68k-amigaos-gcc):
//   draw(p0, p1, p2, p3):
//     args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane.
//     loaded into a0..a3 by the prologue.
//   save(p0, p1, p2, p3, backup):
//     5 args; backup at 20(sp), loaded into a4.
//   restore(p0, p1, p2, p3, backup):
//     same as save but reads backup, writes planes.
//
// Per-byte plane write encoding decisions:
//   - all-transparent (mask=0): skip the byte entirely
//   - all-opaque (mask=0xFF): move.b #imm, d16(an) (6 bytes)
//   - mixed (0 < mask < 0xFF): read-modify-write through d0
//     (move.b / andi.b / ori.b / move.b, 16 bytes)

// Append one 16-bit word to the instruction stream in big-endian (68k)
// byte order. Returns the number of bytes written (always 2) so call
// sites can accumulate their emit cursor with `cursor += writeBE16(..)`.
static uint16_t writeBE16(uint8_t *out, uint16_t value) {
  out[0] = (uint8_t)((value >> 8) & 0xFFu);
  out[1] = (uint8_t)(value & 0xFFu);
  return 2u;
}

// movea.l d16(sp), an -- load the stack arg at SP+disp into An.
// Encoding: 0010 nnn 001 101 111 + disp16
//         = 0x206F + (n << 9), where n is the destination An.
// (src mode=101 is d16(An), src reg=111 is A7/SP -- the earlier note
// claiming mode=010/base 0x2057 was wrong; the table values below were
// always the correct ones.)
// a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F.
static const uint16_t kMoveaSpToAn[] = {
    0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu};

// adda.w #imm, an -- adds 16-bit signed imm to An (sign-extended).
// Encoding: 1101 nnn 011 111 100 + imm = 0xD0FC + (n << 9).
static const uint16_t kAddaWImmToAn[] = {
    0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu};

// ANDI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0010 00 000 000 (size=byte, mode=Dn, reg=D0).
#define ANDI_B_IMM_D0 0x0200u

// ORI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0000 00 000 000.
#define ORI_B_IMM_D0 0x0000u

// MOVE.B d16(An), D0 -- 4 bytes (opcode + disp).
// Encoding: size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn),
// src mode=101 (d16,An), src reg=An:
//   0001 000 000 101 nnn = 0x1028 + An.
static const uint16_t kMoveBD16AnToD0[] = {0x1028u, 0x1029u, 0x102Au, 0x102Bu};

// MOVE.B D0, d16(An) -- 4 bytes (opcode + disp).
// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9).
static const uint16_t kMoveBD0ToD16An[] = {0x1140u, 0x1340u, 0x1540u, 0x1740u};

// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp).
// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9).
// (Was 0x113C earlier -- that's dst mode=100=predec; mode=101=d16(An)
// is the bit difference. Predec emits a 4-byte instruction with no
// disp word, so the byte stream went out of sync and every subsequent
// instruction decoded into garbage.)
static const uint16_t kMoveBImmToD16An[] = {0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu};

// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp). Used by restore
// (backup pointer lives in a4).
// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9).
static const uint16_t kMoveBA4PostincToD16An[] = {0x115Cu, 0x135Cu, 0x155Cu,
                                                  0x175Cu};

// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp). Used by save
// (planes -> backup).
// Encoding: dst reg=100 (A4), dst mode=011 (postinc), src mode=101
// (d16,An), src reg=An:
//   0001 100 011 101 nnn = 0x18E8 + An.
static const uint16_t kMoveBD16AnToA4Postinc[] = {0x18E8u, 0x18E9u, 0x18EAu,
                                                  0x18EBu};

// MOVEM.L reglist, -(SP) -- 4 bytes (opcode + reglist mask).
// Opcode 0x48E7. Predec mask is REVERSED vs all other modes:
// bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2,
// bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7.
#define MOVEM_L_PUSH_OPCODE 0x48E7u
#define MOVEM_L_MASK_A2_A3 0x0030u    /* bits 5,4 = A2,A3 (predec order) */
#define MOVEM_L_MASK_A2_A3_A4 0x0038u /* bits 5,4,3 = A2,A3,A4 */

// MOVEM.L (SP)+, reglist -- 4 bytes (opcode + reglist mask).
// Opcode 0x4CDF. Postinc mask follows the standard layout:
// bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7.
#define MOVEM_L_POP_OPCODE 0x4CDFu
#define MOVEM_L_MASK_POP_A2_A3 0x0C00u    /* bits 11,10 = A3,A2 */
#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u /* bits 12,11,10 = A4,A3,A2 */

// RTS opcode.
#define OPCODE_RTS 0x4E75u

// ----- Emit helpers -----

// For shift 0 (byte-aligned x), the sprite's chunky tile data converts
// directly to plane bytes without any sub-byte shifting. For each
// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky
// bytes (= 8 pixels) and produce one plane byte; we also produce a
// mask byte indicating which pixel positions are non-transparent
// (a pixel is non-transparent when its nibble differs from the
// transparent index, 0 by the JoeyLib convention).
//
// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows
// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the
// sprite. For plane-byte column `c` of row `r`:
//   tileX = c (since each plane byte covers exactly one tile column)
//   tileY = r / 8
//   inTileY = r % 8
//   chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3
//
// `col` must be in [0, widthTiles); callers that compute shifted
// variants (spanning widthTiles+1 output bytes per row) check their
// column against widthTiles before invoking this helper and substitute
// all-zero (transparent) data for out-of-range columns.
// Compute the 4 plane bytes plus coverage mask for one byte-aligned
// output byte of the sprite: row `row`, plane-byte column `col`
// (caller guarantees col is in [0, widthTiles)). Pixels equal to
// TRANSPARENT_NIBBLE contribute nothing; every other pixel sets its
// bit in *maskByte and scatters its 4 colour bits across
// planeBytes[0..3] (bit k of the nibble -> plane k).
static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col,
                               uint8_t *planeBytes /*[4]*/,
                               uint8_t *maskByte) {
  const uint8_t *chunky;
  uint8_t acc[4];
  uint8_t mask;
  uint16_t px;

  /* Locate the 4 chunky source bytes (8 pixels) for this (row, col):
   * tiles are 8 rows x 4 bytes (32 bytes), laid out row-major. */
  chunky = sp->tileData +
           (uint32_t)(((row >> 3) * sp->widthTiles + col) * 32u) +
           (row & 7u) * 4u;

  acc[0] = acc[1] = acc[2] = acc[3] = 0u;
  mask = 0u;
  for (px = 0; px < 8u; px++) {
    /* Even pixel = high nibble, odd pixel = low nibble of chunky[px/2]. */
    uint8_t pix = (px & 1u) ? (uint8_t)(chunky[px >> 1] & 0x0Fu)
                            : (uint8_t)(chunky[px >> 1] >> 4);
    uint8_t bit;
    if (pix == TRANSPARENT_NIBBLE) {
      continue;
    }
    bit = (uint8_t)(0x80u >> px);
    mask = (uint8_t)(mask | bit);
    if (pix & 1u) acc[0] = (uint8_t)(acc[0] | bit);
    if (pix & 2u) acc[1] = (uint8_t)(acc[1] | bit);
    if (pix & 4u) acc[2] = (uint8_t)(acc[2] | bit);
    if (pix & 8u) acc[3] = (uint8_t)(acc[3] | bit);
  }

  planeBytes[0] = acc[0];
  planeBytes[1] = acc[1];
  planeBytes[2] = acc[2];
  planeBytes[3] = acc[3];
  *maskByte = mask;
}

// Shifted variant: produces 4 plane bytes and 1 mask byte for output
// column `outCol` (0..widthTiles inclusive) of row `row` when the
// sprite is shifted right by `shift` pixels (1..7). For shift 0,
// callers should use planeByteAndMaskAt directly (faster, no spill).
//
// Each output byte is composed of bits drawn from up to two source
// plane bytes:
//   leftPart  = src[outCol-1] << (8 - shift)   (high `shift` bits)
//   rightPart = src[outCol]   >> shift         (low 8-shift bits)
// with src[-1] and src[widthTiles] treated as 0/transparent. The
// resulting plane byte is leftPart | rightPart; the mask byte is the
// shifted union of the per-byte source masks.
static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol, uint8_t shift, uint16_t widthTiles, uint8_t *planeBytes /*[4]*/, uint8_t *maskByte) { uint8_t leftPlanes[AMIGA_BITPLANES]; uint8_t leftMask; uint8_t rightPlanes[AMIGA_BITPLANES]; uint8_t rightMask; uint8_t i; leftMask = 0u; rightMask = 0u; for (i = 0; i < AMIGA_BITPLANES; i++) { leftPlanes[i] = 0u; rightPlanes[i] = 0u; } if (outCol > 0u && (uint16_t)(outCol - 1u) < widthTiles) { planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), leftPlanes, &leftMask); } if (outCol < widthTiles) { planeByteAndMaskAt(sp, row, outCol, rightPlanes, &rightMask); } *maskByte = (uint8_t)(((leftMask << (8u - shift)) & 0xFFu) | ((rightMask >> shift) & 0xFFu)); for (i = 0; i < AMIGA_BITPLANES; i++) { planeBytes[i] = (uint8_t)(((leftPlanes[i] << (8u - shift)) & 0xFFu) | ((rightPlanes[i] >> shift) & 0xFFu)); } } // Emit code that merges one plane byte into d16(an) where d16 is the // row-relative byte offset (0 since we re-base each row by adda.w). // The choice of all-opaque vs mixed encoding cuts code size when many // pixels are opaque (typical for sprite interiors). static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor, uint8_t an, uint8_t disp, uint8_t maskByte, uint8_t srcByte) { if (maskByte == 0u) { return cursor; /* nothing to write */ } if (maskByte == 0xFFu) { /* All-opaque shortcut: move.b #src, d16(an). */ cursor += writeBE16(out + cursor, kMoveBImmToD16An[an]); cursor += writeBE16(out + cursor, (uint16_t)srcByte); cursor += writeBE16(out + cursor, (uint16_t)disp); return cursor; } /* Mixed: load existing, clear mask bits, OR in src, write back. 
*/ cursor += writeBE16(out + cursor, kMoveBD16AnToD0[an]); cursor += writeBE16(out + cursor, (uint16_t)disp); cursor += writeBE16(out + cursor, ANDI_B_IMM_D0); cursor += writeBE16(out + cursor, (uint16_t)((~maskByte) & 0xFFu)); cursor += writeBE16(out + cursor, ORI_B_IMM_D0); cursor += writeBE16(out + cursor, (uint16_t)srcByte); cursor += writeBE16(out + cursor, kMoveBD0ToD16An[an]); cursor += writeBE16(out + cursor, (uint16_t)disp); return cursor; } // ----- Public API ----- uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t heightPx; uint16_t widthTiles; uint16_t bytesPerRow; /* per plane, per row */ uint8_t planeBytes[AMIGA_BITPLANES]; uint8_t maskByte; uint8_t i; if (shift > 7u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); widthTiles = (uint16_t)sp->widthTiles; bytesPerRow = (uint16_t)(widthTiles + (shift == 0u ? 0u : 1u)); /* Prologue: m68k cdecl callee-saves a2-a6; we clobber a2 and a3 * loading plane pointers, so push them first. After the push, all * stack arg displacements shift by +8 (two longs). 
*/ cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3); for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(8u + 4u + i * 4u)); } for (row = 0; row < heightPx; row++) { for (col = 0; col < bytesPerRow; col++) { if (shift == 0u) { planeByteAndMaskAt(sp, row, col, planeBytes, &maskByte); } else { planeByteAndMaskShifted(sp, row, col, shift, widthTiles, planeBytes, &maskByte); } for (i = 0; i < AMIGA_BITPLANES; i++) { cursor = emitMergeByteToD16An(out, cursor, i, (uint8_t)col, maskByte, planeBytes[i]); } } if (row + 1u < heightPx) { for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); } } } /* Epilogue: restore a2-a3, rts. */ cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3); cursor += writeBE16(out + cursor, OPCODE_RTS); return cursor; } // SAVE: planes -> backup. backup is one contiguous 4*H*W/8 byte buffer // laid out as 4 plane stripes, matching halSpriteSavePlanes format // (so cross-platform save buffer is interchangeable). // // Per row: for each plane, copy bytesPerRow bytes from d16(an) to // (a4)+. After the row's reads, the planes need to advance by 40, // while a4 advances naturally via post-increment. // // Plane stripes are sequential in backup. We could either (a) do all // rows of plane 0, then plane 1, etc. (matches halSpriteSavePlanes // layout), or (b) interleave rows of all 4 planes (different layout). // halSpriteSavePlanes does (a) -- 4 separate plane stripes. The // emitted code below matches that layout for compat. 
uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t heightPx; uint16_t bytesPerRow; uint8_t i; /* Shifts 2..7 reuse shift 1's bytes (identical memcpy). The * spriteCompile post-emit pass aliases their routineOffsets to * slot 1 so this routine is emitted once. */ if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane * pointers + backup pointer. After the push, all stack arg disps * shift by +12 (three longs). */ cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); } /* a4 = backup. */ cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); /* Plane-major: for each plane, walk all rows. After this routine, * each An has advanced by H*40 (one frame full); we don't need to * unwind because the function returns. We DO need to reset An * back to start before walking the NEXT plane though. * * Simpler alternative: row-major (interleaved). Per row, copy * bytesPerRow bytes from each plane to (a4)+, then advance all * 4 planes by 40. Net: a4 advances by 4*H*bytesPerRow; planes * advance by H*40. Backup layout becomes interleaved (plane0_row0, * plane1_row0, plane2_row0, plane3_row0, plane0_row1, ...). * * That doesn't match halSpriteSavePlanes' plane-major layout. Need * to either (a) match it -- emit per-plane outer loop with a4 * stride between planes -- or (b) change halSpriteSavePlanes to * interleaved. Picking (b) is simpler in emitted code, but ALSO * requires updating halSpriteRestorePlanes and halSpriteRestoreUnder * fallback math. 
* * For now: use plane-major matching halSpriteSavePlanes. Per * plane: walk rows, copy bytes from d16(an) to (a4)+, advance an * by 40 after each row except the last; reset an back to start * before next plane. */ for (i = 0; i < AMIGA_BITPLANES; i++) { for (row = 0; row < heightPx; row++) { for (col = 0; col < bytesPerRow; col++) { cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]); cursor += writeBE16(out + cursor, (uint16_t)col); } if (row + 1u < heightPx) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); } } /* Reset An back to the plane base for next iteration. The * total advance was (heightPx - 1) * 40. Subtract that. */ if (i + 1u < AMIGA_BITPLANES) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); } } cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); cursor += writeBE16(out + cursor, OPCODE_RTS); return cursor; } // RESTORE: backup -> planes. Mirror of save. Uses MOVE.B (a4)+, d16(an). uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t heightPx; uint16_t bytesPerRow; uint8_t i; if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); /* Callee-save a2/a3/a4; arg disps shift by +12. 
*/ cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); } cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); for (i = 0; i < AMIGA_BITPLANES; i++) { for (row = 0; row < heightPx; row++) { for (col = 0; col < bytesPerRow; col++) { cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]); cursor += writeBE16(out + cursor, (uint16_t)col); } if (row + 1u < heightPx) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); } } if (i + 1u < AMIGA_BITPLANES) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); } } cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); cursor += writeBE16(out + cursor, OPCODE_RTS); return cursor; }