// Planar 68k sprite codegen for Amiga (post-Phase 9, no chunky shadow).
//
// Emits PIC routines that write directly to the four bitplanes via 4
// address-register pointers (a0..a3 = plane[0..3] base + byteOff,
// where byteOff = y*40 + x/8 -- the dispatcher pre-computes this).
//
// Calling convention (cdecl on m68k-amigaos-gcc):
//   draw(p0, p1, p2, p3):
//     args at 4(sp), 8(sp), 12(sp), 16(sp) -- one ULONG per plane.
//     loaded into a0..a3 by the prologue.
//   save(p0, p1, p2, p3, backup):
//     5 args; backup at 20(sp), loaded into a4.
//   restore(p0, p1, p2, p3, backup):
//     same as save but reads backup, writes planes.
//
// Per-byte plane write encoding decisions:
//   - all-transparent (mask=0): skip the byte entirely
//   - all-opaque (mask=0xFF): move.b #imm, d16(an) (6 bytes)
//   - mixed (0 < mask < 0xFF): read-modify-write through d0
//     (move.b / andi.b / ori.b / move.b, 16 bytes)

// Append one 16-bit word to the instruction stream in big-endian (68k)
// byte order. Returns the number of bytes written (always 2) so call
// sites can accumulate their emit cursor with `cursor += writeBE16(..)`.
static uint16_t writeBE16(uint8_t *out, uint16_t value) {
  out[0] = (uint8_t)((value >> 8) & 0xFFu);
  out[1] = (uint8_t)(value & 0xFFu);
  return 2u;
}

// movea.l d16(sp), an -- load the stack arg at SP+disp into An.
// Encoding: 0010 nnn 001 101 111 + disp16
//         = 0x206F + (n << 9), where n is the destination An.
// (src mode=101 is d16(An), src reg=111 is A7/SP -- the earlier note
// claiming mode=010/base 0x2057 was wrong; the table values below were
// always the correct ones.)
// a0: 0x206F, a1: 0x226F, a2: 0x246F, a3: 0x266F, a4: 0x286F.
static const uint16_t kMoveaSpToAn[] = {
    0x206Fu, 0x226Fu, 0x246Fu, 0x266Fu, 0x286Fu, 0x2A6Fu, 0x2C6Fu, 0x2E6Fu};

// adda.w #imm, an -- adds 16-bit signed imm to An (sign-extended).
// Encoding: 1101 nnn 011 111 100 + imm = 0xD0FC + (n << 9).
static const uint16_t kAddaWImmToAn[] = {
    0xD0FCu, 0xD2FCu, 0xD4FCu, 0xD6FCu, 0xD8FCu, 0xDAFCu, 0xDCFCu, 0xDEFCu};

// ANDI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0010 00 000 000 (size=byte, mode=Dn, reg=D0).
#define ANDI_B_IMM_D0 0x0200u

// ORI.B #imm, D0 -- 4 bytes (opcode word + imm word, byte in low half).
// Opcode: 0000 0000 00 000 000.
#define ORI_B_IMM_D0 0x0000u

// MOVE.B d16(An), D0 -- 4 bytes (opcode + disp).
// Encoding: size=01 (byte), dst reg=000 (D0), dst mode=000 (Dn),
// src mode=101 (d16,An), src reg=An:
//   0001 000 000 101 nnn = 0x1028 + An.
static const uint16_t kMoveBD16AnToD0[] = {0x1028u, 0x1029u, 0x102Au, 0x102Bu};

// MOVE.B D0, d16(An) -- 4 bytes (opcode + disp).
// Encoding: 0001 nnn 101 000 000 = 0x1140 + (An << 9).
static const uint16_t kMoveBD0ToD16An[] = {0x1140u, 0x1340u, 0x1540u, 0x1740u};

// MOVE.B #imm, d16(An) -- 6 bytes (opcode + imm + disp).
// Encoding: 0001 nnn 101 111 100 = 0x117C + (An << 9).
// (Was 0x113C earlier -- that's dst mode=100=predec; mode=101=d16(An)
// is the bit difference. Predec emits a 4-byte instruction with no
// disp word, so the byte stream went out of sync and every subsequent
// instruction decoded into garbage.)
static const uint16_t kMoveBImmToD16An[] = {0x117Cu, 0x137Cu, 0x157Cu, 0x177Cu};

// MOVE.B (a4)+, d16(An) -- 4 bytes (opcode + disp). Used by restore
// (backup pointer lives in a4).
// Encoding: 0001 nnn 101 011 100 = 0x115C + (An << 9).
static const uint16_t kMoveBA4PostincToD16An[] = {0x115Cu, 0x135Cu, 0x155Cu,
                                                  0x175Cu};

// MOVE.B d16(An), (a4)+ -- 4 bytes (opcode + disp). Used by save
// (planes -> backup).
// Encoding: dst reg=100 (A4), dst mode=011 (postinc), src mode=101
// (d16,An), src reg=An:
//   0001 100 011 101 nnn = 0x18E8 + An.
static const uint16_t kMoveBD16AnToA4Postinc[] = {0x18E8u, 0x18E9u, 0x18EAu,
                                                  0x18EBu};

// MOVEM.L reglist, -(SP) -- 4 bytes (opcode + reglist mask).
// Opcode 0x48E7. Predec mask is REVERSED vs all other modes:
// bit 15 = D0, ..., bit 8 = D7, bit 7 = A0, bit 6 = A1, bit 5 = A2,
// bit 4 = A3, bit 3 = A4, bit 2 = A5, bit 1 = A6, bit 0 = A7.
#define MOVEM_L_PUSH_OPCODE 0x48E7u
#define MOVEM_L_MASK_A2_A3 0x0030u    /* bits 5,4 = A2,A3 (predec order) */
#define MOVEM_L_MASK_A2_A3_A4 0x0038u /* bits 5,4,3 = A2,A3,A4 */

// MOVEM.L (SP)+, reglist -- 4 bytes (opcode + reglist mask).
// Opcode 0x4CDF. Postinc mask follows the standard layout:
// bit 0 = D0, ..., bit 7 = D7, bit 8 = A0, ..., bit 15 = A7.
#define MOVEM_L_POP_OPCODE 0x4CDFu
#define MOVEM_L_MASK_POP_A2_A3 0x0C00u    /* bits 11,10 = A3,A2 */
#define MOVEM_L_MASK_POP_A2_A3_A4 0x1C00u /* bits 12,11,10 = A4,A3,A2 */

// RTS opcode.
#define OPCODE_RTS 0x4E75u

// ----- Emit helpers -----

// For shift 0 (byte-aligned x), the sprite's chunky tile data converts
// directly to plane bytes without any sub-byte shifting. For each
// (row, col-byte, plane) we extract the 8 plane bits from 4 chunky
// bytes (= 8 pixels) and produce one plane byte; we also produce a
// mask byte indicating which pixel positions are non-transparent
// (a pixel is non-transparent when its nibble differs from the
// transparent index, 0 by the JoeyLib convention).
//
// Sprite layout: tileData = wTiles x hTiles tiles, each tile = 8 rows
// x 4 chunky bytes (32 bytes). Tiles laid out row-major within the
// sprite. For plane-byte column `c` of row `r`:
//   tileX = c (since each plane byte covers exactly one tile column)
//   tileY = r / 8
//   inTileY = r % 8
//   chunky bytes = tileData + (tileY*wTiles + tileX)*32 + inTileY*4 + 0..3
//
// `col` must be in [0, widthTiles); callers that compute shifted
// variants (spanning widthTiles+1 output bytes per row) check their
// column against widthTiles before invoking this helper and substitute
// all-zero (transparent) data for out-of-range columns.
// Compute the 4 plane bytes plus coverage mask for one byte-aligned
// output byte of the sprite: row `row`, plane-byte column `col`
// (caller guarantees col is in [0, widthTiles)). Pixels equal to
// TRANSPARENT_NIBBLE contribute nothing; every other pixel sets its
// bit in *maskByte and scatters its 4 colour bits across
// planeBytes[0..3] (bit k of the nibble -> plane k).
static void planeByteAndMaskAt(const SpriteT *sp, uint16_t row, uint16_t col,
                               uint8_t *planeBytes /*[4]*/,
                               uint8_t *maskByte) {
  const uint8_t *chunky;
  uint8_t acc[4];
  uint8_t mask;
  uint16_t px;

  /* Locate the 4 chunky source bytes (8 pixels) for this (row, col):
   * tiles are 8 rows x 4 bytes (32 bytes), laid out row-major. */
  chunky = sp->tileData +
           (uint32_t)(((row >> 3) * sp->widthTiles + col) * 32u) +
           (row & 7u) * 4u;

  acc[0] = acc[1] = acc[2] = acc[3] = 0u;
  mask = 0u;
  for (px = 0; px < 8u; px++) {
    /* Even pixel = high nibble, odd pixel = low nibble of chunky[px/2]. */
    uint8_t pix = (px & 1u) ? (uint8_t)(chunky[px >> 1] & 0x0Fu)
                            : (uint8_t)(chunky[px >> 1] >> 4);
    uint8_t bit;
    if (pix == TRANSPARENT_NIBBLE) {
      continue;
    }
    bit = (uint8_t)(0x80u >> px);
    mask = (uint8_t)(mask | bit);
    if (pix & 1u) acc[0] = (uint8_t)(acc[0] | bit);
    if (pix & 2u) acc[1] = (uint8_t)(acc[1] | bit);
    if (pix & 4u) acc[2] = (uint8_t)(acc[2] | bit);
    if (pix & 8u) acc[3] = (uint8_t)(acc[3] | bit);
  }

  planeBytes[0] = acc[0];
  planeBytes[1] = acc[1];
  planeBytes[2] = acc[2];
  planeBytes[3] = acc[3];
  *maskByte = mask;
}

// Shifted variant: produces 4 plane bytes and 1 mask byte for output
// column `outCol` (0..widthTiles inclusive) of row `row` when the
// sprite is shifted right by `shift` pixels (1..7). For shift 0,
// callers should use planeByteAndMaskAt directly (faster, no spill).
//
// Each output byte is composed of bits drawn from up to two source
// plane bytes:
//   leftPart  = src[outCol-1] << (8 - shift)   (high `shift` bits)
//   rightPart = src[outCol]   >> shift         (low 8-shift bits)
// with src[-1] and src[widthTiles] treated as 0/transparent. The
// resulting plane byte is leftPart | rightPart; the mask byte is the
// shifted union of the per-byte source masks.
static void planeByteAndMaskShifted(const SpriteT *sp, uint16_t row, uint16_t outCol, uint8_t shift, uint16_t widthTiles, uint8_t *planeBytes /*[4]*/, uint8_t *maskByte) { uint8_t leftPlanes[AMIGA_BITPLANES]; uint8_t leftMask; uint8_t rightPlanes[AMIGA_BITPLANES]; uint8_t rightMask; uint8_t i; leftMask = 0u; rightMask = 0u; for (i = 0; i < AMIGA_BITPLANES; i++) { leftPlanes[i] = 0u; rightPlanes[i] = 0u; } if (outCol > 0u && (uint16_t)(outCol - 1u) < widthTiles) { planeByteAndMaskAt(sp, row, (uint16_t)(outCol - 1u), leftPlanes, &leftMask); } if (outCol < widthTiles) { planeByteAndMaskAt(sp, row, outCol, rightPlanes, &rightMask); } *maskByte = (uint8_t)(((leftMask << (8u - shift)) & 0xFFu) | ((rightMask >> shift) & 0xFFu)); for (i = 0; i < AMIGA_BITPLANES; i++) { planeBytes[i] = (uint8_t)(((leftPlanes[i] << (8u - shift)) & 0xFFu) | ((rightPlanes[i] >> shift) & 0xFFu)); } } // Emit code that merges one plane byte into d16(an) where d16 is the // row-relative byte offset (0 since we re-base each row by adda.w). // The choice of all-opaque vs mixed encoding cuts code size when many // pixels are opaque (typical for sprite interiors). static uint16_t emitMergeByteToD16An(uint8_t *out, uint16_t cursor, uint8_t an, uint8_t disp, uint8_t maskByte, uint8_t srcByte) { if (maskByte == 0u) { return cursor; /* nothing to write */ } if (maskByte == 0xFFu) { /* All-opaque shortcut: move.b #src, d16(an). */ cursor += writeBE16(out + cursor, kMoveBImmToD16An[an]); cursor += writeBE16(out + cursor, (uint16_t)srcByte); cursor += writeBE16(out + cursor, (uint16_t)disp); return cursor; } /* Mixed: load existing, clear mask bits, OR in src, write back. 
*/ cursor += writeBE16(out + cursor, kMoveBD16AnToD0[an]); cursor += writeBE16(out + cursor, (uint16_t)disp); cursor += writeBE16(out + cursor, ANDI_B_IMM_D0); cursor += writeBE16(out + cursor, (uint16_t)((~maskByte) & 0xFFu)); cursor += writeBE16(out + cursor, ORI_B_IMM_D0); cursor += writeBE16(out + cursor, (uint16_t)srcByte); cursor += writeBE16(out + cursor, kMoveBD0ToD16An[an]); cursor += writeBE16(out + cursor, (uint16_t)disp); return cursor; } // ----- Public API ----- uint16_t spriteEmitDrawPlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t heightPx; uint16_t widthTiles; uint16_t bytesPerRow; /* per plane, per row */ uint8_t planeBytes[AMIGA_BITPLANES]; uint8_t maskByte; uint8_t i; if (shift > 7u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); widthTiles = (uint16_t)sp->widthTiles; bytesPerRow = (uint16_t)(widthTiles + (shift == 0u ? 0u : 1u)); /* Prologue: m68k cdecl callee-saves a2-a6; we clobber a2 and a3 * loading plane pointers, so push them first. After the push, all * stack arg displacements shift by +8 (two longs). 
*/ cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3); for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(8u + 4u + i * 4u)); } for (row = 0; row < heightPx; row++) { for (col = 0; col < bytesPerRow; col++) { if (shift == 0u) { planeByteAndMaskAt(sp, row, col, planeBytes, &maskByte); } else { planeByteAndMaskShifted(sp, row, col, shift, widthTiles, planeBytes, &maskByte); } for (i = 0; i < AMIGA_BITPLANES; i++) { cursor = emitMergeByteToD16An(out, cursor, i, (uint8_t)col, maskByte, planeBytes[i]); } } if (row + 1u < heightPx) { for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); } } } /* Epilogue: restore a2-a3, rts. */ cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3); cursor += writeBE16(out + cursor, OPCODE_RTS); return cursor; } // SAVE: planes -> backup. backup is one contiguous 4*H*W/8 byte buffer // laid out as 4 plane stripes, matching halSpriteSavePlanes format // (so cross-platform save buffer is interchangeable). // // Per row: for each plane, copy bytesPerRow bytes from d16(an) to // (a4)+. After the row's reads, the planes need to advance by 40, // while a4 advances naturally via post-increment. // // Plane stripes are sequential in backup. We could either (a) do all // rows of plane 0, then plane 1, etc. (matches halSpriteSavePlanes // layout), or (b) interleave rows of all 4 planes (different layout). // halSpriteSavePlanes does (a) -- 4 separate plane stripes. The // emitted code below matches that layout for compat. 
uint16_t spriteEmitSavePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t heightPx; uint16_t bytesPerRow; uint8_t i; /* Shifts 2..7 reuse shift 1's bytes (identical memcpy). The * spriteCompile post-emit pass aliases their routineOffsets to * slot 1 so this routine is emitted once. */ if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); /* Prologue: callee-save a2/a3/a4 (m68k cdecl), then load 4 plane * pointers + backup pointer. After the push, all stack arg disps * shift by +12 (three longs). */ cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); } /* a4 = backup. */ cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); /* Plane-major: for each plane, walk all rows. After this routine, * each An has advanced by H*40 (one frame full); we don't need to * unwind because the function returns. We DO need to reset An * back to start before walking the NEXT plane though. * * Simpler alternative: row-major (interleaved). Per row, copy * bytesPerRow bytes from each plane to (a4)+, then advance all * 4 planes by 40. Net: a4 advances by 4*H*bytesPerRow; planes * advance by H*40. Backup layout becomes interleaved (plane0_row0, * plane1_row0, plane2_row0, plane3_row0, plane0_row1, ...). * * That doesn't match halSpriteSavePlanes' plane-major layout. Need * to either (a) match it -- emit per-plane outer loop with a4 * stride between planes -- or (b) change halSpriteSavePlanes to * interleaved. Picking (b) is simpler in emitted code, but ALSO * requires updating halSpriteRestorePlanes and halSpriteRestoreUnder * fallback math. 
* * For now: use plane-major matching halSpriteSavePlanes. Per * plane: walk rows, copy bytes from d16(an) to (a4)+, advance an * by 40 after each row except the last; reset an back to start * before next plane. */ for (i = 0; i < AMIGA_BITPLANES; i++) { for (row = 0; row < heightPx; row++) { for (col = 0; col < bytesPerRow; col++) { cursor += writeBE16(out + cursor, kMoveBD16AnToA4Postinc[i]); cursor += writeBE16(out + cursor, (uint16_t)col); } if (row + 1u < heightPx) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); } } /* Reset An back to the plane base for next iteration. The * total advance was (heightPx - 1) * 40. Subtract that. */ if (i + 1u < AMIGA_BITPLANES) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); } } cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); cursor += writeBE16(out + cursor, OPCODE_RTS); return cursor; } // RESTORE: backup -> planes. Mirror of save. Uses MOVE.B (a4)+, d16(an). uint16_t spriteEmitRestorePlanar68k(uint8_t *out, const SpriteT *sp, uint8_t shift) { uint16_t cursor; uint16_t row; uint16_t col; uint16_t heightPx; uint16_t bytesPerRow; uint8_t i; if (shift > 1u) { return 0u; } cursor = 0; heightPx = (uint16_t)(sp->heightTiles * TILE_PIXELS); bytesPerRow = (uint16_t)(sp->widthTiles + (shift == 0u ? 0u : 1u)); /* Callee-save a2/a3/a4; arg disps shift by +12. 
*/ cursor += writeBE16(out + cursor, MOVEM_L_PUSH_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_A2_A3_A4); for (i = 0; i < AMIGA_BITPLANES; i++) { cursor += writeBE16(out + cursor, kMoveaSpToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + i * 4u)); } cursor += writeBE16(out + cursor, kMoveaSpToAn[4]); cursor += writeBE16(out + cursor, (uint16_t)(12u + 4u + 4u * 4u)); for (i = 0; i < AMIGA_BITPLANES; i++) { for (row = 0; row < heightPx; row++) { for (col = 0; col < bytesPerRow; col++) { cursor += writeBE16(out + cursor, kMoveBA4PostincToD16An[i]); cursor += writeBE16(out + cursor, (uint16_t)col); } if (row + 1u < heightPx) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)AMIGA_BYTES_PER_ROW); } } if (i + 1u < AMIGA_BITPLANES) { cursor += writeBE16(out + cursor, kAddaWImmToAn[i]); cursor += writeBE16(out + cursor, (uint16_t)(0u - ((heightPx - 1u) * AMIGA_BYTES_PER_ROW))); } } cursor += writeBE16(out + cursor, MOVEM_L_POP_OPCODE); cursor += writeBE16(out + cursor, MOVEM_L_MASK_POP_A2_A3_A4); cursor += writeBE16(out + cursor, OPCODE_RTS); return cursor; }