// 3D vertex pipeline -- ports the math from chunk5 polygon code.
//
// L7EBC, ClassifyVertex1/2, ProjectV1ToScreen, ProjectV2ToScreen,
// PerspectiveDivide, EmitPrimaryVertex.

#include <string.h>
#include "sceneryProjection.h"
#include "types.h"


// Forward declarations for file-local helpers that are defined further
// down in this file.
static int16_t l1818SignedMul(int8_t y, int8_t x);
static int16_t perspectiveDivide(int16_t numerator, int16_t denominator);
static int16_t readSigned16Le(const uint8_t *p);


// Signed 8-bit x 8-bit multiply producing a 16-bit result.
//
// Stand-in for the chunk5/chunk4 L1818 / MultiplyXY routine. Per the
// porting notes, the 6502 code is a 7-step shift-add multiply (chunk4
// line 1998) whose result may differ from an exact product in the
// least-significant bit; this port deliberately uses the exact native
// product and accepts that difference.
//
//   y, x    signed 8-bit factors
//   returns y * x (always fits in int16_t: |product| <= 128 * 128)
static int16_t l1818SignedMul(int8_t y, int8_t x) {
        const int product = (int)y * (int)x;
        return (int16_t)product;
}


// Perspective lookup table for the screen-X axis (PerspectiveDivide
// port, chunk5 line 3779).
//
// perspectiveDivideTable() below reduces |numerator| / |denominator| to
// a 7-bit quotient (0..127) with a shift-and-subtract loop and uses that
// quotient as the index into one of the two 128-entry tables in this
// file. The entry is the unsigned screen offset for that quotient;
// sceneryProjectVertexToScreen() applies it as `screen_X = $46 + result`
// (X table) or `screen_Y = $31 - result` (Y table).
//
// Per the original porting notes: the tables come from chunk5.s (line
// 4208 onward), live at $7D52 (X) and $7DD2 (Y) in MAME RAM, and were
// validated bit-exact via the FS2TRACE_PERSP=1 oracle.
//
// NOTE(review): an earlier revision of this comment described a plain
// signed integer divide with special cases (|num| == |den| -> +/-$7F,
// |num| > |den|*256 -> saturate to $7FFF / $8001). The code in this
// file is table-driven and does not implement those cases -- confirm
// that the old description is obsolete.
static const uint8_t kPerspXTable[128] = {
        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
        0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x07, 0x08,
        0x08, 0x09, 0x09, 0x0A, 0x0A, 0x0B, 0x0B, 0x0C,
        0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F, 0x10, 0x10,
        0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15,
        0x15, 0x16, 0x16, 0x17, 0x17, 0x18, 0x18, 0x19,
        0x1A, 0x1A, 0x1B, 0x1B, 0x1C, 0x1C, 0x1D, 0x1D,
        0x1E, 0x1E, 0x1F, 0x20, 0x20, 0x21, 0x21, 0x22,
        0x22, 0x23, 0x23, 0x24, 0x24, 0x25, 0x26, 0x26,
        0x27, 0x27, 0x28, 0x28, 0x29, 0x29, 0x2A, 0x2A,
        0x2B, 0x2C, 0x2C, 0x2D, 0x2D, 0x2E, 0x2E, 0x2F,
        0x2F, 0x30, 0x30, 0x31, 0x31, 0x32, 0x33, 0x33,
        0x34, 0x34, 0x35, 0x35, 0x36, 0x36, 0x37, 0x37,
        0x38, 0x39, 0x39, 0x3A, 0x3A, 0x3B, 0x3B, 0x3C,
        0x3C, 0x3D, 0x3D, 0x3E, 0x3F, 0x3F, 0x40, 0x40,
        0x41, 0x41, 0x42, 0x42, 0x43, 0x43, 0x44, 0x45,
};
// Perspective lookup table for the screen-Y axis. Same layout as
// kPerspXTable: 128 entries indexed by the 7-bit quotient computed in
// perspectiveDivideTable(). Entries top out at 0x31, the same constant
// sceneryProjectVertexToScreen() uses as its Y bias.
static const uint8_t kPerspYTable[128] = {
        0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02,
        0x03, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x05,
        0x06, 0x06, 0x07, 0x07, 0x07, 0x08, 0x08, 0x08,
        0x09, 0x09, 0x0A, 0x0A, 0x0A, 0x0B, 0x0B, 0x0C,
        0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0E, 0x0F,
        0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12,
        0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15,
        0x15, 0x16, 0x16, 0x17, 0x17, 0x17, 0x18, 0x18,
        0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B,
        0x1C, 0x1C, 0x1C, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E,
        0x1F, 0x1F, 0x20, 0x20, 0x20, 0x21, 0x21, 0x21,
        0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, 0x25,
        0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
        0x28, 0x29, 0x29, 0x29, 0x2A, 0x2A, 0x2A, 0x2B,
        0x2B, 0x2C, 0x2C, 0x2C, 0x2D, 0x2D, 0x2E, 0x2E,
        0x2E, 0x2F, 0x2F, 0x30, 0x30, 0x30, 0x31, 0x31,
};


// Table-driven perspective divide.
//
// Forms a 7-bit fixed-point quotient of |numerator| / |denominator|
// using seven shift-and-subtract steps, looks the quotient up in
// `table`, and applies the sign of the true quotient.
//
//   numerator, denominator  signed operands
//   table                   lookup table with at least 128 entries
//   returns                 * 0x7F when denominator is zero (the table is
//                             not consulted and no sign is applied)
//                           * otherwise the table entry, sign-extended
//                             from 8 bits; when the operands have
//                             different signs the entry is bitwise
//                             complemented first (one's complement, so a
//                             zero entry becomes -1)
//
// When |numerator| >= |denominator| the index saturates to 0x7F. Note
// that a zero numerator counts as "non-negative" for the sign test.
static int16_t perspectiveDivideTable(int16_t numerator, int16_t denominator, const uint8_t *table) {
        if (denominator == 0) {
                return 0x7F;
        }

        const bool numNegative = numerator < 0;
        const bool denNegative = denominator < 0;

        // Widen before negating so INT16_MIN has a representable magnitude.
        const int32_t magN = numNegative ? -(int32_t)numerator : (int32_t)numerator;
        const int32_t magD = denNegative ? -(int32_t)denominator : (int32_t)denominator;

        int quotient = 0x7F;
        if (magN < magD) {
                // Restoring division: each step yields one quotient bit,
                // most significant first.
                int32_t remainder = magN;
                quotient = 0;
                for (int step = 7; step > 0; step--) {
                        remainder *= 2;
                        quotient *= 2;
                        if (remainder >= magD) {
                                remainder -= magD;
                                quotient += 1;
                        }
                }
        }

        uint8_t entry = table[quotient & 0x7F];
        if (numNegative != denNegative) {
                entry = (uint8_t)(~entry);
        }
        return (int16_t)(int8_t)entry;
}


// Perspective divide for the Y axis: perspectiveDivideTable() bound to
// kPerspYTable.
static int16_t perspectiveDivide(int16_t numerator, int16_t denominator) {
        const int16_t offsetY = perspectiveDivideTable(numerator, denominator, kPerspYTable);
        return offsetY;
}


// Perspective divide for the X axis: perspectiveDivideTable() bound to
// kPerspXTable.
static int16_t perspectiveDivideX(int16_t numerator, int16_t denominator) {
        const int16_t offsetX = perspectiveDivideTable(numerator, denominator, kPerspXTable);
        return offsetX;
}


// Decode a little-endian signed 16-bit value from two consecutive bytes:
// p[0] is the low byte, p[1] the high byte.
static int16_t readSigned16Le(const uint8_t *p) {
        const uint16_t lo  = p[0];
        const uint16_t hi  = p[1];
        const uint16_t raw = (uint16_t)((hi << 8) | lo);
        return (int16_t)raw;
}


// Reset the whole pipeline to its power-on state: everything is zeroed,
// then the few fields with non-zero defaults are filled in.
void sceneryPipelineReset(SceneryPipelineT *pipe) {
        memset(pipe, 0, sizeof *pipe);

        // Per the porting notes, chunk5 L7EBC initializes $2F to $40.
        pipe->proj.zoomShift = 0x40;

        // Empty vertex pool.
        pipe->cur.poolCount = 0;

        // The polygon outcode is accumulated with AND (see
        // sceneryEmitPrimary), so it starts at the AND identity $FF.
        pipe->cur.polygonOutcode = 0xFF;
}


// Record the camera's world-space X/Z position. sceneryProjectXZ()
// subtracts these from each incoming vertex.
void sceneryPipelineSetCamera(SceneryPipelineT *pipe, int16_t worldX, int16_t worldZ) {
        pipe->proj.camZ = worldZ;
        pipe->proj.camX = worldX;
}


// Install the two 3-entry rotation rows used by sceneryProjectXZ():
// row1 is multiplied by the X delta, row2 by the Z delta. Exactly three
// bytes are copied from each source row.
void sceneryPipelineSetMatrix(SceneryPipelineT *pipe, const int8_t row1[3], const int8_t row2[3]) {
        enum { ROW_BYTES = 3 };

        memcpy(pipe->proj.matRow2, row2, ROW_BYTES);
        memcpy(pipe->proj.matRow1, row1, ROW_BYTES);
}


// Set the base contribution that seeds the three running accumulators
// in sceneryProjectXZ() before the matrix terms are added.
void sceneryPipelineSetBase(SceneryPipelineT *pipe, int16_t bx, int16_t by, int16_t bz) {
        pipe->proj.baseZ = bz;
        pipe->proj.baseY = by;
        pipe->proj.baseX = bx;
}


// Stream front-end for the L7EBC port.
//
// Decodes two little-endian signed 16-bit world coordinates (X at
// streamPlus1[0..1], Z at streamPlus1[2..3]) and hands them to
// sceneryProjectXZ(), which does the actual work: camera subtraction,
// auto-scaling, the 2x3 rotation-matrix multiply, and adding the
// section-base contribution.
//
//   pipe         pipeline state (camera, matrix, base)
//   streamPlus1  pointer to the four coordinate bytes
//   outSlot      receives the projected vertex and its outcode
//   returns      number of stream bytes consumed (always 4)
//
// Per the porting notes, chunk5 reads the low byte first ($8B,Y), hence
// the little-endian decode.
int sceneryProjectStreamVertex(SceneryPipelineT *pipe, const uint8_t *streamPlus1, SceneryVertexT *outSlot) {
        const int16_t worldX = readSigned16Le(&streamPlus1[0]);
        const int16_t worldZ = readSigned16Le(&streamPlus1[2]);

        sceneryProjectXZ(pipe, worldX, worldZ, outSlot);

        return 4;
}


// Auto-scale threshold test (port of chunk5's 8-bit `adc #$40 / bmi`).
//
// Returns nonzero when the high byte (bits 8..15) of `v` lies in
// [0x40, 0xBF]: adding 0x40 with 8-bit wrap-around then has bit 7 set.
// The byte is extracted with unsigned arithmetic so negative inputs do
// not depend on implementation-defined right-shift behaviour.
static int hiByteReachedScaleLimit(int32_t v) {
        const uint8_t hi = (uint8_t)((uint32_t)v >> 8);
        return ((uint8_t)(hi + 0x40u) & 0x80u) != 0;
}


// Clamp a 32-bit value into the signed 16-bit range.
static int32_t clampToInt16(int32_t v) {
        if (v > 0x7FFF) {
                return 0x7FFF;
        }
        if (v < -0x8000) {
                return -0x8000;
        }
        return v;
}


// Project one world-space (X, Z) point into camera space (L7EBC port).
//
// Steps:
//   1. Subtract the camera position and saturate the deltas to int16.
//   2. Seed three accumulators from the section base.
//   3. Auto-scale: double the deltas and accumulators in lockstep until
//      the high byte of any of them reaches the $40 threshold (or the
//      step counter runs out).
//   4. Add the 2x3 rotation-matrix products of the deltas' high bytes.
//   5. Saturate to int16, store into `outSlot`, classify the vertex, and
//      mirror the accumulators into pipe->cur.
//
//   pipe             pipeline state; pipe->proj is read, pipe->cur.acc*
//                    is written. pipe->proj.zoomShift is NOT modified.
//   worldX, worldZ   world-space coordinates
//   outSlot          receives x/y/z and the outcode
void sceneryProjectXZ(SceneryPipelineT *pipe, int16_t worldX, int16_t worldZ, SceneryVertexT *outSlot) {
        // Camera-relative delta, computed in int32 so the subtraction
        // cannot overflow, then saturated back to the int16 range. Per
        // the porting notes chunk5 takes a slow path (L7F64/L7EAD) on
        // overflow, which visually just clips far points.
        int32_t dx = clampToInt16((int32_t)worldX - pipe->proj.camX);
        int32_t dz = clampToInt16((int32_t)worldZ - pipe->proj.camZ);

        // Running accumulators start at the section-base contribution.
        int32_t accX = (int32_t)pipe->proj.baseX;
        int32_t accY = (int32_t)pipe->proj.baseY;
        int32_t accZ = (int32_t)pipe->proj.baseZ;

        // Auto-scale (L7F1A). The step counter starts at the configured
        // zoomShift and counts UP to 0xFF; it only bounds the number of
        // doublings (relevant when every value is zero) and is never
        // written back to the pipeline.
        //
        // Doubling is written as `*= 2` rather than `<<= 1`: left-shifting
        // a negative signed value is undefined behaviour in C. The loop
        // exits before any value whose high byte is below the threshold
        // can leave the int16 range, so the doubling cannot overflow for
        // inputs that start within int16.
        uint8_t zoom = pipe->proj.zoomShift;
        while (zoom < 0xFF) {
                if (hiByteReachedScaleLimit(dx) ||
                    hiByteReachedScaleLimit(dz) ||
                    hiByteReachedScaleLimit(accX) ||
                    hiByteReachedScaleLimit(accY) ||
                    hiByteReachedScaleLimit(accZ)) {
                        break;
                }
                dx   *= 2;
                dz   *= 2;
                accX *= 2;
                accY *= 2;
                accZ *= 2;
                zoom++;
        }

        // Apply the 2x3 rotation matrix: six multiplies, each taking the
        // high byte of a delta (as int8) and one matrix entry (int8).
        const int8_t hxFinal = (int8_t)(uint8_t)((uint32_t)dx >> 8);
        const int8_t hzFinal = (int8_t)(uint8_t)((uint32_t)dz >> 8);

        accX += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[0]);
        accX += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[0]);
        accY += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[1]);
        accY += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[1]);
        accZ += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[2]);
        accZ += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[2]);

        // Saturate back to int16, mirroring chunk5's 16-bit accumulators.
        accX = clampToInt16(accX);
        accY = clampToInt16(accY);
        accZ = clampToInt16(accZ);

        outSlot->x       = (int16_t)accX;
        outSlot->y       = (int16_t)accY;
        outSlot->z       = (int16_t)accZ;
        outSlot->outcode = sceneryClassifyVertex(outSlot);

        // Keep a copy of the final accumulators in the pipeline state.
        // (Per the porting notes chunk5 saves/restores $2F around the
        // auto-scale via $08EE on the caller's side; nothing needs
        // restoring here because the zoom counter above is a local.)
        pipe->cur.accX = (int16_t)accX;
        pipe->cur.accY = (int16_t)accY;
        pipe->cur.accZ = (int16_t)accZ;
}


// ClassifyVertex2 port (chunk5 line 2673): compute the frustum outcode
// of a camera-space vertex.
//
// Five half-space tests are performed on the signed components, each
// setting one bit when the vertex is on the outside:
//   z < 0       -> SCENERY_OUTCODE_BEHIND
//   x + z < 0   -> SCENERY_OUTCODE_RIGHT
//   z - x < 0   -> SCENERY_OUTCODE_LEFT
//   y + z < 0   -> SCENERY_OUTCODE_BOTTOM
//   z - y < 0   -> SCENERY_OUTCODE_TOP
//
// Sums and differences are formed in 32 bits so they cannot overflow.
// Returns 0 when the vertex passes every test.
uint8_t sceneryClassifyVertex(const SceneryVertexT *v) {
        const int32_t x = (int32_t)v->x;
        const int32_t y = (int32_t)v->y;
        const int32_t z = (int32_t)v->z;

        uint8_t code = 0;

        if (v->z < 0)  code |= SCENERY_OUTCODE_BEHIND;
        if (x + z < 0) code |= SCENERY_OUTCODE_RIGHT;
        if (z - x < 0) code |= SCENERY_OUTCODE_LEFT;
        if (y + z < 0) code |= SCENERY_OUTCODE_BOTTOM;
        if (z - y < 0) code |= SCENERY_OUTCODE_TOP;

        return code;
}


// ProjectV2ToScreen port (chunk5 line 3759): perspective-project a
// camera-space vertex to screen coordinates.
//
//   v       camera-space vertex
//   outX    receives the screen X coordinate
//   outY    receives the screen Y coordinate
//   returns false (outputs untouched) when v->z <= 0, true otherwise
//
// The table-based perspective divide yields a signed 8-bit offset per
// axis, which is biased the way chunk5's ProjectVertex does it:
//   color column = $46 + offsetX   (doubled here to get native pixels)
//   row          = $31 - offsetY
// Because both offsets are limited to the int8 range, the results are
// always well inside int16 and need no clamping.
bool sceneryProjectVertexToScreen(const SceneryVertexT *v, int16_t *outX, int16_t *outY) {
        if (v->z <= 0) {
                return false;
        }

        const int8_t offsetX = (int8_t)perspectiveDivideX(v->x, v->z);
        const int8_t offsetY = (int8_t)perspectiveDivide(v->y, v->z);

        const int32_t column = 0x46 + (int32_t)offsetX;
        const int32_t row    = 0x31 - (int32_t)offsetY;

        *outX = (int16_t)(column * 2);
        *outY = (int16_t)row;
        return true;
}


// EmitPrimaryVertex port (chunk5 L6919).
//
// Appends a copy of `slot` to the vertex pool and ANDs its outcode into
// the running polygon outcode. When the pool already holds
// SCENERY_VERTEX_POOL_CAP entries the call is a no-op: the vertex is
// dropped and the polygon outcode is left alone. (Per the porting
// notes, this mirrors chunk5's `cpy #$3C / bcs L6843` guard.)
void sceneryEmitPrimary(SceneryPipelineT *pipe, const SceneryVertexT *slot) {
        if (pipe->cur.poolCount < SCENERY_VERTEX_POOL_CAP) {
                pipe->pool[pipe->cur.poolCount] = *slot;
                pipe->cur.poolCount++;
                pipe->cur.polygonOutcode &= slot->outcode;
        }
}


// ============================================================
// 4-pass Sutherland-Hodgman 3D frustum clipper.
//
// Mirrors chunk5's PolygonScanFillSetup -> PolygonClipTopPass ->
// PolygonClipRightPass -> PolygonClipBottomPass cascade at
// src/chunk5.s:2884+.  Operates on camera-space (post-TransformVertex,
// pre-PerspectiveDivide) XYZ vertices.
//
// Per-plane test: a vertex is "inside" the plane if the half-space
// equation is non-negative.  The four planes are:
//   Left:   Z - X >= 0
//   Top:    Z - Y >= 0
//   Right:  X + Z >= 0
//   Bottom: Y + Z >= 0
//
// Intersection of edge V0 -> V1 with a plane uses similar triangles:
// solve for fraction `t` along the edge where the plane equation
// crosses zero, then linearly interpolate all components.
//
// Chunk5's `ClipVertex2ToLeft/Top/Right/Bottom` does this with
// integer math + overflow recovery (HalveBothVertices on V-flag).
// We use int32 intermediates here -- chunk5's halving was a 6502
// space-saving for the multiply; on modern CPUs we have the bits.

// Signature of a clip-plane evaluator: returns the signed half-space
// value of `v`; a result >= 0 means the vertex is inside that plane.
typedef int (*PlaneFn)(const SceneryVertexT *v);

// Left plane: inside when Z - X >= 0.
static int planeLeft(const SceneryVertexT *v) {
        const int depth = v->z;
        return depth - v->x;
}

// Top plane: inside when Z - Y >= 0.
static int planeTop(const SceneryVertexT *v) {
        const int depth = v->z;
        return depth - v->y;
}

// Right plane: inside when Z + X >= 0.
static int planeRight(const SceneryVertexT *v) {
        const int depth = v->z;
        return depth + v->x;
}

// Bottom plane: inside when Z + Y >= 0.
static int planeBottom(const SceneryVertexT *v) {
        const int depth = v->z;
        return depth + v->y;
}


// Build the vertex where edge V0 -> V1 crosses a clip plane.
//
//   v0, v1    edge endpoints
//   pe0, pe1  plane-equation values at v0 and v1; the caller only calls
//             this when the endpoints are on opposite sides
//   returns   a vertex with x/y/z interpolated at the crossing point and
//             outcode set to 0 (only those four fields are written)
//
// The crossing fraction pe0 / (pe0 - pe1) is kept in Q12 fixed point
// (0..4096) so no floating point is needed, and is clamped to that
// range to absorb rounding error.
static SceneryVertexT clipIntersect(const SceneryVertexT *v0,
                                    const SceneryVertexT *v1,
                                    int pe0, int pe1) {
        // A zero span cannot occur for a genuine crossing; substituting 1
        // merely avoids a divide-by-zero.
        int span = pe0 - pe1;
        if (span == 0) {
                span = 1;
        }

        int frac = (int)(((int64_t)pe0 * 4096) / span);
        if (frac < 0) {
                frac = 0;
        } else if (frac > 4096) {
                frac = 4096;
        }

        const int32_t edgeX = (int32_t)(v1->x - v0->x);
        const int32_t edgeY = (int32_t)(v1->y - v0->y);
        const int32_t edgeZ = (int32_t)(v1->z - v0->z);

        SceneryVertexT hit;
        hit.x = (int16_t)(v0->x + ((edgeX * frac) >> 12));
        hit.y = (int16_t)(v0->y + ((edgeY * frac) >> 12));
        hit.z = (int16_t)(v0->z + ((edgeZ * frac) >> 12));
        // The new vertex is treated as inside; it is not re-classified
        // here.
        hit.outcode = 0;
        return hit;
}


// One Sutherland-Hodgman pass against a single plane.
//
//   out      destination buffer (must not be the same buffer as `in`)
//   in       source polygon, `n` vertices in order
//   n        number of source vertices
//   cap      capacity of `out`; vertices beyond it are silently dropped
//   planeFn  plane evaluator; >= 0 means inside
//   returns  number of vertices written to `out`
//
// Each edge (previous -> current, starting with last -> first) emits the
// crossing point when its endpoints are on opposite sides, followed by
// the current vertex when that vertex is inside.
static int clipPass(SceneryVertexT *out, const SceneryVertexT *in, int n, int cap, PlaneFn planeFn) {
        if (n == 0) {
                return 0;
        }

        int written = 0;
        const SceneryVertexT *tail = &in[n - 1];
        int tailPE = planeFn(tail);

        for (int k = 0; k < n; k++) {
                const SceneryVertexT *head = &in[k];
                const int headPE = planeFn(head);
                const bool headInside = headPE >= 0;
                const bool crossing   = (tailPE >= 0) != headInside;

                if (crossing && written < cap) {
                        out[written++] = clipIntersect(tail, head, tailPE, headPE);
                }
                if (headInside && written < cap) {
                        out[written++] = *head;
                }

                tail   = head;
                tailPE = headPE;
        }
        return written;
}


// Clip a polygon against the four side planes of the view frustum, in
// the order left, top, right, bottom.
//
// The two buffers are used ping-pong style: pass 1 reads `in` and writes
// `out`, pass 2 reads `out` and writes `in`, and so on. Both buffers are
// therefore overwritten, and both must hold at least `cap` vertices.
//
//   in, out   vertex buffers; `in` holds the source polygon on entry
//   inCount   number of source vertices
//   cap       per-buffer capacity passed to every pass
//   outIsIn   optional (may be NULL); set to true when the buffer
//             written most recently is `in`, false when it is `out`
//   returns   the resulting vertex count
//
// Special cases:
//   * inCount < 3: nothing is clipped; returns inCount with
//     *outIsIn = true.
//   * fewer than 3 vertices survive any of the first three passes:
//     returns 0 immediately.
//   * the count after the final pass is returned as-is, even if it is
//     below 3.
int sceneryClipPolygon3D(SceneryVertexT *in, SceneryVertexT *out, int inCount, int cap, bool *outIsIn) {
        static const PlaneFn kPassOrder[4] = { planeLeft, planeTop, planeRight, planeBottom };
        enum { LAST_PASS = 3 };

        int  count        = inCount;
        bool resultInIn   = true;

        if (inCount >= 3) {
                SceneryVertexT *src = in;
                SceneryVertexT *dst = out;

                for (int pass = 0; pass <= LAST_PASS; pass++) {
                        count = clipPass(dst, src, count, cap, kPassOrder[pass]);

                        // Odd-numbered passes (top, bottom) write into `in`.
                        resultInIn = (pass & 1) != 0;

                        if (pass < LAST_PASS && count < 3) {
                                count = 0;
                                break;
                        }

                        SceneryVertexT *swap = src;
                        src = dst;
                        dst = swap;
                }
        }

        if (outIsIn) {
                *outIsIn = resultInIn;
        }
        return count;
}