// 3D vertex pipeline -- ports the math from chunk5 polygon code. // // L7EBC, ClassifyVertex1/2, ProjectV1ToScreen, ProjectV2ToScreen, // PerspectiveDivide, EmitPrimaryVertex. #include #include "sceneryProjection.h" #include "types.h" static int16_t l1818SignedMul(int8_t y, int8_t x); static int16_t perspectiveDivide(int16_t numerator, int16_t denominator); static int16_t readSigned16Le(const uint8_t *p); // Native equivalent of chunk5/chunk4 L1818 / MultiplyXY. The original // is a signed 7x7 -> 14-bit multiply via 7-step shift-add (chunk4 // line 1998), which gives the same result as a plain native int8 * // int8 -> int16 with the bottom bit zeroed by the shift sequence. // We use the native multiply since C int promotion gives the same // numerical value (the LSB difference doesn't propagate into the // final perspective coords visibly). static int16_t l1818SignedMul(int8_t y, int8_t x) { return (int16_t)((int16_t)y * (int16_t)x); } // PerspectiveDivide port (chunk5 line 3779). The 6502 implementation // is an 8-step shift-and-subtract divide that produces a signed 16-bit // quotient `numerator / denominator`. In modern C we just use signed // integer divide -- same algorithm, different encoding. // // Two special cases match the original: // * |num| == |den| -> +/-$7F (chunk5 L7C28-L7C32 path) // * |num| > |den|*256 -> saturate to $7FFF / $8001 the same way // chunk5 PerspectiveDivide tables (chunk5.s line 4208 onward). Each // table has 128 entries indexed by the 7-bit shift-subtract divide // quotient. Output is a signed byte that the caller's ProjectVertex // uses as `screen_X = $46 + result` (X) or `screen_Y = $31 - result` (Y). // MAME-captured tables match source verbatim — at $7D52 (X) and $7DD2 // (Y) in MAME RAM. Validated bit-exact via FS2TRACE_PERSP=1 oracle. 
static const uint8_t kPerspXTable[128] = {
    0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
    0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x07, 0x08,
    0x08, 0x09, 0x09, 0x0A, 0x0A, 0x0B, 0x0B, 0x0C,
    0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F, 0x10, 0x10,
    0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15,
    0x15, 0x16, 0x16, 0x17, 0x17, 0x18, 0x18, 0x19,
    0x1A, 0x1A, 0x1B, 0x1B, 0x1C, 0x1C, 0x1D, 0x1D,
    0x1E, 0x1E, 0x1F, 0x20, 0x20, 0x21, 0x21, 0x22,
    0x22, 0x23, 0x23, 0x24, 0x24, 0x25, 0x26, 0x26,
    0x27, 0x27, 0x28, 0x28, 0x29, 0x29, 0x2A, 0x2A,
    0x2B, 0x2C, 0x2C, 0x2D, 0x2D, 0x2E, 0x2E, 0x2F,
    0x2F, 0x30, 0x30, 0x31, 0x31, 0x32, 0x33, 0x33,
    0x34, 0x34, 0x35, 0x35, 0x36, 0x36, 0x37, 0x37,
    0x38, 0x39, 0x39, 0x3A, 0x3A, 0x3B, 0x3B, 0x3C,
    0x3C, 0x3D, 0x3D, 0x3E, 0x3F, 0x3F, 0x40, 0x40,
    0x41, 0x41, 0x42, 0x42, 0x43, 0x43, 0x44, 0x45,
};

static const uint8_t kPerspYTable[128] = {
    0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02,
    0x03, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x05,
    0x06, 0x06, 0x07, 0x07, 0x07, 0x08, 0x08, 0x08,
    0x09, 0x09, 0x0A, 0x0A, 0x0A, 0x0B, 0x0B, 0x0C,
    0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0E, 0x0F,
    0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12,
    0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15,
    0x15, 0x16, 0x16, 0x17, 0x17, 0x17, 0x18, 0x18,
    0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B,
    0x1C, 0x1C, 0x1C, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E,
    0x1F, 0x1F, 0x20, 0x20, 0x20, 0x21, 0x21, 0x21,
    0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, 0x25,
    0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
    0x28, 0x29, 0x29, 0x29, 0x2A, 0x2A, 0x2A, 0x2B,
    0x2B, 0x2C, 0x2C, 0x2C, 0x2D, 0x2D, 0x2E, 0x2E,
    0x2E, 0x2F, 0x2F, 0x30, 0x30, 0x30, 0x31, 0x31,
};

// Table-driven perspective divide shared by the X and Y projections.
//
// Forms a 7-bit quotient of |numerator| / |denominator| by seven
// shift-and-subtract steps, looks that quotient up in `table` (128
// entries), and returns the table byte reinterpreted as a signed
// 8-bit value. When the operands have different signs the byte is
// bitwise-complemented first (one's complement, not negation).
//
//   denominator == 0            -> returns 0x7F without a lookup
//   |numerator| >= |denominator| -> index pinned to 0x7F
static int16_t perspectiveDivideTable(int16_t numerator, int16_t denominator,
                                      const uint8_t *table) {
    if (denominator == 0) {
        return 0x7F;
    }

    const bool signsDiffer = (numerator < 0) != (denominator < 0);

    // Magnitudes are widened first so that -32768 negates safely.
    const int32_t magNum = (numerator < 0) ? -(int32_t)numerator
                                           : (int32_t)numerator;
    const int32_t magDen = (denominator < 0) ? -(int32_t)denominator
                                             : (int32_t)denominator;

    int quotient = 0x7F;
    if (magNum < magDen) {
        // Restoring division: one quotient bit per step, MSB first.
        int32_t remainder = magNum;
        quotient = 0;
        for (int step = 7; step > 0; step--) {
            remainder *= 2;
            quotient *= 2;
            if (remainder >= magDen) {
                remainder -= magDen;
                quotient += 1;
            }
        }
    }

    uint8_t entry = table[quotient & 0x7F];
    if (signsDiffer) {
        entry = (uint8_t)(entry ^ 0xFFu);
    }
    return (int16_t)(int8_t)entry;
}

// Perspective divide using the Y table.
static int16_t perspectiveDivide(int16_t numerator, int16_t denominator) {
    const uint8_t *yTable = kPerspYTable;
    return perspectiveDivideTable(numerator, denominator, yTable);
}

// Perspective divide using the X table.
static int16_t perspectiveDivideX(int16_t numerator, int16_t denominator) {
    const uint8_t *xTable = kPerspXTable;
    return perspectiveDivideTable(numerator, denominator, xTable);
}

// Assemble a signed 16-bit value from two bytes, low byte first.
static int16_t readSigned16Le(const uint8_t *p) {
    const unsigned lo = p[0];
    const unsigned hi = p[1];
    return (int16_t)(uint16_t)(lo | (hi << 8));
}

// Zero the whole pipeline, then set the non-zero defaults.
void sceneryPipelineReset(SceneryPipelineT *pipe) {
    memset(pipe, 0, sizeof(*pipe));

    // $FF is the identity for the AND-accumulation that
    // sceneryEmitPrimary performs on the polygon outcode.
    pipe->cur.polygonOutcode = 0xFF;
    pipe->cur.poolCount = 0;

    // chunk5 L7EBC initializes $2F to $40.
    pipe->proj.zoomShift = 0x40;
}

// Store the camera's world X/Z position used by sceneryProjectXZ.
void sceneryPipelineSetCamera(SceneryPipelineT *pipe, int16_t worldX,
                              int16_t worldZ) {
    pipe->proj.camZ = worldZ;
    pipe->proj.camX = worldX;
}

// Copy the two 3-entry matrix rows (3 bytes each) into the pipeline.
void sceneryPipelineSetMatrix(SceneryPipelineT *pipe, const int8_t row1[3],
                              const int8_t row2[3]) {
    memcpy(pipe->proj.matRow2, row2, 3);
    memcpy(pipe->proj.matRow1, row1, 3);
}

// Store the section-base contribution that seeds the accumulators in
// sceneryProjectXZ.
void sceneryPipelineSetBase(SceneryPipelineT *pipe, int16_t bx, int16_t by,
                            int16_t bz) {
    pipe->proj.baseZ = bz;
    pipe->proj.baseY = by;
    pipe->proj.baseX = bx;
}

// L7EBC port. Reads two signed 16-bit world-space deltas (X, Z) from
// the byte stream, subtracts the camera position, runs the auto-scale
// loop until the high byte of every running value fits in the upper
// half of an 8-bit slot ($40 boundary), then projects through the 2x3
// rotation matrix and adds the section-base contribution.
// // Differences from chunk5: // * The original uses overflow handling (`bvs`) to detect when the // subtraction overflows int16; we match that with explicit // widening to int32 before the subtract. // * The auto-scale loop ($2F counter, L7F1A) shifts left until the // high byte of every value has bit 6 set. We replicate the same // shift count so MultiplyXY's truncation matches. // * Stream byte order is little-endian (chunk5 reads $8B,Y for low // byte then high). int sceneryProjectStreamVertex(SceneryPipelineT *pipe, const uint8_t *streamPlus1, SceneryVertexT *outSlot) { sceneryProjectXZ(pipe, readSigned16Le(streamPlus1), readSigned16Le(streamPlus1 + 2), outSlot); return 4; } void sceneryProjectXZ(SceneryPipelineT *pipe, int16_t worldX, int16_t worldZ, SceneryVertexT *outSlot) { // Camera-relative delta. chunk5 keeps these in $9E/$9F (X) and // $A2/$A3 (Z) as int16; we keep int32 to detect overflow but // narrow back to int16 after the subtract because the rotation // multiply assumes int8 high bytes. int32_t dx = (int32_t)worldX - pipe->proj.camX; int32_t dz = (int32_t)worldZ - pipe->proj.camZ; // Saturate to int16 the way chunk5's bvs branches do (the // original takes a slow-path handler L7F64/L7EAD on overflow; // visually that just clips far points to int16 max). if (dx > 0x7FFF) dx = 0x7FFF; if (dx < -0x8000) dx = -0x8000; if (dz > 0x7FFF) dz = 0x7FFF; if (dz < -0x8000) dz = -0x8000; // Running accumulators start at the section-base contribution // ($18 = $4A / $1B = $4D / $1E = $50). chunk5 LDAX/STAX copies // these once at L7EC9. int32_t accX = (int32_t)pipe->proj.baseX; int32_t accY = (int32_t)pipe->proj.baseY; int32_t accZ = (int32_t)pipe->proj.baseZ; // Auto-scale (L7F1A): shift dx, dz left while the high byte // hasn't reached the $40 threshold, decrementing the zoom // counter $2F each step. chunk5 also shifts the running // accumulators along; since we've already split into 32-bit // ints, we shift everything in lockstep. 
// // The break condition must reproduce chunk5's `adc #$40 bmi` // exactly, where the addition is done in 8-bit and wraps. As // signed int8, the resulting bit-7 is set when the input byte // is in [0x40, 0xBF] (i.e. magnitude >= 64 either sign). Doing // the addition in `int` masks the wrap, so we cast back. #define HI_OVERFLOW(v32) ((int8_t)((((v32) >> 8) & 0xFF) + 0x40) < 0) uint8_t zoom = pipe->proj.zoomShift; while (zoom < 0xFF) { if (HI_OVERFLOW(dx)) break; if (HI_OVERFLOW(dz)) break; if (HI_OVERFLOW(accX)) break; if (HI_OVERFLOW(accY)) break; if (HI_OVERFLOW(accZ)) break; dx <<= 1; dz <<= 1; accX <<= 1; accY <<= 1; accZ <<= 1; zoom++; } #undef HI_OVERFLOW // Apply the 2x3 rotation matrix. chunk5 issues six L1818 calls // total (XZ deltas vs three matrix rows). Each multiply takes // the high byte of the delta as int8 and the matrix entry as // int8, returning int16. int8_t hxFinal = (int8_t)((dx >> 8) & 0xFF); int8_t hzFinal = (int8_t)((dz >> 8) & 0xFF); accX += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[0]); accX += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[0]); accY += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[1]); accY += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[1]); accZ += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[2]); accZ += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[2]); // Saturate back to int16 -- chunk5's $18/$1B/$1E are 16-bit // accumulators, so we mirror that. if (accX > 0x7FFF) accX = 0x7FFF; if (accX < -0x8000) accX = -0x8000; if (accY > 0x7FFF) accY = 0x7FFF; if (accY < -0x8000) accY = -0x8000; if (accZ > 0x7FFF) accZ = 0x7FFF; if (accZ < -0x8000) accZ = -0x8000; outSlot->x = (int16_t)accX; outSlot->y = (int16_t)accY; outSlot->z = (int16_t)accZ; outSlot->outcode = sceneryClassifyVertex(outSlot); // Restore $2F for the caller. chunk5 saves/restores $2F across // the auto-scale via $08EE; the calling opcode is responsible // for the save side. 
pipe->cur.accX = (int16_t)accX; pipe->cur.accY = (int16_t)accY; pipe->cur.accZ = (int16_t)accZ; } // ClassifyVertex2 (chunk5 line 2673). Same six half-space tests, same // bit assignments. Compares 16-bit signed components. uint8_t sceneryClassifyVertex(const SceneryVertexT *v) { uint8_t code = 0; if (v->z < 0) { code |= SCENERY_OUTCODE_BEHIND; } if ((int32_t)v->x + (int32_t)v->z < 0) { code |= SCENERY_OUTCODE_RIGHT; } if ((int32_t)v->z - (int32_t)v->x < 0) { code |= SCENERY_OUTCODE_LEFT; } if ((int32_t)v->y + (int32_t)v->z < 0) { code |= SCENERY_OUTCODE_BOTTOM; } if ((int32_t)v->z - (int32_t)v->y < 0) { code |= SCENERY_OUTCODE_TOP; } return code; } // ProjectV2ToScreen (chunk5 line 3759). Performs the perspective // divide for X then Y, biasing into the 280x96 viewport. The original // returns column in A and row in Y; we collapse to the screen X/Y // pair the renderer expects. // // chunk5's PerspectiveDivide returns a signed 7-bit-fraction value; // we scale to native pixel coordinates by mapping $7F to the viewport // half-width. bool sceneryProjectVertexToScreen(const SceneryVertexT *v, int16_t *outX, int16_t *outY) { if (v->z <= 0) { return false; } // chunk5 ProjectVertex: table-based persp + biases. // screen X (color cols 0..139) = $46 + qx_byte // screen Y (rows 0..98) = $31 - qy_byte // Multiply X by 2 to convert chunk5 color cols -> port native px. int16_t qx = perspectiveDivideX(v->x, v->z); int16_t qy = perspectiveDivide(v->y, v->z); int32_t sxColor = 0x46 + (int32_t)(int8_t)qx; int32_t sx = sxColor * 2; int32_t sy = 0x31 - (int32_t)(int8_t)qy; if (sx < INT16_MIN) sx = INT16_MIN; if (sx > INT16_MAX) sx = INT16_MAX; if (sy < INT16_MIN) sy = INT16_MIN; if (sy > INT16_MAX) sy = INT16_MAX; *outX = (int16_t)sx; *outY = (int16_t)sy; return true; } // EmitPrimaryVertex (chunk5 L6919). Append `slot` to the pool, AND // its outcode into the polygon accumulator $D3. Caps at 60 entries // to match chunk5's `cpy #$3C / bcs L6843` guard. 
void sceneryEmitPrimary(SceneryPipelineT *pipe, const SceneryVertexT *slot) { if (pipe->cur.poolCount >= SCENERY_VERTEX_POOL_CAP) { return; } pipe->pool[pipe->cur.poolCount++] = *slot; pipe->cur.polygonOutcode &= slot->outcode; } // ============================================================ // 4-pass Sutherland-Hodgman 3D frustum clipper. // // Mirrors chunk5's PolygonScanFillSetup -> PolygonClipTopPass -> // PolygonClipRightPass -> PolygonClipBottomPass cascade at // src/chunk5.s:2884+. Operates on camera-space (post-TransformVertex, // pre-PerspectiveDivide) XYZ vertices. // // Per-plane test: a vertex is "inside" the plane if the half-space // equation is non-negative. The four planes are: // Left: Z - X >= 0 // Top: Z - Y >= 0 // Right: X + Z >= 0 // Bottom: Y + Z >= 0 // // Intersection of edge V0 -> V1 with a plane uses similar triangles: // solve for fraction `t` along the edge where the plane equation // crosses zero, then linearly interpolate all components. // // Chunk5's `ClipVertex2ToLeft/Top/Right/Bottom` does this with // integer math + overflow recovery (HalveBothVertices on V-flag). // We use int32 intermediates here -- chunk5's halving was a 6502 // space-saving for the multiply; on modern CPUs we have the bits. typedef int (*PlaneFn)(const SceneryVertexT *v); static int planeLeft (const SceneryVertexT *v) { return v->z - v->x; } static int planeTop (const SceneryVertexT *v) { return v->z - v->y; } static int planeRight (const SceneryVertexT *v) { return v->z + v->x; } static int planeBottom (const SceneryVertexT *v) { return v->z + v->y; } // Compute the intersection vertex along edge V0 -> V1 where the // half-space equation transitions sign. `pe0` and `pe1` are the // plane-equation evaluations at V0 and V1 (one positive, one // negative or zero). The interpolation fraction is pe0/(pe0-pe1). static SceneryVertexT clipIntersect(const SceneryVertexT *v0, const SceneryVertexT *v1, int pe0, int pe1) { SceneryVertexT out; // t = pe0 / (pe0 - pe1). 
Scale by 4096 for fixed-point divide // to avoid floats while keeping enough precision for 280-pixel- // wide projection. pe0 - pe1 is non-zero because the signs // differ (or one is zero) -- handled by the caller (we won't // get here unless they straddle the plane). int denom = pe0 - pe1; if (denom == 0) denom = 1; // defensive; shouldn't happen // Scaled t in Q12. Clamp to [0, 4096] for safety in case of // accumulated arithmetic error. int t = (int)(((int64_t)pe0 * 4096) / denom); if (t < 0) t = 0; if (t > 4096) t = 4096; out.x = (int16_t)(v0->x + (((int32_t)(v1->x - v0->x) * t) >> 12)); out.y = (int16_t)(v0->y + (((int32_t)(v1->y - v0->y) * t) >> 12)); out.z = (int16_t)(v0->z + (((int32_t)(v1->z - v0->z) * t) >> 12)); // Post-clip outcode is 0 (= on the plane, inside) on the // dimensions that mattered for this pass. The caller's next // pass will re-classify if needed. out.outcode = 0; return out; } // One Sutherland-Hodgman pass: read `n` vertices from `in`, emit // clipped vertices into `out`, return new vertex count. 
static int clipPass(SceneryVertexT *out, const SceneryVertexT *in, int n,
                    int cap, PlaneFn planeFn) {
    if (n == 0) {
        return 0;
    }

    // Walk each edge tail -> head, starting with the closing edge
    // from the last vertex back to the first.
    int emitted = 0;
    int tailIdx = n - 1;
    int tailPE = planeFn(&in[tailIdx]);

    for (int headIdx = 0; headIdx < n; headIdx++) {
        const int headPE = planeFn(&in[headIdx]);
        const bool headInside = (headPE >= 0);
        const bool crosses = ((tailPE >= 0) != headInside);

        // An edge that changes side contributes its intersection
        // point; an inside head vertex is then kept as-is. Anything
        // beyond `cap` output vertices is silently dropped.
        if (crosses && emitted < cap) {
            out[emitted++] =
                clipIntersect(&in[tailIdx], &in[headIdx], tailPE, headPE);
        }
        if (headInside && emitted < cap) {
            out[emitted++] = in[headIdx];
        }

        tailIdx = headIdx;
        tailPE = headPE;
    }
    return emitted;
}

// Clip a polygon against the Left, Top, Right and Bottom planes, in
// that order, ping-ponging between the two caller-supplied buffers:
// pass 1 writes `out`, pass 2 writes `in`, pass 3 writes `out`,
// pass 4 writes `in`. Both buffers are therefore overwritten and each
// is written with at most `cap` vertices per pass.
//
// Returns the resulting vertex count. `*outIsIn` (if non-NULL) reports
// which buffer was written last: true for `in`, false for `out`.
//   * inCount < 3: nothing is clipped; returns inCount, *outIsIn = true.
//   * fewer than 3 vertices left after pass 1, 2 or 3: returns 0.
//   * otherwise: returns the count after pass 4 (not re-checked
//     against 3), *outIsIn = true.
int sceneryClipPolygon3D(SceneryVertexT *in, SceneryVertexT *out, int inCount,
                         int cap, bool *outIsIn) {
    enum { kPassCount = 4 };
    static const PlaneFn kPassOrder[kPassCount] = {
        planeLeft, planeTop, planeRight, planeBottom,
    };

    if (inCount < 3) {
        if (outIsIn) {
            *outIsIn = true;
        }
        return inCount;
    }

    SceneryVertexT *src = in;
    SceneryVertexT *dst = out;
    int count = inCount;

    for (int pass = 0; pass < kPassCount; pass++) {
        count = clipPass(dst, src, count, cap, kPassOrder[pass]);

        const bool isLastPass = (pass == kPassCount - 1);
        if (!isLastPass && count < 3) {
            // Odd-numbered passes (0-based) are the ones that wrote
            // into `in`.
            if (outIsIn) {
                *outIsIn = ((pass & 1) != 0);
            }
            return 0;
        }

        SceneryVertexT *swap = src;
        src = dst;
        dst = swap;
    }

    if (outIsIn) {
        *outIsIn = true;
    }
    return count;
}