fs2port/port/src/sceneryProjection.c
2026-05-13 21:32:05 -05:00

441 lines
18 KiB
C

// 3D vertex pipeline -- ports the math from chunk5 polygon code.
//
// L7EBC, ClassifyVertex1/2, ProjectV1ToScreen, ProjectV2ToScreen,
// PerspectiveDivide, EmitPrimaryVertex.
#include <string.h>
#include "sceneryProjection.h"
#include "types.h"
// Forward declarations for file-local helpers that are defined further
// down in this translation unit.
static int16_t l1818SignedMul(int8_t y, int8_t x);
static int16_t perspectiveDivide(int16_t numerator, int16_t denominator);
static int16_t readSigned16Le(const uint8_t *p);
// Port of chunk5/chunk4 L1818 (MultiplyXY).
//
// According to the port notes, the 6502 routine is a signed 7x7 ->
// 14-bit multiply built from a 7-step shift-add sequence (chunk4 line
// 1998) and may differ from the exact product in the least significant
// bit. This port deliberately returns the exact native product; the
// notes state that the LSB difference is not visible in the final
// perspective coordinates.
//
// y, x: signed 8-bit factors.
// Returns the signed product. The largest magnitude, (-128)*(-128) =
// 16384, fits in int16_t, so the narrowing cast never truncates.
static int16_t l1818SignedMul(int8_t y, int8_t x) {
    const int product = (int)y * (int)x;
    return (int16_t)product;
}
// PerspectiveDivide port (chunk5 line 3779). The 6502 implementation
// is a shift-and-subtract divide. The port (perspectiveDivideTable,
// below) runs seven shift-subtract steps on the operand magnitudes to
// form a 7-bit quotient and then maps that quotient through one of the
// two 128-entry lookup tables that follow.
//
// Special cases handled by perspectiveDivideTable:
// * denominator == 0 -> returns $7F without a table lookup
// * |num| >= |den| -> quotient index saturates to $7F
//   (chunk5 L7C28-L7C32 path)
// * operand signs differ -> the table byte is bit-inverted before
//   being sign-extended
//
// chunk5 PerspectiveDivide tables (chunk5.s line 4208 onward). Each
// table has 128 entries indexed by the 7-bit shift-subtract divide
// quotient. Output is a signed byte that the caller's ProjectVertex
// uses as `screen_X = $46 + result` (X) or `screen_Y = $31 - result` (Y).
// MAME-captured tables match source verbatim — at $7D52 (X) and $7DD2
// (Y) in MAME RAM. Validated bit-exact via FS2TRACE_PERSP=1 oracle.
//
// X-axis table: entries rise monotonically from $00 to $45.
static const uint8_t kPerspXTable[128] = {
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x07, 0x08,
0x08, 0x09, 0x09, 0x0A, 0x0A, 0x0B, 0x0B, 0x0C,
0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F, 0x10, 0x10,
0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15,
0x15, 0x16, 0x16, 0x17, 0x17, 0x18, 0x18, 0x19,
0x1A, 0x1A, 0x1B, 0x1B, 0x1C, 0x1C, 0x1D, 0x1D,
0x1E, 0x1E, 0x1F, 0x20, 0x20, 0x21, 0x21, 0x22,
0x22, 0x23, 0x23, 0x24, 0x24, 0x25, 0x26, 0x26,
0x27, 0x27, 0x28, 0x28, 0x29, 0x29, 0x2A, 0x2A,
0x2B, 0x2C, 0x2C, 0x2D, 0x2D, 0x2E, 0x2E, 0x2F,
0x2F, 0x30, 0x30, 0x31, 0x31, 0x32, 0x33, 0x33,
0x34, 0x34, 0x35, 0x35, 0x36, 0x36, 0x37, 0x37,
0x38, 0x39, 0x39, 0x3A, 0x3A, 0x3B, 0x3B, 0x3C,
0x3C, 0x3D, 0x3D, 0x3E, 0x3F, 0x3F, 0x40, 0x40,
0x41, 0x41, 0x42, 0x42, 0x43, 0x43, 0x44, 0x45,
};
// Y-axis perspective lookup table: 128 entries indexed by the 7-bit
// divide quotient, rising monotonically from $00 to $31. Consumed by
// perspectiveDivide().
static const uint8_t kPerspYTable[128] = {
0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02,
0x03, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x05,
0x06, 0x06, 0x07, 0x07, 0x07, 0x08, 0x08, 0x08,
0x09, 0x09, 0x0A, 0x0A, 0x0A, 0x0B, 0x0B, 0x0C,
0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0E, 0x0F,
0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12,
0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15,
0x15, 0x16, 0x16, 0x17, 0x17, 0x17, 0x18, 0x18,
0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B,
0x1C, 0x1C, 0x1C, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E,
0x1F, 0x1F, 0x20, 0x20, 0x20, 0x21, 0x21, 0x21,
0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, 0x25,
0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
0x28, 0x29, 0x29, 0x29, 0x2A, 0x2A, 0x2A, 0x2B,
0x2B, 0x2C, 0x2C, 0x2C, 0x2D, 0x2D, 0x2E, 0x2E,
0x2E, 0x2F, 0x2F, 0x30, 0x30, 0x30, 0x31, 0x31,
};
// Table-driven perspective divide.
//
// Forms a 7-bit quotient of |numerator| / |denominator| with a 7-step
// shift-and-subtract loop, looks that quotient up in `table`, and
// applies the sign.
//
// numerator, denominator: signed operands.
// table: lookup table with at least 128 entries.
//
// Returns:
//   * 0x7F when denominator is zero (no lookup is performed);
//   * table[0x7F] when |numerator| >= |denominator| (saturated);
//   * otherwise table[quotient].
// When the operands have different signs the table byte is bit-inverted
// (one's complement) before being sign-extended, so a table value of 0
// becomes -1.
static int16_t perspectiveDivideTable(int16_t numerator, int16_t denominator, const uint8_t *table) {
    if (denominator == 0) {
        return 0x7F;
    }

    const bool numNegative = numerator < 0;
    const bool denNegative = denominator < 0;

    // Widen before negating so that -32768 has a representable magnitude.
    const int32_t magNum = numNegative ? -(int32_t)numerator : (int32_t)numerator;
    const int32_t magDen = denNegative ? -(int32_t)denominator : (int32_t)denominator;

    // Saturated unless the magnitude ratio is strictly below one.
    int quotient = 0x7F;
    if (magNum < magDen) {
        int32_t remainder = magNum;
        quotient = 0;
        for (int step = 7; step > 0; step--) {
            remainder <<= 1;
            quotient <<= 1;
            if (remainder >= magDen) {
                remainder -= magDen;
                quotient |= 1;
            }
        }
    }

    uint8_t entry = table[quotient & 0x7F];
    if (numNegative != denNegative) {
        entry = (uint8_t)(~entry);
    }
    return (int16_t)(int8_t)entry;
}
// Perspective divide for the Y axis: forwards to perspectiveDivideTable
// using kPerspYTable as the lookup table.
static int16_t perspectiveDivide(int16_t numerator, int16_t denominator) {
    const uint8_t *const yTable = kPerspYTable;
    return perspectiveDivideTable(numerator, denominator, yTable);
}
// Perspective divide for the X axis: forwards to perspectiveDivideTable
// using kPerspXTable as the lookup table.
static int16_t perspectiveDivideX(int16_t numerator, int16_t denominator) {
    const uint8_t *const xTable = kPerspXTable;
    return perspectiveDivideTable(numerator, denominator, xTable);
}
// Decodes a little-endian signed 16-bit value.
//
// p: pointer to at least two readable bytes; p[0] is the low byte and
//    p[1] the high byte.
// Returns the 16-bit pattern reinterpreted as int16_t.
static int16_t readSigned16Le(const uint8_t *p) {
    const uint16_t lowByte = p[0];
    const uint16_t highByte = p[1];
    const uint16_t word = (uint16_t)(lowByte | (uint16_t)(highByte << 8));
    return (int16_t)word;
}
// Resets the whole pipeline to its power-on state.
//
// Every byte of *pipe is zeroed first; the fields whose initial value
// is not zero are then set explicitly.
//
// pipe: pipeline to reset; must not be NULL.
void sceneryPipelineReset(SceneryPipelineT *pipe) {
    memset(pipe, 0, sizeof(*pipe));
    // sceneryEmitPrimary ANDs each vertex outcode into this accumulator,
    // so it has to start at the AND identity ($FF), not zero.
    pipe->cur.polygonOutcode = 0xFF;
    // Already zero after the memset; kept explicit to document that the
    // vertex pool starts empty.
    pipe->cur.poolCount = 0;
    pipe->proj.zoomShift = 0x40; // chunk5 L7EBC initializes $2F to $40
}
// Stores the camera position that sceneryProjectXZ subtracts from each
// incoming world-space vertex.
//
// pipe:   pipeline to update.
// worldX: camera X coordinate.
// worldZ: camera Z coordinate.
void sceneryPipelineSetCamera(SceneryPipelineT *pipe, int16_t worldX, int16_t worldZ) {
    pipe->proj.camZ = worldZ;
    pipe->proj.camX = worldX;
}
// Loads the two 3-entry rotation-matrix rows used by sceneryProjectXZ.
//
// pipe: pipeline to update.
// row1: three signed coefficients applied to the X delta.
// row2: three signed coefficients applied to the Z delta.
void sceneryPipelineSetMatrix(SceneryPipelineT *pipe, const int8_t row1[3], const int8_t row2[3]) {
    enum { kRowBytes = 3 };
    memcpy(pipe->proj.matRow2, row2, kRowBytes);
    memcpy(pipe->proj.matRow1, row1, kRowBytes);
}
// Stores the section-base contribution that seeds the X/Y/Z running
// accumulators at the start of every sceneryProjectXZ call.
//
// pipe:       pipeline to update.
// bx, by, bz: base values for the X, Y and Z accumulators.
void sceneryPipelineSetBase(SceneryPipelineT *pipe, int16_t bx, int16_t by, int16_t bz) {
    pipe->proj.baseZ = bz;
    pipe->proj.baseY = by;
    pipe->proj.baseX = bx;
}
// L7EBC port. Reads two signed 16-bit world-space deltas (X, Z) from
// the byte stream, subtracts the camera position, runs the auto-scale
// loop until the high byte of any one running value reaches the $40
// magnitude boundary, then projects through the 2x3 rotation matrix
// and adds the section-base contribution.
//
// Differences from chunk5:
// * The original uses overflow handling (`bvs`) to detect when the
// subtraction overflows int16; we match that with explicit
// widening to int32 before the subtract.
// * The auto-scale loop ($2F counter, L7F1A) shifts all values left in
// lockstep and stops as soon as the high byte of any one of them
// reaches the $40 magnitude threshold (or the counter reaches $FF).
// We replicate the same shift count so MultiplyXY's truncation
// matches.
// * Stream byte order is little-endian (chunk5 reads $8B,Y for low
// byte then high).
// Decodes one vertex from the byte stream and projects it.
//
// pipe:        pipeline holding camera, matrix and base state.
// streamPlus1: pointer to four bytes: little-endian signed 16-bit X
//              followed by little-endian signed 16-bit Z.
// outSlot:     receives the projected vertex (see sceneryProjectXZ).
//
// Returns the number of stream bytes consumed, which is always 4.
int sceneryProjectStreamVertex(SceneryPipelineT *pipe, const uint8_t *streamPlus1, SceneryVertexT *outSlot) {
    enum { kBytesConsumed = 4 };
    const int16_t worldX = readSigned16Le(&streamPlus1[0]);
    const int16_t worldZ = readSigned16Le(&streamPlus1[2]);
    sceneryProjectXZ(pipe, worldX, worldZ, outSlot);
    return kBytesConsumed;
}
// Reports whether the high byte (bits 8..15) of `v` has reached the
// auto-scale threshold. This reproduces chunk5's 8-bit `adc #$40 / bmi`:
// the add wraps at 8 bits, so bit 7 of the sum is set exactly when the
// high byte lies in [$40, $BF] (magnitude >= 64 for either sign). The
// arithmetic is done on unsigned values so it does not depend on
// implementation-defined signed shifts or narrowing conversions.
static bool sceneryHiByteAtThreshold(int32_t v) {
    const uint8_t hi = (uint8_t)(((uint32_t)v >> 8) & 0xFFu);
    return ((uint8_t)(hi + 0x40u) & 0x80u) != 0;
}

// Projects a world-space (X, Z) point into camera space.
//
// Steps:
//   1. Subtract the camera position and saturate the deltas to int16.
//   2. Seed the X/Y/Z accumulators from the section base.
//   3. Auto-scale: double the deltas and accumulators in lockstep until
//      the high byte of any of them reaches the threshold, or the local
//      zoom counter reaches $FF.
//   4. Add the rotation-matrix products of the delta high bytes.
//   5. Saturate to int16, store in *outSlot, classify the vertex, and
//      mirror the result into pipe->cur.accX/Y/Z.
//
// pipe:    pipeline state (camera, matrix, base, zoomShift are read;
//          cur.accX/Y/Z are written). pipe->proj.zoomShift itself is
//          never modified: the counter is advanced on a local copy.
// worldX:  world-space X coordinate.
// worldZ:  world-space Z coordinate.
// outSlot: receives x, y, z and the outcode.
void sceneryProjectXZ(SceneryPipelineT *pipe, int16_t worldX, int16_t worldZ, SceneryVertexT *outSlot) {
    // Camera-relative delta. chunk5 keeps these in $9E/$9F (X) and
    // $A2/$A3 (Z) as int16; the subtraction is done in int32 so that
    // overflow can be detected and clamped.
    int32_t dx = (int32_t)worldX - pipe->proj.camX;
    int32_t dz = (int32_t)worldZ - pipe->proj.camZ;
    // Saturate to int16 the way chunk5's bvs branches do (the
    // original takes a slow-path handler L7F64/L7EAD on overflow;
    // visually that just clips far points to int16 max).
    if (dx > 0x7FFF) dx = 0x7FFF;
    if (dx < -0x8000) dx = -0x8000;
    if (dz > 0x7FFF) dz = 0x7FFF;
    if (dz < -0x8000) dz = -0x8000;
    // Running accumulators start at the section-base contribution
    // ($18 = $4A / $1B = $4D / $1E = $50). chunk5 LDAX/STAX copies
    // these once at L7EC9.
    int32_t accX = (int32_t)pipe->proj.baseX;
    int32_t accY = (int32_t)pipe->proj.baseY;
    int32_t accZ = (int32_t)pipe->proj.baseZ;
    // Auto-scale (L7F1A): double dx, dz and the accumulators in
    // lockstep, counting each step on a local copy of the zoom counter
    // ($2F), until any one value reaches the high-byte threshold or the
    // counter hits $FF.
    //
    // Doubling uses `*= 2` rather than `<<= 1` because left-shifting a
    // negative signed integer is undefined behaviour in C. A value is
    // only doubled while its high byte is outside [$40, $BF], so every
    // value that starts within int16 range stays within int16 range and
    // the multiplication cannot overflow.
    uint8_t zoom = pipe->proj.zoomShift;
    while (zoom < 0xFF) {
        if (sceneryHiByteAtThreshold(dx) ||
            sceneryHiByteAtThreshold(dz) ||
            sceneryHiByteAtThreshold(accX) ||
            sceneryHiByteAtThreshold(accY) ||
            sceneryHiByteAtThreshold(accZ)) {
            break;
        }
        dx *= 2;
        dz *= 2;
        accX *= 2;
        accY *= 2;
        accZ *= 2;
        zoom++;
    }
    // Apply the rotation matrix: six L1818 multiplies in total (the X
    // and Z deltas against the three entries of their respective rows).
    // Each multiply takes the high byte of the delta as int8 and the
    // matrix entry as int8, returning int16.
    int8_t hxFinal = (int8_t)((dx >> 8) & 0xFF);
    int8_t hzFinal = (int8_t)((dz >> 8) & 0xFF);
    accX += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[0]);
    accX += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[0]);
    accY += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[1]);
    accY += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[1]);
    accZ += (int32_t)l1818SignedMul(hxFinal, pipe->proj.matRow1[2]);
    accZ += (int32_t)l1818SignedMul(hzFinal, pipe->proj.matRow2[2]);
    // Saturate back to int16 -- chunk5's $18/$1B/$1E are 16-bit
    // accumulators, so we mirror that.
    if (accX > 0x7FFF) accX = 0x7FFF;
    if (accX < -0x8000) accX = -0x8000;
    if (accY > 0x7FFF) accY = 0x7FFF;
    if (accY < -0x8000) accY = -0x8000;
    if (accZ > 0x7FFF) accZ = 0x7FFF;
    if (accZ < -0x8000) accZ = -0x8000;
    outSlot->x = (int16_t)accX;
    outSlot->y = (int16_t)accY;
    outSlot->z = (int16_t)accZ;
    outSlot->outcode = sceneryClassifyVertex(outSlot);
    // Mirror the final accumulators into the pipeline's current state.
    // (chunk5 saves/restores $2F across the auto-scale via $08EE; here
    // the counter was only ever advanced on the local `zoom`, so
    // pipe->proj.zoomShift needs no restore.)
    pipe->cur.accX = (int16_t)accX;
    pipe->cur.accY = (int16_t)accY;
    pipe->cur.accZ = (int16_t)accZ;
}
// ClassifyVertex2 port (chunk5 line 2673). Computes the frustum outcode
// of a camera-space vertex from five half-space tests; a bit is set
// when the vertex lies outside the corresponding plane:
//   BEHIND: z < 0
//   RIGHT:  x + z < 0
//   LEFT:   z - x < 0
//   BOTTOM: y + z < 0
//   TOP:    z - y < 0
// The sums and differences are evaluated in 32 bits so they cannot
// overflow for 16-bit components.
//
// v: vertex to classify (only x, y and z are read).
// Returns the OR of the SCENERY_OUTCODE_* bits that apply; 0 means the
// vertex passes every test.
uint8_t sceneryClassifyVertex(const SceneryVertexT *v) {
    const int32_t x = (int32_t)v->x;
    const int32_t y = (int32_t)v->y;
    const int32_t z = (int32_t)v->z;
    uint8_t bits = 0;

    if (z < 0)     bits |= SCENERY_OUTCODE_BEHIND;
    if (x + z < 0) bits |= SCENERY_OUTCODE_RIGHT;
    if (z - x < 0) bits |= SCENERY_OUTCODE_LEFT;
    if (y + z < 0) bits |= SCENERY_OUTCODE_BOTTOM;
    if (z - y < 0) bits |= SCENERY_OUTCODE_TOP;

    return bits;
}
// ProjectV2ToScreen port (chunk5 line 3759). Runs the table-based
// perspective divide for X and then Y and biases the results into
// screen space. The original returns column in A and row in Y; this
// port writes the pair through outX/outY instead.
//
//   column = $46 + (signed byte from perspectiveDivideX(x, z))
//   *outX  = column * 2   (chunk5 color columns -> port native pixels)
//   *outY  = $31 - (signed byte from perspectiveDivide(y, z))
//
// v:    camera-space vertex.
// outX: receives the screen X coordinate.
// outY: receives the screen Y coordinate.
//
// Returns false, leaving outX/outY untouched, when v->z <= 0; returns
// true otherwise.
bool sceneryProjectVertexToScreen(const SceneryVertexT *v, int16_t *outX, int16_t *outY) {
    if (!(v->z > 0)) {
        return false;
    }

    const int8_t colOffset = (int8_t)perspectiveDivideX(v->x, v->z);
    const int8_t rowOffset = (int8_t)perspectiveDivide(v->y, v->z);

    int32_t px = 2 * (0x46 + (int32_t)colOffset);
    int32_t py = 0x31 - (int32_t)rowOffset;

    // Both offsets are int8, so px and py already fit in int16; the
    // clamps are purely defensive.
    px = (px < INT16_MIN) ? INT16_MIN : ((px > INT16_MAX) ? INT16_MAX : px);
    py = (py < INT16_MIN) ? INT16_MIN : ((py > INT16_MAX) ? INT16_MAX : py);

    *outX = (int16_t)px;
    *outY = (int16_t)py;
    return true;
}
// EmitPrimaryVertex port (chunk5 L6919). Copies *slot onto the end of
// the vertex pool and ANDs its outcode into the per-polygon outcode
// accumulator ($D3 in chunk5).
//
// When the pool already holds SCENERY_VERTEX_POOL_CAP entries the call
// is a no-op: nothing is appended and the accumulator is left alone.
// (The port notes tie the cap to chunk5's `cpy #$3C / bcs L6843`
// guard, i.e. 60 entries.)
//
// pipe: pipeline whose pool and accumulator are updated.
// slot: vertex to append; copied by value.
void sceneryEmitPrimary(SceneryPipelineT *pipe, const SceneryVertexT *slot) {
    if (pipe->cur.poolCount < SCENERY_VERTEX_POOL_CAP) {
        pipe->pool[pipe->cur.poolCount] = *slot;
        pipe->cur.poolCount++;
        pipe->cur.polygonOutcode &= slot->outcode;
    }
}
// ============================================================
// 4-pass Sutherland-Hodgman 3D frustum clipper.
//
// Mirrors chunk5's PolygonScanFillSetup -> PolygonClipTopPass ->
// PolygonClipRightPass -> PolygonClipBottomPass cascade at
// src/chunk5.s:2884+. Operates on camera-space (post-TransformVertex,
// pre-PerspectiveDivide) XYZ vertices.
//
// Per-plane test: a vertex is "inside" the plane if the half-space
// equation is non-negative. The four planes are:
// Left: Z - X >= 0
// Top: Z - Y >= 0
// Right: X + Z >= 0
// Bottom: Y + Z >= 0
//
// Intersection of edge V0 -> V1 with a plane uses similar triangles:
// solve for fraction `t` along the edge where the plane equation
// crosses zero, then linearly interpolate all components.
//
// Chunk5's `ClipVertex2ToLeft/Top/Right/Bottom` does this with
// integer math + overflow recovery (HalveBothVertices on V-flag).
// We use int32 intermediates here -- chunk5's halving was a 6502
// space-saving for the multiply; on modern CPUs we have the bits.
// Signature of a clip-plane evaluator: returns the value of the plane's
// half-space equation at `v`. A result >= 0 means the vertex is inside.
typedef int (*PlaneFn)(const SceneryVertexT *v);

// Left plane: Z - X.
static int planeLeft(const SceneryVertexT *v) {
    const int depth = v->z;
    const int across = v->x;
    return depth - across;
}

// Top plane: Z - Y.
static int planeTop(const SceneryVertexT *v) {
    const int depth = v->z;
    const int up = v->y;
    return depth - up;
}

// Right plane: Z + X.
static int planeRight(const SceneryVertexT *v) {
    const int depth = v->z;
    const int across = v->x;
    return depth + across;
}

// Bottom plane: Z + Y.
static int planeBottom(const SceneryVertexT *v) {
    const int depth = v->z;
    const int up = v->y;
    return depth + up;
}
// Linear interpolation between two coordinates with a Q12 fraction:
// from + ((to - from) * tQ12 >> 12).
static int16_t clipLerpQ12(int from, int to, int tQ12) {
    const int32_t span = (int32_t)(to - from);
    return (int16_t)(from + ((span * tQ12) >> 12));
}

// Builds the vertex where edge V0 -> V1 crosses a clip plane.
//
// v0, v1:   edge endpoints.
// pe0, pe1: plane-equation values at v0 and v1. The caller only gets
//           here when the endpoints are on opposite sides of the plane.
//
// The crossing fraction t = pe0 / (pe0 - pe1) is computed in Q12 fixed
// point (scaled by 4096) to avoid floating point, clamped to [0, 4096],
// and used to interpolate x, y and z. The returned outcode is 0; only
// x, y, z and outcode are assigned.
static SceneryVertexT clipIntersect(const SceneryVertexT *v0,
                                    const SceneryVertexT *v1,
                                    int pe0, int pe1) {
    // Opposite signs make the difference non-zero; the guard only
    // protects the division if that precondition is ever violated.
    int span = pe0 - pe1;
    if (span == 0) {
        span = 1;
    }

    // 64-bit numerator so the 4096 scale factor cannot overflow.
    int tQ12 = (int)(((int64_t)pe0 * 4096) / span);
    if (tQ12 < 0) {
        tQ12 = 0;
    } else if (tQ12 > 4096) {
        tQ12 = 4096;
    }

    SceneryVertexT hit;
    hit.x = clipLerpQ12(v0->x, v1->x, tQ12);
    hit.y = clipLerpQ12(v0->y, v1->y, tQ12);
    hit.z = clipLerpQ12(v0->z, v1->z, tQ12);
    hit.outcode = 0;
    return hit;
}
// One Sutherland-Hodgman pass against a single plane.
//
// out:     destination buffer with room for `cap` vertices.
// in:      source polygon of `n` vertices, treated as a closed loop
//          (the last vertex connects back to the first).
// n:       number of source vertices.
// cap:     maximum number of vertices written to `out`; anything
//          beyond that is silently dropped.
// planeFn: plane evaluator; a value >= 0 means "inside".
//
// For every edge tail -> head: if the edge crosses the plane, the
// intersection vertex is emitted; if head is inside, head is emitted.
// Returns the number of vertices written to `out`.
static int clipPass(SceneryVertexT *out, const SceneryVertexT *in, int n, int cap, PlaneFn planeFn) {
    if (n == 0) {
        return 0;
    }

    int written = 0;
    const SceneryVertexT *tail = &in[n - 1];
    int tailSide = planeFn(tail);

    for (int i = 0; i < n; i++) {
        const SceneryVertexT *head = &in[i];
        const int headSide = planeFn(head);
        const bool tailInside = (tailSide >= 0);
        const bool headInside = (headSide >= 0);

        // Edge straddles the plane: emit the crossing point first.
        if (tailInside != headInside && written < cap) {
            out[written] = clipIntersect(tail, head, tailSide, headSide);
            written++;
        }
        // Keep the edge's end vertex when it is on the inside.
        if (headInside && written < cap) {
            out[written] = *head;
            written++;
        }

        tail = head;
        tailSide = headSide;
    }
    return written;
}
// Clips a polygon against the left, top, right and bottom frustum
// planes, in that order, ping-ponging between the two buffers.
//
// in:       source polygon; also used as scratch and OVERWRITTEN by the
//           second and fourth passes.
// out:      second working buffer, written by the first and third
//           passes.
// inCount:  number of vertices in `in`.
// cap:      per-pass vertex limit, applied to both buffers.
// outIsIn:  optional (may be NULL). Set to true when the buffer most
//           recently written is `in`, and false when it is `out`.
//
// Returns:
//   * inCount unchanged (with *outIsIn = true) when inCount < 3; no
//     clipping is attempted;
//   * 0 when any of the first three passes leaves fewer than three
//     vertices;
//   * otherwise the vertex count after the fourth pass, with the result
//     in `in` (*outIsIn = true). That count is returned as-is and is
//     not checked against three.
int sceneryClipPolygon3D(SceneryVertexT *in, SceneryVertexT *out, int inCount, int cap, bool *outIsIn) {
    static const PlaneFn kPlaneOrder[] = { planeLeft, planeTop, planeRight, planeBottom };
    enum { kPassCount = 4 };

    if (inCount < 3) {
        if (outIsIn) {
            *outIsIn = true;
        }
        return inCount;
    }

    const SceneryVertexT *src = in;
    SceneryVertexT *dst = out;
    int count = inCount;

    for (int pass = 0; pass < kPassCount; pass++) {
        count = clipPass(dst, src, count, cap, kPlaneOrder[pass]);

        const bool lastPass = (pass == kPassCount - 1);
        if (lastPass || count < 3) {
            // Odd-numbered passes (second and fourth) write into `in`.
            if (outIsIn) {
                *outIsIn = ((pass & 1) != 0);
            }
            return lastPass ? count : 0;
        }

        // Swap roles: this pass's output feeds the next pass.
        if ((pass & 1) == 0) {
            src = out;
            dst = in;
        } else {
            src = in;
            dst = out;
        }
    }
    return count;
}