// joeylib2/src/core/hal.h

// Internal HAL (hardware abstraction layer) interface.
//
// This header is included by src/core/*.c and by per-port source under
// src/port/<platform>/. It is NOT part of the public API and must not
// be installed or exposed to game code.
//
// Each port must implement every function declared here.
#ifndef JOEYLIB_HAL_H
#define JOEYLIB_HAL_H
#include "joey/core.h"
#include "joey/input.h"
#include "joey/surface.h"
// Per-port one-shot initialization. Called from joeyInit after config
// has been stored but before any surfaces are created. The port sets up
// the display mode, allocates any HW-adjacent buffers (chip RAM on
// Amiga, VGA mode on DOS, SHR on IIgs), and prepares for presents.
//
//   config - the configuration joeyInit stored before making this call.
//
// Returns true on success, false on failure. After a failure the port
// may have recorded a message retrievable through halLastError; that
// is optional, so callers must tolerate halLastError returning NULL.
bool halInit(const JoeyConfigT *config);
// Per-port teardown: restores the display mode and frees the
// HW-adjacent buffers. joeyShutdown calls this after
// halInputShutdown.
void halShutdown(void);
// Allocate / release the SURFACE_PIXELS_SIZE-byte pixel buffer that
// backs the library-owned stage surface. Ports that have a
// hardware-friendly pin location for the back buffer (IIgs $01/2000
// with SHR shadow inhibited) return that address here; ports with no
// such constraint just malloc/free.
//
// halStageAllocPixels returns the buffer's address.
// NOTE(review): the failure return (presumably NULL) is not specified
// here -- confirm against the port implementations.
// halStageFreePixels releases the buffer; `pixels` is presumably the
// pointer halStageAllocPixels returned -- TODO confirm.
uint8_t *halStageAllocPixels(void);
void halStageFreePixels(uint8_t *pixels);
// Present the entire source surface to the display.
//
//   src - surface to show; it is only read (const pointer).
void halPresent(const SurfaceT *src);
// Present a rectangular region of the source surface.
//
//   src  - surface to read from; it is only read (const pointer).
//   x, y - origin of the region (presumably its top-left corner, in
//          surface pixels -- TODO confirm).
//   w, h - extents of the region.
//
// The caller has already validated and clipped the rect so that it
// lies fully inside the surface bounds and has positive extents;
// ports can rely on that.
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h);
// Optional error reporting. Returns a port-specific message string
// describing the last HAL failure, or NULL if there is none. A port
// is allowed to return NULL unconditionally, so callers must always
// handle a NULL result.
const char *halLastError(void);
// Input: per-port keyboard setup, polling, and teardown.
//
// Call order:
//   halInputInit     - called at the end of joeyInit.
//   halInputShutdown - called from joeyShutdown, before halShutdown.
//
// halInputPoll refreshes the core-owned key-state array (declared in
// core/inputInternal.h): the port writes true into gKeyState[key] for
// each key that is held. Keys the port does not recognize simply stay
// zero.
void halInputInit(void);
void halInputShutdown(void);
void halInputPoll(void);
// Block until the next display vertical blank; the call does not
// return before then. Each port implements this with whatever native
// wait the hardware provides (VGA $3DA, graphics.library WaitTOF,
// XBIOS Vsync, $C019 polling).
void halWaitVBL(void);
// Audio: per-port engine setup, module + SFX playback, teardown.
// See joey/audio.h for the public API contract that wraps these.
//
// halAudioInit returns true if the platform has a working engine.
// Every entry point is safe to call when init failed -- in that state
// they become no-ops.
//
// NOTE(review): the parameter meanings below are inferred from the
// names only and should be confirmed against joey/audio.h:
//   data / sample - start of the module or sample data
//   length        - size of that data (presumably in bytes)
//   loop          - whether module playback repeats
//   slot          - which SFX slot to play on / stop
//   rateHz        - sample playback rate in Hz
// halAudioFrameTick is presumably called once per frame -- TODO confirm.
bool halAudioInit(void);
void halAudioShutdown(void);
void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop);
void halAudioStopMod(void);
bool halAudioIsPlayingMod(void);
void halAudioPlaySfx(uint8_t slot, const uint8_t *sample, uint32_t length, uint16_t rateHz);
void halAudioStopSfx(uint8_t slot);
void halAudioFrameTick(void);
// Optional fast-path hooks. Each returns true if the port handled the
// operation in a port-specific accelerated path; false means the
// caller should fall back to the platform-agnostic C implementation.
//
// Funneling all asm dispatches through hal.c (one TU per port) avoids
// the cumulative ORCA Linker "Expression too complex" failure that
// hits when multiple cross-platform TUs each call into a named load
// segment full of asm primitives. Cross-platform code in src/core/
// only ever calls into HAL, so the link-time expression cost is paid
// once per binary -- not once per TU that wants speed.
//
// Each port must provide all of these; ports without an accelerated
// path simply return false from every hook.
//
// Parameter notes:
//   doubled    - fill byte for halFastSurfaceClear; presumably the
//                colour nibble repeated in both halves of the byte
//                (the IIgs macro replicates it into a 16-bit fill
//                word) -- TODO confirm.
//   x, y, w, h - rectangle for halFastFillRect. NOTE(review): whether
//                the caller has already clipped it is not stated
//                here -- confirm.
//   colorIndex - colour to fill with.
//   bx, by     - tile coordinates for halFastTileFill (not pixels).
//   fillWord   - 16-bit fill value handed to the tile fill.
bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled);
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex);
bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord);
// Tile primitives. These operate on row-0 pointers that the C wrapper
// has already computed:
//
//   dstRow0 / srcRow0 - first byte of the 8x8 region inside the
//                       respective surface (stride 160).
//   srcTilePixels /
//   dstTilePixels     - for tilePaste / tileSnap, byte 0 of the TileT
//                       side's packed 32-byte buffer (stride 4).
//   transparent       - halFastTileCopyMasked only. NOTE(review):
//                       presumably the colour index that is skipped
//                       while copying -- confirm.
//
// Each returns true if the port handled the operation, false if the
// caller should use the C fallback.
bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0);
bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent);
bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels);
bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0);
// drawPixel inner. Preconditions established by the caller:
//   - NULL and bounds checks are already done, so (x, y) is guaranteed
//     to be in [0..SURFACE_WIDTH-1] x [0..SURFACE_HEIGHT-1].
//   - colorIndex is the 0..15 nibble.
// Marking the surface dirty is not this hook's job; the C wrapper does
// it after this returns.
// Returns true if the port handled the plot, false to fall back to C.
bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex);
// drawLine inner for the diagonal case.
//
//   (x0, y0), (x1, y1) - the two endpoints. The caller ensures both
//                        are inside the surface bounds, so the inner
//                        loop runs without per-pixel clip checks.
//   colorIndex         - colour to draw with.
//
// Pure horizontal and vertical lines never arrive here: the C wrapper
// still routes those through fillRect (which has its own fast path).
// Returns true if the port handled the line, false to fall back to C.
bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex);
// drawCircle / fillCircle inner.
//
//   cx, cy     - circle centre.
//   r          - radius; guaranteed > 0.
//   colorIndex - colour to draw / fill with.
//
// The caller has already validated that the entire bounding circle
// (cx-r .. cx+r, cy-r .. cy+r) fits inside the surface bounds, so the
// inner loop plots every octant pixel unconditionally. The cx == 0 /
// r == 0 cases stay in the C wrapper.
// NOTE(review): "cx == 0" reads oddly next to "r == 0" -- confirm
// which degenerate cases the wrapper actually keeps.
// Returns true if the port handled the circle, false to fall back to C.
bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex);
bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex);
// floodFill helper: combined seed test + walk-left + walk-right for
// one row.
//
//   row         - the row being walked.
//   startX      - x of the seed pixel within that row.
//   matchColor,
//   newColor,
//   matchEqual  - define the match criterion. NOTE(review): their
//                 exact semantics are not described in this header --
//                 confirm against floodFillInternal.
//   seedMatched - out: whether the seed pixel satisfied the match
//                 criterion. If false, the caller skips this pop.
//   leftXOut,
//   rightXOut   - out: the run boundaries; meaningful when
//                 *seedMatched is true.
//
// Returns true if the port handled it (asm path taken); false if there
// is no asm path, in which case the caller falls back to the C walks.
bool halFastFloodWalk(uint8_t *row, int16_t startX,
uint8_t matchColor, uint8_t newColor, bool matchEqual,
bool *seedMatched,
int16_t *leftXOut, int16_t *rightXOut);
// floodFill helper for the row-above / row-below run-detection scans.
//
//   row            - the row to scan.
//   leftX, rightX  - pixel range to walk, inclusive at both ends.
//   matchColor,
//   newColor,
//   matchEqual     - define which pixels qualify. NOTE(review): exact
//                    semantics are not described in this header --
//                    confirm against floodFillInternal.
//   markBuf        - out: receives 1 byte per pixel walked
//                    (1 = qualifies for flood, 0 = does not).
//
// The C side then walks markBuf for run-edge transitions, which
// replaces the per-pixel srcPixel + match check inside its inner loop.
// Returns true if the port handled it; false to fall back to C.
bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX,
uint8_t matchColor, uint8_t newColor, bool matchEqual,
uint8_t *markBuf);
// Combined per-pixel scan + run-edge walk + seed push. Higher-level
// than halFastFloodScanRow: replaces both the markBuf fill AND the C
// loop that walks markBuf for falling edges.
//
//   row, leftX, rightX - row and pixel range to scan.
//   matchColor,
//   newColor,
//   matchEqual         - match criterion (see NOTE below).
//   scanY              - presumably the y coordinate recorded with
//                        each pushed seed -- TODO confirm.
//   stackX, stackY     - the seed stack's x and y arrays.
//   spInOut            - in/out: read on entry, updated with the new
//                        top-of-stack on return.
//   maxSp              - presumably the stack capacity limit -- TODO
//                        confirm.
//
// NOTE(review): the match-criterion semantics are not described in
// this header -- confirm against floodFillInternal.
//
// Returns true if the port handled it (the caller then skips the C
// run-edge walk entirely); false to fall back to halFastFloodScanRow
// + C walk.
bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX,
uint8_t matchColor, uint8_t newColor, bool matchEqual,
int16_t scanY,
int16_t *stackX, int16_t *stackY,
int16_t *spInOut, int16_t maxSp);
// Highest-level flood helper: combined seed-test + walk-left + walk-right
// + scan-above + scan-below + push for ONE popped seed. Replaces three
// cross-segment HAL calls (halFastFloodWalk + 2x halFastFloodScanAndPush)
// per dispatch loop iteration with one. The asm internally caches row
// addr / matchByte / nibble decoder across all three sub-operations.
//
//   pixels         - the surface base (s->pixels).
//   x, y           - the popped seed.
//   matchColor,
//   newColor,
//   matchEqual     - match criterion. NOTE(review): exact semantics
//                    are not described in this header -- confirm
//                    against floodFillInternal.
//   stackX, stackY - the seed stack's x and y arrays.
//   spInOut        - in/out: updated with any new seeds the asm pushed
//                    for the row above/below.
//   maxSp          - presumably the stack capacity limit -- TODO
//                    confirm.
//   seedMatched    - out: whether the seed matched.
//   leftXOut,
//   rightXOut      - out: the matching-run boundaries (only valid if
//                    seedMatched != 0).
//
// This hook does not fill the run itself: the caller does the 1-row
// halFastFillRect using the returned bounds.
//
// Returns true if the port handled it; false to fall back to
// halFastFloodWalk + the per-side halFastFloodScanAndPush calls.
bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y,
uint8_t matchColor, uint8_t newColor, bool matchEqual,
int16_t *stackX, int16_t *stackY,
int16_t *spInOut, int16_t maxSp,
bool *seedMatched,
int16_t *leftXOut, int16_t *rightXOut);
// surfaceBlit / surfaceBlitMasked rect-copy helper. The caller has
// already done the clip math.
//
//   dstRow0, srcRow0 - row 0 of the dest / source regions.
//   dstX, srcX       - intra-row pixel offsets.
//   copyW, copyH     - the clipped extents.
//   srcRowBytes      - presumably the source stride in bytes -- TODO
//                      confirm. The dst stride is not a parameter: it
//                      is hardcoded to SURFACE_BYTES_PER_ROW.
//   transparent      - $FFFF means opaque (always copy); any 0..15
//                      value means src nibbles equal to that index
//                      are skipped.
//
// Returns true if the port handled it; false to fall back to C.
bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX,
const uint8_t *srcRow0, int16_t srcX,
int16_t copyW, int16_t copyH, int16_t srcRowBytes,
uint16_t transparent);
#ifdef JOEYLIB_PLATFORM_IIGS
// =====================================================================
// IIgs direct-dispatch macros.
//
// The halFast* function declarations above are the cross-platform API.
// On IIgs, those wrappers were ~60-80 cyc/call of pure plumbing on top
// of the asm itself: wrapper prologue (PHB/PHD/TCD), redundant arg
// re-push for the inner JSL, then wrapper epilogue. The macros below
// take effect at preprocess time and inline the asm call at the call
// site, eliminating the wrapper layer entirely.
//
// Cross-platform code in src/core/*.c is unchanged -- it still calls
// halFastDrawPixel(...) etc. On IIgs the preprocessor swaps that for
// the macro expansion before ORCA-C compiles the file. The matching
// halFast* C definitions in src/port/iigs/hal.c are deleted, since
// nothing references them once the macros take effect. The one
// exception is halFastFillRect, which has no macro here and remains a
// real C wrapper in hal.c.
//
// Macros use comma-expression form so they evaluate to a `bool` value
// (most halFast* return true on IIgs since the asm always succeeds).
// =====================================================================
// Entry points that the direct-dispatch macros below call. Every
// coordinate, colour and flag argument is passed as a 16-bit
// unsigned value; the macros perform the casts from the halFast*
// parameter types.
extern void iigsDrawPixelInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble);
extern void iigsDrawLineInner (uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble);
extern void iigsDrawCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble);
extern void iigsFillCircleInner (uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
extern void iigsTileFillInner (uint8_t *dstRow0, uint16_t fillWord);
extern void iigsTileCopyInner (uint8_t *dstRow0, const uint8_t *srcRow0);
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent);
extern void iigsTilePasteInner (uint8_t *dstRow0, const uint8_t *srcTilePixels);
extern void iigsTileSnapInner (uint8_t *dstTilePixels, const uint8_t *srcRow0);
extern void iigsBlitRectInner (uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
// Not wrapped by a macro in this header: per the halFastFillRect note
// further down, the C wrapper in hal.c forwards to this.
extern void iigsFillRectInner (uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
// Returns nothing directly; its three results come back through the
// gFlood* globals declared just below.
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
// Result slots written by iigsFloodWalkAndScansInner and read back by
// the halFastFloodWalkAndScans macro: non-zero seed-match flag, and
// the left / right x of the matched run.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;
// IIgs call-site replacement for halFastDrawPixel. Hands the surface's
// pixel pointer, the coordinates converted to uint16_t, and the colour
// masked to its low nibble straight to iigsDrawPixelInner. The comma
// expression makes the whole invocation evaluate to true.
#undef halFastDrawPixel
#define halFastDrawPixel(surf_, px_, py_, color_)                     \
    (iigsDrawPixelInner((surf_)->pixels,                              \
                        (uint16_t)(px_),                              \
                        (uint16_t)(py_),                              \
                        (uint16_t)((color_) & 0x0F)),                 \
     true)
// IIgs call-site replacement for halFastDrawLine. Forwards to
// iigsDrawLineInner with the surface's pixel pointer, both endpoints
// converted to uint16_t, and the colour masked to its low nibble.
// Always evaluates to true.
#undef halFastDrawLine
#define halFastDrawLine(surf_, ax_, ay_, bx_, by_, color_)            \
    (iigsDrawLineInner((surf_)->pixels,                               \
                       (uint16_t)(ax_), (uint16_t)(ay_),              \
                       (uint16_t)(bx_), (uint16_t)(by_),              \
                       (uint16_t)((color_) & 0x0F)),                  \
     true)
// IIgs call-site replacement for halFastDrawCircle. Forwards to
// iigsDrawCircleInner with the surface's pixel pointer, the centre
// converted to uint16_t, the radius passed through unchanged, and the
// colour masked to its low nibble. Always evaluates to true.
#undef halFastDrawCircle
#define halFastDrawCircle(surf_, midX_, midY_, radius_, color_)       \
    (iigsDrawCircleInner((surf_)->pixels,                             \
                         (uint16_t)(midX_),                           \
                         (uint16_t)(midY_),                           \
                         (radius_),                                   \
                         (uint16_t)((color_) & 0x0F)),                \
     true)
// IIgs call-site replacement for halFastFillCircle.
//
// The asm path is taken only when the surface is the stage surface
// (compared against stageGet()); for any other surface the macro
// evaluates to false so the caller uses the C implementation. Note
// that the surface argument is therefore evaluated twice -- do not
// pass an expression with side effects.
//
// fillWord = doubled byte * $0101 = (nib*$11) * $101 = nib * $1111.
// Compile-time arithmetic when caller passes a constant; at most a
// single multiply when the nibble is variable (still cheaper than
// the wrapper's three sequential ORs / shifts).
//
// The multiply is done in unsigned arithmetic on purpose: with a
// 16-bit int, nib * $1111 exceeds INT_MAX for every nib >= 8, and
// signed overflow is undefined. The unsigned product tops out at
// $FFFF and always fits.
#undef halFastFillCircle
#define halFastFillCircle(_s, _cx, _cy, _r, _c) \
    ((_s) == stageGet() \
        ? (iigsFillCircleInner((_s)->pixels, (uint16_t)(_cx), (uint16_t)(_cy), \
                               (_r), \
                               (uint16_t)((uint16_t)((_c) & 0x0F) * 0x1111u)), \
           true) \
        : false)
// IIgs call-site replacement for halFastSurfaceClear. Any surface
// other than the stage (as returned by stageGet()) yields false so the
// caller falls back to C. For the stage, the fill byte is replicated
// into both halves of a 16-bit word and handed to
// iigsSurfaceClearInner, and the macro yields true. Both arguments are
// evaluated more than once.
#undef halFastSurfaceClear
#define halFastSurfaceClear(surf_, fillByte_)                                  \
    ((surf_) != stageGet()                                                     \
         ? false                                                               \
         : (iigsSurfaceClearInner(                                             \
                (surf_)->pixels,                                               \
                (uint16_t)((uint16_t)(fillByte_) | ((uint16_t)(fillByte_) << 8))), \
            true))
// halFastFillRect stays as a real C wrapper -- removing it triggered
// an unrelated ORCA linker bank-placement failure (same mode as the
// peislam.asm deletion: `Unresolved reference Label:
// emitMvnCopyRoutine` in sprite codegen). The wrapper now just
// forwards to iigsFillRectInner (asm does partial+middle); we lose
// the call-site macro inlining for fillRect specifically but keep
// the rest of the macros AND the new asm helper. Per-call wrapper
// overhead for halFastFillRect is back (~80 cyc) but at least the
// per-row partial-byte logic happens in asm now.
// IIgs call-site replacement for halFastTileFill. The tile
// coordinates are turned into a byte offset from the surface's pixel
// base -- by * 8 rows * SURFACE_BYTES_PER_ROW, plus bx * 4 bytes --
// and the resulting address is passed to iigsTileFillInner together
// with the fill word, unchanged. Always evaluates to true.
#undef halFastTileFill
#define halFastTileFill(surf_, tileX_, tileY_, word_)                          \
    (iigsTileFillInner((surf_)->pixels                                         \
                           + ((uint16_t)(tileY_) * 8 * SURFACE_BYTES_PER_ROW   \
                              + (uint16_t)(tileX_) * 4),                       \
                       (word_)),                                               \
     true)
// IIgs call-site replacements for the four tile transfer hooks. Each
// passes the caller's two pointers to the matching iigs*Inner routine
// unchanged and evaluates to true. halFastTileCopyMasked additionally
// converts the transparent colour to uint16_t.
#undef halFastTileCopy
#define halFastTileCopy(dst_, src_) \
    (iigsTileCopyInner((dst_), (src_)), true)

#undef halFastTileCopyMasked
#define halFastTileCopyMasked(dst_, src_, skip_) \
    (iigsTileCopyMaskedInner((dst_), (src_), (uint16_t)(skip_)), true)

#undef halFastTilePaste
#define halFastTilePaste(dst_, src_) \
    (iigsTilePasteInner((dst_), (src_)), true)

#undef halFastTileSnap
#define halFastTileSnap(dst_, src_) \
    (iigsTileSnapInner((dst_), (src_)), true)
// IIgs call-site replacement for halFastBlitRect. The two row
// pointers and the transparent value pass through unchanged; the
// pixel offsets, extents and source stride are converted to uint16_t
// for iigsBlitRectInner. Always evaluates to true.
#undef halFastBlitRect
#define halFastBlitRect(dstRow_, dstOff_, srcRow_, srcOff_, wide_, high_, stride_, skip_) \
    (iigsBlitRectInner((dstRow_), (uint16_t)(dstOff_),                        \
                       (srcRow_), (uint16_t)(srcOff_),                        \
                       (uint16_t)(wide_),                                     \
                       (uint16_t)(high_),                                     \
                       (uint16_t)(stride_),                                   \
                       (skip_)),                                              \
     true)
// Tier 2/3 flood hooks. On IIgs these always reported "not handled"
// (their asm implementations were deleted as unreachable), so each is
// a macro that expands to the constant false. That lets ORCA-C
// dead-code-eliminate the never-taken fallback branches in
// floodFillInternal. The arguments are discarded by the expansion and
// are therefore never evaluated.
#undef halFastFloodWalk
#define halFastFloodWalk(row_, startX_, match_, fill_, equal_,        \
                         seedOut_, leftOut_, rightOut_)               \
    (false)

#undef halFastFloodScanRow
#define halFastFloodScanRow(row_, left_, right_, match_, fill_,       \
                            equal_, marks_)                           \
    (false)

#undef halFastFloodScanAndPush
#define halFastFloodScanAndPush(row_, left_, right_, match_, fill_,   \
                                equal_, scanY_, stackX_, stackY_,     \
                                sp_, maxSp_)                          \
    (false)
// Tier-1 flood hook with multiple outputs. iigsFloodWalkAndScansInner
// returns nothing; it leaves its results in gFloodSeedMatch,
// gFloodLeftX and gFloodRightX. After the call this macro copies those
// into the caller's three out-pointers (seed match as a boolean, the
// x bounds as int16_t) and then evaluates to true.
//
// Argument conversions: x / y / maxSp to uint16_t, both colours masked
// to their low nibble, matchEqual normalised to 0 or 1, and the stack
// pointer slot viewed as uint16_t *.
#undef halFastFloodWalkAndScans
#define halFastFloodWalkAndScans(pix_, seedX_, seedY_, match_, fill_, equal_, \
                                 stackX_, stackY_, sp_, maxSp_,               \
                                 seedOut_, leftOut_, rightOut_)               \
    (iigsFloodWalkAndScansInner((pix_),                                       \
                                (uint16_t)(seedX_),                           \
                                (uint16_t)(seedY_),                           \
                                (uint16_t)((match_) & 0x0F),                  \
                                (uint16_t)((fill_) & 0x0F),                   \
                                (uint16_t)((equal_) ? 1 : 0),                 \
                                (stackX_),                                    \
                                (stackY_),                                    \
                                (uint16_t *)(sp_),                            \
                                (uint16_t)(maxSp_)),                          \
     *(seedOut_)  = (gFloodSeedMatch != 0),                                   \
     *(leftOut_)  = (int16_t)gFloodLeftX,                                     \
     *(rightOut_) = (int16_t)gFloodRightX,                                    \
     true)
#endif /* JOEYLIB_PLATFORM_IIGS */
#endif