joeylib2/src/port/iigs/hal.c

459 lines
19 KiB
C

// Apple IIgs HAL: enable SHR, write pixels / SCBs / palettes into the
// $E1 bank at the stock addresses the shifter reads from.
//
// Memory map in bank $E1:
// $2000 - $9CFF pixel data (32000 bytes, 160 bytes per scanline)
// $9D00 - $9DC7 SCB bytes (200 used)
// $9E00 - $9FFF 16 palettes x 16 colors x 2 bytes, $0RGB
//
// NEWVIDEO register at $00C029 controls SHR enable. Bit 7 turns SHR on.
// ORCA/C must be built with 32-bit pointer mode (-w or equivalent) so
// that the long addresses resolve to bank $E1.
//
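// PRESENT PATH (and the DIRTY-WALK + PEI-SLAM design it came from)
// ------------------------------------------------------------------
// As written in this file, halPresent hands the whole upload (pixels,
// SCBs, palettes) to the asm routine iigsBlitStageToShr on every call;
// it does not walk dirty bands itself.
//
// The design this header originally described had halPresent walk the
// per-row dirty bands maintained by drawing primitives in
// src/core/*.c. Fully-dirty rows went through the PEI slam in
// src/port/iigs/peislam.asm (~530 cyc/row, ~55% faster than
// memcpy/MVN); partial-dirty rows used memcpy, which ORCA-C lowers
// to MVN (7 cyc/byte) -- the fastest 65816 way to move bytes into
// bank $E1 when the dirty band is too narrow to amortize the slam's
// per-call AUXWRITE/RAMRD/shadow toggle.
// NOTE(review): confirm against the asm whether any of that dirty-row
// logic now lives inside iigsBlitStageToShr.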
//
// peislam.asm declares its load segment as DRAWPRIMS so the linker
// places it in its own bank, separate from AUDIO's _ROOT (where
// audio_full.c + Memory Manager + stdio + NTPstreamsound already
// crowd up against the 64 KB-per-bank limit).
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "joey/debug.h"
#include "hal.h"
#include "surfaceInternal.h"
/* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick
* ($2503) and returns the low 16 bits of the system's tick counter
* (firmware VBL ISR-driven). Polling $C019 from C user code missed
* transitions for any op over ~1 ms; the system's tick counter is
* updated by the actual interrupt handler so it stays accurate
* regardless of caller polling rate. Tick rate matches the video
* field rate -- 60 Hz on NTSC, 50 Hz on PAL. */
/* Asm wrapper: returns the low 16 bits of the system tick counter.
 * Used by halFrameCount. */
extern uint16_t iigsGetTickWord(void);
/* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. */
extern uint16_t iigsReadHzParam(void);
/* Field rate in Hz reported by halFrameHz. Starts at 60 and is set to
 * 50 or 60 by halInit from iigsReadHzParam(). */
static uint16_t gFrameHz = 60u;
// hal.c is the single TU that calls into joeyDraw.asm. Cross-
// platform draw.c / tile.c / etc. dispatch through halFast*
// functions defined here; they never reference the asm symbols
// directly. This avoids the cumulative ORCA-Linker-Expression-
// too-complex-in-13/SysLib failure that hit when each cross-
// platform TU brought its own asm extern.
// Segment directive for this translation unit (macro defined outside
// this file; presumably expands to the ORCA/C load-segment pragma).
JOEYLIB_SEGMENT("DRAWPRIMS")
// 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen.
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
// Full-fill asm helper (partial leading byte + middle MVN + partial
// trailing byte). Used by halFastFillRect, which is macro-dispatched
// in core/hal.h rather than defined as a C function in this file.
extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
// 16 STA abs,X stores at fixed offsets along a 160-byte stride.
// ~120 cyc per call.
extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord);
// Tile copy / paste / snap inner loops. All take 4-byte large-
// model pointers; bank may differ between dst and src (heap
// surface vs stage). Stride contracts (bytes per row):
// tileCopyInner / tileCopyMaskedInner: dst 160, src 160
// tilePasteInner: dst 160, src 4
// tileSnapInner: dst 4, src 160
extern void iigsTileCopyInner(uint8_t *dstRow0, const uint8_t *srcRow0);
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent);
extern void iigsTilePasteInner(uint8_t *dstRow0, const uint8_t *srcTilePixels);
extern void iigsTileSnapInner(uint8_t *dstTilePixels, const uint8_t *srcRow0);
// Single-pixel and Bresenham line plot. drawLine inner takes
// pre-clipped endpoints (caller validates against surface bounds);
// it does no per-pixel clipping in the loop.
extern void iigsDrawPixelInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble);
extern void iigsDrawLineInner(uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble);
// Bresenham midpoint circle outline. Caller has verified the entire
// bbox is on-surface so no per-pixel clip.
extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble);
// Stage-to-SHR full upload: pixels (MVN $01->$E1), SCB, palette.
// Asm uses post-MVN DBR=$E1 to do sta abs,Y for SCB/palette.
// Replaces ORCA-C's memcpy path which silently fails when called
// from halPresent (DBR-state quirk after prior asm primitives).
// Note the parameters are non-const even though halPresent passes
// data reached through a const SurfaceT pointer.
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);
// floodFill walk results: written by iigsFloodWalkAndScansInner,
// read back by halFastFloodWalkAndScans.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;
// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque
// (always copy); else pixels with src nibble == (transparent & $0F)
// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW).
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
// Single-call per-popped-seed worker: seed test + walk-left + walk-right
// + scan-above + scan-below + push, all sharing cached row addr and
// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX.
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
// One-shot init for the y*160 lookup table (gRowOffsetLut, 400 bytes
// in DRAWPRIMS data). Called once from halInit. After this returns,
// every asm primitive that needs row offset can do `lda >lut,x` instead
// of the 7-instruction shift-add.
extern void iigsInitRowLut(void);
// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial-
// screen presents (halPresentRect). srcOffset is the byte offset
// within bank $01 of the FIRST byte to copy on the FIRST row;
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
// ORCA-C memcpy's ~30 cyc/byte.
// NOTE(review): the file header quotes MVN at 7 cyc/byte and says
// ORCA-C lowers memcpy to MVN; the ~9 / ~30 figures here presumably
// include per-row and call overhead -- confirm which set is current.
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
// PEI-slam variant of the per-row rect blit. ~3 cyc/byte vs MVN's
// ~9 cyc/byte. Constraints: copyBytes must be even and 2..80
// (caller / dispatcher checks). For sprite-rect presents (typical
// 8 bytes wide x 16 rows) saves ~600 cyc/frame vs the MVN form.
extern void iigsBlitRectStageToShrPEI(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
// Filled circle, scanline-style. fillWord low byte is the doubled
// nibble (e.g., 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
// ----- Hardware addresses (24-bit / long pointers) -----
// Soft-switch registers, addressed through bank $00.
#define IIGS_NEWVIDEO_REG ((volatile uint8_t *)0x00C029L)
#define IIGS_BORDER_REG ((volatile uint8_t *)0x00C034L)
#define IIGS_SHADOW_REG ((volatile uint8_t *)0x00C035L)
#define IIGS_VBL_STATUS ((volatile uint8_t *)0x00C019L)
// SHR display memory in bank $E1 (see the memory map at the top of
// this file): pixels, scanline control bytes, palettes.
#define IIGS_SHR_PIXELS ((uint8_t *)0xE12000L)
#define IIGS_SHR_SCB ((uint8_t *)0xE19D00L)
#define IIGS_SHR_PALETTE ((uint16_t *)0xE19E00L)
// The stage lives at $01/2000 -- the same offset as the SHR display
// framebuffer at $E1/2000, but in the fast (2.8 MHz) bank. With SHR
// shadow inhibited at $C035, writes here are NOT auto-mirrored to
// $E1, so drawing is full-speed and isolated from the displayed
// frame until the next stagePresent.
#define IIGS_STAGE_PIXELS ((uint8_t *)0x012000L)
// Bit 7 of IIGS_VBL_STATUS, polled by halWaitVBL.
#define VBL_BAR_BIT 0x80
// NEWVIDEO bit masks
#define NEWVIDEO_SHR_ON 0x80
#define NEWVIDEO_LINEARIZE 0x40
// Bit 0 is documented as reserved-must-be-1 in the IIgs Hardware
// Reference for forward compatibility. Real silicon doesn't care, but
// GSplus halts on writes that leave it clear (see moremem.c c029
// handler) and bumps its "Code: RED" status. Always include this bit.
#define NEWVIDEO_RESERVED_BIT 0x01
// $C035 SHADOW register: bit set = shadow INHIBITED for that range.
// Bit 1 = hi-res page 1 ($02000-$03FFF in bank $01)
// Bit 2 = hi-res page 2 ($04000-$05FFF in bank $01)
// Bit 3 = SHR ($02000-$09FFF in bank $01)
// We set 1+2+3 because the SHR pixel range overlaps both hi-res
// pages; leaving any of those shadows live would silently mirror
// part of the stage to $E1.
#define SHADOW_INHIBIT_SHR_MASK 0x0E
// $C034 BORDER register: low nibble = border color index 0..15, high
// nibble = non-color control bits that must be preserved. Despite its
// name, BORDER_COLOR_MASK selects the PRESERVED high nibble: ANDing
// the register with it keeps the control bits and zeroes the color.
// Color 0 is the all-zero palette entry by IIgs convention; halInit
// forces the low nibble to 0 so the visible bezel matches the cleared
// SHR background.
// NOTE(review): this comment previously called the high nibble
// "beep/IRQ enables"; Apple's register description lists those bits as
// real-time-clock control -- confirm against the Hardware Reference.
#define BORDER_COLOR_MASK 0xF0
// ----- Module state -----
// Register values captured by halInit and written back by halShutdown.
static uint8_t gPreviousNewVideo = 0;
static uint8_t gPreviousBorder = 0;
static uint8_t gPreviousShadow = 0;
// True between a successful halInit and the matching halShutdown;
// halShutdown only restores the registers while this is set.
static bool gModeSet = false;
// SCB / palette upload skipping is now driven by gStageScbDirty /
// gStagePaletteDirty (core/surface.c). The old per-frame memcmp-
// against-cached-copy approach was costing ~7 ms / frame on ORCA-C.
// PEI slam scratch. File-scope non-static so the asm can `ext` them;
// all accesses inside the slam use long-mode `>` addressing so they
// bypass the //e RAMRD redirect the slam turns on for its duration.
// None of these is read or written by the C code in this file.
volatile uint16_t gPeiOrigSp;
volatile uint8_t gPeiOrigShadow;
volatile uint16_t gPeiTempRowBase;
volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked)
volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage)
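// peislam.asm's per-row peiSlamFullRow helper is no longer wired in;
// the present pipeline is described here as doing its own PEI-slam
// loop inside iigsBlitStageToShr (with dirty-row skip).
// NOTE(review): the comment on the iigsBlitStageToShr declaration
// describes an MVN copy, and halPresent's comment says the full 32K
// is always blitted -- check the asm and reconcile these three notes.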
// Upload SCB / palette into bank-$E1 SHR memory only when the
// matching dirty flag is set. Replaces a per-frame 712-byte memcmp
// pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check.
// gStageScbDirty / gStagePaletteDirty live in core/surface.c; they
// start true (forces the very first present to upload), get set true
// again whenever scbSet* / paletteSet mutate the stage's data, and
// get cleared here after upload.
// Copies src's SCB table and/or palette table into SHR memory, each
// only when its dirty flag (gStageScbDirty / gStagePaletteDirty) is
// set, and clears the flag after the copy.
// NOTE(review): no caller is visible in this file -- halPresent
// uploads through iigsBlitStageToShr instead.
static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) {
    uint8_t  *shrScb     = IIGS_SHR_SCB;
    uint16_t *shrPalette = IIGS_SHR_PALETTE;

    // Scanline control bytes: one per row.
    if (gStageScbDirty) {
        memcpy(shrScb, src->scb, SURFACE_HEIGHT);
        gStageScbDirty = false;
    }

    // Whole palette table in one copy.
    if (gStagePaletteDirty) {
        memcpy(shrPalette, src->palette, sizeof(src->palette));
        gStagePaletteDirty = false;
    }
}
// ----- HAL API (alphabetical) -----
// Switches the display to SHR, blacks the border, inhibits shadowing
// of the stage region, builds the row-offset table and caches the
// field rate.
//
// config: unused on this port.
// Returns true (this port has no failure path here).
//
// Safe to call again without an intervening halShutdown: the original
// register values are captured only on the first call, so halShutdown
// still restores the mode that was active before JoeyLib took over.
bool halInit(const JoeyConfigT *config) {
    (void)config;

    // Capture the pre-init hardware state exactly once. Re-reading the
    // registers on a repeated halInit would record our own SHR
    // settings as the "previous" state and make halShutdown a no-op.
    if (!gModeSet) {
        gPreviousNewVideo = *IIGS_NEWVIDEO_REG;
        gPreviousBorder   = *IIGS_BORDER_REG;
        gPreviousShadow   = *IIGS_SHADOW_REG;
    }

    *IIGS_NEWVIDEO_REG = (uint8_t)(NEWVIDEO_SHR_ON | NEWVIDEO_LINEARIZE | NEWVIDEO_RESERVED_BIT);

    // Keep the saved high nibble, force border color index 0.
    *IIGS_BORDER_REG = (uint8_t)(gPreviousBorder & BORDER_COLOR_MASK);

    // Inhibit shadowing of the stage region. Without this, every
    // write to $01/2000-9FFF mirrors to $E1 and the off-screen-buffer
    // illusion breaks (the user would see drawing in progress).
    *IIGS_SHADOW_REG = (uint8_t)(gPreviousShadow | SHADOW_INHIBIT_SHR_MASK);

    // SCB and palette are uploaded by halPresent's iigsBlitStageToShr
    // (asm path, MVN to bank $E1). C-side memset/memcpy to bank $E1
    // is unreliable from halInit's calling context, so we don't try
    // it here -- the first present will set up SCB to 320 mode.
    iigsInitRowLut();

    // iigsReadHzParam: 1 = PAL (50 Hz); anything else is treated as NTSC.
    gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u;

    gModeSet = true;
    return true;
}
// This port records no error text; the result is always NULL.
const char *halLastError(void) {
    const char *noMessage = NULL;

    return noMessage;
}
// Uploads the stage to the SHR display via the asm blitter.
//
// src: surface whose SCB and palette tables are handed to
//      iigsBlitStageToShr. A NULL src is ignored. src->pixels is not
//      passed: per the blitter's declaration comment the pixel copy is
//      bank $01 -> bank $E1.
void halPresent(const SurfaceT *src) {
    if (src == NULL) {
        return;
    }
    // iigsBlitStageToShr does pixels (MVN $01->$E1) + SCB + palette
    // upload entirely in asm via DBR=$E1 + sta abs,Y indexed stores.
    // ORCA-C's C-side memcpy to bank $E1 has been unreliable from
    // halPresent's calling context, so we route everything through
    // the asm path. Future: re-introduce per-row dirty-band logic
    // for partial-screen updates (currently we always blit 32K).
    //
    // The asm prototype takes non-const pointers while src is const;
    // the casts make that qualifier drop explicit instead of relying
    // on the compiler to let it through. The asm side is expected to
    // only read these tables -- TODO confirm.
    iigsBlitStageToShr((uint8_t *)src->scb, (uint16_t *)&src->palette[0][0]);
}
// Restores the NEWVIDEO, BORDER and SHADOW registers saved by halInit.
// Does nothing unless halInit has run since the last shutdown.
void halShutdown(void) {
    if (!gModeSet) {
        return;
    }

    // Same write order as always: video mode, border, then shadow.
    *IIGS_NEWVIDEO_REG = gPreviousNewVideo;
    *IIGS_BORDER_REG   = gPreviousBorder;
    *IIGS_SHADOW_REG   = gPreviousShadow;

    gModeSet = false;
}
// halFastSurfaceClear / halFastDrawLine / halFastDrawCircle /
// halFastFillCircle / halFastTileCopy / halFastTileCopyMasked /
// halFastTilePaste / halFastTileSnap / halFastTileFill /
// halFastBlitRect / halFastFloodWalk[AndScans] /
// halFastFloodScanRow / halFastFloodScanAndPush / halFastFillRect
// all dispatch via macros in core/hal.h on IIgs
// (#ifdef JOEYLIB_PLATFORM_IIGS block); none of them is defined as a
// function in this file.
//
// History: halFastFillRect used to remain a real C wrapper here. Its
// partial-byte (nibble-edge) handling was considered too gnarly for a
// macro, and the wrapper was also kept as load-bearing _ROOT mass to
// defeat ORCA-Linker bank fragility. Since the CORESYS migration
// drained _ROOT, the macro form is safe and the wrapper is gone.
// The stage's pixel buffer is the fixed region IIGS_STAGE_PIXELS;
// nothing is allocated.
uint8_t *halStageAllocPixels(void) {
    uint8_t *stagePixels = IIGS_STAGE_PIXELS;

    return stagePixels;
}
// Counterpart of halStageAllocPixels. The backing memory is
// hardware-pinned, so there is nothing to release; pixels is ignored.
void halStageFreePixels(uint8_t *pixels) {
    (void)pixels;
}
// IIgs is chunky-native: portData is unused. The chunky `pixels`
// buffer at $01:2000 is the stage's pixel storage and the source for
// stagePresent's PEI-slam to $E1.
// No port-specific surface data on this platform: both arguments are
// ignored and the result is always NULL.
void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
    (void)s; (void)isStage;
    return NULL;
}
// Nothing was allocated by halSurfaceAllocPortData, so nothing is
// released here; all arguments are ignored.
void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
    (void)s; (void)isStage; (void)portData;
}
// IIgs SHR is chunky-native; no bitplanes to update.
// Planar hook, intentionally empty on this port; every argument is
// ignored.
void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y,
                       uint16_t w, uint16_t h, uint8_t colorIndex) {
    (void)s; (void)x; (void)y;
    (void)w; (void)h; (void)colorIndex;
}
// Planar hook, intentionally empty on this port.
void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
    (void)dst; (void)src;
}
// Planar hook, intentionally empty on this port.
void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
    (void)s;
    (void)bx;
    (void)by;
    (void)colorIndex;
}
// Planar hook, intentionally empty on this port.
void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
                       const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
    (void)dst;
    (void)dstBx;
    (void)dstBy;
    (void)src;
    (void)srcBx;
    (void)srcBy;
}
// Planar hook, intentionally empty on this port.
void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
                             const SurfaceT *src, uint8_t srcBx, uint8_t srcBy,
                             uint8_t transparentIndex) {
    (void)dst;
    (void)dstBx;
    (void)dstBy;
    (void)src;
    (void)srcBx;
    (void)srcBy;
    (void)transparentIndex;
}
// Planar hook, intentionally empty on this port.
void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by,
                        const uint8_t *chunkyTile) {
    (void)dst;
    (void)bx;
    (void)by;
    (void)chunkyTile;
}
// Planar hook, intentionally empty on this port; chunkyTileOut is not
// written.
void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by,
                       uint8_t *chunkyTileOut) {
    (void)src;
    (void)bx;
    (void)by;
    (void)chunkyTileOut;
}
// Planar hook, intentionally empty on this port.
void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    (void)s;
    (void)sp;
    (void)x;
    (void)y;
}
// Planar hook, intentionally empty on this port.
void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y,
                       const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0,
                       int16_t copyW, int16_t copyH, int16_t srcRowBytes,
                       uint16_t transparent) {
    (void)dst;
    (void)x;
    (void)y;
    (void)srcBytes;
    (void)srcX0;
    (void)srcY0;
    (void)copyW;
    (void)copyH;
    (void)srcRowBytes;
    (void)transparent;
}
// Planar hook, intentionally empty on this port; dstPlaneBytes is not
// written.
void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y,
                         uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)dstPlaneBytes;
}
// Planar hook, intentionally empty on this port.
void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y,
                            uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
    (void)s;
    (void)x;
    (void)y;
    (void)w;
    (void)h;
    (void)srcPlaneBytes;
}
/* Phase 9 chunky reader hooks: IIgs reads from s->pixels just like
* the legacy paths did. Same logic as the DOS port. */
// Returns the 4-bit color index at (x, y) from s->pixels. Two pixels
// share each byte: even x is the high nibble, odd x the low nibble.
// No bounds checking is done here; the caller must pass on-surface
// coordinates.
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    uint8_t packed = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];

    return (x & 1) ? (uint8_t)(packed & 0x0Fu)
                   : (uint8_t)((packed & 0xF0u) >> 4);
}
// Computes a 32-bit hash over a surface's pixels, SCB table and
// palette table, in that order, by feeding every byte through
// SURFACE_HASH_MIX_BYTE (defined outside this file; presumably it
// updates the lo/hi accumulators in place).
//
// s: surface to hash; must be non-NULL with valid pixels.
// Returns hi in the upper 16 bits and lo in the lower 16 bits.
uint32_t halSurfaceHash(const SurfaceT *s) {
// lo / hi are the two 16-bit accumulators, seeded with fixed constants.
uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
const uint8_t *p;
const uint16_t *w;
uint8_t b;
p = s->pixels;
// Pixels are consumed 8 bytes per iteration (manually unrolled).
// The do/while runs at least once, so SURFACE_PIXELS_SIZE is assumed
// to be a nonzero multiple of 8 -- TODO confirm against its definition.
blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
do {
// Each byte is loaded into b first so the macro never sees `*p++`
// as an argument.
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
blocks--;
} while (blocks > 0u);
// SCB table: SURFACE_HEIGHT bytes.
p = s->scb;
for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
// Palette table: each 16-bit entry is mixed high byte first, then
// low byte.
w = &s->palette[0][0];
for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
v = *w++;
b = (uint8_t)((v >> 8) & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
b = (uint8_t)(v & 0xFFu); SURFACE_HASH_MIX_BYTE(lo, hi, b);
}
return ((uint32_t)hi << 16) | (uint32_t)lo;
}
// Copies SURFACE_PIXELS_SIZE bytes of pixel data from src to dst.
// Only pixels are copied here. The buffers must not overlap (memcpy).
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    uint8_t       *to   = dst->pixels;
    const uint8_t *from = src->pixels;

    memcpy(to, from, SURFACE_PIXELS_SIZE);
}
// Reads SURFACE_PIXELS_SIZE bytes from fp into dst->pixels.
// Returns true only when the full pixel buffer was read; on a short
// read dst->pixels may be partially overwritten.
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    size_t bytesRead;

    bytesRead = fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp);
    return bytesRead == SURFACE_PIXELS_SIZE;
}
// Writes SURFACE_PIXELS_SIZE bytes from src->pixels to fp.
// Returns true only when the full pixel buffer was written.
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    size_t bytesWritten;

    bytesWritten = fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp);
    return bytesWritten == SURFACE_PIXELS_SIZE;
}
// Allocates a zero-filled pixel buffer of SURFACE_PIXELS_SIZE bytes on
// the heap. Returns NULL if the allocation fails; release the buffer
// with halSurfaceFreePixels.
uint8_t *halSurfaceAllocPixels(void) {
    void *block = calloc(1, SURFACE_PIXELS_SIZE);

    return (uint8_t *)block;
}
// Releases a buffer obtained from halSurfaceAllocPixels. NULL is
// accepted (free(NULL) is a no-op).
void halSurfaceFreePixels(uint8_t *pixels) {
    void *block = pixels;

    free(block);
}
// This port has no bitplanes: both arguments are ignored and the
// result is always NULL.
uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)s;
    (void)planeIdx;

    return NULL;
}
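// $C019 RDVBLBAR: this code treats bit 7 = 0 as "in vertical blank"
// and bit 7 = 1 as "active scan". To return exactly once per VBL, it
// first spins while VBL is already in progress (bit 7 = 0), then
// spins until the next VBL begins (bit 7 returns to 0). The field
// rate is 60 Hz on NTSC and 50 Hz on PAL (see halFrameHz).
// NOTE(review): Apple documents the IIgs sense of this bit as the
// reverse of the IIe (bit 7 = 1 during VBL on the IIgs). If that
// holds here, this routine returns at the END of vertical blank --
// confirm on hardware and in the emulator.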
// Busy-waits on bit 7 of IIGS_VBL_STATUS in two phases: first until the
// bit reads set, then until it reads clear again. Returns on the
// set-to-clear transition.
void halWaitVBL(void) {
    // Phase 1: if the bit is currently clear, wait for it to become set.
    for (;;) {
        if ((*IIGS_VBL_STATUS & VBL_BAR_BIT) != 0) {
            break;
        }
    }

    // Phase 2: wait for the bit to drop back to clear.
    for (;;) {
        if ((*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0) {
            break;
        }
    }
}
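// Frame counter: returns the low 16 bits of the system tick counter
// via iigsGetTickWord (the GetTick wrapper declared near the top of
// this file), so the value wraps at 65536 and advances at the video
// field rate.
//
// History: an earlier version counted frames via $C019 polling,
// edge-detected on each call, on the assumption that the caller
// (UBER, animation loops) polled fast enough never to miss a VBL
// transition. That scheme had no IRQ involvement and was chosen as
// safe in the S16 takeover context where ToolBox interrupt setup
// would be intrusive.
//
// Its gFrameCount used an explicit lda+adc+sta read-modify-write
// rather than `gFrameCount++` because ORCA-C lowers the
// post-increment to `inc |gFrameCount` (the only INC abs form on
// 65816 -- there is no INC long-abs). With this file in the DRAWPRIMS
// load segment but halFrameCount called from CORESYS via JSL, DBR
// isn't pointing at DRAWPRIMS's data bank, so the abs INC silently
// mutates the wrong byte and the counter never advances. The explicit
// lda > / sta > pattern uses long-mode addressing throughout, which
// is DBR-independent. The same hazard applies to any future
// read-modify-write on a file-scope variable in this TU.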
// Returns the current 16-bit tick value from iigsGetTickWord.
uint16_t halFrameCount(void) {
    uint16_t ticks;

    ticks = iigsGetTickWord();
    return ticks;
}
// Returns the cached field rate in Hz: 60 until halInit runs, then 50
// or 60 as chosen there.
uint16_t halFrameHz(void) {
    uint16_t hz = gFrameHz;

    return hz;
}