joeylib2/src/port/iigs/hal.c

322 lines
14 KiB
C

// Apple IIgs HAL: enable SHR, write pixels / SCBs / palettes into the
// $E1 bank at the stock addresses the shifter reads from.
//
// Memory map in bank $E1:
// $2000 - $9CFF pixel data (32000 bytes, 160 bytes per scanline)
// $9D00 - $9DC7 SCB bytes (200 used)
// $9E00 - $9FFF 16 palettes x 16 colors x 2 bytes, $0RGB
//
// NEWVIDEO register at $00C029 controls SHR enable. Bit 7 turns SHR on.
// ORCA/C must be built with 32-bit pointer mode (-w or equivalent) so
// that the long addresses resolve to bank $E1.
//
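// DIRTY-WALK + PEI-SLAM PRESENT
// -----------------------------
// NOTE: the paragraph below describes an earlier present pipeline and
// is kept for its timing data. In the current C code halPresent hands
// the whole frame to the asm routine iigsBlitStageToShr, and
// halPresentRect hands a byte-aligned rectangle to
// iigsBlitRectStageToShr; neither walks dirty bands from C.
//
// halPresent walks the per-row dirty bands maintained by drawing
// primitives in src/core/*.c. Fully-dirty rows go through the PEI
// slam in src/port/iigs/peislam.asm (~530 cyc/row, ~55% faster than
// memcpy/MVN); partial-dirty rows use memcpy, which ORCA-C lowers
// to MVN (7 cyc/byte) -- the fastest 65816 way to move bytes into
// bank $E1 when the dirty band is too narrow to amortize the slam's
// per-call AUXWRITE/RAMRD/shadow toggle.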
//
// peislam.asm declares its load segment as DRAWPRIMS so the linker
// places it in its own bank, separate from AUDIO's _ROOT (where
// audio_full.c + Memory Manager + stdio + NTPstreamsound already
// crowd up against the 64 KB-per-bank limit).
#include <stddef.h>
#include <string.h>
#include "joey/debug.h"
#include "hal.h"
#include "surfaceInternal.h"
// hal.c is the single TU that calls into joeyDraw.asm. Cross-
// platform draw.c / tile.c / etc. dispatch through halFast*
// functions defined here; they never reference the asm symbols
// directly. This avoids the cumulative ORCA-Linker-Expression-
// too-complex-in-13/SysLib failure that hit when each cross-
// platform TU brought its own asm extern.
// Segment directive for the definitions that follow. JOEYLIB_SEGMENT
// is defined outside this file -- presumably it selects the ORCA load
// segment named by its argument; confirm against the macro.
JOEYLIB_SEGMENT("DRAWPRIMS")
// ----- Assembly entry points -----
// Everything declared `extern` from here down to the hardware-address
// section is defined outside this file. In the C code visible here,
// only iigsFillRectInner, iigsBlitStageToShr, iigsBlitRectStageToShr
// and iigsInitRowLut are called directly; the comment ahead of
// halFastFillRect says the remaining primitives are reached through
// macros in core/hal.h.
// 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen.
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);
// Full-fill asm helper (partial leading byte + middle MVN + partial
// trailing byte). Called by halFastFillRect below.
extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble);
// 16 STA abs,X stores at fixed offsets along a 160-byte stride.
// ~120 cyc per call.
extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord);
// Tile copy / paste / snap inner loops. All take 4-byte large-
// model pointers; bank may differ between dst and src (heap
// surface vs stage). Stride contracts:
// tileCopyInner / tileCopyMaskedInner: dst 160, src 160
// tilePasteInner: dst 160, src 4
// tileSnapInner: dst 4, src 160
extern void iigsTileCopyInner(uint8_t *dstRow0, const uint8_t *srcRow0);
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent);
extern void iigsTilePasteInner(uint8_t *dstRow0, const uint8_t *srcTilePixels);
extern void iigsTileSnapInner(uint8_t *dstTilePixels, const uint8_t *srcRow0);
// Single-pixel and Bresenham line plot. drawLine inner takes
// pre-clipped endpoints (caller validates against surface bounds);
// it does no per-pixel clipping in the loop.
extern void iigsDrawPixelInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble);
extern void iigsDrawLineInner(uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble);
// Bresenham midpoint circle outline. Caller has verified the entire
// bbox is on-surface so no per-pixel clip.
extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble);
// Stage-to-SHR full upload: pixels (MVN $01->$E1), SCB, palette.
// Asm uses post-MVN DBR=$E1 to do sta abs,Y for SCB/palette.
// Replaces ORCA-C's memcpy path which silently fails when called
// from halPresent (DBR-state quirk after prior asm primitives).
// Both pointer parameters are declared non-const; halPresent passes
// members of a const surface to them.
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);
// floodFill walk results: written by iigsFloodWalkAndScansInner,
// read back by halFastFloodWalkAndScans.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;
// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque
// (always copy); else pixels with src nibble == (transparent & $0F)
// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW).
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);
// Single-call per-popped-seed worker: seed test + walk-left + walk-right
// + scan-above + scan-below + push, all sharing cached row addr and
// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX.
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);
// One-shot init for the y*160 lookup table (gRowOffsetLut, 400 bytes
// in DRAWPRIMS data). Called once from halInit. After this returns,
// every asm primitive that needs row offset can do `lda >lut,x` instead
// of the 7-instruction shift-add.
extern void iigsInitRowLut(void);
// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial-
// screen presents (halPresentRect). srcOffset is the byte offset
// within bank $01 of the FIRST byte to copy on the FIRST row;
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
// ORCA-C memcpy's ~30 cyc/byte.
// halPresentRect passes its row count as rowsLeft and never calls
// this with copyBytes == 0 or rowsLeft == 0.
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);
// Filled circle, scanline-style. fillWord low byte is the doubled
// nibble (e.g., 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);
// ----- Hardware addresses (24-bit / long pointers) -----
// The four soft-switch registers are volatile-qualified so each C
// access is a real bus access; halInit / halShutdown / halWaitVBL are
// the only C code visible here that touches them.
#define IIGS_NEWVIDEO_REG ((volatile uint8_t *)0x00C029L)
#define IIGS_BORDER_REG ((volatile uint8_t *)0x00C034L)
#define IIGS_SHADOW_REG ((volatile uint8_t *)0x00C035L)
#define IIGS_VBL_STATUS ((volatile uint8_t *)0x00C019L)
// Bank-$E1 destinations matching the memory map at the top of the
// file (pixels at $2000, SCBs at $9D00, palettes at $9E00).
#define IIGS_SHR_PIXELS ((uint8_t *)0xE12000L)
#define IIGS_SHR_SCB ((uint8_t *)0xE19D00L)
#define IIGS_SHR_PALETTE ((uint16_t *)0xE19E00L)
// The stage lives at $01/2000 -- the same offset as the SHR display
// framebuffer at $E1/2000, but in the fast (2.8 MHz) bank. With SHR
// shadow inhibited at $C035, writes here are NOT auto-mirrored to
// $E1, so drawing is full-speed and isolated from the displayed
// frame until the next stagePresent.
#define IIGS_STAGE_PIXELS ((uint8_t *)0x012000L)
// Bit 7 of the byte read from IIGS_VBL_STATUS; polled by halWaitVBL.
#define VBL_BAR_BIT 0x80
// NEWVIDEO bit masks
#define NEWVIDEO_SHR_ON 0x80
#define NEWVIDEO_LINEARIZE 0x40
// Bit 0 is documented as reserved-must-be-1 in the IIgs Hardware
// Reference for forward compatibility. Real silicon doesn't care, but
// GSplus halts on writes that leave it clear (see moremem.c c029
// handler) and bumps its "Code: RED" status. Always include this bit.
#define NEWVIDEO_RESERVED_BIT 0x01
// $C035 SHADOW register: bit set = shadow INHIBITED for that range.
// Bit 1 = hi-res page 1 ($02000-$03FFF in bank $01)
// Bit 2 = hi-res page 2 ($04000-$05FFF in bank $01)
// Bit 3 = SHR ($02000-$09FFF in bank $01)
// We set 1+2+3 because the SHR pixel range overlaps both hi-res
// pages; leaving any of those shadows live would silently mirror
// part of the stage to $E1.
#define SHADOW_INHIBIT_SHR_MASK 0x0E
// $C034 BORDER register: high nibble = beep/IRQ enables (preserve),
// low nibble = border color index 0..15. Color 0 is the all-zero
// palette entry by IIgs convention; we force the low nibble to 0
// in halInit so the visible bezel matches the cleared SHR background.
// Despite its name, this mask selects the bits to KEEP (the high
// nibble): halInit ANDs the register value with it, which clears the
// color nibble.
#define BORDER_COLOR_MASK 0xF0
// ----- Module state -----
// Register values captured by halInit and written back by halShutdown.
static uint8_t gPreviousNewVideo = 0;
static uint8_t gPreviousBorder = 0;
static uint8_t gPreviousShadow = 0;
// Set by halInit, cleared by halShutdown; halShutdown only restores the
// registers while this is true.
static bool gModeSet = false;
// SCB / palette upload skipping is now driven by gStageScbDirty /
// gStagePaletteDirty (core/surface.c). The old per-frame memcmp-
// against-cached-copy approach was costing ~7 ms / frame on ORCA-C.
// PEI slam scratch. File-scope non-static so the asm can `ext` them;
// all accesses inside the slam use long-mode `>` addressing so they
// bypass the //e RAMRD redirect the slam turns on for its duration.
// None of these five variables is read or written by the C code
// visible in this file.
volatile uint16_t gPeiOrigSp;
volatile uint8_t gPeiOrigShadow;
volatile uint16_t gPeiTempRowBase;
volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked)
volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage)
// peislam.asm's per-row peiSlamFullRow helper is no longer wired in;
// the present pipeline now does its own PEI-slam loop inside
// iigsBlitStageToShr above (with dirty-row skip).
// Upload SCB / palette into bank-$E1 SHR memory only when the
// matching dirty flag is set. Replaces a per-frame 712-byte memcmp
// pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check.
// gStageScbDirty / gStagePaletteDirty live in core/surface.c; they
// start true (forces the very first present to upload), get set true
// again whenever scbSet* / paletteSet mutate the stage's data, and
// get cleared here after upload.
static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) {
    uint8_t  *shrScb     = IIGS_SHR_SCB;
    uint16_t *shrPalette = IIGS_SHR_PALETTE;

    // Scanline control bytes: SURFACE_HEIGHT bytes are copied, and only
    // when the stage's SCB data has been flagged as changed.
    if (gStageScbDirty) {
        memcpy(shrScb, src->scb, SURFACE_HEIGHT);
        gStageScbDirty = false;
    }

    // Palette block: the whole palette member is copied in one go, and
    // only when flagged as changed.
    if (gStagePaletteDirty) {
        memcpy(shrPalette, src->palette, sizeof src->palette);
        gStagePaletteDirty = false;
    }
}
// ----- HAL API (alphabetical) -----
// Switch the display to Super Hi-Res and isolate the stage buffer.
//
// config - accepted for interface compatibility; not used by this port.
// Returns true; no failure path exists in this function.
//
// The pre-existing NEWVIDEO / BORDER / SHADOW values are remembered so
// halShutdown can put them back. They are captured only while the mode
// is not already set: a repeated halInit without an intervening
// halShutdown would otherwise overwrite the saved values with the ones
// this function itself wrote, and the original machine state could no
// longer be restored.
bool halInit(const JoeyConfigT *config) {
    uint8_t newVideo;
    uint8_t border;
    uint8_t shadow;

    (void)config;

    // Read the live register values (same order on every call).
    newVideo = *IIGS_NEWVIDEO_REG;
    border   = *IIGS_BORDER_REG;
    shadow   = *IIGS_SHADOW_REG;

    if (!gModeSet) {
        gPreviousNewVideo = newVideo;
        gPreviousBorder   = border;
        gPreviousShadow   = shadow;
    }

    *IIGS_NEWVIDEO_REG = (uint8_t)(NEWVIDEO_SHR_ON | NEWVIDEO_LINEARIZE | NEWVIDEO_RESERVED_BIT);

    // Keep the high nibble of BORDER as found, force border color 0.
    *IIGS_BORDER_REG = (uint8_t)(border & BORDER_COLOR_MASK);

    // Inhibit shadowing of the stage region. Without this, every
    // write to $01/2000-9FFF mirrors to $E1 and the off-screen-buffer
    // illusion breaks (the user would see drawing in progress).
    *IIGS_SHADOW_REG = (uint8_t)(shadow | SHADOW_INHIBIT_SHR_MASK);

    // SCB and palette are uploaded by halPresent's iigsBlitStageToShr
    // (asm path, MVN to bank $E1). C-side memset/memcpy to bank $E1
    // is unreliable from halInit's calling context, so we don't try
    // it here -- the first present will set up SCB to 320 mode.
    iigsInitRowLut();

    gModeSet = true;
    return true;
}
const char *halLastError(void) {
    // This implementation never produces an error string; callers
    // always receive NULL.
    const char *message = NULL;

    return message;
}
void halPresent(const SurfaceT *src) {
if (src == NULL) {
return;
}
// iigsBlitStageToShr does pixels (MVN $01->$E1) + SCB + palette
// upload entirely in asm via DBR=$E1 + sta abs,Y indexed stores.
// ORCA-C's C-side memcpy to bank $E1 has been unreliable from
// halPresent's calling context, so we route everything through
// the asm path. Future: re-introduce per-row dirty-band logic
// for partial-screen updates (currently we always blit 32K).
iigsBlitStageToShr(src->scb, &src->palette[0][0]);
}
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
uint16_t copyBytes;
int16_t byteStart;
uint16_t srcOffset;
if (src == NULL) {
return;
}
uploadScbAndPaletteIfNeeded(src);
// Pixel copy: byte-aligned runs per scanline. x is always even
// after API-level clipping for 4bpp packed if caller aligned it;
// otherwise we include the byte containing the leftmost pixel.
byteStart = x >> 1;
copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart);
if (copyBytes == 0 || h == 0) {
return;
}
// Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
// at $E1:2000 (same offset within their banks). srcOffset is the
// byte offset of the first byte to copy on the first row.
srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart);
iigsBlitRectStageToShr(srcOffset, copyBytes, h);
}
void halShutdown(void) {
    // Nothing to undo unless halInit has put the mode in place.
    if (!gModeSet) {
        return;
    }

    // Write back the values halInit saved, in the same register order
    // they were captured.
    *IIGS_NEWVIDEO_REG = gPreviousNewVideo;
    *IIGS_BORDER_REG   = gPreviousBorder;
    *IIGS_SHADOW_REG   = gPreviousShadow;

    gModeSet = false;
}
// halFastSurfaceClear / halFastDrawLine / halFastDrawCircle /
// halFastFillCircle / halFastTileCopy / halFastTileCopyMasked /
// halFastTilePaste / halFastTileSnap / halFastTileFill /
// halFastBlitRect / halFastFloodWalk[AndScans] /
// halFastFloodScanRow / halFastFloodScanAndPush all dispatch via
// macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block).
// Only halFastFillRect remains a real C function below; the comment
// directly above its definition explains why.
// halFastFillRect: thin wrapper around iigsFillRectInner. The asm
// helper now handles the partial-byte (nibble-edge) logic that used
// to live here, so this function is just a stage-check + forward.
// (It's not macro-dispatched like the others because removing it
// from the C side triggers an unrelated ORCA-linker bank-placement
// failure -- the binary needs enough mass in _ROOT to keep sprite
// codegen's static symbols at addresses the linker can resolve.)
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    uint16_t nibble;

    // The asm fast path is only taken for the stage surface; any other
    // surface (or NULL) is reported back as "not handled".
    if (s == NULL) {
        return false;
    }
    if (s != stageGet()) {
        return false;
    }

    // Only the low four bits of the color index are forwarded.
    nibble = (uint16_t)(colorIndex & 0x0F);

    // x and y are reinterpreted as unsigned for the asm helper; no range
    // check is done here.
    iigsFillRectInner(s->pixels, (uint16_t)x, (uint16_t)y, w, h, nibble);
    return true;
}
uint8_t *halStageAllocPixels(void) {
    // No allocation takes place: the stage always sits at the fixed
    // address named by IIGS_STAGE_PIXELS.
    uint8_t *stagePixels = IIGS_STAGE_PIXELS;

    return stagePixels;
}
void halStageFreePixels(uint8_t *pixels) {
    // The stage's backing memory is hardware-pinned, so there is
    // nothing to release; the argument is accepted and ignored.
    (void)pixels;
}
// $C019 vertical-blank status. On the Apple IIgs bit 7 reads 1 while
// the display is in vertical blanking and 0 during active scan -- the
// opposite sense from the Apple IIe, whose register name (RDVBLBAR)
// this location inherits.
//
// To produce an edge-triggered wait (one VBL per call), first spin
// while a blanking interval is already in progress (bit 7 = 1), then
// spin through active scan until the next blanking interval begins
// (bit 7 returns to 1). The function therefore returns at the START of
// vertical blanking, leaving the blanking period available to the
// caller. The IIgs SHR refresh is 60 Hz.
void halWaitVBL(void) {
    while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) != 0) {
        /* already in VBL: wait for active scan */;
    }
    while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0) {
        /* scanning: wait for next VBL */;
    }
}