322 lines
14 KiB
C
322 lines
14 KiB
C
// Apple IIgs HAL: enable SHR, write pixels / SCBs / palettes into the
|
|
// $E1 bank at the stock addresses the shifter reads from.
|
|
//
|
|
// Memory map in bank $E1:
|
|
// $2000 - $9CFF pixel data (32000 bytes, 160 bytes per scanline)
|
|
// $9D00 - $9DC7 SCB bytes (200 used)
|
|
// $9E00 - $9FFF 16 palettes x 16 colors x 2 bytes, $0RGB
|
|
//
|
|
// NEWVIDEO register at $00C029 controls SHR enable. Bit 7 turns SHR on.
|
|
// ORCA/C must be built with 32-bit pointer mode (-w or equivalent) so
|
|
// that the long addresses resolve to bank $E1.
|
|
//
|
|
// DIRTY-WALK + PEI-SLAM PRESENT
|
|
// -----------------------------
|
|
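// halPresent walks the per-row dirty bands maintained by drawing
// primitives in src/core/*.c. Fully-dirty rows go through the PEI
// slam in src/port/iigs/peislam.asm (~530 cyc/row, ~55% faster than
// memcpy/MVN); partial-dirty rows use memcpy, which ORCA-C lowers
// to MVN (7 cyc/byte) -- the fastest 65816 way to move bytes into
// bank $E1 when the dirty band is too narrow to amortize the slam's
// per-call AUXWRITE/RAMRD/shadow toggle.
//
// NOTE(review): the C body of halPresent in this file performs no
// dirty walk of its own; it forwards straight to the asm routine
// iigsBlitStageToShr. Confirm whether the paragraph above describes
// what that asm routine does internally or is left over from an
// earlier C implementation.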
|
|
//
|
|
// peislam.asm declares its load segment as DRAWPRIMS so the linker
|
|
// places it in its own bank, separate from AUDIO's _ROOT (where
|
|
// audio_full.c + Memory Manager + stdio + NTPstreamsound already
|
|
// crowd up against the 64 KB-per-bank limit).
|
|
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
|
|
#include "joey/debug.h"
|
|
#include "hal.h"
|
|
#include "surfaceInternal.h"
|
|
|
|
// hal.c is the single TU that calls into joeyDraw.asm. Cross-
|
|
// platform draw.c / tile.c / etc. dispatch through halFast*
|
|
// functions defined here; they never reference the asm symbols
|
|
// directly. This avoids the cumulative ORCA-Linker-Expression-
|
|
// too-complex-in-13/SysLib failure that hit when each cross-
|
|
// platform TU brought its own asm extern.
|
|
JOEYLIB_SEGMENT("DRAWPRIMS")

// ---------------------------------------------------------------
// Prototypes for the asm primitives. The per-routine notes below
// restate the contracts recorded by the asm author; the asm itself
// is not visible from this file.
// ---------------------------------------------------------------

// Whole-surface clear: 32 KB stack-slam fill via AUXWRITE
// (~25 ms for a full screen).
extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord);

// Rectangle fill. The helper covers the partial leading byte, the
// MVN-filled middle, and the partial trailing byte. halFastFillRect
// is the C-side caller.
extern void iigsFillRectInner(uint8_t *pixels,
                              uint16_t x, uint16_t y,
                              uint16_t w, uint16_t h,
                              uint16_t nibble);

// Tile fill: 16 `STA abs,X` stores at fixed offsets along a
// 160-byte stride, ~120 cycles per call.
extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord);

// Tile copy / paste / snap inner loops. Every pointer is a 4-byte
// large-model pointer, and dst and src may sit in different banks
// (heap surface vs stage). Row strides, in bytes:
//   tileCopyInner / tileCopyMaskedInner : dst 160, src 160
//   tilePasteInner                      : dst 160, src 4
//   tileSnapInner                       : dst 4,   src 160
extern void iigsTileCopyInner(uint8_t *dstRow0, const uint8_t *srcRow0);
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0,
                                    const uint8_t *srcRow0,
                                    uint16_t transparent);
extern void iigsTilePasteInner(uint8_t *dstRow0, const uint8_t *srcTilePixels);
extern void iigsTileSnapInner(uint8_t *dstTilePixels, const uint8_t *srcRow0);

// Single-pixel plot and Bresenham line. The line routine expects
// endpoints the caller has already validated against the surface
// bounds; its loop performs no per-pixel clipping.
extern void iigsDrawPixelInner(uint8_t *pixels,
                               uint16_t x, uint16_t y,
                               uint16_t nibble);
extern void iigsDrawLineInner(uint8_t *pixels,
                              uint16_t x0, uint16_t y0,
                              uint16_t x1, uint16_t y1,
                              uint16_t nibble);

// Bresenham midpoint circle outline. The caller must have verified
// that the whole bounding box is on-surface; no per-pixel clip.
extern void iigsDrawCircleInner(uint8_t *pixels,
                                uint16_t cx, uint16_t cy,
                                uint16_t r,
                                uint16_t nibble);

// Full stage-to-SHR upload: pixels (MVN $01->$E1), then SCB and
// palette. After the MVN the asm has DBR=$E1 and uses `sta abs,Y`
// for the SCB / palette stores. This replaces ORCA-C's memcpy path,
// which silently failed when called from halPresent (a DBR-state
// quirk following earlier asm primitives).
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);

// Flood-fill walk results. iigsFloodWalkAndScansInner writes them;
// halFastFloodWalkAndScans reads them back.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;

// Per-pixel rectangle blit, src -> dst. transparent == $FFFF means
// opaque (every pixel is copied); otherwise source pixels whose
// nibble equals (transparent & $0F) are skipped. The destination
// stride is hardcoded to 160 (SURFACE_BYTES_PER_ROW).
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX,
                              const uint8_t *srcRow0, uint16_t srcX,
                              uint16_t copyW, uint16_t copyH,
                              uint16_t srcRowBytes,
                              uint16_t transparent);

// Flood-fill worker, one call per popped seed: seed test, walk-left,
// walk-right, scan-above, scan-below and push, all sharing the
// cached row address and match decoders. Results land in
// gFloodSeedMatch / gFloodLeftX / gFloodRightX.
extern void iigsFloodWalkAndScansInner(uint8_t *pixels,
                                       uint16_t x, uint16_t y,
                                       uint16_t matchColor,
                                       uint16_t newColor,
                                       uint16_t matchEqual,
                                       int16_t *stackX,
                                       int16_t *stackY,
                                       uint16_t *spInOut,
                                       uint16_t maxSp);

// One-shot setup of the y*160 lookup table (gRowOffsetLut, 400
// bytes in DRAWPRIMS data). halInit calls it once; afterwards any
// asm primitive needing a row offset can use `lda >lut,x` in place
// of the 7-instruction shift-add.
extern void iigsInitRowLut(void);

// Per-row MVN blit from $01:srcOffset to $E1:srcOffset, used for
// partial-screen presents (halPresentRect). srcOffset is the offset
// within bank $01 of the first byte on the first row; each further
// row starts 160 bytes later. ~9 cyc/byte against ~30 cyc/byte for
// ORCA-C's memcpy.
extern void iigsBlitRectStageToShr(uint16_t srcOffset,
                                   uint16_t copyBytes,
                                   uint16_t rowsLeft);

// Scanline-style filled circle. The low byte of fillWord is the
// doubled nibble (e.g. 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels,
                                uint16_t cx, uint16_t cy,
                                uint16_t r,
                                uint16_t fillWord);
|
|
|
|
// ----- Hardware addresses (24-bit / long pointers) -----

// Soft-switch / status registers.
#define IIGS_NEWVIDEO_REG       ((volatile uint8_t *)0x00C029L)
#define IIGS_BORDER_REG         ((volatile uint8_t *)0x00C034L)
#define IIGS_SHADOW_REG         ((volatile uint8_t *)0x00C035L)
#define IIGS_VBL_STATUS         ((volatile uint8_t *)0x00C019L)

// SHR display memory in bank $E1: pixels, SCBs, palettes.
#define IIGS_SHR_PIXELS         ((uint8_t *)0xE12000L)
#define IIGS_SHR_SCB            ((uint8_t *)0xE19D00L)
#define IIGS_SHR_PALETTE        ((uint16_t *)0xE19E00L)

// Off-screen stage at $01/2000: the same in-bank offset as the SHR
// framebuffer at $E1/2000, but located in the fast (2.8 MHz) bank.
// Because SHR shadowing is inhibited through $C035, writes to the
// stage are NOT mirrored into $E1 automatically. Drawing therefore
// runs at full speed and stays invisible until the next
// stagePresent.
#define IIGS_STAGE_PIXELS       ((uint8_t *)0x012000L)

// Bit 7 of the VBL status byte read by halWaitVBL.
#define VBL_BAR_BIT             0x80

// NEWVIDEO bit masks.
#define NEWVIDEO_SHR_ON         0x80
#define NEWVIDEO_LINEARIZE      0x40
// Bit 0: the IIgs Hardware Reference documents it as
// reserved-must-be-1 for forward compatibility. Real silicon does
// not care, but GSplus halts on writes that leave it clear (see its
// moremem.c c029 handler) and raises its "Code: RED" status, so
// every write must include this bit.
#define NEWVIDEO_RESERVED_BIT   0x01

// $C035 SHADOW register: a set bit means shadowing is INHIBITED for
// that range.
//   Bit 1 = hi-res page 1 ($02000-$03FFF in bank $01)
//   Bit 2 = hi-res page 2 ($04000-$05FFF in bank $01)
//   Bit 3 = SHR           ($02000-$09FFF in bank $01)
// All three are set: the SHR pixel range overlaps both hi-res pages,
// so any shadow left enabled would silently mirror part of the stage
// into $E1.
#define SHADOW_INHIBIT_SHR_MASK 0x0E

// $C034 BORDER register: the high nibble holds beep/IRQ enables and
// must be preserved; the low nibble is the border color index
// (0..15). Color 0 is the all-zero palette entry by IIgs convention.
// halInit ANDs the register with this mask, which forces the low
// nibble to 0 so the visible bezel matches the cleared SHR
// background.
#define BORDER_COLOR_MASK       0xF0
|
|
|
|
// ----- Module state -----

// Register values captured by halInit and written back by
// halShutdown; gModeSet records whether there is anything to restore.
static uint8_t gPreviousNewVideo = 0;
static uint8_t gPreviousBorder   = 0;
static uint8_t gPreviousShadow   = 0;
static bool    gModeSet          = false;

// Whether the SCB / palette need uploading is decided by
// gStageScbDirty / gStagePaletteDirty (core/surface.c). The earlier
// scheme -- a per-frame memcmp against a cached copy -- cost about
// 7 ms per frame on ORCA-C.

// Scratch words for the PEI slam. They are file-scope and non-static
// so the asm can `ext` them. Inside the slam every access uses
// long-mode `>` addressing, which bypasses the //e RAMRD redirect the
// slam enables while it runs.
volatile uint16_t gPeiOrigSp;
volatile uint8_t  gPeiOrigShadow;
volatile uint16_t gPeiTempRowBase;
volatile uint16_t gPeiCurRow;    // row counter kept across the slam (the stack is hijacked)
volatile uint16_t gPeiChunkRow;  // in-chunk row counter kept across the slam (Y register storage)
|
|
|
|
// peislam.asm's per-row peiSlamFullRow helper is no longer wired in;
|
|
// the present pipeline now does its own PEI-slam loop inside
|
|
// iigsBlitStageToShr above (with dirty-row skip).
|
|
|
|
|
|
|
|
// Upload SCB / palette into bank-$E1 SHR memory only when the
|
|
// matching dirty flag is set. Replaces a per-frame 712-byte memcmp
|
|
// pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check.
|
|
// gStageScbDirty / gStagePaletteDirty live in core/surface.c; they
|
|
// start true (forces the very first present to upload), get set true
|
|
// again whenever scbSet* / paletteSet mutate the stage's data, and
|
|
// get cleared here after upload.
|
|
static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) {
|
|
if (gStageScbDirty) {
|
|
memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT);
|
|
gStageScbDirty = false;
|
|
}
|
|
if (gStagePaletteDirty) {
|
|
memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette));
|
|
gStagePaletteDirty = false;
|
|
}
|
|
}
|
|
|
|
|
|
// ----- HAL API (alphabetical) -----
|
|
|
|
bool halInit(const JoeyConfigT *config) {
|
|
(void)config;
|
|
gPreviousNewVideo = *IIGS_NEWVIDEO_REG;
|
|
gPreviousBorder = *IIGS_BORDER_REG;
|
|
gPreviousShadow = *IIGS_SHADOW_REG;
|
|
*IIGS_NEWVIDEO_REG = (uint8_t)(NEWVIDEO_SHR_ON | NEWVIDEO_LINEARIZE | NEWVIDEO_RESERVED_BIT);
|
|
*IIGS_BORDER_REG = (uint8_t)(gPreviousBorder & BORDER_COLOR_MASK);
|
|
// Inhibit shadowing of the stage region. Without this, every
|
|
// write to $01/2000-9FFF mirrors to $E1 and the off-screen-buffer
|
|
// illusion breaks (the user would see drawing in progress).
|
|
*IIGS_SHADOW_REG = (uint8_t)(gPreviousShadow | SHADOW_INHIBIT_SHR_MASK);
|
|
// SCB and palette are uploaded by halPresent's iigsBlitStageToShr
|
|
// (asm path, MVN to bank $E1). C-side memset/memcpy to bank $E1
|
|
// is unreliable from halInit's calling context, so we don't try
|
|
// it here -- the first present will set up SCB to 320 mode.
|
|
iigsInitRowLut();
|
|
gModeSet = true;
|
|
return true;
|
|
}
|
|
|
|
|
|
// Report the most recent HAL error message. This port keeps no error
// text, so the answer is always NULL.
const char *halLastError(void) {
    const char *noError = NULL;

    return noError;
}
|
|
|
|
|
|
void halPresent(const SurfaceT *src) {
|
|
if (src == NULL) {
|
|
return;
|
|
}
|
|
// iigsBlitStageToShr does pixels (MVN $01->$E1) + SCB + palette
|
|
// upload entirely in asm via DBR=$E1 + sta abs,Y indexed stores.
|
|
// ORCA-C's C-side memcpy to bank $E1 has been unreliable from
|
|
// halPresent's calling context, so we route everything through
|
|
// the asm path. Future: re-introduce per-row dirty-band logic
|
|
// for partial-screen updates (currently we always blit 32K).
|
|
iigsBlitStageToShr(src->scb, &src->palette[0][0]);
|
|
}
|
|
|
|
|
|
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
|
|
uint16_t copyBytes;
|
|
int16_t byteStart;
|
|
uint16_t srcOffset;
|
|
|
|
if (src == NULL) {
|
|
return;
|
|
}
|
|
|
|
uploadScbAndPaletteIfNeeded(src);
|
|
|
|
// Pixel copy: byte-aligned runs per scanline. x is always even
|
|
// after API-level clipping for 4bpp packed if caller aligned it;
|
|
// otherwise we include the byte containing the leftmost pixel.
|
|
byteStart = x >> 1;
|
|
copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart);
|
|
|
|
if (copyBytes == 0 || h == 0) {
|
|
return;
|
|
}
|
|
|
|
// Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
|
|
// at $E1:2000 (same offset within their banks). srcOffset is the
|
|
// byte offset of the first byte to copy on the first row.
|
|
srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart);
|
|
iigsBlitRectStageToShr(srcOffset, copyBytes, h);
|
|
}
|
|
|
|
|
|
void halShutdown(void) {
|
|
if (gModeSet) {
|
|
*IIGS_NEWVIDEO_REG = gPreviousNewVideo;
|
|
*IIGS_BORDER_REG = gPreviousBorder;
|
|
*IIGS_SHADOW_REG = gPreviousShadow;
|
|
gModeSet = false;
|
|
}
|
|
}
|
|
|
|
|
|
// halFastSurfaceClear / halFastDrawLine / halFastDrawCircle /
|
|
// halFastFillCircle / halFastTileCopy / halFastTileCopyMasked /
|
|
// halFastTilePaste / halFastTileSnap / halFastTileFill /
|
|
// halFastBlitRect / halFastFloodWalk[AndScans] /
|
|
// halFastFloodScanRow / halFastFloodScanAndPush all dispatch via
|
|
// macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block).
|
|
// Only halFastFillRect remains a real C function below. Its nibble-edge
// handling now lives in the asm helper; the reason it is still not
// macro-dispatched is linker-related and is given in its header comment.
|
|
|
|
|
|
// halFastFillRect: thin wrapper around iigsFillRectInner. The asm
|
|
// helper now handles the partial-byte (nibble-edge) logic that used
|
|
// to live here, so this function is just a stage-check + forward.
|
|
// (It's not macro-dispatched like the others because removing it
|
|
// from the C side triggers an unrelated ORCA-linker bank-placement
|
|
// failure -- the binary needs enough mass in _ROOT to keep sprite
|
|
// codegen's static symbols at addresses the linker can resolve.)
|
|
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
|
|
if (s == NULL || s != stageGet()) {
|
|
return false;
|
|
}
|
|
iigsFillRectInner(s->pixels,
|
|
(uint16_t)x, (uint16_t)y,
|
|
(uint16_t)w, (uint16_t)h,
|
|
(uint16_t)(colorIndex & 0x0F));
|
|
return true;
|
|
}
|
|
|
|
|
|
uint8_t *halStageAllocPixels(void) {
|
|
return IIGS_STAGE_PIXELS;
|
|
}
|
|
|
|
|
|
// Counterpart of halStageAllocPixels. The stage's backing memory is
// hardware-pinned, so there is nothing to release; the argument is
// accepted and ignored.
void halStageFreePixels(uint8_t *pixels) {
    (void)pixels;
}
|
|
|
|
|
|
// $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active
|
|
// scan. To produce a rising-edge wait (one VBL per call), first spin
|
|
// while VBL is currently active (bit 7 = 0), then spin until VBL
|
|
// fires again (bit 7 returns to 0). The IIgs SHR refresh is 60 Hz.
|
|
void halWaitVBL(void) {
|
|
while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0) {
|
|
/* already in VBL: wait for active scan */;
|
|
}
|
|
while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) != 0) {
|
|
/* scanning: wait for next VBL */;
|
|
}
|
|
}
|