// Apple IIgs HAL: enable SHR, write pixels / SCBs / palettes into the // $E1 bank at the stock addresses the shifter reads from. // // Memory map in bank $E1: // $2000 - $9CFF pixel data (32000 bytes, 160 bytes per scanline) // $9D00 - $9DC7 SCB bytes (200 used) // $9E00 - $9FFF 16 palettes x 16 colors x 2 bytes, $0RGB // // NEWVIDEO register at $00C029 controls SHR enable. Bit 7 turns SHR on. // ORCA/C must be built with 32-bit pointer mode (-w or equivalent) so // that the long addresses resolve to bank $E1. // // DIRTY-WALK + PEI-SLAM PRESENT // ----------------------------- // halPresent walks the per-row dirty bands maintained by drawing // primitives in src/core/*.c. Fully-dirty rows go through the PEI // slam in src/port/iigs/peislam.asm (~530 cyc/row, ~55% faster than // memcpy/MVN); partial-dirty rows use memcpy, which ORCA-C lowers // to MVN (7 cyc/byte) -- the fastest 65816 way to move bytes into // bank $E1 when the dirty band is too narrow to amortize the slam's // per-call AUXWRITE/RAMRD/shadow toggle. // // peislam.asm declares its load segment as DRAWPRIMS so the linker // places it in its own bank, separate from AUDIO's _ROOT (where // audio_full.c + Memory Manager + stdio + NTPstreamsound already // crowd up against the 64 KB-per-bank limit). #include #include #include #include "joey/debug.h" #include "hal.h" #include "surfaceInternal.h" /* GetTick wrapper in peislam.asm: invokes the Misc Toolset GetTick * ($2503) and returns the low 16 bits of the system's tick counter * (firmware VBL ISR-driven). Polling $C019 from C user code missed * transitions for any op over ~1 ms; the system's tick counter is * updated by the actual interrupt handler so it stays accurate * regardless of caller polling rate. Tick rate matches the video * field rate -- 60 Hz on NTSC, 50 Hz on PAL. */ extern uint16_t iigsGetTickWord(void); /* Reads battery RAM hrtz50or60: 0 = NTSC, 1 = PAL. 
*/ extern uint16_t iigsReadHzParam(void); static uint16_t gFrameHz = 60u; // hal.c is the single TU that calls into joeyDraw.asm. Cross- // platform draw.c / tile.c / etc. dispatch through halFast* // functions defined here; they never reference the asm symbols // directly. This avoids the cumulative ORCA-Linker-Expression- // too-complex-in-13/SysLib failure that hit when each cross- // platform TU brought its own asm extern. JOEYLIB_SEGMENT("DRAWPRIMS") // 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen. extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord); // Full-fill asm helper (partial leading byte + middle MVN + partial // trailing byte). Called by halFastFillRect below. extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint16_t nibble); // 16 STA abs,X stores at fixed offsets along a 160-byte stride. // ~120 cyc per call. extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord); // Tile copy / paste / snap inner loops. All take 4-byte large- // model pointers; bank may differ between dst and src (heap // surface vs stage). Stride contracts: // tileCopyInner / tileCopyMaskedInner: dst 160, src 160 // tilePasteInner: dst 160, src 4 // tileSnapInner: dst 4, src 160 extern void iigsTileCopyInner(uint8_t *dstRow0, const uint8_t *srcRow0); extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent); extern void iigsTilePasteInner(uint8_t *dstRow0, const uint8_t *srcTilePixels); extern void iigsTileSnapInner(uint8_t *dstTilePixels, const uint8_t *srcRow0); // Single-pixel and Bresenham line plot. drawLine inner takes // pre-clipped endpoints (caller validates against surface bounds); // it does no per-pixel clipping in the loop. 
// Asm entry points for single-pixel plot and line plot. Coordinates
// are expected to be on-surface already; the line routine does no
// per-pixel clipping.
extern void iigsDrawPixelInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble);
extern void iigsDrawLineInner(uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble);

// Bresenham midpoint circle outline. Caller has verified the entire
// bbox is on-surface so no per-pixel clip.
extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble);

// Stage-to-SHR full upload: pixels (MVN $01->$E1), SCB, palette.
// Asm uses post-MVN DBR=$E1 to do sta abs,Y for SCB/palette.
// Replaces ORCA-C's memcpy path which silently fails when called
// from halPresent (DBR-state quirk after prior asm primitives).
// The pointer parameters are non-const in this prototype even though
// halPresent hands in data from a const SurfaceT.
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);

// floodFill walk results: written by iigsFloodWalkAndScansInner,
// read back by halFastFloodWalkAndScans.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;

// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque
// (always copy); else pixels with src nibble == (transparent & $0F)
// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW).
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent);

// Single-call per-popped-seed worker: seed test + walk-left + walk-right
// + scan-above + scan-below + push, all sharing cached row addr and
// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX.
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp);

// One-shot init for the y*160 lookup table (gRowOffsetLut, 400 bytes
// in DRAWPRIMS data). Called once from halInit. After this returns,
// every asm primitive that needs row offset can do `lda >lut,x` instead
// of the 7-instruction shift-add.
extern void iigsInitRowLut(void);

// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial-
// screen presents (halPresentRect). srcOffset is the byte offset
// within bank $01 of the FIRST byte to copy on the FIRST row;
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
// ORCA-C memcpy's ~30 cyc/byte.
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);

// PEI-slam variant of the per-row rect blit. ~3 cyc/byte vs MVN's
// ~9 cyc/byte. Constraints: copyBytes must be even and 2..80
// (caller / dispatcher checks). For sprite-rect presents (typical
// 8 bytes wide x 16 rows) saves ~600 cyc/frame vs the MVN form.
extern void iigsBlitRectStageToShrPEI(uint16_t srcOffset, uint16_t copyBytes, uint16_t rowsLeft);

// Filled circle, scanline-style. fillWord low byte is the doubled
// nibble (e.g., 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord);

// ----- Hardware addresses (24-bit / long pointers) -----

// Soft-switch registers. Declared volatile so every access in C is a
// real bus access.
#define IIGS_NEWVIDEO_REG ((volatile uint8_t *)0x00C029L)
#define IIGS_BORDER_REG ((volatile uint8_t *)0x00C034L)
#define IIGS_SHADOW_REG ((volatile uint8_t *)0x00C035L)
#define IIGS_VBL_STATUS ((volatile uint8_t *)0x00C019L)

// Display-side SHR memory in bank $E1 (pixels, SCBs, palettes), at
// the offsets listed in the memory map at the top of the file.
#define IIGS_SHR_PIXELS ((uint8_t *)0xE12000L)
#define IIGS_SHR_SCB ((uint8_t *)0xE19D00L)
#define IIGS_SHR_PALETTE ((uint16_t *)0xE19E00L)

// The stage lives at $01/2000 -- the same offset as the SHR display
// framebuffer at $E1/2000, but in the fast (2.8 MHz) bank. With SHR
// shadow inhibited at $C035, writes here are NOT auto-mirrored to
// $E1, so drawing is full-speed and isolated from the displayed
// frame until the next stagePresent.
#define IIGS_STAGE_PIXELS ((uint8_t *)0x012000L) #define VBL_BAR_BIT 0x80 // NEWVIDEO bit masks #define NEWVIDEO_SHR_ON 0x80 #define NEWVIDEO_LINEARIZE 0x40 // Bit 0 is documented as reserved-must-be-1 in the IIgs Hardware // Reference for forward compatibility. Real silicon doesn't care, but // GSplus halts on writes that leave it clear (see moremem.c c029 // handler) and bumps its "Code: RED" status. Always include this bit. #define NEWVIDEO_RESERVED_BIT 0x01 // $C035 SHADOW register: bit set = shadow INHIBITED for that range. // Bit 1 = hi-res page 1 ($02000-$03FFF in bank $01) // Bit 2 = hi-res page 2 ($04000-$05FFF in bank $01) // Bit 3 = SHR ($02000-$09FFF in bank $01) // We set 1+2+3 because the SHR pixel range overlaps both hi-res // pages; leaving any of those shadows live would silently mirror // part of the stage to $E1. #define SHADOW_INHIBIT_SHR_MASK 0x0E // $C034 BORDER register: high nibble = beep/IRQ enables (preserve), // low nibble = border color index 0..15. Color 0 is the all-zero // palette entry by IIgs convention; we force the low nibble to 0 // in halInit so the visible bezel matches the cleared SHR background. #define BORDER_COLOR_MASK 0xF0 // ----- Module state ----- static uint8_t gPreviousNewVideo = 0; static uint8_t gPreviousBorder = 0; static uint8_t gPreviousShadow = 0; static bool gModeSet = false; // SCB / palette upload skipping is now driven by gStageScbDirty / // gStagePaletteDirty (core/surface.c). The old per-frame memcmp- // against-cached-copy approach was costing ~7 ms / frame on ORCA-C. // PEI slam scratch. File-scope non-static so the asm can `ext` them; // all accesses inside the slam use long-mode `>` addressing so they // bypass the //e RAMRD redirect the slam turns on for its duration. 
volatile uint16_t gPeiOrigSp; volatile uint8_t gPeiOrigShadow; volatile uint16_t gPeiTempRowBase; volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked) volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage) // peislam.asm's per-row peiSlamFullRow helper is no longer wired in; // the present pipeline now does its own PEI-slam loop inside // iigsBlitStageToShr above (with dirty-row skip). // Upload SCB / palette into bank-$E1 SHR memory only when the // matching dirty flag is set. Replaces a per-frame 712-byte memcmp // pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check. // gStageScbDirty / gStagePaletteDirty live in core/surface.c; they // start true (forces the very first present to upload), get set true // again whenever scbSet* / paletteSet mutate the stage's data, and // get cleared here after upload. static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) { if (gStageScbDirty) { memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT); gStageScbDirty = false; } if (gStagePaletteDirty) { memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette)); gStagePaletteDirty = false; } } // ----- HAL API (alphabetical) ----- bool halInit(const JoeyConfigT *config) { (void)config; gPreviousNewVideo = *IIGS_NEWVIDEO_REG; gPreviousBorder = *IIGS_BORDER_REG; gPreviousShadow = *IIGS_SHADOW_REG; *IIGS_NEWVIDEO_REG = (uint8_t)(NEWVIDEO_SHR_ON | NEWVIDEO_LINEARIZE | NEWVIDEO_RESERVED_BIT); *IIGS_BORDER_REG = (uint8_t)(gPreviousBorder & BORDER_COLOR_MASK); // Inhibit shadowing of the stage region. Without this, every // write to $01/2000-9FFF mirrors to $E1 and the off-screen-buffer // illusion breaks (the user would see drawing in progress). *IIGS_SHADOW_REG = (uint8_t)(gPreviousShadow | SHADOW_INHIBIT_SHR_MASK); // SCB and palette are uploaded by halPresent's iigsBlitStageToShr // (asm path, MVN to bank $E1). 
C-side memset/memcpy to bank $E1 // is unreliable from halInit's calling context, so we don't try // it here -- the first present will set up SCB to 320 mode. iigsInitRowLut(); gFrameHz = (iigsReadHzParam() == 1u) ? 50u : 60u; gModeSet = true; return true; } const char *halLastError(void) { return NULL; } void halPresent(const SurfaceT *src) { if (src == NULL) { return; } // iigsBlitStageToShr does pixels (MVN $01->$E1) + SCB + palette // upload entirely in asm via DBR=$E1 + sta abs,Y indexed stores. // ORCA-C's C-side memcpy to bank $E1 has been unreliable from // halPresent's calling context, so we route everything through // the asm path. Future: re-introduce per-row dirty-band logic // for partial-screen updates (currently we always blit 32K). iigsBlitStageToShr(src->scb, &src->palette[0][0]); } void halShutdown(void) { if (gModeSet) { *IIGS_NEWVIDEO_REG = gPreviousNewVideo; *IIGS_BORDER_REG = gPreviousBorder; *IIGS_SHADOW_REG = gPreviousShadow; gModeSet = false; } } // halFastSurfaceClear / halFastDrawLine / halFastDrawCircle / // halFastFillCircle / halFastTileCopy / halFastTileCopyMasked / // halFastTilePaste / halFastTileSnap / halFastTileFill / // halFastBlitRect / halFastFloodWalk[AndScans] / // halFastFloodScanRow / halFastFloodScanAndPush all dispatch via // macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block). // Only halFastFillRect remains a real function below because its // partial-byte (nibble-edge) handling is too gnarly for a macro. // halFastFillRect: macro-dispatched in core/hal.h, same as the other // halFast* primitives. The C wrapper that used to live here was kept // as load-bearing _ROOT mass to defeat ORCA-Linker bank fragility; // since the CORESYS migration drained _ROOT, the macro form is safe. uint8_t *halStageAllocPixels(void) { return IIGS_STAGE_PIXELS; } void halStageFreePixels(uint8_t *pixels) { (void)pixels; // Backing memory is hardware-pinned; nothing to free. } // IIgs is chunky-native: portData is unused. 
// The chunky `pixels` buffer at $01:2000 is the stage's pixel storage
// and the source for stagePresent's PEI-slam to $E1.

// No per-surface port data on this platform; always returns NULL.
void *halSurfaceAllocPortData(SurfaceT *s, bool isStage) {
    (void)s;
    (void)isStage;
    return NULL;
}


// Counterpart of halSurfaceAllocPortData; nothing to release.
void halSurfaceFreePortData(SurfaceT *s, bool isStage, void *portData) {
    (void)s;
    (void)isStage;
    (void)portData;
}


// IIgs SHR is chunky-native; no bitplanes to update. Every *Planes
// hook below is therefore an intentional no-op that only silences
// unused-parameter warnings.
void halFillRectPlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    (void)s; (void)x; (void)y; (void)w; (void)h; (void)colorIndex;
}


void halSurfaceCopyPlanes(SurfaceT *dst, const SurfaceT *src) {
    (void)dst; (void)src;
}


void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
    (void)s; (void)bx; (void)by; (void)colorIndex;
}


void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) {
    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy;
}


void halTileCopyMaskedPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) {
    (void)dst; (void)dstBx; (void)dstBy; (void)src; (void)srcBx; (void)srcBy; (void)transparentIndex;
}


void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
    (void)dst; (void)bx; (void)by; (void)chunkyTile;
}


void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
    (void)src; (void)bx; (void)by; (void)chunkyTileOut;
}


void halSpriteDrawPlanes(SurfaceT *s, const SpriteT *sp, int16_t x, int16_t y) {
    (void)s; (void)sp; (void)x; (void)y;
}


void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBytes, int16_t srcX0, int16_t srcY0, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) {
    (void)dst; (void)x; (void)y; (void)srcBytes; (void)srcX0; (void)srcY0; (void)copyW; (void)copyH; (void)srcRowBytes; (void)transparent;
}


void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes) {
    (void)s; (void)x; (void)y; (void)w; (void)h; (void)dstPlaneBytes;
}


void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes) {
    (void)s; (void)x; (void)y; (void)w; (void)h; (void)srcPlaneBytes;
}


/* Phase 9 chunky reader hooks: IIgs reads from s->pixels just like
 * the legacy paths did. Same logic as the DOS port. */

/* Return the 4-bit colour index at (x, y). Two pixels share a byte:
 * even x is the high nibble, odd x is the low nibble. No bounds or
 * NULL checks are performed here; x and y are cast to uint16_t before
 * indexing, so the caller must pass on-surface, non-negative
 * coordinates. */
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
    uint8_t byte = s->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)];
    if (x & 1) return (uint8_t)(byte & 0x0Fu);
    return (uint8_t)((byte & 0xF0u) >> 4);
}


/* Fold the whole surface into a 32-bit value via SURFACE_HASH_MIX_BYTE
 * (defined elsewhere), which updates the two 16-bit halves lo / hi.
 * Bytes are fed in this order: all pixel bytes, then SURFACE_HEIGHT
 * SCB bytes, then each of SURFACE_PALETTE_ENTRIES palette words as
 * high byte followed by low byte. Returns (hi << 16) | lo. */
uint32_t halSurfaceHash(const SurfaceT *s) {
    uint16_t lo = 0xACE1u, hi = 0x1357u, blocks, n, v;
    const uint8_t *p;
    const uint16_t *w;
    uint8_t b;

    /* Pixels: eight bytes per iteration, unrolled by hand. The
     * do/while runs at least once and consumes exactly
     * (SURFACE_PIXELS_SIZE / 8) * 8 bytes.
     * NOTE(review): assumes SURFACE_PIXELS_SIZE is a non-zero multiple
     * of 8 (32000 per the memory map in the file header) -- confirm. */
    p = s->pixels;
    blocks = (uint16_t)(SURFACE_PIXELS_SIZE / 8);
    do {
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = *p++; SURFACE_HASH_MIX_BYTE(lo, hi, b);
        blocks--;
    } while (blocks > 0u);

    /* Scanline control bytes, one per row. */
    p = s->scb;
    for (n = 0; n < (uint16_t)SURFACE_HEIGHT; n++) {
        b = *p++;
        SURFACE_HASH_MIX_BYTE(lo, hi, b);
    }

    /* Palette words, split explicitly so the byte order fed to the
     * mixer does not depend on how the word is laid out in memory. */
    w = &s->palette[0][0];
    for (n = 0; n < (uint16_t)SURFACE_PALETTE_ENTRIES; n++) {
        v = *w++;
        b = (uint8_t)((v >> 8) & 0xFFu);
        SURFACE_HASH_MIX_BYTE(lo, hi, b);
        b = (uint8_t)(v & 0xFFu);
        SURFACE_HASH_MIX_BYTE(lo, hi, b);
    }
    return ((uint32_t)hi << 16) | (uint32_t)lo;
}


/* Copy pixel data only (SURFACE_PIXELS_SIZE bytes); SCB and palette
 * are not touched here. Uses memcpy, so dst and src pixel buffers
 * must not overlap. */
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
    memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE);
}


/* Read SURFACE_PIXELS_SIZE raw pixel bytes from fp into dst->pixels.
 * Returns true only if the full amount was read; on a short read the
 * buffer may be partially overwritten. */
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
}


/* Write SURFACE_PIXELS_SIZE raw pixel bytes from src->pixels to fp.
 * Returns true only if the full amount was written. */
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
}


/* Allocate a zero-filled pixel buffer for a non-stage surface.
 * Returns whatever calloc returns, i.e. NULL on allocation failure;
 * the caller must check. Release with halSurfaceFreePixels. */
uint8_t *halSurfaceAllocPixels(void) {
    return (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE);
}


/* Release a buffer obtained from halSurfaceAllocPixels. */
void halSurfaceFreePixels(uint8_t *pixels) {
    free(pixels);
}


/* No bitplanes on this port; always returns NULL. */
uint8_t *halSurfacePlanePtr(const SurfaceT *s, uint8_t planeIdx) {
    (void)s;
    (void)planeIdx;
    return NULL;
}


// Wait for one full transition cycle of bit 7 of $C019: first spin
// while the bit is clear, then spin while it is set, and return at
// the moment it clears again. Each call therefore blocks for at most
// about one video field (60 Hz on NTSC, 50 Hz on PAL).
//
// The original comment here states that bit 7 = 0 during vertical
// blank and 1 during active scan, making the return point the start
// of VBL.
// NOTE(review): that is the Apple IIe polarity; the IIgs is commonly
// documented as the reverse (bit 7 = 1 during VBL), in which case
// this returns at the END of vertical blank instead. Confirm against
// the IIgs Hardware Reference and swap the two loops if so.
//
// This is a busy-wait with no timeout.
void halWaitVBL(void) {
    while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0) {
        /* spin while bit 7 is clear */;
    }
    while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) != 0) {
        /* spin while bit 7 is set */;
    }
}


// Frame counter: returns iigsGetTickWord(), which per its declaration
// comment is the low 16 bits of the system tick counter obtained via
// the Misc Toolset GetTick call. The value wraps at 65536, so callers
// should compare counts with unsigned 16-bit subtraction.
//
// Historical note: an earlier version counted frames here by polling
// $C019 and bumping a gFrameCount variable. That variable no longer
// exists in this file. The lesson recorded with it is still worth
// keeping: ORCA-C lowers `var++` on a global to `inc |var` (absolute
// addressing; the 65816 has no INC long-abs), and with this file in
// the DRAWPRIMS load segment but called from CORESYS via JSL, DBR
// does not point at DRAWPRIMS's data bank, so such an increment
// silently mutates the wrong byte. Any future read-modify-write of a
// global in this segment needs long-mode (DBR-independent)
// addressing.
uint16_t halFrameCount(void) {
    return iigsGetTickWord();
}


// Field rate latched by halInit: 50 when iigsReadHzParam() reported
// PAL, otherwise 60. Returns the default 60 if halInit has not run.
uint16_t halFrameHz(void) {
    return gFrameHz;
}