// Apple IIgs HAL: enable SHR, write pixels / SCBs / palettes into the // $E1 bank at the stock addresses the shifter reads from. // // Memory map in bank $E1: // $2000 - $9CFF pixel data (32000 bytes, 160 bytes per scanline) // $9D00 - $9DC7 SCB bytes (200 used) // $9E00 - $9FFF 16 palettes x 16 colors x 2 bytes, $0RGB // // NEWVIDEO register at $00C029 controls SHR enable. Bit 7 turns SHR on. // ORCA/C must be built with 32-bit pointer mode (-w or equivalent) so // that the long addresses resolve to bank $E1. // // DIRTY-WALK + PEI-SLAM PRESENT // ----------------------------- // halPresent walks the per-row dirty bands maintained by drawing // primitives in src/core/*.c. Fully-dirty rows go through the PEI // slam in src/port/iigs/peislam.asm (~530 cyc/row, ~55% faster than // memcpy/MVN); partial-dirty rows use memcpy, which ORCA-C lowers // to MVN (7 cyc/byte) -- the fastest 65816 way to move bytes into // bank $E1 when the dirty band is too narrow to amortize the slam's // per-call AUXWRITE/RAMRD/shadow toggle. // // peislam.asm declares its load segment as DRAWPRIMS so the linker // places it in its own bank, separate from AUDIO's _ROOT (where // audio_full.c + Memory Manager + stdio + NTPstreamsound already // crowd up against the 64 KB-per-bank limit). #include #include #include "joey/debug.h" #include "hal.h" #include "surfaceInternal.h" // hal.c is the single TU that calls into joeyDraw.asm. Cross- // platform draw.c / tile.c / etc. dispatch through halFast* // functions defined here; they never reference the asm symbols // directly. This avoids the cumulative ORCA-Linker-Expression- // too-complex-in-13/SysLib failure that hit when each cross- // platform TU brought its own asm extern. JOEYLIB_SEGMENT("DRAWPRIMS") // 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen. extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord); // Full-fill asm helper (partial leading byte + middle MVN + partial // trailing byte). 
// Called by halFastFillRect below, which passes the stage pixel base,
// the rectangle as unsigned 16-bit values, and the color index masked
// to its low nibble.
extern void iigsFillRectInner(uint8_t *pixels, uint16_t x, uint16_t y,
                              uint16_t w, uint16_t h, uint16_t nibble);

// 16 STA abs,X stores at fixed offsets along a 160-byte stride.
// ~120 cyc per call.
extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord);

// Tile copy / paste / snap inner loops. All take 4-byte large-
// model pointers; bank may differ between dst and src (heap
// surface vs stage). Stride contracts:
//   tileCopyInner / tileCopyMaskedInner: dst 160, src 160
//   tilePasteInner:                      dst 160, src 4
//   tileSnapInner:                       dst 4,   src 160
extern void iigsTileCopyInner(uint8_t *dstRow0, const uint8_t *srcRow0);
extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0,
                                    uint16_t transparent);
extern void iigsTilePasteInner(uint8_t *dstRow0, const uint8_t *srcTilePixels);
extern void iigsTileSnapInner(uint8_t *dstTilePixels, const uint8_t *srcRow0);

// Single-pixel and Bresenham line plot. drawLine inner takes
// pre-clipped endpoints (caller validates against surface bounds);
// it does no per-pixel clipping in the loop.
extern void iigsDrawPixelInner(uint8_t *pixels, uint16_t x, uint16_t y,
                               uint16_t nibble);
extern void iigsDrawLineInner(uint8_t *pixels, uint16_t x0, uint16_t y0,
                              uint16_t x1, uint16_t y1, uint16_t nibble);

// Bresenham midpoint circle outline. Caller has verified the entire
// bbox is on-surface so no per-pixel clip.
extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy,
                                uint16_t r, uint16_t nibble);

// Stage-to-SHR full upload: pixels (MVN $01->$E1), SCB, palette.
// Asm uses post-MVN DBR=$E1 to do sta abs,Y for SCB/palette.
// Replaces ORCA-C's memcpy path which silently fails when called
// from halPresent (DBR-state quirk after prior asm primitives).
// NOTE(review): a later comment in this file (next to the PEI slam
// scratch variables) says this routine now runs its own PEI-slam loop
// with dirty-row skip; the MVN description above may be stale --
// confirm against the asm source.
extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr);

// floodFill walk results: written by iigsFloodWalkAndScansInner,
// read back by halFastFloodWalkAndScans.
extern uint16_t gFloodSeedMatch;
extern uint16_t gFloodLeftX;
extern uint16_t gFloodRightX;

// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque
// (always copy); else pixels with src nibble == (transparent & $0F)
// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW).
extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX,
                              const uint8_t *srcRow0, uint16_t srcX,
                              uint16_t copyW, uint16_t copyH,
                              uint16_t srcRowBytes, uint16_t transparent);

// Single-call per-popped-seed worker: seed test + walk-left + walk-right
// + scan-above + scan-below + push, all sharing cached row addr and
// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX.
extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y,
                                       uint16_t matchColor, uint16_t newColor,
                                       uint16_t matchEqual,
                                       int16_t *stackX, int16_t *stackY,
                                       uint16_t *spInOut, uint16_t maxSp);

// One-shot init for the y*160 lookup table (gRowOffsetLut, 400 bytes
// in DRAWPRIMS data). Called once from halInit. After this returns,
// every asm primitive that needs row offset can do `lda >lut,x` instead
// of the 7-instruction shift-add.
extern void iigsInitRowLut(void);

// Per-row MVN blit from $01:srcOffset to $E1:srcOffset for partial-
// screen presents (halPresentRect). srcOffset is the byte offset
// within bank $01 of the FIRST byte to copy on the FIRST row;
// subsequent rows are at srcOffset + 160, etc. ~9 cyc/byte vs
// ORCA-C memcpy's ~30 cyc/byte.
//   copyBytes: bytes copied per row.
//   rowsLeft:  number of rows to copy (halPresentRect passes the
//              rectangle height).
extern void iigsBlitRectStageToShr(uint16_t srcOffset, uint16_t copyBytes,
                                   uint16_t rowsLeft);

// Filled circle, scanline-style. fillWord low byte is the doubled
// nibble (e.g., 0x33 for nibble 3).
extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy,
                                uint16_t r, uint16_t fillWord);

// ----- Hardware addresses (24-bit / long pointers) -----

// Soft switches are volatile: every read/write must reach the hardware.
#define IIGS_NEWVIDEO_REG ((volatile uint8_t *)0x00C029L)
#define IIGS_BORDER_REG   ((volatile uint8_t *)0x00C034L)
#define IIGS_SHADOW_REG   ((volatile uint8_t *)0x00C035L)
#define IIGS_VBL_STATUS   ((volatile uint8_t *)0x00C019L)

// SHR display memory in bank $E1 (see the memory map at the top of
// this file).
#define IIGS_SHR_PIXELS   ((uint8_t  *)0xE12000L)
#define IIGS_SHR_SCB      ((uint8_t  *)0xE19D00L)
#define IIGS_SHR_PALETTE  ((uint16_t *)0xE19E00L)

// The stage lives at $01/2000 -- the same offset as the SHR display
// framebuffer at $E1/2000, but in the fast (2.8 MHz) bank. With SHR
// shadow inhibited at $C035, writes here are NOT auto-mirrored to
// $E1, so drawing is full-speed and isolated from the displayed
// frame until the next stagePresent.
#define IIGS_STAGE_PIXELS ((uint8_t *)0x012000L)

// Bit 7 of IIGS_VBL_STATUS; polled by halWaitVBL.
#define VBL_BAR_BIT 0x80

// NEWVIDEO bit masks
#define NEWVIDEO_SHR_ON    0x80
#define NEWVIDEO_LINEARIZE 0x40
// Bit 0 is documented as reserved-must-be-1 in the IIgs Hardware
// Reference for forward compatibility. Real silicon doesn't care, but
// GSplus halts on writes that leave it clear (see moremem.c c029
// handler) and bumps its "Code: RED" status. Always include this bit.
#define NEWVIDEO_RESERVED_BIT 0x01

// $C035 SHADOW register: bit set = shadow INHIBITED for that range.
//   Bit 1 = hi-res page 1 ($02000-$03FFF in bank $01)
//   Bit 2 = hi-res page 2 ($04000-$05FFF in bank $01)
//   Bit 3 = SHR           ($02000-$09FFF in bank $01)
// We set 1+2+3 because the SHR pixel range overlaps both hi-res
// pages; leaving any of those shadows live would silently mirror
// part of the stage to $E1.
#define SHADOW_INHIBIT_SHR_MASK 0x0E

// $C034 BORDER register: high nibble = beep/IRQ enables (preserve),
// low nibble = border color index 0..15. Color 0 is the all-zero
// palette entry by IIgs convention; we force the low nibble to 0
// in halInit so the visible bezel matches the cleared SHR background.
#define BORDER_COLOR_MASK 0xF0 // ----- Module state ----- static uint8_t gPreviousNewVideo = 0; static uint8_t gPreviousBorder = 0; static uint8_t gPreviousShadow = 0; static bool gModeSet = false; // SCB / palette upload skipping is now driven by gStageScbDirty / // gStagePaletteDirty (core/surface.c). The old per-frame memcmp- // against-cached-copy approach was costing ~7 ms / frame on ORCA-C. // PEI slam scratch. File-scope non-static so the asm can `ext` them; // all accesses inside the slam use long-mode `>` addressing so they // bypass the //e RAMRD redirect the slam turns on for its duration. volatile uint16_t gPeiOrigSp; volatile uint8_t gPeiOrigShadow; volatile uint16_t gPeiTempRowBase; volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked) volatile uint16_t gPeiChunkRow; // in-chunk row counter saved across slam (Y reg storage) // peislam.asm's per-row peiSlamFullRow helper is no longer wired in; // the present pipeline now does its own PEI-slam loop inside // iigsBlitStageToShr above (with dirty-row skip). // Upload SCB / palette into bank-$E1 SHR memory only when the // matching dirty flag is set. Replaces a per-frame 712-byte memcmp // pair (~7 ms / frame on ORCA-C with -b) with a 2-cyc flag check. // gStageScbDirty / gStagePaletteDirty live in core/surface.c; they // start true (forces the very first present to upload), get set true // again whenever scbSet* / paletteSet mutate the stage's data, and // get cleared here after upload. 
static void uploadScbAndPaletteIfNeeded(const SurfaceT *src) { if (gStageScbDirty) { memcpy(IIGS_SHR_SCB, src->scb, SURFACE_HEIGHT); gStageScbDirty = false; } if (gStagePaletteDirty) { memcpy(IIGS_SHR_PALETTE, src->palette, sizeof(src->palette)); gStagePaletteDirty = false; } } // ----- HAL API (alphabetical) ----- bool halInit(const JoeyConfigT *config) { (void)config; gPreviousNewVideo = *IIGS_NEWVIDEO_REG; gPreviousBorder = *IIGS_BORDER_REG; gPreviousShadow = *IIGS_SHADOW_REG; *IIGS_NEWVIDEO_REG = (uint8_t)(NEWVIDEO_SHR_ON | NEWVIDEO_LINEARIZE | NEWVIDEO_RESERVED_BIT); *IIGS_BORDER_REG = (uint8_t)(gPreviousBorder & BORDER_COLOR_MASK); // Inhibit shadowing of the stage region. Without this, every // write to $01/2000-9FFF mirrors to $E1 and the off-screen-buffer // illusion breaks (the user would see drawing in progress). *IIGS_SHADOW_REG = (uint8_t)(gPreviousShadow | SHADOW_INHIBIT_SHR_MASK); // SCB and palette are uploaded by halPresent's iigsBlitStageToShr // (asm path, MVN to bank $E1). C-side memset/memcpy to bank $E1 // is unreliable from halInit's calling context, so we don't try // it here -- the first present will set up SCB to 320 mode. iigsInitRowLut(); gModeSet = true; return true; } const char *halLastError(void) { return NULL; } void halPresent(const SurfaceT *src) { if (src == NULL) { return; } // iigsBlitStageToShr does pixels (MVN $01->$E1) + SCB + palette // upload entirely in asm via DBR=$E1 + sta abs,Y indexed stores. // ORCA-C's C-side memcpy to bank $E1 has been unreliable from // halPresent's calling context, so we route everything through // the asm path. Future: re-introduce per-row dirty-band logic // for partial-screen updates (currently we always blit 32K). 
iigsBlitStageToShr(src->scb, &src->palette[0][0]); } void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { uint16_t copyBytes; int16_t byteStart; uint16_t srcOffset; if (src == NULL) { return; } uploadScbAndPaletteIfNeeded(src); // Pixel copy: byte-aligned runs per scanline. x is always even // after API-level clipping for 4bpp packed if caller aligned it; // otherwise we include the byte containing the leftmost pixel. byteStart = x >> 1; copyBytes = (uint16_t)(((x + (int16_t)w + 1) >> 1) - byteStart); if (copyBytes == 0 || h == 0) { return; } // Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display // at $E1:2000 (same offset within their banks). srcOffset is the // byte offset of the first byte to copy on the first row. srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart); iigsBlitRectStageToShr(srcOffset, copyBytes, h); } void halShutdown(void) { if (gModeSet) { *IIGS_NEWVIDEO_REG = gPreviousNewVideo; *IIGS_BORDER_REG = gPreviousBorder; *IIGS_SHADOW_REG = gPreviousShadow; gModeSet = false; } } // halFastSurfaceClear / halFastDrawLine / halFastDrawCircle / // halFastFillCircle / halFastTileCopy / halFastTileCopyMasked / // halFastTilePaste / halFastTileSnap / halFastTileFill / // halFastBlitRect / halFastFloodWalk[AndScans] / // halFastFloodScanRow / halFastFloodScanAndPush all dispatch via // macros in core/hal.h on IIgs (#ifdef JOEYLIB_PLATFORM_IIGS block). // Only halFastFillRect remains a real function below because its // partial-byte (nibble-edge) handling is too gnarly for a macro. // halFastFillRect: thin wrapper around iigsFillRectInner. The asm // helper now handles the partial-byte (nibble-edge) logic that used // to live here, so this function is just a stage-check + forward. 
// (It's not macro-dispatched like the others because removing it // from the C side triggers an unrelated ORCA-linker bank-placement // failure -- the binary needs enough mass in _ROOT to keep sprite // codegen's static symbols at addresses the linker can resolve.) bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { if (s == NULL || s != stageGet()) { return false; } iigsFillRectInner(s->pixels, (uint16_t)x, (uint16_t)y, (uint16_t)w, (uint16_t)h, (uint16_t)(colorIndex & 0x0F)); return true; } uint8_t *halStageAllocPixels(void) { return IIGS_STAGE_PIXELS; } void halStageFreePixels(uint8_t *pixels) { (void)pixels; // Backing memory is hardware-pinned; nothing to free. } // $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active // scan. To produce a rising-edge wait (one VBL per call), first spin // while VBL is currently active (bit 7 = 0), then spin until VBL // fires again (bit 7 returns to 0). The IIgs SHR refresh is 60 Hz. void halWaitVBL(void) { while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) == 0) { /* already in VBL: wait for active scan */; } while ((*IIGS_VBL_STATUS & VBL_BAR_BIT) != 0) { /* scanning: wait for next VBL */; } }