From 20cbccaca5f49ca91e5b5602e50ac279ac27234e Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Thu, 30 Apr 2026 17:41:23 -0500 Subject: [PATCH] More speed! --- make/iigs.mk | 13 +++++++++++- src/codegen/spriteCompile.c | 13 ++++++------ src/codegen/spriteEmitIigs.c | 40 ++++++++++++++++++++---------------- src/core/asset.c | 3 +++ src/core/audio.c | 3 +++ src/core/codegenArena.c | 3 +++ src/core/debug.c | 4 ++++ src/core/draw.c | 22 ++++++++++---------- src/core/hal.h | 22 ++++++++++++-------- src/core/init.c | 3 +++ src/core/input.c | 3 +++ src/core/palette.c | 3 +++ src/core/present.c | 3 +++ src/core/scb.c | 3 +++ src/core/sprite.c | 3 +++ src/core/surface.c | 7 +++++++ src/core/surfaceInternal.h | 13 ++++++++++++ src/core/tile.c | 14 ++++++------- src/port/iigs/hal.c | 23 +++++---------------- src/port/iigs/input.c | 3 +++ 20 files changed, 130 insertions(+), 71 deletions(-) diff --git a/make/iigs.mk b/make/iigs.mk index 1bd8e21..eb107a6 100644 --- a/make/iigs.mk +++ b/make/iigs.mk @@ -49,7 +49,18 @@ NTP_BIN := $(BUILD)/audio/ntpplayer.bin NTP_ASM := $(BUILD)/audio/ntpdata.asm IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32 -LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS) +# IMPORTANT: CODEGEN_SRCS (specifically spriteEmitIigs.c) MUST be the +# first entry after the main object in the link order. ORCA-Linker's +# bank assignment is order-sensitive: when spriteEmitIigs.c lands at +# any later position, the linker assigns SPRITECG to a bank where its +# intra-OMF-segment static-symbol relocations (emitMvnCopyRoutine, +# shiftedByteAt, writeLE16) can't be encoded -- you get cryptic +# "Addressing error" / "Unresolved reference Label: ..." failures +# whose root cause is bank packing, not source. Putting CODEGEN_SRCS +# first gives SPRITECG prime placement and the relocations resolve. +# This was the underlying cause of feedback_orca_link_segment_count +# cases 2-5 (we'd been working around it by managing _ROOT mass). +LIB_SRCS := $(CODEGEN_SRCS) $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) HELLO_SRC := $(EXAMPLES)/hello/hello.c HELLO_BIN := $(BINDIR)/HELLO diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c index b8ebf98..74c1c69 100644 --- a/src/codegen/spriteCompile.c +++ b/src/codegen/spriteCompile.c @@ -158,10 +158,9 @@ bool spriteCompile(SpriteT *sp) { #if defined(JOEYLIB_PLATFORM_IIGS) -// y*160 lookup. gRowOffsetLut is the 200-entry uint16_t table built -// once by iigsInitRowLut at halInit. Replaces ORCA-C's runtime -// multiply (a JSL into __mul16) with a single indexed long-mode read. -extern const uint16_t gRowOffsetLut[200]; +// SURFACE_ROW_OFFSET dispatches to the gRowOffsetLut lookup on IIgs; +// declared in surfaceInternal.h. Replaces ORCA-C's __mul16 JSL with a +// single indexed long-mode read. // IIgs uses inline asm + a self-modifying call stub instead of a C // function-pointer cast. The build uses ORCA-C large memory model @@ -212,7 +211,7 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) uint8_t *destPtr; uint8_t destBytes[4]; shift = (uint8_t)(x & 1); - destPtr = &dst->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)x >> 1)]; + destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)]; memcpy(destBytes, &destPtr, 4); destAddr = (uint32_t)destBytes[0] | ((uint32_t)destBytes[1] << 8) @@ -366,7 +365,7 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_ heightPx = (uint16_t)(sp->heightTiles * 8); copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0)); - screenPtr = (uint8_t *)&src->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)clippedX >> 1)]; + screenPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)clippedX >> 1)]; splitPointer(screenPtr, &screenLo, &screenBank); splitPointer(backup->bytes, &backupLo, &backupBank); @@ -450,7 +449,7 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) { spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4); shift = (copyBytes == spriteBytesPerRow) ? 0 : 1; - screenPtr = (uint8_t *)&dst->pixels[gRowOffsetLut[(uint16_t)backup->y] + ((uint16_t)backup->x >> 1)]; + screenPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)]; splitPointer(screenPtr, &screenLo, &screenBank); splitPointer(backup->bytes, &backupLo, &backupBank); diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c index 01936d5..a5fda1e 100644 --- a/src/codegen/spriteEmitIigs.c +++ b/src/codegen/spriteEmitIigs.c @@ -57,7 +57,6 @@ JOEYLIB_SEGMENT("SPRITECG") static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t copyBytes, bool advanceX); static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask); static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); -static uint16_t writeLE16(uint8_t *out, uint16_t value); // ----- Emit helpers (alphabetical) ----- @@ -126,13 +125,9 @@ static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col) { } -// 65816 is little-endian; write low byte first. -static uint16_t writeLE16(uint8_t *out, uint16_t value) { - out[0] = (uint8_t)(value & 0xFFu); - out[1] = (uint8_t)((value >> 8) & 0xFFu); - return 2; -} - +// writeLE16 was inlined at every call site. Inlining cuts a JSL/RTL +// per emitted 16-bit immediate (4 instructions per byte * 12 sites) +// and avoids ORCA-Linker bank-fragility around tiny-helper resolution. // Common backbone for save and restore. Both ops copy a byte-aligned // rectangle row-by-row using MVN; only the operand banks (which buffer @@ -178,11 +173,13 @@ static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t cop out[cursor++] = advanceX ? 0x8A : 0x98; // TXA / TYA out[cursor++] = 0x18; // CLC out[cursor++] = 0x69; // ADC #imm (M=16) - cursor += writeLE16(out + cursor, advance); + out[cursor++] = (uint8_t)(advance & 0xFFu); + out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu); out[cursor++] = advanceX ? 0xAA : 0xA8; // TAX / TAY } out[cursor++] = 0xA9; // LDA #imm (M=16) - cursor += writeLE16(out + cursor, (uint16_t)(copyBytes - 1)); + out[cursor++] = (uint8_t)((copyBytes - 1) & 0xFFu); + out[cursor++] = (uint8_t)(((copyBytes - 1) >> 8) & 0xFFu); out[cursor++] = 0x54; // MVN out[cursor++] = 0x00; // dstbk -- patched per call out[cursor++] = 0x00; // srcbk -- patched per call @@ -299,11 +296,15 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { out[cursor++] = 0x20; wide = true; } - out[cursor++] = 0xA9; // LDA #imm16 - cursor += writeLE16(out + cursor, - (uint16_t)(((uint16_t)nextValue << 8) | value)); - out[cursor++] = 0x99; // STA abs,Y - cursor += writeLE16(out + cursor, absOffset); + { + uint16_t pair = (uint16_t)(((uint16_t)nextValue << 8) | value); + out[cursor++] = 0xA9; // LDA #imm16 + out[cursor++] = (uint8_t)(pair & 0xFFu); + out[cursor++] = (uint8_t)((pair >> 8) & 0xFFu); + out[cursor++] = 0x99; // STA abs,Y + out[cursor++] = (uint8_t)(absOffset & 0xFFu); + out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu); + } col++; // consumed col+1 continue; } @@ -321,16 +322,19 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) { out[cursor++] = 0xA9; out[cursor++] = value; out[cursor++] = 0x99; - cursor += writeLE16(out + cursor, absOffset); + out[cursor++] = (uint8_t)(absOffset & 0xFFu); + out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu); } else { out[cursor++] = 0xB9; - cursor += writeLE16(out + cursor, absOffset); + out[cursor++] = (uint8_t)(absOffset & 0xFFu); + out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu); out[cursor++] = 0x29; out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu); out[cursor++] = 0x09; out[cursor++] = value; out[cursor++] = 0x99; - cursor += writeLE16(out + cursor, absOffset); + out[cursor++] = (uint8_t)(absOffset & 0xFFu); + out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu); } } } diff --git a/src/core/asset.c b/src/core/asset.c index 68e3b16..173cd49 100644 --- a/src/core/asset.c +++ b/src/core/asset.c @@ -12,6 +12,9 @@ #include "joey/asset.h" #include "joey/palette.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + #define JAS_HEADER_SIZE 44 #define JAS_PIXELS_OFFSET JAS_HEADER_SIZE #define JAS_PALETTE_OFFSET 12 diff --git a/src/core/audio.c b/src/core/audio.c index 8cd4824..3a39207 100644 --- a/src/core/audio.c +++ b/src/core/audio.c @@ -8,6 +8,9 @@ #include "joey/audio.h" #include "hal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + static bool gAudioReady = false; diff --git a/src/core/codegenArena.c b/src/core/codegenArena.c index 46d3487..4bd3eac 100644 --- a/src/core/codegenArena.c +++ b/src/core/codegenArena.c @@ -23,6 +23,9 @@ #include "codegenArenaInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // ----- Module state ----- diff --git a/src/core/debug.c b/src/core/debug.c index 5c57137..bb85d7d 100644 --- a/src/core/debug.c +++ b/src/core/debug.c @@ -10,8 +10,12 @@ #include #include +#include "joey/platform.h" #include "joey/debug.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + static const char *kLogPath = "joeylog.txt"; diff --git a/src/core/draw.c b/src/core/draw.c index 91c220a..ee12b79 100644 --- a/src/core/draw.c +++ b/src/core/draw.c @@ -113,7 +113,7 @@ static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_ uint8_t *line; for (row = 0; row < h; row++) { - line = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; + line = &s->pixels[SURFACE_ROW_OFFSET(y + row)]; pxStart = x; pxEnd = x + w; @@ -208,7 +208,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 // Fallback path needs row; compute it here so the asm path // above doesn't pay for an unused y*160 multiply on every iter. - row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + row = &s->pixels[SURFACE_ROW_OFFSET(y)]; // Tier-2 asm fast path: combined seed test + walk-left + // walk-right in one cross-segment call. Falls back to the @@ -294,7 +294,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 } scanY = (int16_t)(y + 1); } - scanRow = &s->pixels[scanY * SURFACE_BYTES_PER_ROW]; + scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)]; // Prefer the combined scan+push asm path (one call per // scan, no markBuf and no per-pixel C edge walk). if (!halFastFloodScanAndPush(scanRow, leftX, rightX, @@ -502,7 +502,7 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) { } if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) { - byte = &s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)]; + byte = &s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)]; nibble = colorIndex & 0x0F; if (x & 1) { *byte = (uint8_t)((*byte & 0xF0) | nibble); @@ -625,7 +625,7 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return; } - row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + row = &s->pixels[SURFACE_ROW_OFFSET(y)]; seedColor = srcPixel(row, x); if ((seedColor & 0x0F) == (newColor & 0x0F)) { return; @@ -644,7 +644,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8 if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { return; } - row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + row = &s->pixels[SURFACE_ROW_OFFSET(y)]; pix = srcPixel(row, x); // Starting on a boundary pixel or already-filled pixel: nothing // to do. @@ -668,7 +668,7 @@ uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) { return 0; } - byte = s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)]; + byte = s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)]; if (x & 1) { return (uint8_t)(byte & 0x0F); } @@ -698,12 +698,12 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) { srcRowBytes = (int16_t)((src->width + 1) >> 1); srcRow = &src->pixels[srcY0 * srcRowBytes]; - dstRow = &dst->pixels[y * SURFACE_BYTES_PER_ROW]; + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y)]; if (!halFastBlitRect(dstRow, x, srcRow, srcX0, copyW, copyH, srcRowBytes, 0xFFFFu)) { for (row = 0; row < copyH; row++) { srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; - dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y + row)]; for (col = 0; col < copyW; col++) { nibble = srcPixel(srcRow, srcX0 + col); dstPixel(dstRow, x + col, nibble); @@ -738,12 +738,12 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t transparent = (uint8_t)(transparentIndex & 0x0F); srcRowBytes = (int16_t)((src->width + 1) >> 1); srcRow = &src->pixels[srcY0 * srcRowBytes]; - dstRow = &dst->pixels[y * SURFACE_BYTES_PER_ROW]; + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y)]; if (!halFastBlitRect(dstRow, x, srcRow, srcX0, copyW, copyH, srcRowBytes, (uint16_t)transparent)) { for (row = 0; row < copyH; row++) { srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; - dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y + row)]; for (col = 0; col < copyW; col++) { nibble = srcPixel(srcRow, srcX0 + col); if (nibble == transparent) { diff --git a/src/core/hal.h b/src/core/hal.h index 3a4665d..2726846 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -262,15 +262,19 @@ extern uint16_t gFloodRightX; true) \ : false) -// halFastFillRect stays as a real C wrapper -- removing it triggered -// an unrelated ORCA linker bank-placement failure (same mode as the -// peislam.asm deletion: `Unresolved reference Label: -// emitMvnCopyRoutine` in sprite codegen). The wrapper now just -// forwards to iigsFillRectInner (asm does partial+middle); we lose -// the call-site macro inlining for fillRect specifically but keep -// the rest of the macros AND the new asm helper. Per-call wrapper -// overhead for halFastFillRect is back (~80 cyc) but at least the -// per-row partial-byte logic happens in asm now. +// halFastFillRect: macro form, same shape as the others. Builds +// clean now that _ROOT has been thinned out via the CORESYS load +// segment migration -- previous attempts shrank _ROOT enough to +// retrip the bank-packing fragility, but with most core .c files +// out of _ROOT that's no longer reactive. Saves ~80 cyc/call. +#undef halFastFillRect +#define halFastFillRect(_s, _x, _y, _w, _h, _c) \ + ((_s) == stageGet() \ + ? (iigsFillRectInner((_s)->pixels, (uint16_t)(_x), (uint16_t)(_y), \ + (uint16_t)(_w), (uint16_t)(_h), \ + (uint16_t)((_c) & 0x0F)), \ + true) \ + : false) // Tile primitives operate on caller-computed row pointers; just // forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte diff --git a/src/core/init.c b/src/core/init.c index 26f4c99..fc263ff 100644 --- a/src/core/init.c +++ b/src/core/init.c @@ -12,6 +12,9 @@ #include "hal.h" #include "surfaceInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // 8 KB fits the largest typical sprite working set (~3-4 KB per // 32x32 sprite at all opaque) and keeps malloc requests small enough // for IIgs ORCA-C's small-memory-model heap to satisfy them. diff --git a/src/core/input.c b/src/core/input.c index 8347179..28e65c5 100644 --- a/src/core/input.c +++ b/src/core/input.c @@ -15,6 +15,9 @@ #include "hal.h" #include "inputInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + bool gKeyState [KEY_COUNT]; bool gKeyPrev [KEY_COUNT]; diff --git a/src/core/palette.c b/src/core/palette.c index 812fb94..5a85459 100644 --- a/src/core/palette.c +++ b/src/core/palette.c @@ -10,6 +10,9 @@ #include "joey/palette.h" #include "surfaceInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // ----- Public API (alphabetical) ----- void paletteGet(const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16) { diff --git a/src/core/present.c b/src/core/present.c index 84d561a..02468b2 100644 --- a/src/core/present.c +++ b/src/core/present.c @@ -12,6 +12,9 @@ #include "hal.h" #include "surfaceInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // ----- Public API (alphabetical) ----- void stagePresent(void) { diff --git a/src/core/scb.c b/src/core/scb.c index 59d44dd..6946c79 100644 --- a/src/core/scb.c +++ b/src/core/scb.c @@ -9,6 +9,9 @@ #include "joey/palette.h" #include "surfaceInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // ----- Public API (alphabetical) ----- uint8_t scbGet(const SurfaceT *s, uint16_t line) { diff --git a/src/core/sprite.c b/src/core/sprite.c index e2b6da8..7daf805 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -13,6 +13,9 @@ #include "spriteInternal.h" #include "surfaceInternal.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // 8x8 tiles, 4bpp packed = 4 bytes/row * 8 rows = 32 bytes/tile. #define TILE_BYTES 32 #define TILE_PIXELS 8 diff --git a/src/core/surface.c b/src/core/surface.c index 9035797..013981d 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -10,6 +10,13 @@ #include "hal.h" #include "surfaceInternal.h" +// Hoist into a CORESYS load segment alongside the other small core +// files. Keeps _ROOT thin and stable so it stops reacting to per-file +// source changes -- _ROOT size flux was tripping ORCA-Linker bank +// packing in spriteEmitIigs.c (see feedback_orca_link_segment_count +// cases 2-4). +JOEYLIB_SEGMENT("CORESYS") + #ifdef JOEYLIB_PLATFORM_IIGS extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord); #endif diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h index 0f1d9d7..175ec9f 100644 --- a/src/core/surfaceInternal.h +++ b/src/core/surfaceInternal.h @@ -60,6 +60,19 @@ void surfaceMarkDirtyAll(const SurfaceT *s); // Reset every row to CLEAN. Called by stagePresent after the slam. void stageDirtyClearAll(void); +// y -> byte offset of row y in a SURFACE_BYTES_PER_ROW-strided buffer. +// On IIgs this expands to a single indexed long-mode read against +// gRowOffsetLut (built once at halInit). On other ports it's the +// straight multiply -- those compilers (gcc, OpenWatcom) optimize the +// constant 160 to a shift+add chain that's already cheap. The point +// is to dodge ORCA-C's __mul16 JSL on every per-row pointer compute. +#ifdef JOEYLIB_PLATFORM_IIGS +extern const uint16_t gRowOffsetLut[200]; +#define SURFACE_ROW_OFFSET(_y) ((uint16_t)gRowOffsetLut[(uint16_t)(_y)]) +#else +#define SURFACE_ROW_OFFSET(_y) ((uint16_t)((uint16_t)(_y) * SURFACE_BYTES_PER_ROW)) +#endif + // Allocate and free the library-owned stage (the back-buffer surface // that stagePresent flips to the display). Called from init.c during // joeyInit / joeyShutdown. The stage's pixel storage is supplied by diff --git a/src/core/tile.c b/src/core/tile.c index 87bb36a..ce4ac1f 100644 --- a/src/core/tile.c +++ b/src/core/tile.c @@ -145,8 +145,8 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); - dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)]; - srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)]; + dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; if (!halFastTileCopy(dstRow0, srcRow0)) { copyTileOpaque(dstRow0, srcRow0); @@ -176,8 +176,8 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); - dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)]; - srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)]; + dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)]; if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { copyTileMasked(dstRow0, srcRow0, transparentIndex); @@ -203,7 +203,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); if (!halFastTileFill(s, bx, by, (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { - uint8_t *row = &s->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; uint8_t i; for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) { row[0] = doubled; @@ -233,7 +233,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) { } pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); - dstRow = &dst->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; src = &in->pixels[0]; if (!halFastTilePaste(dstRow, src)) { for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { @@ -265,7 +265,7 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) { } pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); - srcRow = &src->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)]; dst = &out->pixels[0]; if (!halFastTileSnap(dst, srcRow)) { for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index db4bda1..6df76a8 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -253,7 +253,7 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1 // Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display // at $E1:2000 (same offset within their banks). srcOffset is the // byte offset of the first byte to copy on the first row. - srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart); + srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart); iigsBlitRectStageToShr(srcOffset, copyBytes, h); } @@ -278,23 +278,10 @@ void halShutdown(void) { // partial-byte (nibble-edge) handling is too gnarly for a macro. -// halFastFillRect: thin wrapper around iigsFillRectInner. The asm -// helper now handles the partial-byte (nibble-edge) logic that used -// to live here, so this function is just a stage-check + forward. -// (It's not macro-dispatched like the others because removing it -// from the C side triggers an unrelated ORCA-linker bank-placement -// failure -- the binary needs enough mass in _ROOT to keep sprite -// codegen's static symbols at addresses the linker can resolve.) -bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { - if (s == NULL || s != stageGet()) { - return false; - } - iigsFillRectInner(s->pixels, - (uint16_t)x, (uint16_t)y, - (uint16_t)w, (uint16_t)h, - (uint16_t)(colorIndex & 0x0F)); - return true; -} +// halFastFillRect: macro-dispatched in core/hal.h, same as the other +// halFast* primitives. The C wrapper that used to live here was kept +// as load-bearing _ROOT mass to defeat ORCA-Linker bank fragility; +// since the CORESYS migration drained _ROOT, the macro form is safe. uint8_t *halStageAllocPixels(void) { diff --git a/src/port/iigs/input.c b/src/port/iigs/input.c index f4ede3d..337821a 100644 --- a/src/port/iigs/input.c +++ b/src/port/iigs/input.c @@ -37,6 +37,9 @@ #include "inputInternal.h" #include "joey/surface.h" +// CORESYS: hoisted out of _ROOT (see surface.c for rationale). +JOEYLIB_SEGMENT("CORESYS") + // ----- Hardware registers ----- #define IIGS_KBD ((volatile uint8_t *)0x00C000L)