From 20cbccaca5f49ca91e5b5602e50ac279ac27234e Mon Sep 17 00:00:00 2001
From: Scott Duensing <scott@duensing.com>
Date: Thu, 30 Apr 2026 17:41:23 -0500
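Subject: [PATCH] More speed: row-offset LUT, inline writeLE16, CORESYS

Replace per-row y * SURFACE_BYTES_PER_ROW multiplies with the new
SURFACE_ROW_OFFSET macro (a gRowOffsetLut lookup on IIgs), inline
writeLE16 at its call sites in spriteEmitIigs.c, turn halFastFillRect
into a macro, move the small core files into a CORESYS load segment,
and put CODEGEN_SRCS first in the IIgs link order.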

---
 make/iigs.mk                 | 13 +++++++++++-
 src/codegen/spriteCompile.c  | 13 ++++++------
 src/codegen/spriteEmitIigs.c | 40 ++++++++++++++++++++----------------
 src/core/asset.c             |  3 +++
 src/core/audio.c             |  3 +++
 src/core/codegenArena.c      |  3 +++
 src/core/debug.c             |  4 ++++
 src/core/draw.c              | 22 ++++++++++----------
 src/core/hal.h               | 22 ++++++++++++--------
 src/core/init.c              |  3 +++
 src/core/input.c             |  3 +++
 src/core/palette.c           |  3 +++
 src/core/present.c           |  3 +++
 src/core/scb.c               |  3 +++
 src/core/sprite.c            |  3 +++
 src/core/surface.c           |  7 +++++++
 src/core/surfaceInternal.h   | 13 ++++++++++++
 src/core/tile.c              | 14 ++++++-------
 src/port/iigs/hal.c          | 23 +++++----------------
 src/port/iigs/input.c        |  3 +++
 20 files changed, 130 insertions(+), 71 deletions(-)

diff --git a/make/iigs.mk b/make/iigs.mk
index 1bd8e21..eb107a6 100644
--- a/make/iigs.mk
+++ b/make/iigs.mk
@@ -49,7 +49,18 @@ NTP_BIN      := $(BUILD)/audio/ntpplayer.bin
 NTP_ASM      := $(BUILD)/audio/ntpdata.asm
 IIGS_MERLIN  := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32
 
-LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS)
+# IMPORTANT: CODEGEN_SRCS (specifically spriteEmitIigs.c) MUST be the
+# first entry after the main object in the link order. ORCA-Linker's
+# bank assignment is order-sensitive: when spriteEmitIigs.c lands at
+# any later position, the linker assigns SPRITECG to a bank where its
+# intra-OMF-segment static-symbol relocations (emitMvnCopyRoutine,
+# shiftedByteAt, formerly writeLE16) can't be encoded -- you get cryptic
+# "Addressing error" / "Unresolved reference Label: ..." failures
+# whose root cause is bank packing, not source. Putting CODEGEN_SRCS
+# first gives SPRITECG prime placement and the relocations resolve.
+# This was the underlying cause of feedback_orca_link_segment_count
+# cases 2-5 (we'd been working around it by managing _ROOT mass).
+LIB_SRCS := $(CODEGEN_SRCS) $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM)
 
 HELLO_SRC   := $(EXAMPLES)/hello/hello.c
 HELLO_BIN   := $(BINDIR)/HELLO
diff --git a/src/codegen/spriteCompile.c b/src/codegen/spriteCompile.c
index b8ebf98..74c1c69 100644
--- a/src/codegen/spriteCompile.c
+++ b/src/codegen/spriteCompile.c
@@ -158,10 +158,9 @@ bool spriteCompile(SpriteT *sp) {
 
 #if defined(JOEYLIB_PLATFORM_IIGS)
 
-// y*160 lookup. gRowOffsetLut is the 200-entry uint16_t table built
-// once by iigsInitRowLut at halInit. Replaces ORCA-C's runtime
-// multiply (a JSL into __mul16) with a single indexed long-mode read.
-extern const uint16_t gRowOffsetLut[200];
+// SURFACE_ROW_OFFSET dispatches to the gRowOffsetLut lookup on IIgs;
+// declared in surfaceInternal.h. Replaces ORCA-C's __mul16 JSL with a
+// single indexed long-mode read.
 
 // IIgs uses inline asm + a self-modifying call stub instead of a C
 // function-pointer cast. The build uses ORCA-C large memory model
@@ -212,7 +211,7 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
         uint8_t *destPtr;
         uint8_t  destBytes[4];
         shift   = (uint8_t)(x & 1);
-        destPtr = &dst->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)x >> 1)];
+        destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
         memcpy(destBytes, &destPtr, 4);
         destAddr   = (uint32_t)destBytes[0]
                    | ((uint32_t)destBytes[1] << 8)
@@ -366,7 +365,7 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
     heightPx  = (uint16_t)(sp->heightTiles * 8);
     copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
 
-    screenPtr = (uint8_t *)&src->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)clippedX >> 1)];
+    screenPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)clippedX >> 1)];
     splitPointer(screenPtr,    &screenLo, &screenBank);
     splitPointer(backup->bytes, &backupLo, &backupBank);
 
@@ -450,7 +449,7 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
     spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
     shift             = (copyBytes == spriteBytesPerRow) ? 0 : 1;
 
-    screenPtr = (uint8_t *)&dst->pixels[gRowOffsetLut[(uint16_t)backup->y] + ((uint16_t)backup->x >> 1)];
+    screenPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)];
     splitPointer(screenPtr,    &screenLo, &screenBank);
     splitPointer(backup->bytes, &backupLo, &backupBank);
 
diff --git a/src/codegen/spriteEmitIigs.c b/src/codegen/spriteEmitIigs.c
index 01936d5..a5fda1e 100644
--- a/src/codegen/spriteEmitIigs.c
+++ b/src/codegen/spriteEmitIigs.c
@@ -57,7 +57,6 @@ JOEYLIB_SEGMENT("SPRITECG")
 static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t copyBytes, bool advanceX);
 static void     shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
 static uint8_t  spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
-static uint16_t writeLE16(uint8_t *out, uint16_t value);
 
 
 // ----- Emit helpers (alphabetical) -----
@@ -126,13 +125,9 @@ static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col) {
 }
 
 
-// 65816 is little-endian; write low byte first.
-static uint16_t writeLE16(uint8_t *out, uint16_t value) {
-    out[0] = (uint8_t)(value & 0xFFu);
-    out[1] = (uint8_t)((value >> 8) & 0xFFu);
-    return 2;
-}
-
+// writeLE16 was inlined at all 7 of its call sites. That drops one
+// JSL/RTL round trip per emitted 16-bit operand (immediate or absolute
+// address) and avoids ORCA-Linker bank-fragility around tiny helpers.
 
 // Common backbone for save and restore. Both ops copy a byte-aligned
 // rectangle row-by-row using MVN; only the operand banks (which buffer
@@ -178,11 +173,13 @@ static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t cop
             out[cursor++] = advanceX ? 0x8A : 0x98;        // TXA / TYA
             out[cursor++] = 0x18;                          // CLC
             out[cursor++] = 0x69;                          // ADC #imm (M=16)
-            cursor += writeLE16(out + cursor, advance);
+            out[cursor++] = (uint8_t)(advance & 0xFFu);
+            out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu);
             out[cursor++] = advanceX ? 0xAA : 0xA8;        // TAX / TAY
         }
         out[cursor++] = 0xA9;                              // LDA #imm (M=16)
-        cursor += writeLE16(out + cursor, (uint16_t)(copyBytes - 1));
+        out[cursor++] = (uint8_t)((uint16_t)(copyBytes - 1) & 0xFFu);        // count-1, lo byte
+        out[cursor++] = (uint8_t)(((uint16_t)(copyBytes - 1) >> 8) & 0xFFu); // count-1, hi byte
         out[cursor++] = 0x54;                              // MVN
         out[cursor++] = 0x00;                              // dstbk -- patched per call
         out[cursor++] = 0x00;                              // srcbk -- patched per call
@@ -299,11 +296,15 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
                         out[cursor++] = 0x20;
                         wide = true;
                     }
-                    out[cursor++] = 0xA9;                // LDA #imm16
-                    cursor += writeLE16(out + cursor,
-                                        (uint16_t)(((uint16_t)nextValue << 8) | value));
-                    out[cursor++] = 0x99;                // STA abs,Y
-                    cursor += writeLE16(out + cursor, absOffset);
+                    {
+                        uint16_t pair = (uint16_t)(((uint16_t)nextValue << 8) | value);
+                        out[cursor++] = 0xA9;            // LDA #imm16
+                        out[cursor++] = (uint8_t)(pair & 0xFFu);
+                        out[cursor++] = (uint8_t)((pair >> 8) & 0xFFu);
+                        out[cursor++] = 0x99;            // STA abs,Y
+                        out[cursor++] = (uint8_t)(absOffset & 0xFFu);
+                        out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
+                    }
                     col++;                               // consumed col+1
                     continue;
                 }
@@ -321,16 +322,19 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
                 out[cursor++] = 0xA9;
                 out[cursor++] = value;
                 out[cursor++] = 0x99;
-                cursor += writeLE16(out + cursor, absOffset);
+                out[cursor++] = (uint8_t)(absOffset & 0xFFu);
+                out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
             } else {
                 out[cursor++] = 0xB9;
-                cursor += writeLE16(out + cursor, absOffset);
+                out[cursor++] = (uint8_t)(absOffset & 0xFFu);
+                out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
                 out[cursor++] = 0x29;
                 out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu);
                 out[cursor++] = 0x09;
                 out[cursor++] = value;
                 out[cursor++] = 0x99;
-                cursor += writeLE16(out + cursor, absOffset);
+                out[cursor++] = (uint8_t)(absOffset & 0xFFu);
+                out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
             }
         }
     }
diff --git a/src/core/asset.c b/src/core/asset.c
index 68e3b16..173cd49 100644
--- a/src/core/asset.c
+++ b/src/core/asset.c
@@ -12,6 +12,9 @@
 #include "joey/asset.h"
 #include "joey/palette.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 #define JAS_HEADER_SIZE      44
 #define JAS_PIXELS_OFFSET    JAS_HEADER_SIZE
 #define JAS_PALETTE_OFFSET   12
diff --git a/src/core/audio.c b/src/core/audio.c
index 8cd4824..3a39207 100644
--- a/src/core/audio.c
+++ b/src/core/audio.c
@@ -8,6 +8,9 @@
 #include "joey/audio.h"
 #include "hal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 static bool gAudioReady = false;
 
 
diff --git a/src/core/codegenArena.c b/src/core/codegenArena.c
index 46d3487..4bd3eac 100644
--- a/src/core/codegenArena.c
+++ b/src/core/codegenArena.c
@@ -23,6 +23,9 @@
 
 #include "codegenArenaInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 
 // ----- Module state -----
 
diff --git a/src/core/debug.c b/src/core/debug.c
index 5c57137..bb85d7d 100644
--- a/src/core/debug.c
+++ b/src/core/debug.c
@@ -10,8 +10,12 @@
 #include <stdio.h>
 #include <stdarg.h>
 
+#include "joey/platform.h"
 #include "joey/debug.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 static const char *kLogPath = "joeylog.txt";
 
 
diff --git a/src/core/draw.c b/src/core/draw.c
index 91c220a..ee12b79 100644
--- a/src/core/draw.c
+++ b/src/core/draw.c
@@ -113,7 +113,7 @@ static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_
     uint8_t *line;
 
     for (row = 0; row < h; row++) {
-        line    = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW];
+        line    = &s->pixels[SURFACE_ROW_OFFSET(y + row)];
         pxStart = x;
         pxEnd   = x + w;
 
@@ -208,7 +208,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
 
         // Fallback path needs row; compute it here so the asm path
         // above doesn't pay for an unused y*160 multiply on every iter.
-        row = &s->pixels[y * SURFACE_BYTES_PER_ROW];
+        row = &s->pixels[SURFACE_ROW_OFFSET(y)];
 
         // Tier-2 asm fast path: combined seed test + walk-left +
         // walk-right in one cross-segment call. Falls back to the
@@ -294,7 +294,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
                     }
                     scanY  = (int16_t)(y + 1);
                 }
-                scanRow = &s->pixels[scanY * SURFACE_BYTES_PER_ROW];
+                scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)];
                 // Prefer the combined scan+push asm path (one call per
                 // scan, no markBuf and no per-pixel C edge walk).
                 if (!halFastFloodScanAndPush(scanRow, leftX, rightX,
@@ -502,7 +502,7 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) {
     }
 
     if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) {
-        byte   = &s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)];
+        byte   = &s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)];
         nibble = colorIndex & 0x0F;
         if (x & 1) {
             *byte = (uint8_t)((*byte & 0xF0) | nibble);
@@ -625,7 +625,7 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
     if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
         return;
     }
-    row       = &s->pixels[y * SURFACE_BYTES_PER_ROW];
+    row       = &s->pixels[SURFACE_ROW_OFFSET(y)];
     seedColor = srcPixel(row, x);
     if ((seedColor & 0x0F) == (newColor & 0x0F)) {
         return;
@@ -644,7 +644,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
     if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
         return;
     }
-    row = &s->pixels[y * SURFACE_BYTES_PER_ROW];
+    row = &s->pixels[SURFACE_ROW_OFFSET(y)];
     pix = srcPixel(row, x);
     // Starting on a boundary pixel or already-filled pixel: nothing
     // to do.
@@ -668,7 +668,7 @@ uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) {
         return 0;
     }
 
-    byte = s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)];
+    byte = s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)];
     if (x & 1) {
         return (uint8_t)(byte & 0x0F);
     }
@@ -698,12 +698,12 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) {
 
     srcRowBytes = (int16_t)((src->width + 1) >> 1);
     srcRow      = &src->pixels[srcY0 * srcRowBytes];
-    dstRow      = &dst->pixels[y * SURFACE_BYTES_PER_ROW];
+    dstRow      = &dst->pixels[SURFACE_ROW_OFFSET(y)];
     if (!halFastBlitRect(dstRow, x, srcRow, srcX0,
                          copyW, copyH, srcRowBytes, 0xFFFFu)) {
         for (row = 0; row < copyH; row++) {
             srcRow = &src->pixels[(srcY0 + row) * srcRowBytes];
-            dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW];
+            dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y + row)];
             for (col = 0; col < copyW; col++) {
                 nibble = srcPixel(srcRow, srcX0 + col);
                 dstPixel(dstRow, x + col, nibble);
@@ -738,12 +738,12 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t
     transparent = (uint8_t)(transparentIndex & 0x0F);
     srcRowBytes = (int16_t)((src->width + 1) >> 1);
     srcRow      = &src->pixels[srcY0 * srcRowBytes];
-    dstRow      = &dst->pixels[y * SURFACE_BYTES_PER_ROW];
+    dstRow      = &dst->pixels[SURFACE_ROW_OFFSET(y)];
     if (!halFastBlitRect(dstRow, x, srcRow, srcX0,
                          copyW, copyH, srcRowBytes, (uint16_t)transparent)) {
         for (row = 0; row < copyH; row++) {
             srcRow = &src->pixels[(srcY0 + row) * srcRowBytes];
-            dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW];
+            dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y + row)];
             for (col = 0; col < copyW; col++) {
                 nibble = srcPixel(srcRow, srcX0 + col);
                 if (nibble == transparent) {
diff --git a/src/core/hal.h b/src/core/hal.h
index 3a4665d..2726846 100644
--- a/src/core/hal.h
+++ b/src/core/hal.h
@@ -262,15 +262,19 @@ extern uint16_t gFloodRightX;
         true) \
      : false)
 
-// halFastFillRect stays as a real C wrapper -- removing it triggered
-// an unrelated ORCA linker bank-placement failure (same mode as the
-// peislam.asm deletion: `Unresolved reference Label:
-// emitMvnCopyRoutine` in sprite codegen). The wrapper now just
-// forwards to iigsFillRectInner (asm does partial+middle); we lose
-// the call-site macro inlining for fillRect specifically but keep
-// the rest of the macros AND the new asm helper. Per-call wrapper
-// overhead for halFastFillRect is back (~80 cyc) but at least the
-// per-row partial-byte logic happens in asm now.
+// halFastFillRect: macro form, same shape as the others. Earlier
+// attempts shrank _ROOT enough to retrip the bank-packing fragility;
+// with most core .c files moved to the CORESYS load segment it builds
+// clean. Saves ~80 cyc/call. Keeps the old C wrapper's null guard
+// (spelled 0, no <stddef.h> needed); _s is evaluated up to 3 times.
+#undef  halFastFillRect
+#define halFastFillRect(_s, _x, _y, _w, _h, _c) \
+    ((_s) != 0 && (_s) == stageGet() \
+     ? (iigsFillRectInner((_s)->pixels, (uint16_t)(_x), (uint16_t)(_y), \
+                          (uint16_t)(_w), (uint16_t)(_h), \
+                          (uint16_t)((_c) & 0x0F)), \
+        true) \
+     : false)
 
 // Tile primitives operate on caller-computed row pointers; just
 // forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte
diff --git a/src/core/init.c b/src/core/init.c
index 26f4c99..fc263ff 100644
--- a/src/core/init.c
+++ b/src/core/init.c
@@ -12,6 +12,9 @@
 #include "hal.h"
 #include "surfaceInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 // 8 KB fits the largest typical sprite working set (~3-4 KB per
 // 32x32 sprite at all opaque) and keeps malloc requests small enough
 // for IIgs ORCA-C's small-memory-model heap to satisfy them.
diff --git a/src/core/input.c b/src/core/input.c
index 8347179..28e65c5 100644
--- a/src/core/input.c
+++ b/src/core/input.c
@@ -15,6 +15,9 @@
 #include "hal.h"
 #include "inputInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 bool    gKeyState        [KEY_COUNT];
 bool    gKeyPrev         [KEY_COUNT];
 
diff --git a/src/core/palette.c b/src/core/palette.c
index 812fb94..5a85459 100644
--- a/src/core/palette.c
+++ b/src/core/palette.c
@@ -10,6 +10,9 @@
 #include "joey/palette.h"
 #include "surfaceInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 // ----- Public API (alphabetical) -----
 
 void paletteGet(const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16) {
diff --git a/src/core/present.c b/src/core/present.c
index 84d561a..02468b2 100644
--- a/src/core/present.c
+++ b/src/core/present.c
@@ -12,6 +12,9 @@
 #include "hal.h"
 #include "surfaceInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 // ----- Public API (alphabetical) -----
 
 void stagePresent(void) {
diff --git a/src/core/scb.c b/src/core/scb.c
index 59d44dd..6946c79 100644
--- a/src/core/scb.c
+++ b/src/core/scb.c
@@ -9,6 +9,9 @@
 #include "joey/palette.h"
 #include "surfaceInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 // ----- Public API (alphabetical) -----
 
 uint8_t scbGet(const SurfaceT *s, uint16_t line) {
diff --git a/src/core/sprite.c b/src/core/sprite.c
index e2b6da8..7daf805 100644
--- a/src/core/sprite.c
+++ b/src/core/sprite.c
@@ -13,6 +13,9 @@
 #include "spriteInternal.h"
 #include "surfaceInternal.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 // 8x8 tiles, 4bpp packed = 4 bytes/row * 8 rows = 32 bytes/tile.
 #define TILE_BYTES         32
 #define TILE_PIXELS        8
diff --git a/src/core/surface.c b/src/core/surface.c
index 9035797..013981d 100644
--- a/src/core/surface.c
+++ b/src/core/surface.c
@@ -10,6 +10,13 @@
 #include "hal.h"
 #include "surfaceInternal.h"
 
+// Hoist into a CORESYS load segment alongside the other small core
+// files. Keeps _ROOT thin and stable so it stops reacting to per-file
+// source changes -- _ROOT size flux was tripping ORCA-Linker bank
+// packing in spriteEmitIigs.c (see feedback_orca_link_segment_count
+// cases 2-4).
+JOEYLIB_SEGMENT("CORESYS")
+
 #ifdef JOEYLIB_PLATFORM_IIGS
 extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord);
 #endif
diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h
index 0f1d9d7..175ec9f 100644
--- a/src/core/surfaceInternal.h
+++ b/src/core/surfaceInternal.h
@@ -60,6 +60,19 @@ void surfaceMarkDirtyAll(const SurfaceT *s);
 // Reset every row to CLEAN. Called by stagePresent after the slam.
 void stageDirtyClearAll(void);
 
+// y -> byte offset of row y in a SURFACE_BYTES_PER_ROW-strided buffer.
+// On IIgs: a single indexed long-mode read of gRowOffsetLut (built
+// once at halInit), dodging ORCA-C's __mul16 JSL on every per-row
+// pointer compute. Other ports keep the multiply -- gcc and OpenWatcom
+// reduce the constant 160 to a cheap shift+add chain. No bounds check:
+// _y must be in [0, 200); the IIgs LUT has exactly 200 entries.
+#ifdef JOEYLIB_PLATFORM_IIGS
+extern const uint16_t gRowOffsetLut[200];
+#define SURFACE_ROW_OFFSET(_y) ((uint16_t)gRowOffsetLut[(uint16_t)(_y)])
+#else
+#define SURFACE_ROW_OFFSET(_y) ((uint16_t)((uint16_t)(_y) * SURFACE_BYTES_PER_ROW))
+#endif
+
 // Allocate and free the library-owned stage (the back-buffer surface
 // that stagePresent flips to the display). Called from init.c during
 // joeyInit / joeyShutdown. The stage's pixel storage is supplied by
diff --git a/src/core/tile.c b/src/core/tile.c
index 87bb36a..ce4ac1f 100644
--- a/src/core/tile.c
+++ b/src/core/tile.c
@@ -145,8 +145,8 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
     srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
     srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
 
-    dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)];
-    srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)];
+    dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
+    srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
 
     if (!halFastTileCopy(dstRow0, srcRow0)) {
         copyTileOpaque(dstRow0, srcRow0);
@@ -176,8 +176,8 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
     srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
     srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
 
-    dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)];
-    srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)];
+    dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
+    srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
 
     if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
         copyTileMasked(dstRow0, srcRow0, transparentIndex);
@@ -203,7 +203,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
     doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F));
     if (!halFastTileFill(s, bx, by,
                          (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) {
-        uint8_t *row = &s->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)];
+        uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
         uint8_t  i;
         for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) {
             row[0] = doubled;
@@ -233,7 +233,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
     }
     pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
     pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
-    dstRow = &dst->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)];
+    dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
     src    = &in->pixels[0];
     if (!halFastTilePaste(dstRow, src)) {
         for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
@@ -265,7 +265,7 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
     }
     pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
     pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
-    srcRow = &src->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)];
+    srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
     dst    = &out->pixels[0];
     if (!halFastTileSnap(dst, srcRow)) {
         for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c
index db4bda1..6df76a8 100644
--- a/src/port/iigs/hal.c
+++ b/src/port/iigs/hal.c
@@ -253,7 +253,7 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1
     // Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
     // at $E1:2000 (same offset within their banks). srcOffset is the
     // byte offset of the first byte to copy on the first row.
-    srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart);
+    srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart);
     iigsBlitRectStageToShr(srcOffset, copyBytes, h);
 }
 
@@ -278,23 +278,10 @@ void halShutdown(void) {
 // partial-byte (nibble-edge) handling is too gnarly for a macro.
 
 
-// halFastFillRect: thin wrapper around iigsFillRectInner. The asm
-// helper now handles the partial-byte (nibble-edge) logic that used
-// to live here, so this function is just a stage-check + forward.
-// (It's not macro-dispatched like the others because removing it
-// from the C side triggers an unrelated ORCA-linker bank-placement
-// failure -- the binary needs enough mass in _ROOT to keep sprite
-// codegen's static symbols at addresses the linker can resolve.)
-bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
-    if (s == NULL || s != stageGet()) {
-        return false;
-    }
-    iigsFillRectInner(s->pixels,
-                      (uint16_t)x, (uint16_t)y,
-                      (uint16_t)w, (uint16_t)h,
-                      (uint16_t)(colorIndex & 0x0F));
-    return true;
-}
+// halFastFillRect: macro-dispatched in core/hal.h, same as the other
+// halFast* primitives. The C wrapper that used to live here was kept
+// as load-bearing _ROOT mass to defeat ORCA-Linker bank fragility;
+// since the CORESYS migration drained _ROOT, the macro form is safe.
 
 
 uint8_t *halStageAllocPixels(void) {
diff --git a/src/port/iigs/input.c b/src/port/iigs/input.c
index f4ede3d..337821a 100644
--- a/src/port/iigs/input.c
+++ b/src/port/iigs/input.c
@@ -37,6 +37,9 @@
 #include "inputInternal.h"
 #include "joey/surface.h"
 
+// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
+JOEYLIB_SEGMENT("CORESYS")
+
 // ----- Hardware registers -----
 
 #define IIGS_KBD             ((volatile uint8_t *)0x00C000L)