More speed!

This commit is contained in:
Scott Duensing 2026-04-30 17:41:23 -05:00
parent 04a9550421
commit 20cbccaca5
20 changed files with 130 additions and 71 deletions

View file

@ -49,7 +49,18 @@ NTP_BIN := $(BUILD)/audio/ntpplayer.bin
NTP_ASM := $(BUILD)/audio/ntpdata.asm NTP_ASM := $(BUILD)/audio/ntpdata.asm
IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32 IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32
LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS) # IMPORTANT: CODEGEN_SRCS (specifically spriteEmitIigs.c) MUST be the
# first entry after the main object in the link order. ORCA-Linker's
# bank assignment is order-sensitive: when spriteEmitIigs.c lands at
# any later position, the linker assigns SPRITECG to a bank where its
# intra-OMF-segment static-symbol relocations (emitMvnCopyRoutine,
# shiftedByteAt, writeLE16) can't be encoded -- you get cryptic
# "Addressing error" / "Unresolved reference Label: ..." failures
# whose root cause is bank packing, not source. Putting CODEGEN_SRCS
# first gives SPRITECG prime placement and the relocations resolve.
# This was the underlying cause of feedback_orca_link_segment_count
# cases 2-5 (we'd been working around it by managing _ROOT mass).
LIB_SRCS := $(CODEGEN_SRCS) $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM)
HELLO_SRC := $(EXAMPLES)/hello/hello.c HELLO_SRC := $(EXAMPLES)/hello/hello.c
HELLO_BIN := $(BINDIR)/HELLO HELLO_BIN := $(BINDIR)/HELLO

View file

@ -158,10 +158,9 @@ bool spriteCompile(SpriteT *sp) {
#if defined(JOEYLIB_PLATFORM_IIGS) #if defined(JOEYLIB_PLATFORM_IIGS)
// y*160 lookup. gRowOffsetLut is the 200-entry uint16_t table built // SURFACE_ROW_OFFSET dispatches to the gRowOffsetLut lookup on IIgs;
// once by iigsInitRowLut at halInit. Replaces ORCA-C's runtime // declared in surfaceInternal.h. Replaces ORCA-C's __mul16 JSL with a
// multiply (a JSL into __mul16) with a single indexed long-mode read. // single indexed long-mode read.
extern const uint16_t gRowOffsetLut[200];
// IIgs uses inline asm + a self-modifying call stub instead of a C // IIgs uses inline asm + a self-modifying call stub instead of a C
// function-pointer cast. The build uses ORCA-C large memory model // function-pointer cast. The build uses ORCA-C large memory model
@ -212,7 +211,7 @@ void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y)
uint8_t *destPtr; uint8_t *destPtr;
uint8_t destBytes[4]; uint8_t destBytes[4];
shift = (uint8_t)(x & 1); shift = (uint8_t)(x & 1);
destPtr = &dst->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)x >> 1)]; destPtr = &dst->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)x >> 1)];
memcpy(destBytes, &destPtr, 4); memcpy(destBytes, &destPtr, 4);
destAddr = (uint32_t)destBytes[0] destAddr = (uint32_t)destBytes[0]
| ((uint32_t)destBytes[1] << 8) | ((uint32_t)destBytes[1] << 8)
@ -366,7 +365,7 @@ void spriteCompiledSaveUnder(const SurfaceT *src, SpriteT *sp, int16_t x, int16_
heightPx = (uint16_t)(sp->heightTiles * 8); heightPx = (uint16_t)(sp->heightTiles * 8);
copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0)); copyBytes = (uint16_t)((widthPx >> 1) + (shift == 1 ? 1 : 0));
screenPtr = (uint8_t *)&src->pixels[gRowOffsetLut[(uint16_t)y] + ((uint16_t)clippedX >> 1)]; screenPtr = (uint8_t *)&src->pixels[SURFACE_ROW_OFFSET(y) + ((uint16_t)clippedX >> 1)];
splitPointer(screenPtr, &screenLo, &screenBank); splitPointer(screenPtr, &screenLo, &screenBank);
splitPointer(backup->bytes, &backupLo, &backupBank); splitPointer(backup->bytes, &backupLo, &backupBank);
@ -450,7 +449,7 @@ void spriteCompiledRestoreUnder(SurfaceT *dst, const SpriteBackupT *backup) {
spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4); spriteBytesPerRow = (uint16_t)(sp->widthTiles * 4);
shift = (copyBytes == spriteBytesPerRow) ? 0 : 1; shift = (copyBytes == spriteBytesPerRow) ? 0 : 1;
screenPtr = (uint8_t *)&dst->pixels[gRowOffsetLut[(uint16_t)backup->y] + ((uint16_t)backup->x >> 1)]; screenPtr = (uint8_t *)&dst->pixels[SURFACE_ROW_OFFSET(backup->y) + ((uint16_t)backup->x >> 1)];
splitPointer(screenPtr, &screenLo, &screenBank); splitPointer(screenPtr, &screenLo, &screenBank);
splitPointer(backup->bytes, &backupLo, &backupBank); splitPointer(backup->bytes, &backupLo, &backupBank);

View file

@ -57,7 +57,6 @@ JOEYLIB_SEGMENT("SPRITECG")
static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t copyBytes, bool advanceX); static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t copyBytes, bool advanceX);
static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask); static void shiftedByteAt(const SpriteT *sp, uint16_t row, uint16_t col, uint8_t shift, uint16_t spriteBytesPerRow, uint8_t *outValue, uint8_t *outOpaqueMask);
static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col); static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col);
static uint16_t writeLE16(uint8_t *out, uint16_t value);
// ----- Emit helpers (alphabetical) ----- // ----- Emit helpers (alphabetical) -----
@ -126,13 +125,9 @@ static uint8_t spriteSourceByte(const SpriteT *sp, uint16_t row, uint16_t col) {
} }
// 65816 is little-endian; write low byte first. // writeLE16 was inlined at every call site. Inlining cuts a JSL/RTL
static uint16_t writeLE16(uint8_t *out, uint16_t value) { // per emitted 16-bit immediate (4 instructions per byte * 12 sites)
out[0] = (uint8_t)(value & 0xFFu); // and avoids ORCA-Linker bank-fragility around tiny-helper resolution.
out[1] = (uint8_t)((value >> 8) & 0xFFu);
return 2;
}
// Common backbone for save and restore. Both ops copy a byte-aligned // Common backbone for save and restore. Both ops copy a byte-aligned
// rectangle row-by-row using MVN; only the operand banks (which buffer // rectangle row-by-row using MVN; only the operand banks (which buffer
@ -178,11 +173,13 @@ static uint16_t emitMvnCopyRoutine(uint8_t *out, uint16_t heightPx, uint16_t cop
out[cursor++] = advanceX ? 0x8A : 0x98; // TXA / TYA out[cursor++] = advanceX ? 0x8A : 0x98; // TXA / TYA
out[cursor++] = 0x18; // CLC out[cursor++] = 0x18; // CLC
out[cursor++] = 0x69; // ADC #imm (M=16) out[cursor++] = 0x69; // ADC #imm (M=16)
cursor += writeLE16(out + cursor, advance); out[cursor++] = (uint8_t)(advance & 0xFFu);
out[cursor++] = (uint8_t)((advance >> 8) & 0xFFu);
out[cursor++] = advanceX ? 0xAA : 0xA8; // TAX / TAY out[cursor++] = advanceX ? 0xAA : 0xA8; // TAX / TAY
} }
out[cursor++] = 0xA9; // LDA #imm (M=16) out[cursor++] = 0xA9; // LDA #imm (M=16)
cursor += writeLE16(out + cursor, (uint16_t)(copyBytes - 1)); out[cursor++] = (uint8_t)((copyBytes - 1) & 0xFFu);
out[cursor++] = (uint8_t)(((copyBytes - 1) >> 8) & 0xFFu);
out[cursor++] = 0x54; // MVN out[cursor++] = 0x54; // MVN
out[cursor++] = 0x00; // dstbk -- patched per call out[cursor++] = 0x00; // dstbk -- patched per call
out[cursor++] = 0x00; // srcbk -- patched per call out[cursor++] = 0x00; // srcbk -- patched per call
@ -299,11 +296,15 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = 0x20; out[cursor++] = 0x20;
wide = true; wide = true;
} }
out[cursor++] = 0xA9; // LDA #imm16 {
cursor += writeLE16(out + cursor, uint16_t pair = (uint16_t)(((uint16_t)nextValue << 8) | value);
(uint16_t)(((uint16_t)nextValue << 8) | value)); out[cursor++] = 0xA9; // LDA #imm16
out[cursor++] = 0x99; // STA abs,Y out[cursor++] = (uint8_t)(pair & 0xFFu);
cursor += writeLE16(out + cursor, absOffset); out[cursor++] = (uint8_t)((pair >> 8) & 0xFFu);
out[cursor++] = 0x99; // STA abs,Y
out[cursor++] = (uint8_t)(absOffset & 0xFFu);
out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
}
col++; // consumed col+1 col++; // consumed col+1
continue; continue;
} }
@ -321,16 +322,19 @@ uint16_t spriteEmitDrawIigs(uint8_t *out, const SpriteT *sp, uint8_t shift) {
out[cursor++] = 0xA9; out[cursor++] = 0xA9;
out[cursor++] = value; out[cursor++] = value;
out[cursor++] = 0x99; out[cursor++] = 0x99;
cursor += writeLE16(out + cursor, absOffset); out[cursor++] = (uint8_t)(absOffset & 0xFFu);
out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
} else { } else {
out[cursor++] = 0xB9; out[cursor++] = 0xB9;
cursor += writeLE16(out + cursor, absOffset); out[cursor++] = (uint8_t)(absOffset & 0xFFu);
out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
out[cursor++] = 0x29; out[cursor++] = 0x29;
out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu); out[cursor++] = (uint8_t)(~opaqueMask & 0xFFu);
out[cursor++] = 0x09; out[cursor++] = 0x09;
out[cursor++] = value; out[cursor++] = value;
out[cursor++] = 0x99; out[cursor++] = 0x99;
cursor += writeLE16(out + cursor, absOffset); out[cursor++] = (uint8_t)(absOffset & 0xFFu);
out[cursor++] = (uint8_t)((absOffset >> 8) & 0xFFu);
} }
} }
} }

View file

@ -12,6 +12,9 @@
#include "joey/asset.h" #include "joey/asset.h"
#include "joey/palette.h" #include "joey/palette.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
#define JAS_HEADER_SIZE 44 #define JAS_HEADER_SIZE 44
#define JAS_PIXELS_OFFSET JAS_HEADER_SIZE #define JAS_PIXELS_OFFSET JAS_HEADER_SIZE
#define JAS_PALETTE_OFFSET 12 #define JAS_PALETTE_OFFSET 12

View file

@ -8,6 +8,9 @@
#include "joey/audio.h" #include "joey/audio.h"
#include "hal.h" #include "hal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
static bool gAudioReady = false; static bool gAudioReady = false;

View file

@ -23,6 +23,9 @@
#include "codegenArenaInternal.h" #include "codegenArenaInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Module state ----- // ----- Module state -----

View file

@ -10,8 +10,12 @@
#include <stdio.h> #include <stdio.h>
#include <stdarg.h> #include <stdarg.h>
#include "joey/platform.h"
#include "joey/debug.h" #include "joey/debug.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
static const char *kLogPath = "joeylog.txt"; static const char *kLogPath = "joeylog.txt";

View file

@ -113,7 +113,7 @@ static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_
uint8_t *line; uint8_t *line;
for (row = 0; row < h; row++) { for (row = 0; row < h; row++) {
line = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; line = &s->pixels[SURFACE_ROW_OFFSET(y + row)];
pxStart = x; pxStart = x;
pxEnd = x + w; pxEnd = x + w;
@ -208,7 +208,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
// Fallback path needs row; compute it here so the asm path // Fallback path needs row; compute it here so the asm path
// above doesn't pay for an unused y*160 multiply on every iter. // above doesn't pay for an unused y*160 multiply on every iter.
row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; row = &s->pixels[SURFACE_ROW_OFFSET(y)];
// Tier-2 asm fast path: combined seed test + walk-left + // Tier-2 asm fast path: combined seed test + walk-left +
// walk-right in one cross-segment call. Falls back to the // walk-right in one cross-segment call. Falls back to the
@ -294,7 +294,7 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
} }
scanY = (int16_t)(y + 1); scanY = (int16_t)(y + 1);
} }
scanRow = &s->pixels[scanY * SURFACE_BYTES_PER_ROW]; scanRow = &s->pixels[SURFACE_ROW_OFFSET(scanY)];
// Prefer the combined scan+push asm path (one call per // Prefer the combined scan+push asm path (one call per
// scan, no markBuf and no per-pixel C edge walk). // scan, no markBuf and no per-pixel C edge walk).
if (!halFastFloodScanAndPush(scanRow, leftX, rightX, if (!halFastFloodScanAndPush(scanRow, leftX, rightX,
@ -502,7 +502,7 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) {
} }
if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) { if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) {
byte = &s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)]; byte = &s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)];
nibble = colorIndex & 0x0F; nibble = colorIndex & 0x0F;
if (x & 1) { if (x & 1) {
*byte = (uint8_t)((*byte & 0xF0) | nibble); *byte = (uint8_t)((*byte & 0xF0) | nibble);
@ -625,7 +625,7 @@ void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) {
if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
return; return;
} }
row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; row = &s->pixels[SURFACE_ROW_OFFSET(y)];
seedColor = srcPixel(row, x); seedColor = srcPixel(row, x);
if ((seedColor & 0x0F) == (newColor & 0x0F)) { if ((seedColor & 0x0F) == (newColor & 0x0F)) {
return; return;
@ -644,7 +644,7 @@ void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8
if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) {
return; return;
} }
row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; row = &s->pixels[SURFACE_ROW_OFFSET(y)];
pix = srcPixel(row, x); pix = srcPixel(row, x);
// Starting on a boundary pixel or already-filled pixel: nothing // Starting on a boundary pixel or already-filled pixel: nothing
// to do. // to do.
@ -668,7 +668,7 @@ uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y) {
return 0; return 0;
} }
byte = s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)]; byte = s->pixels[SURFACE_ROW_OFFSET(y) + (x >> 1)];
if (x & 1) { if (x & 1) {
return (uint8_t)(byte & 0x0F); return (uint8_t)(byte & 0x0F);
} }
@ -698,12 +698,12 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) {
srcRowBytes = (int16_t)((src->width + 1) >> 1); srcRowBytes = (int16_t)((src->width + 1) >> 1);
srcRow = &src->pixels[srcY0 * srcRowBytes]; srcRow = &src->pixels[srcY0 * srcRowBytes];
dstRow = &dst->pixels[y * SURFACE_BYTES_PER_ROW]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y)];
if (!halFastBlitRect(dstRow, x, srcRow, srcX0, if (!halFastBlitRect(dstRow, x, srcRow, srcX0,
copyW, copyH, srcRowBytes, 0xFFFFu)) { copyW, copyH, srcRowBytes, 0xFFFFu)) {
for (row = 0; row < copyH; row++) { for (row = 0; row < copyH; row++) {
srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; srcRow = &src->pixels[(srcY0 + row) * srcRowBytes];
dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y + row)];
for (col = 0; col < copyW; col++) { for (col = 0; col < copyW; col++) {
nibble = srcPixel(srcRow, srcX0 + col); nibble = srcPixel(srcRow, srcX0 + col);
dstPixel(dstRow, x + col, nibble); dstPixel(dstRow, x + col, nibble);
@ -738,12 +738,12 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t
transparent = (uint8_t)(transparentIndex & 0x0F); transparent = (uint8_t)(transparentIndex & 0x0F);
srcRowBytes = (int16_t)((src->width + 1) >> 1); srcRowBytes = (int16_t)((src->width + 1) >> 1);
srcRow = &src->pixels[srcY0 * srcRowBytes]; srcRow = &src->pixels[srcY0 * srcRowBytes];
dstRow = &dst->pixels[y * SURFACE_BYTES_PER_ROW]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y)];
if (!halFastBlitRect(dstRow, x, srcRow, srcX0, if (!halFastBlitRect(dstRow, x, srcRow, srcX0,
copyW, copyH, srcRowBytes, (uint16_t)transparent)) { copyW, copyH, srcRowBytes, (uint16_t)transparent)) {
for (row = 0; row < copyH; row++) { for (row = 0; row < copyH; row++) {
srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; srcRow = &src->pixels[(srcY0 + row) * srcRowBytes];
dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(y + row)];
for (col = 0; col < copyW; col++) { for (col = 0; col < copyW; col++) {
nibble = srcPixel(srcRow, srcX0 + col); nibble = srcPixel(srcRow, srcX0 + col);
if (nibble == transparent) { if (nibble == transparent) {

View file

@ -262,15 +262,19 @@ extern uint16_t gFloodRightX;
true) \ true) \
: false) : false)
// halFastFillRect stays as a real C wrapper -- removing it triggered // halFastFillRect: macro form, same shape as the others. Builds
// an unrelated ORCA linker bank-placement failure (same mode as the // clean now that _ROOT has been thinned out via the CORESYS load
// peislam.asm deletion: `Unresolved reference Label: // segment migration -- previous attempts shrank _ROOT enough to
// emitMvnCopyRoutine` in sprite codegen). The wrapper now just // retrip the bank-packing fragility, but with most core .c files
// forwards to iigsFillRectInner (asm does partial+middle); we lose // out of _ROOT that's no longer reactive. Saves ~80 cyc/call.
// the call-site macro inlining for fillRect specifically but keep #undef halFastFillRect
// the rest of the macros AND the new asm helper. Per-call wrapper #define halFastFillRect(_s, _x, _y, _w, _h, _c) \
// overhead for halFastFillRect is back (~80 cyc) but at least the ((_s) == stageGet() \
// per-row partial-byte logic happens in asm now. ? (iigsFillRectInner((_s)->pixels, (uint16_t)(_x), (uint16_t)(_y), \
(uint16_t)(_w), (uint16_t)(_h), \
(uint16_t)((_c) & 0x0F)), \
true) \
: false)
// Tile primitives operate on caller-computed row pointers; just // Tile primitives operate on caller-computed row pointers; just
// forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte // forward the args. by/bx are tile coords -> bx*4 + by*8*160 byte

View file

@ -12,6 +12,9 @@
#include "hal.h" #include "hal.h"
#include "surfaceInternal.h" #include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// 8 KB fits the largest typical sprite working set (~3-4 KB per // 8 KB fits the largest typical sprite working set (~3-4 KB per
// 32x32 sprite at all opaque) and keeps malloc requests small enough // 32x32 sprite at all opaque) and keeps malloc requests small enough
// for IIgs ORCA-C's small-memory-model heap to satisfy them. // for IIgs ORCA-C's small-memory-model heap to satisfy them.

View file

@ -15,6 +15,9 @@
#include "hal.h" #include "hal.h"
#include "inputInternal.h" #include "inputInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
bool gKeyState [KEY_COUNT]; bool gKeyState [KEY_COUNT];
bool gKeyPrev [KEY_COUNT]; bool gKeyPrev [KEY_COUNT];

View file

@ -10,6 +10,9 @@
#include "joey/palette.h" #include "joey/palette.h"
#include "surfaceInternal.h" #include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Public API (alphabetical) ----- // ----- Public API (alphabetical) -----
void paletteGet(const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16) { void paletteGet(const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16) {

View file

@ -12,6 +12,9 @@
#include "hal.h" #include "hal.h"
#include "surfaceInternal.h" #include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Public API (alphabetical) ----- // ----- Public API (alphabetical) -----
void stagePresent(void) { void stagePresent(void) {

View file

@ -9,6 +9,9 @@
#include "joey/palette.h" #include "joey/palette.h"
#include "surfaceInternal.h" #include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Public API (alphabetical) ----- // ----- Public API (alphabetical) -----
uint8_t scbGet(const SurfaceT *s, uint16_t line) { uint8_t scbGet(const SurfaceT *s, uint16_t line) {

View file

@ -13,6 +13,9 @@
#include "spriteInternal.h" #include "spriteInternal.h"
#include "surfaceInternal.h" #include "surfaceInternal.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// 8x8 tiles, 4bpp packed = 4 bytes/row * 8 rows = 32 bytes/tile. // 8x8 tiles, 4bpp packed = 4 bytes/row * 8 rows = 32 bytes/tile.
#define TILE_BYTES 32 #define TILE_BYTES 32
#define TILE_PIXELS 8 #define TILE_PIXELS 8

View file

@ -10,6 +10,13 @@
#include "hal.h" #include "hal.h"
#include "surfaceInternal.h" #include "surfaceInternal.h"
// Hoist into a CORESYS load segment alongside the other small core
// files. Keeps _ROOT thin and stable so it stops reacting to per-file
// source changes -- _ROOT size flux was tripping ORCA-Linker bank
// packing in spriteEmitIigs.c (see feedback_orca_link_segment_count
// cases 2-4).
JOEYLIB_SEGMENT("CORESYS")
#ifdef JOEYLIB_PLATFORM_IIGS #ifdef JOEYLIB_PLATFORM_IIGS
extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord); extern void iigsMarkDirtyRowsInner(uint16_t yStart, uint16_t yEnd, uint16_t minWord, uint16_t maxWord);
#endif #endif

View file

@ -60,6 +60,19 @@ void surfaceMarkDirtyAll(const SurfaceT *s);
// Reset every row to CLEAN. Called by stagePresent after the slam. // Reset every row to CLEAN. Called by stagePresent after the slam.
void stageDirtyClearAll(void); void stageDirtyClearAll(void);
// SURFACE_ROW_OFFSET(_y): byte offset of the first byte of row _y in a
// pixel buffer whose rows are SURFACE_BYTES_PER_ROW bytes apart. Both
// variants cast _y to uint16_t first and yield a uint16_t.
//
// Generic ports just multiply; gcc and OpenWatcom already turn the
// constant 160 into a cheap shift+add chain. The IIgs port instead
// does one indexed long-mode read from gRowOffsetLut (filled in once
// at halInit) so that ORCA-C does not emit a __mul16 JSL every time a
// row pointer is computed.
//
// The IIgs table is declared with 200 entries and the macro performs
// no range check, so callers are responsible for passing a valid row.
#ifndef JOEYLIB_PLATFORM_IIGS
#define SURFACE_ROW_OFFSET(_y) ((uint16_t)((uint16_t)(_y) * SURFACE_BYTES_PER_ROW))
#else
extern const uint16_t gRowOffsetLut[200];
#define SURFACE_ROW_OFFSET(_y) ((uint16_t)gRowOffsetLut[(uint16_t)(_y)])
#endif
// Allocate and free the library-owned stage (the back-buffer surface // Allocate and free the library-owned stage (the back-buffer surface
// that stagePresent flips to the display). Called from init.c during // that stagePresent flips to the display). Called from init.c during
// joeyInit / joeyShutdown. The stage's pixel storage is supplied by // joeyInit / joeyShutdown. The stage's pixel storage is supplied by

View file

@ -145,8 +145,8 @@ void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src,
srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)]; dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)]; srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
if (!halFastTileCopy(dstRow0, srcRow0)) { if (!halFastTileCopy(dstRow0, srcRow0)) {
copyTileOpaque(dstRow0, srcRow0); copyTileOpaque(dstRow0, srcRow0);
@ -176,8 +176,8 @@ void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT
srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE);
srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE);
dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)]; dstRow0 = &dst->pixels[SURFACE_ROW_OFFSET(dstPixelY) + (dstPixelX >> 1)];
srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)]; srcRow0 = &src->pixels[SURFACE_ROW_OFFSET(srcPixelY) + (srcPixelX >> 1)];
if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) {
copyTileMasked(dstRow0, srcRow0, transparentIndex); copyTileMasked(dstRow0, srcRow0, transparentIndex);
@ -203,7 +203,7 @@ void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) {
doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F));
if (!halFastTileFill(s, bx, by, if (!halFastTileFill(s, bx, by,
(uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) {
uint8_t *row = &s->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; uint8_t *row = &s->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
uint8_t i; uint8_t i;
for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) { for (i = 0; i < TILE_PIXELS_PER_SIDE; i++) {
row[0] = doubled; row[0] = doubled;
@ -233,7 +233,7 @@ void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) {
} }
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
dstRow = &dst->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; dstRow = &dst->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
src = &in->pixels[0]; src = &in->pixels[0];
if (!halFastTilePaste(dstRow, src)) { if (!halFastTilePaste(dstRow, src)) {
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
@ -265,7 +265,7 @@ void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) {
} }
pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE);
pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE);
srcRow = &src->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; srcRow = &src->pixels[SURFACE_ROW_OFFSET(pixelY) + (pixelX >> 1)];
dst = &out->pixels[0]; dst = &out->pixels[0];
if (!halFastTileSnap(dst, srcRow)) { if (!halFastTileSnap(dst, srcRow)) {
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {

View file

@ -253,7 +253,7 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1
// Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display // Asm per-row MVN blit. Stage pixels live at $01:2000; SHR display
// at $E1:2000 (same offset within their banks). srcOffset is the // at $E1:2000 (same offset within their banks). srcOffset is the
// byte offset of the first byte to copy on the first row. // byte offset of the first byte to copy on the first row.
srcOffset = (uint16_t)(0x2000 + y * SURFACE_BYTES_PER_ROW + byteStart); srcOffset = (uint16_t)(0x2000 + SURFACE_ROW_OFFSET(y) + byteStart);
iigsBlitRectStageToShr(srcOffset, copyBytes, h); iigsBlitRectStageToShr(srcOffset, copyBytes, h);
} }
@ -278,23 +278,10 @@ void halShutdown(void) {
// partial-byte (nibble-edge) handling is too gnarly for a macro. // partial-byte (nibble-edge) handling is too gnarly for a macro.
// halFastFillRect: thin wrapper around iigsFillRectInner. The asm // halFastFillRect: macro-dispatched in core/hal.h, same as the other
// helper now handles the partial-byte (nibble-edge) logic that used // halFast* primitives. The C wrapper that used to live here was kept
// to live here, so this function is just a stage-check + forward. // as load-bearing _ROOT mass to defeat ORCA-Linker bank fragility;
// (It's not macro-dispatched like the others because removing it // since the CORESYS migration drained _ROOT, the macro form is safe.
// from the C side triggers an unrelated ORCA-linker bank-placement
// failure -- the binary needs enough mass in _ROOT to keep sprite
// codegen's static symbols at addresses the linker can resolve.)
// Stage-only fast path for rectangle fills.
//
// Returns true after handing the fill to iigsFillRectInner when s is
// the library stage (the surface returned by stageGet()). Returns
// false, doing nothing, when s is NULL or any other surface, so the
// caller can fall back to its own fill loop.
//
// x and y are reinterpreted as uint16_t for the asm helper, and only
// the low nibble of colorIndex is passed through.
bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) {
    uint16_t nibble;

    // NULL is tested first so stageGet() is never consulted for it.
    if (s != NULL && s == stageGet()) {
        nibble = (uint16_t)(colorIndex & 0x0F);
        iigsFillRectInner(s->pixels, (uint16_t)x, (uint16_t)y, w, h, nibble);
        return true;
    }
    return false;
}
uint8_t *halStageAllocPixels(void) { uint8_t *halStageAllocPixels(void) {

View file

@ -37,6 +37,9 @@
#include "inputInternal.h" #include "inputInternal.h"
#include "joey/surface.h" #include "joey/surface.h"
// CORESYS: hoisted out of _ROOT (see surface.c for rationale).
JOEYLIB_SEGMENT("CORESYS")
// ----- Hardware registers ----- // ----- Hardware registers -----
#define IIGS_KBD ((volatile uint8_t *)0x00C000L) #define IIGS_KBD ((volatile uint8_t *)0x00C000L)