diff --git a/examples/audio/audio.c b/examples/audio/audio.c index 1f90f3e..bf0c9ae 100644 --- a/examples/audio/audio.c +++ b/examples/audio/audio.c @@ -103,7 +103,7 @@ static void initialPaint(SurfaceT *screen, bool audioOk) { surfaceClear(screen, COLOR_BG); fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, audioOk ? COLOR_HINT : COLOR_BG); - surfacePresent(screen); + stagePresent(); } @@ -128,9 +128,9 @@ int main(void) { return 1; } - screen = surfaceGetScreen(); + screen = stageGet(); if (screen == NULL) { - fprintf(stderr, "surfaceGetScreen returned NULL\n"); + fprintf(stderr, "stageGet returned NULL\n"); joeyShutdown(); return 1; } @@ -171,11 +171,11 @@ int main(void) { if (flashFrames > 0) { fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_BAR); - surfacePresentRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H); + stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H); flashFrames--; if (flashFrames == 0) { fillRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H, COLOR_HINT); - surfacePresentRect(screen, BAR_X, BAR_Y, BAR_W, BAR_H); + stagePresentRect(BAR_X, BAR_Y, BAR_W, BAR_H); } } } diff --git a/examples/draw/draw.c b/examples/draw/draw.c new file mode 100644 index 0000000..fe881c5 --- /dev/null +++ b/examples/draw/draw.c @@ -0,0 +1,277 @@ +// Drawing primitive smoke test. Lays out a 2x2 grid of cells, each +// exercising one family of primitives. On screen each cell should show +// a clear visual signal that the underlying inner loops (C or ASM) +// produced the expected pixel pattern. +// +// TL: drawPixel + drawLine (8-octant fan from cell center; pixel +// row of 14 pixels in palette colors 1..14 near the cell's bottom edge). +// TR: drawRect + fillRect (concentric outlines + filled blocks at +// deliberately odd x / odd width to catch nibble-edge bugs). +// BL: drawCircle + fillCircle (concentric outlines + a small filled +// disk at center). +// BR: tileCopy / tileCopyMasked / tileSnap+tilePaste / floodFill. 
+// +// Runs in HOST_MODE_TAKEOVER and holds the frame until the user +// presses ESC / RETURN / SPACE. + +#include + +#include + +#define CELL_W 160 +#define CELL_H 100 + +// Color slots (palette 0). Color 0 is the library-forced black. +#define C_BG 0 +#define C_BORDER 1 // white +#define C_RED 2 +#define C_GREEN 3 +#define C_BLUE 4 +#define C_YELLOW 5 +#define C_CYAN 6 +#define C_MAGENTA 7 +#define C_ORANGE 8 +#define C_GRAY 9 + +static void buildPalette(SurfaceT *screen); +static void drawCellBorder(SurfaceT *screen, int16_t cx, int16_t cy); +static void drawAllCellBorders(SurfaceT *screen); +static void drawPrimitivesPixelLine(SurfaceT *screen); +static void drawPrimitivesRect(SurfaceT *screen); +static void drawPrimitivesCircle(SurfaceT *screen); +static void drawPrimitivesTileFlood(SurfaceT *screen); +static void waitForKey(void); + + +static void buildPalette(SurfaceT *screen) { + uint16_t colors[SURFACE_COLORS_PER_PALETTE]; + + // 16 distinct $0RGB entries. Index 0 is forced to black anyway. + colors[0] = 0x000; + colors[1] = 0xFFF; // white + colors[2] = 0xF00; // red + colors[3] = 0x0F0; // green + colors[4] = 0x00F; // blue + colors[5] = 0xFF0; // yellow + colors[6] = 0x0FF; // cyan + colors[7] = 0xF0F; // magenta + colors[8] = 0xF80; // orange + colors[9] = 0x888; // mid gray + colors[10] = 0x800; + colors[11] = 0x080; + colors[12] = 0x008; + colors[13] = 0x880; + colors[14] = 0x088; + colors[15] = 0x808; + paletteSet(screen, 0, colors); + scbSetRange(screen, 0, SURFACE_HEIGHT - 1, 0); +} + + +static void drawCellBorder(SurfaceT *screen, int16_t cx, int16_t cy) { + drawRect(screen, cx, cy, CELL_W, CELL_H, C_BORDER); +} + + +static void drawAllCellBorders(SurfaceT *screen) { + drawCellBorder(screen, 0, 0); + drawCellBorder(screen, CELL_W, 0); + drawCellBorder(screen, 0, CELL_H); + drawCellBorder(screen, CELL_W, CELL_H); +} + + +// Top-left cell: drawPixel + drawLine. +// +// 8 lines fan out from the cell center (80, 50). 
Four are diagonal +// (the new ASM Bresenham path) and four are axis-aligned (drawLine +// routes those to fillRect). A horizontal row of 14 pixels along the +// cell's bottom verifies drawPixel: pixel i is plotted in color index +// i + 1, so the row steps through palette entries 1..14 and never +// uses the background color 0. +static void drawPrimitivesPixelLine(SurfaceT *screen) { + int16_t cx; + int16_t cy; + int16_t i; + + cx = CELL_W / 2; // 80 + cy = CELL_H / 2; // 50 + + drawLine(screen, cx, cy, cx + 70, cy, C_RED); // E (horizontal) + drawLine(screen, cx, cy, cx + 60, cy - 40, C_GREEN); // NE (diagonal) + drawLine(screen, cx, cy, cx, cy - 45, C_BLUE); // N (vertical) + drawLine(screen, cx, cy, cx - 60, cy - 40, C_YELLOW); // NW + drawLine(screen, cx, cy, cx - 70, cy, C_CYAN); // W + drawLine(screen, cx, cy, cx - 60, cy + 40, C_MAGENTA); // SW + drawLine(screen, cx, cy, cx, cy + 45, C_ORANGE); // S + drawLine(screen, cx, cy, cx + 60, cy + 40, C_GRAY); // SE + + // Pixel row: 14 single-pixel writes, 10 px apart (x = 10, 20, ..., 140). + // NOTE(review): every x here is even, so odd-x pixel writes are not exercised by this row. + for (i = 0; i < 14; i++) { + drawPixel(screen, (int16_t)(10 + i * 10), (int16_t)(CELL_H - 6), + (uint8_t)((i + 1) & 0x0F)); + } +} + + +// Top-right cell: drawRect + fillRect. +// +// Five rectangles, several at deliberately odd x / odd width, to exercise +// the partial-byte (nibble) edge handling in halFastFillRect. The +// outermost is filled and the next is outline-only; inside the outline +// sit an odd-x odd-width fill, a 1-pixel-wide vertical bar, and a +// 7-pixel-wide bar at odd x (all three drawn with fillRect). +static void drawPrimitivesRect(SurfaceT *screen) { + int16_t ox; + + ox = CELL_W; // cell origin x + + // Outer fill, even-aligned. + fillRect(screen, ox + 8, 8, 144, 84, C_RED); + // Inner outline, odd x to test partial-nibble edges. + drawRect(screen, ox + 17, 17, 124, 64, C_YELLOW); + // Odd width fill, odd x. 
+ fillRect(screen, ox + 25, 25, 35, 48, C_GREEN); + // 1-pixel vertical bar (degenerate rect through fillRect 1-wide path). + fillRect(screen, ox + 100, 25, 1, 48, C_BORDER); + // Odd-x odd-w narrow bar to specifically hit hasLeading + hasTrailing + // in halFastFillRect. + fillRect(screen, ox + 75, 25, 7, 48, C_CYAN); +} + + +// Bottom-left cell: drawCircle + fillCircle. +// +// Concentric outlines at decreasing radii, alternating colors, plus a +// small filled disk at the center. Center is at the cell midpoint. +static void drawPrimitivesCircle(SurfaceT *screen) { + int16_t cx; + int16_t cy; + + cx = CELL_W / 2; + cy = CELL_H + CELL_H / 2; + + drawCircle(screen, cx, cy, 45, C_BORDER); + drawCircle(screen, cx, cy, 35, C_GREEN); + drawCircle(screen, cx, cy, 25, C_YELLOW); + drawCircle(screen, cx, cy, 15, C_CYAN); + fillCircle(screen, cx, cy, 8, C_MAGENTA); +} + + +// Bottom-right cell: tile + flood fill. +// +// Top portion: a 16x16 colored block, then tileSnap one of its 8x8 +// quadrants and tilePaste the captured tile to a neighbor block; +// also tileCopy the same source quadrant to a third location to +// exercise the full surface-to-surface path. Then a tileCopyMasked +// case: paint a 2x1 (16x8) "stripe" containing a transparent color 0 +// pattern interleaved with color, paste it over a solid backdrop with +// transparent=0; the backdrop should show through the transparent +// nibbles. +// +// Bottom portion: drawRect outlines a closed region, floodFillBounded +// fills its interior with a different color, stopping at the outline. +static void drawPrimitivesTileFlood(SurfaceT *screen) { + int16_t ox; + int16_t oy; + int16_t bx; + int16_t by; + int16_t i; + int16_t px; + TileT snapBuf; + + ox = CELL_W; // 160 + oy = CELL_H; // 100 + + // Source 16x16 block at (168, 108): a 4-quadrant pattern. 
+ fillRect(screen, ox + 8, oy + 8, 8, 8, C_RED); + fillRect(screen, ox + 16, oy + 8, 8, 8, C_GREEN); + fillRect(screen, ox + 8, oy + 16, 8, 8, C_BLUE); + fillRect(screen, ox + 16, oy + 16, 8, 8, C_YELLOW); + + // tileSnap block (bx=21, by=13) over the red quadrant and tilePaste it four blocks to the right (block 25, 13). + // NOTE(review): oy + 8 = 108 is not a multiple of 8, so block row 13 spans y = 104..111 and covers only the top half of the quadrant (y = 108..115) -- confirm this is intended. + bx = (int16_t)((ox + 8) / 8); + by = (int16_t)((oy + 8) / 8); + tileSnap(screen, (uint8_t)bx, (uint8_t)by, &snapBuf); + tilePaste(screen, (uint8_t)(bx + 4), (uint8_t)by, &snapBuf); + + // tileCopy from block (bx+1, by) over the green quadrant to block (bx+4, by+1), directly below the pasted tile. + tileCopy(screen, (uint8_t)(bx + 4), (uint8_t)(by + 1), + screen, (uint8_t)(bx + 1), (uint8_t)by); + + // tileCopyMasked test: build a "transparent" striped pattern at + // (272, 132). The tile's source has color 0 in alternating + // nibbles. Paste it onto a solid orange backdrop at (240, 132) so + // transparent nibbles let the orange show through. + fillRect(screen, ox + 80, oy + 32, 16, 8, C_ORANGE); // backdrop + // Build a vertical-stripe source at (272, 132): col-pixel = (px % 2 ? color : 0) + for (i = 0; i < 8; i++) { + for (px = 0; px < 16; px++) { + drawPixel(screen, (int16_t)(ox + 112 + px), (int16_t)(oy + 32 + i), + (uint8_t)((px & 1) ? C_MAGENTA : 0)); + } + } + // tileCopyMasked: one 8x8 block, source block (ox+112)/8 = 34, block row (oy+32)/8 = 16 + // -> dst backdrop block (ox+80)/8 = 30, block row 16. NOTE(review): block row 16 spans y = 128..135, but the stripe and backdrop start at y = 132 -- confirm. + tileCopyMasked(screen, (uint8_t)((ox + 80) / 8), (uint8_t)((oy + 32) / 8), + screen, (uint8_t)((ox + 112) / 8), (uint8_t)((oy + 32) / 8), + 0); + + // Flood-fill region: a small bordered rectangle in the cell's + // lower portion. Outline drawn in C_BORDER; floodFillBounded + // from a point inside should fill with C_CYAN, stopping at the + // border. + drawRect(screen, ox + 16, oy + 60, 64, 32, C_BORDER); + floodFillBounded(screen, (int16_t)(ox + 48), (int16_t)(oy + 76), + C_CYAN, C_BORDER); + + // Plain floodFill: solid block then re-fill to a new color. 
+ fillRect(screen, ox + 96, oy + 60, 48, 32, C_GREEN); + floodFill(screen, (int16_t)(ox + 120), (int16_t)(oy + 76), C_GRAY); +} + + +static void waitForKey(void) { + joeyWaitForAnyKey(); +} + + +int main(void) { + JoeyConfigT config; + SurfaceT *screen; + + config.hostMode = HOST_MODE_TAKEOVER; + config.codegenBytes = 8 * 1024; + config.maxSurfaces = 4; + config.audioBytes = 64 * 1024; + config.assetBytes = 128 * 1024; + + if (!joeyInit(&config)) { + fprintf(stderr, "joeyInit failed: %s\n", joeyLastError()); + return 1; + } + + screen = stageGet(); + if (screen == NULL) { + fprintf(stderr, "stageGet returned NULL\n"); + joeyShutdown(); + return 1; + } + + buildPalette(screen); + surfaceClear(screen, C_BG); + drawAllCellBorders(screen); + drawPrimitivesPixelLine(screen); + drawPrimitivesRect(screen); + drawPrimitivesCircle(screen); + drawPrimitivesTileFlood(screen); + stagePresent(); + + waitForKey(); + + joeyShutdown(); + return 0; +} diff --git a/examples/joy/joy.c b/examples/joy/joy.c index 165b170..d2894c2 100644 --- a/examples/joy/joy.c +++ b/examples/joy/joy.c @@ -81,7 +81,7 @@ static void buildPalette(SurfaceT *screen) { static void drawAndPresent(SurfaceT *screen, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t color) { fillRect(screen, x, y, (uint16_t)w, (uint16_t)h, color); - surfacePresentRect(screen, x, y, (uint16_t)w, (uint16_t)h); + stagePresentRect(x, y, (uint16_t)w, (uint16_t)h); } @@ -143,7 +143,7 @@ static void initialPaint(SurfaceT *screen) { gView[i].valid = false; gView[i].connected = false; } - surfacePresent(screen); + stagePresent(); } @@ -226,9 +226,9 @@ int main(void) { return 1; } - screen = surfaceGetScreen(); + screen = stageGet(); if (screen == NULL) { - fprintf(stderr, "surfaceGetScreen returned NULL\n"); + fprintf(stderr, "stageGet returned NULL\n"); joeyShutdown(); return 1; } diff --git a/examples/keys/keys.c b/examples/keys/keys.c index 7da35f0..841acb5 100644 --- a/examples/keys/keys.c +++ b/examples/keys/keys.c @@ -149,7 
+149,7 @@ static void initialPaint(SurfaceT *screen) { gCellLit[row][col] = false; } } - surfacePresent(screen); + stagePresent(); } @@ -174,7 +174,7 @@ static void presentChangedCells(SurfaceT *screen, int16_t cursorCol, int16_t cur drawCell(screen, col, row, lit); x = (int16_t)(MARGIN_X + col * (CELL_W + GAP)); y = (int16_t)(MARGIN_Y + row * (CELL_H + GAP)); - surfacePresentRect(screen, x, y, CELL_W, CELL_H); + stagePresentRect(x, y, CELL_W, CELL_H); gCellLit[row][col] = lit; } } @@ -195,19 +195,19 @@ static void updateCursor(SurfaceT *screen, int16_t cursorCol, int16_t cursorRow) if (gLastCursorX != mouseX || gLastCursorY != mouseY) { if (gLastCursorCol != CELL_NONE) { drawCell(screen, gLastCursorCol, gLastCursorRow, gCellLit[gLastCursorRow][gLastCursorCol]); - surfacePresentRect(screen, + stagePresentRect( (int16_t)(MARGIN_X + gLastCursorCol * (CELL_W + GAP)), (int16_t)(MARGIN_Y + gLastCursorRow * (CELL_H + GAP)), CELL_W, CELL_H); } else if (gLastCursorX >= 0 && gLastCursorY >= 0) { // Old cursor was in a gap region. Stamp background over it. 
fillRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H, COLOR_BACKGROUND); - surfacePresentRect(screen, gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H); + stagePresentRect(gLastCursorX, gLastCursorY, CURSOR_W, CURSOR_H); } } drawCursor(screen, mouseX, mouseY); - surfacePresentRect(screen, mouseX, mouseY, CURSOR_W, CURSOR_H); + stagePresentRect(mouseX, mouseY, CURSOR_W, CURSOR_H); gLastCursorX = mouseX; gLastCursorY = mouseY; @@ -233,9 +233,9 @@ int main(void) { return 1; } - screen = surfaceGetScreen(); + screen = stageGet(); if (screen == NULL) { - fprintf(stderr, "surfaceGetScreen returned NULL\n"); + fprintf(stderr, "stageGet returned NULL\n"); joeyShutdown(); return 1; } diff --git a/examples/pattern/pattern.c b/examples/pattern/pattern.c index 36ec6f8..76b1e0b 100644 --- a/examples/pattern/pattern.c +++ b/examples/pattern/pattern.c @@ -8,7 +8,6 @@ // library contract, so the leftmost stripe is black in every band. #include -#include #include @@ -16,13 +15,11 @@ #define BAND_HEIGHT (SURFACE_HEIGHT / BAND_COUNT) #define STRIPE_COUNT 16 #define STRIPE_WIDTH (SURFACE_WIDTH / STRIPE_COUNT) -#define DISPLAY_SECONDS 5 static void buildPalettes(SurfaceT *screen); static void buildScbs(SurfaceT *screen); static void drawStripes(SurfaceT *screen); static void makeGradient(uint16_t *out16, int redOn, int greenOn, int blueOn); -static void waitSeconds(int seconds); static void buildPalettes(SurfaceT *screen) { @@ -102,15 +99,6 @@ static void makeGradient(uint16_t *out16, int redOn, int greenOn, int blueOn) { } -static void waitSeconds(int seconds) { - time_t start; - time_t now; - - start = time(NULL); - do { - now = time(NULL); - } while ((long)(now - start) < (long)seconds); -} int main(void) { @@ -128,9 +116,9 @@ int main(void) { return 1; } - screen = surfaceGetScreen(); + screen = stageGet(); if (screen == NULL) { - fprintf(stderr, "surfaceGetScreen returned NULL\n"); + fprintf(stderr, "stageGet returned NULL\n"); joeyShutdown(); return 1; } @@ -138,9 +126,9 
@@ int main(void) { buildPalettes(screen); buildScbs(screen); drawStripes(screen); - surfacePresent(screen); + stagePresent(); - waitSeconds(DISPLAY_SECONDS); + joeyWaitForAnyKey(); joeyShutdown(); return 0; diff --git a/examples/sprite/sprite.c b/examples/sprite/sprite.c index 18d8e14..8006afd 100644 --- a/examples/sprite/sprite.c +++ b/examples/sprite/sprite.c @@ -14,7 +14,6 @@ #define BALL_TILES_X (BALL_W / 8) #define BALL_TILES_Y (BALL_H / 8) -#define TILE_BYTES 32 #define BALL_TILE_BYTES (BALL_TILES_X * BALL_TILES_Y * TILE_BYTES) // SaveUnder must store rounded-up byte boundaries: x rounded down to // even, width rounded up to even. Worst case for BALL_W=16 (already @@ -122,9 +121,9 @@ int main(void) { return 1; } - screen = surfaceGetScreen(); + screen = stageGet(); if (screen == NULL) { - fprintf(stderr, "surfaceGetScreen returned NULL\n"); + fprintf(stderr, "stageGet returned NULL\n"); joeyShutdown(); return 1; } @@ -145,7 +144,7 @@ int main(void) { buildPalette(screen); scbSetRange(screen, 0, SURFACE_HEIGHT - 1, BALL_PALETTE_IDX); surfaceClear(screen, COLOR_BG); - surfacePresent(screen); + stagePresent(); backup.bytes = gBallBackup; @@ -157,7 +156,7 @@ int main(void) { spriteSaveUnder(screen, ball, x, y, &backup); spriteDraw(screen, ball, x, y); - surfacePresentRect(screen, backup.x, backup.y, backup.width, backup.height); + stagePresentRect(backup.x, backup.y, backup.width, backup.height); haveBackup = true; for (;;) { @@ -168,7 +167,7 @@ int main(void) { // Stash the prior ball's region before restoring the bytes // under it. Do all off-screen work (restore + move + draw) - // first, then waitVBL + ONE surfacePresentRect covering both + // first, then waitVBL + ONE stagePresentRect covering both // old and new regions. 
Putting waitVBL immediately before the // present lets the present land inside the VBL window so the // CRT never sees a half-updated framebuffer (matters most on @@ -206,7 +205,7 @@ int main(void) { : (backup.y + backup.height)); joeyWaitVBL(); - surfacePresentRect(screen, unionX, unionY, + stagePresentRect(unionX, unionY, (uint16_t)(unionRight - unionX), (uint16_t)(unionBottom - unionY)); haveBackup = true; diff --git a/include/joey/draw.h b/include/joey/draw.h index d851805..419383d 100644 --- a/include/joey/draw.h +++ b/include/joey/draw.h @@ -23,9 +23,40 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex); // Read a pixel value. Off-surface coordinates return 0. uint8_t samplePixel(const SurfaceT *s, int16_t x, int16_t y); +// Plot a line from (x0, y0) to (x1, y1) using Bresenham. Endpoints +// are inclusive. Off-surface pixels are skipped per-pixel; lines that +// pass entirely off-surface draw nothing. Horizontal and vertical +// runs hit fast paths. +void drawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex); + +// Outline a rectangle (1-pixel-wide border). 1xN / Nx1 degenerate +// to vertical / horizontal lines; 1x1 to a single pixel. Negative or +// zero dimensions are no-ops. +void drawRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex); + // Fill a solid rectangle. Negative or zero dimensions are no-ops. void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex); +// Outline a circle of radius r centered at (cx, cy) using Bresenham +// midpoint. r == 0 plots a single pixel. +void drawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex); + +// Fill a disk of radius r centered at (cx, cy). r == 0 plots a single +// pixel. Spans are emitted per scanline using midpoint symmetry. 
+void fillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex); + +// Flood fill a 4-connected region starting at (x, y). Replaces every +// pixel of the original color reached via N/S/E/W steps. No-op if +// (x, y) is off-surface or already matches newColor. +void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor); + +// Flood fill a 4-connected region starting at (x, y), stopping at +// boundary pixels of color boundaryColor. Replaces every reachable +// pixel that is not boundaryColor with newColor. Used for vector-art +// rendering (e.g. Sierra-style picture playback): outline a closed +// region with drawLine in boundaryColor, then fill with this. +void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor); + // Blit an asset onto the surface at (x, y). Source nibbles overwrite // destination nibbles verbatim -- the caller is responsible for // matching the asset's palette to the destination palette (typically diff --git a/include/joey/input.h b/include/joey/input.h index c4a16a8..c9414ba 100644 --- a/include/joey/input.h +++ b/include/joey/input.h @@ -89,6 +89,12 @@ typedef enum { void joeyInputPoll(void); +// Block until the user presses any key. Internally polls via +// joeyInputPoll, so per-port halInputPoll machinery (including +// audio-friendly IRQ-driven samplers) keeps working while the +// wait loop runs. 
+void joeyWaitForAnyKey(void); + bool joeyKeyDown(JoeyKeyE key); bool joeyKeyPressed(JoeyKeyE key); bool joeyKeyReleased(JoeyKeyE key); diff --git a/include/joey/joey.h b/include/joey/joey.h index 65300ea..fc725b8 100644 --- a/include/joey/joey.h +++ b/include/joey/joey.h @@ -13,6 +13,7 @@ #include "palette.h" #include "asset.h" #include "draw.h" +#include "tile.h" #include "present.h" #include "input.h" #include "audio.h" diff --git a/include/joey/platform.h b/include/joey/platform.h index d33146f..ac8f18c 100644 --- a/include/joey/platform.h +++ b/include/joey/platform.h @@ -63,6 +63,20 @@ #define JOEYLIB_PLATFORM_NAME "MS-DOS" #endif +// ----- ORCA-C named load segments ----- +// +// On the IIgs the ORCA Linker fits each load segment in its own bank, +// so spilling cross-platform .c files into named segments is the way +// to keep monolithic IIgs binaries under the 64 KB-per-bank _ROOT +// limit. The `segment "name";` statement is ORCA-C-specific syntax +// (see ORCA/C ch. 30); other ports' compilers don't recognize it, so +// the macro evaluates to nothing on Amiga/ST/DOS. +#ifdef JOEYLIB_PLATFORM_IIGS + #define JOEYLIB_SEGMENT(name) segment name; +#else + #define JOEYLIB_SEGMENT(name) +#endif + // ----- Library version ----- #define JOEYLIB_VERSION_MAJOR 1 diff --git a/include/joey/present.h b/include/joey/present.h index 7e64687..521baf7 100644 --- a/include/joey/present.h +++ b/include/joey/present.h @@ -1,9 +1,11 @@ -// Present / slam. +// Stage present. // -// surfacePresent copies pixels, SCBs, and palettes from a source -// surface to the visible display. On chunky platforms (IIgs, DOS) this -// is a direct copy; on planar platforms (Amiga, Atari ST) this is a -// chunky-to-planar conversion. See docs/DESIGN.md section 7. +// stagePresent flips the library-owned stage (back-buffer) to the +// display. On chunky platforms (IIgs, DOS) this is a direct copy; on +// planar platforms (Amiga, Atari ST) this is a chunky-to-planar +// conversion. 
Drawing primitives mark per-row dirty ranges on the +// stage as a side effect, so stagePresent only touches rows that +// actually changed since the last present. See docs/DESIGN.md. #ifndef JOEYLIB_PRESENT_H #define JOEYLIB_PRESENT_H @@ -12,12 +14,15 @@ #include "surface.h" #include "types.h" -// Present the entire source surface to the display. -void surfacePresent(const SurfaceT *src); +// Flip the dirty regions of the stage to the display, then clear the +// dirty state. Cheap when nothing has changed since the last call. +void stagePresent(void); -// Present a rectangular region of the source surface to the display. -// The rect is clipped to the surface. Negative or zero dimensions are -// no-ops. -void surfacePresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h); +// Flip a specific rectangular region of the stage to the display, +// regardless of dirty state. Coordinates are clipped to the surface; +// negative or zero dimensions are no-ops. Does not consult or modify +// the dirty arrays -- callers mixing stagePresentRect with stagePresent +// in the same frame may see redundant work on the next stagePresent. +void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h); #endif diff --git a/include/joey/surface.h b/include/joey/surface.h index c046a63..6f6e9d9 100644 --- a/include/joey/surface.h +++ b/include/joey/surface.h @@ -31,13 +31,15 @@ typedef struct SurfaceT SurfaceT; SurfaceT *surfaceCreate(void); // Release an offscreen surface previously returned by surfaceCreate. -// Passing NULL is a no-op. Passing the screen surface is a no-op. +// Passing NULL is a no-op. Passing the stage is a no-op. void surfaceDestroy(SurfaceT *s); -// The library's pre-allocated screen surface. This is the surface the -// library presents to the display. Always valid between joeyInit and -// joeyShutdown. -SurfaceT *surfaceGetScreen(void); +// The library-owned stage: the back-buffer surface that stagePresent +// flips to the display. 
Always valid between joeyInit and joeyShutdown. +// On IIgs the stage's pixel buffer is pinned to bank $01 SHR space with +// shadow inhibited so writes are full-speed (2.8 MHz) and isolated from +// the displayed framebuffer until the next stagePresent. +SurfaceT *stageGet(void); // Copy pixels, SCBs, and palettes from src into dst. Both must be valid // surfaces. diff --git a/include/joey/tile.h b/include/joey/tile.h new file mode 100644 index 0000000..0299fdc --- /dev/null +++ b/include/joey/tile.h @@ -0,0 +1,96 @@ +// Tiles: 8x8 pixel blocks aligned on the 8-pixel grid of any surface. +// +// A "tile" in JoeyLib isn't a separate object -- it's just the 8x8 +// region of a SurfaceT at block coordinates (bx, by), where bx is in +// [0, 39] and by is in [0, 24] (40x25 blocks per 320x200 surface). +// The tile API is a small set of operations that move 32-byte chunks +// between surfaces or fill them with a solid color. +// +// Why this shape: anything you can do with a regular surface -- +// drawPixel, drawLine, fillRect, blits -- also works on tile-aligned +// regions, so authors can paint, edit, and procedurally generate tile +// content using the same primitives they use for everything else. +// "Fonts," "terrain tilesets," and "spritesheets" are just surfaces +// you treat as tile sources at draw time. +// +// Snap / paste use a small TileT value type so callers don't need a +// 32 KB scratch surface for save-under-style work. +// +// Block coords map to pixels by multiplying by 8 -- byte-aligned in +// 4bpp packed (8 px = 4 bytes per row), so all the operations are +// byte-shoveling memcpys with no shifting. 
+ +#ifndef JOEYLIB_TILE_H +#define JOEYLIB_TILE_H + +#include "platform.h" +#include "surface.h" +#include "types.h" + +// ----- Constants ----- + +#define TILE_PIXELS_PER_SIDE 8 +#define TILE_BYTES_PER_ROW 4 +#define TILE_BYTES (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE) +#define TILE_BLOCKS_PER_ROW (SURFACE_WIDTH / TILE_PIXELS_PER_SIDE) +#define TILE_BLOCKS_PER_COL (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE) + +// Sentinel for asciiMap entries that should not draw. drawText +// advances the cursor past TILE_NO_GLYPH chars without writing. +#define TILE_NO_GLYPH ((uint16_t)0xFFFFu) + +// ----- Types ----- + +// Stack-allocated 32-byte snapshot buffer for tileSnap / tilePaste. +typedef struct TileT { + uint8_t pixels[TILE_BYTES]; +} TileT; + +// ----- API ----- + +// Copy the 8x8 block at (srcBx, srcBy) on src to (dstBx, dstBy) on +// dst, opaque. Out-of-range block coordinates on either side are +// silent no-ops; src and dst can be the same surface. +void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, + const SurfaceT *src, uint8_t srcBx, uint8_t srcBy); + +// Like tileCopy but pixels equal to transparentIndex are skipped -- +// the destination pixel keeps its original value. Use this for fonts +// (transparentIndex = 0 leaves the page background showing through +// glyph backgrounds) and for any tileset where some pixels are meant +// to be see-through. +void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, + const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, + uint8_t transparentIndex); + +// Fill the 8x8 block at (bx, by) with a solid color. Equivalent to +// fillRect(s, bx*8, by*8, 8, 8, colorIndex) but skips the rect +// clipping math since tile coords are already known to be in range. +void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex); + +// Capture the 8x8 block at (bx, by) into the caller's TileT. Used +// for save-under style work where allocating a scratch surface would +// be overkill. 
+void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out); + +// Paste a TileT back onto a surface at block (bx, by). Always +// opaque; use tileCopyMasked if you need transparency. +void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in); + +// Draw a NUL-terminated ASCII string at block (bx, by) using glyphs +// pulled from fontSurface. +// +// asciiMap is a 256-entry table mapping ASCII code to glyph location +// on fontSurface, encoded as a packed uint16_t: low byte = source +// blockX, high byte = source blockY. asciiMap[c] == TILE_NO_GLYPH +// causes that character to be skipped (cursor advances, nothing +// drawn). Glyph color 0 is treated as transparent so the underlying +// surface shows through glyph backgrounds. +// +// The cursor wraps to the next row at the right edge and truncates +// at the bottom edge. +void drawText(SurfaceT *dst, uint8_t bx, uint8_t by, + const SurfaceT *fontSurface, const uint16_t *asciiMap, + const char *str); + +#endif diff --git a/make/amiga.mk b/make/amiga.mk index 4f7ddfb..5429346 100644 --- a/make/amiga.mk +++ b/make/amiga.mk @@ -60,6 +60,8 @@ HELLO_SRC := $(EXAMPLES)/hello/hello.c HELLO_BIN := $(BINDIR)/Hello PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c PATTERN_BIN := $(BINDIR)/Pattern +DRAW_SRC := $(EXAMPLES)/draw/draw.c +DRAW_BIN := $(BINDIR)/Draw KEYS_SRC := $(EXAMPLES)/keys/keys.c KEYS_BIN := $(BINDIR)/Keys JOY_SRC := $(EXAMPLES)/joy/joy.c @@ -76,7 +78,7 @@ DATA_DIR := $(BINDIR)/DATA DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx .PHONY: all amiga clean-amiga -all amiga: $(LIB) $(HELLO_BIN) $(PATTERN_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) +all amiga: $(LIB) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) $(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c @mkdir -p $(dir $@) @@ -118,6 +120,10 @@ $(PATTERN_BIN): $(PATTERN_SRC) $(LIB) @mkdir -p $(dir $@) $(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS) 
+$(DRAW_BIN): $(DRAW_SRC) $(LIB) + @mkdir -p $(dir $@) + $(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS) + $(KEYS_BIN): $(KEYS_SRC) $(LIB) @mkdir -p $(dir $@) $(AMIGA_CC) $(CFLAGS) $< $(LIB) -o $@ $(LDFLAGS) diff --git a/make/atarist.mk b/make/atarist.mk index 64d3c4e..63c87d7 100644 --- a/make/atarist.mk +++ b/make/atarist.mk @@ -45,6 +45,8 @@ HELLO_SRC := $(EXAMPLES)/hello/hello.c HELLO_BIN := $(BINDIR)/HELLO.PRG PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c PATTERN_BIN := $(BINDIR)/PATTERN.PRG +DRAW_SRC := $(EXAMPLES)/draw/draw.c +DRAW_BIN := $(BINDIR)/DRAW.PRG KEYS_SRC := $(EXAMPLES)/keys/keys.c KEYS_BIN := $(BINDIR)/KEYS.PRG JOY_SRC := $(EXAMPLES)/joy/joy.c @@ -61,7 +63,7 @@ DATA_DIR := $(BINDIR)/DATA DATA_FILES := $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx .PHONY: all atarist clean-atarist -all atarist: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) +all atarist: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) $(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c @mkdir -p $(dir $@) @@ -110,6 +112,10 @@ $(PATTERN_BIN): $(PATTERN_SRC) $(LIB) @mkdir -p $(dir $@) $(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS) +$(DRAW_BIN): $(DRAW_SRC) $(LIB) + @mkdir -p $(dir $@) + $(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS) + $(KEYS_BIN): $(KEYS_SRC) $(LIB) @mkdir -p $(dir $@) $(ST_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(LDFLAGS) diff --git a/make/dos.mk b/make/dos.mk index a1b808c..5becb7d 100644 --- a/make/dos.mk +++ b/make/dos.mk @@ -39,6 +39,8 @@ HELLO_SRC := $(EXAMPLES)/hello/hello.c HELLO_BIN := $(BINDIR)/HELLO.EXE PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c PATTERN_BIN := $(BINDIR)/PATTERN.EXE +DRAW_SRC := $(EXAMPLES)/draw/draw.c +DRAW_BIN := $(BINDIR)/DRAW.EXE KEYS_SRC := $(EXAMPLES)/keys/keys.c KEYS_BIN := $(BINDIR)/KEYS.EXE JOY_SRC := $(EXAMPLES)/joy/joy.c @@ -54,7 +56,7 @@ DATA_DIR := $(BINDIR)/DATA DATA_FILES 
:= $(DATA_DIR)/test.mod $(DATA_DIR)/test.sfx .PHONY: all dos clean-dos -all dos: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) +all dos: $(LIB) $(LIBXMP_AR) $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(DATA_FILES) $(BUILD)/obj/core/%.o: $(SRC_CORE)/%.c @mkdir -p $(dir $@) @@ -94,6 +96,11 @@ $(PATTERN_BIN): $(PATTERN_SRC) $(LIB) $(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ $(DOS_EMBED_DPMI) $@ +$(DRAW_BIN): $(DRAW_SRC) $(LIB) + @mkdir -p $(dir $@) + $(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ + $(DOS_EMBED_DPMI) $@ + $(KEYS_BIN): $(KEYS_SRC) $(LIB) @mkdir -p $(dir $@) $(DOS_CC) $(CFLAGS) $< $(LIB) $(LIBXMP_AR) -o $@ diff --git a/make/iigs.mk b/make/iigs.mk index 4315b4c..1bd8e21 100644 --- a/make/iigs.mk +++ b/make/iigs.mk @@ -12,17 +12,21 @@ PLATFORM := iigs BUILD := $(REPO_DIR)/build/$(PLATFORM) BINDIR := $(BUILD)/bin -PORT_C_SRCS_ALL := $(wildcard $(SRC_PORT)/iigs/*.c) +PORT_C_SRCS_ALL := $(wildcard $(SRC_PORT)/iigs/*.c) +# Hand-rolled .asm sources go through ORCA's macro assembler via +# iix-build.sh's `assemble` dispatch. Each .asm declares its target +# load segment in the START operand (e.g. peislam.asm -> PEISLAMS) +# so the linker places its bytes in a separate bank from _ROOT. +# See ORCA/M for IIgs ch. 6 "Load Segments" for the mechanism. +PORT_ASM_SRCS_ALL := $(wildcard $(SRC_PORT)/iigs/*.asm) -# audio.c is the no-op stub linked into every demo. audio_full.c is the -# real implementation (NewHandle / fopen / JSL trampoline) and links -# only into AUDIO -- the IIgs build is monolithic, so pulling Memory -# Manager + ORCA stdio into every binary blows the linker's -# "Expression too complex" budget. The two files define the same -# halAudio* symbols; iigs/audio.c is filtered out of the AUDIO source -# set, audio_full.c is filtered out of the everyone-else set. 
-PORT_C_SRCS := $(filter-out %/audio_full.c, $(PORT_C_SRCS_ALL)) -PORT_C_SRCS_AUDIO := $(filter-out %/audio.c, $(PORT_C_SRCS_ALL)) +# audio_full.c declares its functions in the AUDIOIMPL load segment +# (`segment "AUDIOIMPL"` at file scope, see ORCA/C ch. 30) so the +# implementation code lives in its own bank, not _ROOT. That lets +# the same source link into every binary, replacing the earlier +# audio.c-stub vs audio_full.c-real split. The 34 KB NTP replayer +# bytes now ride along via the generated NTPDATA asm segment. +PORT_C_SRCS := $(PORT_C_SRCS_ALL) # IIgs uses NTPstreamsound for SFX, not the libxmp+overlay combo that # DOS and ST share, so src/core/audioSfxMix.c is unused here. Filter @@ -34,9 +38,6 @@ CORE_C_SRCS_IIGS := $(filter-out %/audioSfxMix.c, $(CORE_C_SRCS)) CODEGEN_SRCS := $(REPO_DIR)/src/codegen/spriteEmitIigs.c \ $(REPO_DIR)/src/codegen/spriteCompile.c -LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(CODEGEN_SRCS) -LIB_SRCS_AUDIO := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS_AUDIO) $(CODEGEN_SRCS) - # NinjaTrackerPlus replayer. Assembled with Merlin32 from the staged # source at toolchains/iigs/ntp/ninjatrackerplus.s. Output is a 34 KB # raw 65816 binary that the IIgs audio HAL loads at runtime via @@ -45,13 +46,17 @@ LIB_SRCS_AUDIO := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS_AUDIO) $(CODEGEN_SRCS) # load address even though it was assembled with `org $0F0000`. 
NTP_SRC := $(REPO_DIR)/toolchains/iigs/ntp/ninjatrackerplus.s NTP_BIN := $(BUILD)/audio/ntpplayer.bin -NTP_HEADER := $(BUILD)/audio/ntpplayer_data.h +NTP_ASM := $(BUILD)/audio/ntpdata.asm IIGS_MERLIN := $(REPO_DIR)/toolchains/iigs/merlin32/bin/merlin32 +LIB_SRCS := $(CORE_C_SRCS_IIGS) $(PORT_C_SRCS) $(PORT_ASM_SRCS_ALL) $(NTP_ASM) $(CODEGEN_SRCS) + HELLO_SRC := $(EXAMPLES)/hello/hello.c HELLO_BIN := $(BINDIR)/HELLO PATTERN_SRC := $(EXAMPLES)/pattern/pattern.c PATTERN_BIN := $(BINDIR)/PATTERN +DRAW_SRC := $(EXAMPLES)/draw/draw.c +DRAW_BIN := $(BINDIR)/DRAW KEYS_SRC := $(EXAMPLES)/keys/keys.c KEYS_BIN := $(BINDIR)/KEYS JOY_SRC := $(EXAMPLES)/joy/joy.c @@ -77,7 +82,11 @@ IIX_INCLUDES := \ -I $(REPO_DIR)/src/codegen .PHONY: all iigs iigs-disk clean-iigs -all iigs: $(HELLO_BIN) $(PATTERN_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) +# Building the disk implicitly builds every binary it depends on, so +# `make iigs` ends with a fresh joey.2mg on every change. Without this, +# stale disk images would silently mask binary updates -- a surprise +# when the run script always boots from joey.2mg. +all iigs: $(DISK_IMG) $(NTP_BIN): $(NTP_SRC) $(IIGS_MERLIN) @mkdir -p $(dir $@) @@ -85,51 +94,55 @@ $(NTP_BIN): $(NTP_SRC) $(IIGS_MERLIN) cd $(BUILD)/audio && $(IIGS_MERLIN) . ninjatrackerplus.s mv $(BUILD)/audio/ntpplayer $@ -# Bake the NTP replayer bytes into a C header so audio_full.c can link -# the player into the AUDIO binary instead of fopen'ing a separate -# NTPPLAYER.BIN at runtime. NTP is bank-internal / PIC, so the linked -# bytes still BlockMove cleanly into the Memory Manager handle the HAL -# allocates. Same xxd-i pattern as test_assets.h. -$(NTP_HEADER): $(NTP_BIN) +# Bake the NTP replayer bytes into an ORCA-M asm file. The asm declares +# the bytes in a `data NTPDATA` segment; ORCA's linker groups same- +# name object segments into one load segment, and the GS/OS loader +# places it in its own bank. 
Net effect: the 34 KB of NTP bytes don't +# crowd _ROOT in any binary, so audio_full.c can link into every demo +# (vs the old audio.c-stub split). audio_full.c references +# gNtpPlayerBytes / gNtpPlayerBytes_len as externs (case-sensitive +# symbol match against the asm labels). +$(NTP_ASM): $(NTP_BIN) $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh @mkdir -p $(dir $@) - @echo "// Generated by make/iigs.mk -- NinjaTrackerPlus replayer bytes." > $@ - @echo "#ifndef JOEYLIB_NTPPLAYER_DATA_H" >> $@ - @echo "#define JOEYLIB_NTPPLAYER_DATA_H" >> $@ - @printf "static const unsigned char gNtpPlayerBytes[] = {\n" >> $@ - @xxd -i < $(NTP_BIN) >> $@ - @printf "};\nstatic const unsigned int gNtpPlayerBytes_len = %d;\n" $$(wc -c < $(NTP_BIN)) >> $@ - @echo "#endif" >> $@ + $(REPO_DIR)/toolchains/iigs/bin-to-asm.sh $(NTP_BIN) $@ NTPDATA gNtpPlayerBytes gNtpPlayerBytes_len # iix-build.sh takes MAIN.c first, then EXTRA sources (compiled with # #pragma noroot). The example source supplies main(); libjoey sources # are the extras. The chtyp post-step tags the output as GS/OS S16 # ($B3) so GS/OS recognizes it as launchable; the file-type lives in # a user.com.apple.FinderInfo xattr that iix and profuse preserve. -$(HELLO_BIN): $(HELLO_SRC) $(LIB_SRCS) $(IIGS_BUILD) +# +# All binaries use ORCA-C large memory model (-b). Cost: slightly +# larger / slower compiled C per the ORCA docs. Win: 32-bit pointers +# everywhere, so library asm can take SurfaceT* args via one +# consistent ABI (small-mm 16-bit pointers truncated bank bytes, +# which broke any asm that wanted to address bank-1 stage memory). 
+$(HELLO_BIN): $(HELLO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) - $(IIGS_BUILD) $(IIX_INCLUDES) -o $@ $(HELLO_SRC) $(LIB_SRCS) + $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(HELLO_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ -$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(IIGS_BUILD) +$(PATTERN_BIN): $(PATTERN_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) - $(IIGS_BUILD) $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS) + $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(PATTERN_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ -$(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(IIGS_BUILD) +$(DRAW_BIN): $(DRAW_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) - $(IIGS_BUILD) $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS) + $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(DRAW_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ -$(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(IIGS_BUILD) +$(KEYS_BIN): $(KEYS_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) - $(IIGS_BUILD) $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS) + $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(KEYS_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ -# Sprite demo uses ORCA-C large memory model (-b) so pointers are -# 32-bit and the codegen-arena JSL stub can call cross-bank into the -# arena. Without -b, ORCA-C's 16-bit pointers would lose the bank -# byte and the stub would JSL into bank 0 (system memory) -> crash. 
-$(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(IIGS_BUILD) +$(JOY_BIN): $(JOY_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) + @mkdir -p $(dir $@) + $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(JOY_SRC) $(LIB_SRCS) + $(IIGS_IIX) chtyp -t S16 $@ + +$(SPRITE_BIN): $(SPRITE_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) $(IIGS_BUILD) -b $(IIX_INCLUDES) -o $@ $(SPRITE_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ @@ -152,18 +165,18 @@ $(info iigs: php-cli not installed -- AUDIO demo will ship without TEST.NTP; ins AUDIO_DATA_FILES := $(AUDIO_SFX) endif -$(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS_AUDIO) $(NTP_HEADER) $(IIGS_BUILD) +$(AUDIO_BIN): $(AUDIO_SRC) $(LIB_SRCS) $(NTP_ASM) $(IIGS_BUILD) @mkdir -p $(dir $@) - $(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(dir $(NTP_HEADER)) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS_AUDIO) + $(IIGS_BUILD) -b $(IIX_INCLUDES) -I $(EXAMPLES)/audio -o $@ $(AUDIO_SRC) $(LIB_SRCS) $(IIGS_IIX) chtyp -t S16 $@ # Assemble an 800KB ProDOS 2img containing the examples, ready to # mount in GSplus alongside a GS/OS boot volume. 
iigs-disk: $(DISK_IMG) -$(DISK_IMG): $(HELLO_BIN) $(PATTERN_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) +$(DISK_IMG): $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) $(AUDIO_DATA_FILES) $(IIGS_PACKAGE) @mkdir -p $(dir $@) - $(IIGS_PACKAGE) $@ $(HELLO_BIN) $(PATTERN_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) -- $(AUDIO_DATA_FILES) + $(IIGS_PACKAGE) $@ $(HELLO_BIN) $(PATTERN_BIN) $(DRAW_BIN) $(KEYS_BIN) $(JOY_BIN) $(SPRITE_BIN) $(AUDIO_BIN) -- $(AUDIO_DATA_FILES) clean-iigs: rm -rf $(BUILD) diff --git a/scripts/run-amiga.sh b/scripts/run-amiga.sh index a289743..1030396 100755 --- a/scripts/run-amiga.sh +++ b/scripts/run-amiga.sh @@ -17,27 +17,31 @@ # # scripts/run-amiga.sh # runs Pattern # scripts/run-amiga.sh hello # runs Hello -# scripts/run-amiga.sh keys # runs Keys +# scripts/run-amiga.sh draw # runs Draw +# +# Argument is any built example name (case-insensitive); the script +# normalizes it to PascalCase (first letter upper, rest lower) which +# is the JoeyLib convention for Amiga binary filenames. set -euo pipefail +if [[ $# -gt 1 ]]; then + echo "usage: $0 [example-name]" >&2 + exit 2 +fi + prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) bin_dir=$repo/build/amiga/bin support=$repo/toolchains/emulators/support -case $prog in - hello) file=Hello ;; - pattern) file=Pattern ;; - keys) file=Keys ;; - joy) file=Joy ;; - sprite) file=Sprite ;; - audio) file=Audio ;; - *) echo "usage: $0 [hello|pattern|keys|joy|sprite|audio]" >&2; exit 2 ;; -esac +prog_lower=${prog,,} +file=${prog_lower^} if [[ ! -f "$bin_dir/$file" ]]; then echo "$bin_dir/$file not built. Run 'make amiga' first." 
>&2 + echo "available examples in $bin_dir:" >&2 + find "$bin_dir" -maxdepth 1 -type f -executable -printf '%f\n' >&2 2>/dev/null || true exit 1 fi @@ -56,12 +60,9 @@ dump_keep=/tmp/joeylib-amiga-dump trap 'mkdir -p "$dump_keep"; cp "$work"/*.txt "$dump_keep"/ 2>/dev/null; rm -rf "$work"' EXIT mkdir -p "$work/s" -cp "$bin_dir/Hello" "$work/" 2>/dev/null || true -cp "$bin_dir/Pattern" "$work/" 2>/dev/null || true -cp "$bin_dir/Keys" "$work/" 2>/dev/null || true -cp "$bin_dir/Joy" "$work/" 2>/dev/null || true -cp "$bin_dir/Sprite" "$work/" 2>/dev/null || true -cp "$bin_dir/Audio" "$work/" 2>/dev/null || true +# Stage every built binary (executable file at top of bin_dir, no +# extension on Amiga). DATA/ is copied separately below. +find "$bin_dir" -maxdepth 1 -type f -executable -exec cp -t "$work/" {} + # Stage the DATA folder (test.mod, test.sfx) the audio demo loads from # the boot volume at runtime. if [[ -d "$bin_dir/DATA" ]]; then diff --git a/scripts/run-atarist.sh b/scripts/run-atarist.sh index 22b51f8..c28af6d 100755 --- a/scripts/run-atarist.sh +++ b/scripts/run-atarist.sh @@ -6,28 +6,31 @@ # # scripts/run-atarist.sh # runs PATTERN.PRG # scripts/run-atarist.sh hello # runs HELLO.PRG -# scripts/run-atarist.sh keys # runs KEYS.PRG +# scripts/run-atarist.sh draw # runs DRAW.PRG +# +# Argument is any built example name (case-insensitive); the script +# upper-cases it and appends .PRG, then checks the file exists. set -euo pipefail +if [[ $# -gt 1 ]]; then + echo "usage: $0 [example-name]" >&2 + exit 2 +fi + prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) bin_dir=$repo/build/atarist/bin - -case $prog in - hello) file=HELLO.PRG ;; - pattern) file=PATTERN.PRG ;; - keys) file=KEYS.PRG ;; - joy) file=JOY.PRG ;; - sprite) file=SPRITE.PRG ;; - audio) file=AUDIO.PRG ;; - *) echo "usage: $0 [hello|pattern|keys|joy|sprite|audio]" >&2; exit 2 ;; -esac +file=${prog^^}.PRG tos=$repo/toolchains/emulators/support/emutos-512k.img if [[ ! 
-f "$bin_dir/$file" ]]; then echo "$bin_dir/$file not built. Run 'make atarist' first." >&2 + if compgen -G "$bin_dir/*.PRG" > /dev/null; then + echo "available examples in $bin_dir:" >&2 + ls "$bin_dir"/*.PRG | xargs -n1 basename >&2 + fi exit 1 fi if [[ ! -f $tos ]]; then diff --git a/scripts/run-dos.sh b/scripts/run-dos.sh index 46f3f93..607d37c 100755 --- a/scripts/run-dos.sh +++ b/scripts/run-dos.sh @@ -3,26 +3,29 @@ # # scripts/run-dos.sh # runs PATTERN # scripts/run-dos.sh hello # runs HELLO -# scripts/run-dos.sh keys # runs KEYS +# scripts/run-dos.sh draw # runs DRAW +# +# Argument is any built example name (case-insensitive); the script +# upper-cases it and appends .EXE, then checks the file exists. set -euo pipefail +if [[ $# -gt 1 ]]; then + echo "usage: $0 [example-name]" >&2 + exit 2 +fi + prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) bin_dir=$repo/build/dos/bin - -case $prog in - hello) file=HELLO.EXE ;; - pattern) file=PATTERN.EXE ;; - keys) file=KEYS.EXE ;; - joy) file=JOY.EXE ;; - sprite) file=SPRITE.EXE ;; - audio) file=AUDIO.EXE ;; - *) echo "usage: $0 [hello|pattern|keys|joy|sprite|audio]" >&2; exit 2 ;; -esac +file=${prog^^}.EXE if [[ ! -f "$bin_dir/$file" ]]; then echo "$bin_dir/$file not built. Run 'make dos' first." >&2 + if compgen -G "$bin_dir/*.EXE" > /dev/null; then + echo "available examples in $bin_dir:" >&2 + ls "$bin_dir"/*.EXE | xargs -n1 basename >&2 + fi exit 1 fi diff --git a/scripts/run-iigs-mame.sh b/scripts/run-iigs-mame.sh index 0daba86..bf11dde 100755 --- a/scripts/run-iigs-mame.sh +++ b/scripts/run-iigs-mame.sh @@ -17,16 +17,28 @@ set -euo pipefail +if [[ $# -gt 1 ]]; then + echo "usage: $0 [example-name]" >&2 + exit 2 +fi + prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) -case $prog in - hello|pattern|keys|joy|sprite|audio) ;; - *) echo "usage: $0 [hello|pattern|keys|joy|sprite|audio]" >&2; exit 2 ;; -esac +bin_dir=$repo/build/iigs/bin +target=${prog^^} +if [[ ! 
-f "$bin_dir/$target" ]]; then + echo "$bin_dir/$target not built. Run 'make iigs' first." >&2 + if compgen -G "$bin_dir/*" > /dev/null; then + echo "available examples in $bin_dir:" >&2 + find "$bin_dir" -maxdepth 1 -type f -printf '%f\n' \ + | grep -vE '\.2mg$|\.txt$' >&2 || true + fi + exit 1 +fi sys_disk=$repo/toolchains/emulators/support/gsos-system.po -data_disk=$repo/build/iigs/bin/joey.2mg +data_disk=$bin_dir/joey.2mg for f in "$sys_disk" "$data_disk"; do if [[ ! -f $f ]]; then @@ -42,24 +54,30 @@ mkdir -p "$out" cp "$sys_disk" "$work/boot.po" cp "$data_disk" "$work/joey.2mg" -# Lua script: on every CPU stop (BRK, breakpoint, watchpoint, manual -# halt), append a state snapshot to crash.txt. This way we don't need -# the user to type anything at the debugger window -- whatever halts -# the CPU lands a record in crash.txt. -cat > "$work/crash-hook.lua" <<'LUA' --- Crash diagnostics for IIgs demos. Auto-resumes the initial debug --- pause so the user doesn't need to type "go". On any subsequent halt --- (BRK, watchpoint, breakpoint) outside ROM, dumps registers + bytes --- around PC to crash.txt. ROM halts (PB == 0xFE/0xFF) are skipped so --- we don't fill the file with normal IIgs ROM stack walking. +# Lua script: drives Finder via natural keyboard + macadb key fields +# to launch the requested example, dumps register state on any halt +# (BRK, breakpoint, watchpoint, manual stop). Field names for keys +# come from MAME's apple2gs macadb input definitions: +# :macadb:KEY0 -> "d D" +# :macadb:KEY1 -> "o O" +# :macadb:KEY2 -> "j J", "p P" +# :macadb:KEY3 -> "Command / Open Apple" +# Letters not on KEY0..2 fall back to natkeyboard:post() (which +# handles modifier-less character entry only). +# Type the FULL program name to disambiguate (DRAW vs DATA which both +# start with D and live in the same JOEYLIB volume). 
+prog_select_str=${target} +cat > "$work/crash-hook.lua" < 1800 and not crashed then + check_ckpt() + while step_idx <= #steps and boot_frames >= steps[step_idx][1] do + steps[step_idx][2]() + step_idx = step_idx + 1 + end + if boot_frames > 18000 and not crashed then crashed = true dump("watchdog") signal_done("watchdog") @@ -130,7 +226,8 @@ cat </dev/null; do if [[ -f $out/.done ]]; then kill "$mame_pid" 2>/dev/null diff --git a/scripts/run-iigs.sh b/scripts/run-iigs.sh index 423548e..af581e6 100755 --- a/scripts/run-iigs.sh +++ b/scripts/run-iigs.sh @@ -11,13 +11,18 @@ # # scripts/run-iigs.sh # boots (Pattern hint) # scripts/run-iigs.sh hello # boots, hints HELLO -# scripts/run-iigs.sh keys # boots, hints KEYS -# scripts/run-iigs.sh joy # boots, hints JOY -# scripts/run-iigs.sh sprite # boots, hints SPRITE -# scripts/run-iigs.sh audio # boots, hints AUDIO +# scripts/run-iigs.sh draw # boots, hints DRAW +# +# Argument is any built example name (case-insensitive); upper-case +# it for the Finder hint and existence-check. set -euo pipefail +if [[ $# -gt 1 ]]; then + echo "usage: $0 [example-name]" >&2 + exit 2 +fi + prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) @@ -33,10 +38,17 @@ sys_disk=$repo/toolchains/emulators/support/gsos-system.po data_disk=$repo/build/iigs/bin/joey.2mg null_c600=$repo/toolchains/emulators/support/iigs-null-c600.rom -case $prog in - hello|pattern|keys|joy|sprite|audio) ;; - *) echo "usage: $0 [hello|pattern|keys|joy|sprite|audio]" >&2; exit 2 ;; -esac +target=${prog^^} +bin_dir=$repo/build/iigs/bin +if [[ ! -f "$bin_dir/$target" ]]; then + echo "$bin_dir/$target not built. Run 'make iigs' first." >&2 + if compgen -G "$bin_dir/*" > /dev/null; then + echo "available examples in $bin_dir:" >&2 + find "$bin_dir" -maxdepth 1 -type f -printf '%f\n' \ + | grep -vE '\.2mg$|\.txt$' >&2 || true + fi + exit 1 +fi for f in "$gsplus" "$rom" "$sys_disk" "$data_disk" "$null_c600"; do if [[ ! 
-f $f ]]; then @@ -107,7 +119,6 @@ cp "$data_disk" "$work/joey.2mg" # install_support_iigs_null_c600. cp "$null_c600" "$work/c600.rom" -target=$(echo "$prog" | tr '[:lower:]' '[:upper:]') cat < #include "joey/draw.h" +#include "joey/debug.h" +#include "hal.h" #include "surfaceInternal.h" +// On IIgs, hoist all primitive functions out of _ROOT into a named +// DRAWPRIMS load segment. drawLine/drawCircle/fillCircle/floodFill/ +// floodFillBounded together push past the 64 KB-per-bank budget for +// the simpler binaries (PATTERN was the first to fail). On other +// ports this macro vanishes. +JOEYLIB_SEGMENT("DRAWPRIMS") + +// ----- Constants ----- + +// Flood-fill seed stack: each entry is (x, y) = 4 bytes, so 512 slots +// = 2 KB. For 320x200 surfaces with reasonable region sizes this is +// well above the worst-case scanline-fill seed depth (typically <50). +// On overflow the fill silently truncates rather than crashing. +#define FLOOD_STACK_SIZE 512 + // ----- Prototypes ----- static bool blitClip(int16_t *dstX, int16_t *dstY, int16_t *srcX, int16_t *srcY, int16_t *w, int16_t *h, int16_t srcW, int16_t srcH); static void clipRect(int16_t *x, int16_t *y, int16_t *w, int16_t *h, bool *outVisible); static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h, uint8_t colorIndex); +static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8_t newColor, uint8_t matchColor, bool matchEqual); static uint8_t srcPixel(const uint8_t *row, int16_t x); static void dstPixel(uint8_t *row, int16_t x, uint8_t nibble); @@ -117,6 +135,207 @@ static void fillRectClipped(SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_ } +// Smith's scanline flood fill. 
Implements both the unbounded and the +// boundary-stopped variants in one pass: the matching predicate is +// (pixel == matchColor) when matchEqual is true (unbounded floodFill, +// matchColor is the original seed color) or (pixel != matchColor) +// when matchEqual is false (floodFillBounded, matchColor is the +// boundary that stops the fill). +// +// Algorithm: +// 1. Push seed (x, y) on stack. +// 2. Pop a seed; skip if its pixel no longer matches (already +// filled by an earlier span overlap). +// 3. Scan left and right from the seed to find the longest run of +// matching pixels containing it -- this is the current span. +// 4. Fill the span with newColor. +// 5. Walk the row above and the row below, scanning the columns +// that overlap the just-filled span; for each contiguous run of +// matching pixels, push the rightmost x of that run as a new +// seed (so popping that seed next will scan the same run). +// 6. Repeat until the stack drains. +// +// Stack overflow truncates the fill rather than crashing; for vector +// art (Sierra-style picture playback) the input is well-behaved and +// 512 entries is plenty. 
+static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8_t newColor, uint8_t matchColor, bool matchEqual) { + static int16_t stackX[FLOOD_STACK_SIZE]; + static int16_t stackY[FLOOD_STACK_SIZE]; + static uint8_t floodMarkBuf[SURFACE_WIDTH]; + int16_t sp; + int16_t x; + int16_t y; + int16_t leftX; + int16_t rightX; + uint8_t *row; + uint8_t pix; + bool pixMatch; + uint8_t newNibble; + + newNibble = (uint8_t)(newColor & 0x0F); + matchColor = (uint8_t)(matchColor & 0x0F); + + sp = 0; + stackX[sp] = startX; + stackY[sp] = startY; + sp++; + + while (sp > 0) { + sp--; + x = stackX[sp]; + y = stackY[sp]; + + if (y < 0 || y >= SURFACE_HEIGHT || x < 0 || x >= SURFACE_WIDTH) { + continue; + } + row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + + // Highest-tier asm fast path: seed-test + walk-left + walk-right + // + 1-row fill + scan-above + scan-below + push, all in one + // cross-segment call. The asm caches row addr / match decoder + // across every sub-operation. C just pops and dispatches; this + // path completes the entire per-seed work. + { + bool seedMatched; + if (halFastFloodWalkAndScans(s->pixels, x, y, + matchColor, newNibble, matchEqual, + stackX, stackY, + &sp, FLOOD_STACK_SIZE, + &seedMatched, &leftX, &rightX)) { + continue; + } + } + + // Tier-2 asm fast path: combined seed test + walk-left + + // walk-right in one cross-segment call. Falls back to the + // pure-C walks below on ports without an asm implementation. + { + bool seedMatched; + if (halFastFloodWalk(row, x, matchColor, newNibble, matchEqual, + &seedMatched, &leftX, &rightX)) { + if (!seedMatched) { + continue; + } + } else { + pix = srcPixel(row, x); + pixMatch = (pix == matchColor); + if (matchEqual) { + if (!pixMatch) { + continue; + } + } else { + if (pixMatch || pix == newNibble) { + continue; + } + } + + // Walk left to find the start of the matching run. 
+ leftX = x; + while (leftX > 0) { + pix = srcPixel(row, (int16_t)(leftX - 1)); + pixMatch = (pix == matchColor); + if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) { + break; + } + leftX--; + } + + // Walk right to find the end. + rightX = x; + while (rightX < SURFACE_WIDTH - 1) { + pix = srcPixel(row, (int16_t)(rightX + 1)); + pixMatch = (pix == matchColor); + if (matchEqual ? !pixMatch : (pixMatch || pix == newNibble)) { + break; + } + rightX++; + } + } + } + + // Fill the span. Bypass fillRect's clipping wrapper: walk-out + // already guaranteed leftX/rightX are in [0..SURFACE_WIDTH-1] + // and the seed-pop bounds check did the same for y. + { + int16_t spanW = (int16_t)(rightX - leftX + 1); + if (!halFastFillRect(s, leftX, y, (uint16_t)spanW, 1, newNibble)) { + fillRectClipped(s, leftX, y, spanW, 1, newNibble); + } + } + + // Scan rows above and below for run boundaries. The hot + // per-pixel match check goes through halFastFloodScanRow on + // ports that have it (IIgs); fills markBuf[] with 1/0 per + // pixel so the run-edge walk below is array-only -- no + // function call, no nibble extract. + { + int16_t i; + int16_t spanLen; + uint8_t *scanRow; + int16_t scanY; + int16_t side; + bool curHit; + bool prevHit; + + spanLen = (int16_t)(rightX - leftX + 1); + for (side = 0; side < 2; side++) { + if (side == 0) { + if (y <= 0) { + continue; + } + scanY = (int16_t)(y - 1); + } else { + if (y >= SURFACE_HEIGHT - 1) { + continue; + } + scanY = (int16_t)(y + 1); + } + scanRow = &s->pixels[scanY * SURFACE_BYTES_PER_ROW]; + // Prefer the combined scan+push asm path (one call per + // scan, no markBuf and no per-pixel C edge walk). + if (!halFastFloodScanAndPush(scanRow, leftX, rightX, + matchColor, newNibble, matchEqual, + scanY, stackX, stackY, + &sp, FLOOD_STACK_SIZE)) { + if (!halFastFloodScanRow(scanRow, leftX, rightX, + matchColor, newNibble, matchEqual, + floodMarkBuf)) { + // C fallback: fill markBuf the slow way. 
+ for (i = 0; i < spanLen; i++) { + pix = srcPixel(scanRow, (int16_t)(leftX + i)); + pixMatch = (pix == matchColor); + floodMarkBuf[i] = (uint8_t)(matchEqual + ? (pixMatch ? 1 : 0) + : ((!pixMatch && pix != newNibble) ? 1 : 0)); + } + } + // Walk markBuf for run-edge transitions. + prevHit = false; + for (i = 0; i < spanLen; i++) { + curHit = floodMarkBuf[i] != 0; + if (!curHit && prevHit) { + if (sp < FLOOD_STACK_SIZE) { + stackX[sp] = (int16_t)(leftX + i - 1); + stackY[sp] = scanY; + sp++; + } + } + prevHit = curHit; + } + if (prevHit) { + if (sp < FLOOD_STACK_SIZE) { + stackX[sp] = rightX; + stackY[sp] = scanY; + sp++; + } + } + } + } + } + } +} + + static void dstPixel(uint8_t *row, int16_t x, uint8_t nibble) { uint8_t *byte; @@ -142,6 +361,131 @@ static uint8_t srcPixel(const uint8_t *row, int16_t x) { // ----- Public API (alphabetical) ----- +void drawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + int16_t x; + int16_t y; + int16_t err; + int16_t ir; + + if (s == NULL) { + return; + } + if (r == 0) { + drawPixel(s, cx, cy, colorIndex); + return; + } + + // Fast path: when the bounding circle is fully on-surface we can + // hand off to the port asm (no per-pixel bounds check needed in + // the inner loop) and mark the bounding box dirty once. + ir = (int16_t)r; + if (cx - ir >= 0 && cx + ir < SURFACE_WIDTH && + cy - ir >= 0 && cy + ir < SURFACE_HEIGHT && + halFastDrawCircle(s, cx, cy, r, colorIndex)) { + surfaceMarkDirtyRect(s, (int16_t)(cx - ir), (int16_t)(cy - ir), + (uint16_t)(2 * ir + 1), (uint16_t)(2 * ir + 1)); + return; + } + + // Bresenham midpoint: maintain (x, y) on the perimeter, eight- + // octant symmetry plots all 8 reflections each iteration. Routes + // through drawPixel so off-surface pixels clip individually. 
+ x = (int16_t)r; + y = 0; + err = (int16_t)(1 - x); + while (x >= y) { + drawPixel(s, (int16_t)(cx + x), (int16_t)(cy + y), colorIndex); + drawPixel(s, (int16_t)(cx - x), (int16_t)(cy + y), colorIndex); + drawPixel(s, (int16_t)(cx + x), (int16_t)(cy - y), colorIndex); + drawPixel(s, (int16_t)(cx - x), (int16_t)(cy - y), colorIndex); + drawPixel(s, (int16_t)(cx + y), (int16_t)(cy + x), colorIndex); + drawPixel(s, (int16_t)(cx - y), (int16_t)(cy + x), colorIndex); + drawPixel(s, (int16_t)(cx + y), (int16_t)(cy - x), colorIndex); + drawPixel(s, (int16_t)(cx - y), (int16_t)(cy - x), colorIndex); + y++; + if (err <= 0) { + err = (int16_t)(err + 2 * y + 1); + } else { + x--; + err = (int16_t)(err + 2 * (y - x) + 1); + } + } +} + + +void drawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + int16_t dx; + int16_t dy; + int16_t sx; + int16_t sy; + int16_t err; + int16_t e2; + int16_t tmp; + + if (s == NULL) { + return; + } + + // Horizontal and vertical fast paths use fillRect; the general + // case Bresenham routes per-pixel through drawPixel so per-pixel + // off-surface clipping just works. + if (y0 == y1) { + if (x0 > x1) { + tmp = x0; + x0 = x1; + x1 = tmp; + } + fillRect(s, x0, y0, (uint16_t)(x1 - x0 + 1), 1, colorIndex); + return; + } + if (x0 == x1) { + if (y0 > y1) { + tmp = y0; + y0 = y1; + y1 = tmp; + } + fillRect(s, x0, y0, 1, (uint16_t)(y1 - y0 + 1), colorIndex); + return; + } + + // Diagonal: if both endpoints are on-surface, the inner Bresenham + // can run without per-pixel bound checks. Hand off to the port + // fast path; bounding-box dirty marking happens here in C either + // way. + if (x0 >= 0 && x0 < SURFACE_WIDTH && x1 >= 0 && x1 < SURFACE_WIDTH && + y0 >= 0 && y0 < SURFACE_HEIGHT && y1 >= 0 && y1 < SURFACE_HEIGHT && + halFastDrawLine(s, x0, y0, x1, y1, colorIndex)) { + int16_t bbx = (x0 < x1) ? x0 : x1; + int16_t bby = (y0 < y1) ? y0 : y1; + int16_t bbw = (int16_t)(((x0 > x1) ? 
x0 : x1) - bbx + 1); + int16_t bbh = (int16_t)(((y0 > y1) ? y0 : y1) - bby + 1); + surfaceMarkDirtyRect(s, bbx, bby, bbw, bbh); + return; + } + + dx = (int16_t)((x1 > x0) ? (x1 - x0) : (x0 - x1)); + dy = (int16_t)(-((y1 > y0) ? (y1 - y0) : (y0 - y1))); + sx = (int16_t)((x0 < x1) ? 1 : -1); + sy = (int16_t)((y0 < y1) ? 1 : -1); + err = (int16_t)(dx + dy); + while (1) { + drawPixel(s, x0, y0, colorIndex); + if (x0 == x1 && y0 == y1) { + break; + } + e2 = (int16_t)(2 * err); + if (e2 >= dy) { + err = (int16_t)(err + dy); + x0 = (int16_t)(x0 + sx); + } + if (e2 <= dx) { + err = (int16_t)(err + dx); + y0 = (int16_t)(y0 + sy); + } + } +} + + void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) { uint8_t *byte; uint8_t nibble; @@ -153,12 +497,90 @@ void drawPixel(SurfaceT *s, int16_t x, int16_t y, uint8_t colorIndex) { return; } - byte = &s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)]; - nibble = colorIndex & 0x0F; - if (x & 1) { - *byte = (uint8_t)((*byte & 0xF0) | nibble); - } else { - *byte = (uint8_t)((*byte & 0x0F) | (nibble << 4)); + if (!halFastDrawPixel(s, (uint16_t)x, (uint16_t)y, colorIndex)) { + byte = &s->pixels[y * SURFACE_BYTES_PER_ROW + (x >> 1)]; + nibble = colorIndex & 0x0F; + if (x & 1) { + *byte = (uint8_t)((*byte & 0xF0) | nibble); + } else { + *byte = (uint8_t)((*byte & 0x0F) | (nibble << 4)); + } + } + surfaceMarkDirtyRect(s, x, y, 1, 1); +} + + +void drawRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + if (s == NULL) { + return; + } + if (w == 0 || h == 0) { + return; + } + // Degenerate dimensions: a 1xN or Nx1 rect IS a line, and a 1x1 + // rect is a single pixel. fillRect handles both correctly so we + // don't need to fork the inner logic. + if (h == 1 || w == 1) { + fillRect(s, x, y, w, h, colorIndex); + return; + } + // Top edge. + fillRect(s, x, y, w, 1, colorIndex); + // Bottom edge. 
+ fillRect(s, x, (int16_t)(y + (int16_t)h - 1), w, 1, colorIndex); + // Left edge (interior only -- top and bottom corners already drawn). + fillRect(s, x, (int16_t)(y + 1), 1, (uint16_t)(h - 2), colorIndex); + // Right edge (interior only). + fillRect(s, (int16_t)(x + (int16_t)w - 1), (int16_t)(y + 1), 1, (uint16_t)(h - 2), colorIndex); +} + + +void fillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + int16_t y; + int16_t x; + int16_t ir; + uint16_t xx; + uint16_t yy; + uint16_t r2; + + if (s == NULL) { + return; + } + if (r == 0) { + drawPixel(s, cx, cy, colorIndex); + return; + } + + ir = (int16_t)r; + if (cx - ir >= 0 && cx + ir < SURFACE_WIDTH && + cy - ir >= 0 && cy + ir < SURFACE_HEIGHT && + halFastFillCircle(s, cx, cy, r, colorIndex)) { + surfaceMarkDirtyRect(s, (int16_t)(cx - ir), (int16_t)(cy - ir), + (uint16_t)(2 * ir + 1), (uint16_t)(2 * ir + 1)); + return; + } + + // For each y from 0 to r, find the largest x such that x*x + y*y + // <= r*r and emit a horizontal span. Maintain xx=x*x, yy=y*y + // incrementally so the hot loop never does a 32-bit multiply -- + // critical on 65816 / 68000 / 286 where mul is slow or absent. + // (y+1)^2 = y^2 + 2y + 1; (x-1)^2 = x^2 - 2x + 1. r is uint16_t + // so xx, yy, r2 fit in uint16_t for any r where x*x+y*y can equal + // r2 (i.e. r <= 255 -> r2 <= 65025). 
+ xx = (uint16_t)(r * r); + r2 = xx; + yy = 0; + x = (int16_t)r; + for (y = 0; y <= (int16_t)r; y++) { + while (xx + yy > r2) { + xx = (uint16_t)(xx - (uint16_t)(2 * x - 1)); + x--; + } + fillRect(s, (int16_t)(cx - x), (int16_t)(cy + y), (uint16_t)(2 * x + 1), 1, colorIndex); + if (y > 0) { + fillRect(s, (int16_t)(cx - x), (int16_t)(cy - y), (uint16_t)(2 * x + 1), 1, colorIndex); + } + yy = (uint16_t)(yy + (uint16_t)(2 * y + 1)); } } @@ -182,7 +604,53 @@ void fillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t if (!visible) { return; } - fillRectClipped(s, sx, sy, sw, sh, colorIndex); + if (!halFastFillRect(s, sx, sy, (uint16_t)sw, (uint16_t)sh, colorIndex)) { + fillRectClipped(s, sx, sy, sw, sh, colorIndex); + } + surfaceMarkDirtyRect(s, sx, sy, sw, sh); +} + + +void floodFill(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor) { + uint8_t *row; + uint8_t seedColor; + + if (s == NULL) { + return; + } + if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { + return; + } + row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + seedColor = srcPixel(row, x); + if ((seedColor & 0x0F) == (newColor & 0x0F)) { + return; + } + floodFillInternal(s, x, y, newColor, seedColor, true); +} + + +void floodFillBounded(SurfaceT *s, int16_t x, int16_t y, uint8_t newColor, uint8_t boundaryColor) { + uint8_t *row; + uint8_t pix; + + if (s == NULL) { + return; + } + if (x < 0 || x >= SURFACE_WIDTH || y < 0 || y >= SURFACE_HEIGHT) { + return; + } + row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + pix = srcPixel(row, x); + // Starting on a boundary pixel or already-filled pixel: nothing + // to do. 
+ if ((pix & 0x0F) == (boundaryColor & 0x0F)) { + return; + } + if ((pix & 0x0F) == (newColor & 0x0F)) { + return; + } + floodFillInternal(s, x, y, newColor, boundaryColor, false); } @@ -225,14 +693,20 @@ void surfaceBlit(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y) { } srcRowBytes = (int16_t)((src->width + 1) >> 1); - for (row = 0; row < copyH; row++) { - srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; - dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; - for (col = 0; col < copyW; col++) { - nibble = srcPixel(srcRow, srcX0 + col); - dstPixel(dstRow, x + col, nibble); + srcRow = &src->pixels[srcY0 * srcRowBytes]; + dstRow = &dst->pixels[y * SURFACE_BYTES_PER_ROW]; + if (!halFastBlitRect(dstRow, x, srcRow, srcX0, + copyW, copyH, srcRowBytes, 0xFFFFu)) { + for (row = 0; row < copyH; row++) { + srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; + dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; + for (col = 0; col < copyW; col++) { + nibble = srcPixel(srcRow, srcX0 + col); + dstPixel(dstRow, x + col, nibble); + } } } + surfaceMarkDirtyRect(dst, x, y, copyW, copyH); } @@ -259,17 +733,23 @@ void surfaceBlitMasked(SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t transparent = (uint8_t)(transparentIndex & 0x0F); srcRowBytes = (int16_t)((src->width + 1) >> 1); - for (row = 0; row < copyH; row++) { - srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; - dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; - for (col = 0; col < copyW; col++) { - nibble = srcPixel(srcRow, srcX0 + col); - if (nibble == transparent) { - continue; + srcRow = &src->pixels[srcY0 * srcRowBytes]; + dstRow = &dst->pixels[y * SURFACE_BYTES_PER_ROW]; + if (!halFastBlitRect(dstRow, x, srcRow, srcX0, + copyW, copyH, srcRowBytes, (uint16_t)transparent)) { + for (row = 0; row < copyH; row++) { + srcRow = &src->pixels[(srcY0 + row) * srcRowBytes]; + dstRow = &dst->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; + for (col = 0; col < copyW; col++) { + nibble = 
srcPixel(srcRow, srcX0 + col); + if (nibble == transparent) { + continue; + } + dstPixel(dstRow, x + col, nibble); } - dstPixel(dstRow, x + col, nibble); } } + surfaceMarkDirtyRect(dst, x, y, copyW, copyH); } @@ -282,5 +762,8 @@ void surfaceClear(SurfaceT *s, uint8_t colorIndex) { } nibble = colorIndex & 0x0F; doubled = (uint8_t)((nibble << 4) | nibble); - memset(s->pixels, doubled, SURFACE_PIXELS_SIZE); + if (!halFastSurfaceClear(s, doubled)) { + memset(s->pixels, doubled, SURFACE_PIXELS_SIZE); + } + surfaceMarkDirtyAll(s); } diff --git a/src/core/hal.h b/src/core/hal.h index c0f205e..68d72d2 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -23,6 +23,14 @@ bool halInit(const JoeyConfigT *config); // Per-port teardown. Restores display mode, frees HW-adjacent buffers. void halShutdown(void); +// Allocate / release the SURFACE_PIXELS_SIZE-byte pixel buffer that +// backs the library-owned stage surface. Ports that have a +// hardware-friendly pin location for the back buffer (IIgs $01/2000 +// with SHR shadow inhibited) return that address here; ports with no +// such constraint just malloc/free. +uint8_t *halStageAllocPixels(void); +void halStageFreePixels(uint8_t *pixels); + // Present the entire source surface to the display. void halPresent(const SurfaceT *src); @@ -64,4 +72,116 @@ void halAudioPlaySfx(uint8_t slot, const uint8_t *sample, uint32_t length, uint1 void halAudioStopSfx(uint8_t slot); void halAudioFrameTick(void); +// Optional fast-path hooks. Each returns true if the port handled the +// operation in a port-specific accelerated path; false means the +// caller should fall back to the platform-agnostic C implementation. +// +// Funneling all asm dispatches through hal.c (one TU per port) avoids +// the cumulative ORCA Linker "Expression too complex" failure that +// hits when multiple cross-platform TUs each call into a named load +// segment full of asm primitives. 
Cross-platform code in src/core/ +// only ever calls into HAL, so the link-time expression cost is paid +// once per binary -- not once per TU that wants speed. +// +// Each port must provide all of these; ports without an accelerated +// path simply return false from every hook. +bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled); +bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex); +bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord); +// Tile primitives operate on already-computed row-0 pointers from +// the C wrapper. dstRow0 / srcRow0 point at the first byte of the +// 8x8 region within their respective surfaces (stride 160). For +// tilePaste / tileSnap the TileT side is a packed 32-byte buffer +// (stride 4); the corresponding pointer points at byte 0 of that +// buffer. +bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0); +bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent); +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels); +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0); + +// drawPixel inner: caller has already done NULL + bounds checks. +// (x, y) are guaranteed in [0..SURFACE_WIDTH-1] x [0..SURFACE_HEIGHT-1]. +// colorIndex is the 0..15 nibble. Surface dirty marking happens in +// the C wrapper after this returns. +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex); + +// drawLine inner for the diagonal case. Caller ensures both endpoints +// are inside the surface bounds, so the inner loop runs without +// per-pixel clip checks. The C wrapper still routes pure horizontal +// and vertical lines through fillRect (which has its own fast path). +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex); + +// drawCircle / fillCircle inner. Caller has already validated that +// the entire bounding circle (cx-r .. 
cx+r, cy-r .. cy+r) fits inside +// the surface bounds, so the inner loop plots every octant pixel +// unconditionally. r is guaranteed > 0; the cx == 0 / r == 0 cases +// stay in the C wrapper. +bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex); +bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex); + +// floodFill helper: combined seed test + walk-left + walk-right for +// one row. Returns true if the port handled it (asm path taken). The +// out-param seedMatched tells the caller whether the seed pixel +// satisfied the match criterion -- if false, caller skips this pop; +// if true, leftXOut/rightXOut hold the run boundaries. +// Returns false if no asm path; caller falls back to C walks. +bool halFastFloodWalk(uint8_t *row, int16_t startX, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + bool *seedMatched, + int16_t *leftXOut, int16_t *rightXOut); + +// floodFill helper for the row-above / row-below run-detection scans. +// Walks pixels [leftX..rightX] inclusive of `row`, writing 1 byte per +// pixel into markBuf (1 = qualifies for flood, 0 = does not). The C +// side then walks markBuf for run-edge transitions, replacing the +// per-pixel srcPixel + match check inside the inner loop. +// Returns true if the port handled it; false to fall back to C. +bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + uint8_t *markBuf); + +// Combined per-pixel scan + run-edge walk + seed push. Higher-level +// than halFastFloodScanRow: replaces both the markBuf fill AND the C +// loop that walks markBuf for falling edges. *spInOut is read on entry +// and updated with the new top-of-stack on return. Returns true if +// the port handled it (caller skips the C run-edge walk entirely); +// false to fall back to halFastFloodScanRow + C walk. 
+bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + int16_t scanY, + int16_t *stackX, int16_t *stackY, + int16_t *spInOut, int16_t maxSp); + +// Highest-level flood helper: combined seed-test + walk-left + walk-right +// + scan-above + scan-below + push for ONE popped seed. Replaces three +// cross-segment HAL calls (halFastFloodWalk + 2x halFastFloodScanAndPush) +// per dispatch loop iteration with one. The asm internally caches row +// addr / matchByte / nibble decoder across all three sub-operations. +// +// pixels is the surface base (s->pixels). On return, leftXOut / rightXOut +// hold the matching-run boundaries (only valid if seedMatched != 0); the +// caller does the 1-row halFastFillRect using those bounds. *spInOut is +// updated with any new seeds the asm pushed for the row above/below. +// +// Returns true if the port handled it; false to fall back to +// halFastFloodWalk + the per-side halFastFloodScanAndPush calls. +bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, + uint8_t matchColor, uint8_t newColor, bool matchEqual, + int16_t *stackX, int16_t *stackY, + int16_t *spInOut, int16_t maxSp, + bool *seedMatched, + int16_t *leftXOut, int16_t *rightXOut); + +// surfaceBlit / surfaceBlitMasked rect-copy helper. Caller has done +// the clip math: dstRow0 / srcRow0 point at row 0 of the source/dest +// regions, dstX / srcX are intra-row pixel offsets, copyW/copyH are +// the clipped extents. dst stride is hardcoded SURFACE_BYTES_PER_ROW. +// transparent == $FFFF means opaque (always copy); any 0..15 value +// means src nibbles equal to that index are skipped. +// Returns true if the port handled it; false to fall back to C. 
+bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, + const uint8_t *srcRow0, int16_t srcX, + int16_t copyW, int16_t copyH, int16_t srcRowBytes, + uint16_t transparent); + #endif diff --git a/src/core/init.c b/src/core/init.c index 1e2c5b7..26f4c99 100644 --- a/src/core/init.c +++ b/src/core/init.c @@ -57,23 +57,26 @@ bool joeyInit(const JoeyConfigT *config) { memcpy(&gConfig, config, sizeof(gConfig)); - if (!surfaceAllocScreen()) { - setError("failed to allocate screen surface"); + // halInit must run before stageAlloc: on IIgs the stage's pixel + // buffer comes from halStageAllocPixels, which depends on shadow / + // SHR setup that halInit performs. + if (!halInit(&gConfig)) { + const char *halMsg = halLastError(); + setError(halMsg != NULL ? halMsg : "halInit failed"); + return false; + } + + if (!stageAlloc()) { + setError("failed to allocate stage surface"); + halShutdown(); return false; } if (!codegenArenaInit(gConfig.codegenBytes != 0 ? gConfig.codegenBytes : DEFAULT_CODEGEN_BYTES)) { setError("failed to allocate codegen arena"); - surfaceFreeScreen(); - return false; - } - - if (!halInit(&gConfig)) { - const char *halMsg = halLastError(); - setError(halMsg != NULL ? halMsg : "halInit failed"); - codegenArenaShutdown(); - surfaceFreeScreen(); + stageFree(); + halShutdown(); return false; } @@ -99,9 +102,9 @@ void joeyShutdown(void) { return; } halInputShutdown(); - halShutdown(); codegenArenaShutdown(); - surfaceFreeScreen(); + stageFree(); + halShutdown(); gInitialized = false; clearError(); } diff --git a/src/core/input.c b/src/core/input.c index acb81d4..ad7cacf 100644 --- a/src/core/input.c +++ b/src/core/input.c @@ -38,6 +38,25 @@ void joeyInputPoll(void) { } +void joeyWaitForAnyKey(void) { + int16_t i; + + // Prime the previous-state snapshot so a key already held when the + // wait starts has to be released and re-pressed (rising edge) to + // satisfy the wait. 
Otherwise auto-repeat or a key still down from + // the previous frame would exit instantly. + joeyInputPoll(); + while (1) { + joeyInputPoll(); + for (i = (int16_t)(KEY_NONE + 1); i < (int16_t)KEY_COUNT; i++) { + if (joeyKeyPressed((JoeyKeyE)i)) { + return; + } + } + } +} + + bool joeyKeyDown(JoeyKeyE key) { if (key <= KEY_NONE || key >= KEY_COUNT) { return false; diff --git a/src/core/present.c b/src/core/present.c index 8cc35de..84d561a 100644 --- a/src/core/present.c +++ b/src/core/present.c @@ -1,32 +1,40 @@ -// Present / slam dispatcher. +// Stage present dispatcher. // -// Validates and clips the source rectangle, then routes to the port's -// HAL implementation for the actual pixel format conversion and -// display-memory write. +// stagePresent walks the per-row dirty bands set by drawing primitives +// and asks the port HAL to flip just those rows to the display, then +// resets the dirty state. stagePresentRect bypasses dirty tracking +// entirely and slams a caller-specified rectangle (after clipping). 
#include +#include "joey/debug.h" #include "joey/present.h" #include "hal.h" #include "surfaceInternal.h" // ----- Public API (alphabetical) ----- -void surfacePresent(const SurfaceT *src) { - if (src == NULL) { +void stagePresent(void) { + SurfaceT *stage; + + stage = stageGet(); + if (stage == NULL) { return; } - halPresent(src); + halPresent(stage); + stageDirtyClearAll(); } -void surfacePresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) { - int16_t sx; - int16_t sy; - int16_t sw; - int16_t sh; +void stagePresentRect(int16_t x, int16_t y, uint16_t w, uint16_t h) { + SurfaceT *stage; + int16_t sx; + int16_t sy; + int16_t sw; + int16_t sh; - if (src == NULL) { + stage = stageGet(); + if (stage == NULL) { return; } @@ -59,5 +67,5 @@ void surfacePresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, u return; } - halPresentRect(src, sx, sy, (uint16_t)sw, (uint16_t)sh); + halPresentRect(stage, sx, sy, (uint16_t)sw, (uint16_t)sh); } diff --git a/src/core/sprite.c b/src/core/sprite.c index fdd1257..d505c90 100644 --- a/src/core/sprite.c +++ b/src/core/sprite.c @@ -153,6 +153,7 @@ static void spriteDrawInterpreted(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y writeDstNibble(dstRow, (int16_t)(dx + col), nibble); } } + surfaceMarkDirtyRect(s, dx, dy, w, h); } @@ -276,6 +277,7 @@ void spriteDraw(SurfaceT *s, SpriteT *sp, int16_t x, int16_t y) { // need clip math (they walk fixed offsets). if (sp->slot != NULL && isFullyOnSurface(x, y, widthPx, heightPx)) { spriteCompiledDraw(s, sp, x, y); + surfaceMarkDirtyRect(s, x, y, (int16_t)widthPx, (int16_t)heightPx); return; } spriteDrawInterpreted(s, sp, x, y); @@ -556,6 +558,8 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { shift = (copyBytes == (int16_t)spriteBytesPerRow) ? 
0 : 1; if (sp->routineOffsets[shift][SPRITE_OP_RESTORE] != SPRITE_NOT_COMPILED) { spriteCompiledRestoreUnder(s, backup); + surfaceMarkDirtyRect(s, backup->x, backup->y, + (int16_t)backup->width, (int16_t)backup->height); return; } } @@ -568,6 +572,8 @@ void spriteRestoreUnder(SurfaceT *s, const SpriteBackupT *backup) { &backup->bytes[(uint16_t)row * (uint16_t)copyBytes], (size_t)copyBytes); } + surfaceMarkDirtyRect(s, backup->x, backup->y, + (int16_t)backup->width, (int16_t)backup->height); } diff --git a/src/core/surface.c b/src/core/surface.c index 493fc22..0468ff9 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -1,5 +1,5 @@ // Surface allocation, destruction, persistence, and the library-owned -// screen surface. +// stage (the back-buffer surface). #include #include @@ -7,6 +7,7 @@ #include #include "joey/surface.h" +#include "hal.h" #include "surfaceInternal.h" #define SURFACE_PALETTE_BYTES (SURFACE_PALETTE_ENTRIES * (uint32_t)sizeof(uint16_t)) @@ -19,20 +20,52 @@ // ----- Module state ----- -static SurfaceT *gScreen = NULL; +static SurfaceT *gStage = NULL; + +uint8_t gStageMinWord[SURFACE_HEIGHT]; +uint8_t gStageMaxWord[SURFACE_HEIGHT]; + +// ----- Internal helpers (alphabetical) ----- + +static void widenRow(int16_t y, uint8_t minWord, uint8_t maxWord) { + if (minWord < gStageMinWord[y]) { + gStageMinWord[y] = minWord; + } + if (maxWord > gStageMaxWord[y]) { + gStageMaxWord[y] = maxWord; + } +} // ----- Public API (alphabetical) ----- +SurfaceT *stageGet(void) { + return gStage; +} + + void surfaceCopy(SurfaceT *dst, const SurfaceT *src) { if (dst == NULL || src == NULL || dst == src) { return; } - memcpy(dst, src, sizeof(SurfaceT)); + memcpy(dst->pixels, src->pixels, SURFACE_PIXELS_SIZE); + memcpy(dst->scb, src->scb, sizeof(src->scb)); + memcpy(dst->palette, src->palette, sizeof(src->palette)); + surfaceMarkDirtyAll(dst); } SurfaceT *surfaceCreate(void) { - SurfaceT *s = (SurfaceT *)calloc(1, sizeof(SurfaceT)); + SurfaceT *s; + + s = (SurfaceT 
*)calloc(1, sizeof(SurfaceT)); + if (s == NULL) { + return NULL; + } + s->pixels = (uint8_t *)calloc(1, SURFACE_PIXELS_SIZE); + if (s->pixels == NULL) { + free(s); + return NULL; + } return s; } @@ -41,18 +74,14 @@ void surfaceDestroy(SurfaceT *s) { if (s == NULL) { return; } - if (s == gScreen) { + if (s == gStage) { return; } + free(s->pixels); free(s); } -SurfaceT *surfaceGetScreen(void) { - return gScreen; -} - - bool surfaceLoadFile(SurfaceT *dst, const char *path) { FILE *fp; long fileSize; @@ -90,6 +119,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) { return false; } fclose(fp); + surfaceMarkDirtyAll(dst); return true; } @@ -121,21 +151,81 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) { } -// ----- Internal (alphabetical) ----- +void surfaceMarkDirtyAll(const SurfaceT *s) { + int16_t row; -bool surfaceAllocScreen(void) { - if (gScreen != NULL) { - return true; - } - gScreen = (SurfaceT *)calloc(1, sizeof(SurfaceT)); - return gScreen != NULL; -} - - -void surfaceFreeScreen(void) { - if (gScreen == NULL) { + if (s != gStage) { return; } - free(gScreen); - gScreen = NULL; + for (row = 0; row < SURFACE_HEIGHT; row++) { + gStageMinWord[row] = 0; + gStageMaxWord[row] = (uint8_t)(SURFACE_WORDS_PER_ROW - 1); + } +} + + +// Drawing primitives pass the rect they actually wrote (already +// clipped to surface bounds, w and h positive). For non-stage surfaces +// the call is a no-op so primitives can call unconditionally without +// branching themselves. 
+void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h) { + int16_t row; + int16_t yEnd; + uint8_t minWord; + uint8_t maxWord; + + if (s != gStage) { + return; + } + if (w <= 0 || h <= 0) { + return; + } + minWord = (uint8_t)(x >> 2); + maxWord = (uint8_t)((x + w - 1) >> 2); + yEnd = y + h; + for (row = y; row < yEnd; row++) { + widenRow(row, minWord, maxWord); + } +} + + +// ----- Internal (alphabetical) ----- + +bool stageAlloc(void) { + if (gStage != NULL) { + return true; + } + gStage = (SurfaceT *)calloc(1, sizeof(SurfaceT)); + if (gStage == NULL) { + return false; + } + gStage->pixels = halStageAllocPixels(); + if (gStage->pixels == NULL) { + free(gStage); + gStage = NULL; + return false; + } + memset(gStage->pixels, 0, SURFACE_PIXELS_SIZE); + stageDirtyClearAll(); + return true; +} + + +void stageDirtyClearAll(void) { + int16_t row; + + for (row = 0; row < SURFACE_HEIGHT; row++) { + gStageMinWord[row] = STAGE_DIRTY_CLEAN_MIN; + gStageMaxWord[row] = STAGE_DIRTY_CLEAN_MAX; + } +} + + +void stageFree(void) { + if (gStage == NULL) { + return; + } + halStageFreePixels(gStage->pixels); + free(gStage); + gStage = NULL; } diff --git a/src/core/surfaceInternal.h b/src/core/surfaceInternal.h index 5da4e74..aa006ee 100644 --- a/src/core/surfaceInternal.h +++ b/src/core/surfaceInternal.h @@ -7,15 +7,57 @@ #include "joey/surface.h" +// Pixels are reached through a pointer rather than an inline array so +// that the per-port HAL can pin the stage's pixel buffer to a specific +// hardware-friendly address (e.g. IIgs $01/2000 with SHR shadow +// inhibited at $C035 so writes stay in fast bank $01 instead of +// auto-mirroring to $E1). Caller-side `s->pixels[i]` syntax is +// unchanged; only allocation/copy paths in surface.c shift to a +// two-buffer model. 
struct SurfaceT { - uint8_t pixels[SURFACE_PIXELS_SIZE]; - uint8_t scb[SURFACE_HEIGHT]; - uint16_t palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; + uint8_t *pixels; + uint8_t scb[SURFACE_HEIGHT]; + uint16_t palette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; }; -// Allocate and free the library's pre-allocated screen surface. Called -// from init.c during joeyInit / joeyShutdown. -bool surfaceAllocScreen(void); -void surfaceFreeScreen(void); +// 16-bit words per scanline. SHR / chunky 4bpp packed = 2 px per byte, +// 4 px per 16-bit word. SURFACE_BYTES_PER_ROW (160) / 2 = 80 words. +// Dirty tracking grain is 16-bit words because that matches the IIgs +// PEI / PHA slam unit and the Amiga / ST c2p group is 16 px = 4 words. +#define SURFACE_WORDS_PER_ROW (SURFACE_BYTES_PER_ROW / 2) + +// Sentinels for "row is clean": min > max can never happen for a real +// dirty range, so the present loop tests `min > max` to skip a row. +#define STAGE_DIRTY_CLEAN_MIN 0xFFu +#define STAGE_DIRTY_CLEAN_MAX 0x00u + +// Per-row dirty word bands for the stage. gStageMinWord[y] is the +// leftmost dirty 16-bit column on row y (inclusive); gStageMaxWord[y] +// is the rightmost (inclusive). Both default to the CLEAN sentinels +// after stageAlloc and after each stagePresent. +extern uint8_t gStageMinWord[SURFACE_HEIGHT]; +extern uint8_t gStageMaxWord[SURFACE_HEIGHT]; + +// Drawing primitives call this with their already-clipped destination +// rect. If `s` is the stage, the affected rows' [minWord, maxWord] +// bands are widened to cover the rect. If `s` is any other surface, +// the call is a no-op -- non-stage surfaces never get presented, so +// they don't carry dirty state. +void surfaceMarkDirtyRect(const SurfaceT *s, int16_t x, int16_t y, int16_t w, int16_t h); + +// Shorthand for "every row, full width" -- used by surfaceClear and +// the bulk-replace paths (surfaceCopy, surfaceLoadFile). No-op if `s` +// is not the stage. 
+void surfaceMarkDirtyAll(const SurfaceT *s); + +// Reset every row to CLEAN. Called by stagePresent after the slam. +void stageDirtyClearAll(void); + +// Allocate and free the library-owned stage (the back-buffer surface +// that stagePresent flips to the display). Called from init.c during +// joeyInit / joeyShutdown. The stage's pixel storage is supplied by +// the port HAL via halStageAllocPixels. +bool stageAlloc(void); +void stageFree(void); #endif diff --git a/src/core/tile.c b/src/core/tile.c new file mode 100644 index 0000000..87bb36a --- /dev/null +++ b/src/core/tile.c @@ -0,0 +1,280 @@ +// Tiles as 8x8 surface regions. The whole API is byte-shoveling +// between SurfaceT regions (or between a SurfaceT region and a +// stack TileT buffer); no separate tileset container, no allocator. +// +// Block coords (bx, by) map to pixel (bx*8, by*8). At 4bpp packed +// each tile row is 4 bytes wide, so byte-aligned memcpy is the inner +// loop for everything except the masked / transparent variant, which +// has to read-modify-write each byte to preserve destination pixels +// under transparent (color-0 by convention) source nibbles. + +#include + +#include "joey/tile.h" +#include "hal.h" +#include "surfaceInternal.h" + +// (No -- the 4-byte-per-row inner copies are spelled out +// inline below. Avoiding memcpy / memset from the DRAWPRIMS load +// segment keeps cross-bank relocation references out of 13/SysLib; +// without that the ORCA Linker hits "Expression too complex" on +// the small-binary builds.) + +// Hoist tile primitives into the DRAWPRIMS load segment. Asm +// dispatches go through halFast* hooks in src/port/iigs/hal.c so +// only one TU references the asm symbols (avoids the cumulative +// "Expression too complex" link failure). 
+JOEYLIB_SEGMENT("DRAWPRIMS") + +// ----- Prototypes ----- + +static void copyTileOpaque(uint8_t *dst, const uint8_t *src); +static void copyTileMasked(uint8_t *dst, const uint8_t *src, uint8_t transparent); + +// ----- Internal helpers (alphabetical) ----- + +// 32-byte block copy with the surface row stride between rows. dst +// and src already point at the row-0 first byte of their respective +// 8x8 regions. +static void copyTileOpaque(uint8_t *dst, const uint8_t *src) { + uint8_t row; + + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst += SURFACE_BYTES_PER_ROW; + src += SURFACE_BYTES_PER_ROW; + } +} + + +// Same as copyTileOpaque but treats source nibbles equal to +// `transparent` as skip-through. The src bytes are inspected nibble- +// by-nibble; only non-transparent nibbles overwrite the destination. +static void copyTileMasked(uint8_t *dst, const uint8_t *src, uint8_t transparent) { + uint8_t row; + uint8_t col; + uint8_t srcByte; + uint8_t dstByte; + uint8_t transHi; + uint8_t transLo; + uint8_t srcHi; + uint8_t srcLo; + + transHi = (uint8_t)((transparent & 0x0F) << 4); + transLo = (uint8_t)(transparent & 0x0F); + + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + for (col = 0; col < TILE_BYTES_PER_ROW; col++) { + srcByte = src[col]; + srcHi = (uint8_t)(srcByte & 0xF0); + srcLo = (uint8_t)(srcByte & 0x0F); + // Both nibbles transparent: skip the byte entirely. 
+ if (srcHi == transHi && srcLo == transLo) { + continue; + } + dstByte = dst[col]; + if (srcHi != transHi) { + dstByte = (uint8_t)((dstByte & 0x0F) | srcHi); + } + if (srcLo != transLo) { + dstByte = (uint8_t)((dstByte & 0xF0) | srcLo); + } + dst[col] = dstByte; + } + dst += SURFACE_BYTES_PER_ROW; + src += SURFACE_BYTES_PER_ROW; + } +} + + +// ----- Public API (alphabetical) ----- + +void drawText(SurfaceT *dst, uint8_t bx, uint8_t by, const SurfaceT *fontSurface, const uint16_t *asciiMap, const char *str) { + uint16_t entry; + uint8_t cx; + uint8_t cy; + uint8_t ch; + uint8_t srcBx; + uint8_t srcBy; + + if (dst == NULL || fontSurface == NULL || asciiMap == NULL || str == NULL) { + return; + } + cx = bx; + cy = by; + while (*str != '\0') { + ch = (uint8_t)*str++; + entry = asciiMap[ch]; + if (entry != TILE_NO_GLYPH) { + srcBx = (uint8_t)(entry & 0x00FFu); + srcBy = (uint8_t)((entry >> 8) & 0x00FFu); + tileCopyMasked(dst, cx, cy, fontSurface, srcBx, srcBy, 0u); + } + cx++; + if (cx >= TILE_BLOCKS_PER_ROW) { + cx = 0; + cy++; + if (cy >= TILE_BLOCKS_PER_COL) { + return; + } + } + } +} + + +void tileCopy(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy) { + uint8_t *dstRow0; + const uint8_t *srcRow0; + uint16_t dstPixelX; + uint16_t dstPixelY; + uint16_t srcPixelX; + uint16_t srcPixelY; + + if (dst == NULL || src == NULL) { + return; + } + if (dstBx >= TILE_BLOCKS_PER_ROW || dstBy >= TILE_BLOCKS_PER_COL || + srcBx >= TILE_BLOCKS_PER_ROW || srcBy >= TILE_BLOCKS_PER_COL) { + return; + } + dstPixelX = (uint16_t)((uint16_t)dstBx * TILE_PIXELS_PER_SIDE); + dstPixelY = (uint16_t)((uint16_t)dstBy * TILE_PIXELS_PER_SIDE); + srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); + srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); + + dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)]; + + if 
(!halFastTileCopy(dstRow0, srcRow0)) { + copyTileOpaque(dstRow0, srcRow0); + } + surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, + TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); +} + + +void tileCopyMasked(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, uint8_t transparentIndex) { + uint8_t *dstRow0; + const uint8_t *srcRow0; + uint16_t dstPixelX; + uint16_t dstPixelY; + uint16_t srcPixelX; + uint16_t srcPixelY; + + if (dst == NULL || src == NULL) { + return; + } + if (dstBx >= TILE_BLOCKS_PER_ROW || dstBy >= TILE_BLOCKS_PER_COL || + srcBx >= TILE_BLOCKS_PER_ROW || srcBy >= TILE_BLOCKS_PER_COL) { + return; + } + dstPixelX = (uint16_t)((uint16_t)dstBx * TILE_PIXELS_PER_SIDE); + dstPixelY = (uint16_t)((uint16_t)dstBy * TILE_PIXELS_PER_SIDE); + srcPixelX = (uint16_t)((uint16_t)srcBx * TILE_PIXELS_PER_SIDE); + srcPixelY = (uint16_t)((uint16_t)srcBy * TILE_PIXELS_PER_SIDE); + + dstRow0 = &dst->pixels[dstPixelY * SURFACE_BYTES_PER_ROW + (dstPixelX >> 1)]; + srcRow0 = &src->pixels[srcPixelY * SURFACE_BYTES_PER_ROW + (srcPixelX >> 1)]; + + if (!halFastTileCopyMasked(dstRow0, srcRow0, transparentIndex)) { + copyTileMasked(dstRow0, srcRow0, transparentIndex); + } + surfaceMarkDirtyRect(dst, (int16_t)dstPixelX, (int16_t)dstPixelY, + TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); +} + + +void tileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) { + uint8_t doubled; + uint16_t pixelX; + uint16_t pixelY; + + if (s == NULL) { + return; + } + if (bx >= TILE_BLOCKS_PER_ROW || by >= TILE_BLOCKS_PER_COL) { + return; + } + pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); + pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); + doubled = (uint8_t)(((colorIndex & 0x0F) << 4) | (colorIndex & 0x0F)); + if (!halFastTileFill(s, bx, by, + (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)))) { + uint8_t *row = &s->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + uint8_t i; + for (i = 0; i < 
TILE_PIXELS_PER_SIDE; i++) { + row[0] = doubled; + row[1] = doubled; + row[2] = doubled; + row[3] = doubled; + row += SURFACE_BYTES_PER_ROW; + } + } + surfaceMarkDirtyRect(s, (int16_t)pixelX, (int16_t)pixelY, + TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); +} + + +void tilePaste(SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in) { + uint8_t *dstRow; + const uint8_t *src; + uint16_t pixelX; + uint16_t pixelY; + uint8_t row; + + if (dst == NULL || in == NULL) { + return; + } + if (bx >= TILE_BLOCKS_PER_ROW || by >= TILE_BLOCKS_PER_COL) { + return; + } + pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); + pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); + dstRow = &dst->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + src = &in->pixels[0]; + if (!halFastTilePaste(dstRow, src)) { + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dstRow[0] = src[0]; + dstRow[1] = src[1]; + dstRow[2] = src[2]; + dstRow[3] = src[3]; + dstRow += SURFACE_BYTES_PER_ROW; + src += TILE_BYTES_PER_ROW; + } + } + surfaceMarkDirtyRect(dst, (int16_t)pixelX, (int16_t)pixelY, + TILE_PIXELS_PER_SIDE, TILE_PIXELS_PER_SIDE); +} + + +void tileSnap(const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out) { + const uint8_t *srcRow; + uint8_t *dst; + uint16_t pixelX; + uint16_t pixelY; + uint8_t row; + + if (src == NULL || out == NULL) { + return; + } + if (bx >= TILE_BLOCKS_PER_ROW || by >= TILE_BLOCKS_PER_COL) { + return; + } + pixelX = (uint16_t)((uint16_t)bx * TILE_PIXELS_PER_SIDE); + pixelY = (uint16_t)((uint16_t)by * TILE_PIXELS_PER_SIDE); + srcRow = &src->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + dst = &out->pixels[0]; + if (!halFastTileSnap(dst, srcRow)) { + for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { + dst[0] = srcRow[0]; + dst[1] = srcRow[1]; + dst[2] = srcRow[2]; + dst[3] = srcRow[3]; + srcRow += SURFACE_BYTES_PER_ROW; + dst += TILE_BYTES_PER_ROW; + } + } +} diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index 9c8bb20..a802e1d 
100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -439,7 +440,7 @@ bool halInit(const JoeyConfigT *config) { } // Force COLOR00 to black so the overscan/border region around the // 320x200 display is black until the app's palette load takes over - // on the first surfacePresent. Apps that paint a non-black bg need + // on the first stagePresent. Apps that paint a non-black bg need // do nothing -- their palette[0] writes the same COLOR00 once the // first LoadRGB4 fires from uploadScbAndPalette. SetRGB4(&gScreen->ViewPort, 0, 0, 0, 0); @@ -453,11 +454,30 @@ const char *halLastError(void) { void halPresent(const SurfaceT *src) { + int16_t y; + uint8_t minWord; + uint8_t maxWord; + uint16_t byteStart; + uint16_t byteEnd; + if (src == NULL || gScreen == NULL) { return; } updateCopperIfNeeded(src); - c2pRange(src, 0, SURFACE_HEIGHT, 0, AMIGA_BYTES_PER_ROW); + + // Walk per-row dirty bands: each planar byte covers 8 px = 2 chunky + // words, so byteStart = minWord/2 and byteEnd = maxWord/2 + 1 + // converts dirty-word units to the planar-byte units c2pRange wants. + for (y = 0; y < SURFACE_HEIGHT; y++) { + minWord = gStageMinWord[y]; + maxWord = gStageMaxWord[y]; + if (minWord > maxWord) { + continue; + } + byteStart = (uint16_t)(minWord >> 1); + byteEnd = (uint16_t)((maxWord >> 1) + 1); + c2pRange(src, y, (int16_t)(y + 1), byteStart, byteEnd); + } } @@ -507,3 +527,183 @@ void halShutdown(void) { gNewUCL = NULL; } } + + +// Amiga has no asm fast paths yet; cross-platform code falls back to +// its C implementations whenever these return false. 
+bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { + (void)s; + (void)doubled; + return false; +} + + +bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; + return false; +} + + +bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { + (void)dstRow0; + (void)srcRow0; + return false; +} + + +bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { + (void)dstRow0; + (void)srcRow0; + (void)transparent; + return false; +} + + +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { + (void)dstRow0; + (void)srcTilePixels; + return false; +} + + +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { + (void)dstTilePixels; + (void)srcRow0; + return false; +} + + +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)colorIndex; + return false; +} + + +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + (void)s; + (void)x0; + (void)y0; + (void)x1; + (void)y1; + (void)colorIndex; + return false; +} + + +bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + (void)s; + (void)cx; + (void)cy; + (void)r; + (void)colorIndex; + return false; +} + + +bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + (void)s; + (void)cx; + (void)cy; + (void)r; + (void)colorIndex; + return false; +} + + +bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)row; + (void)startX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)seedMatched; + (void)leftXOut; + (void)rightXOut; + return false; +} + + +bool halFastFloodScanRow(uint8_t *row, int16_t leftX, 
int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)row; + (void)leftX; + (void)rightX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)markBuf; + return false; +} + + +bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dstRow0; + (void)dstX; + (void)srcRow0; + (void)srcX; + (void)copyW; + (void)copyH; + (void)srcRowBytes; + (void)transparent; + return false; +} + + +bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { + (void)row; + (void)leftX; + (void)rightX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)scanY; + (void)stackX; + (void)stackY; + (void)spInOut; + (void)maxSp; + return false; +} + + +bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)pixels; + (void)x; + (void)y; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)stackX; + (void)stackY; + (void)spInOut; + (void)maxSp; + (void)seedMatched; + (void)leftXOut; + (void)rightXOut; + return false; +} + + +bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { + (void)s; + (void)bx; + (void)by; + (void)fillWord; + return false; +} + + +uint8_t *halStageAllocPixels(void) { + return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); +} + + +void halStageFreePixels(uint8_t *pixels) { + free(pixels); +} diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 57d4ced..0ab4f8d 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ 
-497,11 +498,30 @@ const char *halLastError(void) { void halPresent(const SurfaceT *src) { + int16_t y; + uint8_t minWord; + uint8_t maxWord; + uint16_t groupStart; + uint16_t groupEnd; + if (src == NULL || !gModeSet) { return; } refreshPaletteStateIfNeeded(src); - c2pRange(src, 0, SURFACE_HEIGHT, 0, ST_GROUPS_PER_ROW); + + // Walk per-row dirty bands: each c2p group covers 16 px = 4 chunky + // words, so groupStart = minWord/4 and groupEnd = maxWord/4 + 1 + // converts dirty-word units to c2pRange's group units. + for (y = 0; y < SURFACE_HEIGHT; y++) { + minWord = gStageMinWord[y]; + maxWord = gStageMaxWord[y]; + if (minWord > maxWord) { + continue; + } + groupStart = (uint16_t)(minWord >> 2); + groupEnd = (uint16_t)((maxWord >> 2) + 1); + c2pRange(src, y, (int16_t)(y + 1), groupStart, groupEnd); + } } @@ -563,3 +583,183 @@ void halShutdown(void) { writeDiagnostics(); gModeSet = false; } + + +// ST has no asm fast paths yet; cross-platform code falls back to its +// C implementations when these return false. 
+bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { + (void)s; + (void)doubled; + return false; +} + + +bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; + return false; +} + + +bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { + (void)dstRow0; + (void)srcRow0; + return false; +} + + +bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { + (void)dstRow0; + (void)srcRow0; + (void)transparent; + return false; +} + + +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { + (void)dstRow0; + (void)srcTilePixels; + return false; +} + + +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { + (void)dstTilePixels; + (void)srcRow0; + return false; +} + + +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)colorIndex; + return false; +} + + +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + (void)s; + (void)x0; + (void)y0; + (void)x1; + (void)y1; + (void)colorIndex; + return false; +} + + +bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + (void)s; + (void)cx; + (void)cy; + (void)r; + (void)colorIndex; + return false; +} + + +bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + (void)s; + (void)cx; + (void)cy; + (void)r; + (void)colorIndex; + return false; +} + + +bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)row; + (void)startX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)seedMatched; + (void)leftXOut; + (void)rightXOut; + return false; +} + + +bool halFastFloodScanRow(uint8_t *row, int16_t leftX, 
int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)row; + (void)leftX; + (void)rightX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)markBuf; + return false; +} + + +bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dstRow0; + (void)dstX; + (void)srcRow0; + (void)srcX; + (void)copyW; + (void)copyH; + (void)srcRowBytes; + (void)transparent; + return false; +} + + +bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { + (void)row; + (void)leftX; + (void)rightX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)scanY; + (void)stackX; + (void)stackY; + (void)spInOut; + (void)maxSp; + return false; +} + + +bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)pixels; + (void)x; + (void)y; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)stackX; + (void)stackY; + (void)spInOut; + (void)maxSp; + (void)seedMatched; + (void)leftXOut; + (void)rightXOut; + return false; +} + + +bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { + (void)s; + (void)bx; + (void)by; + (void)fillWord; + return false; +} + + +uint8_t *halStageAllocPixels(void) { + return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); +} + + +void halStageFreePixels(uint8_t *pixels) { + free(pixels); +} diff --git a/src/port/dos/hal.c b/src/port/dos/hal.c index f37da87..70989cd 100644 --- a/src/port/dos/hal.c +++ b/src/port/dos/hal.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -216,14 
+217,29 @@ const char *halLastError(void) { void halPresent(const SurfaceT *src) { - int16_t y; + int16_t y; + uint8_t minWord; + uint8_t maxWord; + int16_t pixelX; + uint16_t pixelW; if (src == NULL || gVgaMem == NULL) { return; } uploadPaletteIfNeeded(src); + + // Walk per-row dirty bands: each chunky word holds 4 mode-13h + // bytes, so pixelX = minWord*4 and pixelW = (maxWord-minWord+1)*4 + // gives the byte range expandAndWriteLine needs. for (y = 0; y < SURFACE_HEIGHT; y++) { - expandAndWriteLine(src, y, 0, SURFACE_WIDTH, &gVgaMem[y * VGA_STRIDE]); + minWord = gStageMinWord[y]; + maxWord = gStageMaxWord[y]; + if (minWord > maxWord) { + continue; + } + pixelX = (int16_t)((uint16_t)minWord << 2); + pixelW = (uint16_t)(((uint16_t)maxWord - minWord + 1u) << 2); + expandAndWriteLine(src, y, pixelX, pixelW, &gVgaMem[y * VGA_STRIDE]); } } @@ -277,3 +293,183 @@ void halShutdown(void) { gCrashLog = NULL; } } + + +// DOS has no asm fast paths yet; cross-platform code falls back to +// its C implementations when these return false. 
+bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { + (void)s; + (void)doubled; + return false; +} + + +bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)w; + (void)h; + (void)colorIndex; + return false; +} + + +bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { + (void)dstRow0; + (void)srcRow0; + return false; +} + + +bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { + (void)dstRow0; + (void)srcRow0; + (void)transparent; + return false; +} + + +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { + (void)dstRow0; + (void)srcTilePixels; + return false; +} + + +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { + (void)dstTilePixels; + (void)srcRow0; + return false; +} + + +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { + (void)s; + (void)x; + (void)y; + (void)colorIndex; + return false; +} + + +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + (void)s; + (void)x0; + (void)y0; + (void)x1; + (void)y1; + (void)colorIndex; + return false; +} + + +bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + (void)s; + (void)cx; + (void)cy; + (void)r; + (void)colorIndex; + return false; +} + + +bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + (void)s; + (void)cx; + (void)cy; + (void)r; + (void)colorIndex; + return false; +} + + +bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)row; + (void)startX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)seedMatched; + (void)leftXOut; + (void)rightXOut; + return false; +} + + +bool halFastFloodScanRow(uint8_t *row, int16_t leftX, 
int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + (void)row; + (void)leftX; + (void)rightX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)markBuf; + return false; +} + + +bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + (void)dstRow0; + (void)dstX; + (void)srcRow0; + (void)srcX; + (void)copyW; + (void)copyH; + (void)srcRowBytes; + (void)transparent; + return false; +} + + +bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { + (void)row; + (void)leftX; + (void)rightX; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)scanY; + (void)stackX; + (void)stackY; + (void)spInOut; + (void)maxSp; + return false; +} + + +bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + (void)pixels; + (void)x; + (void)y; + (void)matchColor; + (void)newColor; + (void)matchEqual; + (void)stackX; + (void)stackY; + (void)spInOut; + (void)maxSp; + (void)seedMatched; + (void)leftXOut; + (void)rightXOut; + return false; +} + + +bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { + (void)s; + (void)bx; + (void)by; + (void)fillWord; + return false; +} + + +uint8_t *halStageAllocPixels(void) { + return (uint8_t *)malloc(SURFACE_PIXELS_SIZE); +} + + +void halStageFreePixels(uint8_t *pixels) { + free(pixels); +} diff --git a/src/port/iigs/audio.c b/src/port/iigs/audio.c deleted file mode 100644 index d833d84..0000000 --- a/src/port/iigs/audio.c +++ /dev/null @@ -1,70 +0,0 @@ -// Apple IIgs audio HAL stub. 
Real implementation pending. -// -// Pipeline already in place: -// * toolchains/install.sh fetches Ninjaforce NTP source into -// toolchains/iigs/ntp/ (CRLF stripped on extraction). -// * make/iigs.mk assembles ninjatrackerplus.s with Merlin32 into -// build/iigs/audio/ntpplayer.bin (34 KB raw 65816, originally -// org $0F0000 but bank-internal so it relocates at any bank-start -// load address). -// * package-disk.sh bundles ntpplayer.bin onto the IIgs disk image -// alongside the demo binaries. -// -// Why this file is still a stub: -// The runtime load path (NewHandle + fopen + fread on NTPPLAYER.BIN) -// pulls Memory Manager + ORCA stdio into the link, and ORCA Linker -// fails with "Expression too complex in 13/SysLib" when those are -// added on top of the existing graphics + input HAL plumbing for -// *every* demo (the IIgs build links each binary as one monolithic -// image, no static-library culling). Bringing in those tool sets -// needs to land alongside the JSL trampoline that actually uses the -// loaded NTP -- one combined effort, with the demos that don't need -// audio either kept on a thinner audio shim or split out so the -// linker isn't asked to resolve everything for everyone. -// -// Until that combined load + trampoline iteration ships, every entry -// here is a safe no-op so the audio API stays callable. 
- -#include "hal.h" - - -bool halAudioInit(void) { - return false; -} - - -void halAudioShutdown(void) { -} - - -void halAudioPlayMod(const uint8_t *data, uint32_t length, bool loop) { - (void)data; - (void)length; - (void)loop; -} - - -void halAudioStopMod(void) { -} - - -bool halAudioIsPlayingMod(void) { - return false; -} - - -void halAudioPlaySfx(uint8_t slot, const uint8_t *sample, uint32_t length, uint16_t rateHz) { - (void)slot; - (void)sample; - (void)length; - (void)rateHz; -} - - -void halAudioStopSfx(uint8_t slot) { - (void)slot; -} - - -void halAudioFrameTick(void) { -} diff --git a/src/port/iigs/audio_full.c b/src/port/iigs/audio_full.c index ea3c566..76da6b3 100644 --- a/src/port/iigs/audio_full.c +++ b/src/port/iigs/audio_full.c @@ -1,7 +1,13 @@ -// Apple IIgs audio HAL -- full version (linked only into the AUDIO -// demo via make/iigs.mk's split source set). audio.c keeps the no-op -// stub for every other demo so the monolithic IIgs link budget stays -// safe. +// Apple IIgs audio HAL -- single source linked into every IIgs demo. +// Earlier we split this into an audio.c no-op stub and an audio_full.c +// real implementation, filtering audio_full.c out of non-AUDIO source +// sets, because pulling Memory Manager + the 34 KB NTP replayer into +// every binary blew the ORCA Linker's blank-segment / "Expression too +// complex" budget. Now that we know how to name load segments (see +// ORCA/C ch. 30 "segment statement"), we put every function in this +// file into the shared DRAWPRIMS load segment; the GS/OS loader places +// it outside _ROOT, so non-AUDIO binaries pay only for the data +// references, not the implementation code. 
// // The NinjaTrackerPlus replayer is Merlin32-assembled at build time // to ntpplayer.bin and baked into this TU as gNtpPlayerBytes via the @@ -17,7 +23,25 @@ #include "hal.h" #include "joey/audio.h" -#include "ntpplayer_data.h" + +// Place every function defined below in the shared DRAWPRIMS overflow +// load segment so the linker keeps the implementation code out of +// _ROOT in every binary that includes this TU. (See ORCA/C ch. 30 +// "segment statement". Reusing the same segment as draw.c / tile.c +// rather than picking a unique name keeps the linker's symbol- +// resolution expressions flat -- per-name extras nest the +// expression and trip the "too complex" threshold on small +// binaries.) +// +// The 34 KB NTP replayer bytes are NOT in this segment -- ORCA/C's +// `segment` statement only relocates functions, not data. They live +// in their own NTPDATA load segment, declared in build/iigs/audio/ +// ntpdata.asm (auto-generated from ntpplayer.bin by make/iigs.mk). +// We just extern the symbols here. +segment "DRAWPRIMS"; + +extern const unsigned char gNtpPlayerBytes[]; +extern const unsigned long gNtpPlayerBytes_len; // ----- Constants ----- diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index f05e13b..cb58d48 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -10,26 +10,118 @@ // ORCA/C must be built with 32-bit pointer mode (-w or equivalent) so // that the long addresses resolve to bank $E1. // -// For M1 this is a simple direct-copy present. PEI-slam (in assembly) -// arrives as an optimization in a later milestone; the structure here -// is unchanged -- only halPresent / halPresentRect get faster inner -// loops. +// DIRTY-WALK + PEI-SLAM PRESENT +// ----------------------------- +// Target design (not yet active -- halPresent currently always blits the full stage via iigsBlitStageToShr): halPresent walks the per-row dirty bands maintained by drawing +// primitives in src/core/*.c. 
Fully-dirty rows go through the PEI +// slam in src/port/iigs/peislam.asm (~530 cyc/row, ~55% faster than +// memcpy/MVN); partial-dirty rows use memcpy, which ORCA-C lowers +// to MVN (7 cyc/byte) -- the fastest 65816 way to move bytes into +// bank $E1 when the dirty band is too narrow to amortize the slam's +// per-call AUXWRITE/RAMRD/shadow toggle. +// +// peislam.asm declares its load segment as DRAWPRIMS so the linker +// places it in its own bank, separate from AUDIO's _ROOT (where +// audio_full.c + Memory Manager + stdio + NTPstreamsound already +// crowd up against the 64 KB-per-bank limit). #include #include +#include "joey/debug.h" #include "hal.h" #include "surfaceInternal.h" +// hal.c is the single TU that calls into joeyDraw.asm. Cross- +// platform draw.c / tile.c / etc. dispatch through halFast* +// functions defined here; they never reference the asm symbols +// directly. This avoids the cumulative ORCA-Linker-Expression- +// too-complex-in-13/SysLib failure that hit when each cross- +// platform TU brought its own asm extern. +JOEYLIB_SEGMENT("DRAWPRIMS") + +// 32 KB stack-slam fill via AUXWRITE. ~25 ms full-screen. +extern void iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord); +// PEI-slam fill of `bytesPerRow` doubled bytes per row across `rows` +// rows, advancing 160 bytes per row. firstRow must be in bank $01. +// Caller handles partial-nibble edges in C; bytesPerRow is even. +extern void iigsFillRectStageInner(uint8_t *firstRow, uint16_t bytesPerRow, uint16_t rows, uint16_t fillWord); +// 16 STA abs,X stores at fixed offsets along a 160-byte stride. +// ~120 cyc per call. +extern void iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord); +// Tile copy / paste / snap inner loops. All take 4-byte large- +// model pointers; bank may differ between dst and src (heap +// surface vs stage). 
Stride contracts: +// tileCopyInner / tileCopyMaskedInner: dst 160, src 160 +// tilePasteInner: dst 160, src 4 +// tileSnapInner: dst 4, src 160 +extern void iigsTileCopyInner(uint8_t *dstRow0, const uint8_t *srcRow0); +extern void iigsTileCopyMaskedInner(uint8_t *dstRow0, const uint8_t *srcRow0, uint16_t transparent); +extern void iigsTilePasteInner(uint8_t *dstRow0, const uint8_t *srcTilePixels); +extern void iigsTileSnapInner(uint8_t *dstTilePixels, const uint8_t *srcRow0); +// Single-pixel and Bresenham line plot. drawLine inner takes +// pre-clipped endpoints (caller validates against surface bounds); +// it does no per-pixel clipping in the loop. +extern void iigsDrawPixelInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t nibble); +extern void iigsDrawLineInner(uint8_t *pixels, uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1, uint16_t nibble); +// Bresenham midpoint circle outline. Caller has verified the entire +// bbox is on-surface so no per-pixel clip. +extern void iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t nibble); +// Stage-to-SHR full upload: pixels (MVN $01->$E1), SCB, palette. +// Asm uses post-MVN DBR=$E1 to do sta abs,Y for SCB/palette. +// Replaces ORCA-C's memcpy path which silently fails when called +// from halPresent (DBR-state quirk after prior asm primitives). +extern void iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr); +// floodFill row walk: tests seed pixel and walks left/right to find +// the matching run. Writes results to gFloodSeedMatch / gFloodLeftX / +// gFloodRightX (DRAWPRIMS globals). +extern void iigsFloodWalkInner(uint8_t *row, uint16_t startX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual); +extern uint16_t gFloodSeedMatch; +extern uint16_t gFloodLeftX; +extern uint16_t gFloodRightX; +// Per-pixel match scan over [leftX..rightX] of `row`. Writes 1/0 to +// markBuf[i] for each pixel. 
matchEqual selects boundary vs equal mode +// (see C srcPixel match logic). +extern void iigsFloodScanRowInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint8_t *markBuf); +// Per-pixel rect blit (src->dst). transparent == $FFFF means opaque +// (always copy); else pixels with src nibble == (transparent & $0F) +// are skipped. Dst stride is hardcoded 160 (SURFACE_BYTES_PER_ROW). +extern void iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, const uint8_t *srcRow0, uint16_t srcX, uint16_t copyW, uint16_t copyH, uint16_t srcRowBytes, uint16_t transparent); +// Combined scan + push: matches each pixel, tracks run state, pushes +// (x, scanY) to the (stackX, stackY) arrays at *spInOut on every +// falling edge and at the end of the row if still in a run. *spInOut +// is read on entry and updated with the new top-of-stack on return. +extern void iigsFloodScanAndPushInner(uint8_t *row, uint16_t leftX, uint16_t rightX, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, uint16_t scanY, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp); +// Single-call per-popped-seed worker: seed test + walk-left + walk-right +// + scan-above + scan-below + push, all sharing cached row addr and +// match decoders. Outputs to gFloodSeedMatch / gFloodLeftX / gFloodRightX. +extern void iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y, uint16_t matchColor, uint16_t newColor, uint16_t matchEqual, int16_t *stackX, int16_t *stackY, uint16_t *spInOut, uint16_t maxSp); +// One-shot init for the y*160 lookup table (gRowOffsetLut, 400 bytes +// in DRAWPRIMS data). Called once from halInit. After this returns, +// every asm primitive that needs row offset can do `lda >lut,x` instead +// of the 7-instruction shift-add. +extern void iigsInitRowLut(void); +// Filled circle, scanline-style. fillWord low byte is the doubled +// nibble (e.g., 0x33 for nibble 3). 
+extern void iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy, uint16_t r, uint16_t fillWord); + // ----- Hardware addresses (24-bit / long pointers) ----- #define IIGS_NEWVIDEO_REG ((volatile uint8_t *)0x00C029L) #define IIGS_BORDER_REG ((volatile uint8_t *)0x00C034L) +#define IIGS_SHADOW_REG ((volatile uint8_t *)0x00C035L) #define IIGS_VBL_STATUS ((volatile uint8_t *)0x00C019L) #define IIGS_SHR_PIXELS ((uint8_t *)0xE12000L) #define IIGS_SHR_SCB ((uint8_t *)0xE19D00L) #define IIGS_SHR_PALETTE ((uint16_t *)0xE19E00L) +// The stage lives at $01/2000 -- the same offset as the SHR display +// framebuffer at $E1/2000, but in the fast (2.8 MHz) bank. With SHR +// shadow inhibited at $C035, writes here are NOT auto-mirrored to +// $E1, so drawing is full-speed and isolated from the displayed +// frame until the next stagePresent. +#define IIGS_STAGE_PIXELS ((uint8_t *)0x012000L) + #define VBL_BAR_BIT 0x80 // NEWVIDEO bit masks @@ -41,6 +133,15 @@ // handler) and bumps its "Code: RED" status. Always include this bit. #define NEWVIDEO_RESERVED_BIT 0x01 +// $C035 SHADOW register: bit set = shadow INHIBITED for that range. +// Bit 1 = hi-res page 1 ($02000-$03FFF in bank $01) +// Bit 2 = hi-res page 2 ($04000-$05FFF in bank $01) +// Bit 3 = SHR ($02000-$09FFF in bank $01) +// We set 1+2+3 because the SHR pixel range overlaps both hi-res +// pages; leaving any of those shadows live would silently mirror +// part of the stage to $E1. +#define SHADOW_INHIBIT_SHR_MASK 0x0E + // $C034 BORDER register: high nibble = beep/IRQ enables (preserve), // low nibble = border color index 0..15. Color 0 is the all-zero // palette entry by IIgs convention; we force the low nibble to 0 @@ -51,6 +152,7 @@ static uint8_t gPreviousNewVideo = 0; static uint8_t gPreviousBorder = 0; +static uint8_t gPreviousShadow = 0; static bool gModeSet = false; // Last-uploaded SCB and palette. 
Both registers live in bank $E1; on a @@ -62,6 +164,22 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]; static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; static bool gCacheValid = false; +// PEI slam scratch shared with src/port/iigs/peislam.asm. File-scope +// non-static so the asm can `ext` them; all accesses inside the slam +// use long-mode addressing so they bypass the //e RAMRD redirect the +// slam turns on for the duration of the run. +volatile uint16_t gPeiOrigSp; +volatile uint8_t gPeiOrigShadow; +volatile uint16_t gPeiTempRowBase; + +// Defined in src/port/iigs/peislam.asm, in its own load segment +// (DRAWPRIMS) so the GS/OS loader places it in a different bank from +// AUDIO's _ROOT. PEI-slams the full 80 words of stage row `y` into +// the matching $E1 SHR row, ~530 cyc/row vs ~1120 cyc for memcpy/MVN. +extern void peiSlamFullRow(int16_t y); + + + // Upload SCB and palette into bank-$E1 SHR memory only when they have // changed since the last call. paletteOrScbChanged returns false when // the cache is already in sync, in which case both memcpys to $E1 are @@ -86,8 +204,18 @@ bool halInit(const JoeyConfigT *config) { (void)config; gPreviousNewVideo = *IIGS_NEWVIDEO_REG; gPreviousBorder = *IIGS_BORDER_REG; + gPreviousShadow = *IIGS_SHADOW_REG; *IIGS_NEWVIDEO_REG = (uint8_t)(NEWVIDEO_SHR_ON | NEWVIDEO_LINEARIZE | NEWVIDEO_RESERVED_BIT); *IIGS_BORDER_REG = (uint8_t)(gPreviousBorder & BORDER_COLOR_MASK); + // Inhibit shadowing of the stage region. Without this, every + // write to $01/2000-9FFF mirrors to $E1 and the off-screen-buffer + // illusion breaks (the user would see drawing in progress). + *IIGS_SHADOW_REG = (uint8_t)(gPreviousShadow | SHADOW_INHIBIT_SHR_MASK); + // SCB and palette are uploaded by halPresent's iigsBlitStageToShr + // (asm path, MVN to bank $E1). 
C-side memset/memcpy to bank $E1 + // is unreliable from halInit's calling context, so we don't try + // it here -- the first present will set up SCB to 320 mode. + iigsInitRowLut(); gModeSet = true; return true; } @@ -102,8 +230,13 @@ void halPresent(const SurfaceT *src) { if (src == NULL) { return; } - uploadScbAndPaletteIfNeeded(src); - memcpy(IIGS_SHR_PIXELS, src->pixels, SURFACE_PIXELS_SIZE); + // iigsBlitStageToShr does pixels (MVN $01->$E1) + SCB + palette + // upload entirely in asm via DBR=$E1 + sta abs,Y indexed stores. + // ORCA-C's C-side memcpy to bank $E1 has been unreliable from + // halPresent's calling context, so we route everything through + // the asm path. Future: re-introduce per-row dirty-band logic + // for partial-screen updates (currently we always blit 32K). + iigsBlitStageToShr(src->scb, &src->palette[0][0]); } @@ -134,6 +267,270 @@ void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint1 } +void halShutdown(void) { + if (gModeSet) { + *IIGS_NEWVIDEO_REG = gPreviousNewVideo; + *IIGS_BORDER_REG = gPreviousBorder; + *IIGS_SHADOW_REG = gPreviousShadow; + gModeSet = false; + } +} + + +bool halFastSurfaceClear(SurfaceT *s, uint8_t doubled) { + uint16_t fillWord; + + if (s == NULL) { + return false; + } + if (s != stageGet()) { + return false; + } + fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)); + iigsSurfaceClearInner(s->pixels, fillWord); + return true; +} + + +bool halFastFillRect(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t colorIndex) { + int16_t pxStart; + int16_t pxEnd; + int16_t midStart; + int16_t midBytes; + int16_t trailingByte; + int16_t leadingByte; + bool hasLeading; + bool hasTrailing; + int16_t row; + uint8_t *line; + uint16_t fillWord; + uint8_t nibble; + uint8_t doubled; + + if (s == NULL) { + return false; + } + if (s != stageGet()) { + return false; + } + + pxStart = x; + pxEnd = (int16_t)(x + (int16_t)w); + leadingByte = (int16_t)(pxStart >> 1); + 
hasLeading = (pxStart & 1) != 0; + if (hasLeading) { + pxStart++; + } + midStart = (int16_t)(pxStart >> 1); + midBytes = (int16_t)((pxEnd - pxStart) >> 1); + hasTrailing = ((pxEnd - pxStart) & 1) != 0; + trailingByte = (int16_t)(midStart + midBytes); + + if (midBytes <= 0) { + return false; + } + + nibble = (uint8_t)(colorIndex & 0x0F); + doubled = (uint8_t)((nibble << 4) | nibble); + + if (hasLeading || hasTrailing) { + for (row = 0; row < (int16_t)h; row++) { + line = &s->pixels[(y + row) * SURFACE_BYTES_PER_ROW]; + if (hasLeading) { + line[leadingByte] = (uint8_t)((line[leadingByte] & 0xF0) | nibble); + } + if (hasTrailing) { + line[trailingByte] = (uint8_t)((line[trailingByte] & 0x0F) | (nibble << 4)); + } + } + } + + fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)); + line = &s->pixels[y * SURFACE_BYTES_PER_ROW + midStart]; + iigsFillRectStageInner(line, (uint16_t)midBytes, h, fillWord); + return true; +} + + +bool halFastTileCopy(uint8_t *dstRow0, const uint8_t *srcRow0) { + iigsTileCopyInner(dstRow0, srcRow0); + return true; +} + + +bool halFastTileCopyMasked(uint8_t *dstRow0, const uint8_t *srcRow0, uint8_t transparent) { + iigsTileCopyMaskedInner(dstRow0, srcRow0, (uint16_t)transparent); + return true; +} + + +bool halFastTilePaste(uint8_t *dstRow0, const uint8_t *srcTilePixels) { + iigsTilePasteInner(dstRow0, srcTilePixels); + return true; +} + + +bool halFastTileSnap(uint8_t *dstTilePixels, const uint8_t *srcRow0) { + iigsTileSnapInner(dstTilePixels, srcRow0); + return true; +} + +bool halFastDrawPixel(SurfaceT *s, uint16_t x, uint16_t y, uint8_t colorIndex) { + if (s == NULL) { + return false; + } + iigsDrawPixelInner(s->pixels, x, y, (uint16_t)(colorIndex & 0x0F)); + return true; +} + + +bool halFastDrawLine(SurfaceT *s, int16_t x0, int16_t y0, int16_t x1, int16_t y1, uint8_t colorIndex) { + if (s == NULL) { + return false; + } + iigsDrawLineInner(s->pixels, + (uint16_t)x0, (uint16_t)y0, + (uint16_t)x1, (uint16_t)y1, + 
(uint16_t)(colorIndex & 0x0F)); + return true; +} + + +bool halFastDrawCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + if (s == NULL) { + return false; + } + iigsDrawCircleInner(s->pixels, + (uint16_t)cx, (uint16_t)cy, r, + (uint16_t)(colorIndex & 0x0F)); + return true; +} + + +bool halFastFillCircle(SurfaceT *s, int16_t cx, int16_t cy, uint16_t r, uint8_t colorIndex) { + uint16_t fillWord; + uint8_t nibble; + uint8_t doubled; + if (s == NULL) { + return false; + } + if (s != stageGet()) { + return false; + } + nibble = (uint8_t)(colorIndex & 0x0F); + doubled = (uint8_t)((nibble << 4) | nibble); + fillWord = (uint16_t)((uint16_t)doubled | ((uint16_t)doubled << 8)); + iigsFillCircleInner(s->pixels, (uint16_t)cx, (uint16_t)cy, r, fillWord); + return true; +} + + +bool halFastFloodWalk(uint8_t *row, int16_t startX, uint8_t matchColor, uint8_t newColor, bool matchEqual, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + if (row == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) { + return false; + } + iigsFloodWalkInner(row, (uint16_t)startX, + (uint16_t)(matchColor & 0x0F), + (uint16_t)(newColor & 0x0F), + (uint16_t)(matchEqual ? 1 : 0)); + *seedMatched = (gFloodSeedMatch != 0); + if (*seedMatched) { + *leftXOut = (int16_t)gFloodLeftX; + *rightXOut = (int16_t)gFloodRightX; + } + return true; +} + + +bool halFastFloodScanRow(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, uint8_t *markBuf) { + if (row == NULL || markBuf == NULL) { + return false; + } + iigsFloodScanRowInner(row, (uint16_t)leftX, (uint16_t)rightX, + (uint16_t)(matchColor & 0x0F), + (uint16_t)(newColor & 0x0F), + (uint16_t)(matchEqual ? 
1 : 0), + markBuf); + return true; +} + + +bool halFastFloodScanAndPush(uint8_t *row, int16_t leftX, int16_t rightX, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t scanY, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp) { + if (row == NULL || stackX == NULL || stackY == NULL || spInOut == NULL) { + return false; + } + iigsFloodScanAndPushInner(row, + (uint16_t)leftX, (uint16_t)rightX, + (uint16_t)(matchColor & 0x0F), + (uint16_t)(newColor & 0x0F), + (uint16_t)(matchEqual ? 1 : 0), + (uint16_t)scanY, + stackX, stackY, + (uint16_t *)spInOut, + (uint16_t)maxSp); + return true; +} + + +bool halFastFloodWalkAndScans(uint8_t *pixels, int16_t x, int16_t y, uint8_t matchColor, uint8_t newColor, bool matchEqual, int16_t *stackX, int16_t *stackY, int16_t *spInOut, int16_t maxSp, bool *seedMatched, int16_t *leftXOut, int16_t *rightXOut) { + if (pixels == NULL || stackX == NULL || stackY == NULL || spInOut == NULL || seedMatched == NULL || leftXOut == NULL || rightXOut == NULL) { + return false; + } + iigsFloodWalkAndScansInner(pixels, + (uint16_t)x, (uint16_t)y, + (uint16_t)(matchColor & 0x0F), + (uint16_t)(newColor & 0x0F), + (uint16_t)(matchEqual ? 
1 : 0), + stackX, stackY, + (uint16_t *)spInOut, + (uint16_t)maxSp); + *seedMatched = (gFloodSeedMatch != 0); + *leftXOut = (int16_t)gFloodLeftX; + *rightXOut = (int16_t)gFloodRightX; + return true; +} + + +bool halFastBlitRect(uint8_t *dstRow0, int16_t dstX, const uint8_t *srcRow0, int16_t srcX, int16_t copyW, int16_t copyH, int16_t srcRowBytes, uint16_t transparent) { + if (dstRow0 == NULL || srcRow0 == NULL || copyW <= 0 || copyH <= 0) { + return false; + } + iigsBlitRectInner(dstRow0, (uint16_t)dstX, + srcRow0, (uint16_t)srcX, + (uint16_t)copyW, (uint16_t)copyH, + (uint16_t)srcRowBytes, + transparent); + return true; +} + + +bool halFastTileFill(SurfaceT *s, uint8_t bx, uint8_t by, uint16_t fillWord) { + uint8_t *row; + uint16_t pixelX; + uint16_t pixelY; + + if (s == NULL) { + return false; + } + pixelX = (uint16_t)((uint16_t)bx * 8u); + pixelY = (uint16_t)((uint16_t)by * 8u); + row = &s->pixels[pixelY * SURFACE_BYTES_PER_ROW + (pixelX >> 1)]; + iigsTileFillInner(row, fillWord); + return true; +} + + +uint8_t *halStageAllocPixels(void) { + return IIGS_STAGE_PIXELS; +} + + +void halStageFreePixels(uint8_t *pixels) { + (void)pixels; + // Backing memory is hardware-pinned; nothing to free. +} + + // $C019 RDVBLBAR: bit 7 = 0 during vertical blank, 1 during active // scan. To produce a rising-edge wait (one VBL per call), first spin // while VBL is currently active (bit 7 = 0), then spin until VBL @@ -146,12 +543,3 @@ void halWaitVBL(void) { /* scanning: wait for next VBL */; } } - - -void halShutdown(void) { - if (gModeSet) { - *IIGS_NEWVIDEO_REG = gPreviousNewVideo; - *IIGS_BORDER_REG = gPreviousBorder; - gModeSet = false; - } -} diff --git a/src/port/iigs/joeyDraw.asm b/src/port/iigs/joeyDraw.asm new file mode 100644 index 0000000..c1dd426 --- /dev/null +++ b/src/port/iigs/joeyDraw.asm @@ -0,0 +1,3865 @@ +* joeyDraw.asm - Apple IIgs ASM inner loops for graphics primitives. +* +* Each function is the FAST INNER LOOP only. 
The C wrappers in +* src/core/draw.c and src/core/tile.c handle: +* - NULL / out-of-range argument validation +* - clipping math (Cohen-Sutherland for lines, simple AABB clip +* for rects, etc.) +* - dirty-rect marking after the fact (surfaceMarkDirtyRect / +* surfaceMarkDirtyAll) +* The inner loops trust their inputs: pre-clipped rect, byte-aligned +* destination, no NULL pointers, etc. +* +* Calling convention: ORCA-C cross-load-segment cdecl. +* - These routines live in the IIGSASM load segment; the C +* wrappers in iigs/hal.c live in DRAWPRIMS. ORCA-C emits JSL +* (3-byte return: PCL, PCH, PCB) for cross-load-segment calls +* regardless of memory model, so we use RTL to return and the +* stack offsets below are the same whether the binary was built +* with -b or not. +* - Caller pushes args right-to-left. +* - On JSL entry: SP+4 is the first arg byte 0. +* - After PHP at function entry: SP+5 is the first arg byte 0 +* (PHP shifts every offset by +1). After PHP+PHB: SP+6. After +* PHP+PHB+PHD: SP+8. +* - Pointer args are 4 bytes (low 16, then high byte = bank, then +* 1 padding byte). +* - 8-bit args (uint8_t etc.) are passed as int-promoted 16-bit. +* +* Each function declares its own ORCA object segment with a load- +* segment name of DRAWPRIMS, so the GS/OS loader places them +* together in their own bank rather than in _ROOT (see +* docs/DESIGN.md and the original ORCA/M for IIgs ch. 6 "Load +* Segments"). + + keep JOEYDRAW + case on + + +**************************************************************** +* Repeated patterns inlined throughout this file (ORCA/M's macro +* support did not accept our positional-parameter syntax so we expand +* by hand). Two recurring sequences worth naming: +* +* y -> row byte offset (a.k.a. y*160) via gRowOffsetLut. A holds y on +* entry; A holds y*160 on exit. Trashes X. 
Used in every primitive +* that needs row offsets: +* asl a ; A = y*2 (LUT byte offset) +* tax +* lda >gRowOffsetLut,x ; A = y*160 +* +* High-nibble RMW: byte at [PTR],Y in DP-indirect-long form, with the +* new (nib<<4) byte in NIBHI: +* lda [PTR],y +* and #$0F +* ora >NIBHI +* sta [PTR],y +* +* Low-nibble RMW: same shape, with the new nibble (low half only) in +* NIBLO: +* lda [PTR],y +* and #$F0 +* ora >NIBLO +* sta [PTR],y +**************************************************************** + + +**************************************************************** +* iigsProbeArgs(uint8_t *ptr, uint16_t a, uint16_t b) +* +* Pure diagnostic. Captures the 16-bit value at each of stack +* offsets 1..15 (after our PHP) into the gProbe1..gProbe15 scratch +* slots. C reads them back and prints to JOEYLOG. Lets us nail +* down ORCA's actual cdecl/-b convention by experiment instead of +* guessing. +**************************************************************** + +iigsProbeArgs start IIGSASM + php + rep #$30 + + lda 1,s + sta >gProbe1 + lda 2,s + sta >gProbe2 + lda 3,s + sta >gProbe3 + lda 4,s + sta >gProbe4 + lda 5,s + sta >gProbe5 + lda 6,s + sta >gProbe6 + lda 7,s + sta >gProbe7 + lda 8,s + sta >gProbe8 + lda 9,s + sta >gProbe9 + lda 10,s + sta >gProbe10 + lda 11,s + sta >gProbe11 + lda 12,s + sta >gProbe12 + lda 13,s + sta >gProbe13 + lda 14,s + sta >gProbe14 + lda 15,s + sta >gProbe15 + + plp + rtl + end + + +gProbe1 data DRAWPRIMS + ds 2 + end +gProbe2 data DRAWPRIMS + ds 2 + end +gProbe3 data DRAWPRIMS + ds 2 + end +gProbe4 data DRAWPRIMS + ds 2 + end +gProbe5 data DRAWPRIMS + ds 2 + end +gProbe6 data DRAWPRIMS + ds 2 + end +gProbe7 data DRAWPRIMS + ds 2 + end +gProbe8 data DRAWPRIMS + ds 2 + end +gProbe9 data DRAWPRIMS + ds 2 + end +gProbe10 data DRAWPRIMS + ds 2 + end +gProbe11 data DRAWPRIMS + ds 2 + end +gProbe12 data DRAWPRIMS + ds 2 + end +gProbe13 data DRAWPRIMS + ds 2 + end +gProbe14 data DRAWPRIMS + ds 2 + end +gProbe15 data DRAWPRIMS + ds 2 + end + + 
+**************************************************************** +* iigsSurfaceClearInner(uint8_t *pixels, uint16_t fillWord) +* +* MVN-based fill of the 32000-byte stage at $01:$2000-$9CFF with +* fillWord (16-bit pattern, both bytes the same). Seed first 2 +* bytes via long-mode STA, then MVN $01,$01 propagates the seed to +* the rest. ~7 cyc/byte; stays in M=16 throughout (no SEP/REP toggle +* in the body). SEI brackets to mask IRQs across MVN. +* +* MUST live in DRAWPRIMS load segment (same as the C caller +* halFastSurfaceClear). Putting this function in its own load segment +* causes ORCA-C's cross-segment call to silently land in a stub that +* RTLs without invoking the body -- looks like a no-op. +* +* The `pixels` arg is ignored; the stage is hardcoded at $01:$2000 +* (the C wrapper guarantees s == stageGet() before calling). +* +* Args after PHP only: pixels at SP+5..8, fillWord at SP+9..10. +**************************************************************** + +**************************************************************** +* iigsInitRowLut(void) +* +* One-shot init for the y*160 lookup table at gRowOffsetLut. C calls +* this once from halInit; afterwards every primitive that needs to +* convert y -> row byte offset can do `lda >gRowOffsetLut,x` (X = y*2) +* in 6 cyc instead of the 7-instruction shift-add (~22 cyc). 200 +* entries x 2 bytes = 400 bytes in DRAWPRIMS data. 
+**************************************************************** + +iigsInitRowLut start DRAWPRIMS + php + rep #$30 + LONGA ON + LONGI ON + ldx #0 ; X = byte offset into LUT + lda #0 ; A = current y*160 +initLutLoop anop + sta >gRowOffsetLut,x + clc + adc #160 + inx + inx + cpx #400 + bcc initLutLoop + LONGA OFF + LONGI OFF + plp + rtl + end + + +gRowOffsetLut data DRAWPRIMS + ds 400 + end + + +* iigsSurfaceClearInner body. fillWord is read from 9,s while only +* PHP is on the stack, then stored to $01:$2000 as the 2-byte seed. +* MVN $01->$01 with X=$2000 (src), Y=$2002 (dst), A=31997 copies +* 31998 bytes, rippling the seed through the 32000-byte stage. +* MVN leaves DBR set to its destination bank ($01), so the caller's +* DBR is saved with PHB (after the 9,s read, so that offset is not +* shifted) and restored with PLB before returning. +iigsSurfaceClearInner start DRAWPRIMS + php + sei + rep #$30 ; M=16, X=16 + LONGA ON + LONGI ON + + lda 9,s ; A = fillWord + sta >$012000 ; seed first 2 bytes + + phb ; save caller's DBR (MVN overwrites it) + ldx #$2000 + ldy #$2002 + lda #31997 + mvn $010000,$010000 + plb ; restore caller's DBR + + LONGA OFF + LONGI OFF + plp + rtl + end + + +sclrPixelsCap data DRAWPRIMS + ds 2 + end +sclrFillCap data DRAWPRIMS + ds 2 + end + + +**************************************************************** +* Single 2-byte slot for surfaceClear's saved stack pointer. Other +* asm primitives (when they land) should keep state in registers +* and on the original stack frame -- adding more cross-segment +* data references tips the ORCA Linker's "Expression too complex" +* threshold for the smaller binaries. +**************************************************************** + +sclrSavedSp data DRAWPRIMS + ds 2 + end + + +**************************************************************** +* iigsTileFillInner(uint8_t *dstRow0, uint16_t fillWord) +* +* 16 STA abs,X stores at fixed offsets along a 160-byte stride. +* ~120 cyc per call vs ORCA-C's ~300. 
+**************************************************************** + +iigsTileFillInner start IIGSASM + php + rep #$30 + LONGA ON + LONGI ON + + lda 9,s ; fillWord + tay + lda 5,s ; dst offset + tax + + phb + sep #$20 + LONGA OFF + lda 8,s ; bank byte (post-PHB) + pha + plb + rep #$20 + LONGA ON + + tya + sta |0,x + sta |2,x + sta |160,x + sta |162,x + sta |320,x + sta |322,x + sta |480,x + sta |482,x + sta |640,x + sta |642,x + sta |800,x + sta |802,x + sta |960,x + sta |962,x + sta |1120,x + sta |1122,x + + LONGA OFF + LONGI OFF + plb + plp + rtl + end + + +**************************************************************** +* iigsFillRectStageInner(uint8_t *firstRow, uint16_t bytesPerRow, +* uint16_t rows, uint16_t fillWord) +* +* Per-row MVN fill at $01:firstRow + N*160 for N in [0..rows). Each +* row: seed the first byte via an M=8 long-mode indexed STA, then +* MVN $01,$01 with dst = src+1 propagates the seed to the rest of +* the row (A = bytesPerRow-2, so MVN copies bytesPerRow-1 bytes; +* total = 1 seed + bytesPerRow-1 = bytesPerRow bytes). Rows with +* bytesPerRow < 2 get the seed byte only. SEI bracketed. +* +* MVN leaves DBR set to its destination bank ($01), so the caller's +* DBR is saved with PHB once the args have been stashed and restored +* with PLB at fillrDone. +* +* MUST be in DRAWPRIMS load segment (see iigsSurfaceClearInner). +* +* Args after PHP only: +* firstRow at SP+5..8 (LO word at SP+5..6 -- only the LO word is +* used; bank is hardcoded $01) +* bytesPerRow at SP+9..10 +* rows at SP+11..12 +* fillWord at SP+13..14 +* +* Loop scratch lives in the fillr* data segments declared after the +* routine and is accessed with long (>label) addressing: +* fillrCurRow = running row offset within bank 1 (uint16_t) +* fillrRowsLeft = rows remaining (uint16_t) +* fillrBytes = bytesPerRow (uint16_t, cached for inner loop) +* fillrFillWord = fillWord; only the low byte is used as the seed +* NOTE(review): an earlier version of this comment described scratch +* at $01:$1F00..$1F07 and said `data DRAWDATA` labels hung the +* function; the code uses DRAWPRIMS data labels, not those addresses. 
+**************************************************************** + +iigsFillRectStageInner start DRAWPRIMS +* Per-row MVN fill of `bytesPerRow` doubled bytes per row across +* `rows` rows, advancing 160 bytes per row. firstRow's low 16 bits +* point into bank $01 (the stage). Each row: M=8 seed of one byte, +* then MVN $01,$01 propagates the seed to the rest of the row. +* +* Args after PHP+SEI: +* firstRow ptr at SP+5..8 (low 16 = byte offset within bank 1) +* bytesPerRow at SP+9..10 +* rows at SP+11..12 +* fillWord at SP+13..14 (low byte = doubled fill byte) + php + sei + rep #$30 + LONGA ON + LONGI ON + +* Stash args into scratch slots (they're easier to work with via long +* mode than re-reading from sp,s every iteration). + lda 5,s + sta >fillrCurRow ; running row offset within bank 1 + lda 9,s + sta >fillrBytes ; bytesPerRow + lda 11,s + sta >fillrRowsLeft ; rows + lda 13,s + sta >fillrFillWord ; doubled-byte fillWord + +* MVN below leaves DBR = $01; save the caller's DBR now that all +* stack-relative arg reads are done (offsets above assume PHP only). + phb + +fillrRowLoop anop + lda >fillrRowsLeft + bne fillrRowDoIt + brl fillrDone +fillrRowDoIt anop + +* Seed the first byte of this row with the doubled fill byte. Stay +* in M=16 to load curRow (16-bit), drop to M=8 just for the byte +* store. STX has no long-abs form, so go via TAX. + lda >fillrCurRow + tax + sep #$20 + LONGA OFF + lda >fillrFillWord ; M=8: just the low byte + sta >$010000,x + rep #$20 + LONGA ON + +* If bytesPerRow < 2 we are done with this row (seed sufficed). + lda >fillrBytes + cmp #2 + bcc fillrNextRow + +* Set up MVN: src=curRow (X), dst=curRow+1 (Y), count=bytesPerRow-2. 
+ lda >fillrCurRow + tax ; X = src offset + inc a + tay ; Y = dst offset = src + 1 + lda >fillrBytes + dec a + dec a ; A = bytesPerRow - 2 + mvn $010000,$010000 + +fillrNextRow anop + lda >fillrCurRow + clc + adc #160 + sta >fillrCurRow + lda >fillrRowsLeft + dec a + sta >fillrRowsLeft + brl fillrRowLoop + +fillrDone anop + plb ; restore caller's DBR (pushed before fillrRowLoop) + LONGA OFF + LONGI OFF + plp + rtl + end + + +fillrSeed data DRAWPRIMS + ds 2 ; 2 bytes -- read as 16-bit fillWord + end +fillrBytes data DRAWPRIMS + ds 2 + end +fillrCurRow data DRAWPRIMS + ds 2 + end +fillrRowsLeft data DRAWPRIMS + ds 2 + end + + +**************************************************************** +* Scratch slots for fillRect's saved SP / loop counters. Same +* rationale as sclrSavedSp -- avoid cross-segment data references +* from the inner loop. +**************************************************************** + +fillrSavedSp data DRAWPRIMS + ds 2 + end + +fillrWordCount data DRAWPRIMS + ds 2 + end + +fillrSpAdvance data DRAWPRIMS + ds 2 + end + +fillrFillWord data DRAWPRIMS + ds 2 + end + + +**************************************************************** +* Tile primitives (iigsTileCopyInner / iigsTileCopyMaskedInner / +* iigsTilePasteInner / iigsTileSnapInner) +* +* Common pattern: D is set to point into the calling stack frame +* so [0],y reaches the dst long pointer and [4],y reaches the src +* long pointer. Lets us cross banks (stage at $01, heap surfaces +* anywhere) without touching DBR. Dst/src banks may differ. +* +* Stack layout after PHP/PHB/PHD on entry to a 2-pointer routine +* (4 prologue bytes total). The empirical offset for "first arg +* byte 0" matches the existing iigsSurfaceClearInner / iigsTile- +* FillInner pattern: with PHP only, first arg is at SP+5; each +* extra prologue byte shifts by 1, so PHP+PHB+PHD puts first arg +* at SP+8. TSC + ADC #8 + TCD makes D = SP+8, then [0] reaches +* the dst long pointer and [4] reaches src. 
+* +* For tileCopyMasked the third arg (transparent, 16-bit) sits at +* D+8 with the same prologue. +**************************************************************** + + +**************************************************************** +* iigsTileCopyInner(uint8_t *dst, const uint8_t *src) +* +* Opaque copy of an 8x8 tile region (32 bytes) between two 4bpp +* surfaces with 160-byte row stride. Dst and src may be in any +* banks. ~340 cyc per tile vs ORCA-C's ~900. +**************************************************************** + +iigsTileCopyInner start IIGSASM +dst equ 0 +src equ 4 + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + + tsc + clc + adc #8 + tcd ; D = SP+8 = dst arg + + ldy #0 + lda [src],y + sta [dst],y + ldy #2 + lda [src],y + sta [dst],y + ldy #160 + lda [src],y + sta [dst],y + ldy #162 + lda [src],y + sta [dst],y + ldy #320 + lda [src],y + sta [dst],y + ldy #322 + lda [src],y + sta [dst],y + ldy #480 + lda [src],y + sta [dst],y + ldy #482 + lda [src],y + sta [dst],y + ldy #640 + lda [src],y + sta [dst],y + ldy #642 + lda [src],y + sta [dst],y + ldy #800 + lda [src],y + sta [dst],y + ldy #802 + lda [src],y + sta [dst],y + ldy #960 + lda [src],y + sta [dst],y + ldy #962 + lda [src],y + sta [dst],y + ldy #1120 + lda [src],y + sta [dst],y + ldy #1122 + lda [src],y + sta [dst],y + + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + end + + +**************************************************************** +* iigsTilePasteInner(uint8_t *dst, const uint8_t *src) +* +* Paste a packed TileT buffer (4 bytes/row * 8 rows = 32 bytes +* tight) onto a surface row at `dst`. Src stride 4, dst stride 160. +* Both args are 4-byte large-model pointers. 
+**************************************************************** + +iigsTilePasteInner start IIGSASM +dst equ 0 +src equ 4 + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + + tsc + clc + adc #8 + tcd + +* Row 0: src word 0 = 0, dst word 0 = 0 + ldy #0 + lda [src],y + sta [dst],y + ldy #2 + lda [src],y + sta [dst],y +* Row 1: src 4/6, dst 160/162 + ldy #4 + lda [src],y + ldy #160 + sta [dst],y + ldy #6 + lda [src],y + ldy #162 + sta [dst],y +* Row 2: src 8/10, dst 320/322 + ldy #8 + lda [src],y + ldy #320 + sta [dst],y + ldy #10 + lda [src],y + ldy #322 + sta [dst],y +* Row 3: src 12/14, dst 480/482 + ldy #12 + lda [src],y + ldy #480 + sta [dst],y + ldy #14 + lda [src],y + ldy #482 + sta [dst],y +* Row 4: src 16/18, dst 640/642 + ldy #16 + lda [src],y + ldy #640 + sta [dst],y + ldy #18 + lda [src],y + ldy #642 + sta [dst],y +* Row 5: src 20/22, dst 800/802 + ldy #20 + lda [src],y + ldy #800 + sta [dst],y + ldy #22 + lda [src],y + ldy #802 + sta [dst],y +* Row 6: src 24/26, dst 960/962 + ldy #24 + lda [src],y + ldy #960 + sta [dst],y + ldy #26 + lda [src],y + ldy #962 + sta [dst],y +* Row 7: src 28/30, dst 1120/1122 + ldy #28 + lda [src],y + ldy #1120 + sta [dst],y + ldy #30 + lda [src],y + ldy #1122 + sta [dst],y + + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + end + + +**************************************************************** +* iigsTileSnapInner(uint8_t *dst, const uint8_t *src) +* +* Snapshot an 8x8 region of a surface into a packed TileT buffer. +* Src stride 160, dst stride 4. Mirrors tilePaste with stride +* values swapped. 
+**************************************************************** + +iigsTileSnapInner start IIGSASM +dst equ 0 +src equ 4 + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + + tsc + clc + adc #8 + tcd + +* Row 0: src 0/2, dst 0/2 + ldy #0 + lda [src],y + sta [dst],y + ldy #2 + lda [src],y + sta [dst],y +* Row 1: src 160/162, dst 4/6 + ldy #160 + lda [src],y + ldy #4 + sta [dst],y + ldy #162 + lda [src],y + ldy #6 + sta [dst],y +* Row 2: src 320/322, dst 8/10 + ldy #320 + lda [src],y + ldy #8 + sta [dst],y + ldy #322 + lda [src],y + ldy #10 + sta [dst],y +* Row 3: src 480/482, dst 12/14 + ldy #480 + lda [src],y + ldy #12 + sta [dst],y + ldy #482 + lda [src],y + ldy #14 + sta [dst],y +* Row 4: src 640/642, dst 16/18 + ldy #640 + lda [src],y + ldy #16 + sta [dst],y + ldy #642 + lda [src],y + ldy #18 + sta [dst],y +* Row 5: src 800/802, dst 20/22 + ldy #800 + lda [src],y + ldy #20 + sta [dst],y + ldy #802 + lda [src],y + ldy #22 + sta [dst],y +* Row 6: src 960/962, dst 24/26 + ldy #960 + lda [src],y + ldy #24 + sta [dst],y + ldy #962 + lda [src],y + ldy #26 + sta [dst],y +* Row 7: src 1120/1122, dst 28/30 + ldy #1120 + lda [src],y + ldy #28 + sta [dst],y + ldy #1122 + lda [src],y + ldy #30 + sta [dst],y + + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + end + + +**************************************************************** +* iigsTileCopyMaskedInner(uint8_t *dst, const uint8_t *src, +* uint16_t transparent) +* +* Same shape as tileCopy (160/160 strides, 32 bytes), but source +* nibbles equal to `transparent` are skipped. Per-byte fast path: +* if both src nibbles == transparent the whole byte is skipped; +* otherwise build the output by mixing src/dst nibbles. +* +* Args: dst at SP+8..11, src at SP+12..15, transparent at SP+16..17. 
+**************************************************************** + +iigsTileCopyMaskedInner start IIGSASM +dst equ 0 +src equ 4 +trans equ 8 ; transparent (16-bit, low byte at D+8) + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + + tsc + clc + adc #8 + tcd ; D = SP+8 (dst arg) + +* Pre-compute scratch values from `transparent` (at D+8..9). + sep #$20 ; M=8 + LONGA OFF + + lda trans + and #$0F + sta >tmaskTLo ; tLo = T + + asl a + asl a + asl a + asl a + sta >tmaskTHi ; tHi = T<<4 + + ora >tmaskTLo + sta >tmaskDoubled ; doubled = (T<<4)|T + +* Walk 8 rows of 4 bytes. Y holds the byte offset; after the 3 +* in-row INYs Y is at row*160+3, so +157 lands at next row's +* start (row*160+160). + ldx #8 ; row counter + ldy #0 ; byte offset + +tmaskRowLoop anop + jsr tmaskByte ; byte 0 + iny + jsr tmaskByte ; byte 1 + iny + jsr tmaskByte ; byte 2 + iny + jsr tmaskByte ; byte 3 + + rep #$20 ; M=16 for arithmetic on Y + LONGA ON + tya + clc + adc #157 ; +156 stride, +1 because we're not at row+4 yet (we're at row+3) + tay + sep #$20 ; back to M=8 + LONGA OFF + + dex + bne tmaskRowLoop + + rep #$20 ; M=16 before epilogue + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + +* tmaskByte: in M=8 X=16, Y holds byte offset. +* - load src byte +* - if both nibbles transparent: skip +* - else: assemble output from src/dst nibbles per nibble's +* non-transparency, store +tmaskByte anop + lda [src],y ; A = src byte + cmp >tmaskDoubled + beq tmaskSkip ; both nibbles == transparent + + sta >tmaskSrc + +* Decide hi nibble. + and #$F0 + cmp >tmaskTHi + beq tmaskUseDstHi + sta >tmaskOutHi ; src hi (kept in $F0 form) + bra tmaskHiDone +tmaskUseDstHi anop + lda [dst],y + and #$F0 + sta >tmaskOutHi +tmaskHiDone anop + +* Decide lo nibble. 
+ lda >tmaskSrc + and #$0F + cmp >tmaskTLo + beq tmaskUseDstLo + sta >tmaskOutLo ; src lo + bra tmaskLoDone +tmaskUseDstLo anop + lda [dst],y + and #$0F + sta >tmaskOutLo +tmaskLoDone anop + + lda >tmaskOutHi + ora >tmaskOutLo + sta [dst],y + +tmaskSkip anop + rts + end + + +**************************************************************** +* Scratch for the masked-tile inner loop. Lives outside the +* primitive's stack frame because the per-byte subroutine call +* would otherwise need to thread state through registers. +**************************************************************** + +tmaskTLo data DRAWPRIMS + ds 1 + end + +tmaskTHi data DRAWPRIMS + ds 1 + end + +tmaskDoubled data DRAWPRIMS + ds 1 + end + +tmaskSrc data DRAWPRIMS + ds 1 + end + +tmaskOutHi data DRAWPRIMS + ds 1 + end + +tmaskOutLo data DRAWPRIMS + ds 1 + end + + +**************************************************************** +* iigsDrawPixelInner(uint8_t *pixels, uint16_t x, uint16_t y, +* uint16_t nibble) +* +* Plot one 4bpp pixel at (x,y) into a 320x200 surface. Caller has +* already validated bounds and stripped the high nibbles of `nibble` +* (only low 4 bits used). pixels is a 4-byte large-model pointer. +* +* Offset math: byte = y*160 + (x>>1). 160 = 128 + 32 = (y<<7)+(y<<5). +* Nibble half: x&1 == 0 picks high nibble (left pixel), else low. +**************************************************************** + +iigsDrawPixelInner start IIGSASM +pix equ 0 ; pixels long ptr +xx equ 4 ; x (16-bit) +yy equ 6 ; y (16-bit) +nib equ 8 ; nibble (low byte at D+8) + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + + tsc + clc + adc #8 + tcd + +* Compute byte offset = y*160 + (x>>1) into A. Use the LUT. 
+ lda yy + asl a ; A = y*2 (LUT byte offset) + tax + lda >gRowOffsetLut,x ; A = y*160 + sta >dpxlTmp + lda xx + lsr a ; A = x >> 1 + clc + adc >dpxlTmp ; A = byte offset within surface + tay + + sep #$20 ; M=8 + LONGA OFF + + lda xx ; A = x low byte (parity in LSB) + and #1 + bne dpxlOdd + +* Even x -> high nibble. + lda nib + asl a + asl a + asl a + asl a + sta >dpxlNibPart + lda [pix],y + and #$0F + ora >dpxlNibPart + sta [pix],y + bra dpxlDone + +dpxlOdd anop +* Odd x -> low nibble. + lda nib + and #$0F + sta >dpxlNibPart + lda [pix],y + and #$F0 + ora >dpxlNibPart + sta [pix],y + +dpxlDone anop + rep #$20 ; back to M=16 for epilogue + LONGA ON + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + end + + +dpxlTmp data DRAWPRIMS + ds 2 + end + +dpxlNibPart data DRAWPRIMS + ds 1 + end + + +**************************************************************** +* iigsDrawLineInner(uint8_t *pixels, uint16_t x0, uint16_t y0, +* uint16_t x1, uint16_t y1, uint16_t nibble) +* +* Bresenham line plot. Caller has clipped both endpoints into +* [0..319] x [0..199], so no per-pixel bounds check. +* +* Walks (dx, dy, sx, sy, err) and plots inline (no per-pixel +* function call). Each plot does the same parity-aware nibble +* RMW as drawPixel. State stored in DRAWDATA scratch because +* registers are too few for the full Bresenham state plus +* current pixel address. 
+**************************************************************** + +iigsDrawLineInner start IIGSASM +pix equ 0 ; pixels long ptr (D+0..3) +ax0 equ 4 ; x0 arg slot +ay0 equ 6 ; y0 +ax1 equ 8 ; x1 +ay1 equ 10 ; y1 +anib equ 12 ; nibble + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + + tsc + clc + adc #8 + tcd ; D = SP+8 = pixels arg + +* dx = |x1 - x0|, sx = sign(x1 - x0) + lda ax1 + sec + sbc ax0 + bpl dlnDxPos + eor #$FFFF + clc + adc #1 ; A = x0 - x1 (positive |dx|) + sta >dlnDx + lda #$FFFF + sta >dlnSx ; sx = -1 + bra dlnDxDone +dlnDxPos anop + sta >dlnDx + lda #1 + sta >dlnSx +dlnDxDone anop + +* dy = -|y1 - y0|, sy = sign(y1 - y0). Bresenham uses negative dy. + lda ay1 + sec + sbc ay0 + bpl dlnDyPos + sta >dlnDy ; A is negative; that's dy with the sign already + lda #$FFFF + sta >dlnSy ; sy = -1 + bra dlnDyDone +dlnDyPos anop +* y1 >= y0 -> raw dy positive; negate for Bresenham (-|y1-y0|). + eor #$FFFF + clc + adc #1 + sta >dlnDy + lda #1 + sta >dlnSy +dlnDyDone anop + +* err = dx + dy + lda >dlnDx + clc + adc >dlnDy + sta >dlnErr + +* Copy x0, y0 to running state slots. + lda ax0 + sta >dlnX + lda ay0 + sta >dlnY + +* Cache x1, y1 so the loop's compare is fast. + lda ax1 + sta >dlnX1 + lda ay1 + sta >dlnY1 + +* Cache nibble in 8-bit form. + sep #$20 + LONGA OFF + lda anib + and #$0F + sta >dlnNib + rep #$20 + LONGA ON + +dlnLoop anop +* --- Plot pixel at (dlnX, dlnY) --- LUT path. 
+ lda >dlnY + asl a ; A = y*2 (LUT byte offset) + tax + lda >gRowOffsetLut,x ; A = y*160 + sta >dlnTmp + lda >dlnX + lsr a ; x >> 1 + clc + adc >dlnTmp ; byte offset + tay + + sep #$20 + LONGA OFF + + lda >dlnX ; x low byte parity + and #1 + bne dlnPlotOdd + + lda >dlnNib + asl a + asl a + asl a + asl a + sta >dlnNibPart + lda [pix],y + and #$0F + ora >dlnNibPart + sta [pix],y + bra dlnPlotDone +dlnPlotOdd anop + lda >dlnNib + sta >dlnNibPart + lda [pix],y + and #$F0 + ora >dlnNibPart + sta [pix],y +dlnPlotDone anop + rep #$20 + LONGA ON + +* --- Loop test: if X==X1 and Y==Y1, done --- + lda >dlnX + cmp >dlnX1 + bne dlnStep + lda >dlnY + cmp >dlnY1 + bne dlnStep + brl dlnExit + +dlnStep anop +* e2 = err << 1 (signed). Compare e2 against dy and dx. + lda >dlnErr + asl a ; e2 = err * 2 + sta >dlnE2 + +* if (e2 >= dy) { err += dy; X += sx; } +* dy is negative, so signed-compare with BMI/BPL needed. +* Test e2 - dy >= 0 via SEC; SBC dy; BPL. + sec + sbc >dlnDy + bmi dlnSkipX ; e2 < dy + + lda >dlnErr + clc + adc >dlnDy + sta >dlnErr + + lda >dlnX + clc + adc >dlnSx + sta >dlnX + +dlnSkipX anop +* if (e2 <= dx) { err += dx; Y += sy; } +* Test dx - e2 >= 0 via dx - e2 (BPL means e2 <= dx). 
+ lda >dlnDx
+ sec
+ sbc >dlnE2
+ bmi dlnSkipY ; dx < e2
+
+ lda >dlnErr
+ clc
+ adc >dlnDx
+ sta >dlnErr
+
+ lda >dlnY
+ clc
+ adc >dlnSy
+ sta >dlnY
+
+dlnSkipY anop
+ brl dlnLoop
+
+dlnExit anop
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+ end
+
+
+dlnDx data DRAWPRIMS
+ ds 2
+ end
+dlnDy data DRAWPRIMS
+ ds 2
+ end
+dlnSx data DRAWPRIMS
+ ds 2
+ end
+dlnSy data DRAWPRIMS
+ ds 2
+ end
+dlnErr data DRAWPRIMS
+ ds 2
+ end
+dlnE2 data DRAWPRIMS
+ ds 2
+ end
+dlnX data DRAWPRIMS
+ ds 2
+ end
+dlnY data DRAWPRIMS
+ ds 2
+ end
+dlnX1 data DRAWPRIMS
+ ds 2
+ end
+dlnY1 data DRAWPRIMS
+ ds 2
+ end
+dlnNib data DRAWPRIMS
+ ds 1
+ end
+dlnNibPart data DRAWPRIMS
+ ds 1
+ end
+dlnTmp data DRAWPRIMS
+ ds 2
+ end
+
+
+****************************************************************
+* iigsDrawCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy,
+* uint16_t r, uint16_t nibble)
+*
+* Bresenham midpoint circle outline. Caller has verified that the
+* whole bounding box (cx-r..cx+r, cy-r..cy+r) fits inside the
+* surface, so the inner loop plots all 8 octants without per-pixel
+* clip checks. Each iteration computes 4 row-base byte offsets
+* (cy +/- y) * 160, (cy +/- x) * 160 then plots 8 pixels via the
+* dcPlotPx subroutine.
+*
+* r == 0 is safe: the loop guard is a SIGNED x - y test, so the
+* x = -1 produced by the first update ends the loop right after the
+* centre pixel has been plotted. (An unsigned compare would see
+* x = $FFFF as "x >= y" and keep plotting with wild offsets.)
+* The signed test assumes x and y stay far below $8000, which the
+* caller's bounding-box check guarantees.
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8 = pixels arg):
+* pix at D+0..3 (long ptr, used via [pix],y)
+* cx at D+4..5
+* cy at D+6..7
+* r at D+8..9
+* nibble at D+10..11
+****************************************************************
+
+iigsDrawCircleInner start IIGSASM
+pix equ 0
+acx equ 4
+acy equ 6
+ar equ 8
+anib equ 10
+
+ php
+ phb
+ phd
+ rep #$30
+ LONGA ON
+ LONGI ON
+
+ tsc
+ clc
+ adc #8
+ tcd ; D = SP+8 = pix arg
+
+* Cache the nibble in 8-bit form: dcNibLo = nib, dcNibHi = nib<<4.
+ lda anib
+ and #$000F
+ sta >dcNibLo ; low byte = nib, high byte = 0
+ asl a
+ asl a
+ asl a
+ asl a
+ sta >dcNibHi ; low byte = nib<<4
+
+* x = r, y = 0, err = 1 - x.
+ lda ar
+ sta >dcX
+ lda #0
+ sta >dcY
+ lda #1
+ sec
+ sbc ar
+ sta >dcErr
+
+dcLoop anop
+* Loop guard: if x < y we are done. Signed test on x - y so that
+* x = -1 (reached when r == 0) terminates instead of wrapping.
+ lda >dcX
+ sec
+ sbc >dcY
+ bpl dcLoopBody ; x - y >= 0 -> continue
+ brl dcExit
+
+dcLoopBody anop
+* Compute 4 row bases via the row-offset lookup table.
+ lda acy
+ clc
+ adc >dcY
+ asl a ; A = (cy+y)*2 (LUT byte offset)
+ tax
+ lda >gRowOffsetLut,x ; A = (cy+y)*160
+ sta >dcRowYP
+ lda acy
+ sec
+ sbc >dcY
+ asl a ; A = (cy-y)*2 (LUT byte offset)
+ tax
+ lda >gRowOffsetLut,x ; A = (cy-y)*160
+ sta >dcRowYN
+ lda acy
+ clc
+ adc >dcX
+ asl a ; A = (cy+x)*2 (LUT byte offset)
+ tax
+ lda >gRowOffsetLut,x ; A = (cy+x)*160
+ sta >dcRowXP
+ lda acy
+ sec
+ sbc >dcX
+ asl a ; A = (cy-x)*2 (LUT byte offset)
+ tax
+ lda >gRowOffsetLut,x ; A = (cy-x)*160
+ sta >dcRowXN
+
+* 8 octant plots. dcPlotPx wants A=col, X=rowBase. LDX has no long-
+* absolute mode, so for each plot we stash col, load row via LDA/TAX,
+* then reload col into A.
+* Octants 1-4 use the y-row pair (cx +/- x, cy +/- y).
+ lda acx
+ clc
+ adc >dcX
+ sta >dcSavedCol
+ lda >dcRowYP
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx+x, cy+y)
+
+ lda acx
+ sec
+ sbc >dcX
+ sta >dcSavedCol
+ lda >dcRowYP
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx-x, cy+y)
+
+ lda acx
+ clc
+ adc >dcX
+ sta >dcSavedCol
+ lda >dcRowYN
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx+x, cy-y)
+
+ lda acx
+ sec
+ sbc >dcX
+ sta >dcSavedCol
+ lda >dcRowYN
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx-x, cy-y)
+
+* Octants 5-8 use the x-row pair (cx +/- y, cy +/- x).
+ lda acx
+ clc
+ adc >dcY
+ sta >dcSavedCol
+ lda >dcRowXP
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx+y, cy+x)
+
+ lda acx
+ sec
+ sbc >dcY
+ sta >dcSavedCol
+ lda >dcRowXP
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx-y, cy+x)
+
+ lda acx
+ clc
+ adc >dcY
+ sta >dcSavedCol
+ lda >dcRowXN
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx+y, cy-x)
+
+ lda acx
+ sec
+ sbc >dcY
+ sta >dcSavedCol
+ lda >dcRowXN
+ tax
+ lda >dcSavedCol
+ jsr dcPlotPx ; (cx-y, cy-x)
+
+* Update Bresenham: y++; if err <= 0: err += 2y+1; else x--; err += 2(y-x)+1.
+ lda >dcY
+ inc a
+ sta >dcY ; y now = old y + 1
+
+ lda >dcErr
+ bmi dcErrLE ; err < 0 -> err <= 0
+ bne dcErrGT ; err > 0
+* err == 0: take the LE branch (err += 2y+1).
+dcErrLE anop
+ lda >dcY
+ asl a
+ clc
+ adc #1
+ clc
+ adc >dcErr
+ sta >dcErr
+ brl dcLoop
+dcErrGT anop
+ lda >dcX
+ dec a
+ sta >dcX
+* err += 2*(y-x) + 1
+ lda >dcY
+ sec
+ sbc >dcX
+ asl a
+ clc
+ adc #1
+ clc
+ adc >dcErr
+ sta >dcErr
+ brl dcLoop
+
+dcExit anop
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+
+* Row bases come from gRowOffsetLut (see dcLoopBody); there is no
+* separate multiply-by-160 helper in this segment.
+
+****************************************************************
+* dcPlotPx: plot a pixel at column A, with row-base offset in X.
+* M=16, X=16 on entry. Trashes A, Y, P. D and B preserved.
+* Switches to M=8 for the byte RMW then back to M=16 for caller.
+* Even columns occupy the high nibble of the byte, odd columns the
+* low nibble.
+****************************************************************
+dcPlotPx anop
+ lsr a ; A = col>>1, C = col & 1
+ bcs dcPlotOdd
+* Even column: high nibble.
+ sta >dcMulTmp
+ txa
+ clc
+ adc >dcMulTmp
+ tay ; Y = rowBase + col/2
+ sep #$20
+ LONGA OFF
+ lda [pix],y
+ and #$0F
+ ora >dcNibHi
+ sta [pix],y
+ rep #$20
+ LONGA ON
+ rts
+dcPlotOdd anop
+ sta >dcMulTmp
+ txa
+ clc
+ adc >dcMulTmp
+ tay ; Y = rowBase + col/2
+ sep #$20
+ LONGA OFF
+ lda [pix],y
+ and #$F0
+ ora >dcNibLo
+ sta [pix],y
+ rep #$20
+ LONGA ON
+ rts
+ end
+
+
+****************************************************************
+* Scratch slots for circle outline state.
Live in DRAWPRIMS so the
+* asm subroutine's `>label` accesses match the load segment of the
+* parent function.
+*
+* dcX / dcY / dcErr hold the midpoint-loop state, dcNibLo / dcNibHi
+* the cached nibble forms, dcRowYP / dcRowYN / dcRowXP / dcRowXN the
+* four row-base offsets for the current step, and dcMulTmp /
+* dcSavedCol are temporaries used around dcPlotPx.
+****************************************************************
+
+dcX data DRAWPRIMS
+ ds 2
+ end
+dcY data DRAWPRIMS
+ ds 2
+ end
+dcErr data DRAWPRIMS
+ ds 2
+ end
+dcNibLo data DRAWPRIMS
+ ds 2
+ end
+dcNibHi data DRAWPRIMS
+ ds 2
+ end
+dcRowYP data DRAWPRIMS
+ ds 2
+ end
+dcRowYN data DRAWPRIMS
+ ds 2
+ end
+dcRowXP data DRAWPRIMS
+ ds 2
+ end
+dcRowXN data DRAWPRIMS
+ ds 2
+ end
+dcMulTmp data DRAWPRIMS
+ ds 2
+ end
+dcSavedCol data DRAWPRIMS
+ ds 2
+ end
+
+
+****************************************************************
+* iigsFillCircleInner(uint8_t *pixels, uint16_t cx, uint16_t cy,
+* uint16_t r, uint16_t fillWord)
+*
+* Filled circle with horizontal-span scanline output. Caller has
+* verified the bbox fits inside the surface, so the inner loop
+* fills every span unconditionally. Maintains xx = x*x and
+* yy = y*y incrementally so the hot path uses only 16-bit add /
+* sub / cmp -- no 65816 multiply at all (other than one r*r at
+* setup, which is a small once-off shift-and-add).
+*
+* fillWord is the doubled byte (low byte) replicated, e.g. nibble 3
+* -> $3333 (we only use the low byte for nibble RMW; the duplicate
+* high byte simplifies the C wrapper math).
+*
+* All squared quantities are kept in 16 bits (fcMul16 returns only
+* the low 16 bits of r*r), so r must be small enough that r*r fits.
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8):
+* pix at D+0..3 (long ptr)
+* cx at D+4..5
+* cy at D+6..7
+* r at D+8..9
+* fillWord at D+10..11
+****************************************************************
+
+iigsFillCircleInner start IIGSASM
+fpix equ 0
+fcx equ 4
+fcy equ 6
+fr equ 8
+ffill equ 10
+
+ php
+ phb
+ phd
+ rep #$30
+ LONGA ON
+ LONGI ON
+
+ tsc
+ clc
+ adc #8
+ tcd ; D = SP+8
+
+* Cache fill nibble bytes. ffill low byte = doubled byte (e.g., $33).
+* fcFillByte = doubled low byte (used for full byte fills + low nibble).
+* fcFillHi = (nibble << 4) only (used for high nibble RMW).
+* fcFillLo = nibble only (used for low nibble RMW).
+ lda ffill
+ and #$00FF
+ sta >fcFillByte ; doubled byte e.g. $33
+ lsr a
+ lsr a
+ lsr a
+ lsr a ; A = nibble
+ sta >fcFillLo
+ asl a
+ asl a
+ asl a
+ asl a ; A = nibble << 4
+ sta >fcFillHi
+
+* Compute r*r via 16-bit shift-and-add. Only done once.
+ lda fr
+ sta >fcMulA
+ lda fr
+ sta >fcMulB
+ jsr fcMul16
+ sta >fcR2
+ sta >fcXX ; xx = r*r initially
+
+* yy = 0, x = r, y = 0.
+ lda #0
+ sta >fcYY
+ sta >fcY
+ lda fr
+ sta >fcX
+
+* One outer iteration per y in 0..r: shrink x until xx + yy <= r2,
+* then fill row cy+y (and row cy-y when y > 0) from cx-x to cx+x.
+fcOuterLoop anop
+* Inner: while (xx + yy) > r2 then xx -= 2x - 1; x--.
+* (x*x - (2x - 1) = (x-1)*(x-1), so xx tracks the new x.)
+fcInnerLoop anop
+ lda >fcXX
+ clc
+ adc >fcYY
+ cmp >fcR2
+ bcc fcInnerDone ; xx+yy < r2
+ beq fcInnerDone ; xx+yy == r2 (acceptable)
+* xx + yy > r2 -> decrement
+ lda >fcX
+ asl a ; A = 2x
+ sec
+ sbc #1 ; A = 2x - 1
+ sta >fcDec
+ lda >fcXX
+ sec
+ sbc >fcDec
+ sta >fcXX
+ lda >fcX
+ dec a
+ sta >fcX
+ bra fcInnerLoop
+fcInnerDone anop
+
+* leftCol = cx - x, rightCol = cx + x for both lower and (if y>0) upper rows.
+ lda fcx
+ sec
+ sbc >fcX
+ sta >fcLeftCol
+ lda fcx
+ clc
+ adc >fcX
+ sta >fcRightCol
+
+* Lower row: rowBase = (cy + y) * 160
+ lda fcy
+ clc
+ adc >fcY
+ asl a ; A = y*2 (LUT byte offset)
+ tax
+ lda >gRowOffsetLut,x ; A = y*160
+ sta >fcSpanRowBase
+ jsr fcDoSpan
+
+* If y > 0: upper row at (cy - y).
+ lda >fcY
+ beq fcSkipUpper
+ lda fcy
+ sec
+ sbc >fcY
+ asl a ; A = y*2 (LUT byte offset)
+ tax
+ lda >gRowOffsetLut,x ; A = y*160
+ sta >fcSpanRowBase
+ jsr fcDoSpan
+fcSkipUpper anop
+
+* yy += 2*y + 1
+ lda >fcY
+ asl a
+ clc
+ adc #1
+ clc
+ adc >fcYY
+ sta >fcYY
+
+* y++
+ lda >fcY
+ inc a
+ sta >fcY
+
+* if y > r: exit, else loop
+ cmp fr
+ bcc fcContinue
+ beq fcContinue
+ brl fcExit
+fcContinue anop
+ brl fcOuterLoop
+
+fcExit anop
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+
+* fcMul160 deleted -- callers now expand the y160lut macro inline.
+
+****************************************************************
+* fcMul16: A = fcMulA * fcMulB (16-bit unsigned, low 16 of result).
+* Standard shift-and-add. 16 iterations. Caller has M=16, X=16.
+* fcMulA is shifted out (destroyed) in the process; X is used as the
+* iteration counter.
+****************************************************************
+fcMul16 anop
+ LONGA ON
+ LONGI ON
+ lda #0
+ sta >fcMulRes
+ ldx #16
+fcMulLoop anop
+* result <<= 1
+ lda >fcMulRes
+ asl a
+ sta >fcMulRes
+* multiplier <<= 1, MSB -> C
+ lda >fcMulA
+ asl a
+ sta >fcMulA
+ bcc fcMulSkip ; STA leaves C untouched, so this tests the shifted-out bit
+ lda >fcMulRes
+ clc
+ adc >fcMulB
+ sta >fcMulRes
+fcMulSkip anop
+ dex
+ bne fcMulLoop
+ lda >fcMulRes
+ rts
+
+****************************************************************
+* fcDoSpan: fill horizontal span from fcLeftCol to fcRightCol on
+* row whose byte base is in fcSpanRowBase. Uses byte-aligned fill
+* for the middle bytes and per-pixel RMW for the partial-nibble
+* leading/trailing bytes.
+*
+* Pixel packing as used below: even column = high nibble of the
+* byte, odd column = low nibble.
+*
+* Inputs: fcLeftCol, fcRightCol, fcSpanRowBase (M=16 values).
+* Trashes A, X, Y, P. Preserves D and B (relies on D = SP+8).
+****************************************************************
+fcDoSpan anop
+ LONGA ON
+ LONGI ON
+* leftByte = leftCol >> 1, leftPartial = leftCol & 1
+ lda >fcLeftCol
+ lsr a
+ sta >fcLeftByte
+ lda >fcLeftCol
+ and #1
+ sta >fcLeftPartial
+
+* rightByte = rightCol >> 1, rightPartial = !(rightCol & 1)
+ lda >fcRightCol
+ lsr a
+ sta >fcRightByte
+ lda >fcRightCol
+ and #1
+ eor #1
+ sta >fcRightPartial
+
+* If leftByte == rightByte: single-byte case
+ lda >fcLeftByte
+ cmp >fcRightByte
+ bne fcSpanMulti
+ brl fcSpanSingle
+fcSpanMulti anop
+
+* Multi-byte case.
+* Leading partial (low nibble of leftByte) if leftPartial.
+ lda >fcLeftPartial
+ beq fcSkipLP
+ lda >fcSpanRowBase
+ clc
+ adc >fcLeftByte
+ tay
+ sep #$20
+ LONGA OFF
+ lda [fpix],y
+ and #$F0
+ ora >fcFillLo
+ sta [fpix],y
+ rep #$20
+ LONGA ON
+fcSkipLP anop
+
+* Trailing partial (high nibble of rightByte) if rightPartial.
+ lda >fcRightPartial
+ beq fcSkipRP
+ lda >fcSpanRowBase
+ clc
+ adc >fcRightByte
+ tay
+ sep #$20
+ LONGA OFF
+ lda [fpix],y
+ and #$0F
+ ora >fcFillHi
+ sta [fpix],y
+ rep #$20
+ LONGA ON
+fcSkipRP anop
+
+* Mid bytes: from (leftByte + leftPartial) to (rightByte - rightPartial) inclusive.
+ lda >fcLeftByte
+ clc
+ adc >fcLeftPartial
+ sta >fcMidStart
+ lda >fcRightByte
+ sec
+ sbc >fcRightPartial
+ sta >fcMidEnd
+ cmp >fcMidStart
+ bcs fcMidGo
+ rts ; midEnd < midStart
+fcMidGo anop
+ lda >fcMidEnd
+ sec
+ sbc >fcMidStart
+ inc a
+ sta >fcMidCount
+
+ lda >fcSpanRowBase
+ clc
+ adc >fcMidStart
+ tay ; Y = byte offset
+ lda >fcMidCount
+ tax ; X = byte counter
+
+ sep #$20
+ LONGA OFF
+ lda >fcFillByte ; doubled byte
+fcFillLoop anop
+ sta [fpix],y
+ iny
+ dex
+ bne fcFillLoop
+ rep #$20
+ LONGA ON
+ rts
+
+fcSpanSingle anop
+* Single-byte case: leftByte == rightByte. Three sub-cases:
+* leftPartial=1 -> low nibble only (single pixel, leftCol odd, rightCol==leftCol)
+* rightPartial=1 -> high nibble only (single pixel, leftCol==rightCol even)
+* neither -> full byte (leftCol even, rightCol odd, two adjacent pixels)
+ lda >fcSpanRowBase
+ clc
+ adc >fcLeftByte
+ tay
+ sep #$20
+ LONGA OFF
+ lda >fcLeftPartial
+ beq fcSbCheckRP
+ lda [fpix],y
+ and #$F0
+ ora >fcFillLo
+ sta [fpix],y
+ rep #$20
+ LONGA ON
+ rts
+fcSbCheckRP anop
+ LONGA OFF ; reached via BEQ from M=8 path
+ lda >fcRightPartial
+ beq fcSbFull
+ lda [fpix],y
+ and #$0F
+ ora >fcFillHi
+ sta [fpix],y
+ rep #$20
+ LONGA ON
+ rts
+fcSbFull anop
+ LONGA OFF ; reached via BEQ from M=8 path
+ lda >fcFillByte
+ sta [fpix],y
+ rep #$20
+ LONGA ON
+ rts
+ end
+
+
+****************************************************************
+* Scratch slots for filled circle.
+****************************************************************
+
+fcX data DRAWPRIMS
+ ds 2
+ end
+fcY data DRAWPRIMS
+ ds 2
+ end
+fcXX data DRAWPRIMS
+ ds 2
+ end
+fcYY data DRAWPRIMS
+ ds 2
+ end
+fcR2 data DRAWPRIMS
+ ds 2
+ end
+fcDec data DRAWPRIMS
+ ds 2
+ end
+fcMulA data DRAWPRIMS
+ ds 2
+ end
+fcMulB data DRAWPRIMS
+ ds 2
+ end
+fcMulRes data DRAWPRIMS
+ ds 2
+ end
+fcLeftCol data DRAWPRIMS
+ ds 2
+ end
+fcRightCol data DRAWPRIMS
+ ds 2
+ end
+fcSpanRowBase data DRAWPRIMS
+ ds 2
+ end
+fcLeftByte data DRAWPRIMS
+ ds 2
+ end
+fcRightByte data DRAWPRIMS
+ ds 2
+ end
+fcLeftPartial data DRAWPRIMS
+ ds 2
+ end
+fcRightPartial data DRAWPRIMS
+ ds 2
+ end
+fcMidStart data DRAWPRIMS
+ ds 2
+ end
+fcMidEnd data DRAWPRIMS
+ ds 2
+ end
+fcMidCount data DRAWPRIMS
+ ds 2
+ end
+fcFillByte data DRAWPRIMS
+ ds 2
+ end
+fcFillLo data DRAWPRIMS
+ ds 2
+ end
+fcFillHi data DRAWPRIMS
+ ds 2
+ end
+
+
+****************************************************************
+* iigsFloodWalkInner(uint8_t *row, uint16_t startX,
+* uint8_t matchColor, uint8_t newColor,
+* uint8_t matchEqual)
+*
+* Combined seed test + walk-left + walk-right for floodFill.
+* Tests pixel at startX against the match criterion. If it matches,
+* walks left and right finding the contiguous run of matching pixels.
+*
+* Match criterion:
+* matchEqual != 0: pix == matchColor (used by floodFill)
+* matchEqual == 0: pix != matchColor && pix != newColor
+* (used by floodFillBounded)
+*
+* Outputs (DRAWPRIMS globals):
+* gFloodSeedMatch -- 1 if seed pixel matches, 0 otherwise
+* gFloodLeftX -- leftmost matching column
+* gFloodRightX -- rightmost matching column
+*
+* If seed doesn't match, leftX/rightX are not meaningful; caller
+* should bail out on gFloodSeedMatch == 0.
+*
+* The walk never leaves columns 0..319 (see the bounds checks in the
+* two loops); no pixel is written by this routine.
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8):
+* row at D+0..3 (long ptr to row's first byte)
+* startX at D+4..5
+* matchColor at D+6..7 (low byte)
+* newColor at D+8..9 (low byte)
+* matchEqual at D+10..11 (low byte: 0 = bounded mode)
+****************************************************************
+
+iigsFloodWalkInner start IIGSASM
+fwrow equ 0
+fwstart equ 4
+fwmatch equ 6
+fwnew equ 8
+fweq equ 10
+
+ php
+ phb
+ phd
+ rep #$30
+ LONGA ON
+ LONGI ON
+
+ tsc
+ clc
+ adc #8
+ tcd ; D = SP+8
+
+* Cache match nibble + new nibble + matchEqual flag for fast
+* comparisons. Use M=16 stores (high byte = 0 since values are
+* small).
+ lda fwmatch
+ and #$00FF
+ sta >fwMatchNib
+ lda fwnew
+ and #$00FF
+ sta >fwNewNib
+ lda fweq
+ and #$00FF
+ sta >fwEqFlag
+
+* Seed test at startX.
+ lda fwstart
+ tax
+ jsr fwTest ; A = 1 if match, 0 if not
+ sta >gFloodSeedMatch
+ cmp #0
+ bne fwSeedOk
+ brl fwExit ; seed doesn't match
+fwSeedOk anop
+
+* Walk left: leftX = startX; while leftX > 0 and pix(leftX-1) match: leftX--.
+ lda fwstart
+ sta >gFloodLeftX
+fwLeftLoop anop
+ lda >gFloodLeftX
+ beq fwLeftDone ; leftX == 0
+ dec a
+ tax ; X = leftX - 1
+ jsr fwTest
+ cmp #0
+ beq fwLeftDone ; mismatch
+ lda >gFloodLeftX
+ dec a
+ sta >gFloodLeftX
+ brl fwLeftLoop
+fwLeftDone anop
+
+* Walk right: rightX = startX; while rightX < 319 and pix(rightX+1) match: rightX++.
+ lda fwstart
+ sta >gFloodRightX
+fwRightLoop anop
+ lda >gFloodRightX
+ cmp #319
+ bcs fwRightDone ; rightX >= 319
+ inc a
+ tax ; X = rightX + 1
+ jsr fwTest
+ cmp #0
+ beq fwRightDone
+ lda >gFloodRightX
+ inc a
+ sta >gFloodRightX
+ brl fwRightLoop
+fwRightDone anop
+
+fwExit anop
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+
+****************************************************************
+* fwTest: test pixel at column X against match criterion.
+* Input: X = column (M=16 X=16 on entry)
+* Output: A = 1 if pixel matches, 0 if not (M=16 on exit)
+* Trashes: A, Y, P. Preserves X, D, B.
+****************************************************************
+fwTest anop
+ LONGA ON
+ LONGI ON
+* byteOffset = X >> 1, parity = X & 1
+ txa
+ lsr a ; A = X>>1, C = X & 1
+ tay ; Y = byte offset
+ bcs fwTestOdd
+* Even X -- high nibble.
+ sep #$20
+ LONGA OFF
+ lda [fwrow],y
+ lsr a
+ lsr a
+ lsr a
+ lsr a ; A = high nibble
+ bra fwTestGotPix
+fwTestOdd anop
+ sep #$20
+ LONGA OFF
+ lda [fwrow],y
+ and #$0F ; A = low nibble
+fwTestGotPix anop
+* A holds nibble (M=8). Compare against match criterion.
+ cmp >fwMatchNib
+ bne fwNotMatchColor
+* pix == matchColor.
+ lda >fwEqFlag
+ bne fwTestMatch ; matchEqual=1 + pix==matchColor -> match
+ bra fwTestNoMatch ; matchEqual=0 + pix==matchColor -> NOT match
+fwNotMatchColor anop
+* pix != matchColor.
+ lda >fwEqFlag
+ bne fwTestNoMatch ; matchEqual=1 + pix!=matchColor -> NOT match
+* matchEqual=0 path: also need pix != newColor. Re-extract the nibble.
+* (The fwEqFlag load above overwrote the nibble that was in A.)
+ txa
+ and #$01 ; bit 0 = parity
+ bne fwNibLoCheck
+ lda [fwrow],y
+ lsr a
+ lsr a
+ lsr a
+ lsr a
+ bra fwGotNibForNew
+fwNibLoCheck anop
+ lda [fwrow],y
+ and #$0F
+fwGotNibForNew anop
+ cmp >fwNewNib
+ beq fwTestNoMatch
+ bra fwTestMatch
+fwTestMatch anop
+ rep #$20
+ LONGA ON
+ lda #1
+ rts
+fwTestNoMatch anop
+ rep #$20
+ LONGA ON
+ lda #0
+ rts
+ end
+
+****************************************************************
+* Globals for floodFill asm walk results. Live in DRAWPRIMS so
+* both asm and C see them at the same long address.
+* fwMatchNib / fwNewNib / fwEqFlag are the cached arguments that
+* fwTest compares against.
+****************************************************************
+
+gFloodSeedMatch data DRAWPRIMS
+ ds 2
+ end
+gFloodLeftX data DRAWPRIMS
+ ds 2
+ end
+gFloodRightX data DRAWPRIMS
+ ds 2
+ end
+fwMatchNib data DRAWPRIMS
+ ds 2
+ end
+fwNewNib data DRAWPRIMS
+ ds 2
+ end
+fwEqFlag data DRAWPRIMS
+ ds 2
+ end
+
+
+****************************************************************
+* iigsBlitStageToShr(uint8_t *scbPtr, uint16_t *palettePtr)
+*
+* Three-step blit:
+* 1. Pixels: MVN 32000 bytes from $01:$2000 to $E1:$2000.
+* 2. SCB: 200 bytes from caller's SCB to $E1:$9D00.
+* 3. Pal: 512 bytes from caller's palette to $E1:$9E00.
+*
+* After the pixel MVN, DBR is left at $E1 (MVN sets DBR to dst bank).
+* That lets the SCB/palette loops use sta abs,Y (16-bit absolute,
+* DBR-relative) instead of long-mode -- which has no Y-indexed form.
+* The caller's DBR is saved by PHB on entry and restored by PLB on
+* exit.
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8):
+* scbPtr at D+0..3 (long ptr)
+* palettePtr at D+4..7 (long ptr)
+****************************************************************
+
+iigsBlitStageToShr start DRAWPRIMS
+bscb equ 0
+bpal equ 4
+
+ php
+ phb
+ phd
+ rep #$30
+ LONGA ON
+ LONGI ON
+ tsc
+ clc
+ adc #8
+ tcd
+
+* 1. Pixel blit (DBR ends up = $E1 after MVN).
+* MVN takes X = source offset, Y = destination offset, A = count - 1.
+ ldx #$2000
+ ldy #$2000
+ lda #31999
+ mvn $010000,$E10000
+
+* 2. SCB upload (200 bytes). DBR = $E1, so sta abs,Y -> $E1:abs+Y.
+ ldy #0
+ sep #$20
+ LONGA OFF
+bscbLoop anop
+ cpy #200
+ beq bscbDone
+ lda [bscb],y
+ sta $9D00,y
+ iny
+ bra bscbLoop
+bscbDone anop
+ rep #$20
+ LONGA ON
+
+* 3. Palette upload (512 bytes).
+ ldy #0
+ sep #$20
+ LONGA OFF
+bpalLoop anop
+ cpy #512
+ beq bpalDone
+ lda [bpal],y
+ sta $9E00,y
+ iny
+ bra bpalLoop
+bpalDone anop
+ rep #$20
+ LONGA ON
+
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+ end
+
+
+
+****************************************************************
+* iigsFloodScanRowInner(uint8_t *row, uint16_t leftX,
+* uint16_t rightX, uint8_t matchColor,
+* uint8_t newColor, uint8_t matchEqual,
+* uint8_t *markBuf)
+*
+* Walk pixels [leftX..rightX] (inclusive) of `row`. For each, write
+* 1 to markBuf if the pixel "qualifies for flood":
+* matchEqual != 0: pix == matchColor
+* matchEqual == 0: pix != matchColor && pix != newColor
+* Else write 0.
+*
+* C side then walks markBuf for run-edge transitions (no per-pixel
+* nibble extract / function call) -- much faster than the C loop
+* with srcPixel.
+*
+* markBuf is indexed from 0: entry i describes column leftX + i.
+* Nothing is written when rightX < leftX.
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8):
+* row at D+0..3 (long ptr)
+* leftX at D+4..5
+* rightX at D+6..7
+* matchColor at D+8..9 (low byte)
+* newColor at D+10..11 (low byte)
+* matchEqual at D+12..13 (low byte)
+* markBuf at D+14..17 (long ptr)
+****************************************************************
+
+iigsFloodScanRowInner start IIGSASM
+fsRow equ 0
+fsLeft equ 4
+fsRight equ 6
+fsMatch equ 8
+fsNew equ 10
+fsEq equ 12
+fsMark equ 14
+
+ php
+ phb
+ phd
+ rep #$30
+ LONGA ON
+ LONGI ON
+ tsc
+ clc
+ adc #8
+ tcd
+
+* spanLen = rightX - leftX + 1; bail if rightX < leftX.
+ lda fsRight
+ sec
+ sbc fsLeft
+ bpl fsSpanOk
+ brl fsExit
+fsSpanOk anop
+ inc a
+ sta >fsSpanLen
+
+* Cache 8-bit constants used in the inner loop.
+ sep #$20
+ LONGA OFF
+ lda fsMatch
+ and #$0F
+ sta >fsMatchByte
+ lda fsNew
+ and #$0F
+ sta >fsNewByte
+ lda fsEq
+ sta >fsEqByte
+ rep #$20
+ LONGA ON
+
+ ldy #0 ; Y = i = 0..spanLen-1
+
+fsLoop anop
+ tya
+ cmp >fsSpanLen
+ bcc fsBody
+ brl fsExit
+fsBody anop
+* curX = leftX + Y; byteIdx = curX >> 1; use Y for [fsRow],y read.
+* Y (= i) is parked on the stack while it doubles as the byte index.
+ phy
+ clc
+ adc fsLeft
+ sta >fsCurXTemp
+ lsr a
+ tay
+ sep #$20
+ LONGA OFF
+ lda [fsRow],y
+ sta >fsByte
+ rep #$20
+ LONGA ON
+ ply
+
+* M=8 for the rest of the inner body.
+ sep #$20
+ LONGA OFF
+ lda >fsCurXTemp ; A = low byte of curX
+ lsr a ; carry = curX & 1
+ bcs fsOddPix
+* Even column: pixel is the high nibble.
+ lda >fsByte
+ lsr a
+ lsr a
+ lsr a
+ lsr a
+ bra fsHaveNib
+fsOddPix anop
+* Odd column: pixel is the low nibble.
+ lda >fsByte
+ and #$0F
+fsHaveNib anop
+ sta >fsPixNib
+
+* Decide the mark: see the match criterion in the routine header.
+ cmp >fsMatchByte
+ bne fsNotMatch
+ lda >fsEqByte
+ bne fsOne ; equal mode, pix == matchColor -> 1
+ bra fsZero ; bounded mode, pix == matchColor -> 0
+fsNotMatch anop
+ lda >fsEqByte
+ bne fsZero ; equal mode, pix != matchColor -> 0
+ lda >fsPixNib
+ cmp >fsNewByte
+ beq fsZero ; bounded mode, pix == newColor -> 0
+fsOne anop
+ lda #1
+ sta [fsMark],y ; Y = i again (restored by PLY above)
+ bra fsNextIter
+fsZero anop
+ lda #0
+ sta [fsMark],y
+fsNextIter anop
+ rep #$20
+ LONGA ON
+ iny
+ brl fsLoop
+
+fsExit anop
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+ end
+
+
+* Scratch slots for iigsFloodScanRowInner.
+fsSpanLen data DRAWPRIMS
+ ds 2
+ end
+fsCurXTemp data DRAWPRIMS
+ ds 2
+ end
+fsMatchByte data DRAWPRIMS
+ ds 2
+ end
+fsNewByte data DRAWPRIMS
+ ds 2
+ end
+fsEqByte data DRAWPRIMS
+ ds 2
+ end
+fsByte data DRAWPRIMS
+ ds 2
+ end
+fsPixNib data DRAWPRIMS
+ ds 2
+ end
+
+
+****************************************************************
+* iigsFloodScanAndPushInner(uint8_t *row, uint16_t leftX,
+* uint16_t rightX, uint16_t matchColor,
+* uint16_t newColor, uint16_t matchEqual,
+* uint16_t scanY,
+* int16_t *stackX, int16_t *stackY,
+* uint16_t *spInOut, uint16_t maxSp)
+*
+* Combined per-pixel match scan + run-edge walk + seed push. Replaces
+* the C loop that follows iigsFloodScanRowInner and walks markBuf for
+* push edges. Doing it all in one asm call eliminates the per-pixel C
+* loop overhead (call setup + array indexing under -b 4-byte ptrs).
+*
+* For every run of qualifying pixels inside [leftX..rightX] one seed
+* is pushed: the run's last column paired with scanY. Pushes are
+* silently dropped once sp reaches maxSp (see fpPushXY). When
+* rightX < leftX the routine returns without touching *spInOut.
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8):
+* row D+0..3 (long ptr)
+* leftX D+4..5
+* rightX D+6..7
+* matchColor D+8..9 (low byte)
+* newColor D+10..11
+* matchEqual D+12..13
+* scanY D+14..15
+* stackX D+16..19 (long ptr)
+* stackY D+20..23 (long ptr)
+* spInOut D+24..27 (long ptr to the 16-bit sp; read+write)
+* maxSp D+28..29
+****************************************************************
+
+iigsFloodScanAndPushInner start IIGSASM
+fpRow equ 0
+fpLeft equ 4
+fpRight equ 6
+fpMatch equ 8
+fpNew equ 10
+fpEq equ 12
+fpScanY equ 14
+fpStackX equ 16
+fpStackY equ 20
+fpSpInOut equ 24
+fpMaxSp equ 28
+
+ php
+ phb
+ phd
+ rep #$30
+ LONGA ON
+ LONGI ON
+ tsc
+ clc
+ adc #8
+ tcd
+
+* spanLen = rightX - leftX + 1; bail if rightX < leftX.
+ lda fpRight
+ sec
+ sbc fpLeft
+ bpl fpSpanOk
+ brl fpExit
+fpSpanOk anop
+ inc a
+ sta >fpSpanLen
+
+* Load sp from *spInOut into scratch.
+ ldy #0
+ lda [fpSpInOut],y
+ sta >fpSp
+
+* Cache maxSp.
+ lda fpMaxSp
+ sta >fpMaxSpCache
+
+* Cache 8-bit constants and clear prevHit.
+ sep #$20
+ LONGA OFF
+ lda fpMatch
+ and #$0F
+ sta >fpMatchByte
+ lda fpNew
+ and #$0F
+ sta >fpNewByte
+ lda #0
+ sta >fpPrevHit
+ rep #$20
+ LONGA ON
+
+* Branch on matchEqual ONCE: two specialized inner loops avoid the
+* per-pixel test-and-branch on fpEqByte.
+ lda fpEq
+ and #$00FF
+ bne fpEqEntry
+ brl fpBoundEntry
+
+***** EQUAL MODE LOOP: hit = (pix == matchColor) *****
+fpEqEntry anop
+ ldy #0 ; Y = i
+
+fpEqLoop anop
+ tya
+ cmp >fpSpanLen
+ bcc fpEqBody
+ brl fpAfterLoop
+fpEqBody anop
+* curX = leftX + i; fetch the byte holding that pixel. Y (= i) is
+* parked on the stack while it doubles as the byte index.
+ phy
+ clc
+ adc fpLeft
+ sta >fpCurX
+ lsr a
+ tay
+ sep #$20
+ LONGA OFF
+ lda [fpRow],y
+ sta >fpByte
+ rep #$20
+ LONGA ON
+ ply
+
+ sep #$20
+ LONGA OFF
+ lda >fpCurX
+ lsr a ; carry = curX & 1
+ bcs fpEqOdd
+ lda >fpByte
+ lsr a
+ lsr a
+ lsr a
+ lsr a
+ bra fpEqHaveNib
+fpEqOdd anop
+ lda >fpByte
+ and #$0F
+fpEqHaveNib anop
+ cmp >fpMatchByte
+ beq fpEqHit
+ lda #0
+ bra fpEqStoreHit
+fpEqHit anop
+ lda #1
+fpEqStoreHit anop
+ sta >fpCurHit
+
+* Falling edge: prevHit=1, curHit=0 -> push (curX - 1, scanY).
+* The BNE below still sees Z from the LDA #0 / LDA #1 above (STA
+* and BRA leave the flags alone).
+ bne fpEqNoFall
+ lda >fpPrevHit
+ beq fpEqNoFall
+ rep #$20
+ LONGA ON
+ lda >fpCurX
+ dec a
+ sta >fpPushX
+ jsr fpPushXY
+ sep #$20
+ LONGA OFF
+fpEqNoFall anop
+ lda >fpCurHit
+ sta >fpPrevHit
+ rep #$20
+ LONGA ON
+ iny
+ brl fpEqLoop
+
+***** BOUNDARY MODE LOOP: hit = (pix != matchColor && pix != newColor) *****
+fpBoundEntry anop
+ ldy #0
+
+fpBoundLoop anop
+ tya
+ cmp >fpSpanLen
+ bcc fpBoundBody
+ brl fpAfterLoop
+fpBoundBody anop
+ phy
+ clc
+ adc fpLeft
+ sta >fpCurX
+ lsr a
+ tay
+ sep #$20
+ LONGA OFF
+ lda [fpRow],y
+ sta >fpByte
+ rep #$20
+ LONGA ON
+ ply
+
+ sep #$20
+ LONGA OFF
+ lda >fpCurX
+ lsr a ; carry = curX & 1
+ bcs fpBoundOdd
+ lda >fpByte
+ lsr a
+ lsr a
+ lsr a
+ lsr a
+ bra fpBoundHaveNib
+fpBoundOdd anop
+ lda >fpByte
+ and #$0F
+fpBoundHaveNib anop
+ cmp >fpMatchByte
+ beq fpBoundMiss
+ cmp >fpNewByte
+ beq fpBoundMiss
+ lda #1
+ bra fpBoundStoreHit
+fpBoundMiss anop
+ lda #0
+fpBoundStoreHit anop
+ sta >fpCurHit
+
+* Falling edge test, same flag dependency as in the equal-mode loop.
+ bne fpBoundNoFall
+ lda >fpPrevHit
+ beq fpBoundNoFall
+ rep #$20
+ LONGA ON
+ lda >fpCurX
+ dec a
+ sta >fpPushX
+ jsr fpPushXY
+ sep #$20
+ LONGA OFF
+fpBoundNoFall anop
+ lda >fpCurHit
+ sta >fpPrevHit
+ rep #$20
+ LONGA ON
+ iny
+ brl fpBoundLoop
+
+fpAfterLoop anop
+* Trailing run: if prevHit, push (rightX, scanY).
+ sep #$20
+ LONGA OFF
+ lda >fpPrevHit
+ beq fpStoreSp
+ rep #$20
+ LONGA ON
+ lda fpRight
+ sta >fpPushX
+ jsr fpPushXY
+ bra fpStoreSpDone
+fpStoreSp anop
+ rep #$20
+ LONGA ON
+fpStoreSpDone anop
+* Write the (possibly advanced) sp back through spInOut.
+ ldy #0
+ lda >fpSp
+ sta [fpSpInOut],y
+
+fpExit anop
+ LONGA OFF
+ LONGI OFF
+ pld
+ plb
+ plp
+ rtl
+
+* fpPushXY: push (fpPushX, fpScanY) onto stackX/stackY at sp.
+* No-op if sp >= maxSp. M=16, X=16 on entry; preserves all caller regs.
+* (A and Y are saved/restored; X is not used; the status flags are
+* not preserved.)
+fpPushXY anop
+ pha
+ phy
+ lda >fpSp
+ cmp >fpMaxSpCache
+ bcs fpPushSkip ; unsigned sp >= maxSp -> drop the push
+ asl a ; sp*2 = byte offset
+ tay
+ lda >fpPushX
+ sta [fpStackX],y
+ lda fpScanY
+ sta [fpStackY],y
+ lda >fpSp
+ inc a
+ sta >fpSp
+fpPushSkip anop
+ ply
+ pla
+ rts
+ end
+
+
+* Scratch slots for iigsFloodScanAndPushInner. fpEqByte and fpPixNib
+* are not referenced by the routine above.
+fpSpanLen data DRAWPRIMS
+ ds 2
+ end
+fpSp data DRAWPRIMS
+ ds 2
+ end
+fpMaxSpCache data DRAWPRIMS
+ ds 2
+ end
+fpMatchByte data DRAWPRIMS
+ ds 2
+ end
+fpNewByte data DRAWPRIMS
+ ds 2
+ end
+fpEqByte data DRAWPRIMS
+ ds 2
+ end
+fpPrevHit data DRAWPRIMS
+ ds 2
+ end
+fpCurHit data DRAWPRIMS
+ ds 2
+ end
+fpCurX data DRAWPRIMS
+ ds 2
+ end
+fpByte data DRAWPRIMS
+ ds 2
+ end
+fpPixNib data DRAWPRIMS
+ ds 2
+ end
+fpPushX data DRAWPRIMS
+ ds 2
+ end
+
+
+****************************************************************
+* iigsFloodWalkAndScansInner(uint8_t *pixels, uint16_t x, uint16_t y,
+* uint16_t matchColor, uint16_t newColor,
+* uint16_t matchEqual,
+* int16_t *stackX, int16_t *stackY,
+* uint16_t *spInOut, uint16_t maxSp)
+*
+* Combined per-popped-seed work. Replaces the C dispatch loop's three
+* halFast* calls (walk + scan-above + scan-below) with one cross-
+* segment call. All three sub-operations share the cached row addr,
+* match-byte, new-byte, eq-flag, and matchByte-vs-newByte decoders.
+* +* Outputs (DRAWPRIMS globals, same as iigsFloodWalkInner): +* gFloodSeedMatch -- 1 if seed pixel matched +* gFloodLeftX -- leftmost matching column (only valid if matched) +* gFloodRightX -- rightmost matching column +* Caller is responsible for the 1-row halFastFillRect after we return, +* using gFloodLeftX..gFloodRightX. We DON'T fill here -- the C-side +* fill is already as fast as the asm fill (both end up in +* iigsFillRectStageInner). +* +* Args after PHP+PHB+PHD (TCD makes D = SP+8): +* pixels D+0..3 (long ptr) +* x D+4..5 +* y D+6..7 +* matchColor D+8..9 (low byte) +* newColor D+10..11 +* matchEqual D+12..13 +* stackX D+14..17 (long ptr to int16_t[]) +* stackY D+18..21 (long ptr) +* spInOut D+22..25 (long ptr to int16_t) +* maxSp D+26..27 +**************************************************************** + +iigsFloodWalkAndScansInner start IIGSASM +* wsRow occupies the same 4 DP bytes as the pixels arg (D+0..3): we +* compute row = pixels + y*160 and overwrite the pixels slot with the +* result so that `lda [wsRow],y` (DP-indirect-long indexed Y) reads +* directly from the row in the inner loops -- no separate scratch ptr. +* Likewise wsScanRow overlaps wsMatch+wsNew (D+8..11): those args get +* cached to wsMatchByte/wsNewByte first, then we reuse the slot. +wsRow equ 0 ; D+0..3 (was wsPixels arg) +wsX equ 4 +wsY equ 6 +wsScanRow equ 8 ; D+8..11 (was wsMatch+wsNew args) +wsMatch equ 8 ; alias of wsScanRow during initial cache +wsNew equ 10 ; alias of wsScanRow+2 during initial cache +wsEq equ 12 +wsStackX equ 14 +wsStackY equ 18 +wsSpInOut equ 22 +wsMaxSp equ 26 +wsPixels equ 0 ; alias of wsRow for the initial row-addr compute +wsMidEnd equ 4 ; alias of wsX, used by inline fill (post-walk) +* DP scratch for the rewritten scan inner loop (2 pixels per byte read, +* DP-relative for ~1 cyc faster than long-mode `>fpScratch` access). +* All free post-walk / post-cache; sequential phases don't overlap. 
+wsScanCurX equ 4 ; alias wsX/wsMidEnd, 16-bit +wsScanByte equ 12 ; alias wsEq.lo, 8-bit +wsScanPrevHit equ 26 ; alias wsMaxSp.lo, 8-bit +wsScanCurHit equ 27 ; alias wsMaxSp.hi, 8-bit + + php + phb + phd + rep #$30 + LONGA ON + LONGI ON + tsc + clc + adc #8 + tcd + +* Cache 8-bit constants used across walk + scan + fill. +* wsNewHigh = newNibble << 4 (used for fill's trailing high-nibble RMW). +* wsDoubledByte = (newNibble << 4) | newNibble (full-byte fill value). + sep #$20 + LONGA OFF + lda wsMatch + and #$0F + sta >wsMatchByte + lda wsNew + and #$0F + sta >wsNewByte + ora #0 + pha ; save newByte for doubled compute + asl a + asl a + asl a + asl a ; A = newByte << 4 + sta >wsNewHigh + ora 1,s ; A = (newByte<<4) | newByte + sta >wsDoubledByte + pla ; clean stack (A=byte, then we overwrite) + lda wsEq + sta >wsEqByte + rep #$20 + LONGA ON + +* Compute rowAddr (long ptr) = pixels + lut[y*2]. LUT lookup beats +* the shift-add by ~7 cyc per call. Then store row.lo / row.hi over +* the pixels arg slot (= wsRow). + lda wsY + asl a ; A = y*2 (byte offset into LUT) + tax + lda >gRowOffsetLut,x ; A = y*160 + clc + adc wsPixels ; A = pixels.lo + y*160 = row.lo + tax ; X = row.lo (carry preserved through tax) + lda wsPixels+2 ; A = pixels.hi + adc #0 ; A = row.hi (carry from prev add) + sta wsRow+2 ; D+2..3 = row.hi + txa + sta wsRow ; D+0..1 = row.lo + +* === SEED TEST + WALK LEFT + WALK RIGHT === +* Specialized per matchEqual: branch once, then run inlined-test loops. +* Kills the per-pixel JSR/RTS to wsTestPix (~12 cyc) and the per-pixel +* matchEqual branch inside the test (~8 cyc) -- ~20 cyc per pixel +* across ~3600 walked pixels in the demo. +* +* leftX / rightX are held in DP slots (wsScanCurX = DP+4 alias of wsX) +* during the walk; the gFloodLeftX/RightX globals are only written at +* the end so we don't pay long-mode store cost on every step. 
+ sep #$20 + LONGA OFF + lda >wsEqByte + rep #$20 + LONGA ON + bne wsWalkEqEntry + brl wsWalkBndEntry + +***** EQUAL MODE WALK ***** +wsWalkEqEntry anop +* Seed test at wsX (inline eq). + lda wsX + jsr wsTestEq ; A = 1 if pix == matchByte + sta >gFloodSeedMatch + cmp #0 + bne wsWalkEqSeedOk + brl wsExit +wsWalkEqSeedOk anop + +* Walk left: leftX = wsX; while leftX > 0 and pixel(leftX-1) matches, +* leftX--. Holds leftX in gFloodLeftX + +* Walk right: rightX = wsX; while rightX < 319 and pixel(rightX+1) +* matches, rightX++. Holds rightX in gFloodRightX + brl wsAfterWalk + +***** BOUNDARY MODE WALK ***** +wsWalkBndEntry anop + lda wsX + jsr wsTestBnd + sta >gFloodSeedMatch + cmp #0 + bne wsWalkBndSeedOk + brl wsExit +wsWalkBndSeedOk anop + + lda wsX + sta gFloodLeftX + + lda wsX + sta gFloodRightX + +wsAfterWalk anop + +* Cache leftX/rightX/spanLen for the scans. + lda >gFloodLeftX + sta >wsLeftX + lda >gFloodRightX + sta >wsRightX + sec + sbc >wsLeftX + inc a + sta >wsSpanLen + +* === FILL THE SPAN === +* Inline 1-row fill of row[leftX..rightX] with newNibble. Walk-out +* validated leftX/rightX bounds. Saves a cross-segment halFastFillRect +* call per popped seed; the C side now skips it when this asm path +* runs. wsMidEnd uses DP+4 (alias of wsX, free post-walk) so DP-direct +* `cpy wsMidEnd` works (CPY has no long-abs form). + lda >wsLeftX + lsr a ; A = leadingByte; carry = leftX & 1 + tay ; Y = leadingByte + bcc wsFillNoLead +* Leading partial byte: low nibble of byte[Y] = newNibble. + sep #$20 + LONGA OFF + lda [wsRow],y + and #$F0 + ora >wsNewByte + sta [wsRow],y + rep #$20 + LONGA ON + iny ; midStart = leadingByte + 1 +wsFillNoLead anop + +* midEnd = (rightX + 1) >> 1; trailing partial only if rightX even. 
+ lda >wsRightX + inc a ; A = pxEnd = rightX + 1 + lsr a ; A = midEnd; carry = pxEnd & 1 + sta wsMidEnd ; DP store (so DP-direct CPY works) + bcs wsFillSetTrail + lda #0 + sta >wsHasTrail + bra wsFillMidLoop +wsFillSetTrail anop + lda #1 + sta >wsHasTrail + +wsFillMidLoop anop + sep #$20 + LONGA OFF + lda >wsDoubledByte +wsFillMidIter anop + cpy wsMidEnd ; DP-direct CPY (X=16-bit reads 2 bytes) + bcs wsFillMidDone + sta [wsRow],y + iny + bra wsFillMidIter +wsFillMidDone anop + rep #$20 + LONGA ON + + lda >wsHasTrail + beq wsFillDone +* Trailing partial byte: high nibble of byte[wsMidEnd] = newNibble. + ldy wsMidEnd ; Y = trailing byte index + sep #$20 + LONGA OFF + lda [wsRow],y + and #$0F + ora >wsNewHigh + sta [wsRow],y + rep #$20 + LONGA ON +wsFillDone anop + +* Load sp from *spInOut, cache maxSp. + ldy #0 + lda [wsSpInOut],y + sta >wsSp + lda wsMaxSp + sta >wsMaxSpCache + +* === SCAN ABOVE (if y > 0) === + lda wsY + bne wsHasAbove + brl wsSkipAbove +wsHasAbove anop + lda wsRow + sec + sbc #160 + sta wsScanRow + lda wsRow+2 + sbc #0 + sta wsScanRow+2 + lda wsY + dec a + sta >wsScanY + jsr wsScanAndPush +wsSkipAbove anop + +* === SCAN BELOW (if y < 199) === + lda wsY + cmp #199 + bcc wsHasBelow + brl wsSkipBelow +wsHasBelow anop + lda wsRow + clc + adc #160 + sta wsScanRow + lda wsRow+2 + adc #0 + sta wsScanRow+2 + lda wsY + inc a + sta >wsScanY + jsr wsScanAndPush +wsSkipBelow anop + +* Store updated sp back to *spInOut. + ldy #0 + lda >wsSp + sta [wsSpInOut],y + +wsExit anop + LONGA OFF + LONGI OFF + pld + plb + plp + rtl + +* wsTestEq: test pixel at column A against matchByte (eq mode). +* In: A = column (M=16). Reads from wsRow. +* Out: A = 1 if pix == matchByte, 0 otherwise. M=16 on exit. +* Trashes A, Y, P. Preserves X, D, B. 
+wsTestEq anop + lsr a ; A = byteIdx; C = column & 1 + tay + bcs wsTeqOdd + sep #$20 + LONGA OFF + lda [wsRow],y + lsr a + lsr a + lsr a + lsr a + bra wsTeqHave +wsTeqOdd anop + sep #$20 + LONGA OFF + lda [wsRow],y + and #$0F +wsTeqHave anop + cmp >wsMatchByte + rep #$20 + LONGA ON + bne wsTeqNo + lda #1 + rts +wsTeqNo anop + lda #0 + rts + + +* wsTestBnd: test pixel at column A against (matchByte, newByte) for +* boundary-mode flood: returns 1 iff pix != matchByte AND pix != newByte. +* In: A = column (M=16). Reads from wsRow. +* Out: A = 1 if qualifies, 0 otherwise. M=16 on exit. +* Trashes A, Y, P. Preserves X, D, B. +wsTestBnd anop + lsr a + tay + bcs wsTbnOdd + sep #$20 + LONGA OFF + lda [wsRow],y + lsr a + lsr a + lsr a + lsr a + bra wsTbnHave +wsTbnOdd anop + sep #$20 + LONGA OFF + lda [wsRow],y + and #$0F +wsTbnHave anop + cmp >wsMatchByte + beq wsTbnNo + cmp >wsNewByte + beq wsTbnNo + rep #$20 + LONGA ON + lda #1 + rts +wsTbnNo anop + rep #$20 + LONGA ON + lda #0 + rts + + +* wsScanAndPush: walk wsScanRow[wsLeftX..wsRightX] for run-edge +* transitions. Pushes (curX-1, wsScanY) on falling edges and +* (rightX, wsScanY) at end of trailing run. +* +* This version processes 2 pixels per byte read (one byte = high+low +* nibble) and keeps hot scratch (curX, byte, prevHit, curHit) in DP +* slots. Three phases: optional leading single low-nibble (if leftX +* odd), then a pair loop reading whole bytes, then optional trailing +* high-nibble (if curX == rightX after the pair loop). +* +* Y is maintained as the byte index within scanRow across the whole +* function -- no more phy/ply per pixel. 
+wsScanAndPush anop + sep #$20 + LONGA OFF + lda #0 + sta wsEqByte + rep #$20 + LONGA ON + bne wsEqEntry2 + brl wsBoundEntry2 + +***** EQUAL MODE ***** +wsEqEntry2 anop + lda >wsLeftX + sta wsMatchByte + beq wsEqLeadHit2 + lda #0 + bra wsEqLeadStore2 +wsEqLeadHit2 anop + lda #1 +wsEqLeadStore2 anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsEqLeadNoFall2 anop + lda wsRightX + bcc wsEqDoPair2 + beq wsEqDoPair2 + brl wsEqTrailing2 + +wsEqDoPair2 anop + sep #$20 + LONGA OFF + lda [wsScanRow],y + sta wsMatchByte + beq wsEqHi2Hit + lda #0 + bra wsEqHi2Store +wsEqHi2Hit anop + lda #1 +wsEqHi2Store anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsEqHi2NoFall anop + lda wsMatchByte + beq wsEqLo2Hit + lda #0 + bra wsEqLo2Store +wsEqLo2Hit anop + lda #1 +wsEqLo2Store anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsEqLo2NoFall anop + lda wsRightX + bcc wsEqDoTrail2 + beq wsEqDoTrail2 + brl wsScanDone + +wsEqDoTrail2 anop + sep #$20 + LONGA OFF + lda [wsScanRow],y + lsr a + lsr a + lsr a + lsr a ; A = high nibble + cmp >wsMatchByte + beq wsEqTr2Hit + lda #0 + bra wsEqTr2Store +wsEqTr2Hit anop + lda #1 +wsEqTr2Store anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsEqTr2NoFall anop + lda wsLeftX + sta wsMatchByte + beq wsBndLeadMiss2 + cmp >wsNewByte + beq wsBndLeadMiss2 + lda #1 + bra wsBndLeadStore2 +wsBndLeadMiss2 anop + lda #0 +wsBndLeadStore2 anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsBndLeadNoFall2 anop + lda wsRightX + bcc wsBndDoPair2 + beq wsBndDoPair2 + brl wsBndTrailing2 + +wsBndDoPair2 anop + sep #$20 + LONGA OFF + lda [wsScanRow],y + sta wsMatchByte + beq wsBndHi2Miss + cmp >wsNewByte + beq wsBndHi2Miss + lda #1 + bra wsBndHi2Store +wsBndHi2Miss anop + lda #0 +wsBndHi2Store anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsBndHi2NoFall anop + lda wsMatchByte + beq wsBndLo2Miss + cmp >wsNewByte + beq wsBndLo2Miss + lda #1 + bra wsBndLo2Store +wsBndLo2Miss anop + lda #0 +wsBndLo2Store anop + sta 
wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsBndLo2NoFall anop + lda wsRightX + bcc wsBndDoTrail2 + beq wsBndDoTrail2 + brl wsScanDone + +wsBndDoTrail2 anop + sep #$20 + LONGA OFF + lda [wsScanRow],y + lsr a + lsr a + lsr a + lsr a ; A = high nibble + cmp >wsMatchByte + beq wsBndTr2Miss + cmp >wsNewByte + beq wsBndTr2Miss + lda #1 + bra wsBndTr2Store +wsBndTr2Miss anop + lda #0 +wsBndTr2Store anop + sta wsPushX + jsr wsPushXY + sep #$20 + LONGA OFF +wsBndTr2NoFall anop + lda wsRightX + sta >wsPushX + jsr wsPushXY +wsScanReturn anop + rts + + +* wsPushXY: push (wsPushX, wsScanY) at sp if spwsSp + cmp >wsMaxSpCache + bcs wsPushSkip + asl a + tay + lda >wsPushX + sta [wsStackX],y + lda >wsScanY + sta [wsStackY],y + lda >wsSp + inc a + sta >wsSp +wsPushSkip anop + ply + pla + rts + end + + +wsRowMulTmp data DRAWPRIMS + ds 2 + end +wsLeftX data DRAWPRIMS + ds 2 + end +wsRightX data DRAWPRIMS + ds 2 + end +wsScanY data DRAWPRIMS + ds 2 + end +wsMatchByte data DRAWPRIMS + ds 2 + end +wsNewByte data DRAWPRIMS + ds 2 + end +wsEqByte data DRAWPRIMS + ds 2 + end +wsSp data DRAWPRIMS + ds 2 + end +wsMaxSpCache data DRAWPRIMS + ds 2 + end +wsSpanLen data DRAWPRIMS + ds 2 + end +wsCurX data DRAWPRIMS + ds 2 + end +wsByte data DRAWPRIMS + ds 2 + end +wsPixNib data DRAWPRIMS + ds 2 + end +wsPushX data DRAWPRIMS + ds 2 + end +wsPrevHit data DRAWPRIMS + ds 2 + end +wsCurHit data DRAWPRIMS + ds 2 + end +wsNewHigh data DRAWPRIMS + ds 2 + end +wsDoubledByte data DRAWPRIMS + ds 2 + end +wsHasTrail data DRAWPRIMS + ds 2 + end + + +**************************************************************** +* iigsBlitRectInner(uint8_t *dstRow0, uint16_t dstX, +* const uint8_t *srcRow0, uint16_t srcX, +* uint16_t copyW, uint16_t copyH, +* uint16_t srcRowBytes, +* uint16_t transparent) +* +* Per-pixel rect blit from src to dst. transparent == $FFFF means +* opaque blit (always copy). Otherwise pixels with src nibble equal +* to (transparent & $0F) are skipped. 
+*
+* dst stride is hardcoded SURFACE_BYTES_PER_ROW (160).
+*
+* Args after PHP+PHB+PHD (TCD makes D = SP+8):
+*   dstRow0     at D+0..3   (long ptr)
+*   dstX        at D+4..5
+*   srcRow0     at D+6..9   (long ptr)
+*   srcX        at D+10..11
+*   copyW       at D+12..13
+*   copyH       at D+14..15
+*   srcRowBytes at D+16..17
+*   transparent at D+18..19 (low byte; $FFFF = opaque)
+****************************************************************
+
+iigsBlitRectInner start IIGSASM
+brDst    equ 0                  ; DP offset of dstRow0 (long ptr)
+brDstX   equ 4                  ; DP offset of dstX
+brSrc    equ 6                  ; DP offset of srcRow0 (long ptr)
+brSrcX   equ 10                 ; DP offset of srcX
+brW      equ 12                 ; DP offset of copyW
+brH      equ 14                 ; DP offset of copyH
+brSrcStride equ 16              ; DP offset of srcRowBytes
+brTrans  equ 18                 ; DP offset of transparent
+
+        php                     ; save P; restored by plp at brExit
+        phb                     ; save DBR; restored by plb at brExit
+        phd                     ; save D; restored by pld at brExit
+        rep #$30                ; M=16, X=16
+        LONGA ON
+        LONGI ON
+        tsc
+        clc
+        adc #8                  ; 4 saved bytes + 3-byte return address + 1 (SP is one below)
+        tcd                     ; D -> first argument, so the br* equates are DP offsets
+
+* rowsLeft = copyH.
+        lda brH
+        sta >brRowsLeft
+
+brRowLoop anop
+        lda >brRowsLeft
+        bne brRowGo             ; rows remain
+        brl brExit              ; rowsLeft == 0 (including copyH == 0): done
+brRowGo anop
+
+* For this row: walk col 0..copyW-1.
+        lda #0
+        sta >brCol
+
+brColLoop anop
+        lda >brCol
+        cmp brW
+        bcc brColGo             ; col < copyW (unsigned compare)
+        brl brColDone           ; col >= copyW: row finished
+brColGo anop
+
+* Compute src x = srcX + col, src byte addr offset = sx >> 1, parity = sx & 1.
+        lda brSrcX
+        clc
+        adc >brCol
+        sta >brSx
+        lsr a                   ; two pixels per byte
+        tay                     ; Y = src byte offset
+        lda >brSx
+        and #$0001
+        sta >brSxParity
+
+        sep #$20                ; M=8 for the single-byte read
+        LONGA OFF
+        lda [brSrc],y           ; byte that holds source pixel sx
+        sta >brSrcByte
+        rep #$20
+        LONGA ON
+        lda >brSxParity
+        bne brSrcOdd            ; odd sx -> pixel is the low nibble
+* Even src x -> high nibble.
+        sep #$20
+        LONGA OFF
+        lda >brSrcByte
+        lsr a
+        lsr a
+        lsr a
+        lsr a                   ; A = high nibble moved down to bits 0..3
+        bra brSrcGotNib
+brSrcOdd anop
+        sep #$20
+        LONGA OFF
+        lda >brSrcByte
+        and #$0F                ; A = low nibble
+brSrcGotNib anop
+        sta >brNib              ; M=8 store low byte
+
+* Transparency check: if low byte of brTrans == nib, skip write.
+        rep #$20
+        LONGA ON
+        lda brTrans
+        and #$00FF
+        cmp #$00FF
+        beq brOpaque            ; $FF means no transparency (opaque mode)
+        lda brTrans
+        and #$00FF              ; NOTE(review): whole low byte; header says (transparent & $0F)
+        cmp >brNib              ; 16-bit compare; this routine never writes brNib's high byte
+        bne brOpaque            ; not transparent, keep going
+        brl brColAdvance        ; transparent, skip dst write
+brOpaque anop
+
+* Compute dst x = dstX + col, dst byte addr offset = dx >> 1, parity = dx & 1.
+        lda brDstX
+        clc
+        adc >brCol
+        sta >brDx
+        lsr a                   ; two pixels per byte
+        tay                     ; Y = dst byte offset
+        lda >brDx
+        and #$0001
+        sta >brDxParity
+
+        sep #$20                ; M=8 for the single-byte read
+        LONGA OFF
+        lda [brDst],y           ; current dst byte; its other nibble must survive
+        sta >brDstByte
+        rep #$20
+        LONGA ON
+        lda >brDxParity
+        bne brDstOdd            ; odd dx -> write the low nibble
+* Even dst x -> high nibble. dst = (dst & 0x0F) | (nib << 4).
+        sep #$20
+        LONGA OFF
+        lda >brNib
+        asl a
+        asl a
+        asl a
+        asl a                   ; A = nib << 4
+        sta >brDstNibPart
+        lda >brDstByte
+        and #$0F                ; keep the neighbouring low nibble
+        ora >brDstNibPart
+        sta [brDst],y
+        rep #$20
+        LONGA ON
+        bra brColAdvance
+brDstOdd anop
+        sep #$20
+        LONGA OFF
+        lda >brNib
+        and #$0F
+        sta >brDstNibPart
+        lda >brDstByte
+        and #$F0                ; keep the neighbouring high nibble
+        ora >brDstNibPart       ; dst = (dst & 0xF0) | nib
+        sta [brDst],y
+        rep #$20
+        LONGA ON
+
+brColAdvance anop
+        lda >brCol
+        inc a
+        sta >brCol
+        brl brColLoop
+
+brColDone anop
+* Advance srcRow ptr by srcRowBytes.
+        clc
+        lda brSrc               ; low 16 of srcRow
+        adc brSrcStride
+        sta brSrc
+        bcc brSrcNoCarry        ; no carry out of the low 16 bits
+        lda brSrc+2             ; bank/pad
+        clc
+        adc #1                  ; carry the overflow into the bank byte
+        sta brSrc+2
+brSrcNoCarry anop
+
+* Advance dstRow ptr by 160.
+        clc
+        lda brDst               ; low 16 of dstRow
+        adc #160
+        sta brDst
+        bcc brDstNoCarry        ; no carry out of the low 16 bits
+        lda brDst+2             ; upper word of the dst long pointer
+        clc
+        adc #1                  ; carry the overflow into the bank byte
+        sta brDst+2
+brDstNoCarry anop
+
+        lda >brRowsLeft
+        dec a
+        sta >brRowsLeft
+        brl brRowLoop
+
+brExit anop
+        LONGA OFF               ; assembler directives only; plp below restores the caller's P
+        LONGI OFF
+        pld                     ; pulls mirror the entry pushes in reverse order
+        plb
+        plp
+        rtl
+        end
+
+
+brRowsLeft data DRAWPRIMS
+        ds 2
+        end
+brCol    data DRAWPRIMS
+        ds 2
+        end
+brSx     data DRAWPRIMS
+        ds 2
+        end
+brSxParity data DRAWPRIMS
+        ds 2
+        end
+brSrcByte data DRAWPRIMS
+        ds 2
+        end
+brNib    data DRAWPRIMS
+        ds 2
+        end
+brDx     data DRAWPRIMS
+        ds 2
+        end
+brDxParity data DRAWPRIMS
+        ds 2
+        end
+brDstByte data DRAWPRIMS
+        ds 2
+        end
+brDstNibPart data DRAWPRIMS
+        ds 2
+        end
diff --git a/src/port/iigs/peislam.asm b/src/port/iigs/peislam.asm
new file mode 100644
index 0000000..75a92ad
--- /dev/null
+++ b/src/port/iigs/peislam.asm
@@ -0,0 +1,76 @@
+* peislam.asm - PEI-slam stage row to bank-$E1 SHR.
+*
+* Implements the //e AUXWRITE + RAMRD + SHR-shadow trick that lets
+* 65816 stack pushes (which are bank-$00-implicit) end up in bank
+* $E1 SHR display memory:
+*
+*   - SHR shadow temporarily ENABLED (clear $C035 bit 3) so writes
+*     to bank-$01 in $2000-$9FFF mirror to $E1 SHR.
+*   - AUXWRITE on (any write to $C005) so bank-$00 stack writes
+*     redirect to bank $01, then mirror to $E1 via shadow.
+*   - RAMRD on (any write to $C003) so PEI dp's bank-$00-implicit
+*     reads redirect to bank $01 = the stage source.
+*   - SEI for the duration: stack pointer is hijacked to point at
+*     $E1-mapped stack space, soft-switch state would corrupt any
+*     C code that tried to access bank-$00 globals.
+*
+* All scratch reads/writes within the slam use long-mode `>name`
+* addressing (24-bit, explicit bank) so they bypass RAMRD redirect
+* and reach the actual bank-$00 global storage.
+*
+* Calling convention: ORCA-C memory model 1 (large model, JSL/RTL).
+*   void peiSlamFullRow(int16_t y);
+*   - Caller PHAs y (2 bytes) before JSL.
+*   - JSL pushes 3-byte return address.
+*   - On entry: y_LO at SP+4, y_HI at SP+5 (SP points one below PCL).
+*   - Function preserves DBR; returns via RTL with original SP.
+*   - Caller pops the y arg after RTL.
+*
+* Per call: ~50 cyc bracket + 80 PEIs * 6 cyc = ~530 cyc, vs the
+* memcpy/MVN fallback's 7 cyc/byte * 160 bytes = ~1120 cyc.
+
+        keep PEISLAM
+        case on
+
+* The operand to START names the LOAD segment this object segment
+* belongs to (per ORCA/M for IIgs manual, ch. 6 "Load Segments").
+* Object segments without an operand land in the unnamed "blank
+* segment" -- which on AUDIO is _ROOT, the very segment whose 64 KB
+* budget peislam.asm was busting. Naming a load segment forces the
+* linker to put us in our own segment, which the GS/OS loader then
+* allocates in its own bank.
+peiSlamFullRow start IIGSASM
+* MVN-based row copy. Replaces the PEI-stack-slam approach (which
+* needs RAMRD/AUXWRITE/SHADOW soft-switches and is sensitive to
+* DRAWDATA bank placement). MVN copies 160 bytes from the bank-$01
+* stage row to the matching bank-$E1 SHR row at ~7 cyc/byte; that's
+* slower than PEI-slam but rock-solid.
+*
+* Args after PHP: y (int16) at SP+5..6. Compute rowOffset = $2000
+* + y*160. MVN $01,$E1 with X=Y=rowOffset, A=159 copies 160 bytes
+* from $01:rowOffset to $E1:rowOffset. PHB/PLB bracket the MVN (it changes DBR).
+        php
+        rep #$30                ; M=16, X=16
+
+        lda 5,s                 ; y (not range-checked; y*160 must fit in 16 bits)
+        asl a
+        asl a
+        asl a
+        asl a
+        asl a                   ; A = y << 5 = y*32
+        sta >gPeiTempRowBase
+        asl a
+        asl a                   ; A = y << 7 = y*128
+        clc
+        adc >gPeiTempRowBase    ; A = y*160
+        clc
+        adc #$2000              ; A = $2000 + y*160 = row offset
+        tax                     ; X = source offset (bank $01)
+        tay                     ; Y = dest offset (bank $E1)
+        lda #159                ; count - 1 (MVN copies count+1 = 160 bytes)
+        phb                     ; MVN loads DBR with the dest bank; save the caller's DBR
+        mvn $01,$E1             ; NOTE(review): confirm ORCA/M reads these operands as bank bytes
+        plb                     ; restore DBR so the "preserves DBR" contract holds
+        plp
+        rtl
+        end
diff --git a/toolchains/install.sh b/toolchains/install.sh
index ad3b36f..75fd43f 100755
--- a/toolchains/install.sh
+++ b/toolchains/install.sh
@@ -1071,6 +1071,7 @@ EOF
     fi
 }
 
+
 install_gsplus() {
     local base="${SCRIPT_DIR}/emulators/gsplus"
     local bin="${base}/bin/gsplus"