From cf6ae093d3985f487b1376875a9448cf977f661b Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Mon, 4 May 2026 11:06:41 -0500 Subject: [PATCH] ST is more or less parity. --- README.md | 326 ++++++++++++++++++++++ scripts/dosbox-386sx16.conf | 28 ++ scripts/run-dos.sh | 6 + src/core/hal.h | 13 +- src/core/surface.c | 4 +- src/port/amiga/c2p.s | 127 --------- src/port/amiga/hal.c | 143 +--------- src/port/atarist/c2p.s | 188 ------------- src/port/atarist/circle.s | 20 +- src/port/atarist/fillCircle.s | 303 ++++++++++---------- src/port/atarist/hal.c | 505 +++++++--------------------------- src/port/atarist/lineSpan.s | 162 ++++++++++- src/port/atarist/spriteAsm.s | 195 ++++++++++--- src/port/dos/hal.c | 4 +- src/port/iigs/hal.c | 4 +- 15 files changed, 966 insertions(+), 1062 deletions(-) create mode 100644 scripts/dosbox-386sx16.conf delete mode 100644 src/port/amiga/c2p.s delete mode 100644 src/port/atarist/c2p.s diff --git a/README.md b/README.md index cc5dde9..810e8db 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,332 @@ build// per-target build outputs ``` +## Public API + +Game code includes a single umbrella header: + +```c +#include +``` + +That pulls in every public surface listed below. Full documentation +lives in the per-feature headers under `include/joey/`; what follows +is a quick reference. Every entry point is plain C, no C++ extensions. 
+ + +### Lifecycle (`joey/core.h`) + +```c +typedef struct { + HostModeE hostMode; // HOST_MODE_TAKEOVER or HOST_MODE_OS + uint32_t codegenBytes; // runtime compiled-sprite cache size + uint16_t maxSurfaces; // maximum concurrent surfaces + uint32_t audioBytes; // audio sample / module RAM pool + uint32_t assetBytes; // tileset / sprite / map RAM pool +} JoeyConfigT; + +bool joeyInit (const JoeyConfigT *config); +void joeyShutdown (void); +const char *joeyLastError (void); +const char *joeyPlatformName (void); +const char *joeyVersionString(void); + +void joeyWaitVBL (void); // block until next VBL +uint16_t joeyFrameCount (void); // monotonic 16-bit frame counter +uint16_t joeyFrameHz (void); // 50 / 60 / 70 depending on port +``` + + +### Surfaces (`joey/surface.h`) + +All surfaces are 320x200 4bpp packed (high nibble = left pixel) with +a 200-entry SCB table and 16 palettes of 16 `$0RGB` colors. + +```c +#define SURFACE_WIDTH 320 +#define SURFACE_HEIGHT 200 +#define SURFACE_BYTES_PER_ROW 160 +#define SURFACE_PIXELS_SIZE (SURFACE_BYTES_PER_ROW * SURFACE_HEIGHT) +#define SURFACE_PALETTE_COUNT 16 +#define SURFACE_COLORS_PER_PALETTE 16 + +typedef struct SurfaceT SurfaceT; // opaque + +SurfaceT *surfaceCreate (void); +void surfaceDestroy(SurfaceT *s); +SurfaceT *stageGet (void); // library back-buffer +void surfaceCopy (SurfaceT *dst, const SurfaceT *src); + +bool surfaceSaveFile(const SurfaceT *src, const char *path); +bool surfaceLoadFile(SurfaceT *dst, const char *path); +uint32_t surfaceHash (const SurfaceT *s); // FNV-1a of logical pixels +``` + +`surfaceSaveFile` writes the surface in **target-native** form. Files +are NOT cross-port portable; the asset pipeline handles conversion. + + +### Drawing (`joey/draw.h`) + +All primitives clip to the surface; off-surface coords are silent +no-ops. Color 0 is plotted normally (use the masked variants if you +need transparency). 
+ +```c +void surfaceClear (SurfaceT *s, uint8_t color); +void drawPixel (SurfaceT *s, int16_t x, int16_t y, uint8_t color); +uint8_t samplePixel (const SurfaceT *s, int16_t x, int16_t y); + +void drawLine (SurfaceT *s, int16_t x0, int16_t y0, + int16_t x1, int16_t y1, uint8_t color); +void drawRect (SurfaceT *s, int16_t x, int16_t y, + uint16_t w, uint16_t h, uint8_t color); +void fillRect (SurfaceT *s, int16_t x, int16_t y, + uint16_t w, uint16_t h, uint8_t color); +void drawCircle (SurfaceT *s, int16_t cx, int16_t cy, + uint16_t r, uint8_t color); +void fillCircle (SurfaceT *s, int16_t cx, int16_t cy, + uint16_t r, uint8_t color); + +void floodFill (SurfaceT *s, int16_t x, int16_t y, uint8_t newColor); +void floodFillBounded (SurfaceT *s, int16_t x, int16_t y, + uint8_t newColor, uint8_t boundaryColor); + +void surfaceBlit (SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y); +void surfaceBlitMasked (SurfaceT *dst, const JoeyAssetT *src, + int16_t x, int16_t y, uint8_t transparentIndex); +``` + + +### Palette and SCB (`joey/palette.h`) + +Colors are 12-bit `$0RGB`. Color 0 of every palette is forced to +black on `paletteSet`. Each scanline picks one of the 16 palettes +via the SCB. + +```c +void paletteSet (SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16); +void paletteGet (const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16); +void scbSet (SurfaceT *s, uint16_t line, uint8_t paletteIndex); +void scbSetRange (SurfaceT *s, uint16_t firstLine, uint16_t lastLine, + uint8_t paletteIndex); +uint8_t scbGet (const SurfaceT *s, uint16_t line); +``` + + +### Tiles (`joey/tile.h`) + +A "tile" is just an 8x8-aligned region of any surface. The API moves +32-byte chunks between surfaces and provides a small `TileT` value +type so callers can stash a copy without allocating a scratch surface. 
+ +```c +#define TILE_PIXELS_PER_SIDE 8 +#define TILE_BYTES_PER_ROW 4 +#define TILE_BYTES (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE) +#define TILE_BLOCKS_PER_ROW (SURFACE_WIDTH / TILE_PIXELS_PER_SIDE) // 40 +#define TILE_BLOCKS_PER_COL (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE) // 25 +#define TILE_NO_GLYPH ((uint16_t)0xFFFFu) + +typedef struct TileT { uint8_t pixels[TILE_BYTES]; } TileT; + +void tileCopy (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, + const SurfaceT *src, uint8_t srcBx, uint8_t srcBy); +void tileCopyMasked (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, + const SurfaceT *src, uint8_t srcBx, uint8_t srcBy, + uint8_t transparentIndex); +void tileFill (SurfaceT *s, uint8_t bx, uint8_t by, uint8_t color); +void tileSnap (const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out); +void tilePaste (SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in); + +void drawText (SurfaceT *dst, uint8_t bx, uint8_t by, + const SurfaceT *fontSurface, const uint16_t *asciiMap, + const char *str); +``` + + +### Sprites (`joey/sprite.h`) + +Rectangles of 8x8 tiles drawn at arbitrary pixel positions with +color-0 transparency. Tile data is `widthTiles * heightTiles * 32` +bytes, tile-major 4bpp packed. Sprites can be runtime-compiled +into per-shift code variants for fast draws. 
+ +```c +typedef enum { SPRITE_FLAGS_NONE = 0 } SpriteFlagsE; +typedef struct SpriteT SpriteT; // opaque + +typedef struct { + SpriteT *sprite; + int16_t x, y; + uint16_t width, height; // pixels + uint8_t *bytes; // caller-owned save-under buffer + uint16_t sizeBytes; +} SpriteBackupT; + +SpriteT *spriteCreate (const uint8_t *tileData, + uint8_t widthTiles, uint8_t heightTiles, + SpriteFlagsE flags); +SpriteT *spriteCreateFromSurface (const SurfaceT *src, int16_t x, int16_t y, + uint8_t widthTiles, uint8_t heightTiles, + SpriteFlagsE flags); +SpriteT *spriteLoadFile (const char *path, SpriteFlagsE flags); +SpriteT *spriteFromCompiledMem (const uint8_t *data, uint32_t length, + SpriteFlagsE flags); +bool spriteSaveFile (SpriteT *sp, const char *path); +void spriteDestroy (SpriteT *sp); + +bool spriteCompile (SpriteT *sp); // build per-shift fast path +void spritePrewarm (SpriteT *sp); // hint: compile if not already + +void spriteDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y); +void spriteSaveUnder (const SurfaceT *s, SpriteT *sp, + int16_t x, int16_t y, SpriteBackupT *backup); +void spriteRestoreUnder (SurfaceT *s, const SpriteBackupT *backup); +void spriteSaveAndDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y, + SpriteBackupT *backup); + +void spriteCompact (void); // defrag the codegen arena +uint32_t spriteCodegenBytesUsed (void); +uint32_t spriteCodegenBytesTotal (void); +``` + + +### Assets (`joey/asset.h`) + +Small bitmap blits with optional embedded palette, in `.jas` format. +Use embedded `const JoeyAssetT` for ship-with-binary art; use the +loaders for on-disk assets. 
+ +```c +typedef struct { + uint16_t width; + uint16_t height; + bool hasPalette; + uint16_t palette[16]; // valid only if hasPalette + const uint8_t *pixels; // 4bpp packed, rowBytes = (width+1)/2 +} JoeyAssetT; + +JoeyAssetT *joeyAssetLoadFile (const char *path); +JoeyAssetT *joeyAssetFromMem (const uint8_t *data, uint32_t length); +void joeyAssetFree (JoeyAssetT *asset); +void joeyAssetApplyPalette (SurfaceT *dst, uint8_t paletteIndex, + const JoeyAssetT *asset); +``` + + +### Present (`joey/present.h`) + +```c +void stagePresent(void); +``` + +Flips the dirty rows of the stage to the display, then clears dirty +state. Drawing primitives mark dirty as a side effect, so calling +`stagePresent` once at end-of-frame is enough. + + +### Input (`joey/input.h`) + +Call `joeyInputPoll` once per frame, then query the state predicates. +Edge predicates (`*Pressed`, `*Released`) fire only in the frame the +transition happened. + +```c +typedef enum { /* KEY_NONE, KEY_A..KEY_Z, KEY_0..KEY_9, KEY_SPACE, + KEY_ESCAPE, KEY_RETURN, KEY_TAB, KEY_BACKSPACE, + KEY_UP/DOWN/LEFT/RIGHT, KEY_LSHIFT/RSHIFT/LCTRL/LALT, + KEY_F1..KEY_F10, KEY_COUNT */ } JoeyKeyE; +typedef enum { MOUSE_BUTTON_NONE, MOUSE_BUTTON_LEFT, MOUSE_BUTTON_RIGHT, + MOUSE_BUTTON_MIDDLE, MOUSE_BUTTON_COUNT } JoeyMouseButtonE; +typedef enum { JOYSTICK_0, JOYSTICK_1, JOYSTICK_COUNT } JoeyJoystickE; +typedef enum { JOY_BUTTON_0, JOY_BUTTON_1, JOY_BUTTON_COUNT } JoeyJoyButtonE; + +#define JOYSTICK_AXIS_MAX 127 +#define JOYSTICK_AXIS_MIN (-127) + +void joeyInputPoll (void); +void joeyWaitForAnyKey (void); + +bool joeyKeyDown (JoeyKeyE key); +bool joeyKeyPressed (JoeyKeyE key); +bool joeyKeyReleased (JoeyKeyE key); + +int16_t joeyMouseX (void); +int16_t joeyMouseY (void); +bool joeyMouseDown (JoeyMouseButtonE b); +bool joeyMousePressed (JoeyMouseButtonE b); +bool joeyMouseReleased (JoeyMouseButtonE b); + +bool joeyJoystickConnected(JoeyJoystickE js); +int8_t joeyJoystickX (JoeyJoystickE js); +int8_t joeyJoystickY 
(JoeyJoystickE js); +bool joeyJoyDown (JoeyJoystickE js, JoeyJoyButtonE b); +bool joeyJoyPressed (JoeyJoystickE js, JoeyJoyButtonE b); +bool joeyJoyReleased (JoeyJoystickE js, JoeyJoyButtonE b); +void joeyJoystickReset (JoeyJoystickE js, uint8_t deadZone); +``` + + +### Audio (`joey/audio.h`) + +4-channel Protracker-style music plus four one-shot SFX slots. Module +data must be the platform-native form produced by `tools/joeymod` +(`.mod` for Amiga/DOS/ST; `.ntp` for IIgs; `.amod` if you want +loop=false on Amiga). A failed `joeyAudioInit` is non-fatal; the rest +of the API stays callable as no-ops. + +```c +#define JOEY_AUDIO_SFX_SLOTS 4 + +bool joeyAudioInit (void); +void joeyAudioShutdown (void); + +void joeyAudioPlayMod (const uint8_t *data, uint32_t length, bool loop); +void joeyAudioStopMod (void); +bool joeyAudioIsPlayingMod (void); + +void joeyAudioPlaySfx (uint8_t slot, const uint8_t *sample, + uint32_t length, uint16_t rateHz); +void joeyAudioStopSfx (uint8_t slot); + +void joeyAudioFrameTick (void); +``` + + +### Debug logging (`joey/debug.h`) + +Crash-tracing logger. Writes are buffered and durable across normal +exit; call `joeyLogFlush` ahead of suspected hang points if you want +a guaranteed last-line-on-disk. + +```c +void joeyLog (const char *msg); +void joeyLogF (const char *fmt, ...); +void joeyLogFlush(void); +void joeyLogReset(void); +``` + +Output goes to `joeylog.txt` in the program's working directory. + + +### Platform macros (`joey/platform.h`) + +The build system normally sets the platform via `-D`; auto-detection +from compiler-predefined macros is a fallback. 
Game code can +conditionally compile on these: + +``` +JOEYLIB_PLATFORM_IIGS / _AMIGA / _ATARIST / _DOS // exactly one defined +JOEYLIB_CPU_65816 / _68000 / _X86 +JOEYLIB_ENDIAN_LITTLE / _BIG +JOEYLIB_NATIVE_CHUNKY / _NATIVE_PLANAR +JOEYLIB_HAS_BLITTER / _HAS_COPPER // Amiga only +JOEYLIB_PLATFORM_NAME // human-readable string +JOEYLIB_VERSION_MAJOR / _MINOR / _PATCH / _STRING +``` + + ## License TBD. diff --git a/scripts/dosbox-386sx16.conf b/scripts/dosbox-386sx16.conf new file mode 100644 index 0000000..95bbf27 --- /dev/null +++ b/scripts/dosbox-386sx16.conf @@ -0,0 +1,28 @@ +# DOSBox config: simulate an Intel 386SX-16 (1988), the slowest 386 +# desktop CPU JoeyLib could realistically be run on. Use this floor +# to verify the DOS port still hits its frame budget on the bottom of +# the 386 stack rather than coasting on host CPU. +# +# The 386SX is identical to the 386DX in instruction set; the only +# difference is the 16-bit external bus (vs 32-bit on DX), which slows +# memory-bound code. DOSBox does not model the bus split directly -- +# the cycles count below approximates the combined 386SX-16 throughput. +# +# Notes: +# core = normal accurate per-instruction cycles, not +# recompiled-to-host (auto / dynamic would +# defeat slow-CPU simulation). +# cputype = 386 386 instruction set (no 486 BSWAP / +# CMPXCHG, no Pentium MMX). +# cycles = fixed 2200 community-standard approximation for +# 386SX-16 throughput in DOSBox. +# DOSBox-Staging deprecates this in favor +# of cpu_cycles, but still accepts it. +# Vanilla DOSBox and DOSBox-X only know +# the old key, so 'cycles' stays for +# cross-fork portability. + +[cpu] +core = normal +cputype = 386 +cycles = fixed 2200 diff --git a/scripts/run-dos.sh b/scripts/run-dos.sh index 607d37c..62fdc99 100755 --- a/scripts/run-dos.sh +++ b/scripts/run-dos.sh @@ -18,6 +18,7 @@ fi prog=${1:-pattern} repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) bin_dir=$repo/build/dos/bin +conf=$repo/scripts/dosbox-386sx16.conf file=${prog^^}.EXE if [[ ! -f "$bin_dir/$file" ]]; then @@ -34,7 +35,12 @@ fi # default capture-on-click behavior fights the VM's grab and mouse # input is unusable. On plain DOSBox this -set flag is unknown and is # logged once as a warning, then ignored -- harmless either way. +# +# -conf $conf locks the CPU to a simulated 386SX-16 (the slowest +# realistic 386 desktop). DOSBox Staging layers it over the primary +# config; vanilla DOSBox skips the user config unless -userconf is given. exec dosbox \ + -conf "$conf" \ -set "mouse_capture=seamless" \ -c "C:" \ -c "$file" \ diff --git a/src/core/hal.h b/src/core/hal.h index 05cdbd8..f5eba77 100644 --- a/src/core/hal.h +++ b/src/core/hal.h @@ -140,15 +140,16 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1 // s->pixels src->dst; on planar ports there is no chunky to copy // (planes already covered by halSurfaceCopyPlanes). Chunky ports // do the memcpy here; Amiga is a no-op. -// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread / -// fwrite of the pixel data. Chunky ports stream directly to/from -// s->pixels; Amiga uses a scratch buffer + c2p (load) or -// plane->chunky derivation (save). +// halSurfaceLoadFile / halSurfaceSaveFile wrap fread / fwrite of the +// pixel data using each port's native pixel format (chunky on +// IIgs/DOS, interleaved planar on ST, plane-major on Amiga). Files +// written by one port are NOT loadable by another -- conversion is +// the asset pipeline's job. 
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y); uint32_t halSurfaceHash(const SurfaceT *s); void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src); -bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp); -bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp); +bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp); +bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp); // Present the dirty regions of the source surface to the display. // The cross-platform stagePresent walks the dirty arrays before diff --git a/src/core/surface.c b/src/core/surface.c index 229b5f0..d2c5c62 100644 --- a/src/core/surface.c +++ b/src/core/surface.c @@ -158,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) { fclose(fp); return false; } - if (!halSurfaceLoadFileChunky(dst, fp)) { + if (!halSurfaceLoadFile(dst, fp)) { fclose(fp); return false; } @@ -186,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) { if (fp == NULL) { return false; } - if (!halSurfaceSaveFileChunky(src, fp)) { + if (!halSurfaceSaveFile(src, fp)) { fclose(fp); return false; } diff --git a/src/port/amiga/c2p.s b/src/port/amiga/c2p.s deleted file mode 100644 index 25554fa..0000000 --- a/src/port/amiga/c2p.s +++ /dev/null @@ -1,127 +0,0 @@ -| Amiga chunky-to-planar conversion -- 68000 hand-rolled. -| -| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a -| 4 KB lookup table built once at HAL init: each (sourceByte, position, -| plane) tuple maps to the plane-byte bit contribution that source -| byte makes when it sits at that position within a 4-byte (8-pixel) -| planar group going to that plane. -| -| Calling convention: m68k-amigaos-gcc cdecl. -| Args on stack at 4(sp), 8(sp), ... -| d2-d7, a2-a6 are callee-save. -| No return value. 
-| -| void chunkyToPlanarRow(const uint8_t *src, ; 4(sp) - 4bpp packed source row -| uint8_t *p0, ; 8(sp) - plane 0 dest row -| uint8_t *p1, ; 12(sp) - plane 1 dest row -| uint8_t *p2, ; 16(sp) - plane 2 dest row -| uint8_t *p3, ; 20(sp) - plane 3 dest row -| uint16_t n, ; 24(sp) - planar byte count (low word) -| const uint8_t *lut); ; 28(sp) - 4 KB LUT base -| -| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution -| for source byte `src` sitting at byte-position `pos` (0..3) within -| its 4-byte planar group, going to plane `plane` (0..3). All 16 -| (pos, plane) entries for one src byte are contiguous, so the inner -| loop reaches every entry off (a5, d4.w) with an 8-bit displacement -| (0..15) and never has to advance an index register. -| -| Per planar byte we consume 4 source bytes (positions 0..3 of the -| 8-pixel group). For each we compute d4 = src*16 with four add.w's -| (faster than asl.w on 68000) and OR the four plane contributions -| into d0..d3 with byte-displaced (a5,d4.w) reads. -| -| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the -| gcc driver. - - .text - .globl _chunkyToPlanarRow - -| Stack frame size of MOVEM.L block: d2-d7 (6) + a2-a6 (5) = 11 regs -| * 4 bytes = 44 bytes. Args therefore start at the original sp+4 -| offset PLUS 44. - .equ SAVED_REGS_SIZE, 44 - - -_chunkyToPlanarRow: - movem.l %d2-%d7/%a2-%a6,-(%sp) - - move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src - move.l 8+SAVED_REGS_SIZE(%sp),%a1 | p0 - move.l 12+SAVED_REGS_SIZE(%sp),%a2 | p1 - move.l 16+SAVED_REGS_SIZE(%sp),%a3 | p2 - move.l 20+SAVED_REGS_SIZE(%sp),%a4 | p3 - | n is a uint16_t but GCC promotes to int and pushes a - | full 4 bytes -- the low word lives at +2 in big-endian - | layout. 
- move.w 24+SAVED_REGS_SIZE+2(%sp),%d7 | planar byte count - move.l 28+SAVED_REGS_SIZE(%sp),%a5 | LUT base - - subq.w #1,%d7 | DBRA: count-1 - bmi .Ldone | nothing to do - -.LbyteLoop: - moveq #0,%d0 | plane 0 acc - moveq #0,%d1 | plane 1 acc - moveq #0,%d2 | plane 2 acc - moveq #0,%d3 | plane 3 acc - - | ----- Source byte position 0 ----- - moveq #0,%d4 - move.b (%a0)+,%d4 | src[0] - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 | d4 = src * 16 - or.b 0(%a5,%d4.w),%d0 | pos0 plane0 - or.b 1(%a5,%d4.w),%d1 | pos0 plane1 - or.b 2(%a5,%d4.w),%d2 | pos0 plane2 - or.b 3(%a5,%d4.w),%d3 | pos0 plane3 - - | ----- Source byte position 1 ----- - moveq #0,%d4 - move.b (%a0)+,%d4 | src[1] - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 4(%a5,%d4.w),%d0 | pos1 plane0 - or.b 5(%a5,%d4.w),%d1 | pos1 plane1 - or.b 6(%a5,%d4.w),%d2 | pos1 plane2 - or.b 7(%a5,%d4.w),%d3 | pos1 plane3 - - | ----- Source byte position 2 ----- - moveq #0,%d4 - move.b (%a0)+,%d4 | src[2] - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 8(%a5,%d4.w),%d0 | pos2 plane0 - or.b 9(%a5,%d4.w),%d1 | pos2 plane1 - or.b 10(%a5,%d4.w),%d2 | pos2 plane2 - or.b 11(%a5,%d4.w),%d3 | pos2 plane3 - - | ----- Source byte position 3 ----- - moveq #0,%d4 - move.b (%a0)+,%d4 | src[3] - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 12(%a5,%d4.w),%d0 | pos3 plane0 - or.b 13(%a5,%d4.w),%d1 | pos3 plane1 - or.b 14(%a5,%d4.w),%d2 | pos3 plane2 - or.b 15(%a5,%d4.w),%d3 | pos3 plane3 - - | ----- Store plane bytes ----- - move.b %d0,(%a1)+ - move.b %d1,(%a2)+ - move.b %d2,(%a3)+ - move.b %d3,(%a4)+ - - dbra %d7,.LbyteLoop - -.Ldone: - movem.l (%sp)+,%d2-%d7/%a2-%a6 - rts diff --git a/src/port/amiga/hal.c b/src/port/amiga/hal.c index c51b5a9..7e87f2d 100644 --- a/src/port/amiga/hal.c +++ b/src/port/amiga/hal.c @@ -115,69 +115,10 @@ static uint8_t gCachedScb [SURFACE_HEIGHT] static uint16_t 
gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4))); static bool gCacheValid = false; -// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow -// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] = -// the plane-byte bit contribution that source byte `src` makes to -// plane `plane` when it sits at byte-position `pos` within a 4-byte -// (8-pixel) planar group. The src-major layout lets the asm inner -// loop reach all 16 (pos, plane) entries for a single src byte via -// 8-bit displacements off (a5, d4.w) without any LEA between reads. -static uint8_t gC2pLut[4 * 1024]; -static bool gC2pLutReady = false; - static bool paletteOrScbChanged(const SurfaceT *src); -static void initC2pLut(void); - -// Provided by src/port/amiga/c2p.s. -extern void chunkyToPlanarRow(const uint8_t *src, - uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3, - uint16_t numPlanarBytes, - const uint8_t *lut); // ----- Internal helpers (alphabetical) ----- -// Build the 4 KB chunky-to-planar lookup table consumed by -// chunkyToPlanarRow. 
For each (pos, plane, src) tuple, store the -// bit contribution that source byte `src` makes to plane `plane` -// when it sits at byte-position `pos` (0..3) within a 4-byte -// (8-pixel) planar group: -// -// - src high nibble = leftmost pixel -> plane bit (7 - 2*pos) -// - src low nibble = rightmost pixel -> plane bit (6 - 2*pos) -static void initC2pLut(void) { - uint16_t pos; - uint16_t plane; - uint16_t src; - uint8_t highShift; - uint8_t lowShift; - uint8_t highBit; - uint8_t lowBit; - - if (gC2pLutReady) { - return; - } - for (src = 0; src < 256; src++) { - for (pos = 0; pos < 4; pos++) { - highShift = (uint8_t)(7 - 2 * pos); - lowShift = (uint8_t)(6 - 2 * pos); - for (plane = 0; plane < 4; plane++) { - highBit = (uint8_t)(((src >> 4) >> plane) & 1); - lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1); - gC2pLut[src * 16 + pos * 4 + plane] = - (uint8_t)((highBit << highShift) | (lowBit << lowShift)); - } - } - } - gC2pLutReady = true; -} - - -// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own -// per-row chunkyToPlanarRow loop -- the only code path that still -// converts chunky to planar today, since asset loading is the only -// surface mutation that doesn't go through a planar-aware primitive.) - - // Build a user copper list for per-scanline palette (SCB emulation). // One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is // stored in gNewUCL until installCopperList swaps it onto the screen. @@ -1358,35 +1299,6 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1 } -/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes - * from a freshly-loaded chunky pixel buffer (s->pixels). 
*/ -static void amigaPopulatePlanesFromChunky(SurfaceT *s) { - AmigaPlanarT *pd; - int16_t y; - const uint8_t *srcLine; - UBYTE *p0; - UBYTE *p1; - UBYTE *p2; - UBYTE *p3; - - pd = (AmigaPlanarT *)s->portData; - if (pd == NULL) { - return; - } - if (!gC2pLutReady) { - initC2pLut(); - } - for (y = 0; y < SURFACE_HEIGHT; y++) { - srcLine = &s->pixels[y * SURFACE_BYTES_PER_ROW]; - p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut); - } -} - - // Phase 6 planar dual-write for sprite draw. Walks the sprite's // chunky tile data with the same clipping the cross-platform code // applies, calling amigaPlanarSetPixel for every non-transparent @@ -2118,7 +2030,9 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { /* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes - * (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky. + * (per plane, 4 planes). Used by halSurfaceHash to fold the planar + * surface into the same byte-stream the chunky ports hash, so cross- + * port hash comparisons stay valid. * Walks 8 pixels per planar-byte column; per pixel assembles nibble * from 4 plane bits. Output: 4 chunky bytes per planar-byte column * (since 8 pixels = 4 chunky bytes at 2px/byte). */ @@ -2204,62 +2118,35 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { } -bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { +// On-disk format is the Amiga's native plane-major buffer: planes +// 0..3 written sequentially, AMIGA_PLANE_SIZE bytes each. 
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) { AmigaPlanarT *pd; - uint8_t *scratch; - uint8_t *srcLine; - int16_t y; - UBYTE *p0; - UBYTE *p1; - UBYTE *p2; - UBYTE *p3; - bool ok; + uint8_t i; pd = (AmigaPlanarT *)dst->portData; if (pd == NULL) { return false; } - /* fread the chunky file payload into a scratch buffer, then c2p - * directly into our planes. The scratch is a one-shot AllocMem - * (PUBLIC, not chip) since chunkyToPlanarRow only reads it. */ - scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC); - if (scratch == NULL) { - return false; - } - ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE); - if (ok) { - if (!gC2pLutReady) { - initC2pLut(); - } - for (y = 0; y < SURFACE_HEIGHT; y++) { - srcLine = &scratch[y * SURFACE_BYTES_PER_ROW]; - p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW; - chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut); + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (fread(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) { + return false; } } - FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE); - return ok; + return true; } -bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { +bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) { AmigaPlanarT *pd; - uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; - int16_t y; + uint8_t i; pd = (AmigaPlanarT *)src->portData; if (pd == NULL) { return false; } - /* Per row: derive chunky from planes, write 160 bytes. Less - * efficient than a single fwrite of a full buffer but avoids - * needing a 32 KB scratch allocation. 
*/ - for (y = 0; y < SURFACE_HEIGHT; y++) { - amigaPlanesToChunkyRow(pd, y, chunkyRow); - if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) { + for (i = 0; i < AMIGA_BITPLANES; i++) { + if (fwrite(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) { return false; } } diff --git a/src/port/atarist/c2p.s b/src/port/atarist/c2p.s deleted file mode 100644 index c4a2df6..0000000 --- a/src/port/atarist/c2p.s +++ /dev/null @@ -1,188 +0,0 @@ -| Atari ST chunky-to-planar conversion -- 68000 hand-rolled. -| -| Drop-in replacement for hal.c's old c2pRow C inner loop. The C -| version walked every pixel and built each plane word with a -| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit -| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k -| codegen overhead. This rewrite uses a 4 KB lookup table built once -| at HAL init: same layout as the Amiga c2p LUT, so the -| (sourceByte, position, plane) -> 2-bit contribution mapping is -| identical, but the routine packs results into ST word-interleaved -| planar (4 plane words per 16-pixel group) instead of 4 separate -| plane bytes. -| -| Each ST group is 8 source bytes -> 4 plane words. Source byte -| positions 0..3 contribute to the HIGH byte of each plane word -| (bits 15..8); positions 4..7 contribute to the LOW byte (bits -| 7..0). Within a byte, the LUT for (src, bp%4, plane) already -| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT -| entries for both halves -- we just shift d0..d3 left by 8 between -| the halves to move the high-half bits up before the low half ORs -| into the now-empty low byte. -| -| Calling convention: m68k-atari-mint-gcc cdecl. -| Args on stack at 4(sp), 8(sp), ... -| d2-d7, a2-a6 are callee-save. -| No return value. 
-| -| void chunkyToPlanarRowSt(const uint8_t *src, ; 4(sp) - 4bpp packed source row -| uint16_t *dst, ; 8(sp) - planar dest row (uint16_t*) -| uint16_t groupStart, ; 12(sp) - first group index (low word) -| uint16_t groupEnd, ; 16(sp) - one-past-last group index (low word) -| const uint8_t *lut); ; 20(sp) - 4 KB LUT base -| -| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane -| contribution for source byte `src` at byte-position `pos` (0..3 -| within a 4-byte chunk) going to plane `plane` (0..3). All 16 -| (pos, plane) entries for one src byte are contiguous, so the inner -| loop reaches every entry off (a5, d4.w) with an 8-bit displacement -| (0..15) without LEA between reads. -| -| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via -| the gcc driver. - - .text - .globl _chunkyToPlanarRowSt - -| MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes. - .equ SAVED_REGS_SIZE, 44 - - -_chunkyToPlanarRowSt: - movem.l %d2-%d7/%a2-%a6,-(%sp) - - move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src row base - move.l 8+SAVED_REGS_SIZE(%sp),%a1 | dst (uint16_t*) - | Both groupStart and groupEnd are uint16_t but GCC - | promotes them to int and pushes 4 bytes each; the - | low word lives at +2 in big-endian layout. - move.w 12+SAVED_REGS_SIZE+2(%sp),%d6 | groupStart - move.w 16+SAVED_REGS_SIZE+2(%sp),%d7 | groupEnd - move.l 20+SAVED_REGS_SIZE(%sp),%a5 | LUT base - - | Advance src and dst to the first group's data. - | Each group consumes 8 source bytes and produces 4 - | dest words (8 bytes), so both pointers advance by - | groupStart * 8. 
- move.w %d6,%d4 - lsl.w #3,%d4 - add.w %d4,%a0 - add.w %d4,%a1 - - sub.w %d6,%d7 | groupCount = end - start - subq.w #1,%d7 | DBRA bias - bmi .Ldone - -.LgroupLoop: - moveq #0,%d0 | plane 0 acc - moveq #0,%d1 | plane 1 acc - moveq #0,%d2 | plane 2 acc - moveq #0,%d3 | plane 3 acc - - | ===== Source bytes 0..3 -> high byte of each plane word ===== - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 | d4 = src * 16 - or.b 0(%a5,%d4.w),%d0 - or.b 1(%a5,%d4.w),%d1 - or.b 2(%a5,%d4.w),%d2 - or.b 3(%a5,%d4.w),%d3 - - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 4(%a5,%d4.w),%d0 - or.b 5(%a5,%d4.w),%d1 - or.b 6(%a5,%d4.w),%d2 - or.b 7(%a5,%d4.w),%d3 - - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 8(%a5,%d4.w),%d0 - or.b 9(%a5,%d4.w),%d1 - or.b 10(%a5,%d4.w),%d2 - or.b 11(%a5,%d4.w),%d3 - - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 12(%a5,%d4.w),%d0 - or.b 13(%a5,%d4.w),%d1 - or.b 14(%a5,%d4.w),%d2 - or.b 15(%a5,%d4.w),%d3 - - | Move accumulated bits into the HIGH byte of each word. 
- lsl.w #8,%d0 - lsl.w #8,%d1 - lsl.w #8,%d2 - lsl.w #8,%d3 - - | ===== Source bytes 4..7 -> low byte of each plane word ===== - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 0(%a5,%d4.w),%d0 - or.b 1(%a5,%d4.w),%d1 - or.b 2(%a5,%d4.w),%d2 - or.b 3(%a5,%d4.w),%d3 - - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 4(%a5,%d4.w),%d0 - or.b 5(%a5,%d4.w),%d1 - or.b 6(%a5,%d4.w),%d2 - or.b 7(%a5,%d4.w),%d3 - - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 8(%a5,%d4.w),%d0 - or.b 9(%a5,%d4.w),%d1 - or.b 10(%a5,%d4.w),%d2 - or.b 11(%a5,%d4.w),%d3 - - moveq #0,%d4 - move.b (%a0)+,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - add.w %d4,%d4 - or.b 12(%a5,%d4.w),%d0 - or.b 13(%a5,%d4.w),%d1 - or.b 14(%a5,%d4.w),%d2 - or.b 15(%a5,%d4.w),%d3 - - | Store 4 plane words. - move.w %d0,(%a1)+ - move.w %d1,(%a1)+ - move.w %d2,(%a1)+ - move.w %d3,(%a1)+ - - dbra %d7,.LgroupLoop - -.Ldone: - movem.l (%sp)+,%d2-%d7/%a2-%a6 - rts diff --git a/src/port/atarist/circle.s b/src/port/atarist/circle.s index b7c65c9..d7e21cd 100644 --- a/src/port/atarist/circle.s +++ b/src/port/atarist/circle.s @@ -82,11 +82,9 @@ .macro YP_REC slot, signOp, yreg move.l %a4,%d6 \signOp\().w \yreg,%d6 | d6.w = yp - move.w %d6,%d0 - lsl.w #5,%d6 | d6 = yp << 5 - lsl.w #7,%d0 | d0 = yp << 7 - add.w %d6,%d0 | d0 = yp * 160 - move.w %d0,\slot(%sp) + add.w %d6,%d6 | * 2 for word index + move.w (%a6,%d6.w),%d6 | yLut[yp] = yp * 160 + move.w %d6,\slot(%sp) .endm @@ -223,14 +221,21 @@ _surface68kStCircleOutline: moveq #1,%d4 sub.w %d2,%d4 | err = 1 - bx + | a6 = yLut base (yp -> yp*160). Lookup is faster than + | the 4 cyc + 4 cyc + 18 cyc + 22 cyc + 4 cyc shift+add + | chain we used to do per YP_REC. Saved across all 4 + | YP_RECs per Bresenham iter (~120 cyc/iter). + | Shared LUT lives in lineSpan.s; reference absolute. 
+ lea _gStRowOffsetLut,%a6 + | Dispatch on color (low 4 bits) -> one of 16 main loops. moveq #0,%d6 move.b SP_COLOR(%sp),%d6 and.w #0x0F,%d6 add.w %d6,%d6 add.w %d6,%d6 | * 4 for bra.w table - lea .LcoStTable(%pc),%a6 - jmp 0(%a6,%d6.w) + lea .LcoStTable(%pc),%a2 + jmp 0(%a2,%d6.w) .LcoStTable: bra.w .LcoStLoop_0 @@ -280,3 +285,4 @@ bitMaskWordLut: .word 0x0800, 0x0400, 0x0200, 0x0100 .word 0x0080, 0x0040, 0x0020, 0x0010 .word 0x0008, 0x0004, 0x0002, 0x0001 +| (yLut now lives in lineSpan.s as the shared _gStRowOffsetLut) diff --git a/src/port/atarist/fillCircle.s b/src/port/atarist/fillCircle.s index ba508df..7ed25a8 100644 --- a/src/port/atarist/fillCircle.s +++ b/src/port/atarist/fillCircle.s @@ -9,28 +9,16 @@ | Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r) | is fully on-surface. Off-surface circles fall back to the C walker. | +| Phase 10 final: 16-way color dispatch at the OUTER loop. Each color +| variant has its own Bresenham body where SPAN_BODY inlines a hard- +| coded 4-plane mask RMW (no btst, no bsr/rts). Saves ~120 cyc per +| applyMask call (was ~180 via bsr applyMask with runtime btst on d7). +| | ABI: cdecl. d2-d7/a2-a6 callee-save. 
| | void surface68kStFillCircle(uint8_t *base, | uint16_t cx, uint16_t cy, | uint16_t r, uint8_t color); -| -| Register allocation across the loop: -| d2.w = bx (Bresenham, starts at r) -| d3.w = by (Bresenham, starts at 0) -| d4.w = err -| d5.l = loLong (planes 0+1 long template) -| d6.l = hiLong (planes 2+3 long template) -| d7.b = color (low nibble; tested via btst) -| a3 = base -| a4 = scratch / current group pointer -| d0,d1 = scratch -| -| Stack scratch (8 bytes at 0(sp)..7(sp)): -| 0..1 leftMask (word; per pair) -| 2..3 rightMask (word; per pair) -| 4..5 numGroups (word; per pair) -| 6..7 groupFirstByteOff (word; per pair) .text @@ -42,7 +30,7 @@ .equ SP_FC_CX, SP_FC_OFF + 4 + 2 .equ SP_FC_CY, SP_FC_OFF + 8 + 2 .equ SP_FC_R, SP_FC_OFF + 12 + 2 - .equ SP_FC_COLOR, SP_FC_OFF + 16 + 3 + .equ SP_FC_COLOR, SP_FC_OFF + 20 + 3 | ---- COMPUTE_PAIR_MASKS macro ----------------------------------- @@ -50,18 +38,15 @@ | Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups, | 6(sp) groupFirstByteOff | Trashes: d0, d1 -| (No labels: straightline.) .macro COMPUTE_PAIR_MASKS move.w %d0,0(%sp) | stash left move.w %d1,2(%sp) | stash right - | groupFirst & groupFirstByteOff move.w %d0,%d1 lsr.w #4,%d1 | groupFirst move.w %d1,%d0 lsl.w #3,%d0 | groupFirstByteOff move.w %d0,6(%sp) - | numGroups = (right >> 4) - groupFirst move.w 2(%sp),%d0 lsr.w #4,%d0 | groupLast sub.w %d1,%d0 | numGroups @@ -81,25 +66,53 @@ .endm -| ---- SPAN_BODY macro -------------------------------------------- -| Render one row span using the pair masks at 0(sp)..7(sp). -| Input: d0.w = y (signed) -| a3 = base, d5 = loLong, d6 = hiLong, d7 = color -| Trashes: d0, d1, a4 -| Macro takes an idx parameter for unique labels. +| ---- APPLY_MASK_INLINE macro ------------------------------------ +| 4-plane mask RMW with HARDCODED color. a4 advances by 8 (postinc). 
+| Inputs: d0.w = mask, a4 = group ptr +| Trashes: d1 (notMask scratch) - .macro SPAN_BODY - | a4 = base + y*160 - ext.l %d0 - move.l %d0,%d1 - lsl.l #5,%d0 - lsl.l #7,%d1 - add.l %d1,%d0 | y*160 - lea 0(%a3,%d0.l),%a4 - | a4 += groupFirstByteOff - moveq #0,%d0 - move.w 6(%sp),%d0 - add.l %d0,%a4 + .macro APPLY_MASK_INLINE color + move.w %d0,%d1 + not.w %d1 + .if ((\color) & 1) + or.w %d0,(%a4)+ + .else + and.w %d1,(%a4)+ + .endif + .if ((\color) & 2) + or.w %d0,(%a4)+ + .else + and.w %d1,(%a4)+ + .endif + .if ((\color) & 4) + or.w %d0,(%a4)+ + .else + and.w %d1,(%a4)+ + .endif + .if ((\color) & 8) + or.w %d0,(%a4)+ + .else + and.w %d1,(%a4)+ + .endif + .endm + + +| ---- SPAN_BODY macro -------------------------------------------- +| Render one row span. Color hardcoded. +| Input: d0.w = y (signed) +| a3 = base, d5 = loLong, d6 = hiLong +| masks at 0..7(sp): leftMask, rightMask, numGroups, groupFirstByteOff +| Trashes: d0, d1, a4 + + .macro SPAN_BODY color + | a4 = base + y*160 + groupFirstByteOff + | y*160 via shared _gStRowOffsetLut (a2 holds lut base). + | byteOff (y*160 + groupFirstByteOff) fits in 16 bits + | (max 31992), so word-only ops + .w-indexed lea. + add.w %d0,%d0 | y * 2 (word index) + move.w (%a2,%d0.w),%d0 | d0 = y * 160 + add.w 6(%sp),%d0 | + groupFirstByteOff + lea 0(%a3,%d0.w),%a4 | numGroups in d1 move.w 4(%sp),%d1 tst.w %d1 @@ -107,15 +120,14 @@ | single-group: combinedMask = leftMask & rightMask move.w 0(%sp),%d0 and.w 2(%sp),%d0 - bsr .Lfc_applyMask + APPLY_MASK_INLINE \color bra.w .Lsb_done\@ .Lsb_multi\@: - | leading mask. applyMask postinc-advances a4 by 8 - | (the 4 plane RMWs each advance by 2 via (a4)+). - | applyMask trashes d1, so reload numGroups after bsr. + | leading mask. APPLY_MASK_INLINE postinc-advances a4 by 8. + | APPLY trashes d1, so reload numGroups after. 
move.w 0(%sp),%d0 - bsr .Lfc_applyMask - move.w 4(%sp),%d1 | reload numGroups + APPLY_MASK_INLINE \color + move.w 4(%sp),%d1 subq.w #1,%d1 | d1 = numMid beq.s .Lsb_skipMid\@ .Lsb_midLoop\@: @@ -126,11 +138,71 @@ .Lsb_skipMid\@: | trailing mask move.w 2(%sp),%d0 - bsr .Lfc_applyMask + APPLY_MASK_INLINE \color .Lsb_done\@: .endm +| ---- CO_BODY macro: per-color full Bresenham loop body ---------- + + .macro CO_BODY color +.Lfc_loop_\color: + cmp.w %d3,%d2 + bcs.w .Lfc_done + + | --- Pair A: x range = (cx - bx, cx + bx) + move.w SP_FC_CX(%sp),%d0 + move.w %d0,%d1 + sub.w %d2,%d0 + add.w %d2,%d1 + COMPUTE_PAIR_MASKS + + | Span A1: y = cy + by + move.w SP_FC_CY(%sp),%d0 + add.w %d3,%d0 + SPAN_BODY \color + + | Span A2: y = cy - by + move.w SP_FC_CY(%sp),%d0 + sub.w %d3,%d0 + SPAN_BODY \color + + | --- Pair B: x range = (cx - by, cx + by) + move.w SP_FC_CX(%sp),%d0 + move.w %d0,%d1 + sub.w %d3,%d0 + add.w %d3,%d1 + COMPUTE_PAIR_MASKS + + | Span B1: y = cy + bx + move.w SP_FC_CY(%sp),%d0 + add.w %d2,%d0 + SPAN_BODY \color + + | Span B2: y = cy - bx + move.w SP_FC_CY(%sp),%d0 + sub.w %d2,%d0 + SPAN_BODY \color + + | --- Bresenham step + addq.w #1,%d3 + tst.w %d4 + bgt.s .Lfc_decBx_\color + add.w %d3,%d4 + add.w %d3,%d4 + addq.w #1,%d4 + bra.w .Lfc_loop_\color +.Lfc_decBx_\color: + subq.w #1,%d2 + add.w %d3,%d4 + add.w %d3,%d4 + sub.w %d2,%d4 + sub.w %d2,%d4 + addq.w #1,%d4 + bra.w .Lfc_loop_\color + .endm + + .globl _surface68kStFillCircle _surface68kStFillCircle: @@ -142,10 +214,11 @@ _surface68kStFillCircle: moveq #0,%d7 move.b SP_FC_COLOR(%sp),%d7 - | LUT bases (PC-relative indexed has only 8-bit - | displacement, so cache full pointers in a-regs). + | LUT bases. a5/a6 = mask LUTs (used by COMPUTE_PAIR_MASKS). + | a2 = shared _gStRowOffsetLut (used by SPAN_BODY for y*160). 
lea leftMaskLut(%pc),%a5 lea rightMaskLut(%pc),%a6 + lea _gStRowOffsetLut,%a2 | loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0) moveq #0,%d5 @@ -174,60 +247,50 @@ _surface68kStFillCircle: moveq #1,%d4 sub.w %d2,%d4 -.Lfc_loop: - cmp.w %d3,%d2 - bcs.w .Lfc_done + | Dispatch on color (low 4 bits) -> 16 specialized loops. + | Use a4 (gets overwritten in SPAN_BODY's first lea) as + | dispatch scratch since a2 now holds yLut for the body. + and.w #0x0F,%d7 + move.w %d7,%d0 + add.w %d0,%d0 + add.w %d0,%d0 | * 4 for bra.w table + lea .Lfc_table(%pc),%a4 + jmp 0(%a4,%d0.w) - | --- Pair A: x range = (cx - bx, cx + bx) - move.w SP_FC_CX(%sp),%d0 - move.w %d0,%d1 - sub.w %d2,%d0 | left = cx - bx - add.w %d2,%d1 | right = cx + bx - COMPUTE_PAIR_MASKS +.Lfc_table: + bra.w .Lfc_loop_0 + bra.w .Lfc_loop_1 + bra.w .Lfc_loop_2 + bra.w .Lfc_loop_3 + bra.w .Lfc_loop_4 + bra.w .Lfc_loop_5 + bra.w .Lfc_loop_6 + bra.w .Lfc_loop_7 + bra.w .Lfc_loop_8 + bra.w .Lfc_loop_9 + bra.w .Lfc_loop_10 + bra.w .Lfc_loop_11 + bra.w .Lfc_loop_12 + bra.w .Lfc_loop_13 + bra.w .Lfc_loop_14 + bra.w .Lfc_loop_15 - | Span A1: y = cy + by - move.w SP_FC_CY(%sp),%d0 - add.w %d3,%d0 - SPAN_BODY - - | Span A2: y = cy - by - move.w SP_FC_CY(%sp),%d0 - sub.w %d3,%d0 - SPAN_BODY - - | --- Pair B: x range = (cx - by, cx + by) - move.w SP_FC_CX(%sp),%d0 - move.w %d0,%d1 - sub.w %d3,%d0 | left = cx - by - add.w %d3,%d1 | right = cx + by - COMPUTE_PAIR_MASKS - - | Span B1: y = cy + bx - move.w SP_FC_CY(%sp),%d0 - add.w %d2,%d0 - SPAN_BODY - - | Span B2: y = cy - bx - move.w SP_FC_CY(%sp),%d0 - sub.w %d2,%d0 - SPAN_BODY - - | --- Bresenham step - addq.w #1,%d3 - tst.w %d4 - bgt.s .Lfc_decBx - add.w %d3,%d4 - add.w %d3,%d4 - addq.w #1,%d4 - bra.w .Lfc_loop -.Lfc_decBx: - subq.w #1,%d2 - add.w %d3,%d4 - add.w %d3,%d4 - sub.w %d2,%d4 - sub.w %d2,%d4 - addq.w #1,%d4 - bra.w .Lfc_loop + CO_BODY 0 + CO_BODY 1 + CO_BODY 2 + CO_BODY 3 + CO_BODY 4 + CO_BODY 5 + CO_BODY 6 + CO_BODY 7 + CO_BODY 8 + CO_BODY 9 + CO_BODY 10 + 
CO_BODY 11 + CO_BODY 12 + CO_BODY 13 + CO_BODY 14 + CO_BODY 15 .Lfc_done: @@ -236,46 +299,6 @@ _surface68kStFillCircle: rts -| ---- Apply 4-plane mask at (a4) ------------------------------- -| Input: d0.w = mask, d7.b = color, a4 = group ptr -| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8. -| Trashes: d0, d1 -| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane -| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc). - -.Lfc_applyMask: - move.w %d0,%d1 - not.w %d1 | d1 = notMask - btst #0,%d7 - beq.s .Lfc_am0a - or.w %d0,(%a4)+ - bra.s .Lfc_am1 -.Lfc_am0a: - and.w %d1,(%a4)+ -.Lfc_am1: - btst #1,%d7 - beq.s .Lfc_am1a - or.w %d0,(%a4)+ - bra.s .Lfc_am2 -.Lfc_am1a: - and.w %d1,(%a4)+ -.Lfc_am2: - btst #2,%d7 - beq.s .Lfc_am2a - or.w %d0,(%a4)+ - bra.s .Lfc_am3 -.Lfc_am2a: - and.w %d1,(%a4)+ -.Lfc_am3: - btst #3,%d7 - beq.s .Lfc_am3a - or.w %d0,(%a4)+ - rts -.Lfc_am3a: - and.w %d1,(%a4)+ - rts - - .align 2 | leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15) leftMaskLut: diff --git a/src/port/atarist/hal.c b/src/port/atarist/hal.c index 77a5c5c..bf68308 100644 --- a/src/port/atarist/hal.c +++ b/src/port/atarist/hal.c @@ -2,7 +2,7 @@ // // M2 scope: // * XBIOS Setscreen to ST low-res (320x200x16, mode 0). -// * Chunky 4bpp to word-interleaved ST planar c2p at present time. +// * Word-interleaved ST planar buffer copied to the screen at present. // // M2.5 scope (per-band palette / SCB emulation): // * halPresent scans the SurfaceT's SCB array and builds a compact @@ -136,17 +136,9 @@ static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPl } static uint16_t quantizeColorToSt(uint16_t orgb); static void flattenScbPalettes(const SurfaceT *src); -static void initC2pLut(void); static void writeDiagnostics(void); static long writePrevPaletteRegs(void); -// Provided by src/port/atarist/c2p.s. 
-extern void chunkyToPlanarRowSt(const uint8_t *src, - uint16_t *dst, - uint16_t groupStart, - uint16_t groupEnd, - const uint8_t *lut); - static __attribute__((interrupt_handler)) void timerBIsr(void); static __attribute__((interrupt_handler)) void vblIsr(void); static void buildTransitions(const SurfaceT *src); @@ -201,72 +193,11 @@ static void (*gOldTimerBVec)(void) = NULL; // SCB; neither is cheap on a 7 MHz 68000. In the typical game loop // (and every frame of the keys demo after the initial paint) SCB and // palette never change, so caching and skipping those passes keeps -// rect presents down to just the c2p work. +// rect presents down to just the screen blit. static uint8_t gCachedScb [SURFACE_HEIGHT]; static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; static bool gCacheValid = false; -// 256-long plane-spread LUT for the asm sprite SAVE path (defined in -// spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each -// of b's 8 bits is placed at the bit-0 position of the corresponding -// pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT -// entry left by N to get plane N's contribution; OR'd across 4 planes -// gives the full chunky long. Initialized lazily. -// -// LUT used by surface68kStSpriteSaveByteAligned. The asm reads via -// `move.l (a_ptr, d0.l), d4` which requires the LUT to be long- -// aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse, -// the cascading offsets from the odd-sized gC2pLut put even -// `uint32_t` BSS slots at addr mod 4 == 2. -// -// Fix: malloc the LUT. mintlib's malloc returns long-aligned memory. -// The pointer is passed to the asm via the C-side wrapper (so the -// asm reads it from the stack, where it's guaranteed long-aligned -// regardless of where the static pointer slot lives). 
-static uint32_t *gStPlaneSpreadLutPtr = NULL; -static bool gStPlaneSpreadLutReady = false; - -static bool initStPlaneSpreadLut(void) { - int b; - int i; - - if (gStPlaneSpreadLutReady) { - return true; - } - gStPlaneSpreadLutPtr = (uint32_t *)malloc(256 * sizeof(uint32_t)); - if (gStPlaneSpreadLutPtr == NULL) { - return false; - } - - for (b = 0; b < 256; b++) { - uint32_t v = 0u; - for (i = 0; i < 8; i++) { - if (b & (0x80 >> i)) { - int byteIdx = i >> 1; - int isHigh = ((i & 1) == 0); - int bitInLong = (3 - byteIdx) * 8 + (isHigh ? 4 : 0); - v |= (uint32_t)1u << bitInLong; - } - } - gStPlaneSpreadLutPtr[b] = v; - } - gStPlaneSpreadLutReady = true; - return true; -} - - -// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt -// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] -// = the 2-bit plane-byte contribution for source byte `src` at -// byte-position `pos` (0..3 within a 4-byte chunk) going to plane -// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so -// the same table feeds both halves of an ST plane word: positions -// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low -// byte. Built once by initC2pLut on the first halPresent call. -/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */ -uint8_t gC2pLut[4 * 1024]; -static bool gC2pLutReady = false; - // ----- Internal helpers (alphabetical) ----- // Scan the surface's SCB and record one transition entry for each @@ -350,37 +281,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) { } -// Build the 4 KB chunky-to-planar lookup table consumed by -// chunkyToPlanarRowSt. Same layout/contents as the Amiga c2p LUT; -// see src/port/atarist/c2p.s for the addressing math. 
-static void initC2pLut(void) { - uint16_t pos; - uint16_t plane; - uint16_t src; - uint8_t highShift; - uint8_t lowShift; - uint8_t highBit; - uint8_t lowBit; - - if (gC2pLutReady) { - return; - } - for (src = 0; src < 256; src++) { - for (pos = 0; pos < 4; pos++) { - highShift = (uint8_t)(7 - 2 * pos); - lowShift = (uint8_t)(6 - 2 * pos); - for (plane = 0; plane < 4; plane++) { - highBit = (uint8_t)(((src >> 4) >> plane) & 1); - lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1); - gC2pLut[src * 16 + pos * 4 + plane] = - (uint8_t)((highBit << highShift) | (lowBit << lowShift)); - } - } - } - gC2pLutReady = true; -} - - // 12-bit $0RGB to STF 9-bit palette register (drops the low bit of // each 4-bit channel). static uint16_t quantizeColorToSt(uint16_t orgb) { @@ -619,11 +519,8 @@ void halPresent(const SurfaceT *src) { } refreshPaletteStateIfNeeded(src); - // Phase 9: planar shadow -> screen RAM. Same dirty-word band - // tracking the c2p path used; just memcpy the planar bytes for - // each band instead of running c2p on the chunky shadow. Each - // dirty word covers 4 pixels = ?of one group = quarter of an - // 8-byte group. We round to whole groups (8 bytes each) for a + // Planar buffer -> screen RAM. Each dirty word covers 4 pixels + // (a quarter of an 8-byte group). Round to whole groups for a // simple aligned memcpy, since planar groups are the natural // copy unit. 
for (y = 0; y < SURFACE_HEIGHT; y++) { @@ -720,8 +617,11 @@ extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color); extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color); extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong); -extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut); -extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const uint8_t *c2pLut); +extern void surface68kStTileFill8x8(uint8_t *firstGroupPtr, uint16_t mask, uint8_t color); +extern void surface68kStSprite16x16Save(uint8_t *base, uint16_t x, uint16_t y, uint8_t *dstBuf); +extern void surface68kStSprite16x16Restore(uint8_t *base, uint16_t x, uint16_t y, const uint8_t *srcBuf); +extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes); +extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes); // Phase 9: clear the entire planar buffer to a 4-bit color. Build an @@ -1262,17 +1162,12 @@ void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex) group = (uint16_t)((uint16_t)bx >> 1); halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu; gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP; - surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex); + /* Phase 10 final: specialized 8x8 unrolled tile-fill skips the + * generic FRG_LOOP's per-row subq+bne overhead. 
*/ + surface68kStTileFill8x8(gp, halfMask, colorIndex); } -// Phase 10: group-aware tile paste. Per row: extract 8 pixels from -// 4 chunky bytes, build 4 plane bytes (one per plane), drop them -// into the high or low half of the 4 plane words at this group -- -// 4 word RMWs per row instead of 64 per-pixel calls. -static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; - - // Phase 10: tile paste/snap reuse the asm sprite save/restore // helpers -- identical per-row work patterns at byte-aligned // positions. Width 8 = single tile column = single half-group @@ -1301,14 +1196,25 @@ void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *ti + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP + (uint16_t)(bx & 1u); - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - dstAddr[0] = tileBytes[0]; - dstAddr[2] = tileBytes[1]; - dstAddr[4] = tileBytes[2]; - dstAddr[6] = tileBytes[3]; - dstAddr += ST_BYTES_PER_ROW; - tileBytes += TILE_BYTES_PER_ROW; - } + (void)row; +#define ST_TILE_PASTE_ROW \ + do { \ + dstAddr[0] = tileBytes[0]; \ + dstAddr[2] = tileBytes[1]; \ + dstAddr[4] = tileBytes[2]; \ + dstAddr[6] = tileBytes[3]; \ + dstAddr += ST_BYTES_PER_ROW; \ + tileBytes += TILE_BYTES_PER_ROW; \ + } while (0) + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; + ST_TILE_PASTE_ROW; +#undef ST_TILE_PASTE_ROW } @@ -1331,136 +1237,25 @@ void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *til + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP + (uint16_t)(bx & 1u); - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - tileOut[0] = srcAddr[0]; - tileOut[1] = srcAddr[2]; - tileOut[2] = srcAddr[4]; - tileOut[3] = srcAddr[6]; - srcAddr += ST_BYTES_PER_ROW; - tileOut += TILE_BYTES_PER_ROW; - } -} - - -/* Slow-path C versions kept (renamed) for reference; not in the - * 
active call chain. */ -static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) { - StPlanarT *pd; - uint16_t group; - uint16_t halfMask; - uint16_t notHalfMask; - bool isHigh; - uint8_t *rowBase; - int16_t row; - int16_t pix; - uint16_t *pw; - uint8_t b; - uint8_t color; - uint8_t pb0; - uint8_t pb1; - uint8_t pb2; - uint8_t pb3; - uint8_t bit; - - if (dst == NULL || chunkyTile == NULL) { - return; - } - pd = (StPlanarT *)dst->portData; - if (pd == NULL) { - return; - } - group = (uint16_t)((uint16_t)bx >> 1); - isHigh = ((bx & 1u) == 0u); - halfMask = isHigh ? 0xFF00u : 0x00FFu; - notHalfMask = (uint16_t)~halfMask; - rowBase = pd->base - + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW - + group * ST_BYTES_PER_GROUP; - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - pb0 = pb1 = pb2 = pb3 = 0u; - for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) { - b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)]; - color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4); - bit = kStTileBitLut[pix]; - if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); } - if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); } - if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); } - if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); } - } - pw = (uint16_t *)rowBase; - if (isHigh) { - pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8)); - pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8)); - pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8)); - pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8)); - } else { - pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0); - pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1); - pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2); - pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3); - } - rowBase += ST_BYTES_PER_ROW; - } -} - - -// Phase 10: group-aware tile snap. 
Read 4 plane half-words for the -// row's group, distribute the 8 plane bits per plane into chunky -// nibbles. 4 word reads per row + 4 chunky bytes per row, no -// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes -// above; kept for reference as the C-only fallback. -static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) { - const StPlanarT *pd; - uint16_t group; - uint16_t halfShift; - const uint8_t *rowBase; - int16_t row; - int16_t pair; - const uint16_t *pw; - uint8_t pb0; - uint8_t pb1; - uint8_t pb2; - uint8_t pb3; - uint8_t bitHi; - uint8_t bitLo; - uint8_t hi; - uint8_t lo; - - if (src == NULL || chunkyTileOut == NULL) { - return; - } - pd = (const StPlanarT *)src->portData; - if (pd == NULL) { - return; - } - group = (uint16_t)((uint16_t)bx >> 1); - halfShift = ((bx & 1u) == 0u) ? 8u : 0u; - rowBase = pd->base - + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW - + group * ST_BYTES_PER_GROUP; - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - pw = (const uint16_t *)rowBase; - pb0 = (uint8_t)(pw[0] >> halfShift); - pb1 = (uint8_t)(pw[1] >> halfShift); - pb2 = (uint8_t)(pw[2] >> halfShift); - pb3 = (uint8_t)(pw[3] >> halfShift); - for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) { - bitHi = kStTileBitLut[pair * 2]; - bitLo = kStTileBitLut[pair * 2 + 1]; - hi = 0u; - lo = 0u; - if (pb0 & bitHi) hi = (uint8_t)(hi | 1u); - if (pb1 & bitHi) hi = (uint8_t)(hi | 2u); - if (pb2 & bitHi) hi = (uint8_t)(hi | 4u); - if (pb3 & bitHi) hi = (uint8_t)(hi | 8u); - if (pb0 & bitLo) lo = (uint8_t)(lo | 1u); - if (pb1 & bitLo) lo = (uint8_t)(lo | 2u); - if (pb2 & bitLo) lo = (uint8_t)(lo | 4u); - if (pb3 & bitLo) lo = (uint8_t)(lo | 8u); - chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo); - } - rowBase += ST_BYTES_PER_ROW; - } + (void)row; +#define ST_TILE_SNAP_ROW \ + do { \ + tileOut[0] = srcAddr[0]; \ + tileOut[1] = srcAddr[2]; \ + tileOut[2] = srcAddr[4]; \ + 
tileOut[3] = srcAddr[6]; \ + srcAddr += ST_BYTES_PER_ROW; \ + tileOut += TILE_BYTES_PER_ROW; \ + } while (0) + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; + ST_TILE_SNAP_ROW; +#undef ST_TILE_SNAP_ROW } @@ -1496,14 +1291,28 @@ void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const Surfac + (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW + dstGroup * ST_BYTES_PER_GROUP + (uint16_t)(dstBx & 1u); - for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) { - dstAddr[0] = srcAddr[0]; /* plane 0 byte (high or low half) */ - dstAddr[2] = srcAddr[2]; /* plane 1 */ - dstAddr[4] = srcAddr[4]; /* plane 2 */ - dstAddr[6] = srcAddr[6]; /* plane 3 */ - srcAddr += ST_BYTES_PER_ROW; - dstAddr += ST_BYTES_PER_ROW; - } + /* gcc-mint -O2 does NOT unroll the 8-iter byte-copy loop, + * leaving cmpl + bnes loop overhead per row. Manual unroll + * drops ~150 cyc/call. (void)row keeps the unused decl quiet. */ + (void)row; +#define ST_TILE_COPY_ROW \ + do { \ + dstAddr[0] = srcAddr[0]; \ + dstAddr[2] = srcAddr[2]; \ + dstAddr[4] = srcAddr[4]; \ + dstAddr[6] = srcAddr[6]; \ + srcAddr += ST_BYTES_PER_ROW; \ + dstAddr += ST_BYTES_PER_ROW; \ + } while (0) + ST_TILE_COPY_ROW; /* row 0 */ + ST_TILE_COPY_ROW; /* row 1 */ + ST_TILE_COPY_ROW; /* row 2 */ + ST_TILE_COPY_ROW; /* row 3 */ + ST_TILE_COPY_ROW; /* row 4 */ + ST_TILE_COPY_ROW; /* row 5 */ + ST_TILE_COPY_ROW; /* row 6 */ + ST_TILE_COPY_ROW; /* row 7 */ +#undef ST_TILE_COPY_ROW } @@ -1792,109 +1601,6 @@ void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBy } -// Phase 10 fast paths for save/restore. Hand-rolled asm -// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <-> -// plane bit transpose via ASL+ROXL and walks rows/tile columns. The -// C wrappers below are kept as a fallback / reference; they're not -// in the critical path now that the asm versions are wired in. 
-static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) { - int16_t bytesPerRow = (int16_t)(w >> 1); - int16_t tileCols = (int16_t)(w >> 3); - const uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; - int16_t row; - int16_t tileCol; - - for (row = 0; row < (int16_t)h; row++) { - uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow]; - for (tileCol = 0; tileCol < tileCols; tileCol++) { - int16_t srcX = (int16_t)(x + tileCol * 8); - uint16_t group = (uint16_t)((uint16_t)srcX >> 4); - uint16_t shift = ((srcX & 8) == 0) ? 8u : 0u; - const uint16_t *pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); - uint8_t pb0 = (uint8_t)(pw[0] >> shift); - uint8_t pb1 = (uint8_t)(pw[1] >> shift); - uint8_t pb2 = (uint8_t)(pw[2] >> shift); - uint8_t pb3 = (uint8_t)(pw[3] >> shift); - int16_t pair; - for (pair = 0; pair < 4; pair++) { - uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2)); - uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1)); - uint8_t hi = 0u; - uint8_t lo = 0u; - if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); } - if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); } - if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); } - if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); } - if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); } - if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); } - if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); } - if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); } - dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo); - } - } - rowBase += ST_BYTES_PER_ROW; - } -} - - -static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) { - int16_t bytesPerRow = (int16_t)(w >> 1); - int16_t tileCols = (int16_t)(w >> 3); - uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW; - int16_t row; - int16_t tileCol; - - for (row = 0; row < (int16_t)h; row++) { - const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * 
(uint16_t)bytesPerRow]; - for (tileCol = 0; tileCol < tileCols; tileCol++) { - uint8_t b0 = srcRow[tileCol * 4 + 0]; - uint8_t b1 = srcRow[tileCol * 4 + 1]; - uint8_t b2 = srcRow[tileCol * 4 + 2]; - uint8_t b3 = srcRow[tileCol * 4 + 3]; - uint8_t pb0 = 0u; - uint8_t pb1 = 0u; - uint8_t pb2 = 0u; - uint8_t pb3 = 0u; - uint8_t c; - int16_t dstX; - uint16_t group; - uint16_t *pw; - uint16_t halfMask; - uint16_t notHalfMask; - - c = (uint8_t)(b0 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u); - c = (uint8_t)(b0 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u); - c = (uint8_t)(b1 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u); - c = (uint8_t)(b1 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u); - c = (uint8_t)(b2 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u); - c = (uint8_t)(b2 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u); - c = (uint8_t)(b3 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u); - c = (uint8_t)(b3 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u); - - dstX 
= (int16_t)(x + tileCol * 8); - group = (uint16_t)((uint16_t)dstX >> 4); - pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP); - if ((dstX & 8) == 0) { - halfMask = 0xFF00u; - pw[0] = (uint16_t)((pw[0] & 0x00FFu) | ((uint16_t)pb0 << 8)); - pw[1] = (uint16_t)((pw[1] & 0x00FFu) | ((uint16_t)pb1 << 8)); - pw[2] = (uint16_t)((pw[2] & 0x00FFu) | ((uint16_t)pb2 << 8)); - pw[3] = (uint16_t)((pw[3] & 0x00FFu) | ((uint16_t)pb3 << 8)); - } else { - halfMask = 0x00FFu; - pw[0] = (uint16_t)((pw[0] & 0xFF00u) | (uint16_t)pb0); - pw[1] = (uint16_t)((pw[1] & 0xFF00u) | (uint16_t)pb1); - pw[2] = (uint16_t)((pw[2] & 0xFF00u) | (uint16_t)pb2); - pw[3] = (uint16_t)((pw[3] & 0xFF00u) | (uint16_t)pb3); - } - (void)halfMask; - (void)notHalfMask; - } - rowBase += ST_BYTES_PER_ROW; - } -} - - // Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies // inline. Each pixel's group address differs only in (x), so we // can compute base+row*160 once per row and just do per-pixel @@ -1916,11 +1622,16 @@ void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t return; } /* Phase 10.5 fast path: byte-aligned, fully on-surface. - * Asm walker does direct planar byte copy (LUT pointer unused). */ + * Specialized 16x16 (the UBER ball-sprite size) skips the asm + * walker's per-row col-init + col-loop-check overhead. */ if ((x & 7) == 0 && (w & 7) == 0 && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) { - surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL); + if (w == 16u && h == 16u) { + surface68kStSprite16x16Save(pd->base, (uint16_t)x, (uint16_t)y, dstPlaneBytes); + } else { + surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes); + } return; } @@ -1980,11 +1691,15 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1 return; } /* Phase 10.5 fast path: byte-aligned, fully on-surface. 
- * Asm walker does direct planar byte copy (LUT pointer unused). */ + * Specialized 16x16 (UBER ball-sprite) skips walker overhead. */ if ((x & 7) == 0 && (w & 7) == 0 && x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH && y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) { - surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL); + if (w == 16u && h == 16u) { + surface68kStSprite16x16Restore(pd->base, (uint16_t)x, (uint16_t)y, srcPlaneBytes); + } else { + surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes); + } return; } @@ -2049,10 +1764,11 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) { } -// Phase 9: derive 160 chunky bytes per row from the word-interleaved -// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's +// Derive 160 chunky bytes per row from the word-interleaved planar +// buffer (20 groups x 4 plane words). Same shape as the Amiga's // amigaPlanesToChunkyRow but per-group instead of per-byte. Used by -// halSurfaceHash and halSurfaceSaveFileChunky. +// halSurfaceHash to fold the planar surface into the same byte stream +// the chunky ports hash, so cross-port hash comparisons stay valid. static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) { uint16_t group; uint16_t p; @@ -2134,58 +1850,27 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { } -// Phase 9: read chunky from file into a temporary scratch buffer, -// then c2p once into the planar shadow. The .joeysurface file format -// is still chunky 4bpp on disk (cross-port asset interchange); the -// in-memory representation is what changes. -bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { +// On-disk format is the ST's native interleaved planar buffer; one +// fread fills it directly, no chunky scratch or c2p step. 
+bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) { StPlanarT *pd; - uint8_t *scratch; - int16_t y; - bool ok; pd = (StPlanarT *)dst->portData; if (pd == NULL) { return false; } - scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE); - if (scratch == NULL) { - return false; - } - ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE); - if (ok) { - if (!gC2pLutReady) { - initC2pLut(); - } - for (y = 0; y < SURFACE_HEIGHT; y++) { - const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW]; - uint16_t *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW]; - chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut); - } - } - free(scratch); - return ok; + return fread(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE; } -// Phase 9: derive chunky bytes from the planar shadow row by row, -// stream to file. Avoids needing a full 32 KB scratch buffer. -bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { +bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) { StPlanarT *pd; - uint8_t chunkyRow[SURFACE_BYTES_PER_ROW]; - int16_t y; pd = (StPlanarT *)src->portData; if (pd == NULL) { return false; } - for (y = 0; y < SURFACE_HEIGHT; y++) { - stPlanarToChunkyRow(pd, y, chunkyRow); - if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) { - return false; - } - } - return true; + return fwrite(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE; } diff --git a/src/port/atarist/lineSpan.s b/src/port/atarist/lineSpan.s index 242b7b4..ce189d8 100644 --- a/src/port/atarist/lineSpan.s +++ b/src/port/atarist/lineSpan.s @@ -50,19 +50,17 @@ | Trashes: d0, d1, a2 .macro DL_PLOT color - | byteOff = y*160 + (x>>4)*8 + | byteOff = y*160 + (x>>4)*8 (fits in 16 bits since + | surface is 32000 bytes < 32K). Skip ext.l + .l add + | + .l indexed lea -- all word-sized ops save 14 cyc/pixel. 
move.w %d3,%d0 - ext.l %d0 - move.l %d0,%d1 - lsl.l #5,%d0 | y << 5 - lsl.l #7,%d1 | y << 7 - add.l %d1,%d0 | d0 = y * 160 + add.w %d0,%d0 | y * 2 (word index) + move.w (%a6,%d0.w),%d0 | d0 = y * 160 move.w %d2,%d1 lsr.w #4,%d1 lsl.w #3,%d1 | (x>>4) * 8 - ext.l %d1 - add.l %d1,%d0 | d0 = byteOff - lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff + add.w %d1,%d0 | d0 = byteOff (fits in 16 bits) + lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff | d1 = bitMask, d0 = notMask move.w %d2,%d1 and.w #15,%d1 @@ -127,9 +125,11 @@ _surface68kStDrawLine: movem.l %d2-%d7/%a2-%a6,-(%sp) lea -SP_LOCAL(%sp),%sp - | Load base & lut. + | Load base & luts. move.l SP_BASE(%sp),%a3 lea bitMaskWordLut(%pc),%a5 + | a6 = yLut base (yp -> yp*160) for use in DL_PLOT. + lea _gStRowOffsetLut(%pc),%a6 | x = x0, y = y0 move.w SP_X0(%sp),%d2 @@ -179,8 +179,8 @@ _surface68kStDrawLine: and.w #0x0F,%d0 add.w %d0,%d0 add.w %d0,%d0 | * 4 for bra.w table - lea .LdlStTable(%pc),%a6 - jmp 0(%a6,%d0.w) + lea .LdlStTable(%pc),%a2 | a2 scratch (a6 holds yLut) + jmp 0(%a2,%d0.w) .LdlStTable: bra.w .LdlStLoop_0 @@ -529,6 +529,129 @@ _surface68kStFillRectSingleGroup: rts +| ---- surface68kStTileFill8x8 --------------------------------------- +| +| Specialized 8x8 single-group fill: 16-way color dispatch + 8 rows +| fully unrolled. Drops the per-row subq+bne overhead that the +| generic FRG_LOOP pays. Used by halTileFillPlanes. +| +| void surface68kStTileFill8x8(uint8_t *firstGroupPtr, +| uint16_t mask, +| uint8_t color); +| +| Per row body: 4 plane RMW with postinc + lea 152(a3),a3 to next +| row. Row 7 skips the trailing lea (a3 not used after). 
+ + .equ SP_TF_SAVED, 16 | d3-d4/a2-a3 = 4 longs + .equ SP_TF_OFF, (SP_TF_SAVED + 4) + .equ SP_TF_PTR, SP_TF_OFF + 0 + .equ SP_TF_MASK, SP_TF_OFF + 4 + 2 + .equ SP_TF_COLOR, SP_TF_OFF + 8 + 3 + + + .macro TF8_ROW_BARE color + .if ((\color) & 1) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 2) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 4) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .if ((\color) & 8) + or.w %d3,(%a3)+ + .else + and.w %d4,(%a3)+ + .endif + .endm + + + .macro TF8_ROW color + TF8_ROW_BARE \color + lea 152(%a3),%a3 + .endm + + + .macro TF8_BODY color +.Ltf8_body_\color: + TF8_ROW \color | row 0 + TF8_ROW \color | row 1 + TF8_ROW \color | row 2 + TF8_ROW \color | row 3 + TF8_ROW \color | row 4 + TF8_ROW \color | row 5 + TF8_ROW \color | row 6 + TF8_ROW_BARE \color | row 7 (no trailing lea) + bra.w .Ltf8_done + .endm + + + .globl _surface68kStTileFill8x8 + +_surface68kStTileFill8x8: + movem.l %d3-%d4/%a2-%a3,-(%sp) + + move.l SP_TF_PTR(%sp),%a3 + move.w SP_TF_MASK(%sp),%d3 + move.w %d3,%d4 + not.w %d4 + + | Color dispatch + moveq #0,%d0 + move.b SP_TF_COLOR(%sp),%d0 + and.w #0x0F,%d0 + add.w %d0,%d0 + add.w %d0,%d0 | * 4 for bra.w table + lea .Ltf8_table(%pc),%a2 + jmp 0(%a2,%d0.w) + +.Ltf8_table: + bra.w .Ltf8_body_0 + bra.w .Ltf8_body_1 + bra.w .Ltf8_body_2 + bra.w .Ltf8_body_3 + bra.w .Ltf8_body_4 + bra.w .Ltf8_body_5 + bra.w .Ltf8_body_6 + bra.w .Ltf8_body_7 + bra.w .Ltf8_body_8 + bra.w .Ltf8_body_9 + bra.w .Ltf8_body_10 + bra.w .Ltf8_body_11 + bra.w .Ltf8_body_12 + bra.w .Ltf8_body_13 + bra.w .Ltf8_body_14 + bra.w .Ltf8_body_15 + + TF8_BODY 0 + TF8_BODY 1 + TF8_BODY 2 + TF8_BODY 3 + TF8_BODY 4 + TF8_BODY 5 + TF8_BODY 6 + TF8_BODY 7 + TF8_BODY 8 + TF8_BODY 9 + TF8_BODY 10 + TF8_BODY 11 + TF8_BODY 12 + TF8_BODY 13 + TF8_BODY 14 + TF8_BODY 15 + +.Ltf8_done: + movem.l (%sp)+,%d3-%d4/%a2-%a3 + rts + + | ---- surface68kStFillRectMulti ------------------------------------- | | Multi-group 
fillRect: groupFirst != groupLast. Caller pre-clips. @@ -782,6 +905,21 @@ frmRightMaskLut: .word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF + .align 2 +| Shared y -> y*160 LUT. Used by drawLine (DL_PLOT), drawCircle +| (YP_REC), fillCircle (SPAN_BODY). 200 words = 400 bytes. +| Replaces a 44-cyc lsl.w #5 + lsl.w #7 + add.w shift chain with +| a 14-cyc indexed-word load. Exported so circle.s and fillCircle.s +| can reference it via absolute addressing without duplication. + .globl _gStRowOffsetLut +_gStRowOffsetLut: + .set li_y, 0 + .rept 200 + .word li_y * 160 + .set li_y, li_y + 1 + .endr + + | ---- surface68kStLongFill ---------------------------------------- | | Bulk long-fill helper for full-row fills (surfaceClear, fillRect diff --git a/src/port/atarist/spriteAsm.s b/src/port/atarist/spriteAsm.s index b1b233c..97969c8 100644 --- a/src/port/atarist/spriteAsm.s +++ b/src/port/atarist/spriteAsm.s @@ -1,30 +1,19 @@ -| ST byte-aligned sprite save / restore via 256-entry plane-spread -| LUT. The LUT entry for each plane byte value is a 32-bit "spread" -| where each plane byte bit lands at the corresponding plane-0 bit -| position of the 4-byte chunky output. For plane N, we shift the -| LUT entry left by N to put bits at the plane-N positions, then OR -| the 4 plane contributions together to get the chunky long. -| -| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut -| in hal.c: -| -| gStPlaneSpreadLut[b] for plane byte b: -| bit i of b (i = 0 = MSB = leftmost pixel) maps to bit -| bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4) -| of the long. Plane 0's bits land at nibble bit 0 of each -| chunky byte; left-shift the LUT entry by N for plane N. +| ST byte-aligned sprite save / restore. Buffer holds plane-major +| bytes: per row, plane0/1/2/3 per tile col, for w/8 tile cols. The +| inner per-tile-col macro is 4 byte copies (no chunky <-> planar +| conversion since the buffer matches the surface's plane layout). | | ABI: cdecl. d2-d7/a2-a6 callee-save. 
C signatures: | | void surface68kStSpriteSaveByteAligned(uint8_t *base, | uint16_t x, uint16_t y, | uint16_t w, uint16_t h, -| uint8_t *dstChunky); +| uint8_t *dstPlaneBytes); | | void surface68kStSpriteRestoreByteAligned(uint8_t *base, | uint16_t x, uint16_t y, | uint16_t w, uint16_t h, -| const uint8_t *srcChunky); +| const uint8_t *srcPlaneBytes); .text @@ -36,19 +25,12 @@ .equ SP_Y, SP_OFF + 8 + 2 .equ SP_W, SP_OFF + 12 + 2 .equ SP_H, SP_OFF + 16 + 2 - .equ SP_CHUNKY, SP_OFF + 20 - .equ SP_LUT, SP_OFF + 24 + .equ SP_BUF, SP_OFF + 20 | Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer. | a0 -> plane 0 byte (high or low half), strides 2 to next plane | a1 -> output planar bytes (advanced by 4) -| a2 -> unused (LUT no longer needed) -| -| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds -| plane-major bytes (per row: plane0, plane1, plane2, plane3 per -| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT -| lookups + shifts + ORs. .macro SAVE_TILECOL move.b (%a0),(%a1)+ | plane 0 @@ -64,13 +46,7 @@ _surface68kStSpriteSaveByteAligned: movem.l %d2-%d7/%a2-%a6,-(%sp) move.l SP_BASE(%sp),%a3 - move.l SP_CHUNKY(%sp),%a1 - | LUT pointer comes in via stack arg -- guaranteed - | long-aligned because gcc passes ptr args via - | move.l on a long-aligned sp slot. Avoids the BSS - | misalignment problem on TOS .PRG (BSS pads only to - | 2 bytes, even uint32_t slots can land at mod-4 = 2). - move.l SP_LUT(%sp),%a2 + move.l SP_BUF(%sp),%a1 move.w SP_W(%sp),%d5 lsr.w #3,%d5 | d5 = tileCols @@ -128,10 +104,6 @@ _surface68kStSpriteSaveByteAligned: | Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes. | a0 -> plane 0 byte (high or low half) | a1 -> input planar bytes (advanced by 4) -| a2 -> unused (LUT no longer needed) -| -| Phase 10.5: dropped chunky -> planar conversion. Buffer layout -| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col. 
.macro RESTORE_TILECOL move.b (%a1)+,(%a0) | plane 0 @@ -147,8 +119,7 @@ _surface68kStSpriteRestoreByteAligned: movem.l %d2-%d7/%a2-%a6,-(%sp) move.l SP_BASE(%sp),%a3 - move.l SP_CHUNKY(%sp),%a1 - move.l SP_LUT(%sp),%a2 | gC2pLut passed in + move.l SP_BUF(%sp),%a1 | tileCols is held in a5 (not d5) because the macro | trashes d5 (uses it for pb3). @@ -200,3 +171,151 @@ _surface68kStSpriteRestoreByteAligned: movem.l (%sp)+,%d2-%d7/%a2-%a6 rts + + +| ---- surface68kStSprite16x16Save / Restore ----------------------- +| +| Specialized 16x16 sprite save/restore: 16 rows fully unrolled, +| 8 byte copies per row (2 tile cols), no col loop. Drops the asm +| walker's per-row col-init + col-loop-check overhead. +| +| void surface68kStSprite16x16Save(uint8_t *base, +| uint16_t x, uint16_t y, +| uint8_t *dstBuf); +| +| void surface68kStSprite16x16Restore(uint8_t *base, +| uint16_t x, uint16_t y, +| const uint8_t *srcBuf); +| +| Caller guarantees x is byte-aligned (x mod 8 == 0). Two halfOff +| variants dispatch on (x & 8): halfOff=0 reads/writes within one +| group (offsets 0/2/4/6 high half + 1/3/5/7 low half). halfOff=1 +| spans two groups (low half of group N + high half of group N+1). 
+ + .equ SP16_SAVED, 12 | d2/a2-a3 = 3 longs + .equ SP16_OFF, (SP16_SAVED + 4) + .equ SP16_BASE, SP16_OFF + 0 + .equ SP16_X, SP16_OFF + 4 + 2 + .equ SP16_Y, SP16_OFF + 8 + 2 + .equ SP16_BUF, SP16_OFF + 12 + + +| Macro: setup a0 = base + y*160 + group*8 + halfOff +| Trashes: d0, d1, d2; a0 left at row start + + .macro SP16_SETUP_A0 + move.l SP16_BASE(%sp),%a3 + move.w SP16_X(%sp),%d0 + move.w SP16_Y(%sp),%d1 + + | a0 = base + y*160 + ext.l %d1 + move.l %d1,%d2 + lsl.l #5,%d1 + lsl.l #7,%d2 + add.l %d2,%d1 + lea 0(%a3,%d1.l),%a0 + + | a0 += (x>>4) * 8 + move.w %d0,%d1 + lsr.w #4,%d1 + lsl.w #3,%d1 + ext.l %d1 + add.l %d1,%a0 + + | a0 += halfOff (= (x & 8) >> 3) + and.w #8,%d0 + lsr.w #3,%d0 + ext.l %d0 + add.l %d0,%a0 + | d0 = halfOff (0 or 1) for downstream dispatch + .endm + + + .globl _surface68kStSprite16x16Save + +_surface68kStSprite16x16Save: + movem.l %d2/%a2-%a3,-(%sp) + SP16_SETUP_A0 + move.l SP16_BUF(%sp),%a1 + + tst.w %d0 + bne.w .Lsp16s_low + + | halfOff=0: a0 at high half. Col 0 = high (offsets + | 0,2,4,6); col 1 = low (offsets 1,3,5,7). + .rept 16 + move.b (%a0),(%a1)+ + move.b 2(%a0),(%a1)+ + move.b 4(%a0),(%a1)+ + move.b 6(%a0),(%a1)+ + move.b 1(%a0),(%a1)+ + move.b 3(%a0),(%a1)+ + move.b 5(%a0),(%a1)+ + move.b 7(%a0),(%a1)+ + lea 160(%a0),%a0 + .endr + bra.w .Lsp16s_done + +.Lsp16s_low: + | halfOff=1: a0 at low half (group+1). Col 0 = low of + | this group, offsets 0,2,4,6 from a0. Col 1 = high of + | next group, at offsets 7,9,11,13 from a0. 
+ .rept 16 + move.b (%a0),(%a1)+ + move.b 2(%a0),(%a1)+ + move.b 4(%a0),(%a1)+ + move.b 6(%a0),(%a1)+ + move.b 7(%a0),(%a1)+ + move.b 9(%a0),(%a1)+ + move.b 11(%a0),(%a1)+ + move.b 13(%a0),(%a1)+ + lea 160(%a0),%a0 + .endr + +.Lsp16s_done: + movem.l (%sp)+,%d2/%a2-%a3 + rts + + + .globl _surface68kStSprite16x16Restore + +_surface68kStSprite16x16Restore: + movem.l %d2/%a2-%a3,-(%sp) + SP16_SETUP_A0 + move.l SP16_BUF(%sp),%a1 + + tst.w %d0 + bne.w .Lsp16r_low + + | halfOff=0: write high half (col 0) + low half (col 1). + .rept 16 + move.b (%a1)+,(%a0) + move.b (%a1)+,2(%a0) + move.b (%a1)+,4(%a0) + move.b (%a1)+,6(%a0) + move.b (%a1)+,1(%a0) + move.b (%a1)+,3(%a0) + move.b (%a1)+,5(%a0) + move.b (%a1)+,7(%a0) + lea 160(%a0),%a0 + .endr + bra.w .Lsp16r_done + +.Lsp16r_low: + | halfOff=1 + .rept 16 + move.b (%a1)+,(%a0) + move.b (%a1)+,2(%a0) + move.b (%a1)+,4(%a0) + move.b (%a1)+,6(%a0) + move.b (%a1)+,7(%a0) + move.b (%a1)+,9(%a0) + move.b (%a1)+,11(%a0) + move.b (%a1)+,13(%a0) + lea 160(%a0),%a0 + .endr + +.Lsp16r_done: + movem.l (%sp)+,%d2/%a2-%a3 + rts diff --git a/src/port/dos/hal.c b/src/port/dos/hal.c index d1ca693..7b91f58 100644 --- a/src/port/dos/hal.c +++ b/src/port/dos/hal.c @@ -614,12 +614,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { } -bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { +bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) { return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; } -bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { +bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) { return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; } diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index 237fcab..a41a151 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -395,12 +395,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) { } -bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) { +bool 
halSurfaceLoadFile(SurfaceT *dst, FILE *fp) { return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; } -bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) { +bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) { return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE; }