ST is more or less at parity.

This commit is contained in:
Scott Duensing 2026-05-04 11:06:41 -05:00
parent 818dc801db
commit cf6ae093d3
15 changed files with 966 additions and 1062 deletions

326
README.md
View file

@ -59,6 +59,332 @@ build/<plat>/ per-target build outputs
```
## Public API
Game code includes a single umbrella header:
```c
#include <joey/joey.h>
```
That pulls in every public surface listed below. Full documentation
lives in the per-feature headers under `include/joey/`; what follows
is a quick reference. Every entry point is plain C, no C++ extensions.
### Lifecycle (`joey/core.h`)
```c
typedef struct {
HostModeE hostMode; // HOST_MODE_TAKEOVER or HOST_MODE_OS
uint32_t codegenBytes; // runtime compiled-sprite cache size
uint16_t maxSurfaces; // maximum concurrent surfaces
uint32_t audioBytes; // audio sample / module RAM pool
uint32_t assetBytes; // tileset / sprite / map RAM pool
} JoeyConfigT;
bool joeyInit (const JoeyConfigT *config);
void joeyShutdown (void);
const char *joeyLastError (void);
const char *joeyPlatformName (void);
const char *joeyVersionString(void);
void joeyWaitVBL (void); // block until next VBL
uint16_t joeyFrameCount (void); // monotonic 16-bit frame counter
uint16_t joeyFrameHz (void); // 50 / 60 / 70 depending on port
```
### Surfaces (`joey/surface.h`)
All surfaces are 320x200 4bpp packed (high nibble = left pixel) with
a 200-entry SCB table and 16 palettes of 16 `$0RGB` colors.
```c
#define SURFACE_WIDTH 320
#define SURFACE_HEIGHT 200
#define SURFACE_BYTES_PER_ROW 160
#define SURFACE_PIXELS_SIZE (SURFACE_BYTES_PER_ROW * SURFACE_HEIGHT)
#define SURFACE_PALETTE_COUNT 16
#define SURFACE_COLORS_PER_PALETTE 16
typedef struct SurfaceT SurfaceT; // opaque
SurfaceT *surfaceCreate (void);
void surfaceDestroy(SurfaceT *s);
SurfaceT *stageGet (void); // library back-buffer
void surfaceCopy (SurfaceT *dst, const SurfaceT *src);
bool surfaceSaveFile(const SurfaceT *src, const char *path);
bool surfaceLoadFile(SurfaceT *dst, const char *path);
uint32_t surfaceHash (const SurfaceT *s); // FNV-1a of logical pixels
```
`surfaceSaveFile` writes the surface in **target-native** form. Files
are NOT cross-port portable; the asset pipeline handles conversion.
### Drawing (`joey/draw.h`)
All primitives clip to the surface; off-surface coords are silent
no-ops. Color 0 is plotted normally (use the masked variants if you
need transparency).
```c
void surfaceClear (SurfaceT *s, uint8_t color);
void drawPixel (SurfaceT *s, int16_t x, int16_t y, uint8_t color);
uint8_t samplePixel (const SurfaceT *s, int16_t x, int16_t y);
void drawLine (SurfaceT *s, int16_t x0, int16_t y0,
int16_t x1, int16_t y1, uint8_t color);
void drawRect (SurfaceT *s, int16_t x, int16_t y,
uint16_t w, uint16_t h, uint8_t color);
void fillRect (SurfaceT *s, int16_t x, int16_t y,
uint16_t w, uint16_t h, uint8_t color);
void drawCircle (SurfaceT *s, int16_t cx, int16_t cy,
uint16_t r, uint8_t color);
void fillCircle (SurfaceT *s, int16_t cx, int16_t cy,
uint16_t r, uint8_t color);
void floodFill (SurfaceT *s, int16_t x, int16_t y, uint8_t newColor);
void floodFillBounded (SurfaceT *s, int16_t x, int16_t y,
uint8_t newColor, uint8_t boundaryColor);
void surfaceBlit (SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y);
void surfaceBlitMasked (SurfaceT *dst, const JoeyAssetT *src,
int16_t x, int16_t y, uint8_t transparentIndex);
```
### Palette and SCB (`joey/palette.h`)
Colors are 12-bit `$0RGB`. Color 0 of every palette is forced to
black on `paletteSet`. Each scanline picks one of the 16 palettes
via the SCB.
```c
void paletteSet (SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16);
void paletteGet (const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16);
void scbSet (SurfaceT *s, uint16_t line, uint8_t paletteIndex);
void scbSetRange (SurfaceT *s, uint16_t firstLine, uint16_t lastLine,
uint8_t paletteIndex);
uint8_t scbGet (const SurfaceT *s, uint16_t line);
```
### Tiles (`joey/tile.h`)
A "tile" is just an 8x8-aligned region of any surface. The API moves
32-byte chunks between surfaces and provides a small `TileT` value
type so callers can stash a copy without allocating a scratch surface.
```c
#define TILE_PIXELS_PER_SIDE 8
#define TILE_BYTES_PER_ROW 4
#define TILE_BYTES (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE)
#define TILE_BLOCKS_PER_ROW (SURFACE_WIDTH / TILE_PIXELS_PER_SIDE) // 40
#define TILE_BLOCKS_PER_COL (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE) // 25
#define TILE_NO_GLYPH ((uint16_t)0xFFFFu)
typedef struct TileT { uint8_t pixels[TILE_BYTES]; } TileT;
void tileCopy (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
void tileCopyMasked (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
const SurfaceT *src, uint8_t srcBx, uint8_t srcBy,
uint8_t transparentIndex);
void tileFill (SurfaceT *s, uint8_t bx, uint8_t by, uint8_t color);
void tileSnap (const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out);
void tilePaste (SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in);
void drawText (SurfaceT *dst, uint8_t bx, uint8_t by,
const SurfaceT *fontSurface, const uint16_t *asciiMap,
const char *str);
```
### Sprites (`joey/sprite.h`)
Rectangles of 8x8 tiles drawn at arbitrary pixel positions with
color-0 transparency. Tile data is `widthTiles * heightTiles * 32`
bytes, tile-major 4bpp packed. Sprites can be runtime-compiled
into per-shift code variants for fast draws.
```c
typedef enum { SPRITE_FLAGS_NONE = 0 } SpriteFlagsE;
typedef struct SpriteT SpriteT; // opaque
typedef struct {
SpriteT *sprite;
int16_t x, y;
uint16_t width, height; // pixels
uint8_t *bytes; // caller-owned save-under buffer
uint16_t sizeBytes;
} SpriteBackupT;
SpriteT *spriteCreate (const uint8_t *tileData,
uint8_t widthTiles, uint8_t heightTiles,
SpriteFlagsE flags);
SpriteT *spriteCreateFromSurface (const SurfaceT *src, int16_t x, int16_t y,
uint8_t widthTiles, uint8_t heightTiles,
SpriteFlagsE flags);
SpriteT *spriteLoadFile (const char *path, SpriteFlagsE flags);
SpriteT *spriteFromCompiledMem (const uint8_t *data, uint32_t length,
SpriteFlagsE flags);
bool spriteSaveFile (SpriteT *sp, const char *path);
void spriteDestroy (SpriteT *sp);
bool spriteCompile (SpriteT *sp); // build per-shift fast path
void spritePrewarm (SpriteT *sp); // hint: compile if not already
void spriteDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y);
void spriteSaveUnder (const SurfaceT *s, SpriteT *sp,
int16_t x, int16_t y, SpriteBackupT *backup);
void spriteRestoreUnder (SurfaceT *s, const SpriteBackupT *backup);
void spriteSaveAndDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y,
SpriteBackupT *backup);
void spriteCompact (void); // defrag the codegen arena
uint32_t spriteCodegenBytesUsed (void);
uint32_t spriteCodegenBytesTotal (void);
```
### Assets (`joey/asset.h`)
Small bitmap blits with optional embedded palette, in `.jas` format.
Use embedded `const JoeyAssetT` for ship-with-binary art; use the
loaders for on-disk assets.
```c
typedef struct {
uint16_t width;
uint16_t height;
bool hasPalette;
uint16_t palette[16]; // valid only if hasPalette
const uint8_t *pixels; // 4bpp packed, rowBytes = (width+1)/2
} JoeyAssetT;
JoeyAssetT *joeyAssetLoadFile (const char *path);
JoeyAssetT *joeyAssetFromMem (const uint8_t *data, uint32_t length);
void joeyAssetFree (JoeyAssetT *asset);
void joeyAssetApplyPalette (SurfaceT *dst, uint8_t paletteIndex,
const JoeyAssetT *asset);
```
### Present (`joey/present.h`)
```c
void stagePresent(void);
```
Flips the dirty rows of the stage to the display, then clears dirty
state. Drawing primitives mark dirty as a side effect, so calling
`stagePresent` once at end-of-frame is enough.
### Input (`joey/input.h`)
Call `joeyInputPoll` once per frame, then query the state predicates.
Edge predicates (`*Pressed`, `*Released`) fire only in the frame the
transition happened.
```c
typedef enum { /* KEY_NONE, KEY_A..KEY_Z, KEY_0..KEY_9, KEY_SPACE,
KEY_ESCAPE, KEY_RETURN, KEY_TAB, KEY_BACKSPACE,
KEY_UP/DOWN/LEFT/RIGHT, KEY_LSHIFT/RSHIFT/LCTRL/LALT,
KEY_F1..KEY_F10, KEY_COUNT */ } JoeyKeyE;
typedef enum { MOUSE_BUTTON_NONE, MOUSE_BUTTON_LEFT, MOUSE_BUTTON_RIGHT,
MOUSE_BUTTON_MIDDLE, MOUSE_BUTTON_COUNT } JoeyMouseButtonE;
typedef enum { JOYSTICK_0, JOYSTICK_1, JOYSTICK_COUNT } JoeyJoystickE;
typedef enum { JOY_BUTTON_0, JOY_BUTTON_1, JOY_BUTTON_COUNT } JoeyJoyButtonE;
#define JOYSTICK_AXIS_MAX 127
#define JOYSTICK_AXIS_MIN (-127)
void joeyInputPoll (void);
void joeyWaitForAnyKey (void);
bool joeyKeyDown (JoeyKeyE key);
bool joeyKeyPressed (JoeyKeyE key);
bool joeyKeyReleased (JoeyKeyE key);
int16_t joeyMouseX (void);
int16_t joeyMouseY (void);
bool joeyMouseDown (JoeyMouseButtonE b);
bool joeyMousePressed (JoeyMouseButtonE b);
bool joeyMouseReleased (JoeyMouseButtonE b);
bool joeyJoystickConnected(JoeyJoystickE js);
int8_t joeyJoystickX (JoeyJoystickE js);
int8_t joeyJoystickY (JoeyJoystickE js);
bool joeyJoyDown (JoeyJoystickE js, JoeyJoyButtonE b);
bool joeyJoyPressed (JoeyJoystickE js, JoeyJoyButtonE b);
bool joeyJoyReleased (JoeyJoystickE js, JoeyJoyButtonE b);
void joeyJoystickReset (JoeyJoystickE js, uint8_t deadZone);
```
### Audio (`joey/audio.h`)
4-channel Protracker-style music plus four one-shot SFX slots. Module
data must be the platform-native form produced by `tools/joeymod`
(`.mod` for Amiga/DOS/ST; `.ntp` for IIgs; `.amod` if you want
loop=false on Amiga). A failed `joeyAudioInit` is non-fatal; the rest
of the API stays callable as no-ops.
```c
#define JOEY_AUDIO_SFX_SLOTS 4
bool joeyAudioInit (void);
void joeyAudioShutdown (void);
void joeyAudioPlayMod (const uint8_t *data, uint32_t length, bool loop);
void joeyAudioStopMod (void);
bool joeyAudioIsPlayingMod (void);
void joeyAudioPlaySfx (uint8_t slot, const uint8_t *sample,
uint32_t length, uint16_t rateHz);
void joeyAudioStopSfx (uint8_t slot);
void joeyAudioFrameTick (void);
```
### Debug logging (`joey/debug.h`)
Crash-tracing logger. Writes are buffered and durable across normal
exit; call `joeyLogFlush` ahead of suspected hang points if you want
a guaranteed last-line-on-disk.
```c
void joeyLog (const char *msg);
void joeyLogF (const char *fmt, ...);
void joeyLogFlush(void);
void joeyLogReset(void);
```
Output goes to `joeylog.txt` in the program's working directory.
### Platform macros (`joey/platform.h`)
The build system normally sets the platform via `-D`; auto-detection
from compiler-predefined macros is a fallback. Game code can
conditionally compile on these:
```
JOEYLIB_PLATFORM_IIGS / _AMIGA / _ATARIST / _DOS // exactly one defined
JOEYLIB_CPU_65816 / _68000 / _X86
JOEYLIB_ENDIAN_LITTLE / _BIG
JOEYLIB_NATIVE_CHUNKY / _NATIVE_PLANAR
JOEYLIB_HAS_BLITTER / _HAS_COPPER // Amiga only
JOEYLIB_PLATFORM_NAME // human-readable string
JOEYLIB_VERSION_MAJOR / _MINOR / _PATCH / _STRING
```
## License
TBD.

View file

@ -0,0 +1,28 @@
# DOSBox config: simulate an Intel 386SX-16 (1988), the slowest 386
# desktop CPU JoeyLib could realistically be run on. Use this floor
# to verify the DOS port still hits its frame budget on the bottom of
# the 386 stack rather than coasting on host CPU.
#
# The 386SX is identical to the 386DX in instruction set; the only
# difference is the 16-bit external bus (vs 32-bit on DX), which slows
# memory-bound code. DOSBox does not model the bus split directly --
# the cycles count below approximates the combined 386SX-16 throughput.
#
# Notes:
# core = normal accurate per-instruction cycles, not
# recompiled-to-host (auto / dynamic would
# defeat slow-CPU simulation).
# cputype = 386 386 instruction set (no 486 BSWAP /
# CMPXCHG, no Pentium MMX).
# cycles = fixed 2200 community-standard approximation for
# 386SX-16 throughput in DOSBox.
# DOSBox-Staging deprecates this in favor
# of cpu_cycles, but still accepts it.
# Vanilla DOSBox and DOSBox-X only know
# the old key, so 'cycles' stays for
# cross-fork portability.
[cpu]
core = normal
cputype = 386
cycles = fixed 2200

View file

@ -18,6 +18,7 @@ fi
prog=${1:-pattern}
repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
bin_dir=$repo/build/dos/bin
conf=$repo/scripts/dosbox-386sx16.conf
file=${prog^^}.EXE
if [[ ! -f "$bin_dir/$file" ]]; then
@ -34,7 +35,12 @@ fi
# default capture-on-click behavior fights the VM's grab and mouse
# input is unusable. On plain DOSBox this -set flag is unknown and is
# logged once as a warning, then ignored -- harmless either way.
#
# -conf $conf locks the CPU to a simulated 386SX-16 (the slowest
# realistic 386 desktop). DOSBox layers configs: anything not set in
# our file falls back to the user's main dosbox.conf.
exec dosbox \
-conf "$conf" \
-set "mouse_capture=seamless" \
-c "C:" \
-c "$file" \

View file

@ -140,15 +140,16 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
// s->pixels src->dst; on planar ports there is no chunky to copy
// (planes already covered by halSurfaceCopyPlanes). Chunky ports
// do the memcpy here; Amiga is a no-op.
// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
// fwrite of the pixel data. Chunky ports stream directly to/from
// s->pixels; Amiga uses a scratch buffer + c2p (load) or
// plane->chunky derivation (save).
// halSurfaceLoadFile / halSurfaceSaveFile wrap fread / fwrite of the
// pixel data using each port's native pixel format (chunky on
// IIgs/DOS, interleaved planar on ST, plane-major on Amiga). Files
// written by one port are NOT loadable by another -- conversion is
// the asset pipeline's job.
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
uint32_t halSurfaceHash(const SurfaceT *s);
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp);
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp);
// Present the dirty regions of the source surface to the display.
// The cross-platform stagePresent walks the dirty arrays before

View file

@ -158,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
fclose(fp);
return false;
}
if (!halSurfaceLoadFileChunky(dst, fp)) {
if (!halSurfaceLoadFile(dst, fp)) {
fclose(fp);
return false;
}
@ -186,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
if (fp == NULL) {
return false;
}
if (!halSurfaceSaveFileChunky(src, fp)) {
if (!halSurfaceSaveFile(src, fp)) {
fclose(fp);
return false;
}

View file

@ -1,127 +0,0 @@
| Amiga chunky-to-planar conversion -- 68000 hand-rolled.
|
| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
| 4 KB lookup table built once at HAL init: each (sourceByte, position,
| plane) tuple maps to the plane-byte bit contribution that source
| byte makes when it sits at that position within a 4-byte (8-pixel)
| planar group going to that plane.
|
| Calling convention: m68k-amigaos-gcc cdecl.
| Args on stack at 4(sp), 8(sp), ...
| d2-d7, a2-a6 are callee-save.
| No return value.
|
| void chunkyToPlanarRow(const uint8_t *src, ; 4(sp) - 4bpp packed source row
| uint8_t *p0, ; 8(sp) - plane 0 dest row
| uint8_t *p1, ; 12(sp) - plane 1 dest row
| uint8_t *p2, ; 16(sp) - plane 2 dest row
| uint8_t *p3, ; 20(sp) - plane 3 dest row
| uint16_t n, ; 24(sp) - planar byte count (low word)
| const uint8_t *lut); ; 28(sp) - 4 KB LUT base
|
| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
| for source byte `src` sitting at byte-position `pos` (0..3) within
| its 4-byte planar group, going to plane `plane` (0..3). All 16
| (pos, plane) entries for one src byte are contiguous, so the inner
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
| (0..15) and never has to advance an index register.
|
| Per planar byte we consume 4 source bytes (positions 0..3 of the
| 8-pixel group). For each we compute d4 = src*16 with four add.w's
| (faster than asl.w on 68000) and OR the four plane contributions
| into d0..d3 with byte-displaced (a5,d4.w) reads.
|
| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
| gcc driver.
.text
.globl _chunkyToPlanarRow
| Stack frame size of MOVEM.L block: d2-d7 (6) + a2-a6 (5) = 11 regs
| * 4 bytes = 44 bytes. Args therefore start at the original sp+4
| offset PLUS 44.
.equ SAVED_REGS_SIZE, 44
_chunkyToPlanarRow:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src
move.l 8+SAVED_REGS_SIZE(%sp),%a1 | p0
move.l 12+SAVED_REGS_SIZE(%sp),%a2 | p1
move.l 16+SAVED_REGS_SIZE(%sp),%a3 | p2
move.l 20+SAVED_REGS_SIZE(%sp),%a4 | p3
| n is a uint16_t but GCC promotes to int and pushes a
| full 4 bytes -- the low word lives at +2 in big-endian
| layout.
move.w 24+SAVED_REGS_SIZE+2(%sp),%d7 | planar byte count
move.l 28+SAVED_REGS_SIZE(%sp),%a5 | LUT base
subq.w #1,%d7 | DBRA: count-1
bmi .Ldone | nothing to do
| One iteration: 4 chunky source bytes (8 pixels) in, one byte out
| to each of the four plane rows. d0-d3 accumulate the plane bytes;
| d4 holds src*16, the base index of that source byte's 16 LUT
| entries, so every or.b below is a single indexed read.
.LbyteLoop:
moveq #0,%d0 | plane 0 acc
moveq #0,%d1 | plane 1 acc
moveq #0,%d2 | plane 2 acc
moveq #0,%d3 | plane 3 acc
| ----- Source byte position 0 -----
| moveq clears the full 32-bit register so the (a5,d4.w)
| word-indexed read below can't pick up stale high bits.
moveq #0,%d4
move.b (%a0)+,%d4 | src[0]
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4 | d4 = src * 16
or.b 0(%a5,%d4.w),%d0 | pos0 plane0
or.b 1(%a5,%d4.w),%d1 | pos0 plane1
or.b 2(%a5,%d4.w),%d2 | pos0 plane2
or.b 3(%a5,%d4.w),%d3 | pos0 plane3
| ----- Source byte position 1 -----
moveq #0,%d4
move.b (%a0)+,%d4 | src[1]
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 4(%a5,%d4.w),%d0 | pos1 plane0
or.b 5(%a5,%d4.w),%d1 | pos1 plane1
or.b 6(%a5,%d4.w),%d2 | pos1 plane2
or.b 7(%a5,%d4.w),%d3 | pos1 plane3
| ----- Source byte position 2 -----
moveq #0,%d4
move.b (%a0)+,%d4 | src[2]
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 8(%a5,%d4.w),%d0 | pos2 plane0
or.b 9(%a5,%d4.w),%d1 | pos2 plane1
or.b 10(%a5,%d4.w),%d2 | pos2 plane2
or.b 11(%a5,%d4.w),%d3 | pos2 plane3
| ----- Source byte position 3 -----
moveq #0,%d4
move.b (%a0)+,%d4 | src[3]
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 12(%a5,%d4.w),%d0 | pos3 plane0
or.b 13(%a5,%d4.w),%d1 | pos3 plane1
or.b 14(%a5,%d4.w),%d2 | pos3 plane2
or.b 15(%a5,%d4.w),%d3 | pos3 plane3
| ----- Store plane bytes -----
move.b %d0,(%a1)+
move.b %d1,(%a2)+
move.b %d2,(%a3)+
move.b %d3,(%a4)+
dbra %d7,.LbyteLoop
.Ldone:
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts

View file

@ -115,69 +115,10 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4)));
static bool gCacheValid = false;
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
// the plane-byte bit contribution that source byte `src` makes to
// plane `plane` when it sits at byte-position `pos` within a 4-byte
// (8-pixel) planar group. The src-major layout lets the asm inner
// loop reach all 16 (pos, plane) entries for a single src byte via
// 8-bit displacements off (a5, d4.w) without any LEA between reads.
static uint8_t gC2pLut[4 * 1024];
static bool gC2pLutReady = false;
static bool paletteOrScbChanged(const SurfaceT *src);
static void initC2pLut(void);
// Provided by src/port/amiga/c2p.s.
extern void chunkyToPlanarRow(const uint8_t *src,
uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
uint16_t numPlanarBytes,
const uint8_t *lut);
// ----- Internal helpers (alphabetical) -----
// Build the 4 KB chunky-to-planar lookup table consumed by
// chunkyToPlanarRow. For each (pos, plane, src) tuple, store the
// bit contribution that source byte `src` makes to plane `plane`
// when it sits at byte-position `pos` (0..3) within a 4-byte
// (8-pixel) planar group:
//
// - src high nibble = leftmost pixel -> plane bit (7 - 2*pos)
// - src low nibble = rightmost pixel -> plane bit (6 - 2*pos)
/* Build the 4 KB chunky-to-planar lookup table consumed by
 * chunkyToPlanarRow. Entry gC2pLut[byteVal*16 + bytePos*4 + planeIdx]
 * holds the bit contribution source byte `byteVal` makes to plane
 * `planeIdx` when it sits at byte-position `bytePos` (0..3) within a
 * 4-byte (8-pixel) planar group:
 *
 *   - high nibble = leftmost pixel  -> plane bit (7 - 2*bytePos)
 *   - low nibble  = rightmost pixel -> plane bit (6 - 2*bytePos)
 *
 * Idempotent: guarded by gC2pLutReady, so repeat calls are no-ops. */
static void initC2pLut(void) {
    uint16_t byteVal;
    uint16_t bytePos;
    uint16_t planeIdx;
    uint8_t  hiNibble;
    uint8_t  loNibble;
    uint8_t  hiShift;
    uint8_t  loShift;
    uint8_t  hiBit;
    uint8_t  loBit;
    if (gC2pLutReady) {
        return;
    }
    for (byteVal = 0; byteVal < 256; byteVal++) {
        hiNibble = (uint8_t)(byteVal >> 4);   /* leftmost pixel of the pair */
        loNibble = (uint8_t)(byteVal & 0x0F); /* rightmost pixel */
        for (bytePos = 0; bytePos < 4; bytePos++) {
            hiShift = (uint8_t)(7 - 2 * bytePos);
            loShift = (uint8_t)(6 - 2 * bytePos);
            for (planeIdx = 0; planeIdx < 4; planeIdx++) {
                hiBit = (uint8_t)((hiNibble >> planeIdx) & 1);
                loBit = (uint8_t)((loNibble >> planeIdx) & 1);
                gC2pLut[byteVal * 16 + bytePos * 4 + planeIdx] =
                    (uint8_t)((hiBit << hiShift) | (loBit << loShift));
            }
        }
    }
    gC2pLutReady = true;
}
// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own
// per-row chunkyToPlanarRow loop -- the only code path that still
// converts chunky to planar today, since asset loading is the only
// surface mutation that doesn't go through a planar-aware primitive.)
// Build a user copper list for per-scanline palette (SCB emulation).
// One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is
// stored in gNewUCL until installCopperList swaps it onto the screen.
@ -1358,35 +1299,6 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
}
/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes
* from a freshly-loaded chunky pixel buffer (s->pixels). */
/* Rebuild all four bitplanes of `s` from its chunky pixel buffer
 * (s->pixels), one row at a time via chunkyToPlanarRow. Lazily
 * builds the c2p LUT on first use. No-op when the surface has no
 * Amiga planar port data attached. */
static void amigaPopulatePlanesFromChunky(SurfaceT *s) {
    AmigaPlanarT  *planar;
    const uint8_t *chunkyLine;
    uint16_t       planeOff;
    int16_t        row;
    planar = (AmigaPlanarT *)s->portData;
    if (planar == NULL) {
        return;
    }
    if (!gC2pLutReady) {
        initC2pLut();
    }
    for (row = 0; row < SURFACE_HEIGHT; row++) {
        chunkyLine = &s->pixels[row * SURFACE_BYTES_PER_ROW];
        planeOff   = (uint16_t)row * AMIGA_BYTES_PER_ROW;
        chunkyToPlanarRow(chunkyLine,
                          planar->planes[0] + planeOff,
                          planar->planes[1] + planeOff,
                          planar->planes[2] + planeOff,
                          planar->planes[3] + planeOff,
                          AMIGA_BYTES_PER_ROW, gC2pLut);
    }
}
// Phase 6 planar dual-write for sprite draw. Walks the sprite's
// chunky tile data with the same clipping the cross-platform code
// applies, calling amigaPlanarSetPixel for every non-transparent
@ -2118,7 +2030,9 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
/* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes
* (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky.
* (per plane, 4 planes). Used by halSurfaceHash to fold the planar
* surface into the same byte-stream the chunky ports hash, so cross-
* port hash comparisons stay valid.
* Walks 8 pixels per planar-byte column; per pixel assembles nibble
* from 4 plane bits. Output: 4 chunky bytes per planar-byte column
* (since 8 pixels = 4 chunky bytes at 2px/byte). */
@ -2204,62 +2118,35 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
}
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
// On-disk format is the Amiga's native plane-major buffer: planes
// 0..3 written sequentially, AMIGA_PLANE_SIZE bytes each.
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
AmigaPlanarT *pd;
uint8_t *scratch;
uint8_t *srcLine;
int16_t y;
UBYTE *p0;
UBYTE *p1;
UBYTE *p2;
UBYTE *p3;
bool ok;
uint8_t i;
pd = (AmigaPlanarT *)dst->portData;
if (pd == NULL) {
return false;
}
/* fread the chunky file payload into a scratch buffer, then c2p
* directly into our planes. The scratch is a one-shot AllocMem
* (PUBLIC, not chip) since chunkyToPlanarRow only reads it. */
scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC);
if (scratch == NULL) {
for (i = 0; i < AMIGA_BITPLANES; i++) {
if (fread(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
return false;
}
ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
if (ok) {
if (!gC2pLutReady) {
initC2pLut();
}
for (y = 0; y < SURFACE_HEIGHT; y++) {
srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
}
}
FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE);
return ok;
return true;
}
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
AmigaPlanarT *pd;
uint8_t chunkyRow[SURFACE_BYTES_PER_ROW];
int16_t y;
uint8_t i;
pd = (AmigaPlanarT *)src->portData;
if (pd == NULL) {
return false;
}
/* Per row: derive chunky from planes, write 160 bytes. Less
* efficient than a single fwrite of a full buffer but avoids
* needing a 32 KB scratch allocation. */
for (y = 0; y < SURFACE_HEIGHT; y++) {
amigaPlanesToChunkyRow(pd, y, chunkyRow);
if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
for (i = 0; i < AMIGA_BITPLANES; i++) {
if (fwrite(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
return false;
}
}

View file

@ -1,188 +0,0 @@
| Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
|
| Drop-in replacement for hal.c's old c2pRow C inner loop. The C
| version walked every pixel and built each plane word with a
| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
| codegen overhead. This rewrite uses a 4 KB lookup table built once
| at HAL init: same layout as the Amiga c2p LUT, so the
| (sourceByte, position, plane) -> 2-bit contribution mapping is
| identical, but the routine packs results into ST word-interleaved
| planar (4 plane words per 16-pixel group) instead of 4 separate
| plane bytes.
|
| Each ST group is 8 source bytes -> 4 plane words. Source byte
| positions 0..3 contribute to the HIGH byte of each plane word
| (bits 15..8); positions 4..7 contribute to the LOW byte (bits
| 7..0). Within a byte, the LUT for (src, bp%4, plane) already
| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
| entries for both halves -- we just shift d0..d3 left by 8 between
| the halves to move the high-half bits up before the low half ORs
| into the now-empty low byte.
|
| Calling convention: m68k-atari-mint-gcc cdecl.
| Args on stack at 4(sp), 8(sp), ...
| d2-d7, a2-a6 are callee-save.
| No return value.
|
| void chunkyToPlanarRowSt(const uint8_t *src, ; 4(sp) - 4bpp packed source row
| uint16_t *dst, ; 8(sp) - planar dest row (uint16_t*)
| uint16_t groupStart, ; 12(sp) - first group index (low word)
| uint16_t groupEnd, ; 16(sp) - one-past-last group index (low word)
| const uint8_t *lut); ; 20(sp) - 4 KB LUT base
|
| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
| contribution for source byte `src` at byte-position `pos` (0..3
| within a 4-byte chunk) going to plane `plane` (0..3). All 16
| (pos, plane) entries for one src byte are contiguous, so the inner
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
| (0..15) without LEA between reads.
|
| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
| the gcc driver.
.text
.globl _chunkyToPlanarRowSt
| MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes.
.equ SAVED_REGS_SIZE, 44
_chunkyToPlanarRowSt:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src row base
move.l 8+SAVED_REGS_SIZE(%sp),%a1 | dst (uint16_t*)
| Both groupStart and groupEnd are uint16_t but GCC
| promotes them to int and pushes 4 bytes each; the
| low word lives at +2 in big-endian layout.
move.w 12+SAVED_REGS_SIZE+2(%sp),%d6 | groupStart
move.w 16+SAVED_REGS_2+2(%sp),%d7 | groupEnd
move.l 20+SAVED_REGS_SIZE(%sp),%a5 | LUT base
| Advance src and dst to the first group's data.
| Each group consumes 8 source bytes and produces 4
| dest words (8 bytes), so both pointers advance by
| groupStart * 8.
move.w %d6,%d4
lsl.w #3,%d4
| NOTE(review): adda.w sign-extends d4 -- assumes groupStart*8
| stays below 32768, which holds for on-surface row sizes; confirm
| if wider rows are ever passed.
add.w %d4,%a0
add.w %d4,%a1
sub.w %d6,%d7 | groupCount = end - start
subq.w #1,%d7 | DBRA bias
bmi .Ldone
| One iteration: 8 chunky source bytes (16 pixels) in, 4 word-
| interleaved plane words out. d0-d3 accumulate the plane words;
| d4 holds src*16, the base index of that source byte's 16 LUT
| entries.
.LgroupLoop:
moveq #0,%d0 | plane 0 acc
moveq #0,%d1 | plane 1 acc
moveq #0,%d2 | plane 2 acc
moveq #0,%d3 | plane 3 acc
| ===== Source bytes 0..3 -> high byte of each plane word =====
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4 | d4 = src * 16
or.b 0(%a5,%d4.w),%d0
or.b 1(%a5,%d4.w),%d1
or.b 2(%a5,%d4.w),%d2
or.b 3(%a5,%d4.w),%d3
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 4(%a5,%d4.w),%d0
or.b 5(%a5,%d4.w),%d1
or.b 6(%a5,%d4.w),%d2
or.b 7(%a5,%d4.w),%d3
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 8(%a5,%d4.w),%d0
or.b 9(%a5,%d4.w),%d1
or.b 10(%a5,%d4.w),%d2
or.b 11(%a5,%d4.w),%d3
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 12(%a5,%d4.w),%d0
or.b 13(%a5,%d4.w),%d1
or.b 14(%a5,%d4.w),%d2
or.b 15(%a5,%d4.w),%d3
| Move accumulated bits into the HIGH byte of each word.
lsl.w #8,%d0
lsl.w #8,%d1
lsl.w #8,%d2
lsl.w #8,%d3
| ===== Source bytes 4..7 -> low byte of each plane word =====
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 0(%a5,%d4.w),%d0
or.b 1(%a5,%d4.w),%d1
or.b 2(%a5,%d4.w),%d2
or.b 3(%a5,%d4.w),%d3
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 4(%a5,%d4.w),%d0
or.b 5(%a5,%d4.w),%d1
or.b 6(%a5,%d4.w),%d2
or.b 7(%a5,%d4.w),%d3
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 8(%a5,%d4.w),%d0
or.b 9(%a5,%d4.w),%d1
or.b 10(%a5,%d4.w),%d2
or.b 11(%a5,%d4.w),%d3
moveq #0,%d4
move.b (%a0)+,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
add.w %d4,%d4
or.b 12(%a5,%d4.w),%d0
or.b 13(%a5,%d4.w),%d1
or.b 14(%a5,%d4.w),%d2
or.b 15(%a5,%d4.w),%d3
| Store 4 plane words.
move.w %d0,(%a1)+
move.w %d1,(%a1)+
move.w %d2,(%a1)+
move.w %d3,(%a1)+
dbra %d7,.LgroupLoop
.Ldone:
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts

View file

@ -82,11 +82,9 @@
.macro YP_REC slot, signOp, yreg
move.l %a4,%d6
\signOp\().w \yreg,%d6 | d6.w = yp
move.w %d6,%d0
lsl.w #5,%d6 | d6 = yp << 5
lsl.w #7,%d0 | d0 = yp << 7
add.w %d6,%d0 | d0 = yp * 160
move.w %d0,\slot(%sp)
add.w %d6,%d6 | * 2 for word index
move.w (%a6,%d6.w),%d6 | yLut[yp] = yp * 160
move.w %d6,\slot(%sp)
.endm
@ -223,14 +221,21 @@ _surface68kStCircleOutline:
moveq #1,%d4
sub.w %d2,%d4 | err = 1 - bx
| a6 = yLut base (yp -> yp*160). Lookup is faster than
| the 4 cyc + 4 cyc + 18 cyc + 22 cyc + 4 cyc shift+add
| chain we used to do per YP_REC. Saved across all 4
| YP_RECs per Bresenham iter (~120 cyc/iter).
| Shared LUT lives in lineSpan.s; reference absolute.
lea _gStRowOffsetLut,%a6
| Dispatch on color (low 4 bits) -> one of 16 main loops.
moveq #0,%d6
move.b SP_COLOR(%sp),%d6
and.w #0x0F,%d6
add.w %d6,%d6
add.w %d6,%d6 | * 4 for bra.w table
lea .LcoStTable(%pc),%a6
jmp 0(%a6,%d6.w)
lea .LcoStTable(%pc),%a2
jmp 0(%a2,%d6.w)
.LcoStTable:
bra.w .LcoStLoop_0
@ -280,3 +285,4 @@ bitMaskWordLut:
.word 0x0800, 0x0400, 0x0200, 0x0100
.word 0x0080, 0x0040, 0x0020, 0x0010
.word 0x0008, 0x0004, 0x0002, 0x0001
| (yLut now lives in lineSpan.s as the shared _gStRowOffsetLut)

View file

@ -9,28 +9,16 @@
| Caller MUST guarantee the bounding box (cx-r, cy-r) to (cx+r, cy+r)
| is fully on-surface. Off-surface circles fall back to the C walker.
|
| Phase 10 final: 16-way color dispatch at the OUTER loop. Each color
| variant has its own Bresenham body where SPAN_BODY inlines a hard-
| coded 4-plane mask RMW (no btst, no bsr/rts). Saves ~120 cyc per
| applyMask call (was ~180 via bsr applyMask with runtime btst on d7).
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStFillCircle(uint8_t *base,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| Register allocation across the loop:
| d2.w = bx (Bresenham, starts at r)
| d3.w = by (Bresenham, starts at 0)
| d4.w = err
| d5.l = loLong (planes 0+1 long template)
| d6.l = hiLong (planes 2+3 long template)
| d7.b = color (low nibble; tested via btst)
| a3 = base
| a4 = scratch / current group pointer
| d0,d1 = scratch
|
| Stack scratch (8 bytes at 0(sp)..7(sp)):
| 0..1 leftMask (word; per pair)
| 2..3 rightMask (word; per pair)
| 4..5 numGroups (word; per pair)
| 6..7 groupFirstByteOff (word; per pair)
.text
@ -42,7 +30,7 @@
.equ SP_FC_CX, SP_FC_OFF + 4 + 2
.equ SP_FC_CY, SP_FC_OFF + 8 + 2
.equ SP_FC_R, SP_FC_OFF + 12 + 2
.equ SP_FC_COLOR, SP_FC_OFF + 16 + 3
.equ SP_FC_COLOR, SP_FC_OFF + 20 + 3
| ---- COMPUTE_PAIR_MASKS macro -----------------------------------
@ -50,18 +38,15 @@
| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
| 6(sp) groupFirstByteOff
| Trashes: d0, d1
| (No labels: straightline.)
.macro COMPUTE_PAIR_MASKS
move.w %d0,0(%sp) | stash left
move.w %d1,2(%sp) | stash right
| groupFirst & groupFirstByteOff
move.w %d0,%d1
lsr.w #4,%d1 | groupFirst
move.w %d1,%d0
lsl.w #3,%d0 | groupFirstByteOff
move.w %d0,6(%sp)
| numGroups = (right >> 4) - groupFirst
move.w 2(%sp),%d0
lsr.w #4,%d0 | groupLast
sub.w %d1,%d0 | numGroups
@ -81,25 +66,53 @@
.endm
| ---- SPAN_BODY macro --------------------------------------------
| Render one row span using the pair masks at 0(sp)..7(sp).
| Input: d0.w = y (signed)
| a3 = base, d5 = loLong, d6 = hiLong, d7 = color
| Trashes: d0, d1, a4
| Macro takes an idx parameter for unique labels.
| ---- APPLY_MASK_INLINE macro ------------------------------------
| 4-plane mask RMW with HARDCODED color. a4 advances by 8 (postinc).
| Inputs: d0.w = mask, a4 = group ptr
| Trashes: d1 (notMask scratch)
.macro SPAN_BODY
| a4 = base + y*160
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0
lsl.l #7,%d1
add.l %d1,%d0 | y*160
lea 0(%a3,%d0.l),%a4
| a4 += groupFirstByteOff
moveq #0,%d0
move.w 6(%sp),%d0
add.l %d0,%a4
| Expansion note: \color is an assemble-time constant, so each of
| the four .if arms below collapses to exactly one instruction --
| either or.w (set that plane's masked pixels) or and.w with the
| complement (clear them). No runtime btst, no branches. The four
| (a4)+ postincrements leave a4 advanced by 8 = one full group.
.macro APPLY_MASK_INLINE color
        move.w %d0,%d1
        not.w %d1 | d1 = ~mask, feeds the and.w (clear) arms
        .if ((\color) & 1) | plane 0 word
        or.w %d0,(%a4)+
        .else
        and.w %d1,(%a4)+
        .endif
        .if ((\color) & 2) | plane 1 word
        or.w %d0,(%a4)+
        .else
        and.w %d1,(%a4)+
        .endif
        .if ((\color) & 4) | plane 2 word
        or.w %d0,(%a4)+
        .else
        and.w %d1,(%a4)+
        .endif
        .if ((\color) & 8) | plane 3 word
        or.w %d0,(%a4)+
        .else
        and.w %d1,(%a4)+
        .endif
.endm
| ---- SPAN_BODY macro --------------------------------------------
| Render one row span. Color hardcoded.
| Input: d0.w = y (signed)
| a3 = base, d5 = loLong, d6 = hiLong
| masks at 0..7(sp): leftMask, rightMask, numGroups, groupFirstByteOff
| Trashes: d0, d1, a4
.macro SPAN_BODY color
| a4 = base + y*160 + groupFirstByteOff
| y*160 via shared _gStRowOffsetLut (a2 holds lut base).
| byteOff (y*160 + groupFirstByteOff) fits in 16 bits
| (max 31992), so word-only ops + .w-indexed lea.
add.w %d0,%d0 | y * 2 (word index)
move.w (%a2,%d0.w),%d0 | d0 = y * 160
add.w 6(%sp),%d0 | + groupFirstByteOff
lea 0(%a3,%d0.w),%a4
| numGroups in d1
move.w 4(%sp),%d1
tst.w %d1
@ -107,15 +120,14 @@
| single-group: combinedMask = leftMask & rightMask
move.w 0(%sp),%d0
and.w 2(%sp),%d0
bsr .Lfc_applyMask
APPLY_MASK_INLINE \color
bra.w .Lsb_done\@
.Lsb_multi\@:
| leading mask. applyMask postinc-advances a4 by 8
| (the 4 plane RMWs each advance by 2 via (a4)+).
| applyMask trashes d1, so reload numGroups after bsr.
| leading mask. APPLY_MASK_INLINE postinc-advances a4 by 8.
| APPLY trashes d1, so reload numGroups after.
move.w 0(%sp),%d0
bsr .Lfc_applyMask
move.w 4(%sp),%d1 | reload numGroups
APPLY_MASK_INLINE \color
move.w 4(%sp),%d1
subq.w #1,%d1 | d1 = numMid
beq.s .Lsb_skipMid\@
.Lsb_midLoop\@:
@ -126,11 +138,71 @@
.Lsb_skipMid\@:
| trailing mask
move.w 2(%sp),%d0
bsr .Lfc_applyMask
APPLY_MASK_INLINE \color
.Lsb_done\@:
.endm
| ---- CO_BODY macro: per-color full Bresenham loop body ----------
| One complete fill-circle Bresenham loop, stamped out once per
| color 0..15 so SPAN_BODY / APPLY_MASK_INLINE can hardcode the
| plane set/clear pattern. Register use (per the routine header):
| d2 = bx, d3 = by, d4 = err. Each iteration paints 4 symmetric
| horizontal spans (two x-ranges x two mirrored y rows). Exits to
| the shared .Lfc_done once bx < by.
.macro CO_BODY color
.Lfc_loop_\color:
        cmp.w %d3,%d2
        bcs.w .Lfc_done | bx < by: all octants covered
        | --- Pair A: x range = (cx - bx, cx + bx)
        move.w SP_FC_CX(%sp),%d0
        move.w %d0,%d1
        sub.w %d2,%d0 | d0 = left = cx - bx
        add.w %d2,%d1 | d1 = right = cx + bx
        COMPUTE_PAIR_MASKS
        | Span A1: y = cy + by
        move.w SP_FC_CY(%sp),%d0
        add.w %d3,%d0
        SPAN_BODY \color
        | Span A2: y = cy - by
        move.w SP_FC_CY(%sp),%d0
        sub.w %d3,%d0
        SPAN_BODY \color
        | --- Pair B: x range = (cx - by, cx + by)
        move.w SP_FC_CX(%sp),%d0
        move.w %d0,%d1
        sub.w %d3,%d0 | d0 = left = cx - by
        add.w %d3,%d1 | d1 = right = cx + by
        COMPUTE_PAIR_MASKS
        | Span B1: y = cy + bx
        move.w SP_FC_CY(%sp),%d0
        add.w %d2,%d0
        SPAN_BODY \color
        | Span B2: y = cy - bx
        move.w SP_FC_CY(%sp),%d0
        sub.w %d2,%d0
        SPAN_BODY \color
        | --- Bresenham step
        addq.w #1,%d3 | by++
        tst.w %d4
        bgt.s .Lfc_decBx_\color | err > 0: also step bx inward
        add.w %d3,%d4 | err += 2*by + 1 (by already bumped)
        add.w %d3,%d4
        addq.w #1,%d4
        bra.w .Lfc_loop_\color
.Lfc_decBx_\color:
        subq.w #1,%d2 | bx--
        add.w %d3,%d4 | err += 2*(by - bx) + 1
        add.w %d3,%d4
        sub.w %d2,%d4
        sub.w %d2,%d4
        addq.w #1,%d4
        bra.w .Lfc_loop_\color
.endm
.globl _surface68kStFillCircle
_surface68kStFillCircle:
@ -142,10 +214,11 @@ _surface68kStFillCircle:
moveq #0,%d7
move.b SP_FC_COLOR(%sp),%d7
| LUT bases (PC-relative indexed has only 8-bit
| displacement, so cache full pointers in a-regs).
| LUT bases. a5/a6 = mask LUTs (used by COMPUTE_PAIR_MASKS).
| a2 = shared _gStRowOffsetLut (used by SPAN_BODY for y*160).
lea leftMaskLut(%pc),%a5
lea rightMaskLut(%pc),%a6
lea _gStRowOffsetLut,%a2
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
moveq #0,%d5
@ -174,60 +247,50 @@ _surface68kStFillCircle:
moveq #1,%d4
sub.w %d2,%d4
.Lfc_loop:
cmp.w %d3,%d2
bcs.w .Lfc_done
| Dispatch on color (low 4 bits) -> 16 specialized loops.
| Use a4 (gets overwritten in SPAN_BODY's first lea) as
| dispatch scratch since a2 now holds yLut for the body.
and.w #0x0F,%d7
move.w %d7,%d0
add.w %d0,%d0
add.w %d0,%d0 | * 4 for bra.w table
lea .Lfc_table(%pc),%a4
jmp 0(%a4,%d0.w)
| --- Pair A: x range = (cx - bx, cx + bx)
move.w SP_FC_CX(%sp),%d0
move.w %d0,%d1
sub.w %d2,%d0 | left = cx - bx
add.w %d2,%d1 | right = cx + bx
COMPUTE_PAIR_MASKS
.Lfc_table:
bra.w .Lfc_loop_0
bra.w .Lfc_loop_1
bra.w .Lfc_loop_2
bra.w .Lfc_loop_3
bra.w .Lfc_loop_4
bra.w .Lfc_loop_5
bra.w .Lfc_loop_6
bra.w .Lfc_loop_7
bra.w .Lfc_loop_8
bra.w .Lfc_loop_9
bra.w .Lfc_loop_10
bra.w .Lfc_loop_11
bra.w .Lfc_loop_12
bra.w .Lfc_loop_13
bra.w .Lfc_loop_14
bra.w .Lfc_loop_15
| Span A1: y = cy + by
move.w SP_FC_CY(%sp),%d0
add.w %d3,%d0
SPAN_BODY
| Span A2: y = cy - by
move.w SP_FC_CY(%sp),%d0
sub.w %d3,%d0
SPAN_BODY
| --- Pair B: x range = (cx - by, cx + by)
move.w SP_FC_CX(%sp),%d0
move.w %d0,%d1
sub.w %d3,%d0 | left = cx - by
add.w %d3,%d1 | right = cx + by
COMPUTE_PAIR_MASKS
| Span B1: y = cy + bx
move.w SP_FC_CY(%sp),%d0
add.w %d2,%d0
SPAN_BODY
| Span B2: y = cy - bx
move.w SP_FC_CY(%sp),%d0
sub.w %d2,%d0
SPAN_BODY
| --- Bresenham step
addq.w #1,%d3
tst.w %d4
bgt.s .Lfc_decBx
add.w %d3,%d4
add.w %d3,%d4
addq.w #1,%d4
bra.w .Lfc_loop
.Lfc_decBx:
subq.w #1,%d2
add.w %d3,%d4
add.w %d3,%d4
sub.w %d2,%d4
sub.w %d2,%d4
addq.w #1,%d4
bra.w .Lfc_loop
CO_BODY 0
CO_BODY 1
CO_BODY 2
CO_BODY 3
CO_BODY 4
CO_BODY 5
CO_BODY 6
CO_BODY 7
CO_BODY 8
CO_BODY 9
CO_BODY 10
CO_BODY 11
CO_BODY 12
CO_BODY 13
CO_BODY 14
CO_BODY 15
.Lfc_done:
@ -236,46 +299,6 @@ _surface68kStFillCircle:
rts
| ---- Apply 4-plane mask at (a4) -------------------------------
| Input: d0.w = mask, d7.b = color, a4 = group ptr
| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
| Trashes: d0, d1
| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
| For each plane n: if color bit n is set, OR the mask into the
| plane word; otherwise AND with ~mask to clear it. Pixels outside
| the mask are left untouched either way.
.Lfc_applyMask:
        move.w %d0,%d1
        not.w %d1 | d1 = notMask
        btst #0,%d7 | plane 0
        beq.s .Lfc_am0a
        or.w %d0,(%a4)+
        bra.s .Lfc_am1
.Lfc_am0a:
        and.w %d1,(%a4)+
.Lfc_am1:
        btst #1,%d7 | plane 1
        beq.s .Lfc_am1a
        or.w %d0,(%a4)+
        bra.s .Lfc_am2
.Lfc_am1a:
        and.w %d1,(%a4)+
.Lfc_am2:
        btst #2,%d7 | plane 2
        beq.s .Lfc_am2a
        or.w %d0,(%a4)+
        bra.s .Lfc_am3
.Lfc_am2a:
        and.w %d1,(%a4)+
.Lfc_am3:
        btst #3,%d7 | plane 3: both arms fall straight out via rts
        beq.s .Lfc_am3a
        or.w %d0,(%a4)+
        rts
.Lfc_am3a:
        and.w %d1,(%a4)+
        rts
.align 2
| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
leftMaskLut:

View file

@ -2,7 +2,7 @@
//
// M2 scope:
// * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
// * Chunky 4bpp to word-interleaved ST planar c2p at present time.
// * Word-interleaved ST planar buffer copied to the screen at present.
//
// M2.5 scope (per-band palette / SCB emulation):
// * halPresent scans the SurfaceT's SCB array and builds a compact
@ -136,17 +136,9 @@ static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPl
}
static uint16_t quantizeColorToSt(uint16_t orgb);
static void flattenScbPalettes(const SurfaceT *src);
static void initC2pLut(void);
static void writeDiagnostics(void);
static long writePrevPaletteRegs(void);
// Provided by src/port/atarist/c2p.s.
extern void chunkyToPlanarRowSt(const uint8_t *src,
uint16_t *dst,
uint16_t groupStart,
uint16_t groupEnd,
const uint8_t *lut);
static __attribute__((interrupt_handler)) void timerBIsr(void);
static __attribute__((interrupt_handler)) void vblIsr(void);
static void buildTransitions(const SurfaceT *src);
@ -201,72 +193,11 @@ static void (*gOldTimerBVec)(void) = NULL;
// SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
// (and every frame of the keys demo after the initial paint) SCB and
// palette never change, so caching and skipping those passes keeps
// rect presents down to just the c2p work.
// rect presents down to just the screen blit.
static uint8_t gCachedScb [SURFACE_HEIGHT];
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
static bool gCacheValid = false;
// LUT for the asm sprite SAVE path (surface68kStSpriteSaveByteAligned,
// defined in spriteAsm.s). For a plane byte b, LUT[b] places each of
// b's 8 bits at the bit-0 position of the corresponding pixel's nibble
// inside a 4-byte chunky long; the asm shifts the entry left by N for
// plane N and ORs the four planes to rebuild the full chunky long.
//
// Heap-allocated rather than BSS on purpose: the asm fetches entries
// with `move.l (a_ptr, d0.l), d4`, which needs long alignment, and TOS
// .PRG BSS only guarantees 2-byte alignment (the odd-sized gC2pLut
// pushed later uint32_t slots to addr mod 4 == 2). mintlib's malloc
// returns long-aligned memory, and the pointer reaches the asm through
// the C wrapper's stack frame, which is always long-aligned.
static uint32_t *gStPlaneSpreadLutPtr = NULL;
static bool gStPlaneSpreadLutReady = false;

// Build the 256-entry plane-spread LUT on first use; later calls are
// no-ops. Returns false only if the one-time allocation fails.
static bool initStPlaneSpreadLut(void) {
    if (gStPlaneSpreadLutReady) {
        return true;
    }
    gStPlaneSpreadLutPtr = malloc(256 * sizeof *gStPlaneSpreadLutPtr);
    if (gStPlaneSpreadLutPtr == NULL) {
        return false;
    }
    for (int byteVal = 0; byteVal < 256; byteVal++) {
        uint32_t spread = 0u;
        for (int pix = 0; pix < 8; pix++) {
            if ((byteVal & (0x80 >> pix)) == 0) {
                continue; // plane bit clear: contributes nothing
            }
            // Pixel `pix` lives in chunky byte pix>>1 (big-endian
            // within the long); even pixels use the high nibble.
            int chunkByte = pix >> 1;
            int nibbleBase = ((pix & 1) == 0) ? 4 : 0;
            spread |= (uint32_t)1u << ((3 - chunkByte) * 8 + nibbleBase);
        }
        gStPlaneSpreadLutPtr[byteVal] = spread;
    }
    gStPlaneSpreadLutReady = true;
    return true;
}
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
// = the 2-bit plane-byte contribution for source byte `src` at
// byte-position `pos` (0..3 within a 4-byte chunk) going to plane
// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
// the same table feeds both halves of an ST plane word: positions
// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
// byte. Built once by initC2pLut on the first halPresent call.
/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */
uint8_t gC2pLut[4 * 1024];
static bool gC2pLutReady = false;
// ----- Internal helpers (alphabetical) -----
// Scan the surface's SCB and record one transition entry for each
@ -350,37 +281,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
}
// Build the 4 KB chunky-to-planar lookup table consumed by
// chunkyToPlanarRowSt. Same layout/contents as the Amiga c2p LUT;
// see src/port/atarist/c2p.s for the addressing math.
// Entry [src*16 + pos*4 + plane] is plane `plane`'s byte contribution
// for source byte `src` at chunk position `pos` (0..3): the high
// nibble's plane bit lands at bit (7 - 2*pos), the low nibble's at
// (6 - 2*pos). Idempotent -- gC2pLutReady makes repeat calls free.
static void initC2pLut(void) {
    uint16_t srcByte;

    if (gC2pLutReady) {
        return;
    }
    for (srcByte = 0; srcByte < 256; srcByte++) {
        uint8_t hiNib = (uint8_t)(srcByte >> 4);   // left pixel of the byte
        uint8_t loNib = (uint8_t)(srcByte & 0x0F); // right pixel
        uint16_t pos;
        for (pos = 0; pos < 4; pos++) {
            uint16_t plane;
            for (plane = 0; plane < 4; plane++) {
                uint8_t hiPart = (uint8_t)(((hiNib >> plane) & 1u) << (7 - 2 * pos));
                uint8_t loPart = (uint8_t)(((loNib >> plane) & 1u) << (6 - 2 * pos));
                gC2pLut[srcByte * 16 + pos * 4 + plane] = (uint8_t)(hiPart | loPart);
            }
        }
    }
    gC2pLutReady = true;
}
// 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
// each 4-bit channel).
static uint16_t quantizeColorToSt(uint16_t orgb) {
@ -619,11 +519,8 @@ void halPresent(const SurfaceT *src) {
}
refreshPaletteStateIfNeeded(src);
// Phase 9: planar shadow -> screen RAM. Same dirty-word band
// tracking the c2p path used; just memcpy the planar bytes for
// each band instead of running c2p on the chunky shadow. Each
// dirty word covers 4 pixels = 1/4 of one group = quarter of an
// 8-byte group. We round to whole groups (8 bytes each) for a
// Planar buffer -> screen RAM. Each dirty word covers 4 pixels
// (a quarter of an 8-byte group). Round to whole groups for a
// simple aligned memcpy, since planar groups are the natural
// copy unit.
for (y = 0; y < SURFACE_HEIGHT; y++) {
@ -720,8 +617,11 @@ extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint
extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color);
extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color);
extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong);
extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut);
extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const uint8_t *c2pLut);
extern void surface68kStTileFill8x8(uint8_t *firstGroupPtr, uint16_t mask, uint8_t color);
extern void surface68kStSprite16x16Save(uint8_t *base, uint16_t x, uint16_t y, uint8_t *dstBuf);
extern void surface68kStSprite16x16Restore(uint8_t *base, uint16_t x, uint16_t y, const uint8_t *srcBuf);
extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
// Phase 9: clear the entire planar buffer to a 4-bit color. Build an
@ -1262,17 +1162,12 @@ void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex)
group = (uint16_t)((uint16_t)bx >> 1);
halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu;
gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP;
surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex);
/* Phase 10 final: specialized 8x8 unrolled tile-fill skips the
* generic FRG_LOOP's per-row subq+bne overhead. */
surface68kStTileFill8x8(gp, halfMask, colorIndex);
}
// Phase 10: group-aware tile paste. Per row: extract 8 pixels from
// 4 chunky bytes, build 4 plane bytes (one per plane), drop them
// into the high or low half of the 4 plane words at this group --
// 4 word RMWs per row instead of 64 per-pixel calls.
static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
// Phase 10: tile paste/snap reuse the asm sprite save/restore
// helpers -- identical per-row work patterns at byte-aligned
// positions. Width 8 = single tile column = single half-group
@ -1301,14 +1196,25 @@ void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *ti
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
+ group * ST_BYTES_PER_GROUP
+ (uint16_t)(bx & 1u);
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
dstAddr[0] = tileBytes[0];
dstAddr[2] = tileBytes[1];
dstAddr[4] = tileBytes[2];
dstAddr[6] = tileBytes[3];
dstAddr += ST_BYTES_PER_ROW;
tileBytes += TILE_BYTES_PER_ROW;
}
(void)row;
#define ST_TILE_PASTE_ROW \
do { \
dstAddr[0] = tileBytes[0]; \
dstAddr[2] = tileBytes[1]; \
dstAddr[4] = tileBytes[2]; \
dstAddr[6] = tileBytes[3]; \
dstAddr += ST_BYTES_PER_ROW; \
tileBytes += TILE_BYTES_PER_ROW; \
} while (0)
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
ST_TILE_PASTE_ROW;
#undef ST_TILE_PASTE_ROW
}
@ -1331,136 +1237,25 @@ void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *til
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
+ group * ST_BYTES_PER_GROUP
+ (uint16_t)(bx & 1u);
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
tileOut[0] = srcAddr[0];
tileOut[1] = srcAddr[2];
tileOut[2] = srcAddr[4];
tileOut[3] = srcAddr[6];
srcAddr += ST_BYTES_PER_ROW;
tileOut += TILE_BYTES_PER_ROW;
}
}
/* Slow-path C versions kept (renamed) for reference; not in the
 * active call chain. */

/* Paste one 8x8 chunky tile (2 px/byte, high nibble = left pixel)
 * into the word-interleaved planar buffer at tile cell (bx, by).
 * Per row: gather the 8 chunky pixels into 4 plane bytes, then RMW
 * them into the high or low half of the group's 4 plane words --
 * 4 word RMWs per row instead of 64 per-pixel calls. */
static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
    StPlanarT *pd;
    uint16_t group;    /* 16-px group holding this tile column */
    uint16_t halfMask; /* 0xFF00 = even bx (high byte), 0x00FF = odd */
    uint16_t notHalfMask;
    bool isHigh;
    uint8_t *rowBase;
    int16_t row;
    int16_t pix;
    uint16_t *pw;
    uint8_t b;
    uint8_t color;
    uint8_t pb0; /* per-plane byte accumulators for one row */
    uint8_t pb1;
    uint8_t pb2;
    uint8_t pb3;
    uint8_t bit;
    if (dst == NULL || chunkyTile == NULL) {
        return;
    }
    pd = (StPlanarT *)dst->portData;
    if (pd == NULL) {
        return;
    }
    group = (uint16_t)((uint16_t)bx >> 1); /* two 8-px tiles per group */
    isHigh = ((bx & 1u) == 0u);
    halfMask = isHigh ? 0xFF00u : 0x00FFu;
    notHalfMask = (uint16_t)~halfMask;
    rowBase = pd->base
            + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
            + group * ST_BYTES_PER_GROUP;
    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
        pb0 = pb1 = pb2 = pb3 = 0u;
        /* Gather this row's 8 pixels into 4 plane bytes. */
        for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) {
            b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)];
            /* Even pixel = high nibble, odd = low nibble. */
            color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4);
            bit = kStTileBitLut[pix];
            if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); }
            if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); }
            if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); }
            if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); }
        }
        /* RMW the group's 4 plane words: preserve the neighboring
         * tile's half, drop in this tile's plane byte. */
        pw = (uint16_t *)rowBase;
        if (isHigh) {
            pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8));
            pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8));
            pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8));
            pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8));
        } else {
            pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0);
            pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1);
            pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2);
            pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3);
        }
        rowBase += ST_BYTES_PER_ROW;
    }
}
// Phase 10: group-aware tile snap. Read 4 plane half-words for the
// row's group, distribute the 8 plane bits per plane into chunky
// nibbles. 4 word reads per row + 4 chunky bytes per row, no
// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes
// above; kept for reference as the C-only fallback.
static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
const StPlanarT *pd;
uint16_t group;
uint16_t halfShift;
const uint8_t *rowBase;
int16_t row;
int16_t pair;
const uint16_t *pw;
uint8_t pb0;
uint8_t pb1;
uint8_t pb2;
uint8_t pb3;
uint8_t bitHi;
uint8_t bitLo;
uint8_t hi;
uint8_t lo;
if (src == NULL || chunkyTileOut == NULL) {
return;
}
pd = (const StPlanarT *)src->portData;
if (pd == NULL) {
return;
}
group = (uint16_t)((uint16_t)bx >> 1);
halfShift = ((bx & 1u) == 0u) ? 8u : 0u;
rowBase = pd->base
+ (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
+ group * ST_BYTES_PER_GROUP;
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
pw = (const uint16_t *)rowBase;
pb0 = (uint8_t)(pw[0] >> halfShift);
pb1 = (uint8_t)(pw[1] >> halfShift);
pb2 = (uint8_t)(pw[2] >> halfShift);
pb3 = (uint8_t)(pw[3] >> halfShift);
for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) {
bitHi = kStTileBitLut[pair * 2];
bitLo = kStTileBitLut[pair * 2 + 1];
hi = 0u;
lo = 0u;
if (pb0 & bitHi) hi = (uint8_t)(hi | 1u);
if (pb1 & bitHi) hi = (uint8_t)(hi | 2u);
if (pb2 & bitHi) hi = (uint8_t)(hi | 4u);
if (pb3 & bitHi) hi = (uint8_t)(hi | 8u);
if (pb0 & bitLo) lo = (uint8_t)(lo | 1u);
if (pb1 & bitLo) lo = (uint8_t)(lo | 2u);
if (pb2 & bitLo) lo = (uint8_t)(lo | 4u);
if (pb3 & bitLo) lo = (uint8_t)(lo | 8u);
chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo);
}
rowBase += ST_BYTES_PER_ROW;
}
(void)row;
#define ST_TILE_SNAP_ROW \
do { \
tileOut[0] = srcAddr[0]; \
tileOut[1] = srcAddr[2]; \
tileOut[2] = srcAddr[4]; \
tileOut[3] = srcAddr[6]; \
srcAddr += ST_BYTES_PER_ROW; \
tileOut += TILE_BYTES_PER_ROW; \
} while (0)
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
ST_TILE_SNAP_ROW;
#undef ST_TILE_SNAP_ROW
}
@ -1496,14 +1291,28 @@ void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const Surfac
+ (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW
+ dstGroup * ST_BYTES_PER_GROUP
+ (uint16_t)(dstBx & 1u);
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
dstAddr[0] = srcAddr[0]; /* plane 0 byte (high or low half) */
dstAddr[2] = srcAddr[2]; /* plane 1 */
dstAddr[4] = srcAddr[4]; /* plane 2 */
dstAddr[6] = srcAddr[6]; /* plane 3 */
srcAddr += ST_BYTES_PER_ROW;
dstAddr += ST_BYTES_PER_ROW;
}
/* gcc-mint -O2 does NOT unroll the 8-iter byte-copy loop,
* leaving cmpl + bnes loop overhead per row. Manual unroll
* drops ~150 cyc/call. (void)row keeps the unused decl quiet. */
(void)row;
#define ST_TILE_COPY_ROW \
do { \
dstAddr[0] = srcAddr[0]; \
dstAddr[2] = srcAddr[2]; \
dstAddr[4] = srcAddr[4]; \
dstAddr[6] = srcAddr[6]; \
srcAddr += ST_BYTES_PER_ROW; \
dstAddr += ST_BYTES_PER_ROW; \
} while (0)
ST_TILE_COPY_ROW; /* row 0 */
ST_TILE_COPY_ROW; /* row 1 */
ST_TILE_COPY_ROW; /* row 2 */
ST_TILE_COPY_ROW; /* row 3 */
ST_TILE_COPY_ROW; /* row 4 */
ST_TILE_COPY_ROW; /* row 5 */
ST_TILE_COPY_ROW; /* row 6 */
ST_TILE_COPY_ROW; /* row 7 */
#undef ST_TILE_COPY_ROW
}
@ -1792,109 +1601,6 @@ void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBy
}
// Phase 10 fast paths for save/restore. Hand-rolled asm
// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <->
// plane bit transpose via ASL+ROXL and walks rows/tile columns. The
// C wrappers below are kept as a fallback / reference; they're not
// in the critical path now that the asm versions are wired in.

// Reference C fallback: read a w x h pixel block out of the
// word-interleaved planar buffer into chunky 4bpp bytes (2 px/byte,
// high nibble = left pixel). Callers guarantee x is byte-aligned and
// w a multiple of 8 (see the (x & 7) == 0 && (w & 7) == 0 gate in
// halSpriteSavePlanes).
static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) {
    int16_t bytesPerRow = (int16_t)(w >> 1); /* chunky bytes per sprite row */
    int16_t tileCols = (int16_t)(w >> 3);    /* 8-pixel columns per row */
    const uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
    int16_t row;
    int16_t tileCol;
    for (row = 0; row < (int16_t)h; row++) {
        uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
        for (tileCol = 0; tileCol < tileCols; tileCol++) {
            int16_t srcX = (int16_t)(x + tileCol * 8);
            uint16_t group = (uint16_t)((uint16_t)srcX >> 4); /* 16-px group */
            /* Even 8-px column = high byte of each plane word. */
            uint16_t shift = ((srcX & 8) == 0) ? 8u : 0u;
            const uint16_t *pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
            /* One byte per plane covering these 8 pixels. */
            uint8_t pb0 = (uint8_t)(pw[0] >> shift);
            uint8_t pb1 = (uint8_t)(pw[1] >> shift);
            uint8_t pb2 = (uint8_t)(pw[2] >> shift);
            uint8_t pb3 = (uint8_t)(pw[3] >> shift);
            int16_t pair;
            /* Transpose: collect each pixel's 4 plane bits into a
             * nibble, two pixels per output byte. */
            for (pair = 0; pair < 4; pair++) {
                uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2));
                uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1));
                uint8_t hi = 0u;
                uint8_t lo = 0u;
                if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); }
                if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); }
                if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); }
                if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); }
                if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); }
                if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); }
                if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); }
                if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); }
                dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo);
            }
        }
        rowBase += ST_BYTES_PER_ROW;
    }
}
// Reference C fallback: write a w x h chunky 4bpp block (x byte-
// aligned, w a multiple of 8 -- same caller gate as the save path)
// back into the word-interleaved planar buffer. Inverse of
// stSpriteSaveByteAligned.
// Cleanup: dropped the dead `halfMask`/`notHalfMask` locals -- halfMask
// was assigned but never read, and notHalfMask was (void)-discarded
// without ever being assigned.
static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) {
    int16_t bytesPerRow = (int16_t)(w >> 1); /* chunky bytes per sprite row */
    int16_t tileCols = (int16_t)(w >> 3);    /* 8-pixel columns per row */
    uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
    int16_t row;
    int16_t tileCol;
    for (row = 0; row < (int16_t)h; row++) {
        const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
        for (tileCol = 0; tileCol < tileCols; tileCol++) {
            uint8_t b0 = srcRow[tileCol * 4 + 0];
            uint8_t b1 = srcRow[tileCol * 4 + 1];
            uint8_t b2 = srcRow[tileCol * 4 + 2];
            uint8_t b3 = srcRow[tileCol * 4 + 3];
            uint8_t pb0 = 0u;
            uint8_t pb1 = 0u;
            uint8_t pb2 = 0u;
            uint8_t pb3 = 0u;
            uint8_t c;
            int16_t dstX;
            uint16_t group;
            uint16_t *pw;
            /* Scatter each pixel's nibble into the 4 plane bytes;
             * bit 0x80 >> n is pixel n of this 8-pixel column. */
            c = (uint8_t)(b0 >> 4);    if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u);
            c = (uint8_t)(b0 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u);
            c = (uint8_t)(b1 >> 4);    if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u);
            c = (uint8_t)(b1 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u);
            c = (uint8_t)(b2 >> 4);    if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u);
            c = (uint8_t)(b2 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u);
            c = (uint8_t)(b3 >> 4);    if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u);
            c = (uint8_t)(b3 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u);
            dstX = (int16_t)(x + tileCol * 8);
            group = (uint16_t)((uint16_t)dstX >> 4);
            pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
            if ((dstX & 8) == 0) {
                /* Even column: replace the high byte of each plane word. */
                pw[0] = (uint16_t)((pw[0] & 0x00FFu) | ((uint16_t)pb0 << 8));
                pw[1] = (uint16_t)((pw[1] & 0x00FFu) | ((uint16_t)pb1 << 8));
                pw[2] = (uint16_t)((pw[2] & 0x00FFu) | ((uint16_t)pb2 << 8));
                pw[3] = (uint16_t)((pw[3] & 0x00FFu) | ((uint16_t)pb3 << 8));
            } else {
                /* Odd column: replace the low byte. */
                pw[0] = (uint16_t)((pw[0] & 0xFF00u) | (uint16_t)pb0);
                pw[1] = (uint16_t)((pw[1] & 0xFF00u) | (uint16_t)pb1);
                pw[2] = (uint16_t)((pw[2] & 0xFF00u) | (uint16_t)pb2);
                pw[3] = (uint16_t)((pw[3] & 0xFF00u) | (uint16_t)pb3);
            }
        }
        rowBase += ST_BYTES_PER_ROW;
    }
}
// Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies
// inline. Each pixel's group address differs only in (x), so we
// can compute base+row*160 once per row and just do per-pixel
@ -1916,11 +1622,16 @@ void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t
return;
}
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
* Asm walker does direct planar byte copy (LUT pointer unused). */
* Specialized 16x16 (the UBER ball-sprite size) skips the asm
* walker's per-row col-init + col-loop-check overhead. */
if ((x & 7) == 0 && (w & 7) == 0
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL);
if (w == 16u && h == 16u) {
surface68kStSprite16x16Save(pd->base, (uint16_t)x, (uint16_t)y, dstPlaneBytes);
} else {
surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes);
}
return;
}
@ -1980,11 +1691,15 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
return;
}
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
* Asm walker does direct planar byte copy (LUT pointer unused). */
* Specialized 16x16 (UBER ball-sprite) skips walker overhead. */
if ((x & 7) == 0 && (w & 7) == 0
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL);
if (w == 16u && h == 16u) {
surface68kStSprite16x16Restore(pd->base, (uint16_t)x, (uint16_t)y, srcPlaneBytes);
} else {
surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes);
}
return;
}
@ -2049,10 +1764,11 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
}
// Phase 9: derive 160 chunky bytes per row from the word-interleaved
// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's
// Derive 160 chunky bytes per row from the word-interleaved planar
// buffer (20 groups x 4 plane words). Same shape as the Amiga's
// amigaPlanesToChunkyRow but per-group instead of per-byte. Used by
// halSurfaceHash and halSurfaceSaveFileChunky.
// halSurfaceHash to fold the planar surface into the same byte stream
// the chunky ports hash, so cross-port hash comparisons stay valid.
static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) {
uint16_t group;
uint16_t p;
@ -2134,58 +1850,27 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
}
// Phase 9: read chunky from file into a temporary scratch buffer,
// then c2p once into the planar shadow. The .joeysurface file format
// is still chunky 4bpp on disk (cross-port asset interchange); the
// in-memory representation is what changes.
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
// On-disk format is the ST's native interleaved planar buffer; one
// fread fills it directly, no chunky scratch or c2p step.
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
StPlanarT *pd;
uint8_t *scratch;
int16_t y;
bool ok;
pd = (StPlanarT *)dst->portData;
if (pd == NULL) {
return false;
}
scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
if (scratch == NULL) {
return false;
}
ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
if (ok) {
if (!gC2pLutReady) {
initC2pLut();
}
for (y = 0; y < SURFACE_HEIGHT; y++) {
const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
uint16_t *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW];
chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut);
}
}
free(scratch);
return ok;
return fread(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
}
// Phase 9: derive chunky bytes from the planar shadow row by row,
// stream to file. Avoids needing a full 32 KB scratch buffer.
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
StPlanarT *pd;
uint8_t chunkyRow[SURFACE_BYTES_PER_ROW];
int16_t y;
pd = (StPlanarT *)src->portData;
if (pd == NULL) {
return false;
}
for (y = 0; y < SURFACE_HEIGHT; y++) {
stPlanarToChunkyRow(pd, y, chunkyRow);
if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
return false;
}
}
return true;
return fwrite(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
}

View file

@ -50,19 +50,17 @@
| Trashes: d0, d1, a2
.macro DL_PLOT color
| byteOff = y*160 + (x>>4)*8
| byteOff = y*160 + (x>>4)*8 (fits in 16 bits since
| surface is 32000 bytes < 32K). Skip ext.l + .l add
| + .l indexed lea -- all word-sized ops save 14 cyc/pixel.
move.w %d3,%d0
ext.l %d0
move.l %d0,%d1
lsl.l #5,%d0 | y << 5
lsl.l #7,%d1 | y << 7
add.l %d1,%d0 | d0 = y * 160
add.w %d0,%d0 | y * 2 (word index)
move.w (%a6,%d0.w),%d0 | d0 = y * 160
move.w %d2,%d1
lsr.w #4,%d1
lsl.w #3,%d1 | (x>>4) * 8
ext.l %d1
add.l %d1,%d0 | d0 = byteOff
lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff
add.w %d1,%d0 | d0 = byteOff (fits in 16 bits)
lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff
| d1 = bitMask, d0 = notMask
move.w %d2,%d1
and.w #15,%d1
@ -127,9 +125,11 @@ _surface68kStDrawLine:
movem.l %d2-%d7/%a2-%a6,-(%sp)
lea -SP_LOCAL(%sp),%sp
| Load base & lut.
| Load base & luts.
move.l SP_BASE(%sp),%a3
lea bitMaskWordLut(%pc),%a5
| a6 = yLut base (yp -> yp*160) for use in DL_PLOT.
lea _gStRowOffsetLut(%pc),%a6
| x = x0, y = y0
move.w SP_X0(%sp),%d2
@ -179,8 +179,8 @@ _surface68kStDrawLine:
and.w #0x0F,%d0
add.w %d0,%d0
add.w %d0,%d0 | * 4 for bra.w table
lea .LdlStTable(%pc),%a6
jmp 0(%a6,%d0.w)
lea .LdlStTable(%pc),%a2 | a2 scratch (a6 holds yLut)
jmp 0(%a2,%d0.w)
.LdlStTable:
bra.w .LdlStLoop_0
@ -529,6 +529,129 @@ _surface68kStFillRectSingleGroup:
rts
| ---- surface68kStTileFill8x8 ---------------------------------------
|
| Specialized 8x8 single-group fill: 16-way color dispatch + 8 rows
| fully unrolled. Drops the per-row subq+bne overhead that the
| generic FRG_LOOP pays. Used by halTileFillPlanes.
|
| void surface68kStTileFill8x8(uint8_t *firstGroupPtr,
| uint16_t mask,
| uint8_t color);
|
| Per row body: 4 plane RMW with postinc + lea 152(a3),a3 to next
| row. Row 7 skips the trailing lea (a3 not used after).
|
| Register use: d3 = mask (bits to SET), d4 = ~mask (bits to KEEP
| when clearing), a3 = current group pointer, a2 = dispatch scratch.
| Stack offsets assume each arg occupies a 4-byte slot; the +2 / +3
| adjustments select the low word / low byte of a slot.
.equ SP_TF_SAVED, 16 | d3-d4/a2-a3 = 4 longs
.equ SP_TF_OFF, (SP_TF_SAVED + 4)
.equ SP_TF_PTR, SP_TF_OFF + 0
.equ SP_TF_MASK, SP_TF_OFF + 4 + 2
.equ SP_TF_COLOR, SP_TF_OFF + 8 + 3
| One row, no row-stride advance: per plane word, OR the mask in
| when this color has that plane bit set, otherwise AND with ~mask
| to clear. The .if tests are assemble-time, so each expansion is
| exactly 4 instructions with no runtime branching.
.macro TF8_ROW_BARE color
.if ((\color) & 1)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 2)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 4)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.if ((\color) & 8)
or.w %d3,(%a3)+
.else
and.w %d4,(%a3)+
.endif
.endm
| One row plus advance to the next: the 4 postinc words moved a3
| by 8, so 152 more lands on the same group one row (160 B) down.
.macro TF8_ROW color
TF8_ROW_BARE \color
lea 152(%a3),%a3
.endm
| Full 8-row body for one color value, terminated by a jump to the
| shared epilogue.
.macro TF8_BODY color
.Ltf8_body_\color:
TF8_ROW \color | row 0
TF8_ROW \color | row 1
TF8_ROW \color | row 2
TF8_ROW \color | row 3
TF8_ROW \color | row 4
TF8_ROW \color | row 5
TF8_ROW \color | row 6
TF8_ROW_BARE \color | row 7 (no trailing lea)
bra.w .Ltf8_done
.endm
.globl _surface68kStTileFill8x8
_surface68kStTileFill8x8:
movem.l %d3-%d4/%a2-%a3,-(%sp)
move.l SP_TF_PTR(%sp),%a3
move.w SP_TF_MASK(%sp),%d3
| d4 = ~mask, used to clear masked pixels on planes where the
| color bit is 0.
move.w %d3,%d4
not.w %d4
| Color dispatch
moveq #0,%d0
move.b SP_TF_COLOR(%sp),%d0
and.w #0x0F,%d0
add.w %d0,%d0
add.w %d0,%d0 | * 4 for bra.w table
lea .Ltf8_table(%pc),%a2
jmp 0(%a2,%d0.w)
.Ltf8_table:
bra.w .Ltf8_body_0
bra.w .Ltf8_body_1
bra.w .Ltf8_body_2
bra.w .Ltf8_body_3
bra.w .Ltf8_body_4
bra.w .Ltf8_body_5
bra.w .Ltf8_body_6
bra.w .Ltf8_body_7
bra.w .Ltf8_body_8
bra.w .Ltf8_body_9
bra.w .Ltf8_body_10
bra.w .Ltf8_body_11
bra.w .Ltf8_body_12
bra.w .Ltf8_body_13
bra.w .Ltf8_body_14
bra.w .Ltf8_body_15
TF8_BODY 0
TF8_BODY 1
TF8_BODY 2
TF8_BODY 3
TF8_BODY 4
TF8_BODY 5
TF8_BODY 6
TF8_BODY 7
TF8_BODY 8
TF8_BODY 9
TF8_BODY 10
TF8_BODY 11
TF8_BODY 12
TF8_BODY 13
TF8_BODY 14
TF8_BODY 15
.Ltf8_done:
movem.l (%sp)+,%d3-%d4/%a2-%a3
rts
| ---- surface68kStFillRectMulti -------------------------------------
|
| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
@ -782,6 +905,21 @@ frmRightMaskLut:
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
.align 2
| Shared y -> y*160 LUT. Used by drawLine (DL_PLOT), drawCircle
| (YP_REC), fillCircle (SPAN_BODY). 200 words = 400 bytes.
| Replaces a 44-cyc lsl.w #5 + lsl.w #7 + add.w shift chain with
| a 14-cyc indexed-word load. Exported so circle.s and fillCircle.s
| can reference it via absolute addressing without duplication.
| Word entries suffice: the largest offset is 199 * 160 = 31840,
| which still fits a signed 16-bit index register displacement.
.globl _gStRowOffsetLut
_gStRowOffsetLut:
.set li_y, 0
.rept 200
.word li_y * 160
.set li_y, li_y + 1
.endr
| ---- surface68kStLongFill ----------------------------------------
|
| Bulk long-fill helper for full-row fills (surfaceClear, fillRect

View file

@ -1,30 +1,19 @@
| ST byte-aligned sprite save / restore via 256-entry plane-spread
| LUT. The LUT entry for each plane byte value is a 32-bit "spread"
| where each plane byte bit lands at the corresponding plane-0 bit
| position of the 4-byte chunky output. For plane N, we shift the
| LUT entry left by N to put bits at the plane-N positions, then OR
| the 4 plane contributions together to get the chunky long.
|
| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut
| in hal.c:
|
| gStPlaneSpreadLut[b] for plane byte b:
| bit i of b (i = 0 = MSB = leftmost pixel) maps to bit
| bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4)
| of the long. Plane 0's bits land at nibble bit 0 of each
| chunky byte; left-shift the LUT entry by N for plane N.
| ST byte-aligned sprite save / restore. Buffer holds plane-major
| bytes: per row, plane0/1/2/3 per tile col, for w/8 tile cols. The
| inner per-tile-col macro is 4 byte copies (no chunky <-> planar
| conversion since the buffer matches the surface's plane layout).
|
| ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
|
| void surface68kStSpriteSaveByteAligned(uint8_t *base,
| uint16_t x, uint16_t y,
| uint16_t w, uint16_t h,
| uint8_t *dstChunky);
| uint8_t *dstPlaneBytes);
|
| void surface68kStSpriteRestoreByteAligned(uint8_t *base,
| uint16_t x, uint16_t y,
| uint16_t w, uint16_t h,
| const uint8_t *srcChunky);
| const uint8_t *srcPlaneBytes);
.text
@ -36,19 +25,12 @@
.equ SP_Y, SP_OFF + 8 + 2
.equ SP_W, SP_OFF + 12 + 2
.equ SP_H, SP_OFF + 16 + 2
.equ SP_CHUNKY, SP_OFF + 20
.equ SP_LUT, SP_OFF + 24
.equ SP_BUF, SP_OFF + 20
| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
| a0 -> plane 0 byte (high or low half), strides 2 to next plane
| a1 -> output planar bytes (advanced by 4)
| a2 -> unused (LUT no longer needed)
|
| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
| plane-major bytes (per row: plane0, plane1, plane2, plane3 per
| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
| lookups + shifts + ORs.
.macro SAVE_TILECOL
move.b (%a0),(%a1)+ | plane 0
@ -64,13 +46,7 @@ _surface68kStSpriteSaveByteAligned:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_BASE(%sp),%a3
move.l SP_CHUNKY(%sp),%a1
| LUT pointer comes in via stack arg -- guaranteed
| long-aligned because gcc passes ptr args via
| move.l on a long-aligned sp slot. Avoids the BSS
| misalignment problem on TOS .PRG (BSS pads only to
| 2 bytes, even uint32_t slots can land at mod-4 = 2).
move.l SP_LUT(%sp),%a2
move.l SP_BUF(%sp),%a1
move.w SP_W(%sp),%d5
lsr.w #3,%d5 | d5 = tileCols
@ -128,10 +104,6 @@ _surface68kStSpriteSaveByteAligned:
| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
| a0 -> plane 0 byte (high or low half)
| a1 -> input planar bytes (advanced by 4)
| a2 -> unused (LUT no longer needed)
|
| Phase 10.5: dropped chunky -> planar conversion. Buffer layout
| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col.
.macro RESTORE_TILECOL
move.b (%a1)+,(%a0) | plane 0
@ -147,8 +119,7 @@ _surface68kStSpriteRestoreByteAligned:
movem.l %d2-%d7/%a2-%a6,-(%sp)
move.l SP_BASE(%sp),%a3
move.l SP_CHUNKY(%sp),%a1
move.l SP_LUT(%sp),%a2 | gC2pLut passed in
move.l SP_BUF(%sp),%a1
| tileCols is held in a5 (not d5) because the macro
| trashes d5 (uses it for pb3).
@ -200,3 +171,151 @@ _surface68kStSpriteRestoreByteAligned:
movem.l (%sp)+,%d2-%d7/%a2-%a6
rts
| ---- surface68kStSprite16x16Save / Restore -----------------------
|
| Specialized 16x16 sprite save/restore: 16 rows fully unrolled,
| 8 byte copies per row (2 tile cols), no col loop. Drops the asm
| walker's per-row col-init + col-loop-check overhead.
|
| void surface68kStSprite16x16Save(uint8_t *base,
| uint16_t x, uint16_t y,
| uint8_t *dstBuf);
|
| void surface68kStSprite16x16Restore(uint8_t *base,
| uint16_t x, uint16_t y,
| const uint8_t *srcBuf);
|
| Caller guarantees x is byte-aligned (x mod 8 == 0). Two halfOff
| variants dispatch on (x & 8): halfOff=0 reads/writes within one
| group (offsets 0/2/4/6 high half + 1/3/5/7 low half). halfOff=1
| spans two groups (low half of group N + high half of group N+1).
|
| Stack offsets assume each argument occupies a 4-byte slot; the
| +2 adjustments select the low word of a slot.
.equ SP16_SAVED, 12 | d2/a2-a3 = 3 longs
.equ SP16_OFF, (SP16_SAVED + 4)
.equ SP16_BASE, SP16_OFF + 0
.equ SP16_X, SP16_OFF + 4 + 2
.equ SP16_Y, SP16_OFF + 8 + 2
.equ SP16_BUF, SP16_OFF + 12
| Macro: setup a0 = base + y*160 + group*8 + halfOff
| Trashes: d0, d1, d2; a0 left at row start
| NOTE(review): also writes a3 (holds base); a3 is callee-saved by
| the surrounding movem so callers of the C entry points are safe.
.macro SP16_SETUP_A0
move.l SP16_BASE(%sp),%a3
move.w SP16_X(%sp),%d0
move.w SP16_Y(%sp),%d1
| a0 = base + y*160, via y*160 = (y << 5) + (y << 7)
ext.l %d1
move.l %d1,%d2
lsl.l #5,%d1
lsl.l #7,%d2
add.l %d2,%d1
lea 0(%a3,%d1.l),%a0
| a0 += (x>>4) * 8
move.w %d0,%d1
lsr.w #4,%d1
lsl.w #3,%d1
ext.l %d1
add.l %d1,%a0
| a0 += halfOff (= (x & 8) >> 3)
and.w #8,%d0
lsr.w #3,%d0
ext.l %d0
add.l %d0,%a0
| d0 = halfOff (0 or 1) for downstream dispatch
.endm
.globl _surface68kStSprite16x16Save
_surface68kStSprite16x16Save:
movem.l %d2/%a2-%a3,-(%sp)
SP16_SETUP_A0
move.l SP16_BUF(%sp),%a1
tst.w %d0
bne.w .Lsp16s_low
| halfOff=0: a0 at high half. Col 0 = high (offsets
| 0,2,4,6); col 1 = low (offsets 1,3,5,7).
.rept 16
move.b (%a0),(%a1)+
move.b 2(%a0),(%a1)+
move.b 4(%a0),(%a1)+
move.b 6(%a0),(%a1)+
move.b 1(%a0),(%a1)+
move.b 3(%a0),(%a1)+
move.b 5(%a0),(%a1)+
move.b 7(%a0),(%a1)+
lea 160(%a0),%a0
.endr
bra.w .Lsp16s_done
.Lsp16s_low:
| halfOff=1: a0 at low half (group+1). Col 0 = low of
| this group, offsets 0,2,4,6 from a0. Col 1 = high of
| next group, at offsets 7,9,11,13 from a0.
.rept 16
move.b (%a0),(%a1)+
move.b 2(%a0),(%a1)+
move.b 4(%a0),(%a1)+
move.b 6(%a0),(%a1)+
move.b 7(%a0),(%a1)+
move.b 9(%a0),(%a1)+
move.b 11(%a0),(%a1)+
move.b 13(%a0),(%a1)+
lea 160(%a0),%a0
.endr
.Lsp16s_done:
movem.l (%sp)+,%d2/%a2-%a3
rts
.globl _surface68kStSprite16x16Restore
_surface68kStSprite16x16Restore:
movem.l %d2/%a2-%a3,-(%sp)
SP16_SETUP_A0
move.l SP16_BUF(%sp),%a1
tst.w %d0
bne.w .Lsp16r_low
| halfOff=0: write high half (col 0) + low half (col 1).
| Mirrors the save loop above with src/dst operands swapped.
.rept 16
move.b (%a1)+,(%a0)
move.b (%a1)+,2(%a0)
move.b (%a1)+,4(%a0)
move.b (%a1)+,6(%a0)
move.b (%a1)+,1(%a0)
move.b (%a1)+,3(%a0)
move.b (%a1)+,5(%a0)
move.b (%a1)+,7(%a0)
lea 160(%a0),%a0
.endr
bra.w .Lsp16r_done
.Lsp16r_low:
| halfOff=1
.rept 16
move.b (%a1)+,(%a0)
move.b (%a1)+,2(%a0)
move.b (%a1)+,4(%a0)
move.b (%a1)+,6(%a0)
move.b (%a1)+,7(%a0)
move.b (%a1)+,9(%a0)
move.b (%a1)+,11(%a0)
move.b (%a1)+,13(%a0)
lea 160(%a0),%a0
.endr
.Lsp16r_done:
movem.l (%sp)+,%d2/%a2-%a3
rts

View file

@ -614,12 +614,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
}
// Chunky-native port: surface pixels are already stored in the file
// layout, so a load is a single raw read of the 4bpp pixel block.
// Returns true only when the full SURFACE_PIXELS_SIZE bytes arrive.
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
}
// Chunky-native port: write the 4bpp pixel block verbatim.
// Returns true only when the full SURFACE_PIXELS_SIZE bytes are
// written; caller owns fp.
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
}

View file

@ -395,12 +395,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
}
// Chunky-native port: surface pixels are already stored in the file
// layout, so a load is a single raw read of the 4bpp pixel block.
// Returns true only when the full SURFACE_PIXELS_SIZE bytes arrive.
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
    return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
}
// Chunky-native port: write the 4bpp pixel block verbatim.
// Returns true only when the full SURFACE_PIXELS_SIZE bytes are
// written; caller owns fp.
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
    return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
}