ST is more or less parity.
This commit is contained in:
parent
818dc801db
commit
cf6ae093d3
15 changed files with 966 additions and 1062 deletions
326
README.md
326
README.md
|
|
@ -59,6 +59,332 @@ build/<plat>/ per-target build outputs
|
|||
```
|
||||
|
||||
|
||||
## Public API
|
||||
|
||||
Game code includes a single umbrella header:
|
||||
|
||||
```c
|
||||
#include <joey/joey.h>
|
||||
```
|
||||
|
||||
That pulls in every public surface listed below. Full documentation
|
||||
lives in the per-feature headers under `include/joey/`; what follows
|
||||
is a quick reference. Every entry point is plain C, no C++ extensions.
|
||||
|
||||
|
||||
### Lifecycle (`joey/core.h`)
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
HostModeE hostMode; // HOST_MODE_TAKEOVER or HOST_MODE_OS
|
||||
uint32_t codegenBytes; // runtime compiled-sprite cache size
|
||||
uint16_t maxSurfaces; // maximum concurrent surfaces
|
||||
uint32_t audioBytes; // audio sample / module RAM pool
|
||||
uint32_t assetBytes; // tileset / sprite / map RAM pool
|
||||
} JoeyConfigT;
|
||||
|
||||
bool joeyInit (const JoeyConfigT *config);
|
||||
void joeyShutdown (void);
|
||||
const char *joeyLastError (void);
|
||||
const char *joeyPlatformName (void);
|
||||
const char *joeyVersionString(void);
|
||||
|
||||
void joeyWaitVBL (void); // block until next VBL
|
||||
uint16_t joeyFrameCount (void); // monotonic 16-bit frame counter
|
||||
uint16_t joeyFrameHz (void); // 50 / 60 / 70 depending on port
|
||||
```
|
||||
|
||||
|
||||
### Surfaces (`joey/surface.h`)
|
||||
|
||||
All surfaces are 320x200 4bpp packed (high nibble = left pixel) with
|
||||
a 200-entry SCB table and 16 palettes of 16 `$0RGB` colors.
|
||||
|
||||
```c
|
||||
#define SURFACE_WIDTH 320
|
||||
#define SURFACE_HEIGHT 200
|
||||
#define SURFACE_BYTES_PER_ROW 160
|
||||
#define SURFACE_PIXELS_SIZE (SURFACE_BYTES_PER_ROW * SURFACE_HEIGHT)
|
||||
#define SURFACE_PALETTE_COUNT 16
|
||||
#define SURFACE_COLORS_PER_PALETTE 16
|
||||
|
||||
typedef struct SurfaceT SurfaceT; // opaque
|
||||
|
||||
SurfaceT *surfaceCreate (void);
|
||||
void surfaceDestroy(SurfaceT *s);
|
||||
SurfaceT *stageGet (void); // library back-buffer
|
||||
void surfaceCopy (SurfaceT *dst, const SurfaceT *src);
|
||||
|
||||
bool surfaceSaveFile(const SurfaceT *src, const char *path);
|
||||
bool surfaceLoadFile(SurfaceT *dst, const char *path);
|
||||
uint32_t surfaceHash (const SurfaceT *s); // FNV-1a of logical pixels
|
||||
```
|
||||
|
||||
`surfaceSaveFile` writes the surface in **target-native** form. Files
|
||||
are NOT cross-port portable; the asset pipeline handles conversion.
|
||||
|
||||
|
||||
### Drawing (`joey/draw.h`)
|
||||
|
||||
All primitives clip to the surface; off-surface coords are silent
|
||||
no-ops. Color 0 is plotted normally (use the masked variants if you
|
||||
need transparency).
|
||||
|
||||
```c
|
||||
void surfaceClear (SurfaceT *s, uint8_t color);
|
||||
void drawPixel (SurfaceT *s, int16_t x, int16_t y, uint8_t color);
|
||||
uint8_t samplePixel (const SurfaceT *s, int16_t x, int16_t y);
|
||||
|
||||
void drawLine (SurfaceT *s, int16_t x0, int16_t y0,
|
||||
int16_t x1, int16_t y1, uint8_t color);
|
||||
void drawRect (SurfaceT *s, int16_t x, int16_t y,
|
||||
uint16_t w, uint16_t h, uint8_t color);
|
||||
void fillRect (SurfaceT *s, int16_t x, int16_t y,
|
||||
uint16_t w, uint16_t h, uint8_t color);
|
||||
void drawCircle (SurfaceT *s, int16_t cx, int16_t cy,
|
||||
uint16_t r, uint8_t color);
|
||||
void fillCircle (SurfaceT *s, int16_t cx, int16_t cy,
|
||||
uint16_t r, uint8_t color);
|
||||
|
||||
void floodFill (SurfaceT *s, int16_t x, int16_t y, uint8_t newColor);
|
||||
void floodFillBounded (SurfaceT *s, int16_t x, int16_t y,
|
||||
uint8_t newColor, uint8_t boundaryColor);
|
||||
|
||||
void surfaceBlit (SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y);
|
||||
void surfaceBlitMasked (SurfaceT *dst, const JoeyAssetT *src,
|
||||
int16_t x, int16_t y, uint8_t transparentIndex);
|
||||
```
|
||||
|
||||
|
||||
### Palette and SCB (`joey/palette.h`)
|
||||
|
||||
Colors are 12-bit `$0RGB`. Color 0 of every palette is forced to
|
||||
black on `paletteSet`. Each scanline picks one of the 16 palettes
|
||||
via the SCB.
|
||||
|
||||
```c
|
||||
void paletteSet (SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16);
|
||||
void paletteGet (const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16);
|
||||
void scbSet (SurfaceT *s, uint16_t line, uint8_t paletteIndex);
|
||||
void scbSetRange (SurfaceT *s, uint16_t firstLine, uint16_t lastLine,
|
||||
uint8_t paletteIndex);
|
||||
uint8_t scbGet (const SurfaceT *s, uint16_t line);
|
||||
```
|
||||
|
||||
|
||||
### Tiles (`joey/tile.h`)
|
||||
|
||||
A "tile" is just an 8x8-aligned region of any surface. The API moves
|
||||
32-byte chunks between surfaces and provides a small `TileT` value
|
||||
type so callers can stash a copy without allocating a scratch surface.
|
||||
|
||||
```c
|
||||
#define TILE_PIXELS_PER_SIDE 8
|
||||
#define TILE_BYTES_PER_ROW 4
|
||||
#define TILE_BYTES (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE)
|
||||
#define TILE_BLOCKS_PER_ROW (SURFACE_WIDTH / TILE_PIXELS_PER_SIDE) // 40
|
||||
#define TILE_BLOCKS_PER_COL (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE) // 25
|
||||
#define TILE_NO_GLYPH ((uint16_t)0xFFFFu)
|
||||
|
||||
typedef struct TileT { uint8_t pixels[TILE_BYTES]; } TileT;
|
||||
|
||||
void tileCopy (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
|
||||
const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
|
||||
void tileCopyMasked (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
|
||||
const SurfaceT *src, uint8_t srcBx, uint8_t srcBy,
|
||||
uint8_t transparentIndex);
|
||||
void tileFill (SurfaceT *s, uint8_t bx, uint8_t by, uint8_t color);
|
||||
void tileSnap (const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out);
|
||||
void tilePaste (SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in);
|
||||
|
||||
void drawText (SurfaceT *dst, uint8_t bx, uint8_t by,
|
||||
const SurfaceT *fontSurface, const uint16_t *asciiMap,
|
||||
const char *str);
|
||||
```
|
||||
|
||||
|
||||
### Sprites (`joey/sprite.h`)
|
||||
|
||||
Rectangles of 8x8 tiles drawn at arbitrary pixel positions with
|
||||
color-0 transparency. Tile data is `widthTiles * heightTiles * 32`
|
||||
bytes, tile-major 4bpp packed. Sprites can be runtime-compiled
|
||||
into per-shift code variants for fast draws.
|
||||
|
||||
```c
|
||||
typedef enum { SPRITE_FLAGS_NONE = 0 } SpriteFlagsE;
|
||||
typedef struct SpriteT SpriteT; // opaque
|
||||
|
||||
typedef struct {
|
||||
SpriteT *sprite;
|
||||
int16_t x, y;
|
||||
uint16_t width, height; // pixels
|
||||
uint8_t *bytes; // caller-owned save-under buffer
|
||||
uint16_t sizeBytes;
|
||||
} SpriteBackupT;
|
||||
|
||||
SpriteT *spriteCreate (const uint8_t *tileData,
|
||||
uint8_t widthTiles, uint8_t heightTiles,
|
||||
SpriteFlagsE flags);
|
||||
SpriteT *spriteCreateFromSurface (const SurfaceT *src, int16_t x, int16_t y,
|
||||
uint8_t widthTiles, uint8_t heightTiles,
|
||||
SpriteFlagsE flags);
|
||||
SpriteT *spriteLoadFile (const char *path, SpriteFlagsE flags);
|
||||
SpriteT *spriteFromCompiledMem (const uint8_t *data, uint32_t length,
|
||||
SpriteFlagsE flags);
|
||||
bool spriteSaveFile (SpriteT *sp, const char *path);
|
||||
void spriteDestroy (SpriteT *sp);
|
||||
|
||||
bool spriteCompile (SpriteT *sp); // build per-shift fast path
|
||||
void spritePrewarm (SpriteT *sp); // hint: compile if not already
|
||||
|
||||
void spriteDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y);
|
||||
void spriteSaveUnder (const SurfaceT *s, SpriteT *sp,
|
||||
int16_t x, int16_t y, SpriteBackupT *backup);
|
||||
void spriteRestoreUnder (SurfaceT *s, const SpriteBackupT *backup);
|
||||
void spriteSaveAndDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y,
|
||||
SpriteBackupT *backup);
|
||||
|
||||
void spriteCompact (void); // defrag the codegen arena
|
||||
uint32_t spriteCodegenBytesUsed (void);
|
||||
uint32_t spriteCodegenBytesTotal (void);
|
||||
```
|
||||
|
||||
|
||||
### Assets (`joey/asset.h`)
|
||||
|
||||
Small bitmap blits with optional embedded palette, in `.jas` format.
|
||||
Use embedded `const JoeyAssetT` for ship-with-binary art; use the
|
||||
loaders for on-disk assets.
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
uint16_t width;
|
||||
uint16_t height;
|
||||
bool hasPalette;
|
||||
uint16_t palette[16]; // valid only if hasPalette
|
||||
const uint8_t *pixels; // 4bpp packed, rowBytes = (width+1)/2
|
||||
} JoeyAssetT;
|
||||
|
||||
JoeyAssetT *joeyAssetLoadFile (const char *path);
|
||||
JoeyAssetT *joeyAssetFromMem (const uint8_t *data, uint32_t length);
|
||||
void joeyAssetFree (JoeyAssetT *asset);
|
||||
void joeyAssetApplyPalette (SurfaceT *dst, uint8_t paletteIndex,
|
||||
const JoeyAssetT *asset);
|
||||
```
|
||||
|
||||
|
||||
### Present (`joey/present.h`)
|
||||
|
||||
```c
|
||||
void stagePresent(void);
|
||||
```
|
||||
|
||||
Flips the dirty rows of the stage to the display, then clears dirty
|
||||
state. Drawing primitives mark dirty as a side effect, so calling
|
||||
`stagePresent` once at end-of-frame is enough.
|
||||
|
||||
|
||||
### Input (`joey/input.h`)
|
||||
|
||||
Call `joeyInputPoll` once per frame, then query the state predicates.
|
||||
Edge predicates (`*Pressed`, `*Released`) fire only in the frame the
|
||||
transition happened.
|
||||
|
||||
```c
|
||||
typedef enum { /* KEY_NONE, KEY_A..KEY_Z, KEY_0..KEY_9, KEY_SPACE,
|
||||
KEY_ESCAPE, KEY_RETURN, KEY_TAB, KEY_BACKSPACE,
|
||||
KEY_UP/DOWN/LEFT/RIGHT, KEY_LSHIFT/RSHIFT/LCTRL/LALT,
|
||||
KEY_F1..KEY_F10, KEY_COUNT */ } JoeyKeyE;
|
||||
typedef enum { MOUSE_BUTTON_NONE, MOUSE_BUTTON_LEFT, MOUSE_BUTTON_RIGHT,
|
||||
MOUSE_BUTTON_MIDDLE, MOUSE_BUTTON_COUNT } JoeyMouseButtonE;
|
||||
typedef enum { JOYSTICK_0, JOYSTICK_1, JOYSTICK_COUNT } JoeyJoystickE;
|
||||
typedef enum { JOY_BUTTON_0, JOY_BUTTON_1, JOY_BUTTON_COUNT } JoeyJoyButtonE;
|
||||
|
||||
#define JOYSTICK_AXIS_MAX 127
|
||||
#define JOYSTICK_AXIS_MIN (-127)
|
||||
|
||||
void joeyInputPoll (void);
|
||||
void joeyWaitForAnyKey (void);
|
||||
|
||||
bool joeyKeyDown (JoeyKeyE key);
|
||||
bool joeyKeyPressed (JoeyKeyE key);
|
||||
bool joeyKeyReleased (JoeyKeyE key);
|
||||
|
||||
int16_t joeyMouseX (void);
|
||||
int16_t joeyMouseY (void);
|
||||
bool joeyMouseDown (JoeyMouseButtonE b);
|
||||
bool joeyMousePressed (JoeyMouseButtonE b);
|
||||
bool joeyMouseReleased (JoeyMouseButtonE b);
|
||||
|
||||
bool joeyJoystickConnected(JoeyJoystickE js);
|
||||
int8_t joeyJoystickX (JoeyJoystickE js);
|
||||
int8_t joeyJoystickY (JoeyJoystickE js);
|
||||
bool joeyJoyDown (JoeyJoystickE js, JoeyJoyButtonE b);
|
||||
bool joeyJoyPressed (JoeyJoystickE js, JoeyJoyButtonE b);
|
||||
bool joeyJoyReleased (JoeyJoystickE js, JoeyJoyButtonE b);
|
||||
void joeyJoystickReset (JoeyJoystickE js, uint8_t deadZone);
|
||||
```
|
||||
|
||||
|
||||
### Audio (`joey/audio.h`)
|
||||
|
||||
4-channel Protracker-style music plus four one-shot SFX slots. Module
|
||||
data must be the platform-native form produced by `tools/joeymod`
|
||||
(`.mod` for Amiga/DOS/ST; `.ntp` for IIgs; `.amod` if you want
|
||||
loop=false on Amiga). A failed `joeyAudioInit` is non-fatal; the rest
|
||||
of the API stays callable as no-ops.
|
||||
|
||||
```c
|
||||
#define JOEY_AUDIO_SFX_SLOTS 4
|
||||
|
||||
bool joeyAudioInit (void);
|
||||
void joeyAudioShutdown (void);
|
||||
|
||||
void joeyAudioPlayMod (const uint8_t *data, uint32_t length, bool loop);
|
||||
void joeyAudioStopMod (void);
|
||||
bool joeyAudioIsPlayingMod (void);
|
||||
|
||||
void joeyAudioPlaySfx (uint8_t slot, const uint8_t *sample,
|
||||
uint32_t length, uint16_t rateHz);
|
||||
void joeyAudioStopSfx (uint8_t slot);
|
||||
|
||||
void joeyAudioFrameTick (void);
|
||||
```
|
||||
|
||||
|
||||
### Debug logging (`joey/debug.h`)
|
||||
|
||||
Crash-tracing logger. Writes are buffered and durable across normal
|
||||
exit; call `joeyLogFlush` ahead of suspected hang points if you want
|
||||
a guaranteed last-line-on-disk.
|
||||
|
||||
```c
|
||||
void joeyLog (const char *msg);
|
||||
void joeyLogF (const char *fmt, ...);
|
||||
void joeyLogFlush(void);
|
||||
void joeyLogReset(void);
|
||||
```
|
||||
|
||||
Output goes to `joeylog.txt` in the program's working directory.
|
||||
|
||||
|
||||
### Platform macros (`joey/platform.h`)
|
||||
|
||||
The build system normally sets the platform via `-D`; auto-detection
|
||||
from compiler-predefined macros is a fallback. Game code can
|
||||
conditionally compile on these:
|
||||
|
||||
```
|
||||
JOEYLIB_PLATFORM_IIGS / _AMIGA / _ATARIST / _DOS // exactly one defined
|
||||
JOEYLIB_CPU_65816 / _68000 / _X86
|
||||
JOEYLIB_ENDIAN_LITTLE / _BIG
|
||||
JOEYLIB_NATIVE_CHUNKY / _NATIVE_PLANAR
|
||||
JOEYLIB_HAS_BLITTER / _HAS_COPPER // Amiga only
|
||||
JOEYLIB_PLATFORM_NAME // human-readable string
|
||||
JOEYLIB_VERSION_MAJOR / _MINOR / _PATCH / _STRING
|
||||
```
|
||||
|
||||
|
||||
## License
|
||||
|
||||
TBD.
|
||||
|
|
|
|||
28
scripts/dosbox-386sx16.conf
Normal file
28
scripts/dosbox-386sx16.conf
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# DOSBox config: simulate an Intel 386SX-16 (1988), the slowest 386
|
||||
# desktop CPU JoeyLib could realistically be run on. Use this floor
|
||||
# to verify the DOS port still hits its frame budget on the bottom of
|
||||
# the 386 stack rather than coasting on host CPU.
|
||||
#
|
||||
# The 386SX is identical to the 386DX in instruction set; the only
|
||||
# difference is the 16-bit external bus (vs 32-bit on DX), which slows
|
||||
# memory-bound code. DOSBox does not model the bus split directly --
|
||||
# the cycles count below approximates the combined 386SX-16 throughput.
|
||||
#
|
||||
# Notes:
|
||||
# core = normal accurate per-instruction cycles, not
|
||||
# recompiled-to-host (auto / dynamic would
|
||||
# defeat slow-CPU simulation).
|
||||
# cputype = 386 386 instruction set (no 486 BSWAP /
|
||||
# CMPXCHG, no Pentium MMX).
|
||||
# cycles = fixed 2200 community-standard approximation for
|
||||
# 386SX-16 throughput in DOSBox.
|
||||
# DOSBox-Staging deprecates this in favor
|
||||
# of cpu_cycles, but still accepts it.
|
||||
# Vanilla DOSBox and DOSBox-X only know
|
||||
# the old key, so 'cycles' stays for
|
||||
# cross-fork portability.
|
||||
|
||||
[cpu]
|
||||
core = normal
|
||||
cputype = 386
|
||||
cycles = fixed 2200
|
||||
|
|
@ -18,6 +18,7 @@ fi
|
|||
prog=${1:-pattern}
|
||||
repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
|
||||
bin_dir=$repo/build/dos/bin
|
||||
conf=$repo/scripts/dosbox-386sx16.conf
|
||||
file=${prog^^}.EXE
|
||||
|
||||
if [[ ! -f "$bin_dir/$file" ]]; then
|
||||
|
|
@ -34,7 +35,12 @@ fi
|
|||
# default capture-on-click behavior fights the VM's grab and mouse
|
||||
# input is unusable. On plain DOSBox this -set flag is unknown and is
|
||||
# logged once as a warning, then ignored -- harmless either way.
|
||||
#
|
||||
# -conf $conf locks the CPU to a simulated 386SX-16 (the slowest
|
||||
# realistic 386 desktop). DOSBox layers configs: anything not set in
|
||||
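# our file falls back to the user's main dosbox.conf. (That holds on
# DOSBox-Staging; vanilla DOSBox reads the user config alongside -conf
# only when -userconf is also passed, otherwise built-in defaults apply.)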
|
||||
exec dosbox \
|
||||
-conf "$conf" \
|
||||
-set "mouse_capture=seamless" \
|
||||
-c "C:" \
|
||||
-c "$file" \
|
||||
|
|
|
|||
|
|
@ -140,15 +140,16 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
|
|||
// s->pixels src->dst; on planar ports there is no chunky to copy
|
||||
// (planes already covered by halSurfaceCopyPlanes). Chunky ports
|
||||
// do the memcpy here; Amiga is a no-op.
|
||||
// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
|
||||
// fwrite of the pixel data. Chunky ports stream directly to/from
|
||||
// s->pixels; Amiga uses a scratch buffer + c2p (load) or
|
||||
// plane->chunky derivation (save).
|
||||
// halSurfaceLoadFile / halSurfaceSaveFile wrap fread / fwrite of the
|
||||
// pixel data using each port's native pixel format (chunky on
|
||||
// IIgs/DOS, interleaved planar on ST, plane-major on Amiga). Files
|
||||
// written by one port are NOT loadable by another -- conversion is
|
||||
// the asset pipeline's job.
|
||||
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
|
||||
uint32_t halSurfaceHash(const SurfaceT *s);
|
||||
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
|
||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
|
||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
|
||||
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp);
|
||||
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp);
|
||||
|
||||
// Present the dirty regions of the source surface to the display.
|
||||
// The cross-platform stagePresent walks the dirty arrays before
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
|
|||
fclose(fp);
|
||||
return false;
|
||||
}
|
||||
if (!halSurfaceLoadFileChunky(dst, fp)) {
|
||||
if (!halSurfaceLoadFile(dst, fp)) {
|
||||
fclose(fp);
|
||||
return false;
|
||||
}
|
||||
|
|
@ -186,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
|
|||
if (fp == NULL) {
|
||||
return false;
|
||||
}
|
||||
if (!halSurfaceSaveFileChunky(src, fp)) {
|
||||
if (!halSurfaceSaveFile(src, fp)) {
|
||||
fclose(fp);
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,127 +0,0 @@
|
|||
| Amiga chunky-to-planar conversion -- 68000 hand-rolled.
|
||||
|
|
||||
| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
|
||||
| 4 KB lookup table built once at HAL init: each (sourceByte, position,
|
||||
| plane) tuple maps to the plane-byte bit contribution that source
|
||||
| byte makes when it sits at that position within a 4-byte (8-pixel)
|
||||
| planar group going to that plane.
|
||||
|
|
||||
| Calling convention: m68k-amigaos-gcc cdecl.
|
||||
| Args on stack at 4(sp), 8(sp), ...
|
||||
| d2-d7, a2-a6 are callee-save.
|
||||
| No return value.
|
||||
|
|
||||
| void chunkyToPlanarRow(const uint8_t *src, ; 4(sp) - 4bpp packed source row
|
||||
| uint8_t *p0, ; 8(sp) - plane 0 dest row
|
||||
| uint8_t *p1, ; 12(sp) - plane 1 dest row
|
||||
| uint8_t *p2, ; 16(sp) - plane 2 dest row
|
||||
| uint8_t *p3, ; 20(sp) - plane 3 dest row
|
||||
| uint16_t n, ; 24(sp) - planar byte count (low word)
|
||||
| const uint8_t *lut); ; 28(sp) - 4 KB LUT base
|
||||
|
|
||||
| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
|
||||
| for source byte `src` sitting at byte-position `pos` (0..3) within
|
||||
| its 4-byte planar group, going to plane `plane` (0..3). All 16
|
||||
| (pos, plane) entries for one src byte are contiguous, so the inner
|
||||
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
|
||||
| (0..15) and never has to advance an index register.
|
||||
|
|
||||
| Per planar byte we consume 4 source bytes (positions 0..3 of the
|
||||
| 8-pixel group). For each we compute d4 = src*16 with four add.w's
|
||||
| (4 cycles each, 16 total on 68000; a single asl.w #4 would be 6+2*4 = 14) and OR the four plane contributions
|
||||
| into d0..d3 with byte-displaced (a5,d4.w) reads.
|
||||
|
|
||||
| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
|
||||
| gcc driver.
|
||||
|
||||
.text
|
||||
.globl _chunkyToPlanarRow
|
||||
|
||||
| Stack frame size of MOVEM.L block: d2-d7 (6) + a2-a6 (5) = 11 regs
|
||||
| * 4 bytes = 44 bytes. Args therefore start at the original sp+4
|
||||
| offset PLUS 44.
|
||||
.equ SAVED_REGS_SIZE, 44
|
||||
|
||||
|
||||
_chunkyToPlanarRow: | entry: converts n*4 packed source bytes into n bytes per plane
|
||||
movem.l %d2-%d7/%a2-%a6,-(%sp) | save callee-saved regs (SAVED_REGS_SIZE bytes)
|
||||
|
||||
move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src
|
||||
move.l 8+SAVED_REGS_SIZE(%sp),%a1 | p0
|
||||
move.l 12+SAVED_REGS_SIZE(%sp),%a2 | p1
|
||||
move.l 16+SAVED_REGS_SIZE(%sp),%a3 | p2
|
||||
move.l 20+SAVED_REGS_SIZE(%sp),%a4 | p3
|
||||
| n is a uint16_t but GCC promotes to int and pushes a
|
||||
| full 4 bytes -- the low word lives at +2 in big-endian
|
||||
| layout.
|
||||
move.w 24+SAVED_REGS_SIZE+2(%sp),%d7 | planar byte count
|
||||
move.l 28+SAVED_REGS_SIZE(%sp),%a5 | LUT base
|
||||
|
||||
subq.w #1,%d7 | DBRA: count-1
|
||||
bmi .Ldone | nothing to do
|
||||
|
||||
.LbyteLoop: | one pass = 4 source bytes in, 1 byte out per plane
|
||||
moveq #0,%d0 | plane 0 acc
|
||||
moveq #0,%d1 | plane 1 acc
|
||||
moveq #0,%d2 | plane 2 acc
|
||||
moveq #0,%d3 | plane 3 acc
|
||||
|
||||
| ----- Source byte position 0 -----
|
||||
moveq #0,%d4 | clear d4 so the byte load below ends up zero-extended
|
||||
move.b (%a0)+,%d4 | src[0]
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4 | d4 = src * 16
|
||||
or.b 0(%a5,%d4.w),%d0 | pos0 plane0
|
||||
or.b 1(%a5,%d4.w),%d1 | pos0 plane1
|
||||
or.b 2(%a5,%d4.w),%d2 | pos0 plane2
|
||||
or.b 3(%a5,%d4.w),%d3 | pos0 plane3
|
||||
|
||||
| ----- Source byte position 1 -----
|
||||
moveq #0,%d4 | clear d4 so the byte load below ends up zero-extended
|
||||
move.b (%a0)+,%d4 | src[1]
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4 | d4 = src * 16
|
||||
or.b 4(%a5,%d4.w),%d0 | pos1 plane0
|
||||
or.b 5(%a5,%d4.w),%d1 | pos1 plane1
|
||||
or.b 6(%a5,%d4.w),%d2 | pos1 plane2
|
||||
or.b 7(%a5,%d4.w),%d3 | pos1 plane3
|
||||
|
||||
| ----- Source byte position 2 -----
|
||||
moveq #0,%d4 | clear d4 so the byte load below ends up zero-extended
|
||||
move.b (%a0)+,%d4 | src[2]
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4 | d4 = src * 16
|
||||
or.b 8(%a5,%d4.w),%d0 | pos2 plane0
|
||||
or.b 9(%a5,%d4.w),%d1 | pos2 plane1
|
||||
or.b 10(%a5,%d4.w),%d2 | pos2 plane2
|
||||
or.b 11(%a5,%d4.w),%d3 | pos2 plane3
|
||||
|
||||
| ----- Source byte position 3 -----
|
||||
moveq #0,%d4 | clear d4 so the byte load below ends up zero-extended
|
||||
move.b (%a0)+,%d4 | src[3]
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4
|
||||
add.w %d4,%d4 | d4 = src * 16
|
||||
or.b 12(%a5,%d4.w),%d0 | pos3 plane0
|
||||
or.b 13(%a5,%d4.w),%d1 | pos3 plane1
|
||||
or.b 14(%a5,%d4.w),%d2 | pos3 plane2
|
||||
or.b 15(%a5,%d4.w),%d3 | pos3 plane3
|
||||
|
||||
| ----- Store plane bytes -----
|
||||
move.b %d0,(%a1)+ | plane 0 dest row, advance
|
||||
move.b %d1,(%a2)+ | plane 1 dest row, advance
|
||||
move.b %d2,(%a3)+ | plane 2 dest row, advance
|
||||
move.b %d3,(%a4)+ | plane 3 dest row, advance
|
||||
|
||||
dbra %d7,.LbyteLoop | repeat until the count-1 in d7 runs out
|
||||
|
||||
.Ldone:
|
||||
movem.l (%sp)+,%d2-%d7/%a2-%a6 | restore the registers saved on entry
|
||||
rts
|
||||
|
|
@ -115,69 +115,10 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]
|
|||
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4)));
|
||||
static bool gCacheValid = false;
|
||||
|
||||
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
|
||||
// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
|
||||
// the plane-byte bit contribution that source byte `src` makes to
|
||||
// plane `plane` when it sits at byte-position `pos` within a 4-byte
|
||||
// (8-pixel) planar group. The src-major layout lets the asm inner
|
||||
// loop reach all 16 (pos, plane) entries for a single src byte via
|
||||
// 8-bit displacements off (a5, d4.w) without any LEA between reads.
|
||||
static uint8_t gC2pLut[4 * 1024];
|
||||
static bool gC2pLutReady = false;
|
||||
|
||||
static bool paletteOrScbChanged(const SurfaceT *src);
|
||||
static void initC2pLut(void);
|
||||
|
||||
// Provided by src/port/amiga/c2p.s.
|
||||
extern void chunkyToPlanarRow(const uint8_t *src,
|
||||
uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
|
||||
uint16_t numPlanarBytes,
|
||||
const uint8_t *lut);
|
||||
|
||||
// ----- Internal helpers (alphabetical) -----
|
||||
|
||||
// Build the 4 KB chunky-to-planar lookup table consumed by
|
||||
// chunkyToPlanarRow. For each (pos, plane, src) tuple, store the
|
||||
// bit contribution that source byte `src` makes to plane `plane`
|
||||
// when it sits at byte-position `pos` (0..3) within a 4-byte
|
||||
// (8-pixel) planar group:
|
||||
//
|
||||
// - src high nibble = leftmost pixel -> plane bit (7 - 2*pos)
|
||||
// - src low nibble = rightmost pixel -> plane bit (6 - 2*pos)
|
||||
static void initC2pLut(void) {
|
||||
uint16_t pos;
|
||||
uint16_t plane;
|
||||
uint16_t src;
|
||||
uint8_t highShift;
|
||||
uint8_t lowShift;
|
||||
uint8_t highBit;
|
||||
uint8_t lowBit;
|
||||
|
||||
if (gC2pLutReady) {
|
||||
return;
|
||||
}
|
||||
for (src = 0; src < 256; src++) {
|
||||
for (pos = 0; pos < 4; pos++) {
|
||||
highShift = (uint8_t)(7 - 2 * pos);
|
||||
lowShift = (uint8_t)(6 - 2 * pos);
|
||||
for (plane = 0; plane < 4; plane++) {
|
||||
highBit = (uint8_t)(((src >> 4) >> plane) & 1);
|
||||
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
|
||||
gC2pLut[src * 16 + pos * 4 + plane] =
|
||||
(uint8_t)((highBit << highShift) | (lowBit << lowShift));
|
||||
}
|
||||
}
|
||||
}
|
||||
gC2pLutReady = true;
|
||||
}
|
||||
|
||||
|
||||
// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own
|
||||
// per-row chunkyToPlanarRow loop -- the only code path that still
|
||||
// converts chunky to planar today, since asset loading is the only
|
||||
// surface mutation that doesn't go through a planar-aware primitive.)
|
||||
|
||||
|
||||
// Build a user copper list for per-scanline palette (SCB emulation).
|
||||
// One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is
|
||||
// stored in gNewUCL until installCopperList swaps it onto the screen.
|
||||
|
|
@ -1358,35 +1299,6 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
|
|||
}
|
||||
|
||||
|
||||
/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes
|
||||
* from a freshly-loaded chunky pixel buffer (s->pixels). */
|
||||
static void amigaPopulatePlanesFromChunky(SurfaceT *s) {
|
||||
AmigaPlanarT *pd;
|
||||
int16_t y;
|
||||
const uint8_t *srcLine;
|
||||
UBYTE *p0;
|
||||
UBYTE *p1;
|
||||
UBYTE *p2;
|
||||
UBYTE *p3;
|
||||
|
||||
pd = (AmigaPlanarT *)s->portData;
|
||||
if (pd == NULL) {
|
||||
return;
|
||||
}
|
||||
if (!gC2pLutReady) {
|
||||
initC2pLut();
|
||||
}
|
||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||
srcLine = &s->pixels[y * SURFACE_BYTES_PER_ROW];
|
||||
p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Phase 6 planar dual-write for sprite draw. Walks the sprite's
|
||||
// chunky tile data with the same clipping the cross-platform code
|
||||
// applies, calling amigaPlanarSetPixel for every non-transparent
|
||||
|
|
@ -2118,7 +2030,9 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
|
|||
|
||||
|
||||
/* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes
|
||||
* (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky.
|
||||
* (per plane, 4 planes). Used by halSurfaceHash to fold the planar
|
||||
* surface into the same byte-stream the chunky ports hash, so cross-
|
||||
* port hash comparisons stay valid.
|
||||
* Walks 8 pixels per planar-byte column; per pixel assembles nibble
|
||||
* from 4 plane bits. Output: 4 chunky bytes per planar-byte column
|
||||
* (since 8 pixels = 4 chunky bytes at 2px/byte). */
|
||||
|
|
@ -2204,62 +2118,35 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
|||
}
|
||||
|
||||
|
||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
||||
// On-disk format is the Amiga's native plane-major buffer: planes
|
||||
// 0..3 written sequentially, AMIGA_PLANE_SIZE bytes each.
|
||||
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||
AmigaPlanarT *pd;
|
||||
uint8_t *scratch;
|
||||
uint8_t *srcLine;
|
||||
int16_t y;
|
||||
UBYTE *p0;
|
||||
UBYTE *p1;
|
||||
UBYTE *p2;
|
||||
UBYTE *p3;
|
||||
bool ok;
|
||||
uint8_t i;
|
||||
|
||||
pd = (AmigaPlanarT *)dst->portData;
|
||||
if (pd == NULL) {
|
||||
return false;
|
||||
}
|
||||
/* fread the chunky file payload into a scratch buffer, then c2p
|
||||
* directly into our planes. The scratch is a one-shot AllocMem
|
||||
* (PUBLIC, not chip) since chunkyToPlanarRow only reads it. */
|
||||
scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC);
|
||||
if (scratch == NULL) {
|
||||
for (i = 0; i < AMIGA_BITPLANES; i++) {
|
||||
if (fread(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
|
||||
return false;
|
||||
}
|
||||
ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
|
||||
if (ok) {
|
||||
if (!gC2pLutReady) {
|
||||
initC2pLut();
|
||||
}
|
||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||
srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
|
||||
p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
||||
chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
|
||||
}
|
||||
}
|
||||
FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE);
|
||||
return ok;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
||||
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||
AmigaPlanarT *pd;
|
||||
uint8_t chunkyRow[SURFACE_BYTES_PER_ROW];
|
||||
int16_t y;
|
||||
uint8_t i;
|
||||
|
||||
pd = (AmigaPlanarT *)src->portData;
|
||||
if (pd == NULL) {
|
||||
return false;
|
||||
}
|
||||
/* Per row: derive chunky from planes, write 160 bytes. Less
|
||||
* efficient than a single fwrite of a full buffer but avoids
|
||||
* needing a 32 KB scratch allocation. */
|
||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||
amigaPlanesToChunkyRow(pd, y, chunkyRow);
|
||||
if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
|
||||
for (i = 0; i < AMIGA_BITPLANES; i++) {
|
||||
if (fwrite(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,188 +0,0 @@
|
|||
| Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
|
||||
|
|
||||
| Drop-in replacement for hal.c's old c2pRow C inner loop. The C
|
||||
| version walked every pixel and built each plane word with a
|
||||
| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
|
||||
| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
|
||||
| codegen overhead. This rewrite uses a 4 KB lookup table built once
|
||||
| at HAL init: same layout as the Amiga c2p LUT, so the
|
||||
| (sourceByte, position, plane) -> 2-bit contribution mapping is
|
||||
| identical, but the routine packs results into ST word-interleaved
|
||||
| planar (4 plane words per 16-pixel group) instead of 4 separate
|
||||
| plane bytes.
|
||||
|
|
||||
| Each ST group is 8 source bytes -> 4 plane words. Source byte
|
||||
| positions 0..3 contribute to the HIGH byte of each plane word
|
||||
| (bits 15..8); positions 4..7 contribute to the LOW byte (bits
|
||||
| 7..0). Within a byte, the LUT for (src, bp%4, plane) already
|
||||
| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
|
||||
| entries for both halves -- we just shift d0..d3 left by 8 between
|
||||
| the halves to move the high-half bits up before the low half ORs
|
||||
| into the now-empty low byte.
|
||||
|
|
||||
| Calling convention: m68k-atari-mint-gcc cdecl.
|
||||
| Args on stack at 4(sp), 8(sp), ...
|
||||
| d2-d7, a2-a6 are callee-save.
|
||||
| No return value.
|
||||
|
|
||||
| void chunkyToPlanarRowSt(const uint8_t *src, ; 4(sp) - 4bpp packed source row
|
||||
| uint16_t *dst, ; 8(sp) - planar dest row (uint16_t*)
|
||||
| uint16_t groupStart, ; 12(sp) - first group index (low word)
|
||||
| uint16_t groupEnd, ; 16(sp) - one-past-last group index (low word)
|
||||
| const uint8_t *lut); ; 20(sp) - 4 KB LUT base
|
||||
|
|
||||
| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
|
||||
| contribution for source byte `src` at byte-position `pos` (0..3
|
||||
| within a 4-byte chunk) going to plane `plane` (0..3). All 16
|
||||
| (pos, plane) entries for one src byte are contiguous, so the inner
|
||||
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
|
||||
| (0..15) without LEA between reads.
|
||||
|
|
||||
| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
|
||||
| the gcc driver.
|
||||
|
||||
.text
|
||||
.globl _chunkyToPlanarRowSt
|
||||
|
||||
| MOVEM frame: only the callee-saved registers this routine actually
| modifies are preserved -- d2-d4, d6-d7 and a5 = 6 regs * 4 bytes =
| 24 bytes. d0-d1/a0-a1 are scratch under the cdecl ABI, and d5,
| a2-a4 and a6 are never touched, so saving them would only add MOVEM
| cycles to every row converted.
        .equ    SAVED_REGS_SIZE, 24


| ---- C2P_SRC_BYTE macro -----------------------------------------
| Consume one packed source byte from (a0)+ and OR its four LUT
| entries (one per plane) into the low bytes of d0..d3.
|   lutOff = pos * 4, where pos (0..3) is the byte's position inside
|            the current 4-byte half of the group.
| d4 = src * 16 is at most 4080, so the sign-extended .w index is
| always positive.
| Trashes: d4
        .macro  C2P_SRC_BYTE lutOff
        moveq   #0,%d4
        move.b  (%a0)+,%d4
        lsl.w   #4,%d4                          | d4 = src * 16
        or.b    \lutOff+0(%a5,%d4.w),%d0        | plane 0
        or.b    \lutOff+1(%a5,%d4.w),%d1        | plane 1
        or.b    \lutOff+2(%a5,%d4.w),%d2        | plane 2
        or.b    \lutOff+3(%a5,%d4.w),%d3        | plane 3
        .endm


| void chunkyToPlanarRowSt(src, dst, groupStart, groupEnd, lut)
|   Converts groups [groupStart, groupEnd) of one 4bpp packed row
|   into ST word-interleaved planar words. Does nothing when
|   groupEnd <= groupStart. No return value.
_chunkyToPlanarRowSt:
        movem.l %d2-%d4/%d6-%d7/%a5,-(%sp)

        move.l  4+SAVED_REGS_SIZE(%sp),%a0      | src row base
        move.l  8+SAVED_REGS_SIZE(%sp),%a1      | dst (uint16_t*)
        | Both groupStart and groupEnd are uint16_t but GCC
        | promotes them to int and pushes 4 bytes each; the
        | low word lives at +2 in big-endian layout.
        move.w  12+SAVED_REGS_SIZE+2(%sp),%d6   | groupStart
        move.w  16+SAVED_REGS_SIZE+2(%sp),%d7   | groupEnd
        move.l  20+SAVED_REGS_SIZE(%sp),%a5     | LUT base

        | Advance src and dst to the first group's data.
        | Each group consumes 8 source bytes and produces 4
        | dest words (8 bytes), so both pointers advance by
        | groupStart * 8. The .w add to an address register
        | sign-extends its source word.
        move.w  %d6,%d4
        lsl.w   #3,%d4
        add.w   %d4,%a0
        add.w   %d4,%a1

        sub.w   %d6,%d7                         | groupCount = end - start
        subq.w  #1,%d7                          | DBRA bias
        bmi     .Ldone                          | zero or negative count

.LgroupLoop:
        moveq   #0,%d0                          | plane 0 acc
        moveq   #0,%d1                          | plane 1 acc
        moveq   #0,%d2                          | plane 2 acc
        moveq   #0,%d3                          | plane 3 acc

        | ===== Source bytes 0..3 -> high byte of each plane word =====
        C2P_SRC_BYTE 0
        C2P_SRC_BYTE 4
        C2P_SRC_BYTE 8
        C2P_SRC_BYTE 12

        | Move accumulated bits into the HIGH byte of each word,
        | leaving the low byte empty for the second half.
        lsl.w   #8,%d0
        lsl.w   #8,%d1
        lsl.w   #8,%d2
        lsl.w   #8,%d3

        | ===== Source bytes 4..7 -> low byte of each plane word =====
        | Same LUT offsets as the first half: position is taken mod 4.
        C2P_SRC_BYTE 0
        C2P_SRC_BYTE 4
        C2P_SRC_BYTE 8
        C2P_SRC_BYTE 12

        | Store 4 plane words.
        move.w  %d0,(%a1)+
        move.w  %d1,(%a1)+
        move.w  %d2,(%a1)+
        move.w  %d3,(%a1)+

        dbra    %d7,.LgroupLoop

.Ldone:
        movem.l (%sp)+,%d2-%d4/%d6-%d7/%a5
        rts
|
||||
|
|
@ -82,11 +82,9 @@
|
|||
.macro YP_REC slot, signOp, yreg
|
||||
move.l %a4,%d6
|
||||
\signOp\().w \yreg,%d6 | d6.w = yp
|
||||
move.w %d6,%d0
|
||||
lsl.w #5,%d6 | d6 = yp << 5
|
||||
lsl.w #7,%d0 | d0 = yp << 7
|
||||
add.w %d6,%d0 | d0 = yp * 160
|
||||
move.w %d0,\slot(%sp)
|
||||
add.w %d6,%d6 | * 2 for word index
|
||||
move.w (%a6,%d6.w),%d6 | yLut[yp] = yp * 160
|
||||
move.w %d6,\slot(%sp)
|
||||
.endm
|
||||
|
||||
|
||||
|
|
@ -223,14 +221,21 @@ _surface68kStCircleOutline:
|
|||
moveq #1,%d4
|
||||
sub.w %d2,%d4 | err = 1 - bx
|
||||
|
||||
| a6 = yLut base (yp -> yp*160). Lookup is faster than
|
||||
| the 4 cyc + 4 cyc + 18 cyc + 22 cyc + 4 cyc shift+add
|
||||
| chain we used to do per YP_REC. Saved across all 4
|
||||
| YP_RECs per Bresenham iter (~120 cyc/iter).
|
||||
| The shared LUT lives in lineSpan.s; it is referenced by absolute address.
|
||||
lea _gStRowOffsetLut,%a6
|
||||
|
||||
| Dispatch on color (low 4 bits) -> one of 16 main loops.
|
||||
moveq #0,%d6
|
||||
move.b SP_COLOR(%sp),%d6
|
||||
and.w #0x0F,%d6
|
||||
add.w %d6,%d6
|
||||
add.w %d6,%d6 | * 4 for bra.w table
|
||||
lea .LcoStTable(%pc),%a6
|
||||
jmp 0(%a6,%d6.w)
|
||||
lea .LcoStTable(%pc),%a2
|
||||
jmp 0(%a2,%d6.w)
|
||||
|
||||
.LcoStTable:
|
||||
bra.w .LcoStLoop_0
|
||||
|
|
@ -280,3 +285,4 @@ bitMaskWordLut:
|
|||
.word 0x0800, 0x0400, 0x0200, 0x0100
|
||||
.word 0x0080, 0x0040, 0x0020, 0x0010
|
||||
.word 0x0008, 0x0004, 0x0002, 0x0001
|
||||
| (yLut now lives in lineSpan.s as the shared _gStRowOffsetLut)
|
||||
|
|
|
|||
|
|
@ -9,28 +9,16 @@
|
|||
| Caller MUST guarantee the bounding box (cx-r, cy-r)..(cx+r, cy+r)
|
||||
| is fully on-surface. Off-surface circles fall back to the C walker.
|
||||
|
|
||||
| Phase 10 final: 16-way color dispatch at the OUTER loop. Each color
|
||||
| variant has its own Bresenham body where SPAN_BODY inlines a hard-
|
||||
| coded 4-plane mask RMW (no btst, no bsr/rts). Saves ~120 cyc per
|
||||
| applyMask call (was ~180 via bsr applyMask with runtime btst on d7).
|
||||
|
|
||||
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
||||
|
|
||||
| void surface68kStFillCircle(uint8_t *base,
|
||||
| uint16_t cx, uint16_t cy,
|
||||
| uint16_t r, uint8_t color);
|
||||
|
|
||||
| Register allocation across the loop:
|
||||
| d2.w = bx (Bresenham, starts at r)
|
||||
| d3.w = by (Bresenham, starts at 0)
|
||||
| d4.w = err
|
||||
| d5.l = loLong (planes 0+1 long template)
|
||||
| d6.l = hiLong (planes 2+3 long template)
|
||||
| d7.b = color (low nibble; tested via btst)
|
||||
| a3 = base
|
||||
| a4 = scratch / current group pointer
|
||||
| d0,d1 = scratch
|
||||
|
|
||||
| Stack scratch (8 bytes at 0(sp)..7(sp)):
|
||||
| 0..1 leftMask (word; per pair)
|
||||
| 2..3 rightMask (word; per pair)
|
||||
| 4..5 numGroups (word; per pair)
|
||||
| 6..7 groupFirstByteOff (word; per pair)
|
||||
|
||||
.text
|
||||
|
||||
|
|
@ -42,7 +30,7 @@
|
|||
.equ SP_FC_CX, SP_FC_OFF + 4 + 2
|
||||
.equ SP_FC_CY, SP_FC_OFF + 8 + 2
|
||||
.equ SP_FC_R, SP_FC_OFF + 12 + 2
|
||||
.equ SP_FC_COLOR, SP_FC_OFF + 16 + 3
|
||||
.equ SP_FC_COLOR, SP_FC_OFF + 20 + 3
|
||||
|
||||
|
||||
| ---- COMPUTE_PAIR_MASKS macro -----------------------------------
|
||||
|
|
@ -50,18 +38,15 @@
|
|||
| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
|
||||
| 6(sp) groupFirstByteOff
|
||||
| Trashes: d0, d1
|
||||
| (No labels: straightline.)
|
||||
|
||||
.macro COMPUTE_PAIR_MASKS
|
||||
move.w %d0,0(%sp) | stash left
|
||||
move.w %d1,2(%sp) | stash right
|
||||
| groupFirst & groupFirstByteOff
|
||||
move.w %d0,%d1
|
||||
lsr.w #4,%d1 | groupFirst
|
||||
move.w %d1,%d0
|
||||
lsl.w #3,%d0 | groupFirstByteOff
|
||||
move.w %d0,6(%sp)
|
||||
| numGroups = (right >> 4) - groupFirst
|
||||
move.w 2(%sp),%d0
|
||||
lsr.w #4,%d0 | groupLast
|
||||
sub.w %d1,%d0 | numGroups
|
||||
|
|
@ -81,25 +66,53 @@
|
|||
.endm
|
||||
|
||||
|
||||
| ---- SPAN_BODY macro --------------------------------------------
|
||||
| Render one row span using the pair masks at 0(sp)..7(sp).
|
||||
| Input: d0.w = y (signed)
|
||||
| a3 = base, d5 = loLong, d6 = hiLong, d7 = color
|
||||
| Trashes: d0, d1, a4
|
||||
| Macro takes an idx parameter for unique labels.
|
||||
| ---- APPLY_MASK_INLINE macro ------------------------------------
|
||||
| 4-plane mask RMW with HARDCODED color. a4 advances by 8 (postinc).
|
||||
| Inputs: d0.w = mask, a4 = group ptr
|
||||
| Trashes: d1 (notMask scratch)
|
||||
|
||||
.macro SPAN_BODY
|
||||
| a4 = base + y*160
|
||||
ext.l %d0
|
||||
move.l %d0,%d1
|
||||
lsl.l #5,%d0
|
||||
lsl.l #7,%d1
|
||||
add.l %d1,%d0 | y*160
|
||||
lea 0(%a3,%d0.l),%a4
|
||||
| a4 += groupFirstByteOff
|
||||
moveq #0,%d0
|
||||
move.w 6(%sp),%d0
|
||||
add.l %d0,%a4
|
||||
.macro APPLY_MASK_INLINE color
| Read-modify-write the four consecutive words at (a4) under the
| mask in d0.w. Each bit of \color is tested at assembly time, so
| only one of the two instructions in every .if/.else pair is
| emitted: bit set -> OR the mask in, bit clear -> AND with the
| inverted mask. Every access post-increments a4 by 2, so a4 ends
| up 8 bytes further on.
|
||||
move.w %d0,%d1
|
||||
| d1 = ~mask, used by the "clear" variants below
not.w %d1
|
||||
| word 0 <- color bit 0
.if ((\color) & 1)
|
||||
or.w %d0,(%a4)+
|
||||
.else
|
||||
and.w %d1,(%a4)+
|
||||
.endif
|
||||
| word 1 <- color bit 1
.if ((\color) & 2)
|
||||
or.w %d0,(%a4)+
|
||||
.else
|
||||
and.w %d1,(%a4)+
|
||||
.endif
|
||||
| word 2 <- color bit 2
.if ((\color) & 4)
|
||||
or.w %d0,(%a4)+
|
||||
.else
|
||||
and.w %d1,(%a4)+
|
||||
.endif
|
||||
| word 3 <- color bit 3
.if ((\color) & 8)
|
||||
or.w %d0,(%a4)+
|
||||
.else
|
||||
and.w %d1,(%a4)+
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
||||
| ---- SPAN_BODY macro --------------------------------------------
|
||||
| Render one row span. Color hardcoded.
|
||||
| Input: d0.w = y (signed)
|
||||
| a3 = base, d5 = loLong, d6 = hiLong
|
||||
| masks at 0..7(sp): leftMask, rightMask, numGroups, groupFirstByteOff
|
||||
| Trashes: d0, d1, a4
|
||||
|
||||
.macro SPAN_BODY color
|
||||
| a4 = base + y*160 + groupFirstByteOff
|
||||
| y*160 via shared _gStRowOffsetLut (a2 holds lut base).
|
||||
| byteOff (y*160 + groupFirstByteOff) fits in a signed 16-bit word
|
||||
| (max 31992), so word-only ops + .w-indexed lea.
|
||||
add.w %d0,%d0 | y * 2 (word index)
|
||||
move.w (%a2,%d0.w),%d0 | d0 = y * 160
|
||||
add.w 6(%sp),%d0 | + groupFirstByteOff
|
||||
lea 0(%a3,%d0.w),%a4
|
||||
| numGroups in d1
|
||||
move.w 4(%sp),%d1
|
||||
tst.w %d1
|
||||
|
|
@ -107,15 +120,14 @@
|
|||
| single-group: combinedMask = leftMask & rightMask
|
||||
move.w 0(%sp),%d0
|
||||
and.w 2(%sp),%d0
|
||||
bsr .Lfc_applyMask
|
||||
APPLY_MASK_INLINE \color
|
||||
bra.w .Lsb_done\@
|
||||
.Lsb_multi\@:
|
||||
| leading mask. applyMask postinc-advances a4 by 8
|
||||
| (the 4 plane RMWs each advance by 2 via (a4)+).
|
||||
| applyMask trashes d1, so reload numGroups after bsr.
|
||||
| leading mask. APPLY_MASK_INLINE postinc-advances a4 by 8.
|
||||
| APPLY trashes d1, so reload numGroups after.
|
||||
move.w 0(%sp),%d0
|
||||
bsr .Lfc_applyMask
|
||||
move.w 4(%sp),%d1 | reload numGroups
|
||||
APPLY_MASK_INLINE \color
|
||||
move.w 4(%sp),%d1
|
||||
subq.w #1,%d1 | d1 = numMid
|
||||
beq.s .Lsb_skipMid\@
|
||||
.Lsb_midLoop\@:
|
||||
|
|
@ -126,11 +138,71 @@
|
|||
.Lsb_skipMid\@:
|
||||
| trailing mask
|
||||
move.w 2(%sp),%d0
|
||||
bsr .Lfc_applyMask
|
||||
APPLY_MASK_INLINE \color
|
||||
.Lsb_done\@:
|
||||
.endm
|
||||
|
||||
|
||||
| ---- CO_BODY macro: per-color full Bresenham loop body ----------
|
||||
|
||||
.macro CO_BODY color
| Expands to one complete fill loop specialised for \color; the
| color is forwarded to every SPAN_BODY expansion. Loop state lives
| in d2.w (bx), d3.w (by) and d4.w (err). Each pass fills four
| horizontal spans -- two rows for each of the two x ranges -- and
| then advances by/bx/err. The loop exits through .Lfc_done.
|
||||
.Lfc_loop_\color:
|
||||
| cmp computes d2 - d3; carry set means bx < by (unsigned) -> done
cmp.w %d3,%d2
|
||||
bcs.w .Lfc_done
|
||||
|
||||
| --- Pair A: x range = (cx - bx, cx + bx)
|
||||
| d0 = left, d1 = right: the inputs COMPUTE_PAIR_MASKS stashes
move.w SP_FC_CX(%sp),%d0
|
||||
move.w %d0,%d1
|
||||
sub.w %d2,%d0
|
||||
add.w %d2,%d1
|
||||
COMPUTE_PAIR_MASKS
|
||||
|
||||
| Span A1: y = cy + by
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
add.w %d3,%d0
|
||||
SPAN_BODY \color
|
||||
|
||||
| Span A2: y = cy - by
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
sub.w %d3,%d0
|
||||
SPAN_BODY \color
|
||||
|
||||
| --- Pair B: x range = (cx - by, cx + by)
|
||||
move.w SP_FC_CX(%sp),%d0
|
||||
move.w %d0,%d1
|
||||
sub.w %d3,%d0
|
||||
add.w %d3,%d1
|
||||
COMPUTE_PAIR_MASKS
|
||||
|
||||
| Span B1: y = cy + bx
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
add.w %d2,%d0
|
||||
SPAN_BODY \color
|
||||
|
||||
| Span B2: y = cy - bx
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
sub.w %d2,%d0
|
||||
SPAN_BODY \color
|
||||
|
||||
| --- Bresenham step
|
||||
| by always advances; bx only shrinks when err is positive.
addq.w #1,%d3
|
||||
tst.w %d4
|
||||
bgt.s .Lfc_decBx_\color
|
||||
| err <= 0: err += 2*by + 1 (by already incremented)
add.w %d3,%d4
|
||||
add.w %d3,%d4
|
||||
addq.w #1,%d4
|
||||
bra.w .Lfc_loop_\color
|
||||
.Lfc_decBx_\color:
|
||||
| err > 0: bx -= 1, then err += 2*by - 2*bx + 1 (updated by and bx)
subq.w #1,%d2
|
||||
add.w %d3,%d4
|
||||
add.w %d3,%d4
|
||||
sub.w %d2,%d4
|
||||
sub.w %d2,%d4
|
||||
addq.w #1,%d4
|
||||
bra.w .Lfc_loop_\color
|
||||
.endm
|
||||
|
||||
|
||||
.globl _surface68kStFillCircle
|
||||
|
||||
_surface68kStFillCircle:
|
||||
|
|
@ -142,10 +214,11 @@ _surface68kStFillCircle:
|
|||
moveq #0,%d7
|
||||
move.b SP_FC_COLOR(%sp),%d7
|
||||
|
||||
| LUT bases (PC-relative indexed has only 8-bit
|
||||
| displacement, so cache full pointers in a-regs).
|
||||
| LUT bases. a5/a6 = mask LUTs (used by COMPUTE_PAIR_MASKS).
|
||||
| a2 = shared _gStRowOffsetLut (used by SPAN_BODY for y*160).
|
||||
lea leftMaskLut(%pc),%a5
|
||||
lea rightMaskLut(%pc),%a6
|
||||
lea _gStRowOffsetLut,%a2
|
||||
|
||||
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
|
||||
moveq #0,%d5
|
||||
|
|
@ -174,60 +247,50 @@ _surface68kStFillCircle:
|
|||
moveq #1,%d4
|
||||
sub.w %d2,%d4
|
||||
|
||||
.Lfc_loop:
|
||||
cmp.w %d3,%d2
|
||||
bcs.w .Lfc_done
|
||||
| Dispatch on color (low 4 bits) -> 16 specialized loops.
|
||||
| Use a4 (gets overwritten in SPAN_BODY's first lea) as
|
||||
| dispatch scratch since a2 now holds yLut for the body.
|
||||
and.w #0x0F,%d7
|
||||
move.w %d7,%d0
|
||||
add.w %d0,%d0
|
||||
add.w %d0,%d0 | * 4 for bra.w table
|
||||
lea .Lfc_table(%pc),%a4
|
||||
jmp 0(%a4,%d0.w)
|
||||
|
||||
| --- Pair A: x range = (cx - bx, cx + bx)
|
||||
move.w SP_FC_CX(%sp),%d0
|
||||
move.w %d0,%d1
|
||||
sub.w %d2,%d0 | left = cx - bx
|
||||
add.w %d2,%d1 | right = cx + bx
|
||||
COMPUTE_PAIR_MASKS
|
||||
.Lfc_table:
|
||||
bra.w .Lfc_loop_0
|
||||
bra.w .Lfc_loop_1
|
||||
bra.w .Lfc_loop_2
|
||||
bra.w .Lfc_loop_3
|
||||
bra.w .Lfc_loop_4
|
||||
bra.w .Lfc_loop_5
|
||||
bra.w .Lfc_loop_6
|
||||
bra.w .Lfc_loop_7
|
||||
bra.w .Lfc_loop_8
|
||||
bra.w .Lfc_loop_9
|
||||
bra.w .Lfc_loop_10
|
||||
bra.w .Lfc_loop_11
|
||||
bra.w .Lfc_loop_12
|
||||
bra.w .Lfc_loop_13
|
||||
bra.w .Lfc_loop_14
|
||||
bra.w .Lfc_loop_15
|
||||
|
||||
| Span A1: y = cy + by
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
add.w %d3,%d0
|
||||
SPAN_BODY
|
||||
|
||||
| Span A2: y = cy - by
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
sub.w %d3,%d0
|
||||
SPAN_BODY
|
||||
|
||||
| --- Pair B: x range = (cx - by, cx + by)
|
||||
move.w SP_FC_CX(%sp),%d0
|
||||
move.w %d0,%d1
|
||||
sub.w %d3,%d0 | left = cx - by
|
||||
add.w %d3,%d1 | right = cx + by
|
||||
COMPUTE_PAIR_MASKS
|
||||
|
||||
| Span B1: y = cy + bx
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
add.w %d2,%d0
|
||||
SPAN_BODY
|
||||
|
||||
| Span B2: y = cy - bx
|
||||
move.w SP_FC_CY(%sp),%d0
|
||||
sub.w %d2,%d0
|
||||
SPAN_BODY
|
||||
|
||||
| --- Bresenham step
|
||||
addq.w #1,%d3
|
||||
tst.w %d4
|
||||
bgt.s .Lfc_decBx
|
||||
add.w %d3,%d4
|
||||
add.w %d3,%d4
|
||||
addq.w #1,%d4
|
||||
bra.w .Lfc_loop
|
||||
.Lfc_decBx:
|
||||
subq.w #1,%d2
|
||||
add.w %d3,%d4
|
||||
add.w %d3,%d4
|
||||
sub.w %d2,%d4
|
||||
sub.w %d2,%d4
|
||||
addq.w #1,%d4
|
||||
bra.w .Lfc_loop
|
||||
CO_BODY 0
|
||||
CO_BODY 1
|
||||
CO_BODY 2
|
||||
CO_BODY 3
|
||||
CO_BODY 4
|
||||
CO_BODY 5
|
||||
CO_BODY 6
|
||||
CO_BODY 7
|
||||
CO_BODY 8
|
||||
CO_BODY 9
|
||||
CO_BODY 10
|
||||
CO_BODY 11
|
||||
CO_BODY 12
|
||||
CO_BODY 13
|
||||
CO_BODY 14
|
||||
CO_BODY 15
|
||||
|
||||
|
||||
.Lfc_done:
|
||||
|
|
@ -236,46 +299,6 @@ _surface68kStFillCircle:
|
|||
rts
|
||||
|
||||
|
||||
| ---- Apply 4-plane mask at (a4) -------------------------------
|
||||
| Input: d0.w = mask, d7.b = color, a4 = group ptr
|
||||
| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
|
||||
| Trashes: d1 (d0 is read but never written by the code below)
|
||||
| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
|
||||
| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
|
||||
|
||||
.Lfc_applyMask:
|
||||
move.w %d0,%d1
|
||||
not.w %d1 | d1 = notMask
|
||||
| Word 0: color bit 0 set -> OR the mask in, clear -> AND it out.
btst #0,%d7
|
||||
beq.s .Lfc_am0a
|
||||
or.w %d0,(%a4)+
|
||||
bra.s .Lfc_am1
|
||||
.Lfc_am0a:
|
||||
and.w %d1,(%a4)+
|
||||
.Lfc_am1:
|
||||
| Word 1: color bit 1.
btst #1,%d7
|
||||
beq.s .Lfc_am1a
|
||||
or.w %d0,(%a4)+
|
||||
bra.s .Lfc_am2
|
||||
.Lfc_am1a:
|
||||
and.w %d1,(%a4)+
|
||||
.Lfc_am2:
|
||||
| Word 2: color bit 2.
btst #2,%d7
|
||||
beq.s .Lfc_am2a
|
||||
or.w %d0,(%a4)+
|
||||
bra.s .Lfc_am3
|
||||
.Lfc_am2a:
|
||||
and.w %d1,(%a4)+
|
||||
.Lfc_am3:
|
||||
| Word 3: color bit 3. Both arms return directly instead of
| rejoining, so no final bra.s is needed.
btst #3,%d7
|
||||
beq.s .Lfc_am3a
|
||||
or.w %d0,(%a4)+
|
||||
rts
|
||||
.Lfc_am3a:
|
||||
and.w %d1,(%a4)+
|
||||
rts
|
||||
|
||||
|
||||
.align 2
|
||||
| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
|
||||
leftMaskLut:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
//
|
||||
// M2 scope:
|
||||
// * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
|
||||
// * Chunky 4bpp to word-interleaved ST planar c2p at present time.
|
||||
// * Word-interleaved ST planar buffer copied to the screen at present.
|
||||
//
|
||||
// M2.5 scope (per-band palette / SCB emulation):
|
||||
// * halPresent scans the SurfaceT's SCB array and builds a compact
|
||||
|
|
@ -136,17 +136,9 @@ static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPl
|
|||
}
|
||||
static uint16_t quantizeColorToSt(uint16_t orgb);
|
||||
static void flattenScbPalettes(const SurfaceT *src);
|
||||
static void initC2pLut(void);
|
||||
static void writeDiagnostics(void);
|
||||
static long writePrevPaletteRegs(void);
|
||||
|
||||
// Provided by src/port/atarist/c2p.s.
|
||||
extern void chunkyToPlanarRowSt(const uint8_t *src,
|
||||
uint16_t *dst,
|
||||
uint16_t groupStart,
|
||||
uint16_t groupEnd,
|
||||
const uint8_t *lut);
|
||||
|
||||
static __attribute__((interrupt_handler)) void timerBIsr(void);
|
||||
static __attribute__((interrupt_handler)) void vblIsr(void);
|
||||
static void buildTransitions(const SurfaceT *src);
|
||||
|
|
@ -201,72 +193,11 @@ static void (*gOldTimerBVec)(void) = NULL;
|
|||
// SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
|
||||
// (and every frame of the keys demo after the initial paint) SCB and
|
||||
// palette never change, so caching and skipping those passes keeps
|
||||
// rect presents down to just the c2p work.
|
||||
// rect presents down to just the screen blit.
|
||||
static uint8_t gCachedScb [SURFACE_HEIGHT];
|
||||
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
|
||||
static bool gCacheValid = false;
|
||||
|
||||
// 256-long plane-spread LUT for the asm sprite SAVE path (defined in
|
||||
// spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each
|
||||
// of b's 8 bits is placed at the bit-0 position of the corresponding
|
||||
// pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT
|
||||
// entry left by N to get plane N's contribution; OR'd across 4 planes
|
||||
// gives the full chunky long. Initialized lazily.
|
||||
//
|
||||
// LUT used by surface68kStSpriteSaveByteAligned. The asm reads via
|
||||
// `move.l (a_ptr, d0.l), d4` which requires the LUT to be long-
|
||||
// aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse,
|
||||
// the cascading offsets from the odd-sized gC2pLut put even
|
||||
// `uint32_t` BSS slots at addr mod 4 == 2.
|
||||
//
|
||||
// Fix: malloc the LUT. mintlib's malloc returns long-aligned memory.
|
||||
// The pointer is passed to the asm via the C-side wrapper (so the
|
||||
// asm reads it from the stack, where it's guaranteed long-aligned
|
||||
// regardless of where the static pointer slot lives).
|
||||
static uint32_t *gStPlaneSpreadLutPtr   = NULL;
static bool      gStPlaneSpreadLutReady = false;


// Build the 256-entry plane-spread LUT on first use.
//
// Entry b holds one set bit for every set bit of plane byte b. Bit
// (7 - i) of b -- pixel i, counted from the most significant bit --
// lands on bit 0 of pixel i's nibble inside a 32-bit chunky long:
// byte i/2 counted from the most significant byte, high nibble for
// even i, low nibble for odd i. That is bit 28 - 4*i of the long.
//
// Returns true when the table is ready (already built, or built by
// this call) and false when the allocation failed; nothing is
// published in that case, so a later call simply retries.
static bool initStPlaneSpreadLut(void) {
    uint32_t *lut;
    uint32_t  spread;
    int       b;
    int       i;

    if (gStPlaneSpreadLutReady) {
        return true;
    }

    // No cast and sizeof on the pointee: stays correct if the element
    // type ever changes.
    lut = malloc(256 * sizeof *lut);
    if (lut == NULL) {
        return false;
    }

    for (b = 0; b < 256; b++) {
        spread = 0u;
        for (i = 0; i < 8; i++) {
            if (b & (0x80 >> i)) {
                spread |= (uint32_t)1u << (28 - 4 * i);
            }
        }
        lut[b] = spread;
    }

    // Publish the pointer only once every entry is filled in.
    gStPlaneSpreadLutPtr   = lut;
    gStPlaneSpreadLutReady = true;
    return true;
}
|
||||
|
||||
|
||||
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
|
||||
// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
|
||||
// = the 2-bit plane-byte contribution for source byte `src` at
|
||||
// byte-position `pos` (0..3 within a 4-byte chunk) going to plane
|
||||
// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
|
||||
// the same table feeds both halves of an ST plane word: positions
|
||||
// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
|
||||
// byte. Built once by initC2pLut on the first halPresent call.
|
||||
/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */
|
||||
uint8_t gC2pLut[4 * 1024];
|
||||
static bool gC2pLutReady = false;
|
||||
|
||||
// ----- Internal helpers (alphabetical) -----
|
||||
|
||||
// Scan the surface's SCB and record one transition entry for each
|
||||
|
|
@ -350,37 +281,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
|
|||
}
|
||||
|
||||
|
||||
// Build the 4 KB chunky-to-planar lookup table consumed by
|
||||
// chunkyToPlanarRowSt. Same layout/contents as the Amiga c2p LUT;
|
||||
// see src/port/atarist/c2p.s for the addressing math.
|
||||
static void initC2pLut(void) {
|
||||
uint16_t pos;
|
||||
uint16_t plane;
|
||||
uint16_t src;
|
||||
uint8_t highShift;
|
||||
uint8_t lowShift;
|
||||
uint8_t highBit;
|
||||
uint8_t lowBit;
|
||||
|
||||
if (gC2pLutReady) {
|
||||
return;
|
||||
}
|
||||
for (src = 0; src < 256; src++) {
|
||||
for (pos = 0; pos < 4; pos++) {
|
||||
highShift = (uint8_t)(7 - 2 * pos);
|
||||
lowShift = (uint8_t)(6 - 2 * pos);
|
||||
for (plane = 0; plane < 4; plane++) {
|
||||
highBit = (uint8_t)(((src >> 4) >> plane) & 1);
|
||||
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
|
||||
gC2pLut[src * 16 + pos * 4 + plane] =
|
||||
(uint8_t)((highBit << highShift) | (lowBit << lowShift));
|
||||
}
|
||||
}
|
||||
}
|
||||
gC2pLutReady = true;
|
||||
}
|
||||
|
||||
|
||||
// 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
|
||||
// each 4-bit channel).
|
||||
static uint16_t quantizeColorToSt(uint16_t orgb) {
|
||||
|
|
@ -619,11 +519,8 @@ void halPresent(const SurfaceT *src) {
|
|||
}
|
||||
refreshPaletteStateIfNeeded(src);
|
||||
|
||||
// Phase 9: planar shadow -> screen RAM. Same dirty-word band
|
||||
// tracking the c2p path used; just memcpy the planar bytes for
|
||||
// each band instead of running c2p on the chunky shadow. Each
|
||||
// dirty word covers 4 pixels = 1/4 of one group = quarter of an
|
||||
// 8-byte group. We round to whole groups (8 bytes each) for a
|
||||
// Planar buffer -> screen RAM. Each dirty word covers 4 pixels
|
||||
// (a quarter of an 8-byte group). Round to whole groups for a
|
||||
// simple aligned memcpy, since planar groups are the natural
|
||||
// copy unit.
|
||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||
|
|
@ -720,8 +617,11 @@ extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint
|
|||
extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color);
|
||||
extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color);
|
||||
extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong);
|
||||
extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut);
|
||||
extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const uint8_t *c2pLut);
|
||||
extern void surface68kStTileFill8x8(uint8_t *firstGroupPtr, uint16_t mask, uint8_t color);
|
||||
extern void surface68kStSprite16x16Save(uint8_t *base, uint16_t x, uint16_t y, uint8_t *dstBuf);
|
||||
extern void surface68kStSprite16x16Restore(uint8_t *base, uint16_t x, uint16_t y, const uint8_t *srcBuf);
|
||||
extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
|
||||
extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
|
||||
|
||||
|
||||
// Phase 9: clear the entire planar buffer to a 4-bit color. Build an
|
||||
|
|
@ -1262,17 +1162,12 @@ void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex)
|
|||
group = (uint16_t)((uint16_t)bx >> 1);
|
||||
halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu;
|
||||
gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP;
|
||||
surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex);
|
||||
/* Phase 10 final: specialized 8x8 unrolled tile-fill skips the
|
||||
* generic FRG_LOOP's per-row subq+bne overhead. */
|
||||
surface68kStTileFill8x8(gp, halfMask, colorIndex);
|
||||
}
|
||||
|
||||
|
||||
// Phase 10: group-aware tile paste. Per row: extract 8 pixels from
|
||||
// 4 chunky bytes, build 4 plane bytes (one per plane), drop them
|
||||
// into the high or low half of the 4 plane words at this group --
|
||||
// 4 word RMWs per row instead of 64 per-pixel calls.
|
||||
static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
|
||||
|
||||
|
||||
// Phase 10: tile paste/snap reuse the asm sprite save/restore
|
||||
// helpers -- identical per-row work patterns at byte-aligned
|
||||
// positions. Width 8 = single tile column = single half-group
|
||||
|
|
@ -1301,14 +1196,25 @@ void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *ti
|
|||
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
|
||||
+ group * ST_BYTES_PER_GROUP
|
||||
+ (uint16_t)(bx & 1u);
|
||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
||||
dstAddr[0] = tileBytes[0];
|
||||
dstAddr[2] = tileBytes[1];
|
||||
dstAddr[4] = tileBytes[2];
|
||||
dstAddr[6] = tileBytes[3];
|
||||
dstAddr += ST_BYTES_PER_ROW;
|
||||
tileBytes += TILE_BYTES_PER_ROW;
|
||||
}
|
||||
(void)row;
|
||||
#define ST_TILE_PASTE_ROW \
|
||||
do { \
|
||||
dstAddr[0] = tileBytes[0]; \
|
||||
dstAddr[2] = tileBytes[1]; \
|
||||
dstAddr[4] = tileBytes[2]; \
|
||||
dstAddr[6] = tileBytes[3]; \
|
||||
dstAddr += ST_BYTES_PER_ROW; \
|
||||
tileBytes += TILE_BYTES_PER_ROW; \
|
||||
} while (0)
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
ST_TILE_PASTE_ROW;
|
||||
#undef ST_TILE_PASTE_ROW
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1331,136 +1237,25 @@ void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *til
|
|||
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
|
||||
+ group * ST_BYTES_PER_GROUP
|
||||
+ (uint16_t)(bx & 1u);
|
||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
||||
tileOut[0] = srcAddr[0];
|
||||
tileOut[1] = srcAddr[2];
|
||||
tileOut[2] = srcAddr[4];
|
||||
tileOut[3] = srcAddr[6];
|
||||
srcAddr += ST_BYTES_PER_ROW;
|
||||
tileOut += TILE_BYTES_PER_ROW;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Slow-path C versions kept (renamed) for reference; not in the
|
||||
* active call chain. */
|
||||
static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
    StPlanarT     *pd;
    const uint8_t *tileRow;
    uint8_t       *groupAddr;
    uint16_t      *planeWords;
    uint16_t       group;
    uint16_t       keepMask;
    uint16_t       shift;
    uint8_t        planeBits[4];
    uint8_t        packed;
    uint8_t        color;
    uint8_t        pixelBit;
    int16_t        row;
    int16_t        pix;
    int16_t        plane;

    if (dst == NULL || chunkyTile == NULL) {
        return;
    }
    pd = (StPlanarT *)dst->portData;
    if (pd == NULL) {
        return;
    }

    // Even tile columns occupy the high byte of each plane word, odd
    // columns the low byte. keepMask preserves the other half.
    group = (uint16_t)((uint16_t)bx >> 1);
    if ((bx & 1u) == 0u) {
        shift    = 8u;
        keepMask = (uint16_t)~0xFF00u;
    } else {
        shift    = 0u;
        keepMask = (uint16_t)~0x00FFu;
    }

    groupAddr = pd->base
              + (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
              + group * ST_BYTES_PER_GROUP;
    tileRow   = chunkyTile;

    for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
        // Gather one byte per plane from the row's packed pixels
        // (even pixel = high nibble, odd pixel = low nibble).
        planeBits[0] = 0u;
        planeBits[1] = 0u;
        planeBits[2] = 0u;
        planeBits[3] = 0u;
        for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) {
            packed = tileRow[pix >> 1];
            if (pix & 1) {
                color = (uint8_t)(packed & 0x0Fu);
            } else {
                color = (uint8_t)(packed >> 4);
            }
            pixelBit = kStTileBitLut[pix];
            for (plane = 0; plane < 4; plane++) {
                if (color & (1u << plane)) {
                    planeBits[plane] = (uint8_t)(planeBits[plane] | pixelBit);
                }
            }
        }

        // Merge each plane byte into its half of the plane word.
        planeWords = (uint16_t *)groupAddr;
        for (plane = 0; plane < 4; plane++) {
            planeWords[plane] = (uint16_t)((planeWords[plane] & keepMask)
                                           | (uint16_t)((uint16_t)planeBits[plane] << shift));
        }

        groupAddr += ST_BYTES_PER_ROW;
        tileRow   += TILE_BYTES_PER_ROW;
    }
}
|
||||
|
||||
|
||||
// Phase 10: group-aware tile snap. Read 4 plane half-words for the
|
||||
// row's group, distribute the 8 plane bits per plane into chunky
|
||||
// nibbles. 4 word reads per row + 4 chunky bytes per row, no
|
||||
// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes
|
||||
// above; kept for reference as the C-only fallback.
|
||||
static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
|
||||
const StPlanarT *pd;
|
||||
uint16_t group;
|
||||
uint16_t halfShift;
|
||||
const uint8_t *rowBase;
|
||||
int16_t row;
|
||||
int16_t pair;
|
||||
const uint16_t *pw;
|
||||
uint8_t pb0;
|
||||
uint8_t pb1;
|
||||
uint8_t pb2;
|
||||
uint8_t pb3;
|
||||
uint8_t bitHi;
|
||||
uint8_t bitLo;
|
||||
uint8_t hi;
|
||||
uint8_t lo;
|
||||
|
||||
if (src == NULL || chunkyTileOut == NULL) {
|
||||
return;
|
||||
}
|
||||
pd = (const StPlanarT *)src->portData;
|
||||
if (pd == NULL) {
|
||||
return;
|
||||
}
|
||||
group = (uint16_t)((uint16_t)bx >> 1);
|
||||
halfShift = ((bx & 1u) == 0u) ? 8u : 0u;
|
||||
rowBase = pd->base
|
||||
+ (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
|
||||
+ group * ST_BYTES_PER_GROUP;
|
||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
||||
pw = (const uint16_t *)rowBase;
|
||||
pb0 = (uint8_t)(pw[0] >> halfShift);
|
||||
pb1 = (uint8_t)(pw[1] >> halfShift);
|
||||
pb2 = (uint8_t)(pw[2] >> halfShift);
|
||||
pb3 = (uint8_t)(pw[3] >> halfShift);
|
||||
for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) {
|
||||
bitHi = kStTileBitLut[pair * 2];
|
||||
bitLo = kStTileBitLut[pair * 2 + 1];
|
||||
hi = 0u;
|
||||
lo = 0u;
|
||||
if (pb0 & bitHi) hi = (uint8_t)(hi | 1u);
|
||||
if (pb1 & bitHi) hi = (uint8_t)(hi | 2u);
|
||||
if (pb2 & bitHi) hi = (uint8_t)(hi | 4u);
|
||||
if (pb3 & bitHi) hi = (uint8_t)(hi | 8u);
|
||||
if (pb0 & bitLo) lo = (uint8_t)(lo | 1u);
|
||||
if (pb1 & bitLo) lo = (uint8_t)(lo | 2u);
|
||||
if (pb2 & bitLo) lo = (uint8_t)(lo | 4u);
|
||||
if (pb3 & bitLo) lo = (uint8_t)(lo | 8u);
|
||||
chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo);
|
||||
}
|
||||
rowBase += ST_BYTES_PER_ROW;
|
||||
}
|
||||
(void)row;
|
||||
#define ST_TILE_SNAP_ROW \
|
||||
do { \
|
||||
tileOut[0] = srcAddr[0]; \
|
||||
tileOut[1] = srcAddr[2]; \
|
||||
tileOut[2] = srcAddr[4]; \
|
||||
tileOut[3] = srcAddr[6]; \
|
||||
srcAddr += ST_BYTES_PER_ROW; \
|
||||
tileOut += TILE_BYTES_PER_ROW; \
|
||||
} while (0)
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
ST_TILE_SNAP_ROW;
|
||||
#undef ST_TILE_SNAP_ROW
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1496,14 +1291,28 @@ void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const Surfac
|
|||
+ (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW
|
||||
+ dstGroup * ST_BYTES_PER_GROUP
|
||||
+ (uint16_t)(dstBx & 1u);
|
||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
||||
dstAddr[0] = srcAddr[0]; /* plane 0 byte (high or low half) */
|
||||
dstAddr[2] = srcAddr[2]; /* plane 1 */
|
||||
dstAddr[4] = srcAddr[4]; /* plane 2 */
|
||||
dstAddr[6] = srcAddr[6]; /* plane 3 */
|
||||
srcAddr += ST_BYTES_PER_ROW;
|
||||
dstAddr += ST_BYTES_PER_ROW;
|
||||
}
|
||||
/* gcc-mint -O2 does NOT unroll the 8-iter byte-copy loop,
|
||||
* leaving cmpl + bnes loop overhead per row. Manual unroll
|
||||
* drops ~150 cyc/call. (void)row keeps the unused decl quiet. */
|
||||
(void)row;
|
||||
#define ST_TILE_COPY_ROW \
|
||||
do { \
|
||||
dstAddr[0] = srcAddr[0]; \
|
||||
dstAddr[2] = srcAddr[2]; \
|
||||
dstAddr[4] = srcAddr[4]; \
|
||||
dstAddr[6] = srcAddr[6]; \
|
||||
srcAddr += ST_BYTES_PER_ROW; \
|
||||
dstAddr += ST_BYTES_PER_ROW; \
|
||||
} while (0)
|
||||
ST_TILE_COPY_ROW; /* row 0 */
|
||||
ST_TILE_COPY_ROW; /* row 1 */
|
||||
ST_TILE_COPY_ROW; /* row 2 */
|
||||
ST_TILE_COPY_ROW; /* row 3 */
|
||||
ST_TILE_COPY_ROW; /* row 4 */
|
||||
ST_TILE_COPY_ROW; /* row 5 */
|
||||
ST_TILE_COPY_ROW; /* row 6 */
|
||||
ST_TILE_COPY_ROW; /* row 7 */
|
||||
#undef ST_TILE_COPY_ROW
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1792,109 +1601,6 @@ void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBy
|
|||
}
|
||||
|
||||
|
||||
// Phase 10 fast paths for save/restore. Hand-rolled asm
|
||||
// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <->
|
||||
// plane bit transpose via ASL+ROXL and walks rows/tile columns. The
|
||||
// C wrappers below are kept as a fallback / reference; they're not
|
||||
// in the critical path now that the asm versions are wired in.
|
||||
static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) {
|
||||
int16_t bytesPerRow = (int16_t)(w >> 1);
|
||||
int16_t tileCols = (int16_t)(w >> 3);
|
||||
const uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
|
||||
int16_t row;
|
||||
int16_t tileCol;
|
||||
|
||||
for (row = 0; row < (int16_t)h; row++) {
|
||||
uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
|
||||
for (tileCol = 0; tileCol < tileCols; tileCol++) {
|
||||
int16_t srcX = (int16_t)(x + tileCol * 8);
|
||||
uint16_t group = (uint16_t)((uint16_t)srcX >> 4);
|
||||
uint16_t shift = ((srcX & 8) == 0) ? 8u : 0u;
|
||||
const uint16_t *pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
|
||||
uint8_t pb0 = (uint8_t)(pw[0] >> shift);
|
||||
uint8_t pb1 = (uint8_t)(pw[1] >> shift);
|
||||
uint8_t pb2 = (uint8_t)(pw[2] >> shift);
|
||||
uint8_t pb3 = (uint8_t)(pw[3] >> shift);
|
||||
int16_t pair;
|
||||
for (pair = 0; pair < 4; pair++) {
|
||||
uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2));
|
||||
uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1));
|
||||
uint8_t hi = 0u;
|
||||
uint8_t lo = 0u;
|
||||
if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); }
|
||||
if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); }
|
||||
if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); }
|
||||
if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); }
|
||||
if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); }
|
||||
if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); }
|
||||
if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); }
|
||||
if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); }
|
||||
dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo);
|
||||
}
|
||||
}
|
||||
rowBase += ST_BYTES_PER_ROW;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// C reference for the byte-aligned sprite restore (inverse of
// stSpriteSaveByteAligned). For every 8-pixel column of the w x h
// rectangle at (x, y) it reads 4 chunky bytes (8 pixels, high nibble
// = left pixel) from srcChunkyBytes, transposes them into one byte
// per bitplane, and merges each byte into the matching half of the
// group's four plane words, leaving the other half untouched.
// Input rows start w/2 bytes apart. No clipping or NULL checks are
// performed here.
static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) {
    int16_t  bytesPerRow = (int16_t)(w >> 1);
    int16_t  tileCols    = (int16_t)(w >> 3);
    uint8_t *rowBase     = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
    int16_t  row;
    int16_t  tileCol;
    int16_t  i;
    int16_t  p;

    for (row = 0; row < (int16_t)h; row++) {
        const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
        for (tileCol = 0; tileCol < tileCols; tileCol++) {
            const uint8_t *src   = &srcRow[tileCol * 4];
            uint8_t        pb[4] = { 0u, 0u, 0u, 0u };
            uint8_t        bit   = 0x80u;
            int16_t        dstX;
            uint16_t       group;
            uint16_t      *pw;

            // Chunky -> planar transpose: pixel k of the column sets
            // bit (0x80 >> k) in plane p when bit p of its color is set.
            for (i = 0; i < 4; i++) {
                const uint8_t leftPix  = (uint8_t)(src[i] >> 4);
                const uint8_t rightPix = (uint8_t)(src[i] & 0x0Fu);
                for (p = 0; p < 4; p++) {
                    if (leftPix & (1u << p)) {
                        pb[p] = (uint8_t)(pb[p] | bit);
                    }
                    if (rightPix & (1u << p)) {
                        pb[p] = (uint8_t)(pb[p] | (bit >> 1));
                    }
                }
                bit = (uint8_t)(bit >> 2);
            }

            dstX  = (int16_t)(x + tileCol * 8);
            group = (uint16_t)((uint16_t)dstX >> 4);
            pw    = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
            if ((dstX & 8) == 0) {
                // Even 8-pixel column: replace the high byte, keep the low byte.
                for (p = 0; p < 4; p++) {
                    pw[p] = (uint16_t)((pw[p] & 0x00FFu) | ((uint16_t)pb[p] << 8));
                }
            } else {
                // Odd 8-pixel column: replace the low byte, keep the high byte.
                for (p = 0; p < 4; p++) {
                    pw[p] = (uint16_t)((pw[p] & 0xFF00u) | (uint16_t)pb[p]);
                }
            }
        }
        rowBase += ST_BYTES_PER_ROW;
    }
}
|
||||
|
||||
|
||||
// Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies
|
||||
// inline. Each pixel's group address differs only in (x), so we
|
||||
// can compute base+row*160 once per row and just do per-pixel
|
||||
|
|
@ -1916,11 +1622,16 @@ void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t
|
|||
return;
|
||||
}
|
||||
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
|
||||
* Asm walker does direct planar byte copy (LUT pointer unused). */
|
||||
* Specialized 16x16 (the UBER ball-sprite size) skips the asm
|
||||
* walker's per-row col-init + col-loop-check overhead. */
|
||||
if ((x & 7) == 0 && (w & 7) == 0
|
||||
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
|
||||
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
|
||||
surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL);
|
||||
if (w == 16u && h == 16u) {
|
||||
surface68kStSprite16x16Save(pd->base, (uint16_t)x, (uint16_t)y, dstPlaneBytes);
|
||||
} else {
|
||||
surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1980,11 +1691,15 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
|
|||
return;
|
||||
}
|
||||
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
|
||||
* Asm walker does direct planar byte copy (LUT pointer unused). */
|
||||
* Specialized 16x16 (UBER ball-sprite) skips walker overhead. */
|
||||
if ((x & 7) == 0 && (w & 7) == 0
|
||||
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
|
||||
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
|
||||
surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL);
|
||||
if (w == 16u && h == 16u) {
|
||||
surface68kStSprite16x16Restore(pd->base, (uint16_t)x, (uint16_t)y, srcPlaneBytes);
|
||||
} else {
|
||||
surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -2049,10 +1764,11 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
|
|||
}
|
||||
|
||||
|
||||
// Phase 9: derive 160 chunky bytes per row from the word-interleaved
|
||||
// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's
|
||||
// Derive 160 chunky bytes per row from the word-interleaved planar
|
||||
// buffer (20 groups x 4 plane words). Same shape as the Amiga's
|
||||
// amigaPlanesToChunkyRow but per-group instead of per-byte. Used by
|
||||
// halSurfaceHash and halSurfaceSaveFileChunky.
|
||||
// halSurfaceHash to fold the planar surface into the same byte stream
|
||||
// the chunky ports hash, so cross-port hash comparisons stay valid.
|
||||
static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) {
|
||||
uint16_t group;
|
||||
uint16_t p;
|
||||
|
|
@ -2134,58 +1850,27 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
|||
}
|
||||
|
||||
|
||||
// Phase 9: read chunky from file into a temporary scratch buffer,
|
||||
// then c2p once into the planar shadow. The .joeysurface file format
|
||||
// is still chunky 4bpp on disk (cross-port asset interchange); the
|
||||
// in-memory representation is what changes.
|
||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
||||
// On-disk format is the ST's native interleaved planar buffer; one
|
||||
// fread fills it directly, no chunky scratch or c2p step.
|
||||
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||
StPlanarT *pd;
|
||||
uint8_t *scratch;
|
||||
int16_t y;
|
||||
bool ok;
|
||||
|
||||
pd = (StPlanarT *)dst->portData;
|
||||
if (pd == NULL) {
|
||||
return false;
|
||||
}
|
||||
scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
|
||||
if (scratch == NULL) {
|
||||
return false;
|
||||
}
|
||||
ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
|
||||
if (ok) {
|
||||
if (!gC2pLutReady) {
|
||||
initC2pLut();
|
||||
}
|
||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||
const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
|
||||
uint16_t *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW];
|
||||
chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut);
|
||||
}
|
||||
}
|
||||
free(scratch);
|
||||
return ok;
|
||||
return fread(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
|
||||
}
|
||||
|
||||
|
||||
// Phase 9: derive chunky bytes from the planar shadow row by row,
|
||||
// stream to file. Avoids needing a full 32 KB scratch buffer.
|
||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
||||
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||
StPlanarT *pd;
|
||||
uint8_t chunkyRow[SURFACE_BYTES_PER_ROW];
|
||||
int16_t y;
|
||||
|
||||
pd = (StPlanarT *)src->portData;
|
||||
if (pd == NULL) {
|
||||
return false;
|
||||
}
|
||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||
stPlanarToChunkyRow(pd, y, chunkyRow);
|
||||
if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return fwrite(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -50,19 +50,17 @@
|
|||
| Trashes: d0, d1, a2
|
||||
|
||||
.macro DL_PLOT color
|
||||
| byteOff = y*160 + (x>>4)*8
|
||||
| byteOff = y*160 + (x>>4)*8. It is at most 31992 for an on-surface
|
||||
| pixel (surface is 32000 bytes < 32K), so it fits the signed 16-bit
|
||||
| index used by the lea below. Dropping ext.l + .l add + .l indexed lea in favor of word-sized ops saves 14 cyc/pixel.
|
||||
move.w %d3,%d0
|
||||
ext.l %d0
|
||||
move.l %d0,%d1
|
||||
lsl.l #5,%d0 | y << 5
|
||||
lsl.l #7,%d1 | y << 7
|
||||
add.l %d1,%d0 | d0 = y * 160
|
||||
add.w %d0,%d0 | y * 2 (word index)
|
||||
move.w (%a6,%d0.w),%d0 | d0 = y * 160
|
||||
move.w %d2,%d1
|
||||
lsr.w #4,%d1
|
||||
lsl.w #3,%d1 | (x>>4) * 8
|
||||
ext.l %d1
|
||||
add.l %d1,%d0 | d0 = byteOff
|
||||
lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff
|
||||
add.w %d1,%d0 | d0 = byteOff (fits in 16 bits)
|
||||
lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff
|
||||
| d1 = bitMask, d0 = notMask
|
||||
move.w %d2,%d1
|
||||
and.w #15,%d1
|
||||
|
|
@ -127,9 +125,11 @@ _surface68kStDrawLine:
|
|||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
||||
lea -SP_LOCAL(%sp),%sp
|
||||
|
||||
| Load base & lut.
|
||||
| Load base & luts.
|
||||
move.l SP_BASE(%sp),%a3
|
||||
lea bitMaskWordLut(%pc),%a5
|
||||
| a6 = yLut base (yp -> yp*160) for use in DL_PLOT.
|
||||
lea _gStRowOffsetLut(%pc),%a6
|
||||
|
||||
| x = x0, y = y0
|
||||
move.w SP_X0(%sp),%d2
|
||||
|
|
@ -179,8 +179,8 @@ _surface68kStDrawLine:
|
|||
and.w #0x0F,%d0
|
||||
add.w %d0,%d0
|
||||
add.w %d0,%d0 | * 4 for bra.w table
|
||||
lea .LdlStTable(%pc),%a6
|
||||
jmp 0(%a6,%d0.w)
|
||||
lea .LdlStTable(%pc),%a2 | a2 scratch (a6 holds yLut)
|
||||
jmp 0(%a2,%d0.w)
|
||||
|
||||
.LdlStTable:
|
||||
bra.w .LdlStLoop_0
|
||||
|
|
@ -529,6 +529,129 @@ _surface68kStFillRectSingleGroup:
|
|||
rts
|
||||
|
||||
|
||||
| ---- surface68kStTileFill8x8 ---------------------------------------
|
||||
|
|
||||
| Specialized 8x8 single-group fill: 16-way color dispatch + 8 rows
|
||||
| fully unrolled. Drops the per-row subq+bne overhead that the
|
||||
| generic FRG_LOOP pays. Used by halTileFillPlanes.
|
||||
|
|
||||
| void surface68kStTileFill8x8(uint8_t *firstGroupPtr,
|
||||
| uint16_t mask,
|
||||
| uint8_t color);
|
||||
|
|
||||
| Per row body: 4 plane RMW with postinc + lea 152(a3),a3 to next
|
||||
| row. Row 7 skips the trailing lea (a3 not used after).
|
||||
|
||||
| Stack offsets as seen after the movem.l in the entry point has pushed
| 4 longs (plus the return address).
| NOTE(review): the +2 / +3 adjustments presumably pick the low word /
| low byte of 32-bit argument slots on the big-endian 68k -- confirm
| against the calling convention in use.
.equ SP_TF_SAVED, 16 | d3-d4/a2-a3 = 4 longs
|
||||
.equ SP_TF_OFF, (SP_TF_SAVED + 4)
|
||||
.equ SP_TF_PTR, SP_TF_OFF + 0
|
||||
.equ SP_TF_MASK, SP_TF_OFF + 4 + 2
|
||||
.equ SP_TF_COLOR, SP_TF_OFF + 8 + 3
|
||||
|
||||
|
||||
| TF8_ROW_BARE: one row of the fill. For each of the 4 plane words at
| (a3)+, OR in d3 when that plane's bit of \color is set, otherwise AND
| with d4. The entry point loads d3 = mask and d4 = ~mask. Advances a3
| by 8 bytes (4 words).
.macro TF8_ROW_BARE color
|
||||
.if ((\color) & 1)
|
||||
or.w %d3,(%a3)+
|
||||
.else
|
||||
and.w %d4,(%a3)+
|
||||
.endif
|
||||
.if ((\color) & 2)
|
||||
or.w %d3,(%a3)+
|
||||
.else
|
||||
and.w %d4,(%a3)+
|
||||
.endif
|
||||
.if ((\color) & 4)
|
||||
or.w %d3,(%a3)+
|
||||
.else
|
||||
and.w %d4,(%a3)+
|
||||
.endif
|
||||
.if ((\color) & 8)
|
||||
or.w %d3,(%a3)+
|
||||
.else
|
||||
and.w %d4,(%a3)+
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
||||
| TF8_ROW: TF8_ROW_BARE followed by the 152-byte skip that, together
| with the 8 bytes already consumed, moves a3 down one 160-byte row.
.macro TF8_ROW color
|
||||
TF8_ROW_BARE \color
|
||||
lea 152(%a3),%a3
|
||||
.endm
|
||||
|
||||
|
||||
| TF8_BODY: emits the labelled, fully unrolled 8-row body for one color
| and then branches to the shared epilogue at .Ltf8_done.
.macro TF8_BODY color
|
||||
.Ltf8_body_\color:
|
||||
TF8_ROW \color | row 0
|
||||
TF8_ROW \color | row 1
|
||||
TF8_ROW \color | row 2
|
||||
TF8_ROW \color | row 3
|
||||
TF8_ROW \color | row 4
|
||||
TF8_ROW \color | row 5
|
||||
TF8_ROW \color | row 6
|
||||
TF8_ROW_BARE \color | row 7 (no trailing lea)
|
||||
bra.w .Ltf8_done
|
||||
.endm
|
||||
|
||||
|
||||
.globl _surface68kStTileFill8x8
|
||||
|
||||
_surface68kStTileFill8x8:
|
||||
movem.l %d3-%d4/%a2-%a3,-(%sp)
|
||||
|
||||
| a3 = first group pointer, d3 = mask (bits to set), d4 = ~mask
| (bits to keep when clearing).
move.l SP_TF_PTR(%sp),%a3
|
||||
move.w SP_TF_MASK(%sp),%d3
|
||||
move.w %d3,%d4
|
||||
not.w %d4
|
||||
|
||||
| Color dispatch
| Only the low 4 bits of color are used; each table entry is a
| 4-byte bra.w, hence the index is color * 4.
|
||||
moveq #0,%d0
|
||||
move.b SP_TF_COLOR(%sp),%d0
|
||||
and.w #0x0F,%d0
|
||||
add.w %d0,%d0
|
||||
add.w %d0,%d0 | * 4 for bra.w table
|
||||
lea .Ltf8_table(%pc),%a2
|
||||
jmp 0(%a2,%d0.w)
|
||||
|
||||
.Ltf8_table:
|
||||
bra.w .Ltf8_body_0
|
||||
bra.w .Ltf8_body_1
|
||||
bra.w .Ltf8_body_2
|
||||
bra.w .Ltf8_body_3
|
||||
bra.w .Ltf8_body_4
|
||||
bra.w .Ltf8_body_5
|
||||
bra.w .Ltf8_body_6
|
||||
bra.w .Ltf8_body_7
|
||||
bra.w .Ltf8_body_8
|
||||
bra.w .Ltf8_body_9
|
||||
bra.w .Ltf8_body_10
|
||||
bra.w .Ltf8_body_11
|
||||
bra.w .Ltf8_body_12
|
||||
bra.w .Ltf8_body_13
|
||||
bra.w .Ltf8_body_14
|
||||
bra.w .Ltf8_body_15
|
||||
|
||||
TF8_BODY 0
|
||||
TF8_BODY 1
|
||||
TF8_BODY 2
|
||||
TF8_BODY 3
|
||||
TF8_BODY 4
|
||||
TF8_BODY 5
|
||||
TF8_BODY 6
|
||||
TF8_BODY 7
|
||||
TF8_BODY 8
|
||||
TF8_BODY 9
|
||||
TF8_BODY 10
|
||||
TF8_BODY 11
|
||||
TF8_BODY 12
|
||||
TF8_BODY 13
|
||||
TF8_BODY 14
|
||||
TF8_BODY 15
|
||||
|
||||
.Ltf8_done:
|
||||
movem.l (%sp)+,%d3-%d4/%a2-%a3
|
||||
rts
|
||||
|
||||
|
||||
| ---- surface68kStFillRectMulti -------------------------------------
|
||||
|
|
||||
| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
|
||||
|
|
@ -782,6 +905,21 @@ frmRightMaskLut:
|
|||
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
|
||||
|
||||
|
||||
.align 2
|
||||
| Shared y -> y*160 LUT. Used by drawLine (DL_PLOT), drawCircle
|
||||
| (YP_REC), fillCircle (SPAN_BODY). 200 words = 400 bytes.
|
||||
| Replaces a 44-cyc lsl.w #5 + lsl.w #7 + add.w shift chain with
|
||||
| a 14-cyc indexed-word load. Exported so circle.s and fillCircle.s
|
||||
| can reference it via absolute addressing without duplication.
|
||||
.globl _gStRowOffsetLut
|
||||
_gStRowOffsetLut:
|
||||
| Assemble-time generation: li_y counts 0..199 and each iteration emits
| one word holding li_y * 160 (largest entry 199 * 160 = 31840).
.set li_y, 0
|
||||
.rept 200
|
||||
.word li_y * 160
|
||||
.set li_y, li_y + 1
|
||||
.endr
|
||||
|
||||
|
||||
| ---- surface68kStLongFill ----------------------------------------
|
||||
|
|
||||
| Bulk long-fill helper for full-row fills (surfaceClear, fillRect
|
||||
|
|
|
|||
|
|
@ -1,30 +1,19 @@
|
|||
| ST byte-aligned sprite save / restore via 256-entry plane-spread
|
||||
| LUT. The LUT entry for each plane byte value is a 32-bit "spread"
|
||||
| where each plane byte bit lands at the corresponding plane-0 bit
|
||||
| position of the 4-byte chunky output. For plane N, we shift the
|
||||
| LUT entry left by N to put bits at the plane-N positions, then OR
|
||||
| the 4 plane contributions together to get the chunky long.
|
||||
|
|
||||
| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut
|
||||
| in hal.c:
|
||||
|
|
||||
| gStPlaneSpreadLut[b] for plane byte b:
|
||||
| bit i of b (i = 0 = MSB = leftmost pixel) maps to bit
|
||||
| bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4)
|
||||
| of the long. Plane 0's bits land at nibble bit 0 of each
|
||||
| chunky byte; left-shift the LUT entry by N for plane N.
|
||||
| ST byte-aligned sprite save / restore. Buffer holds plane-major
|
||||
| bytes: per row, plane0/1/2/3 per tile col, for w/8 tile cols. The
|
||||
| inner per-tile-col macro is 4 byte copies (no chunky <-> planar
|
||||
| conversion since the buffer matches the surface's plane layout).
|
||||
|
|
||||
| ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
|
||||
|
|
||||
| void surface68kStSpriteSaveByteAligned(uint8_t *base,
|
||||
| uint16_t x, uint16_t y,
|
||||
| uint16_t w, uint16_t h,
|
||||
| uint8_t *dstChunky);
|
||||
| uint8_t *dstPlaneBytes);
|
||||
|
|
||||
| void surface68kStSpriteRestoreByteAligned(uint8_t *base,
|
||||
| uint16_t x, uint16_t y,
|
||||
| uint16_t w, uint16_t h,
|
||||
| const uint8_t *srcChunky);
|
||||
| const uint8_t *srcPlaneBytes);
|
||||
|
||||
.text
|
||||
|
||||
|
|
@ -36,19 +25,12 @@
|
|||
.equ SP_Y, SP_OFF + 8 + 2
|
||||
.equ SP_W, SP_OFF + 12 + 2
|
||||
.equ SP_H, SP_OFF + 16 + 2
|
||||
.equ SP_CHUNKY, SP_OFF + 20
|
||||
.equ SP_LUT, SP_OFF + 24
|
||||
.equ SP_BUF, SP_OFF + 20
|
||||
|
||||
|
||||
| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
|
||||
| a0 -> plane 0 byte (high or low half), strides 2 to next plane
|
||||
| a1 -> output planar bytes (advanced by 4)
|
||||
| a2 -> unused (LUT no longer needed)
|
||||
|
|
||||
| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
|
||||
| plane-major bytes (per row: plane0, plane1, plane2, plane3 per
|
||||
| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
|
||||
| lookups + shifts + ORs.
|
||||
|
||||
.macro SAVE_TILECOL
|
||||
move.b (%a0),(%a1)+ | plane 0
|
||||
|
|
@ -64,13 +46,7 @@ _surface68kStSpriteSaveByteAligned:
|
|||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
||||
|
||||
move.l SP_BASE(%sp),%a3
|
||||
move.l SP_CHUNKY(%sp),%a1
|
||||
| LUT pointer comes in via stack arg -- guaranteed
|
||||
| long-aligned because gcc passes ptr args via
|
||||
| move.l on a long-aligned sp slot. Avoids the BSS
|
||||
| misalignment problem on TOS .PRG (BSS pads only to
|
||||
| 2 bytes, even uint32_t slots can land at mod-4 = 2).
|
||||
move.l SP_LUT(%sp),%a2
|
||||
move.l SP_BUF(%sp),%a1
|
||||
|
||||
move.w SP_W(%sp),%d5
|
||||
lsr.w #3,%d5 | d5 = tileCols
|
||||
|
|
@ -128,10 +104,6 @@ _surface68kStSpriteSaveByteAligned:
|
|||
| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
|
||||
| a0 -> plane 0 byte (high or low half)
|
||||
| a1 -> input planar bytes (advanced by 4)
|
||||
| a2 -> unused (LUT no longer needed)
|
||||
|
|
||||
| Phase 10.5: dropped chunky -> planar conversion. Buffer layout
|
||||
| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col.
|
||||
|
||||
.macro RESTORE_TILECOL
|
||||
move.b (%a1)+,(%a0) | plane 0
|
||||
|
|
@ -147,8 +119,7 @@ _surface68kStSpriteRestoreByteAligned:
|
|||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
||||
|
||||
move.l SP_BASE(%sp),%a3
|
||||
move.l SP_CHUNKY(%sp),%a1
|
||||
move.l SP_LUT(%sp),%a2 | gC2pLut passed in
|
||||
move.l SP_BUF(%sp),%a1
|
||||
|
||||
| tileCols is held in a5 (not d5) because the macro
|
||||
| trashes d5 (uses it for pb3).
|
||||
|
|
@ -200,3 +171,151 @@ _surface68kStSpriteRestoreByteAligned:
|
|||
|
||||
movem.l (%sp)+,%d2-%d7/%a2-%a6
|
||||
rts
|
||||
|
||||
|
||||
| ---- surface68kStSprite16x16Save / Restore -----------------------
|
||||
|
|
||||
| Specialized 16x16 sprite save/restore: 16 rows fully unrolled,
|
||||
| 8 byte copies per row (2 tile cols), no col loop. Drops the asm
|
||||
| walker's per-row col-init + col-loop-check overhead.
|
||||
|
|
||||
| void surface68kStSprite16x16Save(uint8_t *base,
|
||||
| uint16_t x, uint16_t y,
|
||||
| uint8_t *dstBuf);
|
||||
|
|
||||
| void surface68kStSprite16x16Restore(uint8_t *base,
|
||||
| uint16_t x, uint16_t y,
|
||||
| const uint8_t *srcBuf);
|
||||
|
|
||||
| Caller guarantees x is byte-aligned (x mod 8 == 0). Two halfOff
|
||||
| variants dispatch on (x & 8): halfOff=0 reads/writes within one
|
||||
| group (offsets 0/2/4/6 high half + 1/3/5/7 low half). halfOff=1
|
||||
| spans two groups (low half of group N + high half of group N+1).
|
||||
|
||||
| Stack offsets as seen after the 16x16 entry points have pushed 3 longs
| (plus the return address).
| NOTE(review): the +2 on SP16_X / SP16_Y presumably selects the low word
| of a 32-bit argument slot on the big-endian 68k -- confirm against the
| calling convention in use.
.equ SP16_SAVED, 12 | d2/a2-a3 = 3 longs
|
||||
.equ SP16_OFF, (SP16_SAVED + 4)
|
||||
.equ SP16_BASE, SP16_OFF + 0
|
||||
.equ SP16_X, SP16_OFF + 4 + 2
|
||||
.equ SP16_Y, SP16_OFF + 8 + 2
|
||||
.equ SP16_BUF, SP16_OFF + 12
|
||||
|
||||
|
||||
| Macro: setup a0 = base + y*160 + group*8 + halfOff
|
||||
| Trashes: d0, d1, d2, a3 (a3 receives base); a0 left at row start.
| On exit d0 = halfOff (0 or 1), which the callers test to pick a path.
|
||||
|
||||
.macro SP16_SETUP_A0
|
||||
move.l SP16_BASE(%sp),%a3
|
||||
move.w SP16_X(%sp),%d0
|
||||
move.w SP16_Y(%sp),%d1
|
||||
|
||||
| a0 = base + y*160
| (y*160 is formed as (y << 5) + (y << 7) = y*32 + y*128)
|
||||
ext.l %d1
|
||||
move.l %d1,%d2
|
||||
lsl.l #5,%d1
|
||||
lsl.l #7,%d2
|
||||
add.l %d2,%d1
|
||||
lea 0(%a3,%d1.l),%a0
|
||||
|
||||
| a0 += (x>>4) * 8
|
||||
move.w %d0,%d1
|
||||
lsr.w #4,%d1
|
||||
lsl.w #3,%d1
|
||||
ext.l %d1
|
||||
add.l %d1,%a0
|
||||
|
||||
| a0 += halfOff (= (x & 8) >> 3)
|
||||
and.w #8,%d0
|
||||
lsr.w #3,%d0
|
||||
ext.l %d0
|
||||
add.l %d0,%a0
|
||||
| d0 = halfOff (0 or 1) for downstream dispatch
|
||||
.endm
|
||||
|
||||
|
||||
| Copies a 16x16 region from the planar surface into the buffer at a1.
| Buffer layout produced: 8 bytes per row (tile col 0 planes 0..3, then
| tile col 1 planes 0..3), 16 rows = 128 bytes.
| NOTE(review): a2 is saved and restored but not otherwise used here.
.globl _surface68kStSprite16x16Save
|
||||
|
||||
_surface68kStSprite16x16Save:
|
||||
movem.l %d2/%a2-%a3,-(%sp)
|
||||
SP16_SETUP_A0
|
||||
move.l SP16_BUF(%sp),%a1
|
||||
|
||||
| d0 = halfOff as left by SP16_SETUP_A0
tst.w %d0
|
||||
bne.w .Lsp16s_low
|
||||
|
||||
| halfOff=0: a0 at high half. Col 0 = high (offsets
|
||||
| 0,2,4,6); col 1 = low (offsets 1,3,5,7).
|
||||
.rept 16
|
||||
move.b (%a0),(%a1)+
|
||||
move.b 2(%a0),(%a1)+
|
||||
move.b 4(%a0),(%a1)+
|
||||
move.b 6(%a0),(%a1)+
|
||||
move.b 1(%a0),(%a1)+
|
||||
move.b 3(%a0),(%a1)+
|
||||
move.b 5(%a0),(%a1)+
|
||||
move.b 7(%a0),(%a1)+
|
||||
lea 160(%a0),%a0
|
||||
.endr
|
||||
bra.w .Lsp16s_done
|
||||
|
||||
.Lsp16s_low:
|
||||
| halfOff=1: a0 at low half (group start + 1 byte). Col 0 = low of
|
||||
| this group, offsets 0,2,4,6 from a0. Col 1 = high of
|
||||
| next group (which starts 7 bytes past a0), at offsets 7,9,11,13 from a0.
|
||||
.rept 16
|
||||
move.b (%a0),(%a1)+
|
||||
move.b 2(%a0),(%a1)+
|
||||
move.b 4(%a0),(%a1)+
|
||||
move.b 6(%a0),(%a1)+
|
||||
move.b 7(%a0),(%a1)+
|
||||
move.b 9(%a0),(%a1)+
|
||||
move.b 11(%a0),(%a1)+
|
||||
move.b 13(%a0),(%a1)+
|
||||
lea 160(%a0),%a0
|
||||
.endr
|
||||
|
||||
.Lsp16s_done:
|
||||
movem.l (%sp)+,%d2/%a2-%a3
|
||||
rts
|
||||
|
||||
|
||||
| Copies 128 buffer bytes at a1 back into a 16x16 region of the planar
| surface. Buffer layout consumed: 8 bytes per row (tile col 0 planes
| 0..3, then tile col 1 planes 0..3), 16 rows. The surface byte offsets
| mirror the ones read by the 16x16 save routine.
| NOTE(review): a2 is saved and restored but not otherwise used here.
.globl _surface68kStSprite16x16Restore
|
||||
|
||||
_surface68kStSprite16x16Restore:
|
||||
movem.l %d2/%a2-%a3,-(%sp)
|
||||
SP16_SETUP_A0
|
||||
move.l SP16_BUF(%sp),%a1
|
||||
|
||||
| d0 = halfOff as left by SP16_SETUP_A0
tst.w %d0
|
||||
bne.w .Lsp16r_low
|
||||
|
||||
| halfOff=0: write high half (col 0) + low half (col 1).
|
||||
.rept 16
|
||||
move.b (%a1)+,(%a0)
|
||||
move.b (%a1)+,2(%a0)
|
||||
move.b (%a1)+,4(%a0)
|
||||
move.b (%a1)+,6(%a0)
|
||||
move.b (%a1)+,1(%a0)
|
||||
move.b (%a1)+,3(%a0)
|
||||
move.b (%a1)+,5(%a0)
|
||||
move.b (%a1)+,7(%a0)
|
||||
lea 160(%a0),%a0
|
||||
.endr
|
||||
bra.w .Lsp16r_done
|
||||
|
||||
.Lsp16r_low:
|
||||
| halfOff=1: a0 sits one byte into the group. Col 0 = low half of this
| group (offsets 0,2,4,6 from a0); col 1 = high half of the next group
| (offsets 7,9,11,13 from a0).
|
||||
.rept 16
|
||||
move.b (%a1)+,(%a0)
|
||||
move.b (%a1)+,2(%a0)
|
||||
move.b (%a1)+,4(%a0)
|
||||
move.b (%a1)+,6(%a0)
|
||||
move.b (%a1)+,7(%a0)
|
||||
move.b (%a1)+,9(%a0)
|
||||
move.b (%a1)+,11(%a0)
|
||||
move.b (%a1)+,13(%a0)
|
||||
lea 160(%a0),%a0
|
||||
.endr
|
||||
|
||||
.Lsp16r_done:
|
||||
movem.l (%sp)+,%d2/%a2-%a3
|
||||
rts
|
||||
|
|
|
|||
|
|
@ -614,12 +614,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
|||
}
|
||||
|
||||
|
||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
||||
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||
return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||
}
|
||||
|
||||
|
||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
||||
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||
return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -395,12 +395,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
|||
}
|
||||
|
||||
|
||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
||||
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||
return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||
}
|
||||
|
||||
|
||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
||||
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||
return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue