ST is more or less at parity.
This commit is contained in:
parent
818dc801db
commit
cf6ae093d3
15 changed files with 966 additions and 1062 deletions
326
README.md
326
README.md
|
|
@ -59,6 +59,332 @@ build/<plat>/ per-target build outputs
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
Game code includes a single umbrella header:
|
||||||
|
|
||||||
|
```c
|
||||||
|
#include <joey/joey.h>
|
||||||
|
```
|
||||||
|
|
||||||
|
That pulls in every public surface listed below. Full documentation
|
||||||
|
lives in the per-feature headers under `include/joey/`; what follows
|
||||||
|
is a quick reference. Every entry point is plain C, no C++ extensions.
|
||||||
|
|
||||||
|
|
||||||
|
### Lifecycle (`joey/core.h`)
|
||||||
|
|
||||||
|
```c
|
||||||
|
typedef struct {
|
||||||
|
HostModeE hostMode; // HOST_MODE_TAKEOVER or HOST_MODE_OS
|
||||||
|
uint32_t codegenBytes; // runtime compiled-sprite cache size
|
||||||
|
uint16_t maxSurfaces; // maximum concurrent surfaces
|
||||||
|
uint32_t audioBytes; // audio sample / module RAM pool
|
||||||
|
uint32_t assetBytes; // tileset / sprite / map RAM pool
|
||||||
|
} JoeyConfigT;
|
||||||
|
|
||||||
|
bool joeyInit (const JoeyConfigT *config);
|
||||||
|
void joeyShutdown (void);
|
||||||
|
const char *joeyLastError (void);
|
||||||
|
const char *joeyPlatformName (void);
|
||||||
|
const char *joeyVersionString(void);
|
||||||
|
|
||||||
|
void joeyWaitVBL (void); // block until next VBL
|
||||||
|
uint16_t joeyFrameCount (void); // monotonic 16-bit frame counter
|
||||||
|
uint16_t joeyFrameHz (void); // 50 / 60 / 70 depending on port
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Surfaces (`joey/surface.h`)
|
||||||
|
|
||||||
|
All surfaces are 320x200 4bpp packed (high nibble = left pixel) with
|
||||||
|
a 200-entry SCB table and 16 palettes of 16 `$0RGB` colors.
|
||||||
|
|
||||||
|
```c
|
||||||
|
#define SURFACE_WIDTH 320
|
||||||
|
#define SURFACE_HEIGHT 200
|
||||||
|
#define SURFACE_BYTES_PER_ROW 160
|
||||||
|
#define SURFACE_PIXELS_SIZE (SURFACE_BYTES_PER_ROW * SURFACE_HEIGHT)
|
||||||
|
#define SURFACE_PALETTE_COUNT 16
|
||||||
|
#define SURFACE_COLORS_PER_PALETTE 16
|
||||||
|
|
||||||
|
typedef struct SurfaceT SurfaceT; // opaque
|
||||||
|
|
||||||
|
SurfaceT *surfaceCreate (void);
|
||||||
|
void surfaceDestroy(SurfaceT *s);
|
||||||
|
SurfaceT *stageGet (void); // library back-buffer
|
||||||
|
void surfaceCopy (SurfaceT *dst, const SurfaceT *src);
|
||||||
|
|
||||||
|
bool surfaceSaveFile(const SurfaceT *src, const char *path);
|
||||||
|
bool surfaceLoadFile(SurfaceT *dst, const char *path);
|
||||||
|
uint32_t surfaceHash (const SurfaceT *s); // FNV-1a of logical pixels
|
||||||
|
```
|
||||||
|
|
||||||
|
`surfaceSaveFile` writes the surface in **target-native** form. Files
|
||||||
|
are NOT cross-port portable; the asset pipeline handles conversion.
|
||||||
|
|
||||||
|
|
||||||
|
### Drawing (`joey/draw.h`)
|
||||||
|
|
||||||
|
All primitives clip to the surface; off-surface coords are silent
|
||||||
|
no-ops. Color 0 is plotted normally (use the masked variants if you
|
||||||
|
need transparency).
|
||||||
|
|
||||||
|
```c
|
||||||
|
void surfaceClear (SurfaceT *s, uint8_t color);
|
||||||
|
void drawPixel (SurfaceT *s, int16_t x, int16_t y, uint8_t color);
|
||||||
|
uint8_t samplePixel (const SurfaceT *s, int16_t x, int16_t y);
|
||||||
|
|
||||||
|
void drawLine (SurfaceT *s, int16_t x0, int16_t y0,
|
||||||
|
int16_t x1, int16_t y1, uint8_t color);
|
||||||
|
void drawRect (SurfaceT *s, int16_t x, int16_t y,
|
||||||
|
uint16_t w, uint16_t h, uint8_t color);
|
||||||
|
void fillRect (SurfaceT *s, int16_t x, int16_t y,
|
||||||
|
uint16_t w, uint16_t h, uint8_t color);
|
||||||
|
void drawCircle (SurfaceT *s, int16_t cx, int16_t cy,
|
||||||
|
uint16_t r, uint8_t color);
|
||||||
|
void fillCircle (SurfaceT *s, int16_t cx, int16_t cy,
|
||||||
|
uint16_t r, uint8_t color);
|
||||||
|
|
||||||
|
void floodFill (SurfaceT *s, int16_t x, int16_t y, uint8_t newColor);
|
||||||
|
void floodFillBounded (SurfaceT *s, int16_t x, int16_t y,
|
||||||
|
uint8_t newColor, uint8_t boundaryColor);
|
||||||
|
|
||||||
|
void surfaceBlit (SurfaceT *dst, const JoeyAssetT *src, int16_t x, int16_t y);
|
||||||
|
void surfaceBlitMasked (SurfaceT *dst, const JoeyAssetT *src,
|
||||||
|
int16_t x, int16_t y, uint8_t transparentIndex);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Palette and SCB (`joey/palette.h`)
|
||||||
|
|
||||||
|
Colors are 12-bit `$0RGB`. Color 0 of every palette is forced to
|
||||||
|
black on `paletteSet`. Each scanline picks one of the 16 palettes
|
||||||
|
via the SCB.
|
||||||
|
|
||||||
|
```c
|
||||||
|
void paletteSet (SurfaceT *s, uint8_t paletteIndex, const uint16_t *colors16);
|
||||||
|
void paletteGet (const SurfaceT *s, uint8_t paletteIndex, uint16_t *out16);
|
||||||
|
void scbSet (SurfaceT *s, uint16_t line, uint8_t paletteIndex);
|
||||||
|
void scbSetRange (SurfaceT *s, uint16_t firstLine, uint16_t lastLine,
|
||||||
|
uint8_t paletteIndex);
|
||||||
|
uint8_t scbGet (const SurfaceT *s, uint16_t line);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Tiles (`joey/tile.h`)
|
||||||
|
|
||||||
|
A "tile" is just an 8x8-aligned region of any surface. The API moves
|
||||||
|
32-byte chunks between surfaces and provides a small `TileT` value
|
||||||
|
type so callers can stash a copy without allocating a scratch surface.
|
||||||
|
|
||||||
|
```c
|
||||||
|
#define TILE_PIXELS_PER_SIDE 8
|
||||||
|
#define TILE_BYTES_PER_ROW 4
|
||||||
|
#define TILE_BYTES (TILE_BYTES_PER_ROW * TILE_PIXELS_PER_SIDE)
|
||||||
|
#define TILE_BLOCKS_PER_ROW (SURFACE_WIDTH / TILE_PIXELS_PER_SIDE) // 40
|
||||||
|
#define TILE_BLOCKS_PER_COL (SURFACE_HEIGHT / TILE_PIXELS_PER_SIDE) // 25
|
||||||
|
#define TILE_NO_GLYPH ((uint16_t)0xFFFFu)
|
||||||
|
|
||||||
|
typedef struct TileT { uint8_t pixels[TILE_BYTES]; } TileT;
|
||||||
|
|
||||||
|
void tileCopy (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
|
||||||
|
const SurfaceT *src, uint8_t srcBx, uint8_t srcBy);
|
||||||
|
void tileCopyMasked (SurfaceT *dst, uint8_t dstBx, uint8_t dstBy,
|
||||||
|
const SurfaceT *src, uint8_t srcBx, uint8_t srcBy,
|
||||||
|
uint8_t transparentIndex);
|
||||||
|
void tileFill (SurfaceT *s, uint8_t bx, uint8_t by, uint8_t color);
|
||||||
|
void tileSnap (const SurfaceT *src, uint8_t bx, uint8_t by, TileT *out);
|
||||||
|
void tilePaste (SurfaceT *dst, uint8_t bx, uint8_t by, const TileT *in);
|
||||||
|
|
||||||
|
void drawText (SurfaceT *dst, uint8_t bx, uint8_t by,
|
||||||
|
const SurfaceT *fontSurface, const uint16_t *asciiMap,
|
||||||
|
const char *str);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Sprites (`joey/sprite.h`)
|
||||||
|
|
||||||
|
Rectangles of 8x8 tiles drawn at arbitrary pixel positions with
|
||||||
|
color-0 transparency. Tile data is `widthTiles * heightTiles * 32`
|
||||||
|
bytes, tile-major 4bpp packed. Sprites can be runtime-compiled
|
||||||
|
into per-shift code variants for fast draws.
|
||||||
|
|
||||||
|
```c
|
||||||
|
typedef enum { SPRITE_FLAGS_NONE = 0 } SpriteFlagsE;
|
||||||
|
typedef struct SpriteT SpriteT; // opaque
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
SpriteT *sprite;
|
||||||
|
int16_t x, y;
|
||||||
|
uint16_t width, height; // pixels
|
||||||
|
uint8_t *bytes; // caller-owned save-under buffer
|
||||||
|
uint16_t sizeBytes;
|
||||||
|
} SpriteBackupT;
|
||||||
|
|
||||||
|
SpriteT *spriteCreate (const uint8_t *tileData,
|
||||||
|
uint8_t widthTiles, uint8_t heightTiles,
|
||||||
|
SpriteFlagsE flags);
|
||||||
|
SpriteT *spriteCreateFromSurface (const SurfaceT *src, int16_t x, int16_t y,
|
||||||
|
uint8_t widthTiles, uint8_t heightTiles,
|
||||||
|
SpriteFlagsE flags);
|
||||||
|
SpriteT *spriteLoadFile (const char *path, SpriteFlagsE flags);
|
||||||
|
SpriteT *spriteFromCompiledMem (const uint8_t *data, uint32_t length,
|
||||||
|
SpriteFlagsE flags);
|
||||||
|
bool spriteSaveFile (SpriteT *sp, const char *path);
|
||||||
|
void spriteDestroy (SpriteT *sp);
|
||||||
|
|
||||||
|
bool spriteCompile (SpriteT *sp); // build per-shift fast path
|
||||||
|
void spritePrewarm (SpriteT *sp); // hint: compile if not already
|
||||||
|
|
||||||
|
void spriteDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y);
|
||||||
|
void spriteSaveUnder (const SurfaceT *s, SpriteT *sp,
|
||||||
|
int16_t x, int16_t y, SpriteBackupT *backup);
|
||||||
|
void spriteRestoreUnder (SurfaceT *s, const SpriteBackupT *backup);
|
||||||
|
void spriteSaveAndDraw (SurfaceT *s, SpriteT *sp, int16_t x, int16_t y,
|
||||||
|
SpriteBackupT *backup);
|
||||||
|
|
||||||
|
void spriteCompact (void); // defrag the codegen arena
|
||||||
|
uint32_t spriteCodegenBytesUsed (void);
|
||||||
|
uint32_t spriteCodegenBytesTotal (void);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Assets (`joey/asset.h`)
|
||||||
|
|
||||||
|
Small bitmap blits with optional embedded palette, in `.jas` format.
|
||||||
|
Use embedded `const JoeyAssetT` for ship-with-binary art; use the
|
||||||
|
loaders for on-disk assets.
|
||||||
|
|
||||||
|
```c
|
||||||
|
typedef struct {
|
||||||
|
uint16_t width;
|
||||||
|
uint16_t height;
|
||||||
|
bool hasPalette;
|
||||||
|
uint16_t palette[16]; // valid only if hasPalette
|
||||||
|
const uint8_t *pixels; // 4bpp packed, rowBytes = (width+1)/2
|
||||||
|
} JoeyAssetT;
|
||||||
|
|
||||||
|
JoeyAssetT *joeyAssetLoadFile (const char *path);
|
||||||
|
JoeyAssetT *joeyAssetFromMem (const uint8_t *data, uint32_t length);
|
||||||
|
void joeyAssetFree (JoeyAssetT *asset);
|
||||||
|
void joeyAssetApplyPalette (SurfaceT *dst, uint8_t paletteIndex,
|
||||||
|
const JoeyAssetT *asset);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Present (`joey/present.h`)
|
||||||
|
|
||||||
|
```c
|
||||||
|
void stagePresent(void);
|
||||||
|
```
|
||||||
|
|
||||||
|
Flips the dirty rows of the stage to the display, then clears dirty
|
||||||
|
state. Drawing primitives mark dirty as a side effect, so calling
|
||||||
|
`stagePresent` once at end-of-frame is enough.
|
||||||
|
|
||||||
|
|
||||||
|
### Input (`joey/input.h`)
|
||||||
|
|
||||||
|
Call `joeyInputPoll` once per frame, then query the state predicates.
|
||||||
|
Edge predicates (`*Pressed`, `*Released`) fire only in the frame the
|
||||||
|
transition happened.
|
||||||
|
|
||||||
|
```c
|
||||||
|
typedef enum { /* KEY_NONE, KEY_A..KEY_Z, KEY_0..KEY_9, KEY_SPACE,
|
||||||
|
KEY_ESCAPE, KEY_RETURN, KEY_TAB, KEY_BACKSPACE,
|
||||||
|
KEY_UP/DOWN/LEFT/RIGHT, KEY_LSHIFT/RSHIFT/LCTRL/LALT,
|
||||||
|
KEY_F1..KEY_F10, KEY_COUNT */ } JoeyKeyE;
|
||||||
|
typedef enum { MOUSE_BUTTON_NONE, MOUSE_BUTTON_LEFT, MOUSE_BUTTON_RIGHT,
|
||||||
|
MOUSE_BUTTON_MIDDLE, MOUSE_BUTTON_COUNT } JoeyMouseButtonE;
|
||||||
|
typedef enum { JOYSTICK_0, JOYSTICK_1, JOYSTICK_COUNT } JoeyJoystickE;
|
||||||
|
typedef enum { JOY_BUTTON_0, JOY_BUTTON_1, JOY_BUTTON_COUNT } JoeyJoyButtonE;
|
||||||
|
|
||||||
|
#define JOYSTICK_AXIS_MAX 127
|
||||||
|
#define JOYSTICK_AXIS_MIN (-127)
|
||||||
|
|
||||||
|
void joeyInputPoll (void);
|
||||||
|
void joeyWaitForAnyKey (void);
|
||||||
|
|
||||||
|
bool joeyKeyDown (JoeyKeyE key);
|
||||||
|
bool joeyKeyPressed (JoeyKeyE key);
|
||||||
|
bool joeyKeyReleased (JoeyKeyE key);
|
||||||
|
|
||||||
|
int16_t joeyMouseX (void);
|
||||||
|
int16_t joeyMouseY (void);
|
||||||
|
bool joeyMouseDown (JoeyMouseButtonE b);
|
||||||
|
bool joeyMousePressed (JoeyMouseButtonE b);
|
||||||
|
bool joeyMouseReleased (JoeyMouseButtonE b);
|
||||||
|
|
||||||
|
bool joeyJoystickConnected(JoeyJoystickE js);
|
||||||
|
int8_t joeyJoystickX (JoeyJoystickE js);
|
||||||
|
int8_t joeyJoystickY (JoeyJoystickE js);
|
||||||
|
bool joeyJoyDown (JoeyJoystickE js, JoeyJoyButtonE b);
|
||||||
|
bool joeyJoyPressed (JoeyJoystickE js, JoeyJoyButtonE b);
|
||||||
|
bool joeyJoyReleased (JoeyJoystickE js, JoeyJoyButtonE b);
|
||||||
|
void joeyJoystickReset (JoeyJoystickE js, uint8_t deadZone);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Audio (`joey/audio.h`)
|
||||||
|
|
||||||
|
4-channel Protracker-style music plus four one-shot SFX slots. Module
|
||||||
|
data must be the platform-native form produced by `tools/joeymod`
|
||||||
|
(`.mod` for Amiga/DOS/ST; `.ntp` for IIgs; `.amod` if you want
|
||||||
|
loop=false on Amiga). A failed `joeyAudioInit` is non-fatal; the rest
|
||||||
|
of the API stays callable as no-ops.
|
||||||
|
|
||||||
|
```c
|
||||||
|
#define JOEY_AUDIO_SFX_SLOTS 4
|
||||||
|
|
||||||
|
bool joeyAudioInit (void);
|
||||||
|
void joeyAudioShutdown (void);
|
||||||
|
|
||||||
|
void joeyAudioPlayMod (const uint8_t *data, uint32_t length, bool loop);
|
||||||
|
void joeyAudioStopMod (void);
|
||||||
|
bool joeyAudioIsPlayingMod (void);
|
||||||
|
|
||||||
|
void joeyAudioPlaySfx (uint8_t slot, const uint8_t *sample,
|
||||||
|
uint32_t length, uint16_t rateHz);
|
||||||
|
void joeyAudioStopSfx (uint8_t slot);
|
||||||
|
|
||||||
|
void joeyAudioFrameTick (void);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Debug logging (`joey/debug.h`)
|
||||||
|
|
||||||
|
Crash-tracing logger. Writes are buffered and durable across normal
|
||||||
|
exit; call `joeyLogFlush` ahead of suspected hang points if you want
|
||||||
|
a guaranteed last-line-on-disk.
|
||||||
|
|
||||||
|
```c
|
||||||
|
void joeyLog (const char *msg);
|
||||||
|
void joeyLogF (const char *fmt, ...);
|
||||||
|
void joeyLogFlush(void);
|
||||||
|
void joeyLogReset(void);
|
||||||
|
```
|
||||||
|
|
||||||
|
Output goes to `joeylog.txt` in the program's working directory.
|
||||||
|
|
||||||
|
|
||||||
|
### Platform macros (`joey/platform.h`)
|
||||||
|
|
||||||
|
The build system normally sets the platform via `-D`; auto-detection
|
||||||
|
from compiler-predefined macros is a fallback. Game code can
|
||||||
|
conditionally compile on these:
|
||||||
|
|
||||||
|
```
|
||||||
|
JOEYLIB_PLATFORM_IIGS / _AMIGA / _ATARIST / _DOS // exactly one defined
|
||||||
|
JOEYLIB_CPU_65816 / _68000 / _X86
|
||||||
|
JOEYLIB_ENDIAN_LITTLE / _BIG
|
||||||
|
JOEYLIB_NATIVE_CHUNKY / _NATIVE_PLANAR
|
||||||
|
JOEYLIB_HAS_BLITTER / _HAS_COPPER // Amiga only
|
||||||
|
JOEYLIB_PLATFORM_NAME // human-readable string
|
||||||
|
JOEYLIB_VERSION_MAJOR / _MINOR / _PATCH / _STRING
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
TBD.
|
TBD.
|
||||||
|
|
|
||||||
28
scripts/dosbox-386sx16.conf
Normal file
28
scripts/dosbox-386sx16.conf
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
# DOSBox config: simulate an Intel 386SX-16 (1988), the slowest 386
|
||||||
|
# desktop CPU JoeyLib could realistically be run on. Use this floor
|
||||||
|
# to verify the DOS port still hits its frame budget on the bottom of
|
||||||
|
# the 386 stack rather than coasting on host CPU.
|
||||||
|
#
|
||||||
|
# The 386SX is identical to the 386DX in instruction set; the main
|
||||||
|
# difference is the 16-bit external bus (vs 32-bit on DX), which slows
|
||||||
|
# memory-bound code. DOSBox does not model the bus split directly --
|
||||||
|
# the cycles count below approximates the combined 386SX-16 throughput.
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# core = normal accurate per-instruction cycles, not
|
||||||
|
# recompiled-to-host (auto / dynamic would
|
||||||
|
# defeat slow-CPU simulation).
|
||||||
|
# cputype = 386 386 instruction set (no 486 BSWAP /
|
||||||
|
# CMPXCHG, no Pentium MMX).
|
||||||
|
# cycles = fixed 2200 community-standard approximation for
|
||||||
|
# 386SX-16 throughput in DOSBox.
|
||||||
|
# DOSBox-Staging deprecates this in favor
|
||||||
|
# of cpu_cycles, but still accepts it.
|
||||||
|
# Vanilla DOSBox and DOSBox-X only know
|
||||||
|
# the old key, so 'cycles' stays for
|
||||||
|
# cross-fork portability.
|
||||||
|
|
||||||
|
[cpu]
|
||||||
|
core = normal
|
||||||
|
cputype = 386
|
||||||
|
cycles = fixed 2200
|
||||||
|
|
@ -18,6 +18,7 @@ fi
|
||||||
prog=${1:-pattern}
|
prog=${1:-pattern}
|
||||||
repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
|
repo=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
|
||||||
bin_dir=$repo/build/dos/bin
|
bin_dir=$repo/build/dos/bin
|
||||||
|
conf=$repo/scripts/dosbox-386sx16.conf
|
||||||
file=${prog^^}.EXE
|
file=${prog^^}.EXE
|
||||||
|
|
||||||
if [[ ! -f "$bin_dir/$file" ]]; then
|
if [[ ! -f "$bin_dir/$file" ]]; then
|
||||||
|
|
@ -34,7 +35,12 @@ fi
|
||||||
# default capture-on-click behavior fights the VM's grab and mouse
|
# default capture-on-click behavior fights the VM's grab and mouse
|
||||||
# input is unusable. On plain DOSBox this -set flag is unknown and is
|
# input is unusable. On plain DOSBox this -set flag is unknown and is
|
||||||
# logged once as a warning, then ignored -- harmless either way.
|
# logged once as a warning, then ignored -- harmless either way.
|
||||||
|
#
|
||||||
|
# -conf $conf locks the CPU to a simulated 386SX-16 (the slowest
|
||||||
|
# realistic 386 desktop). DOSBox layers configs: anything not set in
|
||||||
|
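# our file falls back to the user's primary config on DOSBox Staging.
# Vanilla DOSBox skips the user config whenever -conf is given (add
# -userconf to load it), so there the built-in defaults apply instead.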
|
||||||
exec dosbox \
|
exec dosbox \
|
||||||
|
-conf "$conf" \
|
||||||
-set "mouse_capture=seamless" \
|
-set "mouse_capture=seamless" \
|
||||||
-c "C:" \
|
-c "C:" \
|
||||||
-c "$file" \
|
-c "$file" \
|
||||||
|
|
|
||||||
|
|
@ -140,15 +140,16 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
|
||||||
// s->pixels src->dst; on planar ports there is no chunky to copy
|
// s->pixels src->dst; on planar ports there is no chunky to copy
|
||||||
// (planes already covered by halSurfaceCopyPlanes). Chunky ports
|
// (planes already covered by halSurfaceCopyPlanes). Chunky ports
|
||||||
// do the memcpy here; Amiga is a no-op.
|
// do the memcpy here; Amiga is a no-op.
|
||||||
// halSurfaceLoadFileChunky / halSurfaceSaveFileChunky wrap fread /
|
// halSurfaceLoadFile / halSurfaceSaveFile wrap fread / fwrite of the
|
||||||
// fwrite of the pixel data. Chunky ports stream directly to/from
|
// pixel data using each port's native pixel format (chunky on
|
||||||
// s->pixels; Amiga uses a scratch buffer + c2p (load) or
|
// IIgs/DOS, interleaved planar on ST, plane-major on Amiga). Files
|
||||||
// plane->chunky derivation (save).
|
// written by one port are NOT loadable by another -- conversion is
|
||||||
|
// the asset pipeline's job.
|
||||||
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
|
uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y);
|
||||||
uint32_t halSurfaceHash(const SurfaceT *s);
|
uint32_t halSurfaceHash(const SurfaceT *s);
|
||||||
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
|
void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src);
|
||||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp);
|
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp);
|
||||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp);
|
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp);
|
||||||
|
|
||||||
// Present the dirty regions of the source surface to the display.
|
// Present the dirty regions of the source surface to the display.
|
||||||
// The cross-platform stagePresent walks the dirty arrays before
|
// The cross-platform stagePresent walks the dirty arrays before
|
||||||
|
|
|
||||||
|
|
@ -158,7 +158,7 @@ bool surfaceLoadFile(SurfaceT *dst, const char *path) {
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!halSurfaceLoadFileChunky(dst, fp)) {
|
if (!halSurfaceLoadFile(dst, fp)) {
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
@ -186,7 +186,7 @@ bool surfaceSaveFile(const SurfaceT *src, const char *path) {
|
||||||
if (fp == NULL) {
|
if (fp == NULL) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!halSurfaceSaveFileChunky(src, fp)) {
|
if (!halSurfaceSaveFile(src, fp)) {
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,127 +0,0 @@
|
||||||
| Amiga chunky-to-planar conversion -- 68000 hand-rolled.
|
|
||||||
|
|
|
||||||
| Drop-in replacement for hal.c's old c2pRange C inner loop. Uses a
|
|
||||||
| 4 KB lookup table built once at HAL init: each (sourceByte, position,
|
|
||||||
| plane) tuple maps to the plane-byte bit contribution that source
|
|
||||||
| byte makes when it sits at that position within a 4-byte (8-pixel)
|
|
||||||
| planar group going to that plane.
|
|
||||||
|
|
|
||||||
| Calling convention: m68k-amigaos-gcc cdecl.
|
|
||||||
| Args on stack at 4(sp), 8(sp), ...
|
|
||||||
| d2-d7, a2-a6 are callee-save.
|
|
||||||
| No return value.
|
|
||||||
|
|
|
||||||
| void chunkyToPlanarRow(const uint8_t *src, ; 4(sp) - 4bpp packed source row
|
|
||||||
| uint8_t *p0, ; 8(sp) - plane 0 dest row
|
|
||||||
| uint8_t *p1, ; 12(sp) - plane 1 dest row
|
|
||||||
| uint8_t *p2, ; 16(sp) - plane 2 dest row
|
|
||||||
| uint8_t *p3, ; 20(sp) - plane 3 dest row
|
|
||||||
| uint16_t n, ; 24(sp) - planar byte count (low word)
|
|
||||||
| const uint8_t *lut); ; 28(sp) - 4 KB LUT base
|
|
||||||
|
|
|
||||||
| LUT layout: lut[src*16 + pos*4 + plane] = 1-byte plane contribution
|
|
||||||
| for source byte `src` sitting at byte-position `pos` (0..3) within
|
|
||||||
| its 4-byte planar group, going to plane `plane` (0..3). All 16
|
|
||||||
| (pos, plane) entries for one src byte are contiguous, so the inner
|
|
||||||
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
|
|
||||||
| (0..15) and never has to advance an index register.
|
|
||||||
|
|
|
||||||
| Per planar byte we consume 4 source bytes (positions 0..3 of the
|
|
||||||
| 8-pixel group). For each we compute d4 = src*16 with four add.w's
|
|
||||||
| (equivalent to asl.w #4; 16 vs 14 cycles on a 68000) and OR the four plane contributions
|
|
||||||
| into d0..d3 with byte-displaced (a5,d4.w) reads.
|
|
||||||
|
|
|
||||||
| GAS-syntax (binutils m68k); assembled by m68k-amigaos-as via the
|
|
||||||
| gcc driver.
|
|
||||||
|
|
||||||
.text
|
|
||||||
.globl _chunkyToPlanarRow
|
|
||||||
|
|
||||||
| Stack frame size of MOVEM.L block: d2-d7 (6) + a2-a6 (5) = 11 regs
|
|
||||||
| * 4 bytes = 44 bytes. Args therefore start at the original sp+4
|
|
||||||
| offset PLUS 44.
|
|
||||||
.equ SAVED_REGS_SIZE, 44
|
|
||||||
|
|
||||||
|
|
||||||
_chunkyToPlanarRow:
|
|
||||||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
|
||||||
|
|
||||||
move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src
|
|
||||||
move.l 8+SAVED_REGS_SIZE(%sp),%a1 | p0
|
|
||||||
move.l 12+SAVED_REGS_SIZE(%sp),%a2 | p1
|
|
||||||
move.l 16+SAVED_REGS_SIZE(%sp),%a3 | p2
|
|
||||||
move.l 20+SAVED_REGS_SIZE(%sp),%a4 | p3
|
|
||||||
| n is a uint16_t but GCC promotes to int and pushes a
|
|
||||||
| full 4 bytes -- the low word lives at +2 in big-endian
|
|
||||||
| layout.
|
|
||||||
move.w 24+SAVED_REGS_SIZE+2(%sp),%d7 | planar byte count
|
|
||||||
move.l 28+SAVED_REGS_SIZE(%sp),%a5 | LUT base
|
|
||||||
|
|
||||||
subq.w #1,%d7 | DBRA: count-1
|
|
||||||
bmi .Ldone | nothing to do
|
|
||||||
|
|
||||||
.LbyteLoop:
|
|
||||||
moveq #0,%d0 | plane 0 acc
|
|
||||||
moveq #0,%d1 | plane 1 acc
|
|
||||||
moveq #0,%d2 | plane 2 acc
|
|
||||||
moveq #0,%d3 | plane 3 acc
|
|
||||||
|
|
||||||
| ----- Source byte position 0 -----
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4 | src[0]
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4 | d4 = src * 16
|
|
||||||
or.b 0(%a5,%d4.w),%d0 | pos0 plane0
|
|
||||||
or.b 1(%a5,%d4.w),%d1 | pos0 plane1
|
|
||||||
or.b 2(%a5,%d4.w),%d2 | pos0 plane2
|
|
||||||
or.b 3(%a5,%d4.w),%d3 | pos0 plane3
|
|
||||||
|
|
||||||
| ----- Source byte position 1 -----
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4 | src[1]
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 4(%a5,%d4.w),%d0 | pos1 plane0
|
|
||||||
or.b 5(%a5,%d4.w),%d1 | pos1 plane1
|
|
||||||
or.b 6(%a5,%d4.w),%d2 | pos1 plane2
|
|
||||||
or.b 7(%a5,%d4.w),%d3 | pos1 plane3
|
|
||||||
|
|
||||||
| ----- Source byte position 2 -----
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4 | src[2]
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 8(%a5,%d4.w),%d0 | pos2 plane0
|
|
||||||
or.b 9(%a5,%d4.w),%d1 | pos2 plane1
|
|
||||||
or.b 10(%a5,%d4.w),%d2 | pos2 plane2
|
|
||||||
or.b 11(%a5,%d4.w),%d3 | pos2 plane3
|
|
||||||
|
|
||||||
| ----- Source byte position 3 -----
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4 | src[3]
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 12(%a5,%d4.w),%d0 | pos3 plane0
|
|
||||||
or.b 13(%a5,%d4.w),%d1 | pos3 plane1
|
|
||||||
or.b 14(%a5,%d4.w),%d2 | pos3 plane2
|
|
||||||
or.b 15(%a5,%d4.w),%d3 | pos3 plane3
|
|
||||||
|
|
||||||
| ----- Store plane bytes -----
|
|
||||||
move.b %d0,(%a1)+
|
|
||||||
move.b %d1,(%a2)+
|
|
||||||
move.b %d2,(%a3)+
|
|
||||||
move.b %d3,(%a4)+
|
|
||||||
|
|
||||||
dbra %d7,.LbyteLoop
|
|
||||||
|
|
||||||
.Ldone:
|
|
||||||
movem.l (%sp)+,%d2-%d7/%a2-%a6
|
|
||||||
rts
|
|
||||||
|
|
@ -115,69 +115,10 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]
|
||||||
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4)));
|
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE] __attribute__((aligned(4)));
|
||||||
static bool gCacheValid = false;
|
static bool gCacheValid = false;
|
||||||
|
|
||||||
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRow
|
|
||||||
// (src/port/amiga/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane] =
|
|
||||||
// the plane-byte bit contribution that source byte `src` makes to
|
|
||||||
// plane `plane` when it sits at byte-position `pos` within a 4-byte
|
|
||||||
// (8-pixel) planar group. The src-major layout lets the asm inner
|
|
||||||
// loop reach all 16 (pos, plane) entries for a single src byte via
|
|
||||||
// 8-bit displacements off (a5, d4.w) without any LEA between reads.
|
|
||||||
static uint8_t gC2pLut[4 * 1024];
|
|
||||||
static bool gC2pLutReady = false;
|
|
||||||
|
|
||||||
static bool paletteOrScbChanged(const SurfaceT *src);
|
static bool paletteOrScbChanged(const SurfaceT *src);
|
||||||
static void initC2pLut(void);
|
|
||||||
|
|
||||||
// Provided by src/port/amiga/c2p.s.
|
|
||||||
extern void chunkyToPlanarRow(const uint8_t *src,
|
|
||||||
uint8_t *p0, uint8_t *p1, uint8_t *p2, uint8_t *p3,
|
|
||||||
uint16_t numPlanarBytes,
|
|
||||||
const uint8_t *lut);
|
|
||||||
|
|
||||||
// ----- Internal helpers (alphabetical) -----
|
// ----- Internal helpers (alphabetical) -----
|
||||||
|
|
||||||
// Build the 4 KB chunky-to-planar lookup table consumed by
|
|
||||||
// chunkyToPlanarRow. For each (pos, plane, src) tuple, store the
|
|
||||||
// bit contribution that source byte `src` makes to plane `plane`
|
|
||||||
// when it sits at byte-position `pos` (0..3) within a 4-byte
|
|
||||||
// (8-pixel) planar group:
|
|
||||||
//
|
|
||||||
// - src high nibble = leftmost pixel -> plane bit (7 - 2*pos)
|
|
||||||
// - src low nibble = rightmost pixel -> plane bit (6 - 2*pos)
|
|
||||||
static void initC2pLut(void) {
|
|
||||||
uint16_t pos;
|
|
||||||
uint16_t plane;
|
|
||||||
uint16_t src;
|
|
||||||
uint8_t highShift;
|
|
||||||
uint8_t lowShift;
|
|
||||||
uint8_t highBit;
|
|
||||||
uint8_t lowBit;
|
|
||||||
|
|
||||||
if (gC2pLutReady) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (src = 0; src < 256; src++) {
|
|
||||||
for (pos = 0; pos < 4; pos++) {
|
|
||||||
highShift = (uint8_t)(7 - 2 * pos);
|
|
||||||
lowShift = (uint8_t)(6 - 2 * pos);
|
|
||||||
for (plane = 0; plane < 4; plane++) {
|
|
||||||
highBit = (uint8_t)(((src >> 4) >> plane) & 1);
|
|
||||||
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
|
|
||||||
gC2pLut[src * 16 + pos * 4 + plane] =
|
|
||||||
(uint8_t)((highBit << highShift) | (lowBit << lowShift));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
gC2pLutReady = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// (Phase 9 deleted c2pRange. halSurfaceLoadPlanes inlines its own
|
|
||||||
// per-row chunkyToPlanarRow loop -- the only code path that still
|
|
||||||
// converts chunky to planar today, since asset loading is the only
|
|
||||||
// surface mutation that doesn't go through a planar-aware primitive.)
|
|
||||||
|
|
||||||
|
|
||||||
// Build a user copper list for per-scanline palette (SCB emulation).
|
// Build a user copper list for per-scanline palette (SCB emulation).
|
||||||
// One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is
|
// One WAIT + 16 MOVEs per displayed scanline + one CEND. The list is
|
||||||
// stored in gNewUCL until installCopperList swaps it onto the screen.
|
// stored in gNewUCL until installCopperList swaps it onto the screen.
|
||||||
|
|
@ -1358,35 +1299,6 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Helper used by Amiga halSurfaceLoadFileChunky to populate planes
|
|
||||||
* from a freshly-loaded chunky pixel buffer (s->pixels). */
|
|
||||||
static void amigaPopulatePlanesFromChunky(SurfaceT *s) {
|
|
||||||
AmigaPlanarT *pd;
|
|
||||||
int16_t y;
|
|
||||||
const uint8_t *srcLine;
|
|
||||||
UBYTE *p0;
|
|
||||||
UBYTE *p1;
|
|
||||||
UBYTE *p2;
|
|
||||||
UBYTE *p3;
|
|
||||||
|
|
||||||
pd = (AmigaPlanarT *)s->portData;
|
|
||||||
if (pd == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!gC2pLutReady) {
|
|
||||||
initC2pLut();
|
|
||||||
}
|
|
||||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
|
||||||
srcLine = &s->pixels[y * SURFACE_BYTES_PER_ROW];
|
|
||||||
p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Phase 6 planar dual-write for sprite draw. Walks the sprite's
|
// Phase 6 planar dual-write for sprite draw. Walks the sprite's
|
||||||
// chunky tile data with the same clipping the cross-platform code
|
// chunky tile data with the same clipping the cross-platform code
|
||||||
// applies, calling amigaPlanarSetPixel for every non-transparent
|
// applies, calling amigaPlanarSetPixel for every non-transparent
|
||||||
|
|
@ -2118,7 +2030,9 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
|
||||||
|
|
||||||
|
|
||||||
/* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes
|
/* Reverse-c2p: per row, derive 160 chunky bytes from 40 plane bytes
|
||||||
* (per plane, 4 planes). Used by halSurfaceHash, halSurfaceSaveFileChunky.
|
* (per plane, 4 planes). Used by halSurfaceHash to fold the planar
|
||||||
|
* surface into the same byte-stream the chunky ports hash, so cross-
|
||||||
|
* port hash comparisons stay valid.
|
||||||
* Walks 8 pixels per planar-byte column; per pixel assembles nibble
|
* Walks 8 pixels per planar-byte column; per pixel assembles nibble
|
||||||
* from 4 plane bits. Output: 4 chunky bytes per planar-byte column
|
* from 4 plane bits. Output: 4 chunky bytes per planar-byte column
|
||||||
* (since 8 pixels = 4 chunky bytes at 2px/byte). */
|
* (since 8 pixels = 4 chunky bytes at 2px/byte). */
|
||||||
|
|
@ -2204,62 +2118,35 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
// On-disk format is the Amiga's native plane-major buffer: planes
|
||||||
|
// 0..3 written sequentially, AMIGA_PLANE_SIZE bytes each.
|
||||||
|
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||||
AmigaPlanarT *pd;
|
AmigaPlanarT *pd;
|
||||||
uint8_t *scratch;
|
uint8_t i;
|
||||||
uint8_t *srcLine;
|
|
||||||
int16_t y;
|
|
||||||
UBYTE *p0;
|
|
||||||
UBYTE *p1;
|
|
||||||
UBYTE *p2;
|
|
||||||
UBYTE *p3;
|
|
||||||
bool ok;
|
|
||||||
|
|
||||||
pd = (AmigaPlanarT *)dst->portData;
|
pd = (AmigaPlanarT *)dst->portData;
|
||||||
if (pd == NULL) {
|
if (pd == NULL) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
/* fread the chunky file payload into a scratch buffer, then c2p
|
for (i = 0; i < AMIGA_BITPLANES; i++) {
|
||||||
* directly into our planes. The scratch is a one-shot AllocMem
|
if (fread(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
|
||||||
* (PUBLIC, not chip) since chunkyToPlanarRow only reads it. */
|
return false;
|
||||||
scratch = (uint8_t *)AllocMem((ULONG)SURFACE_PIXELS_SIZE, (ULONG)MEMF_PUBLIC);
|
|
||||||
if (scratch == NULL) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
|
|
||||||
if (ok) {
|
|
||||||
if (!gC2pLutReady) {
|
|
||||||
initC2pLut();
|
|
||||||
}
|
|
||||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
|
||||||
srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
|
|
||||||
p0 = pd->planes[0] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
p1 = pd->planes[1] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
p2 = pd->planes[2] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
p3 = pd->planes[3] + (uint16_t)y * AMIGA_BYTES_PER_ROW;
|
|
||||||
chunkyToPlanarRow(srcLine, p0, p1, p2, p3, AMIGA_BYTES_PER_ROW, gC2pLut);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
FreeMem(scratch, (ULONG)SURFACE_PIXELS_SIZE);
|
return true;
|
||||||
return ok;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||||
AmigaPlanarT *pd;
|
AmigaPlanarT *pd;
|
||||||
uint8_t chunkyRow[SURFACE_BYTES_PER_ROW];
|
uint8_t i;
|
||||||
int16_t y;
|
|
||||||
|
|
||||||
pd = (AmigaPlanarT *)src->portData;
|
pd = (AmigaPlanarT *)src->portData;
|
||||||
if (pd == NULL) {
|
if (pd == NULL) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
/* Per row: derive chunky from planes, write 160 bytes. Less
|
for (i = 0; i < AMIGA_BITPLANES; i++) {
|
||||||
* efficient than a single fwrite of a full buffer but avoids
|
if (fwrite(pd->planes[i], 1, AMIGA_PLANE_SIZE, fp) != AMIGA_PLANE_SIZE) {
|
||||||
* needing a 32 KB scratch allocation. */
|
|
||||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
|
||||||
amigaPlanesToChunkyRow(pd, y, chunkyRow);
|
|
||||||
if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,188 +0,0 @@
|
||||||
| Atari ST chunky-to-planar conversion -- 68000 hand-rolled.
|
|
||||||
|
|
|
||||||
| Drop-in replacement for hal.c's old c2pRow C inner loop. The C
|
|
||||||
| version walked every pixel and built each plane word with a
|
|
||||||
| run-time variable bit shift (`1 << bit`), which costs ~6+2*bit
|
|
||||||
| cycles on 68000 -- roughly 100+ cycles per pixel after GCC's m68k
|
|
||||||
| codegen overhead. This rewrite uses a 4 KB lookup table built once
|
|
||||||
| at HAL init: same layout as the Amiga c2p LUT, so the
|
|
||||||
| (sourceByte, position, plane) -> 2-bit contribution mapping is
|
|
||||||
| identical, but the routine packs results into ST word-interleaved
|
|
||||||
| planar (4 plane words per 16-pixel group) instead of 4 separate
|
|
||||||
| plane bytes.
|
|
||||||
|
|
|
||||||
| Each ST group is 8 source bytes -> 4 plane words. Source byte
|
|
||||||
| positions 0..3 contribute to the HIGH byte of each plane word
|
|
||||||
| (bits 15..8); positions 4..7 contribute to the LOW byte (bits
|
|
||||||
| 7..0). Within a byte, the LUT for (src, bp%4, plane) already
|
|
||||||
| places bits at (7-2*(bp%4), 6-2*(bp%4)), so we use the SAME LUT
|
|
||||||
| entries for both halves -- we just shift d0..d3 left by 8 between
|
|
||||||
| the halves to move the high-half bits up before the low half ORs
|
|
||||||
| into the now-empty low byte.
|
|
||||||
|
|
|
||||||
| Calling convention: m68k-atari-mint-gcc cdecl.
|
|
||||||
| Args on stack at 4(sp), 8(sp), ...
|
|
||||||
| d2-d7, a2-a6 are callee-save.
|
|
||||||
| No return value.
|
|
||||||
|
|
|
||||||
| void chunkyToPlanarRowSt(const uint8_t *src, ; 4(sp) - 4bpp packed source row
|
|
||||||
| uint16_t *dst, ; 8(sp) - planar dest row (uint16_t*)
|
|
||||||
| uint16_t groupStart, ; 12(sp) - first group index (low word)
|
|
||||||
| uint16_t groupEnd, ; 16(sp) - one-past-last group index (low word)
|
|
||||||
| const uint8_t *lut); ; 20(sp) - 4 KB LUT base
|
|
||||||
|
|
|
||||||
| LUT layout: lut[src*16 + pos*4 + plane] (uint8) = the 2-bit plane
|
|
||||||
| contribution for source byte `src` at byte-position `pos` (0..3
|
|
||||||
| within a 4-byte chunk) going to plane `plane` (0..3). All 16
|
|
||||||
| (pos, plane) entries for one src byte are contiguous, so the inner
|
|
||||||
| loop reaches every entry off (a5, d4.w) with an 8-bit displacement
|
|
||||||
| (0..15) without LEA between reads.
|
|
||||||
|
|
|
||||||
| GAS-syntax (binutils m68k); assembled by m68k-atari-mint-as via
|
|
||||||
| the gcc driver.
|
|
||||||
|
|
||||||
.text
|
|
||||||
.globl _chunkyToPlanarRowSt
|
|
||||||
|
|
||||||
| MOVEM frame: d2-d7 (6) + a2-a6 (5) = 11 regs * 4 bytes = 44 bytes.
|
|
||||||
.equ SAVED_REGS_SIZE, 44
|
|
||||||
|
|
||||||
|
|
||||||
_chunkyToPlanarRowSt:
|
|
||||||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
|
||||||
|
|
||||||
move.l 4+SAVED_REGS_SIZE(%sp),%a0 | src row base
|
|
||||||
move.l 8+SAVED_REGS_SIZE(%sp),%a1 | dst (uint16_t*)
|
|
||||||
| Both groupStart and groupEnd are uint16_t but GCC
|
|
||||||
| promotes them to int and pushes 4 bytes each; the
|
|
||||||
| low word lives at +2 in big-endian layout.
|
|
||||||
move.w 12+SAVED_REGS_SIZE+2(%sp),%d6 | groupStart
|
|
||||||
move.w 16+SAVED_REGS_SIZE+2(%sp),%d7 | groupEnd
|
|
||||||
move.l 20+SAVED_REGS_SIZE(%sp),%a5 | LUT base
|
|
||||||
|
|
||||||
| Advance src and dst to the first group's data.
|
|
||||||
| Each group consumes 8 source bytes and produces 4
|
|
||||||
| dest words (8 bytes), so both pointers advance by
|
|
||||||
| groupStart * 8.
|
|
||||||
move.w %d6,%d4
|
|
||||||
lsl.w #3,%d4
|
|
||||||
add.w %d4,%a0
|
|
||||||
add.w %d4,%a1
|
|
||||||
|
|
||||||
sub.w %d6,%d7 | groupCount = end - start
|
|
||||||
subq.w #1,%d7 | DBRA bias
|
|
||||||
bmi .Ldone
|
|
||||||
|
|
||||||
.LgroupLoop:
|
|
||||||
moveq #0,%d0 | plane 0 acc
|
|
||||||
moveq #0,%d1 | plane 1 acc
|
|
||||||
moveq #0,%d2 | plane 2 acc
|
|
||||||
moveq #0,%d3 | plane 3 acc
|
|
||||||
|
|
||||||
| ===== Source bytes 0..3 -> high byte of each plane word =====
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4 | d4 = src * 16
|
|
||||||
or.b 0(%a5,%d4.w),%d0
|
|
||||||
or.b 1(%a5,%d4.w),%d1
|
|
||||||
or.b 2(%a5,%d4.w),%d2
|
|
||||||
or.b 3(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 4(%a5,%d4.w),%d0
|
|
||||||
or.b 5(%a5,%d4.w),%d1
|
|
||||||
or.b 6(%a5,%d4.w),%d2
|
|
||||||
or.b 7(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 8(%a5,%d4.w),%d0
|
|
||||||
or.b 9(%a5,%d4.w),%d1
|
|
||||||
or.b 10(%a5,%d4.w),%d2
|
|
||||||
or.b 11(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 12(%a5,%d4.w),%d0
|
|
||||||
or.b 13(%a5,%d4.w),%d1
|
|
||||||
or.b 14(%a5,%d4.w),%d2
|
|
||||||
or.b 15(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
| Move accumulated bits into the HIGH byte of each word.
|
|
||||||
lsl.w #8,%d0
|
|
||||||
lsl.w #8,%d1
|
|
||||||
lsl.w #8,%d2
|
|
||||||
lsl.w #8,%d3
|
|
||||||
|
|
||||||
| ===== Source bytes 4..7 -> low byte of each plane word =====
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 0(%a5,%d4.w),%d0
|
|
||||||
or.b 1(%a5,%d4.w),%d1
|
|
||||||
or.b 2(%a5,%d4.w),%d2
|
|
||||||
or.b 3(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 4(%a5,%d4.w),%d0
|
|
||||||
or.b 5(%a5,%d4.w),%d1
|
|
||||||
or.b 6(%a5,%d4.w),%d2
|
|
||||||
or.b 7(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 8(%a5,%d4.w),%d0
|
|
||||||
or.b 9(%a5,%d4.w),%d1
|
|
||||||
or.b 10(%a5,%d4.w),%d2
|
|
||||||
or.b 11(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
moveq #0,%d4
|
|
||||||
move.b (%a0)+,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
add.w %d4,%d4
|
|
||||||
or.b 12(%a5,%d4.w),%d0
|
|
||||||
or.b 13(%a5,%d4.w),%d1
|
|
||||||
or.b 14(%a5,%d4.w),%d2
|
|
||||||
or.b 15(%a5,%d4.w),%d3
|
|
||||||
|
|
||||||
| Store 4 plane words.
|
|
||||||
move.w %d0,(%a1)+
|
|
||||||
move.w %d1,(%a1)+
|
|
||||||
move.w %d2,(%a1)+
|
|
||||||
move.w %d3,(%a1)+
|
|
||||||
|
|
||||||
dbra %d7,.LgroupLoop
|
|
||||||
|
|
||||||
.Ldone:
|
|
||||||
movem.l (%sp)+,%d2-%d7/%a2-%a6
|
|
||||||
rts
|
|
||||||
|
|
@ -82,11 +82,9 @@
|
||||||
.macro YP_REC slot, signOp, yreg
|
.macro YP_REC slot, signOp, yreg
|
||||||
move.l %a4,%d6
|
move.l %a4,%d6
|
||||||
\signOp\().w \yreg,%d6 | d6.w = yp
|
\signOp\().w \yreg,%d6 | d6.w = yp
|
||||||
move.w %d6,%d0
|
add.w %d6,%d6 | * 2 for word index
|
||||||
lsl.w #5,%d6 | d6 = yp << 5
|
move.w (%a6,%d6.w),%d6 | yLut[yp] = yp * 160
|
||||||
lsl.w #7,%d0 | d0 = yp << 7
|
move.w %d6,\slot(%sp)
|
||||||
add.w %d6,%d0 | d0 = yp * 160
|
|
||||||
move.w %d0,\slot(%sp)
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -223,14 +221,21 @@ _surface68kStCircleOutline:
|
||||||
moveq #1,%d4
|
moveq #1,%d4
|
||||||
sub.w %d2,%d4 | err = 1 - bx
|
sub.w %d2,%d4 | err = 1 - bx
|
||||||
|
|
||||||
|
| a6 = yLut base (yp -> yp*160). Lookup is faster than
|
||||||
|
| the 4 cyc + 4 cyc + 18 cyc + 22 cyc + 4 cyc shift+add
|
||||||
|
| chain we used to do per YP_REC. Saved across all 4
|
||||||
|
| YP_RECs per Bresenham iter (~120 cyc/iter).
|
||||||
|
| Shared LUT lives in lineSpan.s; referenced by absolute address (not PC-relative).
|
||||||
|
lea _gStRowOffsetLut,%a6
|
||||||
|
|
||||||
| Dispatch on color (low 4 bits) -> one of 16 main loops.
|
| Dispatch on color (low 4 bits) -> one of 16 main loops.
|
||||||
moveq #0,%d6
|
moveq #0,%d6
|
||||||
move.b SP_COLOR(%sp),%d6
|
move.b SP_COLOR(%sp),%d6
|
||||||
and.w #0x0F,%d6
|
and.w #0x0F,%d6
|
||||||
add.w %d6,%d6
|
add.w %d6,%d6
|
||||||
add.w %d6,%d6 | * 4 for bra.w table
|
add.w %d6,%d6 | * 4 for bra.w table
|
||||||
lea .LcoStTable(%pc),%a6
|
lea .LcoStTable(%pc),%a2
|
||||||
jmp 0(%a6,%d6.w)
|
jmp 0(%a2,%d6.w)
|
||||||
|
|
||||||
.LcoStTable:
|
.LcoStTable:
|
||||||
bra.w .LcoStLoop_0
|
bra.w .LcoStLoop_0
|
||||||
|
|
@ -280,3 +285,4 @@ bitMaskWordLut:
|
||||||
.word 0x0800, 0x0400, 0x0200, 0x0100
|
.word 0x0800, 0x0400, 0x0200, 0x0100
|
||||||
.word 0x0080, 0x0040, 0x0020, 0x0010
|
.word 0x0080, 0x0040, 0x0020, 0x0010
|
||||||
.word 0x0008, 0x0004, 0x0002, 0x0001
|
.word 0x0008, 0x0004, 0x0002, 0x0001
|
||||||
|
| (yLut now lives in lineSpan.s as the shared _gStRowOffsetLut)
|
||||||
|
|
|
||||||
|
|
@ -9,28 +9,16 @@
|
||||||
| Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r)
|
| Caller MUST guarantee the bounding box (cx-r, cy-r) (cx+r, cy+r)
|
||||||
| is fully on-surface. Off-surface circles fall back to the C walker.
|
| is fully on-surface. Off-surface circles fall back to the C walker.
|
||||||
|
|
|
|
||||||
|
| Phase 10 final: 16-way color dispatch at the OUTER loop. Each color
|
||||||
|
| variant has its own Bresenham body where SPAN_BODY inlines a hard-
|
||||||
|
| coded 4-plane mask RMW (no btst, no bsr/rts). Saves ~120 cyc per
|
||||||
|
| applyMask call (was ~180 via bsr applyMask with runtime btst on d7).
|
||||||
|
|
|
||||||
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
||||||
|
|
|
|
||||||
| void surface68kStFillCircle(uint8_t *base,
|
| void surface68kStFillCircle(uint8_t *base,
|
||||||
| uint16_t cx, uint16_t cy,
|
| uint16_t cx, uint16_t cy,
|
||||||
| uint16_t r, uint8_t color);
|
| uint16_t r, uint8_t color);
|
||||||
|
|
|
||||||
| Register allocation across the loop:
|
|
||||||
| d2.w = bx (Bresenham, starts at r)
|
|
||||||
| d3.w = by (Bresenham, starts at 0)
|
|
||||||
| d4.w = err
|
|
||||||
| d5.l = loLong (planes 0+1 long template)
|
|
||||||
| d6.l = hiLong (planes 2+3 long template)
|
|
||||||
| d7.b = color (low nibble; tested via btst)
|
|
||||||
| a3 = base
|
|
||||||
| a4 = scratch / current group pointer
|
|
||||||
| d0,d1 = scratch
|
|
||||||
|
|
|
||||||
| Stack scratch (8 bytes at 0(sp)..7(sp)):
|
|
||||||
| 0..1 leftMask (word; per pair)
|
|
||||||
| 2..3 rightMask (word; per pair)
|
|
||||||
| 4..5 numGroups (word; per pair)
|
|
||||||
| 6..7 groupFirstByteOff (word; per pair)
|
|
||||||
|
|
||||||
.text
|
.text
|
||||||
|
|
||||||
|
|
@ -42,7 +30,7 @@
|
||||||
.equ SP_FC_CX, SP_FC_OFF + 4 + 2
|
.equ SP_FC_CX, SP_FC_OFF + 4 + 2
|
||||||
.equ SP_FC_CY, SP_FC_OFF + 8 + 2
|
.equ SP_FC_CY, SP_FC_OFF + 8 + 2
|
||||||
.equ SP_FC_R, SP_FC_OFF + 12 + 2
|
.equ SP_FC_R, SP_FC_OFF + 12 + 2
|
||||||
.equ SP_FC_COLOR, SP_FC_OFF + 16 + 3
|
.equ SP_FC_COLOR, SP_FC_OFF + 20 + 3
|
||||||
|
|
||||||
|
|
||||||
| ---- COMPUTE_PAIR_MASKS macro -----------------------------------
|
| ---- COMPUTE_PAIR_MASKS macro -----------------------------------
|
||||||
|
|
@ -50,18 +38,15 @@
|
||||||
| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
|
| Output: 0(sp) leftMask, 2(sp) rightMask, 4(sp) numGroups,
|
||||||
| 6(sp) groupFirstByteOff
|
| 6(sp) groupFirstByteOff
|
||||||
| Trashes: d0, d1
|
| Trashes: d0, d1
|
||||||
| (No labels: straightline.)
|
|
||||||
|
|
||||||
.macro COMPUTE_PAIR_MASKS
|
.macro COMPUTE_PAIR_MASKS
|
||||||
move.w %d0,0(%sp) | stash left
|
move.w %d0,0(%sp) | stash left
|
||||||
move.w %d1,2(%sp) | stash right
|
move.w %d1,2(%sp) | stash right
|
||||||
| groupFirst & groupFirstByteOff
|
|
||||||
move.w %d0,%d1
|
move.w %d0,%d1
|
||||||
lsr.w #4,%d1 | groupFirst
|
lsr.w #4,%d1 | groupFirst
|
||||||
move.w %d1,%d0
|
move.w %d1,%d0
|
||||||
lsl.w #3,%d0 | groupFirstByteOff
|
lsl.w #3,%d0 | groupFirstByteOff
|
||||||
move.w %d0,6(%sp)
|
move.w %d0,6(%sp)
|
||||||
| numGroups = (right >> 4) - groupFirst
|
|
||||||
move.w 2(%sp),%d0
|
move.w 2(%sp),%d0
|
||||||
lsr.w #4,%d0 | groupLast
|
lsr.w #4,%d0 | groupLast
|
||||||
sub.w %d1,%d0 | numGroups
|
sub.w %d1,%d0 | numGroups
|
||||||
|
|
@ -81,25 +66,53 @@
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
| ---- SPAN_BODY macro --------------------------------------------
|
| ---- APPLY_MASK_INLINE macro ------------------------------------
|
||||||
| Render one row span using the pair masks at 0(sp)..7(sp).
|
| 4-plane mask RMW with HARDCODED color. a4 advances by 8 (postinc).
|
||||||
| Input: d0.w = y (signed)
|
| Inputs: d0.w = mask, a4 = group ptr
|
||||||
| a3 = base, d5 = loLong, d6 = hiLong, d7 = color
|
| Trashes: d1 (notMask scratch)
|
||||||
| Trashes: d0, d1, a4
|
|
||||||
| Macro takes an idx parameter for unique labels.
|
|
||||||
|
|
||||||
.macro SPAN_BODY
|
.macro APPLY_MASK_INLINE color
|
||||||
| a4 = base + y*160
|
move.w %d0,%d1
|
||||||
ext.l %d0
|
not.w %d1
|
||||||
move.l %d0,%d1
|
.if ((\color) & 1)
|
||||||
lsl.l #5,%d0
|
or.w %d0,(%a4)+
|
||||||
lsl.l #7,%d1
|
.else
|
||||||
add.l %d1,%d0 | y*160
|
and.w %d1,(%a4)+
|
||||||
lea 0(%a3,%d0.l),%a4
|
.endif
|
||||||
| a4 += groupFirstByteOff
|
.if ((\color) & 2)
|
||||||
moveq #0,%d0
|
or.w %d0,(%a4)+
|
||||||
move.w 6(%sp),%d0
|
.else
|
||||||
add.l %d0,%a4
|
and.w %d1,(%a4)+
|
||||||
|
.endif
|
||||||
|
.if ((\color) & 4)
|
||||||
|
or.w %d0,(%a4)+
|
||||||
|
.else
|
||||||
|
and.w %d1,(%a4)+
|
||||||
|
.endif
|
||||||
|
.if ((\color) & 8)
|
||||||
|
or.w %d0,(%a4)+
|
||||||
|
.else
|
||||||
|
and.w %d1,(%a4)+
|
||||||
|
.endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
| ---- SPAN_BODY macro --------------------------------------------
|
||||||
|
| Render one row span. Color hardcoded.
|
||||||
|
| Input: d0.w = y (signed)
|
||||||
|
| a3 = base, d5 = loLong, d6 = hiLong
|
||||||
|
| masks at 0..7(sp): leftMask, rightMask, numGroups, groupFirstByteOff
|
||||||
|
| Trashes: d0, d1, a4
|
||||||
|
|
||||||
|
.macro SPAN_BODY color
|
||||||
|
| a4 = base + y*160 + groupFirstByteOff
|
||||||
|
| y*160 via shared _gStRowOffsetLut (a2 holds lut base).
|
||||||
|
| byteOff (y*160 + groupFirstByteOff) fits in 16 bits
|
||||||
|
| (max 31992), so word-only ops + .w-indexed lea.
|
||||||
|
add.w %d0,%d0 | y * 2 (word index)
|
||||||
|
move.w (%a2,%d0.w),%d0 | d0 = y * 160
|
||||||
|
add.w 6(%sp),%d0 | + groupFirstByteOff
|
||||||
|
lea 0(%a3,%d0.w),%a4
|
||||||
| numGroups in d1
|
| numGroups in d1
|
||||||
move.w 4(%sp),%d1
|
move.w 4(%sp),%d1
|
||||||
tst.w %d1
|
tst.w %d1
|
||||||
|
|
@ -107,15 +120,14 @@
|
||||||
| single-group: combinedMask = leftMask & rightMask
|
| single-group: combinedMask = leftMask & rightMask
|
||||||
move.w 0(%sp),%d0
|
move.w 0(%sp),%d0
|
||||||
and.w 2(%sp),%d0
|
and.w 2(%sp),%d0
|
||||||
bsr .Lfc_applyMask
|
APPLY_MASK_INLINE \color
|
||||||
bra.w .Lsb_done\@
|
bra.w .Lsb_done\@
|
||||||
.Lsb_multi\@:
|
.Lsb_multi\@:
|
||||||
| leading mask. applyMask postinc-advances a4 by 8
|
| leading mask. APPLY_MASK_INLINE postinc-advances a4 by 8.
|
||||||
| (the 4 plane RMWs each advance by 2 via (a4)+).
|
| APPLY trashes d1, so reload numGroups after.
|
||||||
| applyMask trashes d1, so reload numGroups after bsr.
|
|
||||||
move.w 0(%sp),%d0
|
move.w 0(%sp),%d0
|
||||||
bsr .Lfc_applyMask
|
APPLY_MASK_INLINE \color
|
||||||
move.w 4(%sp),%d1 | reload numGroups
|
move.w 4(%sp),%d1
|
||||||
subq.w #1,%d1 | d1 = numMid
|
subq.w #1,%d1 | d1 = numMid
|
||||||
beq.s .Lsb_skipMid\@
|
beq.s .Lsb_skipMid\@
|
||||||
.Lsb_midLoop\@:
|
.Lsb_midLoop\@:
|
||||||
|
|
@ -126,11 +138,71 @@
|
||||||
.Lsb_skipMid\@:
|
.Lsb_skipMid\@:
|
||||||
| trailing mask
|
| trailing mask
|
||||||
move.w 2(%sp),%d0
|
move.w 2(%sp),%d0
|
||||||
bsr .Lfc_applyMask
|
APPLY_MASK_INLINE \color
|
||||||
.Lsb_done\@:
|
.Lsb_done\@:
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
| ---- CO_BODY macro: per-color full Bresenham loop body ----------
|
||||||
|
|
||||||
|
.macro CO_BODY color
|
||||||
|
.Lfc_loop_\color:
|
||||||
|
cmp.w %d3,%d2
|
||||||
|
bcs.w .Lfc_done
|
||||||
|
|
||||||
|
| --- Pair A: x range = (cx - bx, cx + bx)
|
||||||
|
move.w SP_FC_CX(%sp),%d0
|
||||||
|
move.w %d0,%d1
|
||||||
|
sub.w %d2,%d0
|
||||||
|
add.w %d2,%d1
|
||||||
|
COMPUTE_PAIR_MASKS
|
||||||
|
|
||||||
|
| Span A1: y = cy + by
|
||||||
|
move.w SP_FC_CY(%sp),%d0
|
||||||
|
add.w %d3,%d0
|
||||||
|
SPAN_BODY \color
|
||||||
|
|
||||||
|
| Span A2: y = cy - by
|
||||||
|
move.w SP_FC_CY(%sp),%d0
|
||||||
|
sub.w %d3,%d0
|
||||||
|
SPAN_BODY \color
|
||||||
|
|
||||||
|
| --- Pair B: x range = (cx - by, cx + by)
|
||||||
|
move.w SP_FC_CX(%sp),%d0
|
||||||
|
move.w %d0,%d1
|
||||||
|
sub.w %d3,%d0
|
||||||
|
add.w %d3,%d1
|
||||||
|
COMPUTE_PAIR_MASKS
|
||||||
|
|
||||||
|
| Span B1: y = cy + bx
|
||||||
|
move.w SP_FC_CY(%sp),%d0
|
||||||
|
add.w %d2,%d0
|
||||||
|
SPAN_BODY \color
|
||||||
|
|
||||||
|
| Span B2: y = cy - bx
|
||||||
|
move.w SP_FC_CY(%sp),%d0
|
||||||
|
sub.w %d2,%d0
|
||||||
|
SPAN_BODY \color
|
||||||
|
|
||||||
|
| --- Bresenham step
|
||||||
|
addq.w #1,%d3
|
||||||
|
tst.w %d4
|
||||||
|
bgt.s .Lfc_decBx_\color
|
||||||
|
add.w %d3,%d4
|
||||||
|
add.w %d3,%d4
|
||||||
|
addq.w #1,%d4
|
||||||
|
bra.w .Lfc_loop_\color
|
||||||
|
.Lfc_decBx_\color:
|
||||||
|
subq.w #1,%d2
|
||||||
|
add.w %d3,%d4
|
||||||
|
add.w %d3,%d4
|
||||||
|
sub.w %d2,%d4
|
||||||
|
sub.w %d2,%d4
|
||||||
|
addq.w #1,%d4
|
||||||
|
bra.w .Lfc_loop_\color
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
.globl _surface68kStFillCircle
|
.globl _surface68kStFillCircle
|
||||||
|
|
||||||
_surface68kStFillCircle:
|
_surface68kStFillCircle:
|
||||||
|
|
@ -142,10 +214,11 @@ _surface68kStFillCircle:
|
||||||
moveq #0,%d7
|
moveq #0,%d7
|
||||||
move.b SP_FC_COLOR(%sp),%d7
|
move.b SP_FC_COLOR(%sp),%d7
|
||||||
|
|
||||||
| LUT bases (PC-relative indexed has only 8-bit
|
| LUT bases. a5/a6 = mask LUTs (used by COMPUTE_PAIR_MASKS).
|
||||||
| displacement, so cache full pointers in a-regs).
|
| a2 = shared _gStRowOffsetLut (used by SPAN_BODY for y*160).
|
||||||
lea leftMaskLut(%pc),%a5
|
lea leftMaskLut(%pc),%a5
|
||||||
lea rightMaskLut(%pc),%a6
|
lea rightMaskLut(%pc),%a6
|
||||||
|
lea _gStRowOffsetLut,%a2
|
||||||
|
|
||||||
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
|
| loLong = ((c&1)?0xFFFF0000:0) | ((c&2)?0x0000FFFF:0)
|
||||||
moveq #0,%d5
|
moveq #0,%d5
|
||||||
|
|
@ -174,60 +247,50 @@ _surface68kStFillCircle:
|
||||||
moveq #1,%d4
|
moveq #1,%d4
|
||||||
sub.w %d2,%d4
|
sub.w %d2,%d4
|
||||||
|
|
||||||
.Lfc_loop:
|
| Dispatch on color (low 4 bits) -> 16 specialized loops.
|
||||||
cmp.w %d3,%d2
|
| Use a4 (gets overwritten in SPAN_BODY's first lea) as
|
||||||
bcs.w .Lfc_done
|
| dispatch scratch since a2 now holds yLut for the body.
|
||||||
|
and.w #0x0F,%d7
|
||||||
|
move.w %d7,%d0
|
||||||
|
add.w %d0,%d0
|
||||||
|
add.w %d0,%d0 | * 4 for bra.w table
|
||||||
|
lea .Lfc_table(%pc),%a4
|
||||||
|
jmp 0(%a4,%d0.w)
|
||||||
|
|
||||||
| --- Pair A: x range = (cx - bx, cx + bx)
|
.Lfc_table:
|
||||||
move.w SP_FC_CX(%sp),%d0
|
bra.w .Lfc_loop_0
|
||||||
move.w %d0,%d1
|
bra.w .Lfc_loop_1
|
||||||
sub.w %d2,%d0 | left = cx - bx
|
bra.w .Lfc_loop_2
|
||||||
add.w %d2,%d1 | right = cx + bx
|
bra.w .Lfc_loop_3
|
||||||
COMPUTE_PAIR_MASKS
|
bra.w .Lfc_loop_4
|
||||||
|
bra.w .Lfc_loop_5
|
||||||
|
bra.w .Lfc_loop_6
|
||||||
|
bra.w .Lfc_loop_7
|
||||||
|
bra.w .Lfc_loop_8
|
||||||
|
bra.w .Lfc_loop_9
|
||||||
|
bra.w .Lfc_loop_10
|
||||||
|
bra.w .Lfc_loop_11
|
||||||
|
bra.w .Lfc_loop_12
|
||||||
|
bra.w .Lfc_loop_13
|
||||||
|
bra.w .Lfc_loop_14
|
||||||
|
bra.w .Lfc_loop_15
|
||||||
|
|
||||||
| Span A1: y = cy + by
|
CO_BODY 0
|
||||||
move.w SP_FC_CY(%sp),%d0
|
CO_BODY 1
|
||||||
add.w %d3,%d0
|
CO_BODY 2
|
||||||
SPAN_BODY
|
CO_BODY 3
|
||||||
|
CO_BODY 4
|
||||||
| Span A2: y = cy - by
|
CO_BODY 5
|
||||||
move.w SP_FC_CY(%sp),%d0
|
CO_BODY 6
|
||||||
sub.w %d3,%d0
|
CO_BODY 7
|
||||||
SPAN_BODY
|
CO_BODY 8
|
||||||
|
CO_BODY 9
|
||||||
| --- Pair B: x range = (cx - by, cx + by)
|
CO_BODY 10
|
||||||
move.w SP_FC_CX(%sp),%d0
|
CO_BODY 11
|
||||||
move.w %d0,%d1
|
CO_BODY 12
|
||||||
sub.w %d3,%d0 | left = cx - by
|
CO_BODY 13
|
||||||
add.w %d3,%d1 | right = cx + by
|
CO_BODY 14
|
||||||
COMPUTE_PAIR_MASKS
|
CO_BODY 15
|
||||||
|
|
||||||
| Span B1: y = cy + bx
|
|
||||||
move.w SP_FC_CY(%sp),%d0
|
|
||||||
add.w %d2,%d0
|
|
||||||
SPAN_BODY
|
|
||||||
|
|
||||||
| Span B2: y = cy - bx
|
|
||||||
move.w SP_FC_CY(%sp),%d0
|
|
||||||
sub.w %d2,%d0
|
|
||||||
SPAN_BODY
|
|
||||||
|
|
||||||
| --- Bresenham step
|
|
||||||
addq.w #1,%d3
|
|
||||||
tst.w %d4
|
|
||||||
bgt.s .Lfc_decBx
|
|
||||||
add.w %d3,%d4
|
|
||||||
add.w %d3,%d4
|
|
||||||
addq.w #1,%d4
|
|
||||||
bra.w .Lfc_loop
|
|
||||||
.Lfc_decBx:
|
|
||||||
subq.w #1,%d2
|
|
||||||
add.w %d3,%d4
|
|
||||||
add.w %d3,%d4
|
|
||||||
sub.w %d2,%d4
|
|
||||||
sub.w %d2,%d4
|
|
||||||
addq.w #1,%d4
|
|
||||||
bra.w .Lfc_loop
|
|
||||||
|
|
||||||
|
|
||||||
.Lfc_done:
|
.Lfc_done:
|
||||||
|
|
@ -236,46 +299,6 @@ _surface68kStFillCircle:
|
||||||
rts
|
rts
|
||||||
|
|
||||||
|
|
||||||
| ---- Apply 4-plane mask at (a4) -------------------------------
|
|
||||||
| Input: d0.w = mask, d7.b = color, a4 = group ptr
|
|
||||||
| Output: a4 advanced by 8 (next group). Caller must NOT post-add 8.
|
|
||||||
| Trashes: d0, d1
|
|
||||||
| Subroutine, called via bsr from SPAN_BODY. Postinc on each plane
|
|
||||||
| RMW saves 4 cyc/plane vs displacement (12 vs 16 EA cyc).
|
|
||||||
|
|
||||||
.Lfc_applyMask:
|
|
||||||
move.w %d0,%d1
|
|
||||||
not.w %d1 | d1 = notMask
|
|
||||||
btst #0,%d7
|
|
||||||
beq.s .Lfc_am0a
|
|
||||||
or.w %d0,(%a4)+
|
|
||||||
bra.s .Lfc_am1
|
|
||||||
.Lfc_am0a:
|
|
||||||
and.w %d1,(%a4)+
|
|
||||||
.Lfc_am1:
|
|
||||||
btst #1,%d7
|
|
||||||
beq.s .Lfc_am1a
|
|
||||||
or.w %d0,(%a4)+
|
|
||||||
bra.s .Lfc_am2
|
|
||||||
.Lfc_am1a:
|
|
||||||
and.w %d1,(%a4)+
|
|
||||||
.Lfc_am2:
|
|
||||||
btst #2,%d7
|
|
||||||
beq.s .Lfc_am2a
|
|
||||||
or.w %d0,(%a4)+
|
|
||||||
bra.s .Lfc_am3
|
|
||||||
.Lfc_am2a:
|
|
||||||
and.w %d1,(%a4)+
|
|
||||||
.Lfc_am3:
|
|
||||||
btst #3,%d7
|
|
||||||
beq.s .Lfc_am3a
|
|
||||||
or.w %d0,(%a4)+
|
|
||||||
rts
|
|
||||||
.Lfc_am3a:
|
|
||||||
and.w %d1,(%a4)+
|
|
||||||
rts
|
|
||||||
|
|
||||||
|
|
||||||
.align 2
|
.align 2
|
||||||
| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
|
| leftMaskLut[i] = (1 << (16 - i)) - 1, indexed by bitFirst (0..15)
|
||||||
leftMaskLut:
|
leftMaskLut:
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
//
|
//
|
||||||
// M2 scope:
|
// M2 scope:
|
||||||
// * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
|
// * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
|
||||||
// * Chunky 4bpp to word-interleaved ST planar c2p at present time.
|
// * Word-interleaved ST planar buffer copied to the screen at present time (halPresent).
|
||||||
//
|
//
|
||||||
// M2.5 scope (per-band palette / SCB emulation):
|
// M2.5 scope (per-band palette / SCB emulation):
|
||||||
// * halPresent scans the SurfaceT's SCB array and builds a compact
|
// * halPresent scans the SurfaceT's SCB array and builds a compact
|
||||||
|
|
@ -136,17 +136,9 @@ static inline __attribute__((always_inline)) uint8_t stPlanarGetPixel(const StPl
|
||||||
}
|
}
|
||||||
static uint16_t quantizeColorToSt(uint16_t orgb);
|
static uint16_t quantizeColorToSt(uint16_t orgb);
|
||||||
static void flattenScbPalettes(const SurfaceT *src);
|
static void flattenScbPalettes(const SurfaceT *src);
|
||||||
static void initC2pLut(void);
|
|
||||||
static void writeDiagnostics(void);
|
static void writeDiagnostics(void);
|
||||||
static long writePrevPaletteRegs(void);
|
static long writePrevPaletteRegs(void);
|
||||||
|
|
||||||
// Provided by src/port/atarist/c2p.s.
|
|
||||||
extern void chunkyToPlanarRowSt(const uint8_t *src,
|
|
||||||
uint16_t *dst,
|
|
||||||
uint16_t groupStart,
|
|
||||||
uint16_t groupEnd,
|
|
||||||
const uint8_t *lut);
|
|
||||||
|
|
||||||
static __attribute__((interrupt_handler)) void timerBIsr(void);
|
static __attribute__((interrupt_handler)) void timerBIsr(void);
|
||||||
static __attribute__((interrupt_handler)) void vblIsr(void);
|
static __attribute__((interrupt_handler)) void vblIsr(void);
|
||||||
static void buildTransitions(const SurfaceT *src);
|
static void buildTransitions(const SurfaceT *src);
|
||||||
|
|
@ -201,72 +193,11 @@ static void (*gOldTimerBVec)(void) = NULL;
|
||||||
// SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
|
// SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
|
||||||
// (and every frame of the keys demo after the initial paint) SCB and
|
// (and every frame of the keys demo after the initial paint) SCB and
|
||||||
// palette never change, so caching and skipping those passes keeps
|
// palette never change, so caching and skipping those passes keeps
|
||||||
// rect presents down to just the c2p work.
|
// rect presents down to just the screen blit.
|
||||||
static uint8_t gCachedScb [SURFACE_HEIGHT];
|
static uint8_t gCachedScb [SURFACE_HEIGHT];
|
||||||
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
|
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
|
||||||
static bool gCacheValid = false;
|
static bool gCacheValid = false;
|
||||||
|
|
||||||
// 256-long plane-spread LUT for the asm sprite SAVE path (defined in
|
|
||||||
// spriteAsm.s). For plane byte b, LUT[b] is a 32-bit value where each
|
|
||||||
// of b's 8 bits is placed at the bit-0 position of the corresponding
|
|
||||||
// pixel's nibble inside a 4-byte chunky long. The asm shifts the LUT
|
|
||||||
// entry left by N to get plane N's contribution; OR'd across 4 planes
|
|
||||||
// gives the full chunky long. Initialized lazily.
|
|
||||||
//
|
|
||||||
// LUT used by surface68kStSpriteSaveByteAligned. The asm reads via
|
|
||||||
// `move.l (a_ptr, d0.l), d4` which requires the LUT to be long-
|
|
||||||
// aligned -- and TOS .PRG BSS only does 2-byte alignment. Worse,
|
|
||||||
// the cascading offsets from the odd-sized gC2pLut put even
|
|
||||||
// `uint32_t` BSS slots at addr mod 4 == 2.
|
|
||||||
//
|
|
||||||
// Fix: malloc the LUT. mintlib's malloc returns long-aligned memory.
|
|
||||||
// The pointer is passed to the asm via the C-side wrapper (so the
|
|
||||||
// asm reads it from the stack, where it's guaranteed long-aligned
|
|
||||||
// regardless of where the static pointer slot lives).
|
|
||||||
static uint32_t *gStPlaneSpreadLutPtr = NULL;
|
|
||||||
static bool gStPlaneSpreadLutReady = false;
|
|
||||||
|
|
||||||
static bool initStPlaneSpreadLut(void) {
|
|
||||||
int b;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
if (gStPlaneSpreadLutReady) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
gStPlaneSpreadLutPtr = (uint32_t *)malloc(256 * sizeof(uint32_t));
|
|
||||||
if (gStPlaneSpreadLutPtr == NULL) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (b = 0; b < 256; b++) {
|
|
||||||
uint32_t v = 0u;
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
if (b & (0x80 >> i)) {
|
|
||||||
int byteIdx = i >> 1;
|
|
||||||
int isHigh = ((i & 1) == 0);
|
|
||||||
int bitInLong = (3 - byteIdx) * 8 + (isHigh ? 4 : 0);
|
|
||||||
v |= (uint32_t)1u << bitInLong;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
gStPlaneSpreadLutPtr[b] = v;
|
|
||||||
}
|
|
||||||
gStPlaneSpreadLutReady = true;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// 4 KB chunky-to-planar lookup table consumed by chunkyToPlanarRowSt
|
|
||||||
// (src/port/atarist/c2p.s). Layout: gC2pLut[src*16 + pos*4 + plane]
|
|
||||||
// = the 2-bit plane-byte contribution for source byte `src` at
|
|
||||||
// byte-position `pos` (0..3 within a 4-byte chunk) going to plane
|
|
||||||
// `plane`. Bit positions inside the byte are (7-2*pos, 6-2*pos), so
|
|
||||||
// the same table feeds both halves of an ST plane word: positions
|
|
||||||
// 0..3 land in the high byte, 4..7 (re-indexed mod 4) in the low
|
|
||||||
// byte. Built once by initC2pLut on the first halPresent call.
|
|
||||||
/* Exported (no static) so spriteAsm.s can `lea _gC2pLut, %a2`. */
|
|
||||||
uint8_t gC2pLut[4 * 1024];
|
|
||||||
static bool gC2pLutReady = false;
|
|
||||||
|
|
||||||
// ----- Internal helpers (alphabetical) -----
|
// ----- Internal helpers (alphabetical) -----
|
||||||
|
|
||||||
// Scan the surface's SCB and record one transition entry for each
|
// Scan the surface's SCB and record one transition entry for each
|
||||||
|
|
@ -350,37 +281,6 @@ static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Build the 4 KB chunky-to-planar lookup table consumed by
|
|
||||||
// chunkyToPlanarRowSt. Same layout/contents as the Amiga c2p LUT;
|
|
||||||
// see src/port/atarist/c2p.s for the addressing math.
|
|
||||||
static void initC2pLut(void) {
|
|
||||||
uint16_t pos;
|
|
||||||
uint16_t plane;
|
|
||||||
uint16_t src;
|
|
||||||
uint8_t highShift;
|
|
||||||
uint8_t lowShift;
|
|
||||||
uint8_t highBit;
|
|
||||||
uint8_t lowBit;
|
|
||||||
|
|
||||||
if (gC2pLutReady) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (src = 0; src < 256; src++) {
|
|
||||||
for (pos = 0; pos < 4; pos++) {
|
|
||||||
highShift = (uint8_t)(7 - 2 * pos);
|
|
||||||
lowShift = (uint8_t)(6 - 2 * pos);
|
|
||||||
for (plane = 0; plane < 4; plane++) {
|
|
||||||
highBit = (uint8_t)(((src >> 4) >> plane) & 1);
|
|
||||||
lowBit = (uint8_t)(((src & 0x0F) >> plane) & 1);
|
|
||||||
gC2pLut[src * 16 + pos * 4 + plane] =
|
|
||||||
(uint8_t)((highBit << highShift) | (lowBit << lowShift));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
gC2pLutReady = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
|
// 12-bit $0RGB to STF 9-bit palette register (drops the low bit of
|
||||||
// each 4-bit channel).
|
// each 4-bit channel).
|
||||||
static uint16_t quantizeColorToSt(uint16_t orgb) {
|
static uint16_t quantizeColorToSt(uint16_t orgb) {
|
||||||
|
|
@ -619,11 +519,8 @@ void halPresent(const SurfaceT *src) {
|
||||||
}
|
}
|
||||||
refreshPaletteStateIfNeeded(src);
|
refreshPaletteStateIfNeeded(src);
|
||||||
|
|
||||||
// Phase 9: planar shadow -> screen RAM. Same dirty-word band
|
// Planar buffer -> screen RAM. Each dirty word covers 4 pixels
|
||||||
// tracking the c2p path used; just memcpy the planar bytes for
|
// (a quarter of an 8-byte group). Round to whole groups for a
|
||||||
// each band instead of running c2p on the chunky shadow. Each
|
|
||||||
// dirty word covers 4 pixels = 1/4 of one group = quarter of an
|
|
||||||
// 8-byte group. We round to whole groups (8 bytes each) for a
|
|
||||||
// simple aligned memcpy, since planar groups are the natural
|
// simple aligned memcpy, since planar groups are the natural
|
||||||
// copy unit.
|
// copy unit.
|
||||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
||||||
|
|
@ -720,8 +617,11 @@ extern void surface68kStFillCircle(uint8_t *base, uint16_t cx, uint16_t cy, uint
|
||||||
extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color);
|
extern void surface68kStFillRectSingleGroup(uint8_t *firstGroupPtr, uint16_t mask, uint16_t h, uint8_t color);
|
||||||
extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color);
|
extern void surface68kStFillRectMulti(uint8_t *base, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t color);
|
||||||
extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong);
|
extern void surface68kStLongFill(uint8_t *dst, uint16_t numGroups, uint32_t loLong, uint32_t hiLong);
|
||||||
extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstChunky, const uint32_t *lut);
|
extern void surface68kStTileFill8x8(uint8_t *firstGroupPtr, uint16_t mask, uint8_t color);
|
||||||
extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunky, const uint8_t *c2pLut);
|
extern void surface68kStSprite16x16Save(uint8_t *base, uint16_t x, uint16_t y, uint8_t *dstBuf);
|
||||||
|
extern void surface68kStSprite16x16Restore(uint8_t *base, uint16_t x, uint16_t y, const uint8_t *srcBuf);
|
||||||
|
extern void surface68kStSpriteSaveByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, uint8_t *dstPlaneBytes);
|
||||||
|
extern void surface68kStSpriteRestoreByteAligned(uint8_t *base, uint16_t x, uint16_t y, uint16_t w, uint16_t h, const uint8_t *srcPlaneBytes);
|
||||||
|
|
||||||
|
|
||||||
// Phase 9: clear the entire planar buffer to a 4-bit color. Build an
|
// Phase 9: clear the entire planar buffer to a 4-bit color. Build an
|
||||||
|
|
@ -1262,17 +1162,12 @@ void halTileFillPlanes(SurfaceT *s, uint8_t bx, uint8_t by, uint8_t colorIndex)
|
||||||
group = (uint16_t)((uint16_t)bx >> 1);
|
group = (uint16_t)((uint16_t)bx >> 1);
|
||||||
halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu;
|
halfMask = ((bx & 1u) == 0u) ? 0xFF00u : 0x00FFu;
|
||||||
gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP;
|
gp = pd->base + (uint16_t)by * 8u * ST_BYTES_PER_ROW + group * ST_BYTES_PER_GROUP;
|
||||||
surface68kStFillRectSingleGroup(gp, halfMask, TILE_PIXELS_PER_SIDE, colorIndex);
|
/* Phase 10 final: specialized 8x8 unrolled tile-fill skips the
|
||||||
|
* generic FRG_LOOP's per-row subq+bne overhead. */
|
||||||
|
surface68kStTileFill8x8(gp, halfMask, colorIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Phase 10: group-aware tile paste. Per row: extract 8 pixels from
|
|
||||||
// 4 chunky bytes, build 4 plane bytes (one per plane), drop them
|
|
||||||
// into the high or low half of the 4 plane words at this group --
|
|
||||||
// 4 word RMWs per row instead of 64 per-pixel calls.
|
|
||||||
static const uint8_t kStTileBitLut[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
|
|
||||||
|
|
||||||
|
|
||||||
// Phase 10: tile paste/snap reuse the asm sprite save/restore
|
// Phase 10: tile paste/snap reuse the asm sprite save/restore
|
||||||
// helpers -- identical per-row work patterns at byte-aligned
|
// helpers -- identical per-row work patterns at byte-aligned
|
||||||
// positions. Width 8 = single tile column = single half-group
|
// positions. Width 8 = single tile column = single half-group
|
||||||
|
|
@ -1301,14 +1196,25 @@ void halTilePastePlanes(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *ti
|
||||||
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
|
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
|
||||||
+ group * ST_BYTES_PER_GROUP
|
+ group * ST_BYTES_PER_GROUP
|
||||||
+ (uint16_t)(bx & 1u);
|
+ (uint16_t)(bx & 1u);
|
||||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
(void)row;
|
||||||
dstAddr[0] = tileBytes[0];
|
#define ST_TILE_PASTE_ROW \
|
||||||
dstAddr[2] = tileBytes[1];
|
do { \
|
||||||
dstAddr[4] = tileBytes[2];
|
dstAddr[0] = tileBytes[0]; \
|
||||||
dstAddr[6] = tileBytes[3];
|
dstAddr[2] = tileBytes[1]; \
|
||||||
dstAddr += ST_BYTES_PER_ROW;
|
dstAddr[4] = tileBytes[2]; \
|
||||||
tileBytes += TILE_BYTES_PER_ROW;
|
dstAddr[6] = tileBytes[3]; \
|
||||||
}
|
dstAddr += ST_BYTES_PER_ROW; \
|
||||||
|
tileBytes += TILE_BYTES_PER_ROW; \
|
||||||
|
} while (0)
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
ST_TILE_PASTE_ROW;
|
||||||
|
#undef ST_TILE_PASTE_ROW
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1331,136 +1237,25 @@ void halTileSnapPlanes(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *til
|
||||||
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
|
+ (uint16_t)by * 8u * ST_BYTES_PER_ROW
|
||||||
+ group * ST_BYTES_PER_GROUP
|
+ group * ST_BYTES_PER_GROUP
|
||||||
+ (uint16_t)(bx & 1u);
|
+ (uint16_t)(bx & 1u);
|
||||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
(void)row;
|
||||||
tileOut[0] = srcAddr[0];
|
#define ST_TILE_SNAP_ROW \
|
||||||
tileOut[1] = srcAddr[2];
|
do { \
|
||||||
tileOut[2] = srcAddr[4];
|
tileOut[0] = srcAddr[0]; \
|
||||||
tileOut[3] = srcAddr[6];
|
tileOut[1] = srcAddr[2]; \
|
||||||
srcAddr += ST_BYTES_PER_ROW;
|
tileOut[2] = srcAddr[4]; \
|
||||||
tileOut += TILE_BYTES_PER_ROW;
|
tileOut[3] = srcAddr[6]; \
|
||||||
}
|
srcAddr += ST_BYTES_PER_ROW; \
|
||||||
}
|
tileOut += TILE_BYTES_PER_ROW; \
|
||||||
|
} while (0)
|
||||||
|
ST_TILE_SNAP_ROW;
|
||||||
/* Slow-path C versions kept (renamed) for reference; not in the
|
ST_TILE_SNAP_ROW;
|
||||||
* active call chain. */
|
ST_TILE_SNAP_ROW;
|
||||||
static void halTilePastePlanes_oldC(SurfaceT *dst, uint8_t bx, uint8_t by, const uint8_t *chunkyTile) {
|
ST_TILE_SNAP_ROW;
|
||||||
StPlanarT *pd;
|
ST_TILE_SNAP_ROW;
|
||||||
uint16_t group;
|
ST_TILE_SNAP_ROW;
|
||||||
uint16_t halfMask;
|
ST_TILE_SNAP_ROW;
|
||||||
uint16_t notHalfMask;
|
ST_TILE_SNAP_ROW;
|
||||||
bool isHigh;
|
#undef ST_TILE_SNAP_ROW
|
||||||
uint8_t *rowBase;
|
|
||||||
int16_t row;
|
|
||||||
int16_t pix;
|
|
||||||
uint16_t *pw;
|
|
||||||
uint8_t b;
|
|
||||||
uint8_t color;
|
|
||||||
uint8_t pb0;
|
|
||||||
uint8_t pb1;
|
|
||||||
uint8_t pb2;
|
|
||||||
uint8_t pb3;
|
|
||||||
uint8_t bit;
|
|
||||||
|
|
||||||
if (dst == NULL || chunkyTile == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
pd = (StPlanarT *)dst->portData;
|
|
||||||
if (pd == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
group = (uint16_t)((uint16_t)bx >> 1);
|
|
||||||
isHigh = ((bx & 1u) == 0u);
|
|
||||||
halfMask = isHigh ? 0xFF00u : 0x00FFu;
|
|
||||||
notHalfMask = (uint16_t)~halfMask;
|
|
||||||
rowBase = pd->base
|
|
||||||
+ (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
|
|
||||||
+ group * ST_BYTES_PER_GROUP;
|
|
||||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
|
||||||
pb0 = pb1 = pb2 = pb3 = 0u;
|
|
||||||
for (pix = 0; pix < TILE_PIXELS_PER_SIDE; pix++) {
|
|
||||||
b = chunkyTile[row * TILE_BYTES_PER_ROW + (pix >> 1)];
|
|
||||||
color = (pix & 1) ? (uint8_t)(b & 0x0Fu) : (uint8_t)(b >> 4);
|
|
||||||
bit = kStTileBitLut[pix];
|
|
||||||
if (color & 1u) { pb0 = (uint8_t)(pb0 | bit); }
|
|
||||||
if (color & 2u) { pb1 = (uint8_t)(pb1 | bit); }
|
|
||||||
if (color & 4u) { pb2 = (uint8_t)(pb2 | bit); }
|
|
||||||
if (color & 8u) { pb3 = (uint8_t)(pb3 | bit); }
|
|
||||||
}
|
|
||||||
pw = (uint16_t *)rowBase;
|
|
||||||
if (isHigh) {
|
|
||||||
pw[0] = (uint16_t)((pw[0] & notHalfMask) | ((uint16_t)pb0 << 8));
|
|
||||||
pw[1] = (uint16_t)((pw[1] & notHalfMask) | ((uint16_t)pb1 << 8));
|
|
||||||
pw[2] = (uint16_t)((pw[2] & notHalfMask) | ((uint16_t)pb2 << 8));
|
|
||||||
pw[3] = (uint16_t)((pw[3] & notHalfMask) | ((uint16_t)pb3 << 8));
|
|
||||||
} else {
|
|
||||||
pw[0] = (uint16_t)((pw[0] & notHalfMask) | (uint16_t)pb0);
|
|
||||||
pw[1] = (uint16_t)((pw[1] & notHalfMask) | (uint16_t)pb1);
|
|
||||||
pw[2] = (uint16_t)((pw[2] & notHalfMask) | (uint16_t)pb2);
|
|
||||||
pw[3] = (uint16_t)((pw[3] & notHalfMask) | (uint16_t)pb3);
|
|
||||||
}
|
|
||||||
rowBase += ST_BYTES_PER_ROW;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Phase 10: group-aware tile snap. Read 4 plane half-words for the
|
|
||||||
// row's group, distribute the 8 plane bits per plane into chunky
|
|
||||||
// nibbles. 4 word reads per row + 4 chunky bytes per row, no
|
|
||||||
// per-pixel function calls. Replaced by the asm-routed halTileSnapPlanes
|
|
||||||
// above; kept for reference as the C-only fallback.
|
|
||||||
static void halTileSnapPlanes_oldC(const SurfaceT *src, uint8_t bx, uint8_t by, uint8_t *chunkyTileOut) {
|
|
||||||
const StPlanarT *pd;
|
|
||||||
uint16_t group;
|
|
||||||
uint16_t halfShift;
|
|
||||||
const uint8_t *rowBase;
|
|
||||||
int16_t row;
|
|
||||||
int16_t pair;
|
|
||||||
const uint16_t *pw;
|
|
||||||
uint8_t pb0;
|
|
||||||
uint8_t pb1;
|
|
||||||
uint8_t pb2;
|
|
||||||
uint8_t pb3;
|
|
||||||
uint8_t bitHi;
|
|
||||||
uint8_t bitLo;
|
|
||||||
uint8_t hi;
|
|
||||||
uint8_t lo;
|
|
||||||
|
|
||||||
if (src == NULL || chunkyTileOut == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
pd = (const StPlanarT *)src->portData;
|
|
||||||
if (pd == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
group = (uint16_t)((uint16_t)bx >> 1);
|
|
||||||
halfShift = ((bx & 1u) == 0u) ? 8u : 0u;
|
|
||||||
rowBase = pd->base
|
|
||||||
+ (uint16_t)by * TILE_PIXELS_PER_SIDE * ST_BYTES_PER_ROW
|
|
||||||
+ group * ST_BYTES_PER_GROUP;
|
|
||||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
|
||||||
pw = (const uint16_t *)rowBase;
|
|
||||||
pb0 = (uint8_t)(pw[0] >> halfShift);
|
|
||||||
pb1 = (uint8_t)(pw[1] >> halfShift);
|
|
||||||
pb2 = (uint8_t)(pw[2] >> halfShift);
|
|
||||||
pb3 = (uint8_t)(pw[3] >> halfShift);
|
|
||||||
for (pair = 0; pair < TILE_BYTES_PER_ROW; pair++) {
|
|
||||||
bitHi = kStTileBitLut[pair * 2];
|
|
||||||
bitLo = kStTileBitLut[pair * 2 + 1];
|
|
||||||
hi = 0u;
|
|
||||||
lo = 0u;
|
|
||||||
if (pb0 & bitHi) hi = (uint8_t)(hi | 1u);
|
|
||||||
if (pb1 & bitHi) hi = (uint8_t)(hi | 2u);
|
|
||||||
if (pb2 & bitHi) hi = (uint8_t)(hi | 4u);
|
|
||||||
if (pb3 & bitHi) hi = (uint8_t)(hi | 8u);
|
|
||||||
if (pb0 & bitLo) lo = (uint8_t)(lo | 1u);
|
|
||||||
if (pb1 & bitLo) lo = (uint8_t)(lo | 2u);
|
|
||||||
if (pb2 & bitLo) lo = (uint8_t)(lo | 4u);
|
|
||||||
if (pb3 & bitLo) lo = (uint8_t)(lo | 8u);
|
|
||||||
chunkyTileOut[row * TILE_BYTES_PER_ROW + pair] = (uint8_t)((hi << 4) | lo);
|
|
||||||
}
|
|
||||||
rowBase += ST_BYTES_PER_ROW;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1496,14 +1291,28 @@ void halTileCopyPlanes(SurfaceT *dst, uint8_t dstBx, uint8_t dstBy, const Surfac
|
||||||
+ (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW
|
+ (uint16_t)dstBy * 8u * ST_BYTES_PER_ROW
|
||||||
+ dstGroup * ST_BYTES_PER_GROUP
|
+ dstGroup * ST_BYTES_PER_GROUP
|
||||||
+ (uint16_t)(dstBx & 1u);
|
+ (uint16_t)(dstBx & 1u);
|
||||||
for (row = 0; row < TILE_PIXELS_PER_SIDE; row++) {
|
/* gcc-mint -O2 does NOT unroll the 8-iter byte-copy loop,
|
||||||
dstAddr[0] = srcAddr[0]; /* plane 0 byte (high or low half) */
|
* leaving cmpl + bnes loop overhead per row. Manual unroll
|
||||||
dstAddr[2] = srcAddr[2]; /* plane 1 */
|
* drops ~150 cyc/call. (void)row keeps the unused decl quiet. */
|
||||||
dstAddr[4] = srcAddr[4]; /* plane 2 */
|
(void)row;
|
||||||
dstAddr[6] = srcAddr[6]; /* plane 3 */
|
#define ST_TILE_COPY_ROW \
|
||||||
srcAddr += ST_BYTES_PER_ROW;
|
do { \
|
||||||
dstAddr += ST_BYTES_PER_ROW;
|
dstAddr[0] = srcAddr[0]; \
|
||||||
}
|
dstAddr[2] = srcAddr[2]; \
|
||||||
|
dstAddr[4] = srcAddr[4]; \
|
||||||
|
dstAddr[6] = srcAddr[6]; \
|
||||||
|
srcAddr += ST_BYTES_PER_ROW; \
|
||||||
|
dstAddr += ST_BYTES_PER_ROW; \
|
||||||
|
} while (0)
|
||||||
|
ST_TILE_COPY_ROW; /* row 0 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 1 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 2 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 3 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 4 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 5 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 6 */
|
||||||
|
ST_TILE_COPY_ROW; /* row 7 */
|
||||||
|
#undef ST_TILE_COPY_ROW
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1792,109 +1601,6 @@ void halBlitRectPlanes(SurfaceT *dst, int16_t x, int16_t y, const uint8_t *srcBy
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Phase 10 fast paths for save/restore. Hand-rolled asm
|
|
||||||
// (surface68kStSprite{Save,Restore}ByteAligned) does the chunky <->
|
|
||||||
// plane bit transpose via ASL+ROXL and walks rows/tile columns. The
|
|
||||||
// C wrappers below are kept as a fallback / reference; they're not
|
|
||||||
// in the critical path now that the asm versions are wired in.
|
|
||||||
static void stSpriteSaveByteAligned(const StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, uint8_t *dstChunkyBytes) {
|
|
||||||
int16_t bytesPerRow = (int16_t)(w >> 1);
|
|
||||||
int16_t tileCols = (int16_t)(w >> 3);
|
|
||||||
const uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
|
|
||||||
int16_t row;
|
|
||||||
int16_t tileCol;
|
|
||||||
|
|
||||||
for (row = 0; row < (int16_t)h; row++) {
|
|
||||||
uint8_t *dstRow = &dstChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
|
|
||||||
for (tileCol = 0; tileCol < tileCols; tileCol++) {
|
|
||||||
int16_t srcX = (int16_t)(x + tileCol * 8);
|
|
||||||
uint16_t group = (uint16_t)((uint16_t)srcX >> 4);
|
|
||||||
uint16_t shift = ((srcX & 8) == 0) ? 8u : 0u;
|
|
||||||
const uint16_t *pw = (const uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
|
|
||||||
uint8_t pb0 = (uint8_t)(pw[0] >> shift);
|
|
||||||
uint8_t pb1 = (uint8_t)(pw[1] >> shift);
|
|
||||||
uint8_t pb2 = (uint8_t)(pw[2] >> shift);
|
|
||||||
uint8_t pb3 = (uint8_t)(pw[3] >> shift);
|
|
||||||
int16_t pair;
|
|
||||||
for (pair = 0; pair < 4; pair++) {
|
|
||||||
uint8_t bitHi = (uint8_t)(0x80u >> (pair * 2));
|
|
||||||
uint8_t bitLo = (uint8_t)(0x80u >> (pair * 2 + 1));
|
|
||||||
uint8_t hi = 0u;
|
|
||||||
uint8_t lo = 0u;
|
|
||||||
if (pb0 & bitHi) { hi = (uint8_t)(hi | 1u); }
|
|
||||||
if (pb1 & bitHi) { hi = (uint8_t)(hi | 2u); }
|
|
||||||
if (pb2 & bitHi) { hi = (uint8_t)(hi | 4u); }
|
|
||||||
if (pb3 & bitHi) { hi = (uint8_t)(hi | 8u); }
|
|
||||||
if (pb0 & bitLo) { lo = (uint8_t)(lo | 1u); }
|
|
||||||
if (pb1 & bitLo) { lo = (uint8_t)(lo | 2u); }
|
|
||||||
if (pb2 & bitLo) { lo = (uint8_t)(lo | 4u); }
|
|
||||||
if (pb3 & bitLo) { lo = (uint8_t)(lo | 8u); }
|
|
||||||
dstRow[tileCol * 4 + pair] = (uint8_t)((hi << 4) | lo);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rowBase += ST_BYTES_PER_ROW;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void stSpriteRestoreByteAligned(StPlanarT *pd, int16_t x, int16_t y, uint16_t w, uint16_t h, const uint8_t *srcChunkyBytes) {
|
|
||||||
int16_t bytesPerRow = (int16_t)(w >> 1);
|
|
||||||
int16_t tileCols = (int16_t)(w >> 3);
|
|
||||||
uint8_t *rowBase = pd->base + (uint16_t)y * ST_BYTES_PER_ROW;
|
|
||||||
int16_t row;
|
|
||||||
int16_t tileCol;
|
|
||||||
|
|
||||||
for (row = 0; row < (int16_t)h; row++) {
|
|
||||||
const uint8_t *srcRow = &srcChunkyBytes[(uint16_t)row * (uint16_t)bytesPerRow];
|
|
||||||
for (tileCol = 0; tileCol < tileCols; tileCol++) {
|
|
||||||
uint8_t b0 = srcRow[tileCol * 4 + 0];
|
|
||||||
uint8_t b1 = srcRow[tileCol * 4 + 1];
|
|
||||||
uint8_t b2 = srcRow[tileCol * 4 + 2];
|
|
||||||
uint8_t b3 = srcRow[tileCol * 4 + 3];
|
|
||||||
uint8_t pb0 = 0u;
|
|
||||||
uint8_t pb1 = 0u;
|
|
||||||
uint8_t pb2 = 0u;
|
|
||||||
uint8_t pb3 = 0u;
|
|
||||||
uint8_t c;
|
|
||||||
int16_t dstX;
|
|
||||||
uint16_t group;
|
|
||||||
uint16_t *pw;
|
|
||||||
uint16_t halfMask;
|
|
||||||
uint16_t notHalfMask;
|
|
||||||
|
|
||||||
c = (uint8_t)(b0 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x80u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x80u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x80u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x80u);
|
|
||||||
c = (uint8_t)(b0 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x40u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x40u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x40u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x40u);
|
|
||||||
c = (uint8_t)(b1 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x20u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x20u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x20u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x20u);
|
|
||||||
c = (uint8_t)(b1 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x10u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x10u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x10u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x10u);
|
|
||||||
c = (uint8_t)(b2 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x08u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x08u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x08u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x08u);
|
|
||||||
c = (uint8_t)(b2 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x04u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x04u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x04u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x04u);
|
|
||||||
c = (uint8_t)(b3 >> 4); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x02u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x02u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x02u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x02u);
|
|
||||||
c = (uint8_t)(b3 & 0x0Fu); if (c & 1u) pb0 = (uint8_t)(pb0 | 0x01u); if (c & 2u) pb1 = (uint8_t)(pb1 | 0x01u); if (c & 4u) pb2 = (uint8_t)(pb2 | 0x01u); if (c & 8u) pb3 = (uint8_t)(pb3 | 0x01u);
|
|
||||||
|
|
||||||
dstX = (int16_t)(x + tileCol * 8);
|
|
||||||
group = (uint16_t)((uint16_t)dstX >> 4);
|
|
||||||
pw = (uint16_t *)(rowBase + group * ST_BYTES_PER_GROUP);
|
|
||||||
if ((dstX & 8) == 0) {
|
|
||||||
halfMask = 0xFF00u;
|
|
||||||
pw[0] = (uint16_t)((pw[0] & 0x00FFu) | ((uint16_t)pb0 << 8));
|
|
||||||
pw[1] = (uint16_t)((pw[1] & 0x00FFu) | ((uint16_t)pb1 << 8));
|
|
||||||
pw[2] = (uint16_t)((pw[2] & 0x00FFu) | ((uint16_t)pb2 << 8));
|
|
||||||
pw[3] = (uint16_t)((pw[3] & 0x00FFu) | ((uint16_t)pb3 << 8));
|
|
||||||
} else {
|
|
||||||
halfMask = 0x00FFu;
|
|
||||||
pw[0] = (uint16_t)((pw[0] & 0xFF00u) | (uint16_t)pb0);
|
|
||||||
pw[1] = (uint16_t)((pw[1] & 0xFF00u) | (uint16_t)pb1);
|
|
||||||
pw[2] = (uint16_t)((pw[2] & 0xFF00u) | (uint16_t)pb2);
|
|
||||||
pw[3] = (uint16_t)((pw[3] & 0xFF00u) | (uint16_t)pb3);
|
|
||||||
}
|
|
||||||
(void)halfMask;
|
|
||||||
(void)notHalfMask;
|
|
||||||
}
|
|
||||||
rowBase += ST_BYTES_PER_ROW;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies
|
// Phase 10: hoist y*160 to per-row, fold setPixel/getPixel bodies
|
||||||
// inline. Each pixel's group address differs only in (x), so we
|
// inline. Each pixel's group address differs only in (x), so we
|
||||||
// can compute base+row*160 once per row and just do per-pixel
|
// can compute base+row*160 once per row and just do per-pixel
|
||||||
|
|
@ -1916,11 +1622,16 @@ void halSpriteSavePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint16_t
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
|
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
|
||||||
* Asm walker does direct planar byte copy (LUT pointer unused). */
|
* Specialized 16x16 (the UBER ball-sprite size) skips the asm
|
||||||
|
* walker's per-row col-init + col-loop-check overhead. */
|
||||||
if ((x & 7) == 0 && (w & 7) == 0
|
if ((x & 7) == 0 && (w & 7) == 0
|
||||||
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
|
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
|
||||||
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
|
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
|
||||||
surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes, NULL);
|
if (w == 16u && h == 16u) {
|
||||||
|
surface68kStSprite16x16Save(pd->base, (uint16_t)x, (uint16_t)y, dstPlaneBytes);
|
||||||
|
} else {
|
||||||
|
surface68kStSpriteSaveByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, dstPlaneBytes);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1980,11 +1691,15 @@ void halSpriteRestorePlanes(SurfaceT *s, int16_t x, int16_t y, uint16_t w, uint1
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
|
/* Phase 10.5 fast path: byte-aligned, fully on-surface.
|
||||||
* Asm walker does direct planar byte copy (LUT pointer unused). */
|
* Specialized 16x16 (UBER ball-sprite) skips walker overhead. */
|
||||||
if ((x & 7) == 0 && (w & 7) == 0
|
if ((x & 7) == 0 && (w & 7) == 0
|
||||||
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
|
&& x >= 0 && (x + (int16_t)w) <= SURFACE_WIDTH
|
||||||
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
|
&& y >= 0 && (y + (int16_t)h) <= SURFACE_HEIGHT) {
|
||||||
surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes, NULL);
|
if (w == 16u && h == 16u) {
|
||||||
|
surface68kStSprite16x16Restore(pd->base, (uint16_t)x, (uint16_t)y, srcPlaneBytes);
|
||||||
|
} else {
|
||||||
|
surface68kStSpriteRestoreByteAligned(pd->base, (uint16_t)x, (uint16_t)y, w, h, srcPlaneBytes);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2049,10 +1764,11 @@ uint8_t halSamplePixel(const SurfaceT *s, int16_t x, int16_t y) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Phase 9: derive 160 chunky bytes per row from the word-interleaved
|
// Derive 160 chunky bytes per row from the word-interleaved planar
|
||||||
// planar buffer (20 groups x 4 plane words). Same shape as the Amiga's
|
// buffer (20 groups x 4 plane words). Same shape as the Amiga's
|
||||||
// amigaPlanesToChunkyRow but per-group instead of per-byte. Used by
|
// amigaPlanesToChunkyRow but per-group instead of per-byte. Used by
|
||||||
// halSurfaceHash and halSurfaceSaveFileChunky.
|
// halSurfaceHash to fold the planar surface into the same byte stream
|
||||||
|
// the chunky ports hash, so cross-port hash comparisons stay valid.
|
||||||
static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) {
|
static void stPlanarToChunkyRow(const StPlanarT *pd, int16_t y, uint8_t *dstChunkyRow) {
|
||||||
uint16_t group;
|
uint16_t group;
|
||||||
uint16_t p;
|
uint16_t p;
|
||||||
|
|
@ -2134,58 +1850,27 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Phase 9: read chunky from file into a temporary scratch buffer,
|
// On-disk format is the ST's native interleaved planar buffer; one
|
||||||
// then c2p once into the planar shadow. The .joeysurface file format
|
// fread fills it directly, no chunky scratch or c2p step.
|
||||||
// is still chunky 4bpp on disk (cross-port asset interchange); the
|
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||||
// in-memory representation is what changes.
|
|
||||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
|
||||||
StPlanarT *pd;
|
StPlanarT *pd;
|
||||||
uint8_t *scratch;
|
|
||||||
int16_t y;
|
|
||||||
bool ok;
|
|
||||||
|
|
||||||
pd = (StPlanarT *)dst->portData;
|
pd = (StPlanarT *)dst->portData;
|
||||||
if (pd == NULL) {
|
if (pd == NULL) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
scratch = (uint8_t *)malloc(SURFACE_PIXELS_SIZE);
|
return fread(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
|
||||||
if (scratch == NULL) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
ok = (fread(scratch, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE);
|
|
||||||
if (ok) {
|
|
||||||
if (!gC2pLutReady) {
|
|
||||||
initC2pLut();
|
|
||||||
}
|
|
||||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
|
||||||
const uint8_t *srcLine = &scratch[y * SURFACE_BYTES_PER_ROW];
|
|
||||||
uint16_t *dstLine = (uint16_t *)&pd->base[y * ST_BYTES_PER_ROW];
|
|
||||||
chunkyToPlanarRowSt(srcLine, dstLine, 0u, ST_GROUPS_PER_ROW, gC2pLut);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
free(scratch);
|
|
||||||
return ok;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Phase 9: derive chunky bytes from the planar shadow row by row,
|
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||||
// stream to file. Avoids needing a full 32 KB scratch buffer.
|
|
||||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
|
||||||
StPlanarT *pd;
|
StPlanarT *pd;
|
||||||
uint8_t chunkyRow[SURFACE_BYTES_PER_ROW];
|
|
||||||
int16_t y;
|
|
||||||
|
|
||||||
pd = (StPlanarT *)src->portData;
|
pd = (StPlanarT *)src->portData;
|
||||||
if (pd == NULL) {
|
if (pd == NULL) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (y = 0; y < SURFACE_HEIGHT; y++) {
|
return fwrite(pd->base, 1, ST_PLANAR_SIZE, fp) == ST_PLANAR_SIZE;
|
||||||
stPlanarToChunkyRow(pd, y, chunkyRow);
|
|
||||||
if (fwrite(chunkyRow, 1, SURFACE_BYTES_PER_ROW, fp) != SURFACE_BYTES_PER_ROW) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -50,19 +50,17 @@
|
||||||
| Trashes: d0, d1, a2
|
| Trashes: d0, d1, a2
|
||||||
|
|
||||||
.macro DL_PLOT color
|
.macro DL_PLOT color
|
||||||
| byteOff = y*160 + (x>>4)*8
|
| byteOff = y*160 + (x>>4)*8 (fits in 16 bits since
|
||||||
|
| surface is 32000 bytes < 32K). Skip ext.l + .l add
|
||||||
|
| + .l indexed lea -- all word-sized ops save 14 cyc/pixel.
|
||||||
move.w %d3,%d0
|
move.w %d3,%d0
|
||||||
ext.l %d0
|
add.w %d0,%d0 | y * 2 (word index)
|
||||||
move.l %d0,%d1
|
move.w (%a6,%d0.w),%d0 | d0 = y * 160
|
||||||
lsl.l #5,%d0 | y << 5
|
|
||||||
lsl.l #7,%d1 | y << 7
|
|
||||||
add.l %d1,%d0 | d0 = y * 160
|
|
||||||
move.w %d2,%d1
|
move.w %d2,%d1
|
||||||
lsr.w #4,%d1
|
lsr.w #4,%d1
|
||||||
lsl.w #3,%d1 | (x>>4) * 8
|
lsl.w #3,%d1 | (x>>4) * 8
|
||||||
ext.l %d1
|
add.w %d1,%d0 | d0 = byteOff (fits in 16 bits)
|
||||||
add.l %d1,%d0 | d0 = byteOff
|
lea 0(%a3,%d0.w),%a2 | a2 = base + byteOff
|
||||||
lea 0(%a3,%d0.l),%a2 | a2 = base + byteOff
|
|
||||||
| d1 = bitMask, d0 = notMask
|
| d1 = bitMask, d0 = notMask
|
||||||
move.w %d2,%d1
|
move.w %d2,%d1
|
||||||
and.w #15,%d1
|
and.w #15,%d1
|
||||||
|
|
@ -127,9 +125,11 @@ _surface68kStDrawLine:
|
||||||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
||||||
lea -SP_LOCAL(%sp),%sp
|
lea -SP_LOCAL(%sp),%sp
|
||||||
|
|
||||||
| Load base & lut.
|
| Load base & luts.
|
||||||
move.l SP_BASE(%sp),%a3
|
move.l SP_BASE(%sp),%a3
|
||||||
lea bitMaskWordLut(%pc),%a5
|
lea bitMaskWordLut(%pc),%a5
|
||||||
|
| a6 = yLut base (yp -> yp*160) for use in DL_PLOT.
|
||||||
|
lea _gStRowOffsetLut(%pc),%a6
|
||||||
|
|
||||||
| x = x0, y = y0
|
| x = x0, y = y0
|
||||||
move.w SP_X0(%sp),%d2
|
move.w SP_X0(%sp),%d2
|
||||||
|
|
@ -179,8 +179,8 @@ _surface68kStDrawLine:
|
||||||
and.w #0x0F,%d0
|
and.w #0x0F,%d0
|
||||||
add.w %d0,%d0
|
add.w %d0,%d0
|
||||||
add.w %d0,%d0 | * 4 for bra.w table
|
add.w %d0,%d0 | * 4 for bra.w table
|
||||||
lea .LdlStTable(%pc),%a6
|
lea .LdlStTable(%pc),%a2 | a2 scratch (a6 holds yLut)
|
||||||
jmp 0(%a6,%d0.w)
|
jmp 0(%a2,%d0.w)
|
||||||
|
|
||||||
.LdlStTable:
|
.LdlStTable:
|
||||||
bra.w .LdlStLoop_0
|
bra.w .LdlStLoop_0
|
||||||
|
|
@ -529,6 +529,129 @@ _surface68kStFillRectSingleGroup:
|
||||||
rts
|
rts
|
||||||
|
|
||||||
|
|
||||||
|
| ---- surface68kStTileFill8x8 ---------------------------------------
|
||||||
|
|
|
||||||
|
| Specialized 8x8 single-group fill: 16-way color dispatch + 8 rows
|
||||||
|
| fully unrolled. Drops the per-row subq+bne overhead that the
|
||||||
|
| generic FRG_LOOP pays. Used by halTileFillPlanes.
|
||||||
|
|
|
||||||
|
| void surface68kStTileFill8x8(uint8_t *firstGroupPtr,
|
||||||
|
| uint16_t mask,
|
||||||
|
| uint8_t color);
|
||||||
|
|
|
||||||
|
| Per row body: 4 plane RMW with postinc + lea 152(a3),a3 to next
|
||||||
|
| row. Row 7 skips the trailing lea (a3 not used after).
|
||||||
|
|
||||||
|
.equ SP_TF_SAVED, 16 | d3-d4/a2-a3 = 4 longs
|
||||||
|
.equ SP_TF_OFF, (SP_TF_SAVED + 4)
|
||||||
|
.equ SP_TF_PTR, SP_TF_OFF + 0
|
||||||
|
.equ SP_TF_MASK, SP_TF_OFF + 4 + 2
|
||||||
|
.equ SP_TF_COLOR, SP_TF_OFF + 8 + 3
|
||||||
|
|
||||||
|
|
||||||
|
.macro TF8_ROW_BARE color
|
||||||
|
.if ((\color) & 1)
|
||||||
|
or.w %d3,(%a3)+
|
||||||
|
.else
|
||||||
|
and.w %d4,(%a3)+
|
||||||
|
.endif
|
||||||
|
.if ((\color) & 2)
|
||||||
|
or.w %d3,(%a3)+
|
||||||
|
.else
|
||||||
|
and.w %d4,(%a3)+
|
||||||
|
.endif
|
||||||
|
.if ((\color) & 4)
|
||||||
|
or.w %d3,(%a3)+
|
||||||
|
.else
|
||||||
|
and.w %d4,(%a3)+
|
||||||
|
.endif
|
||||||
|
.if ((\color) & 8)
|
||||||
|
or.w %d3,(%a3)+
|
||||||
|
.else
|
||||||
|
and.w %d4,(%a3)+
|
||||||
|
.endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro TF8_ROW color
|
||||||
|
TF8_ROW_BARE \color
|
||||||
|
lea 152(%a3),%a3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro TF8_BODY color
|
||||||
|
.Ltf8_body_\color:
|
||||||
|
TF8_ROW \color | row 0
|
||||||
|
TF8_ROW \color | row 1
|
||||||
|
TF8_ROW \color | row 2
|
||||||
|
TF8_ROW \color | row 3
|
||||||
|
TF8_ROW \color | row 4
|
||||||
|
TF8_ROW \color | row 5
|
||||||
|
TF8_ROW \color | row 6
|
||||||
|
TF8_ROW_BARE \color | row 7 (no trailing lea)
|
||||||
|
bra.w .Ltf8_done
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.globl _surface68kStTileFill8x8
|
||||||
|
|
||||||
|
_surface68kStTileFill8x8:
|
||||||
|
movem.l %d3-%d4/%a2-%a3,-(%sp)
|
||||||
|
|
||||||
|
move.l SP_TF_PTR(%sp),%a3
|
||||||
|
move.w SP_TF_MASK(%sp),%d3
|
||||||
|
move.w %d3,%d4
|
||||||
|
not.w %d4
|
||||||
|
|
||||||
|
| Color dispatch
|
||||||
|
moveq #0,%d0
|
||||||
|
move.b SP_TF_COLOR(%sp),%d0
|
||||||
|
and.w #0x0F,%d0
|
||||||
|
add.w %d0,%d0
|
||||||
|
add.w %d0,%d0 | * 4 for bra.w table
|
||||||
|
lea .Ltf8_table(%pc),%a2
|
||||||
|
jmp 0(%a2,%d0.w)
|
||||||
|
|
||||||
|
.Ltf8_table:
|
||||||
|
bra.w .Ltf8_body_0
|
||||||
|
bra.w .Ltf8_body_1
|
||||||
|
bra.w .Ltf8_body_2
|
||||||
|
bra.w .Ltf8_body_3
|
||||||
|
bra.w .Ltf8_body_4
|
||||||
|
bra.w .Ltf8_body_5
|
||||||
|
bra.w .Ltf8_body_6
|
||||||
|
bra.w .Ltf8_body_7
|
||||||
|
bra.w .Ltf8_body_8
|
||||||
|
bra.w .Ltf8_body_9
|
||||||
|
bra.w .Ltf8_body_10
|
||||||
|
bra.w .Ltf8_body_11
|
||||||
|
bra.w .Ltf8_body_12
|
||||||
|
bra.w .Ltf8_body_13
|
||||||
|
bra.w .Ltf8_body_14
|
||||||
|
bra.w .Ltf8_body_15
|
||||||
|
|
||||||
|
TF8_BODY 0
|
||||||
|
TF8_BODY 1
|
||||||
|
TF8_BODY 2
|
||||||
|
TF8_BODY 3
|
||||||
|
TF8_BODY 4
|
||||||
|
TF8_BODY 5
|
||||||
|
TF8_BODY 6
|
||||||
|
TF8_BODY 7
|
||||||
|
TF8_BODY 8
|
||||||
|
TF8_BODY 9
|
||||||
|
TF8_BODY 10
|
||||||
|
TF8_BODY 11
|
||||||
|
TF8_BODY 12
|
||||||
|
TF8_BODY 13
|
||||||
|
TF8_BODY 14
|
||||||
|
TF8_BODY 15
|
||||||
|
|
||||||
|
.Ltf8_done:
|
||||||
|
movem.l (%sp)+,%d3-%d4/%a2-%a3
|
||||||
|
rts
|
||||||
|
|
||||||
|
|
||||||
| ---- surface68kStFillRectMulti -------------------------------------
|
| ---- surface68kStFillRectMulti -------------------------------------
|
||||||
|
|
|
|
||||||
| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
|
| Multi-group fillRect: groupFirst != groupLast. Caller pre-clips.
|
||||||
|
|
@ -782,6 +905,21 @@ frmRightMaskLut:
|
||||||
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
|
.word 0xFFF8, 0xFFFC, 0xFFFE, 0xFFFF
|
||||||
|
|
||||||
|
|
||||||
|
| 2-byte alignment so the word table below starts on an even address.
.align 2
|
||||||
|
| Shared y -> y*160 LUT. Used by drawLine (DL_PLOT), drawCircle
|
||||||
|
| (YP_REC), fillCircle (SPAN_BODY). 200 words = 400 bytes.
|
||||||
|
| Replaces a 44-cyc lsl.w #5 + lsl.w #7 + add.w shift chain with
|
||||||
|
| a 14-cyc indexed-word load. Exported so circle.s and fillCircle.s
|
||||||
|
| can reference it via absolute addressing without duplication.
|
| Entry y holds y*160 for y = 0..199; the largest value is
| 199*160 = 31840, which fits in a 16-bit word. The table is indexed
| by word, so callers must scale y by 2 to form the byte offset.
||||||
|
.globl _gStRowOffsetLut
|
||||||
|
_gStRowOffsetLut:
|
||||||
|
| li_y is an assemble-time counter, not a runtime value.
.set li_y, 0
|
||||||
|
| Emit one word per row, 200 rows in total.
.rept 200
|
||||||
|
.word li_y * 160
|
||||||
|
.set li_y, li_y + 1
|
||||||
|
.endr
|
||||||
|
|
||||||
|
|
||||||
| ---- surface68kStLongFill ----------------------------------------
|
| ---- surface68kStLongFill ----------------------------------------
|
||||||
|
|
|
|
||||||
| Bulk long-fill helper for full-row fills (surfaceClear, fillRect
|
| Bulk long-fill helper for full-row fills (surfaceClear, fillRect
|
||||||
|
|
|
||||||
|
|
@ -1,30 +1,19 @@
|
||||||
| ST byte-aligned sprite save / restore via 256-entry plane-spread
|
| ST byte-aligned sprite save / restore. Buffer holds plane-major
|
||||||
| LUT. The LUT entry for each plane byte value is a 32-bit "spread"
|
| bytes: per row, plane0/1/2/3 per tile col, for w/8 tile cols. The
|
||||||
| where each plane byte bit lands at the corresponding plane-0 bit
|
| inner per-tile-col macro is 4 byte copies (no chunky <-> planar
|
||||||
| position of the 4-byte chunky output. For plane N, we shift the
|
| conversion since the buffer matches the surface's plane layout).
|
||||||
| LUT entry left by N to put bits at the plane-N positions, then OR
|
|
||||||
| the 4 plane contributions together to get the chunky long.
|
|
||||||
|
|
|
||||||
| LUT layout (256 longs = 1 KB), populated by initStPlaneSpreadLut
|
|
||||||
| in hal.c:
|
|
||||||
|
|
|
||||||
| gStPlaneSpreadLut[b] for plane byte b:
|
|
||||||
| bit i of b (i = 0 = MSB = leftmost pixel) maps to bit
|
|
||||||
| bitInLong(i) = (3 - (i >> 1)) * 8 + ((i & 1) ? 0 : 4)
|
|
||||||
| of the long. Plane 0's bits land at nibble bit 0 of each
|
|
||||||
| chunky byte; left-shift the LUT entry by N for plane N.
|
|
||||||
|
|
|
|
||||||
| ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
|
| ABI: cdecl. d2-d7/a2-a6 callee-save. C signatures:
|
||||||
|
|
|
|
||||||
| void surface68kStSpriteSaveByteAligned(uint8_t *base,
|
| void surface68kStSpriteSaveByteAligned(uint8_t *base,
|
||||||
| uint16_t x, uint16_t y,
|
| uint16_t x, uint16_t y,
|
||||||
| uint16_t w, uint16_t h,
|
| uint16_t w, uint16_t h,
|
||||||
| uint8_t *dstChunky);
|
| uint8_t *dstPlaneBytes);
|
||||||
|
|
|
|
||||||
| void surface68kStSpriteRestoreByteAligned(uint8_t *base,
|
| void surface68kStSpriteRestoreByteAligned(uint8_t *base,
|
||||||
| uint16_t x, uint16_t y,
|
| uint16_t x, uint16_t y,
|
||||||
| uint16_t w, uint16_t h,
|
| uint16_t w, uint16_t h,
|
||||||
| const uint8_t *srcChunky);
|
| const uint8_t *srcPlaneBytes);
|
||||||
|
|
||||||
.text
|
.text
|
||||||
|
|
||||||
|
|
@ -36,19 +25,12 @@
|
||||||
.equ SP_Y, SP_OFF + 8 + 2
|
.equ SP_Y, SP_OFF + 8 + 2
|
||||||
.equ SP_W, SP_OFF + 12 + 2
|
.equ SP_W, SP_OFF + 12 + 2
|
||||||
.equ SP_H, SP_OFF + 16 + 2
|
.equ SP_H, SP_OFF + 16 + 2
|
||||||
.equ SP_CHUNKY, SP_OFF + 20
|
.equ SP_BUF, SP_OFF + 20
|
||||||
.equ SP_LUT, SP_OFF + 24
|
|
||||||
|
|
||||||
|
|
||||||
| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
|
| Per-tile-col SAVE: 4 plane bytes -> 4 contiguous bytes in buffer.
|
||||||
| a0 -> plane 0 byte (high or low half), strides 2 to next plane
|
| a0 -> plane 0 byte (high or low half), strides 2 to next plane
|
||||||
| a1 -> output planar bytes (advanced by 4)
|
| a1 -> output planar bytes (advanced by 4)
|
||||||
| a2 -> unused (LUT no longer needed)
|
|
||||||
|
|
|
||||||
| Phase 10.5: dropped chunky <-> planar conversion. The buffer holds
|
|
||||||
| plane-major bytes (per row: plane0, plane1, plane2, plane3 per
|
|
||||||
| tile col, for w/8 tile cols). 4 byte copies instead of 4 LUT
|
|
||||||
| lookups + shifts + ORs.
|
|
||||||
|
|
||||||
.macro SAVE_TILECOL
|
.macro SAVE_TILECOL
|
||||||
move.b (%a0),(%a1)+ | plane 0
|
move.b (%a0),(%a1)+ | plane 0
|
||||||
|
|
@ -64,13 +46,7 @@ _surface68kStSpriteSaveByteAligned:
|
||||||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
||||||
|
|
||||||
move.l SP_BASE(%sp),%a3
|
move.l SP_BASE(%sp),%a3
|
||||||
move.l SP_CHUNKY(%sp),%a1
|
move.l SP_BUF(%sp),%a1
|
||||||
| LUT pointer comes in via stack arg -- guaranteed
|
|
||||||
| long-aligned because gcc passes ptr args via
|
|
||||||
| move.l on a long-aligned sp slot. Avoids the BSS
|
|
||||||
| misalignment problem on TOS .PRG (BSS pads only to
|
|
||||||
| 2 bytes, even uint32_t slots can land at mod-4 = 2).
|
|
||||||
move.l SP_LUT(%sp),%a2
|
|
||||||
|
|
||||||
move.w SP_W(%sp),%d5
|
move.w SP_W(%sp),%d5
|
||||||
lsr.w #3,%d5 | d5 = tileCols
|
lsr.w #3,%d5 | d5 = tileCols
|
||||||
|
|
@ -128,10 +104,6 @@ _surface68kStSpriteSaveByteAligned:
|
||||||
| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
|
| Per-tile-col RESTORE: 4 contiguous bytes from buffer -> 4 plane bytes.
|
||||||
| a0 -> plane 0 byte (high or low half)
|
| a0 -> plane 0 byte (high or low half)
|
||||||
| a1 -> input planar bytes (advanced by 4)
|
| a1 -> input planar bytes (advanced by 4)
|
||||||
| a2 -> unused (LUT no longer needed)
|
|
||||||
|
|
|
||||||
| Phase 10.5: dropped chunky -> planar conversion. Buffer layout
|
|
||||||
| matches SAVE_TILECOL: per row, plane0/1/2/3 per tile col.
|
|
||||||
|
|
||||||
.macro RESTORE_TILECOL
|
.macro RESTORE_TILECOL
|
||||||
move.b (%a1)+,(%a0) | plane 0
|
move.b (%a1)+,(%a0) | plane 0
|
||||||
|
|
@ -147,8 +119,7 @@ _surface68kStSpriteRestoreByteAligned:
|
||||||
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
||||||
|
|
||||||
move.l SP_BASE(%sp),%a3
|
move.l SP_BASE(%sp),%a3
|
||||||
move.l SP_CHUNKY(%sp),%a1
|
move.l SP_BUF(%sp),%a1
|
||||||
move.l SP_LUT(%sp),%a2 | gC2pLut passed in
|
|
||||||
|
|
||||||
| tileCols is held in a5 (not d5) because the macro
|
| tileCols is held in a5 (not d5) because the macro
|
||||||
| trashes d5 (uses it for pb3).
|
| trashes d5 (uses it for pb3).
|
||||||
|
|
@ -200,3 +171,151 @@ _surface68kStSpriteRestoreByteAligned:
|
||||||
|
|
||||||
movem.l (%sp)+,%d2-%d7/%a2-%a6
|
movem.l (%sp)+,%d2-%d7/%a2-%a6
|
||||||
rts
|
rts
|
||||||
|
|
||||||
|
|
||||||
|
| ---- surface68kStSprite16x16Save / Restore -----------------------
|
||||||
|
|
|
||||||
|
| Specialized 16x16 sprite save/restore: 16 rows fully unrolled,
|
||||||
|
| 8 byte copies per row (2 tile cols), no col loop. Drops the asm
|
||||||
|
| walker's per-row col-init + col-loop-check overhead.
|
||||||
|
|
|
||||||
|
| void surface68kStSprite16x16Save(uint8_t *base,
|
||||||
|
| uint16_t x, uint16_t y,
|
||||||
|
| uint8_t *dstBuf);
|
||||||
|
|
|
||||||
|
| void surface68kStSprite16x16Restore(uint8_t *base,
|
||||||
|
| uint16_t x, uint16_t y,
|
||||||
|
| const uint8_t *srcBuf);
|
||||||
|
|
|
||||||
|
| Caller guarantees x is byte-aligned (x mod 8 == 0). Two halfOff
|
||||||
|
| variants dispatch on (x & 8): halfOff=0 reads/writes within one
|
||||||
|
| group (offsets 0/2/4/6 high half + 1/3/5/7 low half). halfOff=1
|
||||||
|
| spans two groups (low half of group N + high half of group N+1).
|
||||||
|
|
||||||
|
| Stack-frame offsets for the 16x16 save/restore entry points, valid
| immediately after their "movem.l %d2/%a2-%a3,-(%sp)" prologue.
.equ SP16_SAVED, 12 | d2/a2-a3 = 3 longs
|
||||||
|
| +4 steps over the return address that sits above the saved regs.
.equ SP16_OFF, (SP16_SAVED + 4)
|
||||||
|
.equ SP16_BASE, SP16_OFF + 0
|
||||||
|
| x and y are uint16_t in the C prototypes but occupy 4-byte argument
| slots; the extra +2 addresses the low-order word of each slot.
.equ SP16_X, SP16_OFF + 4 + 2
|
||||||
|
.equ SP16_Y, SP16_OFF + 8 + 2
|
||||||
|
.equ SP16_BUF, SP16_OFF + 12
|
||||||
|
|
||||||
|
|
||||||
|
| Macro: setup a0 = base + y*160 + group*8 + halfOff
|
||||||
|
| Clobbers: d0, d1, d2, a3. On exit a0 = row start, d0 = halfOff.
|
| Reads its arguments via the SP16_* offsets, so it must be expanded
| directly after the 12-byte movem prologue, before sp moves again.
||||||
|
|
||||||
|
.macro SP16_SETUP_A0
|
||||||
|
move.l SP16_BASE(%sp),%a3
|
||||||
|
move.w SP16_X(%sp),%d0
|
||||||
|
move.w SP16_Y(%sp),%d1
|
||||||
|
|
||||||
|
| a0 = base + y*160, computed as y*32 + y*128 in 32-bit arithmetic
|
||||||
|
ext.l %d1
|
||||||
|
move.l %d1,%d2
|
||||||
|
lsl.l #5,%d1 | d1 = y * 32
|
||||||
|
lsl.l #7,%d2 | d2 = y * 128
|
||||||
|
add.l %d2,%d1
|
||||||
|
lea 0(%a3,%d1.l),%a0
|
||||||
|
|
||||||
|
| a0 += (x>>4) * 8   (index of the 16-pixel group, 8 bytes per group)
|
||||||
|
move.w %d0,%d1
|
||||||
|
lsr.w #4,%d1
|
||||||
|
lsl.w #3,%d1
|
||||||
|
ext.l %d1
|
||||||
|
add.l %d1,%a0
|
||||||
|
|
||||||
|
| a0 += halfOff (= (x & 8) >> 3)
|
||||||
|
and.w #8,%d0
|
||||||
|
lsr.w #3,%d0
|
||||||
|
ext.l %d0
|
||||||
|
add.l %d0,%a0
|
||||||
|
| d0 = halfOff (0 or 1) for downstream dispatch
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
| Save a 16x16 region: copies 16 rows x 8 bytes = 128 bytes from the
| surface into the buffer passed as the fourth argument (SP16_BUF).
| Per row the buffer receives col 0 planes 0..3, then col 1 planes
| 0..3. a0 walks the surface (160 bytes per row), a1 walks the buffer.
| SP16_SETUP_A0 clobbers d2 and a3, which is why they are saved here;
| a2 is saved and restored as well but is not used by this routine.
.globl _surface68kStSprite16x16Save
|
||||||
|
|
||||||
|
_surface68kStSprite16x16Save:
|
||||||
|
movem.l %d2/%a2-%a3,-(%sp)
|
||||||
|
SP16_SETUP_A0
|
||||||
|
move.l SP16_BUF(%sp),%a1
|
||||||
|
|
||||||
|
| d0 = halfOff from SP16_SETUP_A0: 0 -> fall through, 1 -> .Lsp16s_low
tst.w %d0
|
||||||
|
bne.w .Lsp16s_low
|
||||||
|
|
||||||
|
| halfOff=0: a0 at high half. Col 0 = high (offsets
|
||||||
|
| 0,2,4,6); col 1 = low (offsets 1,3,5,7).
|
||||||
|
.rept 16
|
||||||
|
move.b (%a0),(%a1)+
|
||||||
|
move.b 2(%a0),(%a1)+
|
||||||
|
move.b 4(%a0),(%a1)+
|
||||||
|
move.b 6(%a0),(%a1)+
|
||||||
|
move.b 1(%a0),(%a1)+
|
||||||
|
move.b 3(%a0),(%a1)+
|
||||||
|
move.b 5(%a0),(%a1)+
|
||||||
|
move.b 7(%a0),(%a1)+
|
||||||
|
lea 160(%a0),%a0 | advance to the same column on the next row
|
||||||
|
.endr
|
||||||
|
bra.w .Lsp16s_done
|
||||||
|
|
||||||
|
.Lsp16s_low:
|
||||||
|
| halfOff=1: a0 = group base + 1 (low half). Col 0 = low of
|
||||||
|
| this group, offsets 0,2,4,6 from a0. Col 1 = high of
|
||||||
|
| next group (group base + 8), at offsets 7,9,11,13 from a0.
|
||||||
|
.rept 16
|
||||||
|
move.b (%a0),(%a1)+
|
||||||
|
move.b 2(%a0),(%a1)+
|
||||||
|
move.b 4(%a0),(%a1)+
|
||||||
|
move.b 6(%a0),(%a1)+
|
||||||
|
move.b 7(%a0),(%a1)+
|
||||||
|
move.b 9(%a0),(%a1)+
|
||||||
|
move.b 11(%a0),(%a1)+
|
||||||
|
move.b 13(%a0),(%a1)+
|
||||||
|
lea 160(%a0),%a0 | advance to the same column on the next row
|
||||||
|
.endr
|
||||||
|
|
||||||
|
| Common exit: pop the registers pushed in the prologue.
.Lsp16s_done:
|
||||||
|
movem.l (%sp)+,%d2/%a2-%a3
|
||||||
|
rts
|
||||||
|
|
||||||
|
|
||||||
|
| Restore a 16x16 region: copies 16 rows x 8 bytes = 128 bytes from
| the buffer passed as the fourth argument (SP16_BUF) back into the
| surface. Byte order mirrors _surface68kStSprite16x16Save: per row,
| col 0 planes 0..3, then col 1 planes 0..3. a0 walks the surface
| (160 bytes per row), a1 walks the buffer.
| SP16_SETUP_A0 clobbers d2 and a3, which is why they are saved here;
| a2 is saved and restored as well but is not used by this routine.
.globl _surface68kStSprite16x16Restore
|
||||||
|
|
||||||
|
_surface68kStSprite16x16Restore:
|
||||||
|
movem.l %d2/%a2-%a3,-(%sp)
|
||||||
|
SP16_SETUP_A0
|
||||||
|
move.l SP16_BUF(%sp),%a1
|
||||||
|
|
||||||
|
| d0 = halfOff from SP16_SETUP_A0: 0 -> fall through, 1 -> .Lsp16r_low
tst.w %d0
|
||||||
|
bne.w .Lsp16r_low
|
||||||
|
|
||||||
|
| halfOff=0: write high half (col 0, offsets 0,2,4,6) + low half
| (col 1, offsets 1,3,5,7) of the same group.
|
||||||
|
.rept 16
|
||||||
|
move.b (%a1)+,(%a0)
|
||||||
|
move.b (%a1)+,2(%a0)
|
||||||
|
move.b (%a1)+,4(%a0)
|
||||||
|
move.b (%a1)+,6(%a0)
|
||||||
|
move.b (%a1)+,1(%a0)
|
||||||
|
move.b (%a1)+,3(%a0)
|
||||||
|
move.b (%a1)+,5(%a0)
|
||||||
|
move.b (%a1)+,7(%a0)
|
||||||
|
lea 160(%a0),%a0 | advance to the same column on the next row
|
||||||
|
.endr
|
||||||
|
bra.w .Lsp16r_done
|
||||||
|
|
||||||
|
.Lsp16r_low:
|
||||||
|
| halfOff=1: a0 = group base + 1. Col 0 = low half of this group
| (offsets 0,2,4,6); col 1 = high half of the next group
| (offsets 7,9,11,13 from a0).
|
||||||
|
.rept 16
|
||||||
|
move.b (%a1)+,(%a0)
|
||||||
|
move.b (%a1)+,2(%a0)
|
||||||
|
move.b (%a1)+,4(%a0)
|
||||||
|
move.b (%a1)+,6(%a0)
|
||||||
|
move.b (%a1)+,7(%a0)
|
||||||
|
move.b (%a1)+,9(%a0)
|
||||||
|
move.b (%a1)+,11(%a0)
|
||||||
|
move.b (%a1)+,13(%a0)
|
||||||
|
lea 160(%a0),%a0 | advance to the same column on the next row
|
||||||
|
.endr
|
||||||
|
|
||||||
|
| Common exit: pop the registers pushed in the prologue.
.Lsp16r_done:
|
||||||
|
movem.l (%sp)+,%d2/%a2-%a3
|
||||||
|
rts
|
||||||
|
|
|
||||||
|
|
@ -614,12 +614,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||||
return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||||
return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -395,12 +395,12 @@ void halSurfaceCopyChunky(SurfaceT *dst, const SurfaceT *src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool halSurfaceLoadFileChunky(SurfaceT *dst, FILE *fp) {
|
bool halSurfaceLoadFile(SurfaceT *dst, FILE *fp) {
|
||||||
return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
return fread(dst->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool halSurfaceSaveFileChunky(const SurfaceT *src, FILE *fp) {
|
bool halSurfaceSaveFile(const SurfaceT *src, FILE *fp) {
|
||||||
return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
return fwrite(src->pixels, 1, SURFACE_PIXELS_SIZE, fp) == SURFACE_PIXELS_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue