// joeylib2/src/port/atarist/hal.c
//
// (Listing metadata from the source viewer: 551 lines, 18 KiB, C.)

// Atari ST HAL for M2 + M2.5.
//
// M2 scope:
// * XBIOS Setscreen to ST low-res (320x200x16, mode 0).
// * Chunky 4bpp to word-interleaved ST planar c2p, done at halPresent time.
//
// M2.5 scope (per-band palette / SCB emulation):
// * halPresent scans the SurfaceT's SCB array and builds a compact
// transitions table: each entry is (start_line, palette_index)
// for a new palette region. For pattern.c's 8 uniform bands this
// is 8 entries; in the worst case it is 200 (one per scanline).
// * VBL ISR pre-loads the first band's palette, then programs
// MFP Timer B (event-count mode, TBDR = HBL delta to first
// transition) to fire at the END of the last scanline before
// the next band starts.
// * Timer B ISR advances the transition index, writes the new
// band's palette, and updates TBDR in place -- it does NOT
// stop/restart the timer (see the comment on timerBIsr for why) --
// so a later auto-reload picks up the next band length. With 8
// transitions per
// frame the ISR runs 8 times instead of 313 -- well under the
// ~147-HBL-fires-per-frame cap Hatari's MFP emulation imposes on
// event-count mode, and ~0.2% CPU overhead vs ~60% for per-HBL.
// * gLinePalettes is a flat pre-quantized (line, color)->$0RGB
// table built in halPresent by flattenScbPalettes; the ISR uses
// its first row per band as the source of 16 shifter writes.
//
// Deferred:
// * Takeover mode (direct shifter programming without TOS).
// * STE's extended palette bits (we drop to STF 9-bit for now).
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <mint/osbind.h>
#include "hal.h"
#include "surfaceInternal.h"
// ----- Constants -----
// Word-interleaved ST planar uses the same 160 bytes/scanline as our
// chunky source, but organized as 20 groups of 4 words per scanline,
// with each word holding the 16 one-bit samples for one bitplane.
#define ST_BYTES_PER_ROW 160
#define ST_GROUPS_PER_ROW 20
// Alignment (in bytes) applied to the screen base pointer in halInit;
// gScreenBuffer is over-allocated by this amount to make room for it.
#define ST_SCREEN_ALIGN 256
// Shifter palette registers: 16 words at $FFFF8240..$FFFF825F.
#define ST_PALETTE_REGS ((volatile uint16_t *)0xFFFF8240L)
// MFP hardware addresses.
#define ST_MFP_TBCR ((volatile uint8_t *)0xFFFFFA1BL) // Timer B control
#define ST_MFP_TBDR ((volatile uint8_t *)0xFFFFFA21L) // Timer B data
#define ST_MFP_ISRA ((volatile uint8_t *)0xFFFFFA0FL) // In-service A
// Values written to TBCR by the ISRs: timer stopped, or event-count mode.
#define MFP_TBCR_STOP 0x00
#define MFP_TBCR_EVENT 0x08
// Written to ISRA by both ISRs after they touch Timer B.
#define MFP_TB_CLEAR 0xFE // clear bit 0 of ISRA (Timer B)
// Exception-vector numbers passed to Setexc (= vector offset / 4).
#define VEC_VBL (0x70 / 4) // 68k autovector IRQ 4
#define VEC_MFP_TB (0x120 / 4) // MFP Timer B
// Interrupt number handed to Jenabint (halInit) and Jdisint (halShutdown).
#define INT_TIMER_B 8
// ----- Prototypes -----
// Forward declarations for every file-local helper so definitions can
// appear in any order below. timerBIsr and vblIsr are installed as
// exception handlers by halInit, hence the interrupt_handler attribute
// on their prototypes (the definitions further down omit it).
static uint16_t quantizeColorToSt(uint16_t orgb);
static void c2pRow(const uint8_t *src, uint16_t *dst, uint16_t groupStart, uint16_t groupEnd);
static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd);
static void flattenScbPalettes(const SurfaceT *src);
static void writeDiagnostics(void);
static long writePrevPaletteRegs(void);
static __attribute__((interrupt_handler)) void timerBIsr(void);
static __attribute__((interrupt_handler)) void vblIsr(void);
static void buildTransitions(const SurfaceT *src);
static bool paletteOrScbChanged(const SurfaceT *src);
static void refreshPaletteStateIfNeeded(const SurfaceT *src);
// ----- Module state -----
// Screen buffer: enough for 320x200x4bpp planar plus padding for
// runtime 256-byte alignment. TOS .PRG format only supports 2-byte
// object-file alignment, so we overallocate and align the pointer
// manually in halInit.
static uint8_t gScreenBuffer[SURFACE_PIXELS_SIZE + ST_SCREEN_ALIGN];
// Aligned pointer into gScreenBuffer; NULL until halInit has run.
static uint8_t *gScreenBase = NULL;
// Screen bases, resolution and palette captured by halInit so that
// halShutdown can put the display back the way it found it.
static void *gPrevPhysbase = NULL;
static void *gPrevLogbase = NULL;
static int16_t gPrevRez = 0;
static uint16_t gPrevPalette[SURFACE_COLORS_PER_PALETTE];
// True between a successful halInit and halShutdown; the present and
// shutdown entry points do nothing while it is false.
static bool gModeSet = false;
// Per-scanline pre-quantized palette table. Indexed by display line;
// each row is a 16-word palette ready to be copied straight into the
// shifter registers. Rebuilt by flattenScbPalettes when a present
// detects an SCB/palette change; read by the VBL and Timer B ISRs,
// which only ever use the row at the first line of each band.
static uint16_t gLinePalettes[SURFACE_HEIGHT][SURFACE_COLORS_PER_PALETTE];
// Band-transition table. Each entry is one palette change: at
// display line gBandStart[i], load palette indexed by gBandPalIdx[i].
// Rebuilt by buildTransitions from the SurfaceT's SCB array whenever
// refreshPaletteStateIfNeeded sees that the SCB or palettes changed.
#define MAX_BANDS SURFACE_HEIGHT
static uint16_t gBandStart [MAX_BANDS];
static uint8_t gBandPalIdx[MAX_BANDS];
static uint16_t gBandCount = 0;
// Index of the band most recently activated this frame. vblIsr resets
// it to 0 after loading band 0's palette; each Timer B fire increments
// it first and then loads the palette of the band it now names (if
// that band exists).
static volatile uint16_t gCurrentBand = 0;
// Diagnostic captures. gFrameCount is incremented once per VBL by
// vblIsr and is also what halWaitVBL spins on; gLastBandCount mirrors
// gBandCount after each buildTransitions. Both are dumped by
// writeDiagnostics at shutdown.
static volatile int16_t gFrameCount = 0;
static volatile uint16_t gLastBandCount = 0;
// Saved exception vectors for restore on shutdown.
static void (*gOldVblVec)(void) = NULL;
static void (*gOldTimerBVec)(void) = NULL;
// Cached SCB + palette from the last present. flattenScbPalettes runs
// 200 * 16 quantize conversions and buildTransitions rescans the full
// SCB; neither is cheap on a 7 MHz 68000. In the typical game loop
// (and every frame of the keys demo after the initial paint) SCB and
// palette never change, so caching and skipping those passes keeps
// rect presents down to just the c2p work.
static uint8_t gCachedScb [SURFACE_HEIGHT];
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
// False until the first refresh has filled the two caches above.
static bool gCacheValid = false;
// ----- Internal helpers (alphabetical) -----
// Chunky-to-planar conversion for part of one scanline.
//
// The row is processed in groups of 16 pixels: 8 source bytes (two
// 4-bit pixels per byte, high nibble first) become 4 destination
// words, one per bitplane, with the leftmost pixel in bit 15.
//
//   src        - start of the chunky scanline
//   dst        - start of the planar scanline
//   groupStart - first group to convert (inclusive)
//   groupEnd   - last group to convert (exclusive)
//
// Groups outside [groupStart, groupEnd) are neither read nor written,
// which lets halPresentRect skip the unchanged parts of a row.
static void c2pRow(const uint8_t *src, uint16_t *dst, uint16_t groupStart, uint16_t groupEnd) {
    uint16_t g;

    for (g = groupStart; g < groupEnd; g++) {
        const uint8_t *chunky = &src[g * 8];
        uint16_t      *planar = &dst[g * 4];
        uint16_t       p0     = 0;
        uint16_t       p1     = 0;
        uint16_t       p2     = 0;
        uint16_t       p3     = 0;
        uint16_t       mask   = 0x8000;   // bit belonging to the current pixel
        uint16_t       i;
        uint16_t       half;

        for (i = 0; i < 8; i++) {
            uint8_t pair = chunky[i];
            uint8_t nib[2];

            nib[0] = (uint8_t)(pair >> 4);     // left pixel of the pair
            nib[1] = (uint8_t)(pair & 0x0F);   // right pixel of the pair

            for (half = 0; half < 2; half++) {
                if (nib[half] & 0x01) {
                    p0 = (uint16_t)(p0 | mask);
                }
                if (nib[half] & 0x02) {
                    p1 = (uint16_t)(p1 | mask);
                }
                if (nib[half] & 0x04) {
                    p2 = (uint16_t)(p2 | mask);
                }
                if (nib[half] & 0x08) {
                    p3 = (uint16_t)(p3 | mask);
                }
                mask = (uint16_t)(mask >> 1);
            }
        }

        planar[0] = p0;
        planar[1] = p1;
        planar[2] = p2;
        planar[3] = p3;
    }
}
// Run c2pRow over scanlines y0 (inclusive) to y1 (exclusive), reading
// from the surface's chunky pixels and writing to the same line of the
// planar screen at gScreenBase. The group range is passed through to
// c2pRow unchanged. No clipping is done here; callers must pass rows
// that lie inside both buffers.
static void c2pRange(const SurfaceT *src, int16_t y0, int16_t y1, uint16_t groupStart, uint16_t groupEnd) {
    int16_t row = y0;

    while (row < y1) {
        const uint8_t *chunky = src->pixels + (row * SURFACE_BYTES_PER_ROW);
        uint16_t      *planar = (uint16_t *)(gScreenBase + (row * ST_BYTES_PER_ROW));

        c2pRow(chunky, planar, groupStart, groupEnd);
        row++;
    }
}
// Collapse the surface's per-line SCB into a run-length band table.
//
// Walks all SURFACE_HEIGHT lines; whenever the (clamped) palette index
// differs from the previous line's, a new band is appended:
// gBandStart[i] receives the line number and gBandPalIdx[i] the
// palette index. Out-of-range SCB values are treated as palette 0.
// On return gBandCount holds the number of bands and gLastBandCount
// mirrors it for the shutdown diagnostics.
static void buildTransitions(const SurfaceT *src) {
    uint8_t  lastIdx = 0xFF;   // sentinel: guarantees line 0 opens a band
    uint16_t y;

    gBandCount = 0;
    for (y = 0; y < SURFACE_HEIGHT; y++) {
        uint8_t palIdx = src->scb[y];

        if (palIdx >= SURFACE_PALETTE_COUNT) {
            palIdx = 0;
        }
        if (palIdx == lastIdx) {
            continue;
        }
        lastIdx = palIdx;
        if (gBandCount >= MAX_BANDS) {
            continue;
        }
        gBandStart [gBandCount] = y;
        gBandPalIdx[gBandCount] = palIdx;
        gBandCount++;
    }
    gLastBandCount = gBandCount;
}
// Fill gLinePalettes: for every display line, look up the palette the
// SCB selects for that line (out-of-range indices fall back to palette
// 0), quantize each of its colors with quantizeColorToSt and store the
// result in that line's row. The ISRs can then copy a row straight to
// the shifter without touching the surface. Called from
// refreshPaletteStateIfNeeded, i.e. only when the SCB or palettes
// differ from the cached copy.
static void flattenScbPalettes(const SurfaceT *src) {
    uint16_t y;
    uint16_t c;

    for (y = 0; y < SURFACE_HEIGHT; y++) {
        uint16_t *row    = gLinePalettes[y];
        uint8_t   palIdx = src->scb[y];

        if (palIdx >= SURFACE_PALETTE_COUNT) {
            palIdx = 0;
        }
        for (c = 0; c < SURFACE_COLORS_PER_PALETTE; c++) {
            row[c] = quantizeColorToSt(src->palette[palIdx][c]);
        }
    }
}
// Reports whether the palette tables need rebuilding: true when no
// cache has been captured yet, or when the surface's SCB or palette
// bytes differ from the copies saved at the last refresh. The checks
// short-circuit in that order, so the (larger) palette compare only
// runs when the SCB matches.
static bool paletteOrScbChanged(const SurfaceT *src) {
    return !gCacheValid
        || memcmp(gCachedScb, src->scb, sizeof(gCachedScb)) != 0
        || memcmp(gCachedPalette, src->palette, sizeof(gCachedPalette)) != 0;
}
// Bring gLinePalettes and the band-transition table up to date with
// the surface, but only if its SCB or palettes actually changed since
// the last call. The flatten pass alone pushes 3200 palette entries
// through quantization, so skipping both passes on unchanged frames is
// what keeps rect presents cheap. After a rebuild the surface's SCB
// and palettes are copied into the cache for the next comparison.
static void refreshPaletteStateIfNeeded(const SurfaceT *src) {
    if (paletteOrScbChanged(src)) {
        flattenScbPalettes(src);
        buildTransitions(src);

        memcpy(gCachedScb,     src->scb,     sizeof(gCachedScb));
        memcpy(gCachedPalette, src->palette, sizeof(gCachedPalette));
        gCacheValid = true;
    }
}
// Convert a 12-bit $0RGB color to the STF's 9-bit palette format.
//
// Each 4-bit channel loses its least significant bit and the remaining
// 3 bits land in the low 3 bits of the same nibble. Shifting the whole
// word right by one and masking with $0777 does this for all three
// channels at once: the mask discards both the bit that crossed over
// from the neighbouring channel and anything above bit 11 of the input.
static uint16_t quantizeColorToSt(uint16_t orgb) {
    return (uint16_t)((orgb >> 1) & 0x0777);
}
// Timer B interrupt handler. Fires once at each band transition;
// advances gCurrentBand, writes the new band's palette to the shifter
// and lets Timer B's auto-reload keep counting for the next fire. We
// deliberately do NOT stop/reload/restart the timer here: that
// sequence would cost 1-2 HBL edges each fire, and those losses
// compound across 7+ transitions into a visible "last band short"
// drift. Updating TBDR in place is enough for variable-length bands --
// the new value takes effect on the fire AFTER next, which is
// acceptable when adjacent bands have similar lengths; uniform bands
// (like pattern.c) don't need TBDR updates at all so stay perfectly
// aligned with no drift.
//
// The palette source is the gLinePalettes row at the band's first
// line, which flattenScbPalettes has already resolved through the SCB,
// so gBandPalIdx is not consulted here.
//
// The 16 register writes are kept unrolled and the order of the MFP
// register accesses is significant; do not reorder them.
static void timerBIsr(void) {
    uint16_t           band;
    const uint16_t    *src;
    volatile uint16_t *dst;
    uint16_t           nextDelta;

    band         = gCurrentBand + 1;
    gCurrentBand = band;
    if (band < gBandCount) {
        src = &gLinePalettes[gBandStart[band]][0];
        dst = ST_PALETTE_REGS;
        dst[ 0] = src[ 0];
        dst[ 1] = src[ 1];
        dst[ 2] = src[ 2];
        dst[ 3] = src[ 3];
        dst[ 4] = src[ 4];
        dst[ 5] = src[ 5];
        dst[ 6] = src[ 6];
        dst[ 7] = src[ 7];
        dst[ 8] = src[ 8];
        dst[ 9] = src[ 9];
        dst[10] = src[10];
        dst[11] = src[11];
        dst[12] = src[12];
        dst[13] = src[13];
        dst[14] = src[14];
        dst[15] = src[15];
        if (band + 1 < gBandCount) {
            // Update TBDR for the fire-after-next (auto-reload at
            // the NEXT fire still uses the old value). Don't stop
            // the timer.
            nextDelta = gBandStart[band + 1] - gBandStart[band];
            // TBDR is 8 bits wide and a zero delta is never a valid
            // band length; fall back to 1 in either case.
            if (nextDelta == 0 || nextDelta > 255) {
                nextDelta = 1;
            }
            *ST_MFP_TBDR = (uint8_t)nextDelta;
            *ST_MFP_ISRA = MFP_TB_CLEAR;
            return;
        }
    }
    // No further transitions this frame; stopping Timer B here only
    // affects the (never-used) next fire, not timing of any band
    // we've already scheduled. This path is also taken when the ISR
    // fires with no band table at all (gBandCount == 0).
    *ST_MFP_TBCR = MFP_TBCR_STOP;
    *ST_MFP_ISRA = MFP_TB_CLEAR;
}
// Vertical blank handler. Pre-loads band 0's palette so the first
// visible scanline is correct, then programs Timer B to fire at
// the HBL delta to the next band transition (if any).
//
// Installed on the VBL vector by halInit. It does not call the
// previously installed handler (gOldVblVec), which is why halWaitVBL
// spins on the gFrameCount increment done here instead of using
// Vsync().
static void vblIsr(void) {
uint16_t delta;
const uint16_t *src;
volatile uint16_t *dst;
// Frame tick for halWaitVBL and the shutdown diagnostics.
gFrameCount = gFrameCount + 1;
// Restart the per-frame band sequence; timerBIsr counts up from here.
gCurrentBand = 0;
// Nothing has been presented yet: no palette to load, and Timer B is
// left exactly as it is.
if (gBandCount == 0) {
return;
}
// Stage band 0's palette into the shifter registers.
src = &gLinePalettes[gBandStart[0]][0];
dst = ST_PALETTE_REGS;
dst[ 0] = src[ 0];
dst[ 1] = src[ 1];
dst[ 2] = src[ 2];
dst[ 3] = src[ 3];
dst[ 4] = src[ 4];
dst[ 5] = src[ 5];
dst[ 6] = src[ 6];
dst[ 7] = src[ 7];
dst[ 8] = src[ 8];
dst[ 9] = src[ 9];
dst[10] = src[10];
dst[11] = src[11];
dst[12] = src[12];
dst[13] = src[13];
dst[14] = src[14];
dst[15] = src[15];
// Program Timer B for the next band transition.
if (gBandCount > 1) {
delta = gBandStart[1] - gBandStart[0];
// TBDR is written as a single byte; a zero or oversized delta is
// replaced by 1.
if (delta == 0 || delta > 255) {
delta = 1;
}
// Sequence: stop the timer, load the count, clear the Timer B
// in-service bit, then restart in event-count mode.
*ST_MFP_TBCR = MFP_TBCR_STOP;
*ST_MFP_TBDR = (uint8_t)delta;
*ST_MFP_ISRA = MFP_TB_CLEAR;
*ST_MFP_TBCR = MFP_TBCR_EVENT;
} else {
// Single band: the palette loaded above holds for the whole frame,
// so keep Timer B stopped.
*ST_MFP_TBCR = MFP_TBCR_STOP;
*ST_MFP_ISRA = MFP_TB_CLEAR;
}
}
// Copy the palette captured in halInit (gPrevPalette) back into the
// shifter palette registers, lowest register first. Touches hardware
// registers directly, so halShutdown runs it through Supexec; the long
// return value (always 0) exists only to fit that callback shape.
static long writePrevPaletteRegs(void) {
    volatile uint16_t *reg       = ST_PALETTE_REGS;
    const uint16_t    *saved     = gPrevPalette;
    uint16_t           remaining = SURFACE_COLORS_PER_PALETTE;

    while (remaining > 0) {
        *reg++ = *saved++;
        remaining--;
    }
    return 0;
}
// Dump a small post-mortem to "diag.txt" in the current directory: the
// number of VBLs counted, the band count from the last table build,
// and the first (at most 16) band entries. Best effort only -- if the
// file cannot be opened nothing is written and no error is reported.
static void writeDiagnostics(void) {
    FILE *out = fopen("diag.txt", "w");

    if (out != NULL) {
        uint16_t shown;
        uint16_t band;

        fprintf(out, "frames observed: %d\n", (int)gFrameCount);
        fprintf(out, "band count: %d\n", (int)gLastBandCount);

        // Cap the listing at 16 bands to keep the file short.
        shown = gLastBandCount;
        if (shown > 16) {
            shown = 16;
        }
        for (band = 0; band < shown; band++) {
            fprintf(out, " band %2d: start line %3d, palIdx %d\n",
                    (int)band, (int)gBandStart[band], (int)gBandPalIdx[band]);
        }
        fclose(out);
    }
}
// ----- HAL API (alphabetical) -----
// Bring up the ST display for the library.
//
// In order: aligns and clears the screen buffer, records the current
// screen bases / resolution / palette for halShutdown, switches to ST
// low-res on our buffer, blacks out color 0, then installs the VBL and
// Timer B handlers that emulate per-line palettes. `config` is unused
// on this port. Always returns true.
bool halInit(const JoeyConfigT *config) {
    (void)config;

    // gScreenBuffer is over-allocated by ST_SCREEN_ALIGN bytes; round
    // its address up to the next 256-byte boundary and clear the part
    // that will actually be displayed.
    const uintptr_t alignMask = (uintptr_t)ST_SCREEN_ALIGN - 1;
    gScreenBase = (uint8_t *)(((uintptr_t)gScreenBuffer + alignMask) & ~alignMask);
    memset(gScreenBase, 0, SURFACE_PIXELS_SIZE);

    // Snapshot everything halShutdown has to restore.
    gPrevPhysbase = Physbase();
    gPrevLogbase  = Logbase();
    gPrevRez      = Getrez();
    for (uint16_t color = 0; color < SURFACE_COLORS_PER_PALETTE; color++) {
        // Setcolor with -1 reads the entry without changing it.
        gPrevPalette[color] = (uint16_t)Setcolor((int16_t)color, -1);
    }

    // ST low-res (320x200x16) is mode 0; logical and physical screen
    // both point at our buffer.
    Setscreen((long)gScreenBase, (long)gScreenBase, 0);
    gModeSet = true;

    // The overscan border is drawn in palette entry 0. Keep it black
    // until the first present has built palette tables for the VBL
    // handler to load.
    Setcolor(0, 0x000);

    // Remember the vectors we are about to replace (Setexc with -1
    // only queries), then hook VBL. The Timer B vector ($120) itself
    // is installed by Xbtimer below.
    gOldVblVec    = (void (*)(void))Setexc(VEC_VBL, -1L);
    gOldTimerBVec = (void (*)(void))Setexc(VEC_MFP_TB, -1L);
    (void)Setexc(VEC_VBL, (long)vblIsr);

    // Timer B in event-count (HBL) mode. The data value of 1 is only
    // a placeholder: vblIsr reprograms it every frame for the first
    // real transition. Finally enable the interrupt.
    Xbtimer(1, MFP_TBCR_EVENT, 1, timerBIsr);
    Jenabint(INT_TIMER_B);

    return true;
}
// Returns a description of the most recent HAL failure. This port
// records no error state (halInit always reports success), so the
// result is always NULL.
const char *halLastError(void) {
return NULL;
}
// Present the whole surface: refresh the palette/band tables if the
// surface's SCB or palettes changed, then convert every scanline and
// every 16-pixel group to planar in the screen buffer. Does nothing
// when `src` is NULL or the display has not been initialised.
void halPresent(const SurfaceT *src) {
    if (src != NULL && gModeSet) {
        refreshPaletteStateIfNeeded(src);
        c2pRange(src, 0, SURFACE_HEIGHT, 0, ST_GROUPS_PER_ROW);
    }
}
void halPresentRect(const SurfaceT *src, int16_t x, int16_t y, uint16_t w, uint16_t h) {
uint16_t groupStart;
uint16_t groupEnd;
if (src == NULL || !gModeSet) {
return;
}
refreshPaletteStateIfNeeded(src);
// Each c2p group covers 16 horizontal pixels. Round dirty pixel
// range to the enclosing group range to keep the planar word
// alignment without missing edge pixels.
groupStart = (uint16_t)(x >> 4);
groupEnd = (uint16_t)(((uint16_t)x + w + 15) >> 4);
if (groupEnd > ST_GROUPS_PER_ROW) {
groupEnd = ST_GROUPS_PER_ROW;
}
c2pRange(src, y, y + (int16_t)h, groupStart, groupEnd);
}
// Block until the next vertical blank.
//
// XBIOS Vsync() is not usable here: TOS's Vsync depends on _vblsem
// being bumped inside TOS's own VBL handler, and halInit replaced that
// handler (Setexc(VEC_VBL, vblIsr)) with one that does not chain to
// the original. Instead, busy-wait until gFrameCount -- volatile, and
// incremented once per VBL by vblIsr -- moves away from the value it
// had on entry.
void halWaitVBL(void) {
    const int16_t startFrame = gFrameCount;

    for (;;) {
        if (gFrameCount != startFrame) {
            break;
        }
    }
}
// Undo halInit. No-op unless the display mode is currently set.
//
// Order matters: the Timer B interrupt is disabled and both exception
// vectors are put back BEFORE the screen is switched, so that a late
// ISR cannot write palette data while TOS is remapping the display.
// After that the saved screen bases and resolution are restored, the
// saved palette is written back (in supervisor mode via Supexec), and
// the diagnostics file is written.
void halShutdown(void) {
    if (gModeSet) {
        Jdisint(INT_TIMER_B);

        // A NULL saved vector means there is nothing to restore.
        if (gOldTimerBVec != NULL) {
            (void)Setexc(VEC_MFP_TB, (long)gOldTimerBVec);
        }
        if (gOldVblVec != NULL) {
            (void)Setexc(VEC_VBL, (long)gOldVblVec);
        }

        Setscreen((long)gPrevLogbase, (long)gPrevPhysbase, gPrevRez);
        Supexec(writePrevPaletteRegs);

        writeDiagnostics();
        gModeSet = false;
    }
}