From a20c488959eba1908ca003320c4899af4fb4524a Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Mon, 13 Apr 2026 19:40:45 -0500 Subject: [PATCH] Initial commit. --- .gitignore | 4 + Makefile | 52 ++ README.md | 399 +++++++++++++++ accelVid.c | 574 ++++++++++++++++++++++ accelVid.h | 257 ++++++++++ atiMach64.c | 960 ++++++++++++++++++++++++++++++++++++ banshee.c | 715 +++++++++++++++++++++++++++ cirrusGd54.c | 732 ++++++++++++++++++++++++++++ cirrusLaguna.c | 585 ++++++++++++++++++++++ demo.c | 869 +++++++++++++++++++++++++++++++++ matroxMga.c | 843 ++++++++++++++++++++++++++++++++ nvidia.c | 677 ++++++++++++++++++++++++++ pci.c | 307 ++++++++++++ pci.h | 98 ++++ s3Trio.c | 1216 ++++++++++++++++++++++++++++++++++++++++++++++ sis.c | 561 +++++++++++++++++++++ test/86box.cfg | 62 +++ test/README.txt | 121 +++++ test/rundemo.bat | 3 + trident.c | 630 ++++++++++++++++++++++++ tsengW32.c | 698 ++++++++++++++++++++++++++ vgaCommon.c | 505 +++++++++++++++++++ vgaCommon.h | 198 ++++++++ 23 files changed, 11066 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 accelVid.c create mode 100644 accelVid.h create mode 100644 atiMach64.c create mode 100644 banshee.c create mode 100644 cirrusGd54.c create mode 100644 cirrusLaguna.c create mode 100644 demo.c create mode 100644 matroxMga.c create mode 100644 nvidia.c create mode 100644 pci.c create mode 100644 pci.h create mode 100644 s3Trio.c create mode 100644 sis.c create mode 100644 test/86box.cfg create mode 100644 test/README.txt create mode 100644 test/rundemo.bat create mode 100644 trident.c create mode 100644 tsengW32.c create mode 100644 vgaCommon.c create mode 100644 vgaCommon.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cbcbcb2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.claude/ +obj/ +bin/ +PLAN.md diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9ac4de9 --- /dev/null +++ b/Makefile @@ -0,0 +1,52 
@@ +# Makefile -- DOS Accelerated Video Driver Framework +# +# DJGPP cross-compilation build matching DVX conventions. +# Produces demo.exe as the test application. + +DJGPP_PREFIX = $(HOME)/djgpp/djgpp +CC = $(DJGPP_PREFIX)/bin/i586-pc-msdosdjgpp-gcc +CFLAGS = -O2 -Wall -Wextra -Werror -Wno-type-limits -Wno-sign-compare -Wno-format-truncation -march=i486 -mtune=i586 + +OBJDIR = obj +BINDIR = bin + +# Source files +SRCS = pci.c vgaCommon.c accelVid.c s3Trio.c cirrusGd54.c cirrusLaguna.c atiMach64.c tsengW32.c matroxMga.c banshee.c nvidia.c trident.c sis.c demo.c +OBJS = $(patsubst %.c,$(OBJDIR)/%.o,$(SRCS)) + +TARGET = $(BINDIR)/demo.exe + +.PHONY: all clean + +all: $(TARGET) + +$(TARGET): $(OBJS) | $(BINDIR) + $(CC) $(CFLAGS) -o $@ $(OBJS) + +$(OBJDIR)/%.o: %.c | $(OBJDIR) + $(CC) $(CFLAGS) -c -o $@ $< + +$(OBJDIR): + mkdir -p $(OBJDIR) + +$(BINDIR): + mkdir -p $(BINDIR) + +# Dependencies +$(OBJDIR)/pci.o: pci.c pci.h +$(OBJDIR)/vgaCommon.o: vgaCommon.c vgaCommon.h +$(OBJDIR)/accelVid.o: accelVid.c accelVid.h pci.h +$(OBJDIR)/s3Trio.o: s3Trio.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/cirrusGd54.o: cirrusGd54.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/atiMach64.o: atiMach64.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/tsengW32.o: tsengW32.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/matroxMga.o: matroxMga.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/banshee.o: banshee.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/nvidia.o: nvidia.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/trident.o: trident.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/cirrusLaguna.o: cirrusLaguna.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/sis.o: sis.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/demo.o: demo.c accelVid.h pci.h + +clean: + rm -rf $(OBJDIR) $(BINDIR) diff --git a/README.md b/README.md new file mode 100644 index 0000000..86fe0b8 --- /dev/null +++ b/README.md @@ -0,0 +1,399 @@ +# DOS Accelerated Video Driver Framework + +Hardware-accelerated 2D video drivers for DOS/DJGPP. 
Programs the +acceleration engines on PCI video cards directly -- no VESA, no BIOS +calls for rendering. A common API lets applications use acceleration +without knowing which chip is present. + +## Supported Video Cards + +### S3 (s3Trio.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Trio32 | 0x8810 | | +| Trio64 | 0x8811 | MMIO at LFB+16MB | +| Trio64V+ | 0x8814 | MMIO at LFB+16MB | +| ViRGE | 0x5631 | MMIO, 3D engine ignored | +| ViRGE/VX | 0x883D | | +| ViRGE/DX/GX | 0x8A01 | | +| ViRGE/GX2 | 0x8A10 | | +| ViRGE/MX | 0x8C01, 0x8C03 | | +| Savage3D | 0x8A20, 0x8A21 | | +| Savage4 | 0x8A22 | | +| Savage/MX | 0x8C10, 0x8C11 | | +| Savage/IX | 0x8C12, 0x8C13 | | +| Savage 2000 | 0x9102 | | +| Vision864 | 0x88C0, 0x88C1 | I/O only (no MMIO) | +| Vision868 | 0x8880 | I/O only | +| Vision964 | 0x88D0 | I/O only | +| Vision968 | 0x88F0, 0x88F1 | I/O only | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### ATI Mach64 / Rage (atiMach64.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Mach64 GX | 0x4758 | I/O only | +| Mach64 CX | 0x4358 | I/O only | +| Mach64 CT | 0x4354 | MMIO at end of aperture | +| Mach64 ET | 0x4554 | | +| Mach64 VT | 0x5654, 0x5655 | | +| 3D Rage II | 0x4754, 0x4755 | | +| Rage Pro | 0x4750, 0x4752 | | +| Rage 128 | 0x5245, 0x5246, 0x524B, 0x524C | | +| Rage 128 Pro | 0x5046, 0x5052 | | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### Matrox MGA (matroxMga.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Millennium (MGA2064W) | 0x0519 | Separate MMIO BAR | +| Mystique (MGA1064SG) | 0x051A | | +| G100 | 0x1000, 0x1001 | | +| G200 | 0x0520, 0x0521 | | +| G400 | 0x0525 | | +| G450 | 0x2527 | | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### 3dfx (banshee.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Banshee | 0x0003 | MMIO 
+ launch area for data | +| Voodoo3 | 0x0005 | | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### Cirrus Logic GD54xx (cirrusGd54.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| GD5434 | 0x00A0, 0x00A8 | BLT via GR registers | +| GD5436 | 0x00AC | | +| GD5446 | 0x00B8 | | +| GD5480 | 0x00BC | | + +Hardware ops: RectFill, BitBlt, HostBlit, ColorExpand, HwCursor + +### Cirrus Logic Laguna (cirrusLaguna.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| GD5462 | 0x00D0 | MMIO, different engine from GD54xx | +| GD5464 | 0x00D4 | | +| GD5465 | 0x00D6 | | + +Hardware ops: RectFill, BitBlt, HostBlit, ColorExpand, HwCursor, Clip + +### Nvidia RIVA / TNT (nvidia.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| RIVA 128 | 0x0018 | PGRAPH subchannel interface | +| RIVA 128 ZX | 0x0019 | | +| TNT | 0x0020 | | +| TNT2 | 0x0028 | | +| TNT2 Ultra | 0x0029 | | +| TNT2 M64 | 0x002D | | +| Vanta | 0x002C | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor, Clip + +### Tseng ET4000/W32 (tsengW32.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| W32 | 0x3202 | ACL engine via I/O ports | +| W32i | 0x3205 | | +| W32p rev A | 0x3206 | HwCursor on W32p only | +| W32p rev B | 0x3207 | | +| W32p rev C | 0x3208 | | +| W32p rev D | 0x4702 | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor (W32p only) + +### Trident TGUI (trident.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| TGUI9440 | 0x9440 | GER engine via I/O ports | +| TGUI9660 | 0x9660 | | +| TGUI9680 | 0x9680 | | +| ProVidia 9685 | 0x9685 | | +| Blade3D | 0x9880 | | +| CyberBlade | 0x9910 | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor + +### SiS (sis.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| 6326 | 0x6326 | MMIO queue-based engine | +| 300 | 0x0300 | | +| 305 | 0x0305 | | +| 315 | 0x0315 | | +| 330 | 0x0330 | | + +Hardware ops: RectFill, 
BitBlt, HostBlit, HwCursor, Clip + +## Capability Matrix + +Operations not implemented in hardware get automatic software fallbacks. +Every drawing function pointer is always callable -- callers never need +to check it for NULL. The hardware cursor functions (setCursor, +moveCursor, showCursor) get no software fallback, so test +`ACAP_HW_CURSOR` before calling them. + +| Operation | S3 | ATI | Matrox | 3dfx | CL 54xx | CL Laguna | Nvidia | Tseng | Trident | SiS | +|-----------|:--:|:---:|:------:|:----:|:-------:|:---------:|:------:|:-----:|:-------:|:---:| +| RectFill | HW | HW | HW | HW | HW | HW | HW | HW | HW | HW | +| PatFill | HW | HW | HW | HW | sw | sw | sw | sw | sw | sw | +| BitBlt | HW | HW | HW | HW | HW | HW | HW | HW | HW | HW | +| HostBlit | HW | HW | HW | HW | HW | HW | HW | HW | HW | HW | +| ColorExpand | HW | HW | HW | HW | HW | HW | sw | sw | sw | sw | +| LineDraw | HW | HW | HW | HW | sw | sw | sw | sw | sw | sw | +| HwCursor | HW | HW | HW | HW | HW | HW | HW | W32p | HW | HW | +| Clip | HW | HW | HW | HW | sw | HW | HW | sw | sw | HW | + +HW = hardware accelerated, sw = software fallback + +## API Usage + +### Basic Lifecycle + +```c +#include "accelVid.h" + +// Declare registration functions for the drivers you want +extern void s3RegisterDriver(void); +extern void atiRegisterDriver(void); +// ... etc + +int main(void) { + // 1. Register drivers (order = detection priority) + s3RegisterDriver(); + atiRegisterDriver(); + + // 2. Detect hardware + AccelDriverT *drv = accelDetect(); + if (!drv) { + printf("No supported video card found\n"); + return 1; + } + + // 3. Initialize with a video mode + AccelModeRequestT req; + req.width = 640; + req.height = 480; + req.bpp = 16; + + if (!accelInit(drv, &req)) { + printf("Failed to set video mode\n"); + return 1; + } + + // Mode info is now available + printf("Mode: %dx%dx%d pitch=%d\n", + drv->mode.width, drv->mode.height, + drv->mode.bpp, drv->mode.pitch); + + // 4. Draw + drv->rectFill(drv, 0, 0, 640, 480, 0x001F); // blue + drv->waitIdle(drv); + + // 5.
Shut down + accelShutdown(drv); + return 0; +} +``` + +### Drawing Operations + +All drawing functions take the driver pointer as the first argument. +Colors are packed in the display's native pixel format. + +```c +// Solid rectangle fill +drv->rectFill(drv, x, y, w, h, color); + +// 8x8 mono pattern fill (1=fg, 0=bg, MSB first, 8 bytes) +uint8_t checkerboard[8] = { + 0xAA, 0x55, 0xAA, 0x55, + 0xAA, 0x55, 0xAA, 0x55 +}; +drv->rectFillPat(drv, x, y, w, h, checkerboard, fgColor, bgColor); + +// Screen-to-screen blit (handles overlapping regions) +drv->bitBlt(drv, srcX, srcY, dstX, dstY, w, h); + +// CPU-to-screen blit (transfer RAM buffer to VRAM) +// srcBuf = packed pixels in display format, srcPitch = byte stride +drv->hostBlit(drv, buffer, pitch, dstX, dstY, w, h); + +// Monochrome color expansion (1bpp -> full color) +// Each 1-bit becomes fg, each 0-bit becomes bg +// srcBuf = packed MSB-first mono bitmap, srcPitch = byte stride +drv->colorExpand(drv, glyphData, 1, dstX, dstY, 8, 16, fg, bg); + +// Bresenham line draw (inclusive endpoints) +drv->lineDraw(drv, x1, y1, x2, y2, color); + +// Hardware clip rectangle +drv->setClip(drv, clipX, clipY, clipW, clipH); +``` + +### Hardware Cursor + +```c +// Define a cursor image (64x64 max, AND/XOR masks) +HwCursorImageT cursor; +cursor.width = 16; +cursor.height = 16; +cursor.hotX = 0; +cursor.hotY = 0; +memset(cursor.andMask, 0xFF, sizeof(cursor.andMask)); // transparent +memset(cursor.xorMask, 0x00, sizeof(cursor.xorMask)); +// ... fill in actual cursor shape ... + +// Upload and enable +drv->setCursor(drv, &cursor); +drv->showCursor(drv, true); + +// Move (call on every mouse poll) +drv->moveCursor(drv, mouseX, mouseY); + +// Hide +drv->showCursor(drv, false); +``` + +### Checking Capabilities + +The `caps` field indicates which operations are hardware-accelerated. +Software fallbacks are always installed, so you can call any operation +regardless of caps. 
Use caps to make optimization decisions: + +```c +if (drv->caps & ACAP_COLOR_EXPAND) { + // Use color expansion for text -- 16x less bus traffic + drv->colorExpand(drv, glyph, 1, x, y, 8, 16, fg, bg); +} else { + // Software fallback is installed but may be slow -- + // consider pre-rendering text to a RAM buffer instead + drv->colorExpand(drv, glyph, 1, x, y, 8, 16, fg, bg); +} + +if (drv->caps & ACAP_HW_CURSOR) { + // Hardware cursor eliminates cursor dirty rectangles + drv->setCursor(drv, &cursorImage); + drv->showCursor(drv, true); +} +``` + +### Synchronization + +The acceleration engine runs asynchronously. Drawing functions return +immediately after queuing the command. Use `waitIdle` before reading +from VRAM or when you need all pending operations to complete: + +```c +drv->rectFill(drv, 0, 0, 100, 100, color1); +drv->rectFill(drv, 50, 50, 100, 100, color2); +drv->bitBlt(drv, 0, 0, 200, 0, 150, 150); + +// Wait for everything to finish before reading VRAM +drv->waitIdle(drv); +uint16_t pixel = *(uint16_t *)(drv->mode.framebuffer + offset); +``` + +### Mode Information + +After `accelInit` succeeds, `drv->mode` contains: + +| Field | Description | +|-------|-------------| +| `width` | Horizontal resolution in pixels | +| `height` | Vertical resolution in pixels | +| `bpp` | Bits per pixel (8, 15, 16, or 32) | +| `pitch` | Bytes per scanline (may exceed width * bpp/8) | +| `framebuffer` | Direct pointer to the linear framebuffer | +| `vramSize` | Total video RAM in bytes | +| `offscreenBase` | Byte offset where offscreen VRAM begins | + +The framebuffer pointer can be used for direct pixel access when +the acceleration engine doesn't offer a suitable operation. + +## Adding a New Driver + +1. Create a new source file (e.g., `newchip.c`) +2. Include `accelVid.h`, `vgaCommon.h`, and `pci.h` +3. Define a static `AccelDriverT` with your function pointers +4. 
Use shared helpers for boilerplate: + - `vesaFindAndSetMode()` for VESA mode enumeration and setting + - `dpmiMapFramebuffer()` for DPMI physical address mapping + - `pciSizeBar()` for PCI BAR size detection +5. Leave unsupported operations as NULL -- the driver manager + installs software fallbacks automatically +6. Add a registration function: `void newchipRegisterDriver(void)` +7. Add the source file to the Makefile and call the registration + function from `main()` + +See `trident.c` (simplest driver) or `matroxMga.c` (most complete) +as reference implementations. + +## Building + +Requires a DJGPP cross-compiler targeting i586-pc-msdosdjgpp. + +``` +make # build bin/demo.exe +make clean # remove build artifacts +``` + +The Makefile expects the DJGPP toolchain at `$HOME/djgpp/djgpp`. +Override with `make DJGPP_PREFIX=/path/to/djgpp`. + +Compiler flags: `-O2 -Wall -Wextra -Werror -march=i486 -mtune=i586` + +## Testing + +The `test/` directory contains an 86Box configuration for testing +with an emulated S3 Trio64. See `test/README.txt` for setup +instructions. + +``` +demo.exe [width height bpp] +``` + +Default mode: 640x480x16. Controls: SPACE cycles demos, B runs +benchmarks, ESC exits. 
+ +## Project Structure + +``` +accelVid.h Driver abstraction and manager API +accelVid.c Driver manager, software fallbacks +pci.h / pci.c PCI configuration space access +vgaCommon.h / .c Shared VGA registers, VESA, DPMI helpers +s3Trio.c S3 Trio/ViRGE/Savage/Vision driver +atiMach64.c ATI Mach64 / Rage driver +matroxMga.c Matrox Millennium / Mystique / G-series driver +banshee.c 3dfx Banshee / Voodoo3 driver +cirrusGd54.c Cirrus Logic GD5434/36/46/80 driver +cirrusLaguna.c Cirrus Logic Laguna GD5462/64/65 driver +nvidia.c Nvidia RIVA 128 / TNT family driver +tsengW32.c Tseng ET4000/W32 family driver +trident.c Trident TGUI / Blade / CyberBlade driver +sis.c SiS 6326/300/305/315/330 driver +demo.c Test/demo application +Makefile DJGPP cross-compilation build +PLAN.md Architecture plan and chipset reference (untracked; listed in .gitignore) +test/ 86Box test configuration and setup guide +``` diff --git a/accelVid.c b/accelVid.c new file mode 100644 index 0000000..58eace1 --- /dev/null +++ b/accelVid.c @@ -0,0 +1,574 @@ +// accelVid.c -- Accelerated video driver manager +// +// Manages registration, detection, and lifecycle of hardware-specific +// video drivers. Drivers register themselves at startup, then the +// manager probes each in order to find matching hardware. +// +// After a chip driver's init() succeeds, the manager fills in +// software fallback implementations for any drawing operations +// the driver left as NULL. This means callers never need to +// check function pointers -- every operation is always callable. +// The fallbacks draw directly to the LFB using simple loops. + +#include "accelVid.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +// Maximum number of registered drivers. This is more than enough +// for all chip families we'll ever support.
+#define MAX_DRIVERS 32 + +// ============================================================ +// Prototypes -- public API +// ============================================================ + +AccelDriverT *accelDetect(void); +uint32_t accelGetCaps(const AccelDriverT *drv); +const char *accelGetName(const AccelDriverT *drv); +bool accelInit(AccelDriverT *drv, const AccelModeRequestT *req); +void accelRegisterDriver(AccelDriverT *drv); +void accelShutdown(AccelDriverT *drv); + +// ============================================================ +// Prototypes -- software fallbacks +// ============================================================ + +static void swBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void swColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static void swHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void swLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void swRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void swRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void swSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void swWaitIdle(AccelDriverT *drv); +static void swInstallFallbacks(AccelDriverT *drv); + +// ============================================================ +// Inline helpers for software fallbacks +// ============================================================ + +// Write a pixel at (x, y) in the framebuffer. No bounds checking +// -- the caller must clip before calling. 
+static inline void swPutPixel(AccelDriverT *drv, int32_t x, int32_t y, uint32_t color) {
+    // Round the mode depth up to whole bytes (15 bpp is stored in 2 bytes).
+    int32_t  bytesPerPixel = (drv->mode.bpp + 7) / 8;
+    uint8_t *base          = drv->mode.framebuffer;
+    uint8_t *pixel         = base + y * drv->mode.pitch + x * bytesPerPixel;
+
+    // Only 1-, 2- and 4-byte pixels are written; any other depth is a no-op.
+    if (bytesPerPixel == 1) {
+        *pixel = (uint8_t)color;
+    } else if (bytesPerPixel == 2) {
+        *(uint16_t *)pixel = (uint16_t)color;
+    } else if (bytesPerPixel == 4) {
+        *(uint32_t *)pixel = color;
+    }
+}
+
+// ============================================================
+// Module state
+// ============================================================
+
+// Registered drivers in registration order; accelDetect() probes
+// them front to back.
+static AccelDriverT *sDrivers[MAX_DRIVERS];
+static int32_t       sDriverCount = 0;
+
+// Software clip rectangle. Written by swSetClip() and reset to the
+// full screen by swInstallFallbacks(); read by swLineDraw().
+static int32_t sClipX = 0;
+static int32_t sClipY = 0;
+static int32_t sClipW = 0;
+static int32_t sClipH = 0;
+
+// ============================================================
+// accelDetect
+// ============================================================
+//
+// Iterates all registered drivers and calls detect() on each.
+// Returns the first driver that claims the hardware, or NULL
+// if no supported hardware is found.
+//
+// Detection order matters: drivers registered first are tried
+// first. This allows callers to prioritize specific drivers
+// (e.g. prefer S3 over generic VESA).
+
+AccelDriverT *accelDetect(void) {
+    if (!pciDetect()) {
+        fprintf(stderr, "accelVid: PCI bus not detected\n");
+        return NULL;
+    }
+
+    for (int32_t i = 0; i < sDriverCount; i++) {
+        AccelDriverT *candidate = sDrivers[i];
+
+        // A driver without a detect() callback can never claim the
+        // hardware; skip it instead of calling through a NULL pointer.
+        if (!candidate || !candidate->detect) {
+            continue;
+        }
+
+        if (candidate->detect(candidate)) {
+            printf("accelVid: Detected %s (PCI %02X:%02X.%X, "
+                   "vendor=%04X device=%04X)\n",
+                   candidate->name,
+                   candidate->pciDev.bus,
+                   candidate->pciDev.dev,
+                   candidate->pciDev.func,
+                   candidate->pciDev.vendorId,
+                   candidate->pciDev.deviceId);
+            return candidate;
+        }
+    }
+
+    fprintf(stderr, "accelVid: No supported video hardware found\n");
+    return NULL;
+}
+
+
+// ============================================================
+// accelGetCaps
+// ============================================================
+//
+// Returns the driver's capability flags (ACAP_xxx), or 0 when
+// drv is NULL.
+
+uint32_t accelGetCaps(const AccelDriverT *drv) {
+    return drv ? drv->caps : 0;
+}
+
+
+// ============================================================
+// accelGetName
+// ============================================================
+//
+// Returns the driver's name string, or "none" when drv is NULL.
+
+const char *accelGetName(const AccelDriverT *drv) {
+    return drv ? drv->name : "none";
+}
+
+
+// ============================================================
+// accelInit
+// ============================================================
+//
+// Clears drv->mode, calls the driver's init() with the requested
+// mode, reports the resulting mode and capability flags, and then
+// installs software fallbacks for the operations the driver left
+// as NULL. req is passed to the driver unchanged.
+//
+// Returns false when drv is NULL, the driver has no init()
+// callback, or init() itself fails.
+
+bool accelInit(AccelDriverT *drv, const AccelModeRequestT *req) {
+    // Capability flags paired with the label printed for each, in
+    // report order. Labels carry their own leading space.
+    static const struct {
+        uint32_t    flag;
+        const char *label;
+    } capLabels[] = {
+        { ACAP_RECT_FILL,     " RectFill"     },
+        { ACAP_RECT_FILL_PAT, " PatFill"      },
+        { ACAP_BITBLT,        " BitBlt"       },
+        { ACAP_COLOR_EXPAND,  " ColorExpand"  },
+        { ACAP_LINE_DRAW,     " LineDraw"     },
+        { ACAP_HW_CURSOR,     " HwCursor"     },
+        { ACAP_HOST_BLIT,     " HostBlit"     },
+        { ACAP_CLIP,          " Clip"         },
+        { ACAP_TRANSPARENCY,  " Transparency" },
+    };
+
+    if (!drv || !drv->init) {
+        return false;
+    }
+
+    memset(&drv->mode, 0, sizeof(drv->mode));
+
+    if (!drv->init(drv, req)) {
+        fprintf(stderr, "accelVid: Failed to initialize %s\n", drv->name);
+        return false;
+    }
+
+    printf("accelVid: Initialized %s at %ldx%ldx%ld (pitch=%ld, vram=%luKB)\n",
+           drv->name,
+           (long)drv->mode.width,
+           (long)drv->mode.height,
+           (long)drv->mode.bpp,
+           (long)drv->mode.pitch,
+           (unsigned long)(drv->mode.vramSize / 1024));
+
+    // Report capabilities (hardware-backed operations only; the
+    // fallbacks installed below do not change drv->caps)
+    printf("accelVid: Capabilities:");
+
+    for (size_t i = 0; i < sizeof(capLabels) / sizeof(capLabels[0]); i++) {
+        if (drv->caps & capLabels[i].flag) {
+            printf("%s", capLabels[i].label);
+        }
+    }
+
+    printf("\n");
+
+    // Install software fallbacks for any operations the driver
+    // didn't implement in hardware
+    swInstallFallbacks(drv);
+
+    return true;
+}
+
+
+// ============================================================
+// accelRegisterDriver
+// ============================================================
+//
+// Appends drv to the detection list. NULL registrations and
+// registrations beyond MAX_DRIVERS are reported on stderr and
+// ignored.
+
+void accelRegisterDriver(AccelDriverT *drv) {
+    // A NULL entry would be dereferenced by accelDetect(); refuse it.
+    if (!drv) {
+        fprintf(stderr, "accelVid: Ignoring NULL driver registration\n");
+        return;
+    }
+
+    if (sDriverCount >= MAX_DRIVERS) {
+        fprintf(stderr, "accelVid: Too many drivers registered (max %d)\n",
+                MAX_DRIVERS);
+        return;
+    }
+
+    sDrivers[sDriverCount++] = drv;
+}
+
+
+// ============================================================
+// accelShutdown
+// ============================================================
+//
+// Waits for the engine to go idle, hides the hardware cursor,
+// calls the driver's shutdown() and clears drv->mode. Each
+// callback is optional and skipped when NULL; a NULL drv is a
+// no-op.
+
+void accelShutdown(AccelDriverT *drv) {
+    if (!drv) {
+        return;
+    }
+
+    if (drv->waitIdle) {
+        drv->waitIdle(drv);
+    }
+
+    if (drv->showCursor) {
+        drv->showCursor(drv, false);
+    }
+
+    if (drv->shutdown) {
+        drv->shutdown(drv);
+    }
+
+    memset(&drv->mode, 0, sizeof(drv->mode));
+}
+
+
+// ============================================================
+// Software fallback implementations
+// ============================================================
+//
+// These draw directly to the LFB. They're correct but slow
+// (uncached PCI writes). The point isn't performance -- it's
+// ensuring every operation is always callable so the caller
+// never needs to check for NULL function pointers.
+
+
+// ============================================================
+// swBitBlt
+// ============================================================
+//
+// Screen-to-screen blit via the LFB. Rows are copied with
+// memmove(), and the row order is chosen from the relative
+// position of source and destination so that overlapping
+// rectangles are copied correctly.
+
+static void swBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) {
+    if (w <= 0 || h <= 0) {
+        return;
+    }
+
+    uint8_t       *base          = drv->mode.framebuffer;
+    const int32_t  stride        = drv->mode.pitch;
+    const int32_t  bytesPerPixel = (drv->mode.bpp + 7) / 8;
+    const int32_t  lineBytes     = w * bytesPerPixel;
+
+    // Destination above (or level with and not right of) the source:
+    // walk top to bottom. Otherwise walk bottom to top so source rows
+    // are read before they are overwritten.
+    const bool    topDown = (dstY < srcY) || (dstY == srcY && dstX <= srcX);
+    const int32_t step    = topDown ? 1 : -1;
+    int32_t       line    = topDown ? 0 : h - 1;
+
+    for (int32_t copied = 0; copied < h; copied++, line += step) {
+        const uint8_t *from = base + (srcY + line) * stride + srcX * bytesPerPixel;
+        uint8_t       *to   = base + (dstY + line) * stride + dstX * bytesPerPixel;
+
+        memmove(to, from, lineBytes);
+    }
+}
+
+
+// ============================================================
+// swColorExpand
+// ============================================================
+//
+// Monochrome-to-color expansion via the LFB. Each 1-bit in srcBuf
+// becomes the fg color, each 0-bit becomes the bg color.
+
+static void swColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) {
+    if (w <= 0 || h <= 0) {
+        return;
+    }
+
+    for (int32_t line = 0; line < h; line++) {
+        const uint8_t *bits = srcBuf + line * srcPitch;
+
+        for (int32_t px = 0; px < w; px++) {
+            // MSB-first: pixel 0 of each byte is bit 7
+            const int32_t  mask  = 0x80 >> (px & 7);
+            const uint32_t color = (bits[px >> 3] & mask) ? fg : bg;
+
+            swPutPixel(drv, dstX + px, dstY + line, color);
+        }
+    }
+}
+
+
+// ============================================================
+// swHostBlit
+// ============================================================
+//
+// CPU-to-screen blit via the LFB: one memcpy() per scanline from
+// the caller's buffer (srcPitch bytes per row) into the framebuffer.
+
+static void swHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) {
+    if (w <= 0 || h <= 0) {
+        return;
+    }
+
+    uint8_t       *base          = drv->mode.framebuffer;
+    const int32_t  stride        = drv->mode.pitch;
+    const int32_t  bytesPerPixel = (drv->mode.bpp + 7) / 8;
+    const int32_t  lineBytes     = w * bytesPerPixel;
+
+    for (int32_t line = 0; line < h; line++) {
+        memcpy(base + (dstY + line) * stride + dstX * bytesPerPixel,
+               srcBuf + line * srcPitch,
+               lineBytes);
+    }
+}
+
+
+// ============================================================
+// swLineDraw
+// ============================================================
+//
+// Bresenham line draw via the LFB, both endpoints included.
+// Every pixel is tested against the software clip rectangle
+// (sClipX/Y/W/H) before it is written.
+
+// Plot one pixel if it lies inside the software clip rectangle.
+static inline void swPlotClipped(AccelDriverT *drv, int32_t x, int32_t y, uint32_t color) {
+    if (x >= sClipX && x < sClipX + sClipW &&
+        y >= sClipY && y < sClipY + sClipH) {
+        swPutPixel(drv, x, y, color);
+    }
+}
+
+static void swLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) {
+    const int32_t spanX = abs(x2 - x1);
+    const int32_t spanY = abs(y2 - y1);
+    const int32_t stepX = (x1 < x2) ? 1 : -1;
+    const int32_t stepY = (y1 < y2) ? 1 : -1;
+    int32_t       error = spanX - spanY;
+    int32_t       curX  = x1;
+    int32_t       curY  = y1;
+
+    swPlotClipped(drv, curX, curY, color);
+
+    while (curX != x2 || curY != y2) {
+        const int32_t doubled = 2 * error;
+
+        if (doubled > -spanY) {
+            error -= spanY;
+            curX  += stepX;
+        }
+
+        if (doubled < spanX) {
+            error += spanX;
+            curY  += stepY;
+        }
+
+        swPlotClipped(drv, curX, curY, color);
+    }
+}
+
+
+// ============================================================
+// swRectFill
+// ============================================================
+//
+// Solid rectangle fill via the LFB. 8-bit modes use memset();
+// 2- and 4-byte pixels are stored one at a time. Other pixel
+// sizes are left untouched.
+
+static void swRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
+    if (w <= 0 || h <= 0) {
+        return;
+    }
+
+    uint8_t       *base          = drv->mode.framebuffer;
+    const int32_t  stride        = drv->mode.pitch;
+    const int32_t  bytesPerPixel = (drv->mode.bpp + 7) / 8;
+
+    for (int32_t line = 0; line < h; line++) {
+        uint8_t *rowStart = base + (y + line) * stride + x * bytesPerPixel;
+
+        if (bytesPerPixel == 1) {
+            memset(rowStart, (uint8_t)color, w);
+        } else if (bytesPerPixel == 2) {
+            uint16_t       *out   = (uint16_t *)rowStart;
+            const uint16_t  value = (uint16_t)color;
+
+            for (int32_t remaining = w; remaining > 0; remaining--) {
+                *out++ = value;
+            }
+        } else if (bytesPerPixel == 4) {
+            uint32_t *out = (uint32_t *)rowStart;
+
+            for (int32_t remaining = w; remaining > 0; remaining--) {
+                *out++ = color;
+            }
+        }
+    }
+}
+
+
+// ============================================================
+// swRectFillPat
+// ============================================================
+//
+// 8x8 monochrome pattern fill via the LFB. The pattern is 8 bytes,
+// one bit per pixel, MSB-first, row 0 first. Each 1-bit gets the
+// fg color, each 0-bit gets the bg color. The pattern tiles across
+// the destination rectangle with alignment to screen coordinates
+// (so patterns line up across adjacent fills).
+
+static void swRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) {
+    if (w <= 0 || h <= 0) {
+        return;
+    }
+
+    for (int32_t line = 0; line < h; line++) {
+        // Pattern row and bit are taken from the absolute screen
+        // position, not the offset inside the rectangle.
+        const uint8_t rowBits = pattern[(y + line) & 7];
+
+        for (int32_t px = 0; px < w; px++) {
+            const int32_t  mask  = 0x80 >> ((x + px) & 7);
+            const uint32_t color = (rowBits & mask) ? fg : bg;
+
+            swPutPixel(drv, x + px, y + line, color);
+        }
+    }
+}
+
+
+// ============================================================
+// swSetClip
+// ============================================================
+//
+// Records the clip rectangle that swLineDraw() tests pixels
+// against. The driver argument is unused.
+
+static void swSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) {
+    (void)drv;
+
+    sClipX = x;
+    sClipW = w;
+    sClipY = y;
+    sClipH = h;
+}
+
+
+// ============================================================
+// swWaitIdle
+// ============================================================
+//
+// Nothing to wait for: the software operations have already
+// finished by the time they return.
+
+static void swWaitIdle(AccelDriverT *drv) {
+    (void)drv;
+}
+
+
+// ============================================================
+// swInstallFallbacks
+// ============================================================
+//
+// Fills in software implementations for any NULL function
+// pointers in the driver struct. Called by accelInit() after
+// the chip driver's init() succeeds. This guarantees that
+// every drawing operation is always callable.
+
+static void swInstallFallbacks(AccelDriverT *drv) {
+    int32_t installed = 0;
+
+    // Initialize the software clip rect to full screen.
+    // NOTE(review): sClip* only follow the application's clip when
+    // swSetClip is installed below. If the driver supplies its own
+    // setClip, swLineDraw keeps clipping against this full-screen
+    // rectangle -- confirm that is intended.
+    sClipX = 0;
+    sClipY = 0;
+    sClipW = drv->mode.width;
+    sClipH = drv->mode.height;
+
+    // A missing waitIdle is replaced as well, but it is not part of
+    // the count reported below.
+    if (drv->waitIdle == NULL) {
+        drv->waitIdle = swWaitIdle;
+    }
+
+    if (drv->setClip == NULL) {
+        drv->setClip = swSetClip;
+        installed++;
+    }
+
+    if (drv->rectFill == NULL) {
+        drv->rectFill = swRectFill;
+        installed++;
+    }
+
+    if (drv->bitBlt == NULL) {
+        drv->bitBlt = swBitBlt;
+        installed++;
+    }
+
+    if (drv->hostBlit == NULL) {
+        drv->hostBlit = swHostBlit;
+        installed++;
+    }
+
+    if (drv->colorExpand == NULL) {
+        drv->colorExpand = swColorExpand;
+        installed++;
+    }
+
+    if (drv->rectFillPat == NULL) {
+        drv->rectFillPat = swRectFillPat;
+        installed++;
+    }
+
+    if (drv->lineDraw == NULL) {
+        drv->lineDraw = swLineDraw;
+        installed++;
+    }
+
+    // Cursor operations get no software fallback here.
+    if (installed > 0) {
+        printf("accelVid: %ld operation(s) using software fallback\n",
+               (long)installed);
+    }
+}
diff --git a/accelVid.h b/accelVid.h
new file mode 100644
index 0000000..4ed2d07
--- /dev/null
+++ b/accelVid.h
@@ -0,0 +1,257 @@
+// accelVid.h -- Accelerated video driver abstraction for DOS
+//
+// Defines the common interface that all hardware-specific video
+// drivers implement. Each driver fills in an AccelDriverT struct
+// with function pointers for its accelerated operations and sets
+// capability flags indicating which operations are hardware-backed.
+//
+// The driver manager (accelVid.c) iterates registered drivers,
+// calls detect() on each, and returns the first match. The caller
+// then uses the function pointers directly -- no dispatch overhead
+// beyond the initial detection.
+//
+// Operations that aren't hardware-accelerated on a given chip
+// should be left as NULL. accelInit() replaces NULL drawing
+// operations with software fallbacks, so they are always callable.
+// Capability flags in AccelDriverT.caps indicate which operations
+// are hardware-backed; cursor operations get no fallback.
+//
+// All coordinates and dimensions are in pixels.
Colors are packed +// in the display's native pixel format (same as DVX's packColor). +#ifndef ACCEL_VID_H +#define ACCEL_VID_H + +#include +#include + +#include "pci.h" + +// ============================================================ +// Capability flags +// ============================================================ +// +// Bit flags indicating which operations are hardware-accelerated. +// A driver sets these in its caps field during detect/init. The +// caller can test (drv->caps & ACAP_xxx) to decide whether to +// use hardware or fall back to software. + +#define ACAP_RECT_FILL 0x00000001 // solid rectangle fill +#define ACAP_RECT_FILL_PAT 0x00000002 // pattern rectangle fill (8x8) +#define ACAP_BITBLT 0x00000004 // screen-to-screen blit +#define ACAP_COLOR_EXPAND 0x00000008 // mono-to-color expansion (text/glyphs) +#define ACAP_LINE_DRAW 0x00000010 // Bresenham line drawing +#define ACAP_HW_CURSOR 0x00000020 // hardware sprite cursor +#define ACAP_HOST_BLIT 0x00000040 // CPU-to-screen blit (image upload) +#define ACAP_CLIP 0x00000080 // hardware clip rectangle +#define ACAP_TRANSPARENCY 0x00000100 // transparent blit (color key) + +// ============================================================ +// Raster operation codes +// ============================================================ +// +// Standard Microsoft/GDI ROP codes used by Windows drivers. +// These map to the 256 possible ternary raster operations, but +// we only define the commonly used ones. The hardware engines +// typically support these natively. 
+ +#define ROP_COPY 0xCC // dest = src +#define ROP_PAT_COPY 0xF0 // dest = pattern +#define ROP_ZERO 0x00 // dest = 0 (black) +#define ROP_ONE 0xFF // dest = 1 (white) +#define ROP_SRC_AND 0x88 // dest = src AND dest +#define ROP_SRC_OR 0xEE // dest = src OR dest +#define ROP_SRC_XOR 0x66 // dest = src XOR dest +#define ROP_NOT 0x55 // dest = NOT dest +#define ROP_PAT_AND 0xA0 // dest = pat AND dest +#define ROP_PAT_OR 0xFA // dest = pat OR dest +#define ROP_PAT_XOR 0x5A // dest = pat XOR dest + +// ============================================================ +// Hardware cursor image format +// ============================================================ +// +// Hardware cursors use a 2-bit-per-pixel AND/XOR format: +// AND=0, XOR=0 -> cursor color 0 (background) +// AND=0, XOR=1 -> cursor color 1 (foreground) +// AND=1, XOR=0 -> transparent (screen shows through) +// AND=1, XOR=1 -> inverted (screen pixel is inverted) +// +// Most chips support 64x64 cursors (S3, Matrox, ATI, Tseng W32p). +// Older Cirrus (GD5426/28) support only 32x32. 
+ +#define HW_CURSOR_MAX_SIZE 64 + +typedef struct { + int32_t width; + int32_t height; + int32_t hotX; + int32_t hotY; + uint8_t andMask[HW_CURSOR_MAX_SIZE * HW_CURSOR_MAX_SIZE / 8]; + uint8_t xorMask[HW_CURSOR_MAX_SIZE * HW_CURSOR_MAX_SIZE / 8]; +} HwCursorImageT; + +// ============================================================ +// Video mode request / result +// ============================================================ + +typedef struct { + int32_t width; + int32_t height; + int32_t bpp; // requested bits per pixel (8, 15, 16, 32) +} AccelModeRequestT; + +typedef struct { + int32_t width; + int32_t height; + int32_t bpp; + int32_t pitch; // bytes per scanline (may be > width * bytesPerPixel) + uint8_t *framebuffer; // mapped linear framebuffer pointer + uint32_t vramSize; // total video RAM in bytes + uint32_t offscreenBase; // offset to start of offscreen VRAM (for allocations) +} AccelModeResultT; + +// ============================================================ +// Driver structure +// ============================================================ +// +// Each chip driver provides a statically-allocated AccelDriverT +// and registers it with accelRegisterDriver(). The driver manager +// calls detect() on each registered driver during accelInit(). +// +// The init() function receives a mode request and returns detailed +// mode info. It is responsible for: +// - Programming the CRTC/sequencer for the requested mode +// - Enabling the linear framebuffer +// - Unlocking the acceleration engine +// - Setting up MMIO mappings if needed +// +// All accelerated drawing functions must call waitIdle() internally +// before returning if the operation is asynchronous. The explicit +// waitIdle() in the API is for synchronization points where the +// caller needs to read back from VRAM after a series of operations. + +typedef struct AccelDriverT { + // Driver identification + const char *name; // human-readable name (e.g. 
"S3 Trio64") + const char *chipFamily; // family identifier (e.g. "s3", "cirrus") + uint32_t caps; // ACAP_xxx capability flags + + // PCI device info (filled by detect) + PciDeviceT pciDev; + + // Current mode info (filled by init) + AccelModeResultT mode; + + // -------------------------------------------------------- + // Lifecycle + // -------------------------------------------------------- + + // Probe for this chip. Returns true if this driver's hardware + // is present. Must not change any hardware state. + bool (*detect)(struct AccelDriverT *drv); + + // Initialize the chip: set the requested video mode, enable + // acceleration, map the framebuffer. Returns true on success. + bool (*init)(struct AccelDriverT *drv, const AccelModeRequestT *req); + + // Shut down: restore text mode, disable acceleration, unmap + // memory. Safe to call even if init() was never called. + void (*shutdown)(struct AccelDriverT *drv); + + // -------------------------------------------------------- + // Synchronization + // -------------------------------------------------------- + + // Wait until the acceleration engine is idle. All pending + // drawing commands must complete before this returns. + void (*waitIdle)(struct AccelDriverT *drv); + + // -------------------------------------------------------- + // Hardware clip rectangle + // -------------------------------------------------------- + + // Set the hardware clip rectangle. All subsequent drawing + // operations are clipped to this region. Pass full-screen + // dimensions to disable clipping. + void (*setClip)(struct AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); + + // -------------------------------------------------------- + // Accelerated drawing operations + // -------------------------------------------------------- + + // Solid rectangle fill. 
+ void (*rectFill)(struct AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); + + // Pattern rectangle fill (8x8 pattern, one color + transparent + // or two-color). Pattern data is 8 bytes, one bit per pixel, + // MSB-first, top row first. + void (*rectFillPat)(struct AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); + + // Screen-to-screen blit. + void (*bitBlt)(struct AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); + + // CPU-to-screen blit: transfer pixels from system RAM to VRAM. + // srcBuf points to packed pixel data in display format. + // srcPitch is the byte stride of the source buffer. + void (*hostBlit)(struct AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); + + // Monochrome color expansion: convert 1bpp bitmap data to + // full-color pixels. Used for fast text/glyph rendering. + // srcBuf is packed MSB-first, one bit per pixel. + // srcPitch is the byte stride between rows. + void (*colorExpand)(struct AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); + + // Bresenham line draw (inclusive endpoints). + void (*lineDraw)(struct AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); + + // -------------------------------------------------------- + // Hardware cursor + // -------------------------------------------------------- + + // Set the cursor image. Called when the cursor shape changes. + void (*setCursor)(struct AccelDriverT *drv, const HwCursorImageT *image); + + // Move the cursor to a screen position. Called every mouse poll. + void (*moveCursor)(struct AccelDriverT *drv, int32_t x, int32_t y); + + // Show or hide the hardware cursor. 
+ void (*showCursor)(struct AccelDriverT *drv, bool visible); + + // -------------------------------------------------------- + // Private driver data + // -------------------------------------------------------- + + // Opaque pointer for chip-specific state (MMIO base address, + // current engine state, etc.). Each driver allocates and manages + // its own private data. + void *privData; + +} AccelDriverT; + +// ============================================================ +// Driver manager API +// ============================================================ + +// Register a driver with the manager. Call once per driver at +// startup (typically from main before accelInit). Drivers are +// probed in registration order. +void accelRegisterDriver(AccelDriverT *drv); + +// Probe all registered drivers and return the first one whose +// detect() succeeds. Returns NULL if no supported hardware is found. +AccelDriverT *accelDetect(void); + +// Initialize the detected driver with the given mode. +// Returns true on success. On failure the driver is not usable. +bool accelInit(AccelDriverT *drv, const AccelModeRequestT *req); + +// Shut down the active driver and restore text mode. +void accelShutdown(AccelDriverT *drv); + +// Return the driver name string for display. +const char *accelGetName(const AccelDriverT *drv); + +// Return the capability flags for the active driver. +uint32_t accelGetCaps(const AccelDriverT *drv); + +#endif // ACCEL_VID_H diff --git a/atiMach64.c b/atiMach64.c new file mode 100644 index 0000000..83b4065 --- /dev/null +++ b/atiMach64.c @@ -0,0 +1,960 @@ +// atiMach64.c -- ATI Mach64 / Rage accelerated video driver +// +// Supports the ATI Mach64 family: GX, CX, CT, ET, VT, GT (Rage II), +// and Rage Pro. 
These were among the most capable 2D accelerators +// of the mid-1990s, with features including: +// - Solid and pattern rectangle fill +// - Screen-to-screen BitBLT +// - Host-to-screen blit (CPU data transfer) +// - Monochrome color expansion +// - Bresenham line draw +// - Trapezoid fill +// - Hardware scissor rectangle +// - 64x64 two-color hardware cursor +// +// Register access: +// The Mach64 has two register access methods: +// 1. I/O port: registers at block I/O base + offset. The base +// is typically 0x02EC for Mach64, determined by CONFIG_CHIP_ID. +// 2. MMIO: register block at end of LFB (BAR0 + aperture_size - 1KB) +// or via a dedicated BAR. +// +// We use MMIO for speed. The register block is 1KB at the end +// of the aperture (LFB base + size - 0x400 on most variants, +// or LFB base + size - 0x800 for 8MB apertures). +// +// Some early Mach64 chips (GX/CX) may not support MMIO well; +// for those we fall back to I/O port access. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include + +// ============================================================ +// ATI vendor/device IDs +// ============================================================ + +#define ATI_VENDOR_ID 0x1002 + +#define ATI_MACH64_GX 0x4758 // Mach64 GX +#define ATI_MACH64_CX 0x4358 // Mach64 CX +#define ATI_MACH64_CT 0x4354 // Mach64 CT +#define ATI_MACH64_ET 0x4554 // Mach64 ET +#define ATI_MACH64_VT 0x5654 // Mach64 VT +#define ATI_MACH64_VT_B 0x5655 // Mach64 VT-B +#define ATI_MACH64_GT 0x4754 // Mach64 GT (3D Rage II) +#define ATI_MACH64_GT_B 0x4755 // Mach64 GT-B (3D Rage II+) +#define ATI_RAGE_PRO 0x4750 // Rage Pro +#define ATI_RAGE_PRO_AGP 0x4752 // Rage Pro AGP +#define ATI_RAGE_XL_PCI 0x4752 // Rage XL PCI (shares ID with Pro AGP) +#define ATI_RAGE_128_RE 0x5245 // Rage 128 RE +#define ATI_RAGE_128_RF 0x5246 // Rage 128 RF +#define ATI_RAGE_128_RK 0x524B // Rage 128 RK +#define ATI_RAGE_128_RL 0x524C // Rage 128 RL +#define 
ATI_RAGE_128_PRO_PF 0x5046 // Rage 128 Pro PF +#define ATI_RAGE_128_PRO_PR 0x5052 // Rage 128 Pro PR +#define ATI_RAGE_FURY 0x5046 // Rage Fury (same as 128 Pro PF) + +static const uint16_t sAtiDeviceIds[] = { + ATI_VENDOR_ID, ATI_MACH64_GX, + ATI_VENDOR_ID, ATI_MACH64_CX, + ATI_VENDOR_ID, ATI_MACH64_CT, + ATI_VENDOR_ID, ATI_MACH64_ET, + ATI_VENDOR_ID, ATI_MACH64_VT, + ATI_VENDOR_ID, ATI_MACH64_VT_B, + ATI_VENDOR_ID, ATI_MACH64_GT, + ATI_VENDOR_ID, ATI_MACH64_GT_B, + ATI_VENDOR_ID, ATI_RAGE_PRO, + ATI_VENDOR_ID, ATI_RAGE_PRO_AGP, + ATI_VENDOR_ID, ATI_RAGE_128_RE, + ATI_VENDOR_ID, ATI_RAGE_128_RF, + ATI_VENDOR_ID, ATI_RAGE_128_RK, + ATI_VENDOR_ID, ATI_RAGE_128_RL, + ATI_VENDOR_ID, ATI_RAGE_128_PRO_PF, + ATI_VENDOR_ID, ATI_RAGE_128_PRO_PR, + 0, 0 +}; + +// ============================================================ +// Mach64 register offsets (from MMIO base) +// ============================================================ +// +// The Mach64 has a flat register space. For I/O access, these +// offsets are added to the I/O base port. For MMIO, they're +// byte offsets from the MMIO base address. 
+ +// Drawing engine source registers +#define ATI_SRC_OFF_PITCH 0x0000 // source offset and pitch +#define ATI_SRC_Y 0x0004 // source Y +#define ATI_SRC_X 0x0008 // source X (alias: SRC_HEIGHT1) +#define ATI_SRC_Y_X 0x000C // source Y and X combined +#define ATI_SRC_WIDTH1 0x0010 +#define ATI_SRC_HEIGHT1 0x0014 + +// Drawing engine destination registers +#define ATI_DST_OFF_PITCH 0x0040 // destination offset and pitch +#define ATI_DST_Y 0x0044 +#define ATI_DST_X 0x0048 +#define ATI_DST_Y_X 0x004C +#define ATI_DST_HEIGHT 0x0050 +#define ATI_DST_WIDTH 0x0054 +#define ATI_DST_HEIGHT_WIDTH 0x0058 // triggers blit +#define ATI_DST_X_WIDTH 0x005C +#define ATI_DST_BRES_ERR 0x0064 +#define ATI_DST_BRES_INC 0x0068 +#define ATI_DST_BRES_DEC 0x006C +#define ATI_DST_BRES_LNTH 0x0070 +#define ATI_DST_BRES_LNTH_END 0x0074 // triggers line draw + +// Host data (CPU-to-screen) +#define ATI_HOST_DATA0 0x0200 + +// Scissor registers +#define ATI_SC_LEFT 0x00A0 +#define ATI_SC_RIGHT 0x00A4 +#define ATI_SC_TOP 0x00A8 +#define ATI_SC_BOTTOM 0x00AC + +// Drawing processor registers +#define ATI_DP_BKGD_CLR 0x00B0 +#define ATI_DP_FRGD_CLR 0x00B4 +#define ATI_DP_WRITE_MASK 0x00B8 +#define ATI_DP_CHAIN_MASK 0x00BC +#define ATI_DP_PIX_WIDTH 0x00D0 +#define ATI_DP_MIX 0x00D4 +#define ATI_DP_SRC 0x00D8 + +// Clock/config +#define ATI_CLR_CMP_CNTL 0x0100 +#define ATI_GUI_TRAJ_CNTL 0x00CC +#define ATI_GUI_STAT 0x00CE // I/O only; for MMIO see below + +// FIFO and status (MMIO addresses) +#define ATI_FIFO_STAT 0x0310 +#define ATI_GUI_STAT_MMIO 0x0338 + +// Hardware cursor +#define ATI_CUR_CLR0 0x0260 +#define ATI_CUR_CLR1 0x0264 +#define ATI_CUR_OFFSET 0x0268 +#define ATI_CUR_HORZ_VERT_POSN 0x026C +#define ATI_CUR_HORZ_VERT_OFF 0x0270 +#define ATI_GEN_TEST_CNTL 0x0034 // general test/cursor control + +// Memory config +#define ATI_MEM_CNTL 0x0140 + +// I/O and MMIO constants +#define ATI_IO_BASE_DEFAULT 0x02EC // default block I/O base port +#define ATI_MMIO_SIZE 0x0400 // MMIO block size (1KB 
at end of aperture) +#define ATI_CONFIG_CHIP_ID 0x00E0 + +// ============================================================ +// Mach64 DP_MIX values +// ============================================================ +// +// The drawing processor MIX register controls the raster operation +// for foreground (bits 20:16) and background (bits 4:0). + +#define ATI_MIX_NOT_DST 0x00 +#define ATI_MIX_ZERO 0x01 +#define ATI_MIX_ONE 0x02 +#define ATI_MIX_DST 0x03 +#define ATI_MIX_NOT_SRC 0x04 +#define ATI_MIX_XOR 0x05 +#define ATI_MIX_XNOR 0x06 +#define ATI_MIX_COPY 0x07 // dest = source (most common) +#define ATI_MIX_NOT_SRC_AND 0x08 +#define ATI_MIX_SRC_AND_DST 0x0C +#define ATI_MIX_SRC_OR_DST 0x0E + +// Foreground mix is in bits 20:16, background in bits 4:0 +#define ATI_FRGD_MIX(rop) ((uint32_t)(rop) << 16) +#define ATI_BKGD_MIX(rop) ((uint32_t)(rop)) + +// ============================================================ +// Mach64 DP_SRC values +// ============================================================ + +#define ATI_SRC_BKGD_CLR 0x00 // background color register +#define ATI_SRC_FRGD_CLR 0x01 // foreground color register +#define ATI_SRC_HOST 0x02 // CPU host data +#define ATI_SRC_BLIT 0x03 // video memory (blit) +#define ATI_SRC_PATTERN 0x04 // pattern register + +// DP_SRC packs three source selects: mono src (bits 10:8), +// foreground src (bits 18:16 on some, or bits 10:8), background src +// In practice, the format is: +// bits 2:0 = background source +// bits 10:8 = foreground source +// bits 18:16 = mono source (for color expand) + +#define ATI_DP_SRC_BKGD(s) ((uint32_t)(s)) +#define ATI_DP_SRC_FRGD(s) ((uint32_t)(s) << 8) +#define ATI_DP_SRC_MONO(s) ((uint32_t)(s) << 16) + +// ============================================================ +// Mach64 DP_PIX_WIDTH values +// ============================================================ + +#define ATI_PIX_8BPP 0x02 +#define ATI_PIX_15BPP 0x03 +#define ATI_PIX_16BPP 0x04 +#define ATI_PIX_32BPP 0x06 + +// HOST 
byte/word/dword order -- use native (little-endian) +#define ATI_HOST_BYTE_ORDER 0x00 + +// GUI_TRAJ_CNTL direction bits +#define ATI_DST_X_DIR_LEFT 0x00 +#define ATI_DST_X_DIR_RIGHT 0x01 +#define ATI_DST_Y_DIR_UP 0x00 +#define ATI_DST_Y_DIR_DOWN 0x02 + +// GUI_STAT busy bit +#define ATI_GUI_STAT_BUSY 0x00000001 +#define ATI_FIFO_STAT_MASK 0x0000FFFF + +// Hardware cursor size +#define ATI_HW_CURSOR_SIZE 64 +#define ATI_HW_CURSOR_BYTES 1024 // 64*64*2bpp/8 + +// Maximum wait iterations +#define ATI_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; // mapped MMIO register base + uint32_t mmioPhysAddr; + bool useIo; // fall back to I/O on old GX/CX + uint16_t ioBase; // I/O base port for register access + DpmiMappingT lfbMapping; +} AtiPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void atiBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void atiColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool atiDetect(AccelDriverT *drv); +static void atiHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool atiInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void atiLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void atiMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void atiRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, 
uint32_t color); +static void atiRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void atiSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void atiSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void atiShowCursor(AccelDriverT *drv, bool visible); +static void atiShutdown(AccelDriverT *drv); +static void atiWaitFifo(AtiPrivateT *priv, int32_t entries); +static void atiWaitIdle(AccelDriverT *drv); +static void atiWriteReg(AtiPrivateT *priv, uint32_t reg, uint32_t val); +static uint32_t atiReadReg(AtiPrivateT *priv, uint32_t reg); + +// ============================================================ +// Driver instance +// ============================================================ + +static AtiPrivateT sAtiPrivate; + +static AccelDriverT sAtiDriver = { + .name = "ATI Mach64", + .chipFamily = "ati", + .caps = 0, + .privData = &sAtiPrivate, + .detect = atiDetect, + .init = atiInit, + .shutdown = atiShutdown, + .waitIdle = atiWaitIdle, + .setClip = atiSetClip, + .rectFill = atiRectFill, + .rectFillPat = atiRectFillPat, + .bitBlt = atiBitBlt, + .hostBlit = atiHostBlit, + .colorExpand = atiColorExpand, + .lineDraw = atiLineDraw, + .setCursor = atiSetCursor, + .moveCursor = atiMoveCursor, + .showCursor = atiShowCursor, +}; + +// ============================================================ +// atiRegisterDriver +// ============================================================ + +void atiRegisterDriver(void) { + accelRegisterDriver(&sAtiDriver); +} + + +// ============================================================ +// atiReadReg / atiWriteReg +// ============================================================ +// +// Register access abstraction. Uses MMIO when available, falls +// back to I/O port access on older chips. 
+ +static uint32_t atiReadReg(AtiPrivateT *priv, uint32_t reg) { + if (priv->useIo) { + return inportl(priv->ioBase + reg); + } + + return priv->mmio[reg / 4]; +} + +static void atiWriteReg(AtiPrivateT *priv, uint32_t reg, uint32_t val) { + if (priv->useIo) { + outportl(priv->ioBase + reg, val); + return; + } + + priv->mmio[reg / 4] = val; +} + + +// ============================================================ +// atiBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. The Mach64 engine handles overlapping +// regions automatically based on the trajectory control register. + +static void atiBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Determine blit direction + uint32_t direction = ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN; + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (srcX < dstX) { + direction &= ~ATI_DST_X_DIR_RIGHT; + sx += w - 1; + dx += w - 1; + } + if (srcY < dstY) { + direction &= ~ATI_DST_Y_DIR_DOWN; + sy += h - 1; + dy += h - 1; + } + + atiWaitFifo(priv, 7); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, direction); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_BLIT)); + atiWriteReg(priv, ATI_SRC_Y_X, ((uint32_t)sx << 16) | (uint32_t)sy); + atiWriteReg(priv, ATI_SRC_WIDTH1, w); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)dx << 16) | (uint32_t)dy); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); +} + + +// ============================================================ +// atiColorExpand +// ============================================================ +// +// Monochrome-to-color expansion via the host data path. 
+// Converts 1bpp source bitmap to full-color pixels using the +// Mach64 engine. Source data is packed MSB-first, padded to +// dword boundaries per scanline. + +static void atiColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Number of dwords per scanline of monochrome data + int32_t dwordsPerRow = (w + 31) / 32; + + // Set up color expand: mono source from host, fg/bg from color regs + atiWaitFifo(priv, 7); + atiWriteReg(priv, ATI_DP_FRGD_CLR, fg); + atiWriteReg(priv, ATI_DP_BKGD_CLR, bg); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_MONO(ATI_SRC_HOST) | ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR) | ATI_DP_SRC_BKGD(ATI_SRC_BKGD_CLR)); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)dstX << 16) | (uint32_t)dstY); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); + + // Feed monochrome data row by row through HOST_DATA0 + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + // Pack bytes into a dword (MSB-first bit order) + int32_t byteOff = dw * 4; + uint32_t data = 0; + + for (int32_t b = 0; b < 4; b++) { + uint8_t srcByte = 0; + if (byteOff + b < srcPitch) { + srcByte = rowPtr[byteOff + b]; + } + data |= (uint32_t)srcByte << (24 - b * 8); + } + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_HOST_DATA0, data); + } + } +} + + +// ============================================================ +// atiDetect +// ============================================================ + +static bool atiDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sAtiDeviceIds, &drv->pciDev, 
&matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case ATI_MACH64_GX: + drv->name = "ATI Mach64 GX"; + break; + case ATI_MACH64_CX: + drv->name = "ATI Mach64 CX"; + break; + case ATI_MACH64_CT: + drv->name = "ATI Mach64 CT"; + break; + case ATI_MACH64_ET: + drv->name = "ATI Mach64 ET"; + break; + case ATI_MACH64_VT: + case ATI_MACH64_VT_B: + drv->name = "ATI Mach64 VT"; + break; + case ATI_MACH64_GT: + case ATI_MACH64_GT_B: + drv->name = "ATI 3D Rage II"; + break; + case ATI_RAGE_PRO: + case ATI_RAGE_PRO_AGP: + drv->name = "ATI Rage Pro"; + break; + case ATI_RAGE_128_RE: + case ATI_RAGE_128_RF: + case ATI_RAGE_128_RK: + case ATI_RAGE_128_RL: + drv->name = "ATI Rage 128"; + break; + case ATI_RAGE_128_PRO_PF: + case ATI_RAGE_128_PRO_PR: + drv->name = "ATI Rage 128 Pro"; + break; + default: + drv->name = "ATI Mach64"; + break; + } + + return true; +} + + +// ============================================================ +// atiHostBlit +// ============================================================ +// +// CPU-to-screen blit. Transfers pixel data from system memory +// to VRAM through the Mach64 host data registers. 
+ +static void atiHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerPixel = priv->bytesPerPixel; + int32_t rowBytes = w * bytesPerPixel; + int32_t dwordsPerRow = (rowBytes + 3) / 4; + + // Set up host-to-screen blit + atiWaitFifo(priv, 5); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_HOST)); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)dstX << 16) | (uint32_t)dstY); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); + + // Write pixel data row by row through HOST_DATA0 + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + int32_t byteOff = dw * 4; + uint32_t data = 0; + + // Pack bytes into a dword (little-endian native order) + for (int32_t b = 0; b < 4; b++) { + if (byteOff + b < rowBytes) { + data |= (uint32_t)rowPtr[byteOff + b] << (b * 8); + } + } + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_HOST_DATA0, data); + } + } +} + + +// ============================================================ +// atiInit +// ============================================================ + +static bool atiInit(AccelDriverT *drv, const AccelModeRequestT *req) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + // Determine if this is an old GX/CX (I/O only) or newer (MMIO) + priv->useIo = (drv->pciDev.deviceId == ATI_MACH64_GX + || drv->pciDev.deviceId == ATI_MACH64_CX); + priv->ioBase = ATI_IO_BASE_DEFAULT; + + // Get LFB address and size from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + 
priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + + uint32_t barSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + + // Aperture size != VRAM size on Mach64 (aperture is typically 8MB) + // Read actual VRAM from MEM_CNTL register + uint32_t memCntl; + if (priv->useIo) { + memCntl = inportl(priv->ioBase + ATI_MEM_CNTL); + } else { + // Need a temporary MMIO mapping to read MEM_CNTL + // MMIO is at the end of the aperture + priv->mmioPhysAddr = priv->lfbPhysAddr + barSize - ATI_MMIO_SIZE; + memCntl = 0; // will determine from aperture size + } + + // Determine VRAM size + if (memCntl != 0) { + uint32_t memSize = memCntl & 0x07; + switch (memSize) { + case 0: priv->vramSize = 512 * 1024; break; + case 1: priv->vramSize = 1024 * 1024; break; + case 2: priv->vramSize = 2 * 1024 * 1024; break; + case 3: priv->vramSize = 4 * 1024 * 1024; break; + case 4: priv->vramSize = 6 * 1024 * 1024; break; + case 5: priv->vramSize = 8 * 1024 * 1024; break; + default: priv->vramSize = 2 * 1024 * 1024; break; + } + } else { + // Conservative fallback + priv->vramSize = (barSize > 8 * 1024 * 1024) ? 
4 * 1024 * 1024 : barSize; + } + + // Set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB + MMIO region (map entire aperture; MMIO is at end) + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, barSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Set up MMIO pointer at end of aperture + if (!priv->useIo) { + priv->mmio = (volatile uint32_t *)(priv->lfbMapping.ptr + barSize - ATI_MMIO_SIZE); + } + + // Configure the drawing engine pixel width + uint32_t pixWidth; + switch (vesa.bpp) { + case 8: pixWidth = ATI_PIX_8BPP; break; + case 15: pixWidth = ATI_PIX_15BPP; break; + case 16: pixWidth = ATI_PIX_16BPP; break; + case 32: pixWidth = ATI_PIX_32BPP; break; + default: pixWidth = ATI_PIX_16BPP; break; + } + + // DP_PIX_WIDTH: set all fields to the same depth + uint32_t dpPixWidth = pixWidth + | (pixWidth << 4) // host data + | (pixWidth << 8) // source + | (pixWidth << 16) // destination + | (pixWidth << 28); // default + atiWaitFifo(priv, 2); + atiWriteReg(priv, ATI_DP_PIX_WIDTH, dpPixWidth); + atiWriteReg(priv, ATI_DP_WRITE_MASK, 0xFFFFFFFF); + + // Set DST_OFF_PITCH: offset = 0, pitch in units of 8 pixels + uint32_t pitch8 = vesa.pitch / priv->bytesPerPixel / 8; + atiWriteReg(priv, ATI_DST_OFF_PITCH, pitch8 << 22); + atiWriteReg(priv, ATI_SRC_OFF_PITCH, pitch8 << 22); + + // Set up cursor at end of VRAM + priv->cursorOffset = priv->vramSize - ATI_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(ATI_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | ACAP_BITBLT + | ACAP_HOST_BLIT + | 
ACAP_COLOR_EXPAND + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + atiSetClip(drv, 0, 0, vesa.width, vesa.height); + + atiWaitIdle(drv); + return true; +} + + +// ============================================================ +// atiLineDraw +// ============================================================ +// +// Bresenham line draw using the Mach64 DST_BRES registers. + +static void atiLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + int32_t dx = x2 - x1; + int32_t dy = y2 - y1; + + uint32_t direction = ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN; + + if (dx < 0) { + dx = -dx; + direction &= ~ATI_DST_X_DIR_RIGHT; + } + if (dy < 0) { + dy = -dy; + direction &= ~ATI_DST_Y_DIR_DOWN; + } + + int32_t majAxis; + int32_t minAxis; + + if (dx >= dy) { + majAxis = dx; + minAxis = dy; + } else { + majAxis = dy; + minAxis = dx; + // Swap X/Y major + direction |= 0x04; // Y major axis select + } + + if (majAxis == 0) { + return; + } + + int32_t errTerm = 2 * minAxis - majAxis; + int32_t errInc = 2 * minAxis; + int32_t errDec = 2 * (minAxis - majAxis); + + atiWaitFifo(priv, 8); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, direction); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR)); + atiWriteReg(priv, ATI_DP_FRGD_CLR, color); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)x1 << 16) | (uint32_t)y1); + atiWriteReg(priv, ATI_DST_BRES_ERR, errTerm); + atiWriteReg(priv, ATI_DST_BRES_INC, errInc); + atiWriteReg(priv, ATI_DST_BRES_DEC, errDec); + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_DST_BRES_LNTH, majAxis + 1); +} + + +// ============================================================ +// atiMoveCursor +// ============================================================ + +static void atiMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + AtiPrivateT *priv = (AtiPrivateT 
*)drv->privData; + + uint32_t offset = 0; + + if (x < 0) { + offset |= ((-x) & 0x3F) << 16; + x = 0; + } + if (y < 0) { + offset |= (-y) & 0x3F; + y = 0; + } + + atiWriteReg(priv, ATI_CUR_HORZ_VERT_OFF, offset); + atiWriteReg(priv, ATI_CUR_HORZ_VERT_POSN, + ((uint32_t)x << 16) | (uint32_t)y); +} + + +// ============================================================ +// atiRectFill +// ============================================================ + +static void atiRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + atiWaitFifo(priv, 5); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR)); + atiWriteReg(priv, ATI_DP_FRGD_CLR, color); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)x << 16) | (uint32_t)y); + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); +} + + +// ============================================================ +// atiRectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using the host data path. The pattern is +// 8 bytes (one per row, MSB-first), tiled across the rectangle. +// 1-bits use the foreground color, 0-bits use the background. +// Data is fed through HOST_DATA0, repeating the 8-row pattern +// for the full height, with each row padded to a dword boundary. 
+ +static void atiRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Number of dwords per scanline of monochrome data + int32_t dwordsPerRow = (w + 31) / 32; + + // Set up color expand: mono source from host, fg/bg from color regs + atiWaitFifo(priv, 7); + atiWriteReg(priv, ATI_DP_FRGD_CLR, fg); + atiWriteReg(priv, ATI_DP_BKGD_CLR, bg); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_MONO(ATI_SRC_HOST) | ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR) | ATI_DP_SRC_BKGD(ATI_SRC_BKGD_CLR)); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)x << 16) | (uint32_t)y); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); + + // Feed tiled pattern data through HOST_DATA0 + for (int32_t row = 0; row < h; row++) { + uint8_t patByte = pattern[row & 7]; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + // Replicate the pattern byte across all 4 bytes of the dword. + // MSB-first bit order: place the pattern byte in the high byte. 
+ uint32_t data = ((uint32_t)patByte << 24) + | ((uint32_t)patByte << 16) + | ((uint32_t)patByte << 8) + | (uint32_t)patByte; + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_HOST_DATA0, data); + } + } +} + + +// ============================================================ +// atiSetClip +// ============================================================ + +static void atiSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + atiWaitFifo(priv, 4); + atiWriteReg(priv, ATI_SC_LEFT, x); + atiWriteReg(priv, ATI_SC_TOP, y); + atiWriteReg(priv, ATI_SC_RIGHT, x + w - 1); + atiWriteReg(priv, ATI_SC_BOTTOM, y + h - 1); +} + + +// ============================================================ +// atiSetCursor +// ============================================================ + +static void atiSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (!image) { + atiShowCursor(drv, false); + return; + } + + atiWaitIdle(drv); + + // Write cursor image to VRAM + // Mach64 cursor format: 64x64, 2bpp, rows of 16 bytes + // Bit encoding: 00=cursor color 0, 01=cursor color 1, + // 10=transparent, 11=inverted + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < ATI_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 16; byte++) { + uint8_t val = 0xAA; // all transparent (10 pattern) + + if (row < image->height && byte < (image->width + 3) / 4) { + // Convert AND/XOR to Mach64 2bpp encoding + int32_t bitOff = byte * 4; + uint8_t andBits = 0; + uint8_t xorBits = 0; + + if (bitOff / 8 < (image->width + 7) / 8) { + andBits = image->andMask[row * 8 + bitOff / 8]; + xorBits = image->xorMask[row * 8 + bitOff / 8]; + } + + // Pack 4 pixels into one byte (2 bits each) + val = 0; + for (int32_t px = 0; px < 4; px++) { + int32_t srcBit = (bitOff + px) % 8; + uint8_t andBit = (andBits >> (7 - srcBit)) & 1; + uint8_t 
xorBit = (xorBits >> (7 - srcBit)) & 1; + uint8_t pixel; + + if (andBit && !xorBit) { + pixel = 0x02; // transparent + } else if (andBit && xorBit) { + pixel = 0x03; // inverted + } else if (!andBit && xorBit) { + pixel = 0x01; // cursor color 1 + } else { + pixel = 0x00; // cursor color 0 + } + + val |= pixel << (6 - px * 2); + } + } + + cursorMem[row * 16 + byte] = val; + } + } + + // Set cursor offset (in units of 8 bytes) + atiWriteReg(priv, ATI_CUR_OFFSET, priv->cursorOffset / 8); + + // Set cursor colors (white foreground, black background) + atiWriteReg(priv, ATI_CUR_CLR0, 0x00000000); + atiWriteReg(priv, ATI_CUR_CLR1, 0x00FFFFFF); +} + + +// ============================================================ +// atiShowCursor +// ============================================================ + +static void atiShowCursor(AccelDriverT *drv, bool visible) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + uint32_t val = atiReadReg(priv, ATI_GEN_TEST_CNTL); + + if (visible) { + val |= 0x80; // enable cursor + } else { + val &= ~0x80; + } + + atiWriteReg(priv, ATI_GEN_TEST_CNTL, val); +} + + +// ============================================================ +// atiShutdown +// ============================================================ + +static void atiShutdown(AccelDriverT *drv) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + atiShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// atiWaitFifo +// ============================================================ +// +// Wait until the Mach64 FIFO has at least 'entries' free slots. +// The FIFO_STAT register indicates free entries (bits 15:0, +// value = 0x8000 means 0 free, lower values mean more free). 
+ +static void atiWaitFifo(AtiPrivateT *priv, int32_t entries) { + uint32_t mask = ATI_FIFO_STAT_MASK >> entries; + + for (int32_t i = 0; i < ATI_MAX_IDLE_WAIT; i++) { + if (!(atiReadReg(priv, ATI_FIFO_STAT) & mask)) { + return; + } + } +} + + +// ============================================================ +// atiWaitIdle +// ============================================================ + +static void atiWaitIdle(AccelDriverT *drv) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + // First wait for FIFO to drain + atiWaitFifo(priv, 16); + + // Then wait for engine idle + for (int32_t i = 0; i < ATI_MAX_IDLE_WAIT; i++) { + if (!(atiReadReg(priv, ATI_GUI_STAT_MMIO) & ATI_GUI_STAT_BUSY)) { + return; + } + } +} diff --git a/banshee.c b/banshee.c new file mode 100644 index 0000000..a5aa8bd --- /dev/null +++ b/banshee.c @@ -0,0 +1,715 @@ +// banshee.c -- 3dfx Banshee/Voodoo3 accelerated video driver +// +// Supports the 3dfx Banshee and Voodoo3 2D/3D accelerators. +// The Banshee was 3dfx's first 2D/3D combo chip, and the Voodoo3 +// improved on it with higher clock speeds. Both share the same +// 2D register interface: +// - Hardware rectangle fill +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host blit via launch area) +// - Monochrome color expansion (host blit with mono source) +// - Bresenham line draw +// - Hardware clip rectangle +// - 64x64 hardware cursor +// +// Register access: +// BAR0 maps the 32KB MMIO register block. The 2D engine +// registers live at offsets 0x200-0x270 within this block. +// The status register at 0x100 provides engine busy state. +// +// For host-to-screen operations, pixel data is fed through the +// "launch area" -- a write-combining window at MMIO physical +// address + 0x80000. Data is written as 32-bit dwords. +// +// BAR1 maps the linear framebuffer. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// 3dfx vendor/device IDs +// ============================================================ + +#define TDFX_VENDOR_ID 0x121A + +#define TDFX_BANSHEE 0x0003 +#define TDFX_VOODOO3 0x0005 + +static const uint16_t sBansheeDeviceIds[] = { + TDFX_VENDOR_ID, TDFX_BANSHEE, + TDFX_VENDOR_ID, TDFX_VOODOO3, + 0, 0 +}; + +// ============================================================ +// 2D engine register offsets (from MMIO base) +// ============================================================ + +#define BAN_STATUS 0x100 // bits 0-10: busy when any set +#define BAN_INTRCTRL 0x108 // interrupt control + +#define BAN_CLIP0MIN 0x200 // clip rect 0 min (X | Y<<16) +#define BAN_CLIP0MAX 0x204 // clip rect 0 max (X | Y<<16) +#define BAN_DSTBASEADDR 0x208 // destination base address +#define BAN_DSTFORMAT 0x20C // pitch<<16 | bpp encoding +#define BAN_SRCCKMIN 0x210 // source color key min +#define BAN_SRCCKMAX 0x214 // source color key max +#define BAN_DSTCKMIN 0x218 // dest color key min +#define BAN_DSTCKMAX 0x21C // dest color key max +#define BAN_BRESERROR0 0x220 // Bresenham error 0 +#define BAN_BRESERROR1 0x224 // Bresenham error 1 +#define BAN_ROP 0x230 // raster operation (bits 7:0) +#define BAN_SRCBASEADDR 0x234 // source base address +#define BAN_COMMANDEXTRA 0x238 // command extra +#define BAN_LINESTIPPLE 0x23C // line stipple +#define BAN_LINESTYLE 0x240 // line style +#define BAN_PATTERN0 0x244 // pattern alias 0 +#define BAN_PATTERN1 0x248 // pattern alias 1 +#define BAN_CLIP1MIN 0x24C // clip rect 1 min +#define BAN_CLIP1MAX 0x250 // clip rect 1 max +#define BAN_SRCFORMAT 0x254 // pitch<<16 | bpp encoding +#define BAN_SRCSIZE 0x258 // width | height<<16 +#define BAN_SRCXY 0x25C // X | Y<<16 +#define BAN_COLORBACK 0x260 // background color +#define BAN_COLORFORE 0x264 // foreground 
color +#define BAN_DSTSIZE 0x268 // width | height<<16 +#define BAN_DSTXY 0x26C // X | Y<<16 +#define BAN_COMMAND 0x270 // command (triggers operation) + +// ============================================================ +// Command register encoding +// ============================================================ + +// Command types (bits 3:0) +#define BAN_CMD_NOP 0x00 +#define BAN_CMD_S2S_BLIT 0x01 // screen-to-screen blit +#define BAN_CMD_S2S_STRETCH 0x02 // screen-to-screen stretch blit +#define BAN_CMD_H2S_BLIT 0x03 // host-to-screen blit +#define BAN_CMD_RECTFILL 0x05 // rectangle fill +#define BAN_CMD_LINEDRAW 0x06 // line draw +#define BAN_CMD_POLYLINE 0x07 // polyline + +// Command flags +#define BAN_CMD_INITIATE (1 << 4) // must be set to start operation +#define BAN_CMD_STIPPLE (1 << 8) // stipple line +#define BAN_CMD_CLIPSEL1 (1 << 9) // use clip1 instead of clip0 +#define BAN_CMD_SRCCKENA (1 << 12) // source color key enable +#define BAN_CMD_DSTCKENA (1 << 13) // dest color key enable +#define BAN_CMD_MONOPAT (1 << 14) // mono pattern +#define BAN_CMD_SRCMONO (1 << 15) // source is monochrome + +// ============================================================ +// BPP format encodings (for srcFormat/dstFormat low bits) +// ============================================================ + +#define BAN_FMT_8BPP 1 +#define BAN_FMT_16BPP 3 +#define BAN_FMT_32BPP 5 + +// ============================================================ +// Status register +// ============================================================ + +#define BAN_STATUS_BUSY_MASK 0x7FF // bits 0-10: engine busy + +// ============================================================ +// Hardware cursor registers +// ============================================================ + +#define BAN_VIDPROCCFG 0x5C // bit 27 = cursor enable +#define BAN_CURSORLOC 0x60 // X | Y<<16 + +#define BAN_CURSOR_ENABLE (1 << 27) + +// ============================================================ +// Launch area +// 
============================================================ + +#define BAN_LAUNCH_OFFSET 0x80000 // offset from MMIO phys base +#define BAN_LAUNCH_MAP_SIZE 4096 // map 4KB of launch area + +// ============================================================ +// Misc constants +// ============================================================ + +#define BAN_MMIO_SIZE 32768 // BAR0: 32KB MMIO +#define BAN_MAX_IDLE_WAIT 1000000 +#define BAN_ROP_COPY 0xCC +#define BAN_HW_CURSOR_SIZE 64 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + int32_t bytesPerPixel; + int32_t screenPitch; + uint32_t bppFormat; + volatile uint32_t *mmio; + volatile uint32_t *launch; + DpmiMappingT mmioMap; + DpmiMappingT lfbMap; + DpmiMappingT launchMap; +} BansheePrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void bansheeBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void bansheeColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool bansheeDetect(AccelDriverT *drv); +static void bansheeHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool bansheeInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void bansheeLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void bansheeMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void bansheeRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void bansheeRectFillPat(AccelDriverT *drv, 
int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void bansheeSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void bansheeSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void bansheeShowCursor(AccelDriverT *drv, bool visible); +static void bansheeShutdown(AccelDriverT *drv); +static void bansheeWaitIdle(AccelDriverT *drv); +static uint32_t bppToFormat(int32_t bpp); + +static inline void bansheeWrite(BansheePrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t bansheeRead(BansheePrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static BansheePrivateT sBansheePrivate; + +static AccelDriverT sBansheeDriver = { + .name = "3dfx Banshee", + .chipFamily = "3dfx", + .caps = 0, + .privData = &sBansheePrivate, + .detect = bansheeDetect, + .init = bansheeInit, + .shutdown = bansheeShutdown, + .waitIdle = bansheeWaitIdle, + .setClip = bansheeSetClip, + .rectFill = bansheeRectFill, + .rectFillPat = bansheeRectFillPat, + .bitBlt = bansheeBitBlt, + .hostBlit = bansheeHostBlit, + .colorExpand = bansheeColorExpand, + .lineDraw = bansheeLineDraw, + .setCursor = bansheeSetCursor, + .moveCursor = bansheeMoveCursor, + .showCursor = bansheeShowCursor, +}; + +// ============================================================ +// bansheeRegisterDriver +// ============================================================ + +void bansheeRegisterDriver(void) { + accelRegisterDriver(&sBansheeDriver); +} + + +// ============================================================ +// bansheeBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. 
The Banshee engine handles overlapping +// regions automatically when srcXY and dstXY are set correctly -- +// the hardware determines the blit direction internally. + +static void bansheeBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_SRCBASEADDR, 0); + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_SRCSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_SRCXY, (uint32_t)srcX | ((uint32_t)srcY << 16)); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_S2S_BLIT | BAN_CMD_INITIATE); +} + + +// ============================================================ +// bansheeColorExpand +// ============================================================ +// +// Monochrome-to-color expansion using host-to-screen blit with +// the SRCMONO flag. Mono bitmap bits are expanded to fg/bg colors +// by the hardware. Data is fed as dwords through the launch area. 
+ +static void bansheeColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = (w + 7) / 8; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)bytesPerRow << 16) | BAN_FMT_8BPP); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, fg); + bansheeWrite(priv, BAN_COLORBACK, bg); + bansheeWrite(priv, BAN_SRCSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_H2S_BLIT | BAN_CMD_INITIATE | BAN_CMD_SRCMONO); + + // Feed mono data row by row through the launch area + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + priv->launch[0] = val; + } + } +} + + +// ============================================================ +// bansheeDetect +// ============================================================ + +static bool bansheeDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sBansheeDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case TDFX_BANSHEE: + drv->name = "3dfx Banshee"; + break; + case TDFX_VOODOO3: + drv->name = "3dfx Voodoo3"; + break; + default: + drv->name = "3dfx Banshee/Voodoo3"; + break; + } + + return 
true; +} + + +// ============================================================ +// bansheeHostBlit +// ============================================================ +// +// CPU-to-screen blit using host-to-screen command. Pixel data is +// fed as dwords through the launch area write-combining window. + +static void bansheeHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_SRCBASEADDR, 0); + bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)(w * priv->bytesPerPixel) << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_SRCSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_H2S_BLIT | BAN_CMD_INITIATE); + + // Feed pixel data row by row through the launch area + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + priv->launch[0] = val; + } + } +} + + +// ============================================================ +// bansheeInit +// ============================================================ + +static bool bansheeInit(AccelDriverT *drv, const AccelModeRequestT *req) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + // Read 
BARs + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + // Map MMIO control registers (32KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, BAN_MMIO_SIZE, &priv->mmioMap)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMap.ptr; + + // Map launch area (4KB at MMIO phys + 0x80000) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr + BAN_LAUNCH_OFFSET, BAN_LAUNCH_MAP_SIZE, &priv->launchMap)) { + dpmiUnmapFramebuffer(&priv->mmioMap); + return false; + } + priv->launch = (volatile uint32_t *)priv->launchMap.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + dpmiUnmapFramebuffer(&priv->launchMap); + dpmiUnmapFramebuffer(&priv->mmioMap); + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMap)) { + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->launchMap); + dpmiUnmapFramebuffer(&priv->mmioMap); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + priv->bppFormat = bppToFormat(vesa.bpp); + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMap.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Wait for engine idle before configuring + bansheeWaitIdle(drv); + + // Set default engine state + bansheeWrite(priv, BAN_SRCBASEADDR, 0); + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); 
+ bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COMMANDEXTRA, 0); + + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + bansheeSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// ============================================================ +// bansheeLineDraw +// ============================================================ +// +// Bresenham line draw with inclusive endpoints. The Banshee engine +// takes start/end XY coordinates directly via srcXY/dstXY registers. + +static void bansheeLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, color); + bansheeWrite(priv, BAN_SRCXY, (uint32_t)x1 | ((uint32_t)y1 << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)x2 | ((uint32_t)y2 << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_LINEDRAW | BAN_CMD_INITIATE); +} + + +// ============================================================ +// bansheeMoveCursor +// ============================================================ + +static void bansheeMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + bansheeWrite(priv, BAN_CURSORLOC, (uint32_t)x | ((uint32_t)y << 16)); +} + + +// ============================================================ +// bansheeRectFill +// ============================================================ +// +// Solid rectangle fill using 
the Banshee RECTFILL command. The +// foreground color is set, coordinates and dimensions are loaded, +// and the command register triggers the fill. + +static void bansheeRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, color); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)x | ((uint32_t)y << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_RECTFILL | BAN_CMD_INITIATE); +} + + +// ============================================================ +// bansheeRectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using the Banshee RECTFILL command with +// BAN_CMD_MONOPAT. The pattern is 8 bytes (one per row, MSB-first), +// written to pattern0Alias and pattern1Alias as two 32-bit values. +// 1-bits use the foreground color, 0-bits use the background. 
+ +static void bansheeRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Pack pattern rows 0-3 into PATTERN0 and rows 4-7 into PATTERN1 + uint32_t pat0 = (uint32_t)pattern[0] + | ((uint32_t)pattern[1] << 8) + | ((uint32_t)pattern[2] << 16) + | ((uint32_t)pattern[3] << 24); + uint32_t pat1 = (uint32_t)pattern[4] + | ((uint32_t)pattern[5] << 8) + | ((uint32_t)pattern[6] << 16) + | ((uint32_t)pattern[7] << 24); + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, fg); + bansheeWrite(priv, BAN_COLORBACK, bg); + bansheeWrite(priv, BAN_PATTERN0, pat0); + bansheeWrite(priv, BAN_PATTERN1, pat1); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)x | ((uint32_t)y << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_RECTFILL | BAN_CMD_INITIATE | BAN_CMD_MONOPAT); +} + + +// ============================================================ +// bansheeSetClip +// ============================================================ + +static void bansheeSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + bansheeWrite(priv, BAN_CLIP0MIN, (uint32_t)x | ((uint32_t)y << 16)); + bansheeWrite(priv, BAN_CLIP0MAX, (uint32_t)(x + w) | ((uint32_t)(y + h) << 16)); +} + + +// ============================================================ +// bansheeSetCursor +// ============================================================ +// +// The Banshee hardware cursor is a 64x64 two-color cursor stored +// in VRAM. 
The format is 2 bits per pixel: AND plane followed by +// XOR plane, packed as 64x64 = 1024 bytes per plane. + +static void bansheeSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (!image) { + bansheeShowCursor(drv, false); + return; + } + + bansheeWaitIdle(drv); + + // Store cursor image at end of VRAM (1KB AND + 1KB XOR = 2KB) + uint32_t cursorOffset = priv->vramSize - 2048; + cursorOffset &= ~0x7FF; // align to 2KB + uint8_t *cursorMem = drv->mode.framebuffer + cursorOffset; + + // Write AND mask then XOR mask, each 64x64 / 8 = 512 bytes + for (int32_t row = 0; row < BAN_HW_CURSOR_SIZE; row++) { + for (int32_t byteIdx = 0; byteIdx < 8; byteIdx++) { + int32_t srcIdx = row * 8 + byteIdx; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byteIdx < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; // transparent + xorByte = 0x00; + } + + cursorMem[row * 16 + byteIdx] = andByte; + cursorMem[row * 16 + byteIdx + 8] = xorByte; + } + } +} + + +// ============================================================ +// bansheeShowCursor +// ============================================================ + +static void bansheeShowCursor(AccelDriverT *drv, bool visible) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + uint32_t vidProcCfg = bansheeRead(priv, BAN_VIDPROCCFG); + + if (visible) { + vidProcCfg |= BAN_CURSOR_ENABLE; + } else { + vidProcCfg &= ~BAN_CURSOR_ENABLE; + } + + bansheeWrite(priv, BAN_VIDPROCCFG, vidProcCfg); +} + + +// ============================================================ +// bansheeShutdown +// ============================================================ + +static void bansheeShutdown(AccelDriverT *drv) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + bansheeShowCursor(drv, false); + vgaRestoreTextMode(); + + dpmiUnmapFramebuffer(&priv->launchMap); + 
dpmiUnmapFramebuffer(&priv->lfbMap); + dpmiUnmapFramebuffer(&priv->mmioMap); + + priv->mmio = NULL; + priv->launch = NULL; +} + + +// ============================================================ +// bansheeWaitIdle +// ============================================================ +// +// Wait until the 2D engine is completely idle. Bits 0-10 of the +// status register must all be zero. + +static void bansheeWaitIdle(AccelDriverT *drv) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + for (int32_t i = 0; i < BAN_MAX_IDLE_WAIT; i++) { + uint32_t stat = bansheeRead(priv, BAN_STATUS); + if (!(stat & BAN_STATUS_BUSY_MASK)) { + return; + } + } +} + + +// ============================================================ +// bppToFormat +// ============================================================ +// +// Convert bits-per-pixel to the Banshee srcFormat/dstFormat +// encoding for the low bits of those registers. + +static uint32_t bppToFormat(int32_t bpp) { + switch (bpp) { + case 8: + return BAN_FMT_8BPP; + case 15: + case 16: + return BAN_FMT_16BPP; + case 32: + return BAN_FMT_32BPP; + default: + return BAN_FMT_16BPP; + } +} diff --git a/cirrusGd54.c b/cirrusGd54.c new file mode 100644 index 0000000..cf096b4 --- /dev/null +++ b/cirrusGd54.c @@ -0,0 +1,732 @@ +// cirrusGd54.c -- Cirrus Logic GD5426/28/34/36/46/80 accelerated video driver +// +// Supports the Cirrus Logic GD54xx family of VGA controllers. These +// chips were extremely common in the early-to-mid 1990s, found in +// everything from budget desktops to laptops. +// +// The GD54xx BitBLT engine is accessed entirely through extended +// Graphics Controller (GR) registers at I/O ports 0x3CE/0x3CF. +// There is no MMIO option on the GD54xx series (unlike the later +// Laguna chips). 
The engine supports: +// - Screen-to-screen BitBLT +// - Solid rectangle fill +// - Color expansion (monochrome-to-color, for text) +// - 8x8 pattern fill +// - Transparent blit (color key) +// - Hardware cursor (32x32 on GD5426/28, 64x64 on GD5434+) +// +// Register unlock: +// Write 0x12 to SR6 (sequencer register 6) to unlock the Cirrus +// extended registers. Write 0x00 to re-lock. +// +// BLT engine registers (GR extended, indices 0x20-0x3F): +// All BLT parameters are set through the graphics controller +// index/data ports (0x3CE/0x3CF). Addresses are linear byte +// offsets into VRAM. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Cirrus vendor/device IDs +// ============================================================ + +#define CL_VENDOR_ID 0x1013 + +#define CL_GD5426 0x0000 // ISA/VLB only, no PCI ID -- detected via probe +#define CL_GD5428 0x0000 // ISA/VLB only +#define CL_GD5429 0x00A0 // shared with 5434 on some boards +#define CL_GD5434 0x00A0 +#define CL_GD5434_ALT 0x00A8 +#define CL_GD5436 0x00AC +#define CL_GD5446 0x00B8 +#define CL_GD5480 0x00BC + +static const uint16_t sCirrusDeviceIds[] = { + CL_VENDOR_ID, CL_GD5434, + CL_VENDOR_ID, CL_GD5434_ALT, + CL_VENDOR_ID, CL_GD5436, + CL_VENDOR_ID, CL_GD5446, + CL_VENDOR_ID, CL_GD5480, + 0, 0 +}; + +// ============================================================ +// Cirrus extended GR register indices for BLT engine +// ============================================================ + +#define CL_GR20_BLT_WIDTH_LO 0x20 +#define CL_GR21_BLT_WIDTH_HI 0x21 +#define CL_GR22_BLT_HEIGHT_LO 0x22 +#define CL_GR23_BLT_HEIGHT_HI 0x23 +#define CL_GR24_BLT_DST_PITCH_LO 0x24 +#define CL_GR25_BLT_DST_PITCH_HI 0x25 +#define CL_GR26_BLT_SRC_PITCH_LO 0x26 +#define CL_GR27_BLT_SRC_PITCH_HI 0x27 +#define CL_GR28_BLT_DST_ADDR_LO 0x28 +#define CL_GR29_BLT_DST_ADDR_MID 0x29 +#define 
CL_GR2A_BLT_DST_ADDR_HI 0x2A +#define CL_GR2C_BLT_SRC_ADDR_LO 0x2C +#define CL_GR2D_BLT_SRC_ADDR_MID 0x2D +#define CL_GR2E_BLT_SRC_ADDR_HI 0x2E +#define CL_GR30_BLT_MODE 0x30 +#define CL_GR31_BLT_STATUS 0x31 +#define CL_GR32_BLT_ROP 0x32 +#define CL_GR33_BLT_MODE_EXT 0x33 +#define CL_GR34_BLT_FGCOLOR_LO 0x34 +#define CL_GR35_BLT_FGCOLOR_HI 0x35 +#define CL_GR38_BLT_TRANS_COLOR_LO 0x38 +#define CL_GR39_BLT_TRANS_COLOR_HI 0x39 +#define CL_GR3A_BLT_TRANS_MASK_LO 0x3A +#define CL_GR3B_BLT_TRANS_MASK_HI 0x3B + +// ============================================================ +// Cirrus BLT mode bits (GR30) +// ============================================================ + +#define CL_BLT_DIR_BACKWARD 0x01 // blit direction backward +#define CL_BLT_SRC_SYSTEM 0x02 // source is system memory (CPU) +#define CL_BLT_SRC_PATTERN 0x04 // source is 8x8 pattern +#define CL_BLT_TRANSPARENT 0x08 // transparent background +#define CL_BLT_DST_SYSTEM 0x10 // destination is system memory +#define CL_BLT_COLOR_EXPAND 0x80 // monochrome color expansion + +// ============================================================ +// Cirrus BLT status bits (GR31) +// ============================================================ + +#define CL_BLT_START 0x02 // start BLT operation +#define CL_BLT_RESET 0x04 // reset BLT engine +#define CL_BLT_BUSY 0x01 // BLT engine busy (read) + +// ============================================================ +// Cirrus BLT ROP values (GR32) +// ============================================================ +// +// The Cirrus ROP encoding is different from the S3/Windows ROP +// codes. These are the Cirrus-specific values. 
+ +#define CL_ROP_COPY 0x0D // dest = source +#define CL_ROP_PAT_COPY 0x0D // dest = pattern (same as copy in fill mode) +#define CL_ROP_XOR 0x59 // dest = src XOR dest +#define CL_ROP_AND 0x05 // dest = src AND dest +#define CL_ROP_OR 0x6D // dest = src OR dest +#define CL_ROP_ZERO 0x00 // dest = 0 +#define CL_ROP_ONE 0x0B // dest = 1 + +// Cirrus sequencer unlock key +#define CL_SR6_UNLOCK 0x12 +#define CL_SR6_LOCK 0x00 + +// Hardware cursor constants +#define CL_HW_CURSOR_SIZE 64 // 64x64 on GD5434+ +#define CL_HW_CURSOR_BYTES 1024 // 64*64*2bpp / 8 = 1024 + +// Maximum wait iterations +#define CL_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + DpmiMappingT lfbMapping; + bool is5434Plus; // true for GD5434 and later (64x64 cursor) +} CirrusPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void clBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void clColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool clDetect(AccelDriverT *drv); +static void clHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool clInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void clMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void clRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void clSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void 
clSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void clShowCursor(AccelDriverT *drv, bool visible); +static void clShutdown(AccelDriverT *drv); +static void clUnlockRegs(void); +static void clWaitIdle(AccelDriverT *drv); + +// ============================================================ +// Driver instance +// ============================================================ + +static CirrusPrivateT sCirrusPrivate; + +static AccelDriverT sCirrusDriver = { + .name = "Cirrus Logic GD5434", + .chipFamily = "cirrus", + .caps = 0, + .privData = &sCirrusPrivate, + .detect = clDetect, + .init = clInit, + .shutdown = clShutdown, + .waitIdle = clWaitIdle, + .setClip = clSetClip, + .rectFill = clRectFill, + .rectFillPat = NULL, + .bitBlt = clBitBlt, + .hostBlit = clHostBlit, + .colorExpand = clColorExpand, + .lineDraw = NULL, // GD54xx has no hardware line draw + .setCursor = clSetCursor, + .moveCursor = clMoveCursor, + .showCursor = clShowCursor, +}; + +// ============================================================ +// clRegisterDriver +// ============================================================ + +void clRegisterDriver(void) { + accelRegisterDriver(&sCirrusDriver); +} + + +// ============================================================ +// clBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. The Cirrus engine uses linear VRAM +// addresses for source and destination. Direction is controlled +// by the backward bit in GR30 -- for overlapping regions where +// dst > src, we must blit backward. 
+ +static void clBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + // Calculate linear addresses + uint32_t srcAddr = srcY * pitch + srcX * bpp; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + + // Determine direction for overlapping blits + uint8_t mode = 0; + + if (dstAddr > srcAddr) { + mode |= CL_BLT_DIR_BACKWARD; + // Adjust addresses to end of blit region + srcAddr += (h - 1) * pitch + (w - 1) * bpp; + dstAddr += (h - 1) * pitch + (w - 1) * bpp; + } + + // Width in bytes minus 1 + int32_t widthBytes = w * bpp - 1; + + clWaitIdle(drv); + + // Set up BLT parameters + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR26_BLT_SRC_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR27_BLT_SRC_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + vgaGfxWrite(CL_GR2C_BLT_SRC_ADDR_LO, srcAddr & 0xFF); + vgaGfxWrite(CL_GR2D_BLT_SRC_ADDR_MID, (srcAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2E_BLT_SRC_ADDR_HI, (srcAddr >> 16) & 0x3F); + + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + vgaGfxWrite(CL_GR30_BLT_MODE, mode); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); +} + + +// ============================================================ +// clColorExpand +// ============================================================ +// +// Monochrome-to-color 
expansion. The source data is 1bpp bitmap +// in system memory, which gets transferred through the BLT engine +// with color expansion enabled. Each 1-bit becomes the foreground +// color, each 0-bit becomes the background color. +// +// The Cirrus color expand uses GR34/GR35 for the foreground color +// and the background is set by first doing a fill, or by using +// transparent mode with a pre-filled background. + +static void clColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + // First fill the destination with background color + clRectFill(drv, dstX, dstY, w, h, bg); + clWaitIdle(drv); + + // Now do a transparent color expand for the foreground + uint32_t dstAddr = dstY * pitch + dstX * bpp; + int32_t widthBytes = w * bpp - 1; + + // Set foreground color + vgaGfxWrite(CL_GR34_BLT_FGCOLOR_LO, fg & 0xFF); + vgaGfxWrite(CL_GR35_BLT_FGCOLOR_HI, (fg >> 8) & 0xFF); + + // Set up BLT parameters + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + // Source pitch for monochrome data + vgaGfxWrite(CL_GR26_BLT_SRC_PITCH_LO, srcPitch & 0xFF); + vgaGfxWrite(CL_GR27_BLT_SRC_PITCH_HI, (srcPitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + vgaGfxWrite(CL_GR30_BLT_MODE, CL_BLT_COLOR_EXPAND | 
CL_BLT_SRC_SYSTEM | CL_BLT_TRANSPARENT); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); + + // Feed monochrome data through PIX_TRANS equivalent + // On Cirrus, system-memory source data is written to the + // BLT engine via the VGA aperture at 0xA0000 (mapped via DPMI). + // Each row of monochrome data is padded to a dword boundary. + int32_t srcBytesPerRow = (w + 7) / 8; + int32_t padBytesPerRow = (srcBytesPerRow + 3) & ~3; + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t i = 0; i < padBytesPerRow; i++) { + uint8_t byte = (i < srcBytesPerRow) ? rowData[i] : 0; + outportb(0x3CF, byte); // data through GR register space + } + } +} + + +// ============================================================ +// clDetect +// ============================================================ + +static bool clDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sCirrusDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case CL_GD5434: + case CL_GD5434_ALT: + drv->name = "Cirrus Logic GD5434"; + break; + case CL_GD5436: + drv->name = "Cirrus Logic GD5436"; + break; + case CL_GD5446: + drv->name = "Cirrus Logic GD5446"; + break; + case CL_GD5480: + drv->name = "Cirrus Logic GD5480"; + break; + default: + drv->name = "Cirrus Logic GD54xx"; + break; + } + + return true; +} + + +// ============================================================ +// clHostBlit +// ============================================================ +// +// CPU-to-screen blit. Transfers pixel data from system memory to +// the framebuffer via the BLT engine with CL_BLT_SRC_SYSTEM mode. +// Source data is fed byte-by-byte through the GR data port (0x3CF), +// with each row padded to a dword (4-byte) boundary. 
+ +static void clHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + int32_t widthBytes = w * bpp - 1; + int32_t rowBytes = w * bpp; + int32_t padBytesPerRow = (rowBytes + 3) & ~3; + + clWaitIdle(drv); + + // Set up BLT parameters + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + // BLT mode: source from CPU + vgaGfxWrite(CL_GR30_BLT_MODE, CL_BLT_SRC_SYSTEM); + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); + + // Feed pixel data row by row, padded to dword boundary + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t i = 0; i < padBytesPerRow; i++) { + uint8_t byte = (i < rowBytes) ? 
rowData[i] : 0; + outportb(0x3CF, byte); + } + } +} + + +// ============================================================ +// clInit +// ============================================================ + +static bool clInit(AccelDriverT *drv, const AccelModeRequestT *req) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + priv->is5434Plus = (drv->pciDev.deviceId != CL_GD5429); + + // Get VRAM size and LFB address from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + + // Unlock Cirrus extended registers + clUnlockRegs(); + + // Detect VRAM size from SR0F if BAR sizing was unreasonable + uint8_t sr0f = vgaSeqRead(0x0F); + uint32_t ramFromSr = 0; + + switch ((sr0f >> 3) & 0x03) { + case 0: ramFromSr = 256 * 1024; break; + case 1: ramFromSr = 512 * 1024; break; + case 2: ramFromSr = 1024 * 1024; break; + case 3: ramFromSr = 2048 * 1024; break; + } + + // GD5434+ can have 4MB + if (priv->is5434Plus && (sr0f & 0x80)) { + ramFromSr = 4096 * 1024; + } + + if (priv->vramSize < 256 * 1024 || priv->vramSize > 64 * 1024 * 1024) { + priv->vramSize = ramFromSr; + } + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Re-unlock after mode set + clUnlockRegs(); + + 
// Reset BLT engine + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_RESET); + vgaGfxWrite(CL_GR31_BLT_STATUS, 0x00); + + // Set up cursor at end of VRAM + priv->cursorOffset = priv->vramSize - CL_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(CL_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_HW_CURSOR; + + return true; +} + + +// ============================================================ +// clMoveCursor +// ============================================================ +// +// Moves the hardware cursor. On Cirrus GD5434+, cursor position +// is set through sequencer extended registers SR10-SR13. + +static void clMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + vgaSeqWrite(0x10, x & 0xFF); + vgaSeqWrite(0x11, (x >> 8) & 0x07); + vgaSeqWrite(0x12, y & 0xFF); + vgaSeqWrite(0x13, (y >> 8) & 0x07); +} + + +// ============================================================ +// clRectFill +// ============================================================ +// +// Solid rectangle fill using the BLT engine. The Cirrus engine +// doesn't have a dedicated "fill" command -- instead, we set up +// a 1-pixel source and use pattern-fill mode, or we set the +// source to a single-color region. The simplest approach is to +// use the color expansion with all-ones data, but for solid fills +// the most efficient method is to use the ROP with the foreground +// color register. 
+ +static void clRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + uint32_t dstAddr = y * pitch + x * bpp; + int32_t widthBytes = w * bpp - 1; + + clWaitIdle(drv); + + // Set foreground color for fill + vgaGfxWrite(CL_GR34_BLT_FGCOLOR_LO, color & 0xFF); + vgaGfxWrite(CL_GR35_BLT_FGCOLOR_HI, (color >> 8) & 0xFF); + + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + // Source = foreground color, color expand with all 1s + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + vgaGfxWrite(CL_GR30_BLT_MODE, CL_BLT_COLOR_EXPAND | CL_BLT_SRC_SYSTEM); + + // Source pitch for monochrome data (1 byte per row of fill) + vgaGfxWrite(CL_GR26_BLT_SRC_PITCH_LO, 0); + vgaGfxWrite(CL_GR27_BLT_SRC_PITCH_HI, 0); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); + + // Feed all-ones data (every pixel is foreground color) + int32_t srcBytesPerRow = (w + 7) / 8; + int32_t padBytesPerRow = (srcBytesPerRow + 3) & ~3; + + for (int32_t row = 0; row < h; row++) { + for (int32_t i = 0; i < padBytesPerRow; i++) { + outportb(0x3CF, 0xFF); + } + } +} + + +// ============================================================ +// clSetClip +// ============================================================ +// +// The GD54xx BLT engine doesn't have hardware scissor registers. 
+// Clipping must be done in software by adjusting coordinates +// before issuing BLT commands. This is a no-op placeholder. + +static void clSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + (void)drv; + (void)x; + (void)y; + (void)w; + (void)h; +} + + +// ============================================================ +// clSetCursor +// ============================================================ +// +// Uploads cursor image to VRAM. Cirrus GD5434+ uses 64x64 +// 2bpp cursor stored at a 1KB-aligned VRAM address. The address +// is set via SR2D (high) and SR2C (low) in units of 256 bytes. +// Format: interleaved AND/XOR planes, 16 bytes per row +// (8 bytes AND, 8 bytes XOR). + +static void clSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (!image) { + clShowCursor(drv, false); + return; + } + + clWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < CL_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor address (in units of 256 bytes) + uint16_t addrUnits = priv->cursorOffset / 256; + vgaSeqWrite(0x2C, addrUnits & 0xFF); + vgaSeqWrite(0x2D, (addrUnits >> 8) & 0x3F); +} + + +// ============================================================ +// clShowCursor +// ============================================================ +// +// Enable/disable the hardware cursor via SR12 bit 0 on Cirrus. 
+ +static void clShowCursor(AccelDriverT *drv, bool visible) { + (void)drv; + + uint8_t sr12 = vgaSeqRead(0x12); + + if (visible) { + sr12 |= 0x01; + } else { + sr12 &= ~0x01; + } + + vgaSeqWrite(0x12, sr12); +} + + +// ============================================================ +// clShutdown +// ============================================================ + +static void clShutdown(AccelDriverT *drv) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + clShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// clUnlockRegs +// ============================================================ +// +// Unlock Cirrus extended registers by writing 0x12 to SR6. + +static void clUnlockRegs(void) { + vgaSeqWrite(0x06, CL_SR6_UNLOCK); +} + + +// ============================================================ +// clWaitIdle +// ============================================================ +// +// Wait for the BLT engine to finish. Poll GR31 bit 0. + +static void clWaitIdle(AccelDriverT *drv) { + (void)drv; + + for (int32_t i = 0; i < CL_MAX_IDLE_WAIT; i++) { + if (!(vgaGfxRead(CL_GR31_BLT_STATUS) & CL_BLT_BUSY)) { + return; + } + } +} diff --git a/cirrusLaguna.c b/cirrusLaguna.c new file mode 100644 index 0000000..e60d24b --- /dev/null +++ b/cirrusLaguna.c @@ -0,0 +1,585 @@ +// cirrusLaguna.c -- Cirrus Logic Laguna GD5462/5464/5465 accelerated video driver +// +// Supports the Cirrus Logic Laguna family: GD5462, GD5464, and GD5465. +// These are MMIO-based PCI accelerators completely different from the +// older GD54xx (Alpine) series -- different register set, different +// BLT engine, and different programming model. 
+// +// The Laguna 2D engine features: +// - Solid rectangle fill +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host data window) +// - Monochrome color expansion (text/glyph rendering) +// - Hardware clip rectangle +// - 64x64 hardware cursor +// +// BAR layout: +// BAR0 = MMIO registers (4KB) +// BAR1 = linear framebuffer +// +// The 2D engine is programmed via MMIO registers starting at offset +// 0x0100. Commands are initiated by writing to the COMMAND register +// at 0x0118. Host data (for CPU-to-screen and color expand) is fed +// through a 512-byte window at MMIO + 0x0200. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Cirrus Laguna vendor/device IDs +// ============================================================ + +#define CIRRUS_VENDOR_ID 0x1013 + +#define LAGUNA_GD5462 0x00D0 +#define LAGUNA_GD5464 0x00D4 +#define LAGUNA_GD5465 0x00D6 + +static const uint16_t sLagunaDeviceIds[] = { + CIRRUS_VENDOR_ID, LAGUNA_GD5462, + CIRRUS_VENDOR_ID, LAGUNA_GD5464, + CIRRUS_VENDOR_ID, LAGUNA_GD5465, + 0, 0 +}; + +// ============================================================ +// MMIO register offsets (from BAR0) +// ============================================================ + +// 0x0000-0x00FF: VGA compatible registers (mapped) + +// 2D engine registers +#define LAG_CONTROL 0x0100 // engine control / status +#define LAG_FGCOLOR 0x0104 // foreground color +#define LAG_BGCOLOR 0x0108 // background color +#define LAG_DSTXY 0x010C // destination XY (X | Y<<16) +#define LAG_SRCXY 0x0110 // source XY (X | Y<<16) +#define LAG_DSTSIZE 0x0114 // destination size (W | H<<16) +#define LAG_COMMAND 0x0118 // command register (triggers operation) +#define LAG_PITCH 0x011C // pitch (srcPitch<<16 | dstPitch) +#define LAG_PAT0 0x0120 // 8x8 mono pattern (first 32 bits) +#define LAG_PAT1 0x0124 // 8x8 mono pattern (second 32 bits) 
+#define LAG_CLIPLT 0x0130 // clip left/top (left | top<<16) +#define LAG_CLIPRB 0x0134 // clip right/bottom (right | bottom<<16) +#define LAG_HOST_DATA 0x0200 // host data window (512 bytes) + +// Hardware cursor registers +#define LAG_CUR_CTRL 0x0300 // cursor control (bit 0 = enable) +#define LAG_CUR_X 0x0304 // cursor X position +#define LAG_CUR_Y 0x0308 // cursor Y position +#define LAG_CUR_ADDR 0x030C // cursor VRAM address + +// ============================================================ +// Status register bits +// ============================================================ + +#define LAG_STATUS_BUSY 0x01 // engine busy (bit 0 of CONTROL) + +// ============================================================ +// Command register encoding +// ============================================================ + +// Operation codes (bits 3:0) +#define LAG_CMD_NOP 0x00 +#define LAG_CMD_BITBLT 0x01 // screen-to-screen BitBlt +#define LAG_CMD_RECTFILL 0x02 // solid rectangle fill +#define LAG_CMD_HOST_BLIT 0x03 // host-to-screen blit +#define LAG_CMD_LINE 0x04 // line draw +#define LAG_CMD_COLOR_EXPAND 0x05 // mono color expansion from host + +// ROP encoding (bits 7:4) +#define LAG_CMD_ROP_SHIFT 4 + +// Direction and option bits +#define LAG_CMD_DIR_REV 0x0100 // bit 8: reverse direction +#define LAG_CMD_PAT_EN 0x0200 // bit 9: pattern enable +#define LAG_CMD_TRANS_EN 0x0400 // bit 10: transparency enable +#define LAG_CMD_COLOREXP 0x0800 // bit 11: color expand (mono source) + +// Common ROP values (shifted into bits 7:4) +#define LAG_ROP_COPY (0x0C << LAG_CMD_ROP_SHIFT) // 0xCC = dest = src +#define LAG_ROP_PAT (0x0F << LAG_CMD_ROP_SHIFT) // 0xF0 = dest = pat + +// ============================================================ +// Constants +// ============================================================ + +#define LAG_MMIO_SIZE 4096 +#define LAG_MAX_IDLE_WAIT 1000000 +#define LAG_HW_CURSOR_SIZE 64 +#define LAG_HW_CURSOR_BYTES 1024 // 64x64x2bpp / 8 = 1024 + +// 
============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; + DpmiMappingT lfbMapping; + DpmiMappingT mmioMapping; +} LagunaPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void lagBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void lagColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool lagDetect(AccelDriverT *drv); +static void lagHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool lagInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void lagMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void lagRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void lagSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void lagSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void lagShowCursor(AccelDriverT *drv, bool visible); +static void lagShutdown(AccelDriverT *drv); +static void lagWaitIdle(AccelDriverT *drv); + +static inline void lagWrite(LagunaPrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t lagRead(LagunaPrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static LagunaPrivateT 
sLagunaPrivate; + +static AccelDriverT sLagunaDriver = { + .name = "Cirrus Logic Laguna", + .chipFamily = "cirrus-laguna", + .caps = 0, + .privData = &sLagunaPrivate, + .detect = lagDetect, + .init = lagInit, + .shutdown = lagShutdown, + .waitIdle = lagWaitIdle, + .setClip = lagSetClip, + .rectFill = lagRectFill, + .rectFillPat = NULL, + .bitBlt = lagBitBlt, + .hostBlit = lagHostBlit, + .colorExpand = lagColorExpand, + .lineDraw = NULL, + .setCursor = lagSetCursor, + .moveCursor = lagMoveCursor, + .showCursor = lagShowCursor, +}; + +// ============================================================ +// lagunaRegisterDriver +// ============================================================ + +void lagunaRegisterDriver(void) { + accelRegisterDriver(&sLagunaDriver); +} + + +// ============================================================ +// lagBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. Handles overlapping regions by +// selecting forward or reverse direction based on src/dst +// relationship. 
+ +static void lagBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + lagWaitIdle(drv); + + // Determine direction for overlapping blits + uint32_t cmd = LAG_CMD_BITBLT | LAG_ROP_COPY; + + if (dstY > srcY || (dstY == srcY && dstX > srcX)) { + // Reverse direction: start from bottom-right + cmd |= LAG_CMD_DIR_REV; + lagWrite(priv, LAG_SRCXY, (uint32_t)(srcX + w - 1) | ((uint32_t)(srcY + h - 1) << 16)); + lagWrite(priv, LAG_DSTXY, (uint32_t)(dstX + w - 1) | ((uint32_t)(dstY + h - 1) << 16)); + } else { + // Forward direction: start from top-left + lagWrite(priv, LAG_SRCXY, (uint32_t)srcX | ((uint32_t)srcY << 16)); + lagWrite(priv, LAG_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + } + + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Trigger operation + lagWrite(priv, LAG_COMMAND, cmd); +} + + +// ============================================================ +// lagColorExpand +// ============================================================ +// +// Monochrome color expansion: convert 1bpp bitmap data to +// full-color pixels using the hardware color expand engine. +// Set foreground/background colors, then feed mono data +// through the host data window at MMIO + 0x0200. 
+ +static void lagColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = (w + 7) / 8; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + lagWaitIdle(drv); + + lagWrite(priv, LAG_FGCOLOR, fg); + lagWrite(priv, LAG_BGCOLOR, bg); + lagWrite(priv, LAG_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Start color expand operation + lagWrite(priv, LAG_COMMAND, LAG_CMD_COLOR_EXPAND | LAG_ROP_COPY | LAG_CMD_COLOREXP); + + // Feed mono data row by row through host data window + volatile uint32_t *hostWin = (volatile uint32_t *)((volatile uint8_t *)priv->mmio + LAG_HOST_DATA); + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + lagWaitIdle(drv); + hostWin[0] = val; + } + } +} + + +// ============================================================ +// lagDetect +// ============================================================ + +static bool lagDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sLagunaDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case LAGUNA_GD5462: + drv->name = "Cirrus Logic Laguna GD5462"; + break; + case LAGUNA_GD5464: + drv->name = "Cirrus Logic Laguna GD5464"; + break; + case LAGUNA_GD5465: + drv->name = "Cirrus Logic Laguna GD5465"; + break; + default: + drv->name = "Cirrus Logic Laguna"; + break; + } + + return true; 
+} + + +// ============================================================ +// lagHostBlit +// ============================================================ +// +// CPU-to-screen blit: transfer pixel data from system RAM to +// VRAM through the host data window at MMIO + 0x0200. Each +// row is padded to a dword boundary. + +static void lagHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + lagWaitIdle(drv); + + lagWrite(priv, LAG_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Start host-to-screen blit + lagWrite(priv, LAG_COMMAND, LAG_CMD_HOST_BLIT | LAG_ROP_COPY); + + // Feed pixel data row by row through host data window + volatile uint32_t *hostWin = (volatile uint32_t *)((volatile uint8_t *)priv->mmio + LAG_HOST_DATA); + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + lagWaitIdle(drv); + hostWin[0] = val; + } + } +} + + +// ============================================================ +// lagInit +// ============================================================ + +static bool lagInit(AccelDriverT *drv, const AccelModeRequestT *req) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + // Read BARs from PCI config space + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = 
pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + // Map MMIO control registers (4KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, LAG_MMIO_SIZE, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Wait for engine idle before configuring + lagWaitIdle(drv); + + // Set up hardware cursor at end of VRAM + priv->cursorOffset = priv->vramSize - LAG_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(LAG_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Set full-screen clip rectangle + lagSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// ============================================================ +// lagMoveCursor +// ============================================================ + +static void lagMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + lagWrite(priv, LAG_CUR_X, 
(uint32_t)x); + lagWrite(priv, LAG_CUR_Y, (uint32_t)y); +} + + +// ============================================================ +// lagRectFill +// ============================================================ +// +// Solid rectangle fill using command 0x02. Sets the foreground +// color, destination position, and size, then triggers the fill. + +static void lagRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + lagWaitIdle(drv); + + lagWrite(priv, LAG_FGCOLOR, color); + lagWrite(priv, LAG_DSTXY, (uint32_t)x | ((uint32_t)y << 16)); + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Trigger solid fill + lagWrite(priv, LAG_COMMAND, LAG_CMD_RECTFILL | LAG_ROP_COPY); +} + + +// ============================================================ +// lagSetClip +// ============================================================ + +static void lagSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + lagWrite(priv, LAG_CLIPLT, (uint32_t)x | ((uint32_t)y << 16)); + lagWrite(priv, LAG_CLIPRB, (uint32_t)(x + w - 1) | ((uint32_t)(y + h - 1) << 16)); +} + + +// ============================================================ +// lagSetCursor +// ============================================================ +// +// Upload a hardware cursor image to VRAM at the cursor offset. +// The Laguna uses a 64x64 2bpp AND/XOR format stored in VRAM. 
+ +static void lagSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (!image) { + lagShowCursor(drv, false); + return; + } + + lagWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < LAG_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; // transparent + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor VRAM address + lagWrite(priv, LAG_CUR_ADDR, priv->cursorOffset); +} + + +// ============================================================ +// lagShowCursor +// ============================================================ + +static void lagShowCursor(AccelDriverT *drv, bool visible) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + uint32_t ctrl = lagRead(priv, LAG_CUR_CTRL); + + if (visible) { + ctrl |= 0x01; + } else { + ctrl &= ~0x01; + } + + lagWrite(priv, LAG_CUR_CTRL, ctrl); +} + + +// ============================================================ +// lagShutdown +// ============================================================ + +static void lagShutdown(AccelDriverT *drv) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + lagShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->mmioMapping); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// lagWaitIdle +// ============================================================ +// +// Poll the CONTROL register until bit 0 (engine busy) clears. +// Bounded by LAG_MAX_IDLE_WAIT iterations to avoid hangs on +// hardware failure. 
// Reads LAG_CONTROL up to LAG_MAX_IDLE_WAIT times and returns as soon
// as LAG_STATUS_BUSY is clear.  If the bit never clears the function
// simply returns after the last read; callers get no timeout signal.
static void lagWaitIdle(AccelDriverT *drv) {
    LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData;

    for (int32_t i = 0; i < LAG_MAX_IDLE_WAIT; i++) {
        uint32_t stat = lagRead(priv, LAG_CONTROL);
        if (!(stat & LAG_STATUS_BUSY)) {
            return;
        }
    }
}
diff --git a/demo.c b/demo.c
new file mode 100644
index 0000000..3d9c2c0
--- /dev/null
+++ b/demo.c
@@ -0,0 +1,869 @@
// demo.c -- Test/demo application for accelerated video drivers
//
// Detects the video card, sets a graphics mode, exercises the
// hardware acceleration (fill rects, blit, draw lines, color
// expand), and provides a simple interactive benchmark comparing
// hardware vs software rendering speed.
//
// Usage: demo [width height bpp]
//   Defaults to 640x480x16 if no arguments given.
//
// Press ESC to exit, 'b' to run benchmark, space to cycle tests.

#include "accelVid.h"
#include "pci.h"

// NOTE(review): the names of the eight system headers below were lost
// when this patch was extracted (the <...> parts are missing).  The
// code in this file uses printf, malloc/free/atoi, memset, clock,
// bool, fixed-width integers and __dpmi_int, so restore the matching
// headers from the original commit.
#include
#include
#include
#include
#include
#include
#include
#include

// Scancode for ESC key
#define KEY_ESC         0x01

// Default video mode
#define DEFAULT_WIDTH   640
#define DEFAULT_HEIGHT  480
#define DEFAULT_BPP     16

// Benchmark iteration counts
#define BENCH_FILL_COUNT    1000
#define BENCH_BLIT_COUNT    1000
#define BENCH_LINE_COUNT    5000
#define BENCH_EXPAND_COUNT  500
#define BENCH_HBLIT_COUNT   1000
#define BENCH_PATFILL_COUNT 1000

// Host blit test pattern dimensions
#define HBLIT_PAT_W 100
#define HBLIT_PAT_H 100

// ============================================================
// External driver registration functions
// ============================================================
//
// One per chip-family driver; main() calls each of them before
// running detection.

extern void atiRegisterDriver(void);
extern void bansheeRegisterDriver(void);
extern void clRegisterDriver(void);
extern void etRegisterDriver(void);
extern void lagunaRegisterDriver(void);
extern void mgaRegisterDriver(void);
extern void nvRegisterDriver(void);
extern void s3RegisterDriver(void);
extern void sisRegisterDriver(void);
extern void tridentRegisterDriver(void);

// ============================================================
// Prototypes
// ============================================================

static void     demoBenchmark(AccelDriverT *drv);
static void     demoBitBlt(AccelDriverT *drv);
static void     demoColorExpand(AccelDriverT *drv);
static void     demoFillRects(AccelDriverT *drv);
static void     demoHostBlit(AccelDriverT *drv);
static void     demoLines(AccelDriverT *drv);
static void     demoPatternFill(AccelDriverT *drv);
static bool     isKeyPressed(void);
static uint32_t packColor16(uint8_t r, uint8_t g, uint8_t b);
static uint8_t  readKey(void);
static void     softFillRect(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color);

// ============================================================
// demoBenchmark
// ============================================================
//
// Runs timed comparisons of hardware vs software rendering for
// rectangle fills and blits.  Prints results to stdout after
// restoring text mode.
+ +static void demoBenchmark(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Benchmark hardware rect fill + clock_t hwFillStart = clock(); + + for (int32_t i = 0; i < BENCH_FILL_COUNT; i++) { + int32_t x = (i * 37) % (screenW - 100); + int32_t y = (i * 53) % (screenH - 100); + drv->rectFill(drv, x, y, 100, 100, packColor16(i & 0xFF, (i >> 3) & 0xFF, (i >> 6) & 0xFF)); + } + + drv->waitIdle(drv); + clock_t hwFillEnd = clock(); + + // Benchmark software rect fill + clock_t swFillStart = clock(); + + for (int32_t i = 0; i < BENCH_FILL_COUNT; i++) { + int32_t x = (i * 37) % (screenW - 100); + int32_t y = (i * 53) % (screenH - 100); + softFillRect(drv, x, y, 100, 100, packColor16(i & 0xFF, (i >> 3) & 0xFF, (i >> 6) & 0xFF)); + } + + clock_t swFillEnd = clock(); + + // Benchmark hardware bitblt + clock_t hwBltStart = clock(); + + for (int32_t i = 0; i < BENCH_BLIT_COUNT; i++) { + int32_t sx = (i * 31) % (screenW - 100); + int32_t sy = (i * 47) % (screenH - 100); + int32_t dx = (i * 43) % (screenW - 100); + int32_t dy = (i * 59) % (screenH - 100); + drv->bitBlt(drv, sx, sy, dx, dy, 100, 100); + } + + drv->waitIdle(drv); + clock_t hwBltEnd = clock(); + + // Benchmark hardware line draw + clock_t hwLineStart = clock(); + + for (int32_t i = 0; i < BENCH_LINE_COUNT; i++) { + int32_t x1 = (i * 37) % screenW; + int32_t y1 = (i * 53) % screenH; + int32_t x2 = (i * 71) % screenW; + int32_t y2 = (i * 89) % screenH; + drv->lineDraw(drv, x1, y1, x2, y2, packColor16(255, 255, 255)); + } + + drv->waitIdle(drv); + clock_t hwLineEnd = clock(); + + // Benchmark host blit (CPU-to-screen) + int32_t bytesPerPix = (drv->mode.bpp + 7) / 8; + int32_t hblitPitch = HBLIT_PAT_W * bytesPerPix; + uint8_t *hblitBuf = (uint8_t *)malloc(hblitPitch * HBLIT_PAT_H); + clock_t hwHblitEnd = 0; + clock_t hwHblitStart = 0; + bool hblitValid = false; + + if (hblitBuf) { + // Fill buffer with a checkerboard pattern + for (int32_t row = 0; row < 
HBLIT_PAT_H; row++) { + for (int32_t col = 0; col < HBLIT_PAT_W; col++) { + uint32_t color; + + if ((row / 8 + col / 8) & 1) { + color = packColor16(255, 255, 0); + } else { + color = packColor16(0, 0, 128); + } + + if (bytesPerPix == 2) { + ((uint16_t *)(hblitBuf + row * hblitPitch))[col] = (uint16_t)color; + } else if (bytesPerPix == 4) { + ((uint32_t *)(hblitBuf + row * hblitPitch))[col] = color; + } else { + hblitBuf[row * hblitPitch + col] = (uint8_t)color; + } + } + } + + hwHblitStart = clock(); + + for (int32_t i = 0; i < BENCH_HBLIT_COUNT; i++) { + int32_t dx = (i * 37) % (screenW - HBLIT_PAT_W); + int32_t dy = (i * 53) % (screenH - HBLIT_PAT_H); + drv->hostBlit(drv, hblitBuf, hblitPitch, dx, dy, HBLIT_PAT_W, HBLIT_PAT_H); + } + + drv->waitIdle(drv); + hwHblitEnd = clock(); + hblitValid = true; + free(hblitBuf); + } + + // Benchmark pattern fill + static const uint8_t benchPattern[8] = { + 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + + clock_t hwPatStart = clock(); + + for (int32_t i = 0; i < BENCH_PATFILL_COUNT; i++) { + int32_t px = (i * 37) % (screenW - 100); + int32_t py = (i * 53) % (screenH - 100); + drv->rectFillPat(drv, px, py, 100, 100, benchPattern, packColor16(255, 255, 255), packColor16(0, 0, 0)); + } + + drv->waitIdle(drv); + clock_t hwPatEnd = clock(); + + // Calculate times in milliseconds + double hwFillMs = (double)(hwFillEnd - hwFillStart) * 1000.0 / CLOCKS_PER_SEC; + double swFillMs = (double)(swFillEnd - swFillStart) * 1000.0 / CLOCKS_PER_SEC; + double hwBltMs = (double)(hwBltEnd - hwBltStart) * 1000.0 / CLOCKS_PER_SEC; + double hwLineMs = (double)(hwLineEnd - hwLineStart) * 1000.0 / CLOCKS_PER_SEC; + double hwHblitMs = (double)(hwHblitEnd - hwHblitStart) * 1000.0 / CLOCKS_PER_SEC; + double hwPatMs = (double)(hwPatEnd - hwPatStart) * 1000.0 / CLOCKS_PER_SEC; + + // Store results, then restore text mode to print + accelShutdown(drv); + + printf("\n=== Benchmark Results ===\n\n"); + + printf("Rectangle Fill (%d x 100x100):\n", 
BENCH_FILL_COUNT); + printf(" Hardware: %.1f ms (%.0f rects/sec)\n", + hwFillMs, BENCH_FILL_COUNT * 1000.0 / hwFillMs); + printf(" Software: %.1f ms (%.0f rects/sec)\n", + swFillMs, BENCH_FILL_COUNT * 1000.0 / swFillMs); + if (swFillMs > 0) { + printf(" Speedup: %.1fx\n", swFillMs / hwFillMs); + } + + printf("\nBitBlt (%d x 100x100 screen-to-screen):\n", BENCH_BLIT_COUNT); + printf(" Hardware: %.1f ms (%.0f blits/sec)\n", + hwBltMs, BENCH_BLIT_COUNT * 1000.0 / hwBltMs); + + printf("\nLine Draw (%d lines):\n", BENCH_LINE_COUNT); + printf(" Hardware: %.1f ms (%.0f lines/sec)\n", + hwLineMs, BENCH_LINE_COUNT * 1000.0 / hwLineMs); + + if (hblitValid) { + printf("\nHost Blit (%d x %dx%d CPU-to-screen):\n", + BENCH_HBLIT_COUNT, HBLIT_PAT_W, HBLIT_PAT_H); + printf(" Hardware: %.1f ms (%.0f blits/sec)\n", + hwHblitMs, BENCH_HBLIT_COUNT * 1000.0 / hwHblitMs); + } + + printf("\nPattern Fill (%d x 100x100):\n", BENCH_PATFILL_COUNT); + printf(" Hardware: %.1f ms (%.0f fills/sec)\n", + hwPatMs, BENCH_PATFILL_COUNT * 1000.0 / hwPatMs); + + printf("\nPress any key to exit...\n"); + readKey(); +} + + +// ============================================================ +// demoBitBlt +// ============================================================ +// +// Demonstrates screen-to-screen BitBLT by filling colored +// rectangles and then copying them around the screen. 
+ +static void demoBitBlt(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, 0); + + // Draw some source rectangles + drv->rectFill(drv, 10, 10, 100, 100, packColor16(255, 0, 0)); + drv->rectFill(drv, 120, 10, 100, 100, packColor16(0, 255, 0)); + drv->rectFill(drv, 230, 10, 100, 100, packColor16(0, 0, 255)); + drv->rectFill(drv, 340, 10, 100, 100, packColor16(255, 255, 0)); + drv->waitIdle(drv); + + // Copy them diagonally across the screen + for (int32_t i = 0; i < 5; i++) { + int32_t offsetY = 120 + i * 60; + + if (offsetY + 100 > screenH) { + break; + } + + drv->bitBlt(drv, 10, 10, 10 + i * 30, offsetY, 430, 100); + } + + drv->waitIdle(drv); +} + + +// ============================================================ +// demoColorExpand +// ============================================================ +// +// Demonstrates monochrome color expansion by rendering text-like +// patterns. Creates a simple 8x16 glyph and renders it repeatedly. 
+ +static void demoColorExpand(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, packColor16(0, 0, 128)); + drv->waitIdle(drv); + + // 8x16 glyph bitmaps for several characters + static const uint8_t glyphA[16] = { + 0x00, 0x18, 0x3C, 0x66, 0x66, 0xC3, 0xC3, 0xFF, + 0xFF, 0xC3, 0xC3, 0xC3, 0xC3, 0xC3, 0x00, 0x00 + }; + + static const uint8_t glyphB[16] = { + 0x00, 0xFC, 0xC6, 0xC6, 0xC6, 0xFC, 0xC6, 0xC3, + 0xC3, 0xC3, 0xC6, 0xFC, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphC[16] = { + 0x00, 0x3E, 0x63, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, + 0xC0, 0xC0, 0x63, 0x3E, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphD[16] = { + 0x00, 0xFC, 0xC6, 0xC3, 0xC3, 0xC3, 0xC3, 0xC3, + 0xC3, 0xC3, 0xC6, 0xFC, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphE[16] = { + 0x00, 0xFF, 0xC0, 0xC0, 0xC0, 0xFE, 0xC0, 0xC0, + 0xC0, 0xC0, 0xC0, 0xFF, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphF[16] = { + 0x00, 0xFF, 0xC0, 0xC0, 0xC0, 0xFE, 0xC0, 0xC0, + 0xC0, 0xC0, 0xC0, 0xC0, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t *glyphs[6] = { + glyphA, glyphB, glyphC, glyphD, glyphE, glyphF + }; + + #define NUM_GLYPHS 6 + + // Color pairs for different rows (foreground/background) + static const uint8_t colorPairs[][6] = { + // R G B R G B (fg, then bg) + {255, 255, 255, 0, 0, 128}, // white on dark blue + {255, 255, 0, 0, 0, 0}, // yellow on black + { 0, 255, 0, 0, 64, 0}, // green on dark green + {255, 128, 0, 64, 0, 0}, // orange on dark red + { 0, 255, 255, 0, 0, 64}, // cyan on navy + {255, 0, 255, 32, 0, 32}, // magenta on dark purple + }; + + #define NUM_COLOR_PAIRS 6 + + int32_t cols = screenW / 8; + int32_t rows = screenH / 16; + + for (int32_t row = 0; row < rows; row++) { + int32_t pairIdx = row % NUM_COLOR_PAIRS; + const uint8_t *pair = colorPairs[pairIdx]; + uint32_t fg = packColor16(pair[0], pair[1], pair[2]); + uint32_t 
bg = packColor16(pair[3], pair[4], pair[5]); + + for (int32_t col = 0; col < cols; col++) { + const uint8_t *glyph = glyphs[(row + col) % NUM_GLYPHS]; + drv->colorExpand(drv, glyph, 1, + col * 8, row * 16, 8, 16, fg, bg); + } + } + + drv->waitIdle(drv); + + #undef NUM_GLYPHS + #undef NUM_COLOR_PAIRS +} + + +// ============================================================ +// demoFillRects +// ============================================================ +// +// Demonstrates hardware rectangle fill with various colors +// and sizes. Draws a pattern of overlapping rectangles. + +static void demoFillRects(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen to dark blue + drv->rectFill(drv, 0, 0, screenW, screenH, packColor16(0, 0, 64)); + drv->waitIdle(drv); + + // Draw concentric rectangles + int32_t colors[][3] = { + {255, 0, 0}, + {0, 255, 0}, + {0, 0, 255}, + {255, 255, 0}, + {255, 0, 255}, + {0, 255, 255}, + {255, 128, 0}, + {128, 0, 255} + }; + int32_t numColors = 8; + + int32_t cx = screenW / 2; + int32_t cy = screenH / 2; + + for (int32_t i = 0; i < numColors; i++) { + int32_t size = 200 - i * 20; + if (size < 10) { + break; + } + + uint32_t color = packColor16(colors[i][0], colors[i][1], colors[i][2]); + drv->rectFill(drv, cx - size / 2, cy - size / 2, size, size, color); + } + + // Draw a grid of small rectangles + for (int32_t y = 10; y < screenH - 30; y += 25) { + for (int32_t x = 10; x < 150; x += 25) { + uint32_t color = packColor16((x * 7) & 0xFF, (y * 3) & 0xFF, ((x + y) * 5) & 0xFF); + drv->rectFill(drv, x, y, 20, 20, color); + } + } + + // Draw grid on right side too + for (int32_t y = 10; y < screenH - 30; y += 25) { + for (int32_t x = screenW - 160; x < screenW - 10; x += 25) { + uint32_t color = packColor16((x * 3) & 0xFF, (y * 7) & 0xFF, ((x + y) * 2) & 0xFF); + drv->rectFill(drv, x, y, 20, 20, color); + } + } + + drv->waitIdle(drv); +} + + +// 
============================================================ +// demoHostBlit +// ============================================================ +// +// Demonstrates CPU-to-screen blit by creating a colorful gradient +// pattern in system RAM, then tiling copies across the screen. + +static void demoHostBlit(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, 0); + drv->waitIdle(drv); + + // Create a gradient tile in system RAM + int32_t tileW = 64; + int32_t tileH = 64; + int32_t bytesPerPix = (drv->mode.bpp + 7) / 8; + int32_t tilePitch = tileW * bytesPerPix; + uint8_t *tileBuf = (uint8_t *)malloc(tilePitch * tileH); + + if (!tileBuf) { + return; + } + + // Fill tile with a radial gradient pattern + int32_t cx = tileW / 2; + int32_t cy = tileH / 2; + + for (int32_t row = 0; row < tileH; row++) { + for (int32_t col = 0; col < tileW; col++) { + int32_t dx = col - cx; + int32_t dy = row - cy; + int32_t dist = dx * dx + dy * dy; + + // Map distance to color -- creates concentric rings + uint8_t r = (dist * 7) & 0xFF; + uint8_t g = (dist * 3 + col * 4) & 0xFF; + uint8_t b = (row * 8 + col * 2) & 0xFF; + uint32_t color = packColor16(r, g, b); + + if (bytesPerPix == 2) { + ((uint16_t *)(tileBuf + row * tilePitch))[col] = (uint16_t)color; + } else if (bytesPerPix == 4) { + ((uint32_t *)(tileBuf + row * tilePitch))[col] = color; + } else { + tileBuf[row * tilePitch + col] = (uint8_t)color; + } + } + } + + // Tile the pattern across the screen + for (int32_t y = 0; y + tileH <= screenH; y += tileH) { + for (int32_t x = 0; x + tileW <= screenW; x += tileW) { + drv->hostBlit(drv, tileBuf, tilePitch, x, y, tileW, tileH); + } + } + + drv->waitIdle(drv); + + // Draw a border around each tile using rect fills for contrast + uint32_t borderColor = packColor16(255, 255, 255); + + for (int32_t y = 0; y + tileH <= screenH; y += tileH) { + drv->rectFill(drv, 0, y, screenW, 1, 
borderColor); + } + + for (int32_t x = 0; x + tileW <= screenW; x += tileW) { + drv->rectFill(drv, x, 0, 1, screenH, borderColor); + } + + drv->waitIdle(drv); + free(tileBuf); +} + + +// ============================================================ +// demoLines +// ============================================================ +// +// Demonstrates hardware line drawing with a starburst pattern. + +static void demoLines(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, 0); + drv->waitIdle(drv); + + int32_t cx = screenW / 2; + int32_t cy = screenH / 2; + + // Draw starburst from center + for (int32_t i = 0; i < 360; i += 3) { + // Simple integer approximation of sin/cos using a lookup + // approach. For a demo, we just use the endpoint calculation. + int32_t dx = 0; + int32_t dy = 0; + + // Approximate angle -> direction + int32_t radius = (screenH / 2) - 10; + int32_t angle = i; + + // Crude trig via quadrant decomposition + int32_t quadrant = (angle / 90) % 4; + int32_t subAngle = angle % 90; + + // Linear interpolation within each quadrant (good enough for demo) + int32_t frac = subAngle * radius / 90; + int32_t comp = radius - frac; + + switch (quadrant) { + case 0: dx = frac; dy = -comp; break; + case 1: dx = comp; dy = frac; break; + case 2: dx = -frac; dy = comp; break; + case 3: dx = -comp; dy = -frac; break; + } + + uint32_t color = packColor16( + (i * 3) & 0xFF, + (i * 5 + 100) & 0xFF, + (i * 7 + 50) & 0xFF + ); + + drv->lineDraw(drv, cx, cy, cx + dx, cy + dy, color); + } + + // Draw border rectangle with lines + uint32_t white = packColor16(255, 255, 255); + drv->lineDraw(drv, 0, 0, screenW - 1, 0, white); + drv->lineDraw(drv, screenW - 1, 0, screenW - 1, screenH - 1, white); + drv->lineDraw(drv, screenW - 1, screenH - 1, 0, screenH - 1, white); + drv->lineDraw(drv, 0, screenH - 1, 0, 0, white); + + drv->waitIdle(drv); +} + + +// 
============================================================ +// demoPatternFill +// ============================================================ +// +// Demonstrates 8x8 pattern fills with several distinct patterns +// drawn side by side in colored rectangles. + +static void demoPatternFill(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen to dark gray + drv->rectFill(drv, 0, 0, screenW, screenH, packColor16(32, 32, 32)); + drv->waitIdle(drv); + + // Define several 8x8 patterns + static const uint8_t patCheckerboard[8] = { + 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + + static const uint8_t patCrosshatch[8] = { + 0xFF, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + }; + + static const uint8_t patDiagStripes[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 + }; + + static const uint8_t patDots[8] = { + 0x00, 0x22, 0x00, 0x88, 0x00, 0x22, 0x00, 0x88 + }; + + static const uint8_t patHorzStripes[8] = { + 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 + }; + + static const uint8_t patVertStripes[8] = { + 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA + }; + + struct { + const uint8_t *pattern; + uint32_t fg; + uint32_t bg; + } patterns[] = { + {patCheckerboard, packColor16(255, 255, 255), packColor16(0, 0, 0)}, + {patCrosshatch, packColor16(255, 255, 0), packColor16(0, 0, 128)}, + {patDiagStripes, packColor16(0, 255, 0), packColor16(0, 64, 0)}, + {patDots, packColor16(255, 0, 0), packColor16(64, 0, 0)}, + {patHorzStripes, packColor16(0, 255, 255), packColor16(0, 0, 64)}, + {patVertStripes, packColor16(255, 0, 255), packColor16(64, 0, 64)}, + }; + + int32_t numPatterns = 6; + + // Arrange patterns in a 3x2 grid + int32_t margin = 20; + int32_t spacing = 10; + int32_t cellW = (screenW - 2 * margin - (3 - 1) * spacing) / 3; + int32_t cellH = (screenH - 2 * margin - (2 - 1) * spacing) / 2; + + for (int32_t i = 0; i < numPatterns; i++) { + int32_t gridCol = i % 3; + int32_t gridRow = i / 3; + 
int32_t x = margin + gridCol * (cellW + spacing); + int32_t y = margin + gridRow * (cellH + spacing); + + drv->rectFillPat(drv, x, y, cellW, cellH, + patterns[i].pattern, + patterns[i].fg, patterns[i].bg); + } + + drv->waitIdle(drv); +} + + +// ============================================================ +// isKeyPressed +// ============================================================ +// +// Non-blocking check for a keypress using BIOS INT 16h. + +static bool isKeyPressed(void) { + __dpmi_regs r; + + memset(&r, 0, sizeof(r)); + r.h.ah = 0x11; // check for extended keystroke + __dpmi_int(0x16, &r); + + return !(r.x.flags & 0x40); // ZF clear = key available +} + + +// ============================================================ +// main +// ============================================================ + +int main(int argc, char *argv[]) { + int32_t reqW = DEFAULT_WIDTH; + int32_t reqH = DEFAULT_HEIGHT; + int32_t reqBpp = DEFAULT_BPP; + + if (argc >= 4) { + reqW = atoi(argv[1]); + reqH = atoi(argv[2]); + reqBpp = atoi(argv[3]); + } + + printf("DOS Accelerated Video Driver Demo\n"); + printf("Requested mode: %ldx%ldx%ld\n\n", (long)reqW, (long)reqH, (long)reqBpp); + + // Register all available drivers + atiRegisterDriver(); + bansheeRegisterDriver(); + clRegisterDriver(); + etRegisterDriver(); + lagunaRegisterDriver(); + mgaRegisterDriver(); + nvRegisterDriver(); + s3RegisterDriver(); + sisRegisterDriver(); + tridentRegisterDriver(); + + // Detect hardware + printf("Scanning PCI bus for supported video hardware...\n"); + AccelDriverT *drv = accelDetect(); + + if (!drv) { + printf("No supported video hardware found.\n"); + printf("\nPCI video devices present:\n"); + + // Enumerate and display all VGA-class PCI devices for diagnostics + for (int32_t bus = 0; bus < 256; bus++) { + for (int32_t dev = 0; dev < 32; dev++) { + uint16_t vid = pciRead16(bus, dev, 0, PCI_VENDOR_ID); + + if (vid == 0xFFFF) { + continue; + } + + uint8_t baseClass = pciRead8(bus, dev, 0, 
PCI_BASE_CLASS); + + if (baseClass == PCI_CLASS_DISPLAY) { + uint16_t did = pciRead16(bus, dev, 0, PCI_DEVICE_ID); + printf(" %02lX:%02lX.0 vendor=%04X device=%04X\n", + (long)bus, (long)dev, vid, did); + } + } + } + + return 1; + } + + // Initialize with requested mode + AccelModeRequestT modeReq; + modeReq.width = reqW; + modeReq.height = reqH; + modeReq.bpp = reqBpp; + + if (!accelInit(drv, &modeReq)) { + printf("Failed to initialize video driver.\n"); + return 1; + } + + printf("\nDriver: %s\n", accelGetName(drv)); + printf("Mode: %ldx%ldx%ld (pitch=%ld)\n", + (long)drv->mode.width, (long)drv->mode.height, + (long)drv->mode.bpp, (long)drv->mode.pitch); + printf("VRAM: %lu KB\n", (unsigned long)(drv->mode.vramSize / 1024)); + printf("\nPress any key to start demos...\n"); + printf(" SPACE = next demo\n"); + printf(" B = benchmark\n"); + printf(" ESC = exit\n"); + readKey(); + + // Run demos in a loop + int32_t currentDemo = 0; + int32_t numDemos = 6; + bool running = true; + + while (running) { + switch (currentDemo) { + case 0: + demoFillRects(drv); + break; + case 1: + demoBitBlt(drv); + break; + case 2: + demoLines(drv); + break; + case 3: + demoColorExpand(drv); + break; + case 4: + demoHostBlit(drv); + break; + case 5: + demoPatternFill(drv); + break; + } + + // Wait for keypress + while (!isKeyPressed()) { + // spin + } + + uint8_t key = readKey(); + + switch (key) { + case 0x01: // ESC + running = false; + break; + case 0x30: // 'b' + demoBenchmark(drv); + return 0; // benchmark already shut down the driver + case 0x39: // space + currentDemo = (currentDemo + 1) % numDemos; + break; + default: + currentDemo = (currentDemo + 1) % numDemos; + break; + } + } + + accelShutdown(drv); + printf("Demo complete.\n"); + + return 0; +} + + +// ============================================================ +// packColor16 +// ============================================================ +// +// Packs an RGB triplet into 16-bit 565 format. 
+// This is a simplification -- a real integration would use the +// display's actual pixel format. For the demo, 565 is fine since +// that's what most 16-bit VESA modes use. + +static uint32_t packColor16(uint8_t r, uint8_t g, uint8_t b) { + return ((uint32_t)(r >> 3) << 11) + | ((uint32_t)(g >> 2) << 5) + | ((uint32_t)(b >> 3)); +} + + +// ============================================================ +// readKey +// ============================================================ +// +// Blocking read of one keypress via BIOS INT 16h. +// Returns the scan code. + +static uint8_t readKey(void) { + __dpmi_regs r; + + memset(&r, 0, sizeof(r)); + r.h.ah = 0x10; // read extended keystroke + __dpmi_int(0x16, &r); + + return r.h.ah; // scan code +} + + +// ============================================================ +// softFillRect +// ============================================================ +// +// Software rectangle fill for benchmark comparison. Writes +// directly to the LFB (intentionally slow due to PCI bus writes). 
+ +static void softFillRect(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + + for (int32_t row = 0; row < h; row++) { + uint8_t *dst = fb + (y + row) * pitch + x * bpp; + + if (bpp == 2) { + uint16_t *dst16 = (uint16_t *)dst; + + for (int32_t col = 0; col < w; col++) { + dst16[col] = (uint16_t)color; + } + } else if (bpp == 4) { + uint32_t *dst32 = (uint32_t *)dst; + + for (int32_t col = 0; col < w; col++) { + dst32[col] = color; + } + } else { + for (int32_t col = 0; col < w; col++) { + dst[col] = (uint8_t)color; + } + } + } +} diff --git a/matroxMga.c b/matroxMga.c new file mode 100644 index 0000000..6f16aed --- /dev/null +++ b/matroxMga.c @@ -0,0 +1,843 @@ +// matroxMga.c -- Matrox Millennium/Mystique/G200/G400 accelerated video driver +// +// Supports the Matrox MGA family: MGA2064W (Millennium), MGA1064SG +// (Mystique), G100, G200, and G400/G450. The Matrox 2D drawing engine +// is widely regarded as the best 2D accelerator of the PCI/AGP era, +// with features including: +// - Solid and pattern rectangle fill +// - Screen-to-screen BitBLT (very fast, pipelined) +// - CPU-to-screen blit with color expansion (ILOAD) +// - Bresenham line draw (antialiased on G200+) +// - Trapezoid fill +// - Hardware clip rectangle +// - 64x64 three-color hardware cursor +// +// Register access: +// The MGA register block is mapped via BAR0 (PCI) or BAR1 +// depending on the chip. It's a 16KB MMIO region. The drawing +// engine registers start at offset 0x1C00 within this block. +// +// The drawing engine uses a command-based model: you set up +// parameters (colors, coordinates, dimensions) in the setup +// registers, then write to DWGCTL to start the operation. +// Some operations auto-execute when the last parameter is +// written (e.g., LEN triggers a draw). 
+// +// FIFO: +// The MGA has a deep command FIFO (64 entries on Millennium). +// The FIFOSTATUS register indicates how many entries are free. +// On G200+, the FIFO is deeper and the STATUS register has +// a busy bit that's more reliable. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Matrox vendor/device IDs +// ============================================================ + +#define MATROX_VENDOR_ID 0x102B + +#define MGA_2064W 0x0519 // Millennium +#define MGA_1064SG 0x051A // Mystique +#define MGA_G100_PCI 0x1000 +#define MGA_G100_AGP 0x1001 +#define MGA_G200_PCI 0x0521 +#define MGA_G200_AGP 0x0520 +#define MGA_G400 0x0525 +#define MGA_G450 0x2527 + +static const uint16_t sMatroxDeviceIds[] = { + MATROX_VENDOR_ID, MGA_2064W, + MATROX_VENDOR_ID, MGA_1064SG, + MATROX_VENDOR_ID, MGA_G100_PCI, + MATROX_VENDOR_ID, MGA_G100_AGP, + MATROX_VENDOR_ID, MGA_G200_PCI, + MATROX_VENDOR_ID, MGA_G200_AGP, + MATROX_VENDOR_ID, MGA_G400, + MATROX_VENDOR_ID, MGA_G450, + 0, 0 +}; + +// ============================================================ +// MGA drawing engine register offsets (from MMIO base) +// ============================================================ + +// Drawing engine setup registers (0x1C00 - 0x1CFF) +#define MGA_DWGCTL 0x1C00 // drawing control +#define MGA_MACCESS 0x1C04 // memory access control +#define MGA_MCTLWTST 0x1C08 // memory control wait state +#define MGA_ZORG 0x1C0C // Z origin +#define MGA_PAT0 0x1C10 // pattern register 0 +#define MGA_PAT1 0x1C14 // pattern register 1 +#define MGA_PLNWT 0x1C1C // plane write mask +#define MGA_BCOL 0x1C20 // background color +#define MGA_FCOL 0x1C24 // foreground color +#define MGA_SRC0 0x1C30 // source data 0 (for color expand) +#define MGA_SRC1 0x1C34 +#define MGA_SRC2 0x1C38 +#define MGA_SRC3 0x1C3C +#define MGA_XYSTRT 0x1C40 // XY start (for lines) +#define MGA_XYEND 
0x1C44 // XY end (triggers line draw) +#define MGA_SHIFT 0x1C50 +#define MGA_SGN 0x1C58 // sign register +#define MGA_LEN 0x1C5C // number of lines (triggers rect ops) +#define MGA_AR0 0x1C60 // line draw parameter 0 +#define MGA_AR1 0x1C64 +#define MGA_AR2 0x1C68 +#define MGA_AR3 0x1C6C +#define MGA_AR4 0x1C70 +#define MGA_AR5 0x1C74 +#define MGA_AR6 0x1C78 +#define MGA_CXBNDRY 0x1C80 // clip X boundaries (left | right<<16) +#define MGA_FXBNDRY 0x1C84 // fill X boundaries (left | right<<16) +#define MGA_YDSTLEN 0x1C88 // Y dest and length (triggers fill) +#define MGA_PITCH 0x1C8C // destination pitch (in pixels) +#define MGA_YDST 0x1C90 // Y destination +#define MGA_YDSTORG 0x1C94 // Y destination origin (byte offset) +#define MGA_YTOP 0x1C98 // clip Y top +#define MGA_YBOT 0x1C9C // clip Y bottom +#define MGA_CXLEFT 0x1CA0 // clip X left +#define MGA_CXRIGHT 0x1CA4 // clip X right +#define MGA_FXLEFT 0x1CA8 // fill X left +#define MGA_FXRIGHT 0x1CAC // fill X right +#define MGA_XDST 0x1CB0 // X destination + +// Status registers (0x1E00 - 0x1EFF) +#define MGA_FIFOSTATUS 0x1E10 // FIFO status +#define MGA_STATUS 0x1E14 // engine status +#define MGA_ICLEAR 0x1E18 // interrupt clear +#define MGA_IEN 0x1E1C // interrupt enable + +// Source window (for BitBLT) +#define MGA_SRCORG 0x2CB4 // source origin + +// DWGSYNC for synchronization +#define MGA_DWGSYNC 0x2C4C + +// ============================================================ +// MGA DWGCTL command values +// ============================================================ +// +// The DWGCTL register is a 32-bit command word that encodes the +// operation type, drawing options, and raster operation. 
+ +// Operation codes (bits 3:0) +#define MGA_OPCOD_LINE_OPEN 0x00 // line (open) +#define MGA_OPCOD_AUTOLINE_OPEN 0x01 +#define MGA_OPCOD_LINE_CLOSE 0x02 // line (closed) +#define MGA_OPCOD_AUTOLINE_CLOSE 0x03 +#define MGA_OPCOD_TRAP 0x04 // trapezoid fill +#define MGA_OPCOD_TEXTURE 0x05 // texture mapping (G200+) +#define MGA_OPCOD_BITBLT 0x08 // screen-to-screen blit +#define MGA_OPCOD_ILOAD 0x09 // CPU-to-screen (image load) +#define MGA_OPCOD_IDUMP 0x0A // screen-to-CPU + +// Drawing options (bits 31:4) +#define MGA_ATYPE_RPL 0x0000 // replace +#define MGA_ATYPE_RSTR 0x0010 // raster +#define MGA_ATYPE_ZI 0x0030 // Z interpolate +#define MGA_ATYPE_BLK 0x0040 // block transfer +#define MGA_ATYPE_I 0x0070 // interpolate + +#define MGA_ZMODE_NOZCMP 0x0000 // no Z compare +#define MGA_ZMODE_ZE 0x0200 // Z equal +#define MGA_ZMODE_ZNE 0x0300 // Z not equal + +#define MGA_SOLID 0x0800 // solid fill (no pattern) +#define MGA_ARZERO 0x1000 // AR regs are zero (solid fill optimization) +#define MGA_SGNZERO 0x2000 // SGN reg is zero +#define MGA_SHFTZERO 0x4000 // SHIFT reg is zero + +#define MGA_BOP_MASK 0x000F0000 // boolean operation (ROP) mask +#define MGA_BOP_SHIFT 16 + +// Boolean operations (ROP2, bits 19:16) +#define MGA_BOP_CLEAR (0x0 << MGA_BOP_SHIFT) +#define MGA_BOP_NOR (0x1 << MGA_BOP_SHIFT) +#define MGA_BOP_COPYINV (0x3 << MGA_BOP_SHIFT) +#define MGA_BOP_AND (0x8 << MGA_BOP_SHIFT) +#define MGA_BOP_XOR (0x6 << MGA_BOP_SHIFT) +#define MGA_BOP_COPY (0xC << MGA_BOP_SHIFT) +#define MGA_BOP_OR (0xE << MGA_BOP_SHIFT) +#define MGA_BOP_SET (0xF << MGA_BOP_SHIFT) + +// Transparency +#define MGA_TRANSC 0x00100000 // transparent color compare +#define MGA_BLTMOD_BFCOL 0x04000000 // BLT mode: foreground color +#define MGA_BLTMOD_BU32RGB 0x0C000000 // BLT mode: 32bpp ILOAD +#define MGA_BLTMOD_BMONOWF 0x08000000 // BLT mode: mono word expand MSB first + +// Pattern +#define MGA_PATTERN 0x20000000 // enable pattern + +// Linear source +#define MGA_LINEAR 0x80000000 // 
linear addressing (not XY) + +// ============================================================ +// MGA MACCESS values +// ============================================================ + +#define MGA_MACCESS_8BPP 0x00 +#define MGA_MACCESS_16BPP 0x01 +#define MGA_MACCESS_32BPP 0x02 +#define MGA_MACCESS_24BPP 0x03 + +// ============================================================ +// MGA SGN register bits +// ============================================================ + +#define MGA_SGN_SCANLEFT 0x01 // scan direction left +#define MGA_SGN_SCANRIGHT 0x00 // scan direction right +#define MGA_SGN_SDY_NEG 0x02 // negative Y direction +#define MGA_SGN_SDX_NEG 0x04 // negative X direction + +// ============================================================ +// MGA STATUS register bits +// ============================================================ + +#define MGA_STATUS_BUSY 0x00010000 // drawing engine busy +#define MGA_FIFO_FULL_MASK 0x0000007F // FIFO free count + +// Maximum wait iterations +#define MGA_MAX_IDLE_WAIT 1000000 + +// Hardware cursor +#define MGA_HW_CURSOR_SIZE 64 +#define MGA_HW_CURSOR_BYTES 1024 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; // mapped MMIO base + DpmiMappingT lfbMapping; + DpmiMappingT mmioMapping; + bool isG200Plus; // G200/G400/G450 +} MatroxPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void mgaBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void mgaColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, 
int32_t h, uint32_t fg, uint32_t bg); +static bool mgaDetect(AccelDriverT *drv); +static void mgaHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool mgaInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void mgaLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void mgaMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void mgaRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void mgaRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void mgaSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void mgaSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void mgaShowCursor(AccelDriverT *drv, bool visible); +static void mgaShutdown(AccelDriverT *drv); +static void mgaWaitFifo(MatroxPrivateT *priv, int32_t entries); +static void mgaWaitIdle(AccelDriverT *drv); + +static inline void mgaWrite(MatroxPrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t mgaRead(MatroxPrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static MatroxPrivateT sMatroxPrivate; + +static AccelDriverT sMatroxDriver = { + .name = "Matrox Millennium", + .chipFamily = "matrox", + .caps = 0, + .privData = &sMatroxPrivate, + .detect = mgaDetect, + .init = mgaInit, + .shutdown = mgaShutdown, + .waitIdle = mgaWaitIdle, + .setClip = mgaSetClip, + .rectFill = mgaRectFill, + .rectFillPat = mgaRectFillPat, + .bitBlt = mgaBitBlt, + .hostBlit = mgaHostBlit, + .colorExpand = mgaColorExpand, + .lineDraw = mgaLineDraw, + .setCursor = mgaSetCursor, + .moveCursor = mgaMoveCursor, + 
.showCursor = mgaShowCursor, +}; + +// ============================================================ +// mgaRegisterDriver +// ============================================================ + +void mgaRegisterDriver(void) { + accelRegisterDriver(&sMatroxDriver); +} + + +// ============================================================ +// mgaBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT using the MGA BITBLT opcode. +// The MGA engine uses pixel coordinates and pitch, with the +// sign register controlling direction for overlapping blits. + +static void mgaBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Determine direction + uint32_t sgn = 0; + int32_t startX; + int32_t endX; + int32_t startY = dstY; + uint32_t srcOrg = srcY * priv->screenPitch + srcX * priv->bytesPerPixel; + + if (dstX <= srcX) { + // Left to right + startX = dstX; + endX = dstX + w - 1; + } else { + // Right to left + startX = dstX + w - 1; + endX = dstX; + sgn |= MGA_SGN_SCANLEFT; + srcOrg = srcY * priv->screenPitch + (srcX + w - 1) * priv->bytesPerPixel; + } + + if (dstY > srcY) { + // Bottom to top + sgn |= MGA_SGN_SDY_NEG; + startY = dstY + h - 1; + srcOrg = (srcY + h - 1) * priv->screenPitch + srcX * priv->bytesPerPixel; + if (sgn & MGA_SGN_SCANLEFT) { + srcOrg = (srcY + h - 1) * priv->screenPitch + (srcX + w - 1) * priv->bytesPerPixel; + } + } + + mgaWaitFifo(priv, 8); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_BITBLT | MGA_ATYPE_BLK | MGA_BOP_COPY | MGA_SHFTZERO); + mgaWrite(priv, MGA_SGN, sgn); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + mgaWrite(priv, MGA_SRCORG, srcOrg); + mgaWrite(priv, MGA_AR5, (sgn & MGA_SGN_SDY_NEG) ? 
-(priv->screenPitch / priv->bytesPerPixel) : (priv->screenPitch / priv->bytesPerPixel)); + + // Set boundaries and trigger + mgaWrite(priv, MGA_FXBNDRY, ((uint32_t)endX << 16) | (uint32_t)(startX & 0xFFFF)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)startY << 16) | (uint32_t)h); +} + + +// ============================================================ +// mgaColorExpand +// ============================================================ +// +// CPU-to-screen monochrome color expansion using the MGA ILOAD +// opcode with BLTMOD_BMONOWF. Monochrome bitmap bits are expanded +// to foreground/background colors by the hardware. Data is fed +// as dwords through MGA_SRC0. + +static void mgaColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = (w + 7) / 8; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + mgaWaitFifo(priv, 6); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_ILOAD | MGA_ATYPE_RPL | MGA_BOP_COPY + | MGA_BLTMOD_BMONOWF | MGA_SHFTZERO | MGA_SGNZERO); + mgaWrite(priv, MGA_FCOL, fg); + mgaWrite(priv, MGA_BCOL, bg); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + mgaWrite(priv, MGA_FXBNDRY, (uint32_t)dstX | ((uint32_t)(dstX + w) << 16)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)dstY << 16) | (uint32_t)h); + + // Feed monochrome data row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + mgaWaitFifo(priv, 1); + mgaWrite(priv, MGA_SRC0, val); + } + } +} + + +// ============================================================ +// mgaDetect +// 
============================================================ + +static bool mgaDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sMatroxDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + switch (drv->pciDev.deviceId) { + case MGA_2064W: + drv->name = "Matrox Millennium"; + priv->isG200Plus = false; + break; + case MGA_1064SG: + drv->name = "Matrox Mystique"; + priv->isG200Plus = false; + break; + case MGA_G100_PCI: + case MGA_G100_AGP: + drv->name = "Matrox G100"; + priv->isG200Plus = true; + break; + case MGA_G200_PCI: + case MGA_G200_AGP: + drv->name = "Matrox G200"; + priv->isG200Plus = true; + break; + case MGA_G400: + drv->name = "Matrox G400"; + priv->isG200Plus = true; + break; + case MGA_G450: + drv->name = "Matrox G450"; + priv->isG200Plus = true; + break; + default: + drv->name = "Matrox MGA"; + priv->isG200Plus = false; + break; + } + + return true; +} + + +// ============================================================ +// mgaHostBlit +// ============================================================ +// +// CPU-to-screen blit using the MGA ILOAD opcode. Pixel data is +// written from host memory to the framebuffer through the MMIO +// window via MGA_SRC0. Each row is padded to a dword boundary. 
+ +static void mgaHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + mgaWaitFifo(priv, 5); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_ILOAD | MGA_ATYPE_RPL | MGA_BOP_COPY + | MGA_SHFTZERO | MGA_SGNZERO); + mgaWrite(priv, MGA_FCOL, 0xFFFFFFFF); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + mgaWrite(priv, MGA_FXBNDRY, (uint32_t)dstX | ((uint32_t)(dstX + w) << 16)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)dstY << 16) | (uint32_t)h); + + // Feed pixel data row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + mgaWaitFifo(priv, 1); + mgaWrite(priv, MGA_SRC0, val); + } + } +} + + +// ============================================================ +// mgaInit +// ============================================================ + +static bool mgaInit(AccelDriverT *drv, const AccelModeRequestT *req) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + // BAR layout depends on chip: + // Millennium (2064W): BAR0 = control regs (16KB), BAR1 = framebuffer + // Mystique+: BAR0 = control regs (16KB), BAR1 = framebuffer + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + // Map MMIO 
control registers (16KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, 16384, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Configure MACCESS for pixel depth + uint32_t maccess; + switch (vesa.bpp) { + case 8: maccess = MGA_MACCESS_8BPP; break; + case 15: + case 16: maccess = MGA_MACCESS_16BPP; break; + case 32: maccess = MGA_MACCESS_32BPP; break; + default: maccess = MGA_MACCESS_16BPP; break; + } + + mgaWaitIdle(drv); + mgaWrite(priv, MGA_MACCESS, maccess); + + // Set pitch (in pixels) + mgaWrite(priv, MGA_PITCH, vesa.pitch / priv->bytesPerPixel); + + // Set YDSTORG to 0 (framebuffer starts at beginning of VRAM) + mgaWrite(priv, MGA_YDSTORG, 0); + + // Plane write mask: all bits + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + + // Set up cursor at end of VRAM + priv->cursorOffset = priv->vramSize - MGA_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(MGA_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + mgaSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// 
============================================================ +// mgaLineDraw +// ============================================================ +// +// Line drawing using the MGA AUTOLINE opcode. The MGA engine +// takes start XY and end XY coordinates directly (no Bresenham +// parameter computation needed on the CPU side). + +static void mgaLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + mgaWaitFifo(priv, 5); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_AUTOLINE_CLOSE | MGA_ATYPE_RPL | MGA_SOLID + | MGA_BOP_COPY | MGA_SHFTZERO | MGA_SGNZERO | MGA_ARZERO); + mgaWrite(priv, MGA_FCOL, color); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + + // Start coordinate + mgaWrite(priv, MGA_XYSTRT, ((uint32_t)(y1 & 0xFFFF) << 16) | (uint32_t)(x1 & 0xFFFF)); + + // End coordinate (triggers draw) + mgaWrite(priv, MGA_XYEND, ((uint32_t)(y2 & 0xFFFF) << 16) | (uint32_t)(x2 & 0xFFFF)); +} + + +// ============================================================ +// mgaMoveCursor +// ============================================================ +// +// Matrox cursor position is set via RAMDAC registers. +// On Millennium: TVP3026 RAMDAC external registers. +// On Mystique+: integrated RAMDAC at MMIO offset 0x3C00+. + +static void mgaMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + // Cursor position via DAC registers (Mystique/G200+ integrated DAC) + // CURPOS register at MMIO + 0x3C0C + mgaWrite(priv, 0x3C0C, ((uint32_t)(y & 0xFFF) << 16) | (uint32_t)(x & 0xFFF)); +} + + +// ============================================================ +// mgaRectFill +// ============================================================ +// +// Solid rectangle fill using the MGA TRAP opcode with the SOLID +// bit set. 
This is the fastest path for solid fills -- the +// engine fills with the foreground color using the ARZERO and +// SGNZERO hints to skip setup of unused registers. + +static void mgaRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + mgaWaitFifo(priv, 5); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_TRAP | MGA_ATYPE_BLK | MGA_SOLID + | MGA_BOP_COPY | MGA_ARZERO | MGA_SGNZERO | MGA_SHFTZERO); + mgaWrite(priv, MGA_FCOL, color); + + // Set X boundaries + mgaWrite(priv, MGA_FXBNDRY, ((uint32_t)(x + w) << 16) | (uint32_t)(x & 0xFFFF)); + + // Set Y destination and length (triggers fill) + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)(y & 0xFFFF) << 16) | (uint32_t)(h & 0xFFFF)); +} + + +// ============================================================ +// mgaRectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using the MGA TRAP opcode with the +// MGA_PATTERN bit set. The pattern is 8 bytes (one per row, +// MSB-first), loaded into PAT0 (rows 0-3) and PAT1 (rows 4-7). +// 1-bits use the foreground color, 0-bits use the background. 
+ +static void mgaRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Pack pattern rows 0-3 into PAT0 and rows 4-7 into PAT1 + uint32_t pat0 = (uint32_t)pattern[0] + | ((uint32_t)pattern[1] << 8) + | ((uint32_t)pattern[2] << 16) + | ((uint32_t)pattern[3] << 24); + uint32_t pat1 = (uint32_t)pattern[4] + | ((uint32_t)pattern[5] << 8) + | ((uint32_t)pattern[6] << 16) + | ((uint32_t)pattern[7] << 24); + + mgaWaitFifo(priv, 8); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_TRAP | MGA_ATYPE_RPL | MGA_PATTERN + | MGA_BOP_COPY | MGA_ARZERO | MGA_SGNZERO | MGA_SHFTZERO); + mgaWrite(priv, MGA_FCOL, fg); + mgaWrite(priv, MGA_BCOL, bg); + mgaWrite(priv, MGA_PAT0, pat0); + mgaWrite(priv, MGA_PAT1, pat1); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + + // Set X boundaries and trigger fill + mgaWrite(priv, MGA_FXBNDRY, ((uint32_t)(x + w) << 16) | (uint32_t)(x & 0xFFFF)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)(y & 0xFFFF) << 16) | (uint32_t)(h & 0xFFFF)); +} + + +// ============================================================ +// mgaSetClip +// ============================================================ + +static void mgaSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + mgaWaitFifo(priv, 3); + mgaWrite(priv, MGA_CXBNDRY, ((uint32_t)(x + w - 1) << 16) | (uint32_t)(x & 0xFFFF)); + mgaWrite(priv, MGA_YTOP, y * (priv->screenPitch / priv->bytesPerPixel)); + mgaWrite(priv, MGA_YBOT, (y + h - 1) * (priv->screenPitch / priv->bytesPerPixel)); +} + + +// ============================================================ +// mgaSetCursor +// ============================================================ + +static void mgaSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + 
+ if (!image) { + mgaShowCursor(drv, false); + return; + } + + mgaWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < MGA_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor base address via DAC register + // CURBASE at MMIO + 0x3C04 + mgaWrite(priv, 0x3C04, priv->cursorOffset); +} + + +// ============================================================ +// mgaShowCursor +// ============================================================ + +static void mgaShowCursor(AccelDriverT *drv, bool visible) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + // CURCTL at MMIO + 0x3C00 + uint32_t curCtl = mgaRead(priv, 0x3C00); + + if (visible) { + curCtl |= 0x01; // enable cursor + } else { + curCtl &= ~0x01; + } + + mgaWrite(priv, 0x3C00, curCtl); +} + + +// ============================================================ +// mgaShutdown +// ============================================================ + +static void mgaShutdown(AccelDriverT *drv) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + mgaShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->mmioMapping); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// mgaWaitFifo +// ============================================================ +// +// Wait until the MGA FIFO has enough free entries. +// FIFOSTATUS bits 6:0 indicate the number of free slots. 
+ +static void mgaWaitFifo(MatroxPrivateT *priv, int32_t entries) { + for (int32_t i = 0; i < MGA_MAX_IDLE_WAIT; i++) { + uint32_t stat = mgaRead(priv, MGA_FIFOSTATUS); + int32_t free = stat & MGA_FIFO_FULL_MASK; + + if (free >= entries) { + return; + } + } +} + + +// ============================================================ +// mgaWaitIdle +// ============================================================ + +static void mgaWaitIdle(AccelDriverT *drv) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + for (int32_t i = 0; i < MGA_MAX_IDLE_WAIT; i++) { + uint32_t stat = mgaRead(priv, MGA_STATUS); + if (!(stat & MGA_STATUS_BUSY)) { + return; + } + } +} diff --git a/nvidia.c b/nvidia.c new file mode 100644 index 0000000..109b174 --- /dev/null +++ b/nvidia.c @@ -0,0 +1,677 @@ +// nvidia.c -- Nvidia RIVA 128/TNT/TNT2 accelerated video driver +// +// Supports the Nvidia RIVA family: RIVA 128, RIVA 128 ZX, TNT, +// TNT2, TNT2 Ultra, TNT2 M64, and Vanta. These were high- +// performance 2D/3D accelerators of the late 1990s featuring: +// - Solid rectangle fill +// - Screen-to-screen BitBLT +// - Host-to-screen blit (CPU data transfer) +// - Hardware clip rectangle +// - 64x64 two-color hardware cursor via PRAMDAC +// +// Register access: +// The NV architecture uses memory-mapped I/O via BAR0 (16MB +// MMIO register space) and BAR1 (framebuffer). The 2D engine +// is accessed through the FIFO user space at BAR0 + 0x800000, +// which provides subchannel-based access to graphics objects. +// +// Subchannel layout: +// Sub 0 (0x0000): ROP +// Sub 1 (0x2000): Clip +// Sub 2 (0x4000): Pattern +// Sub 3 (0x6000): GdiRectangle (solid fill) +// Sub 4 (0x8000): ScreenScreenBlt +// Sub 5 (0xA000): ImageFromCpu +// +// Each subchannel has methods starting at +0x0100 within +// its range. The PGRAPH_STATUS register at 0x400700 indicates +// engine busy status (0 = idle). 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Nvidia vendor/device IDs +// ============================================================ + +#define NV_VENDOR_ID 0x10DE + +#define NV_RIVA_128 0x0018 // RIVA 128 +#define NV_RIVA_128_ZX 0x0019 // RIVA 128 ZX +#define NV_TNT 0x0020 // RIVA TNT +#define NV_TNT2 0x0028 // RIVA TNT2 +#define NV_TNT2_ULTRA 0x0029 // RIVA TNT2 Ultra +#define NV_TNT2_M64 0x002D // RIVA TNT2 M64 +#define NV_VANTA 0x002C // Vanta + +static const uint16_t sNvDeviceIds[] = { + NV_VENDOR_ID, NV_RIVA_128, + NV_VENDOR_ID, NV_RIVA_128_ZX, + NV_VENDOR_ID, NV_TNT, + NV_VENDOR_ID, NV_TNT2, + NV_VENDOR_ID, NV_TNT2_ULTRA, + NV_VENDOR_ID, NV_TNT2_M64, + NV_VENDOR_ID, NV_VANTA, + 0, 0 +}; + +// ============================================================ +// MMIO register offsets (from BAR0) +// ============================================================ + +// PGRAPH status +#define NV_PGRAPH_STATUS 0x400700 // 0 = idle + +// PRAMDAC hardware cursor +#define NV_PRAMDAC_CURSOR_CFG 0x680300 // bit 0 = enable, bits 2:1 = color mode +#define NV_PRAMDAC_CURSOR_POS 0x680320 // cursor X/Y position + +// PRAMIN area -- cursor image storage offset in VRAM +// The cursor image lives at the top of VRAM, 1KB for 32x32 or 4KB for 64x64. +// PRAMDAC fetches it from the address configured in NV_PRAMDAC_CURSOR_START. +#define NV_PRAMDAC_CURSOR_START 0x680324 // cursor image VRAM offset + +// PFB -- framebuffer config (for reading VRAM size) +#define NV_PFB_BOOT_0 0x100000 // boot config (NV3) +#define NV_PFB_CFG_0 0x100200 // framebuffer config (NV4/NV5) + +// ============================================================ +// FIFO user space offsets (from BAR0 + 0x800000) +// ============================================================ +// +// Subchannel base addresses within the user FIFO area. 
+ +#define NV_FIFO_BASE 0x800000 + +// Subchannel 0: ROP +#define NV_ROP_SUBCHAN 0x0000 +#define NV_ROP_ROP 0x0300 // raster operation + +// Subchannel 1: Clip +#define NV_CLIP_SUBCHAN 0x2000 +#define NV_CLIP_POINT 0x2300 // x | y<<16 +#define NV_CLIP_SIZE 0x2304 // w | h<<16 + +// Subchannel 3: GdiRectangle (solid fill) +#define NV_RECT_SUBCHAN 0x6000 +#define NV_RECT_COLOR 0x62FC // fill color +#define NV_RECT_POINT 0x6300 // x | y<<16 +#define NV_RECT_SIZE 0x6304 // w | h<<16 (triggers fill) + +// Subchannel 4: ScreenScreenBlt +#define NV_BLIT_SUBCHAN 0x8000 +#define NV_BLIT_POINT_IN 0x8300 // srcX | srcY<<16 +#define NV_BLIT_POINT_OUT 0x8304 // dstX | dstY<<16 +#define NV_BLIT_SIZE 0x8308 // w | h<<16 + +// Subchannel 5: ImageFromCpu +#define NV_IMAGE_SUBCHAN 0xA000 +#define NV_IMAGE_POINT 0xA300 // dstX | dstY<<16 +#define NV_IMAGE_SIZE_OUT 0xA304 // w | h<<16 +#define NV_IMAGE_SIZE_IN 0xA308 // srcW | srcH<<16 +#define NV_IMAGE_DATA 0xA400 // color data (dwords) + +// ============================================================ +// Constants +// ============================================================ + +#define NV_ROP_COPY 0xCC // dest = src +#define NV_MMIO_SIZE 0x1000000 // 16MB MMIO region +#define NV_MAX_IDLE_WAIT 1000000 +#define NV_HW_CURSOR_SIZE 64 +#define NV_HW_CURSOR_BYTES (NV_HW_CURSOR_SIZE * NV_HW_CURSOR_SIZE * 2 / 8) + +// Cursor config bits +#define NV_CURSOR_ENABLE 0x01 +#define NV_CURSOR_MODE_2COLOR 0x00 // 2-color mode (bits 2:1 = 0) + +// RIVA 128 (NV3) vs TNT (NV4/NV5) detection +#define NV_ARCH_NV3 3 +#define NV_ARCH_NV4 4 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + volatile uint32_t *mmio; // mapped MMIO base (BAR0) + volatile uint32_t *fifo; // FIFO user space (BAR0 + 0x800000) + uint32_t mmioPhysAddr; + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; // cursor image offset in 
VRAM + int32_t bytesPerPixel; + int32_t screenPitch; + int32_t arch; // NV_ARCH_NV3 or NV_ARCH_NV4 + DpmiMappingT mmioMapping; + DpmiMappingT lfbMapping; +} NvPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void nvBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool nvDetect(AccelDriverT *drv); +static uint32_t nvDetectVram(NvPrivateT *priv); +static void nvHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool nvInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void nvMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void nvRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void nvSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void nvSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void nvSetupEngine(NvPrivateT *priv); +static void nvShowCursor(AccelDriverT *drv, bool visible); +static void nvShutdown(AccelDriverT *drv); +static void nvWaitIdle(AccelDriverT *drv); +static void nvWriteFifo(NvPrivateT *priv, uint32_t offset, uint32_t val); +static uint32_t nvReadMmio(NvPrivateT *priv, uint32_t offset); +static void nvWriteMmio(NvPrivateT *priv, uint32_t offset, uint32_t val); + +// ============================================================ +// Driver instance +// ============================================================ + +static NvPrivateT sNvPrivate; + +static AccelDriverT sNvDriver = { + .name = "Nvidia RIVA", + .chipFamily = "nvidia", + .caps = 0, + .privData = &sNvPrivate, + .detect = nvDetect, + .init = nvInit, + .shutdown = nvShutdown, + .waitIdle = nvWaitIdle, + .setClip = nvSetClip, + .rectFill = nvRectFill, + .rectFillPat = NULL, + .bitBlt = nvBitBlt, + .hostBlit = 
nvHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, + .setCursor = nvSetCursor, + .moveCursor = nvMoveCursor, + .showCursor = nvShowCursor, +}; + +// ============================================================ +// nvRegisterDriver +// ============================================================ + +void nvRegisterDriver(void) { + accelRegisterDriver(&sNvDriver); +} + + +// ============================================================ +// nvBitBlt +// ============================================================ +// +// Screen-to-screen blit via the ScreenScreenBlt subchannel. +// The NV engine handles overlapping source/destination regions +// internally when the blit direction is set appropriately. + +static void nvBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + nvWaitIdle(drv); + + nvWriteFifo(priv, NV_BLIT_POINT_IN, (uint32_t)srcX | ((uint32_t)srcY << 16)); + nvWriteFifo(priv, NV_BLIT_POINT_OUT, (uint32_t)dstX | ((uint32_t)dstY << 16)); + nvWriteFifo(priv, NV_BLIT_SIZE, (uint32_t)w | ((uint32_t)h << 16)); +} + + +// ============================================================ +// nvDetect +// ============================================================ + +static bool nvDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sNvDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case NV_RIVA_128: + drv->name = "Nvidia RIVA 128"; + break; + case NV_RIVA_128_ZX: + drv->name = "Nvidia RIVA 128 ZX"; + break; + case NV_TNT: + drv->name = "Nvidia RIVA TNT"; + break; + case NV_TNT2: + drv->name = "Nvidia RIVA TNT2"; + break; + case NV_TNT2_ULTRA: + drv->name = "Nvidia RIVA TNT2 Ultra"; + break; + case NV_TNT2_M64: + drv->name = "Nvidia RIVA TNT2 M64"; + break; + case NV_VANTA: + drv->name = "Nvidia Vanta"; + break; + default: + drv->name = "Nvidia RIVA"; 
+ break; + } + + return true; +} + + +// ============================================================ +// nvDetectVram +// ============================================================ +// +// Read VRAM size from the PFB registers. NV3 (RIVA 128) uses +// PFB_BOOT_0, while NV4/NV5 (TNT/TNT2) use PFB_CFG_0. + +static uint32_t nvDetectVram(NvPrivateT *priv) { + if (priv->arch == NV_ARCH_NV3) { + // NV3: PFB_BOOT_0 bits 1:0 encode VRAM size + uint32_t boot0 = nvReadMmio(priv, NV_PFB_BOOT_0); + uint32_t sizeIdx = boot0 & 0x03; + + switch (sizeIdx) { + case 0: return 8 * 1024 * 1024; + case 1: return 2 * 1024 * 1024; + case 2: return 4 * 1024 * 1024; + default: return 4 * 1024 * 1024; + } + } + + // NV4/NV5: PFB_CFG_0 bits 1:0 encode VRAM size + uint32_t cfg0 = nvReadMmio(priv, NV_PFB_CFG_0); + uint32_t sizeIdx = cfg0 & 0x03; + + switch (sizeIdx) { + case 0: return 32 * 1024 * 1024; + case 1: return 4 * 1024 * 1024; + case 2: return 8 * 1024 * 1024; + case 3: return 16 * 1024 * 1024; + default: return 4 * 1024 * 1024; + } +} + + +// ============================================================ +// nvHostBlit +// ============================================================ +// +// CPU-to-screen blit via the ImageFromCpu subchannel. Transfers +// pixel data from system memory to VRAM through the FIFO. 
+ +static void nvHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t rowBytes = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (rowBytes + 3) / 4; + + nvWaitIdle(drv); + + // Set up the image transfer + nvWriteFifo(priv, NV_IMAGE_POINT, (uint32_t)dstX | ((uint32_t)dstY << 16)); + nvWriteFifo(priv, NV_IMAGE_SIZE_OUT, (uint32_t)w | ((uint32_t)h << 16)); + nvWriteFifo(priv, NV_IMAGE_SIZE_IN, (uint32_t)w | ((uint32_t)h << 16)); + + // Write pixel data row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + int32_t byteOff = dw * 4; + uint32_t data = 0; + + // Pack bytes into a dword (little-endian native order) + for (int32_t b = 0; b < 4; b++) { + if (byteOff + b < rowBytes) { + data |= (uint32_t)rowPtr[byteOff + b] << (b * 8); + } + } + + // Write to the color data area; each dword goes to the + // next sequential offset starting at NV_IMAGE_DATA. 
+ nvWriteFifo(priv, NV_IMAGE_DATA + (uint32_t)(dw * 4), data); + } + + // Wait for engine between rows to avoid FIFO overflow + nvWaitIdle(drv); + } +} + + +// ============================================================ +// nvInit +// ============================================================ + +static bool nvInit(AccelDriverT *drv, const AccelModeRequestT *req) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + // Determine architecture (NV3 vs NV4/NV5) + if (drv->pciDev.deviceId == NV_RIVA_128 || drv->pciDev.deviceId == NV_RIVA_128_ZX) { + priv->arch = NV_ARCH_NV3; + } else { + priv->arch = NV_ARCH_NV4; + } + + // Get BAR0 (MMIO) and BAR1 (framebuffer) addresses + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + uint32_t lfbBarSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR1); + + // Enable bus mastering and memory space access + uint16_t pciCmd = pciRead16(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_COMMAND); + pciCmd |= PCI_CMD_MEM_ENABLE | PCI_CMD_BUS_MASTER; + pciWrite16(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_COMMAND, pciCmd); + + // Map MMIO region (BAR0, 16MB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, NV_MMIO_SIZE, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + priv->fifo = (volatile uint32_t *)(priv->mmioMapping.ptr + NV_FIFO_BASE); + + // Detect VRAM size + priv->vramSize = nvDetectVram(priv); + + // Use whichever is smaller: the BAR size or detected VRAM + if (lfbBarSize < priv->vramSize) { + priv->vramSize = lfbBarSize; + } + + // Set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + 
dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + // Map framebuffer (BAR1) + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Reserve space for hardware cursor at end of VRAM + priv->cursorOffset = priv->vramSize - NV_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(uint32_t)(NV_HW_CURSOR_BYTES - 1); + + // Initialize the 2D engine + nvSetupEngine(priv); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Set full-screen clip + nvSetClip(drv, 0, 0, vesa.width, vesa.height); + + nvWaitIdle(drv); + return true; +} + + +// ============================================================ +// nvMoveCursor +// ============================================================ + +static void nvMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + // PRAMDAC cursor position: bits 15:0 = X, bits 31:16 = Y + // Negative values are handled by clamping to 0; the cursor + // offset register could be used for sub-pixel adjustment but + // that is not needed for typical use. + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + nvWriteMmio(priv, NV_PRAMDAC_CURSOR_POS, (uint32_t)x | ((uint32_t)y << 16)); +} + + +// ============================================================ +// nvReadMmio / nvWriteMmio +// ============================================================ +// +// Direct MMIO register access via BAR0. 
+ +static uint32_t nvReadMmio(NvPrivateT *priv, uint32_t offset) { + return priv->mmio[offset / 4]; +} + + +static void nvWriteMmio(NvPrivateT *priv, uint32_t offset, uint32_t val) { + priv->mmio[offset / 4] = val; +} + + +// ============================================================ +// nvRectFill +// ============================================================ +// +// Solid rectangle fill via the GdiRectangle subchannel. + +static void nvRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + nvWaitIdle(drv); + + nvWriteFifo(priv, NV_RECT_COLOR, color); + nvWriteFifo(priv, NV_RECT_POINT, (uint32_t)x | ((uint32_t)y << 16)); + nvWriteFifo(priv, NV_RECT_SIZE, (uint32_t)w | ((uint32_t)h << 16)); +} + + +// ============================================================ +// nvSetClip +// ============================================================ +// +// Set the hardware clip rectangle via the Clip subchannel. + +static void nvSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + nvWaitIdle(drv); + + nvWriteFifo(priv, NV_CLIP_POINT, (uint32_t)x | ((uint32_t)y << 16)); + nvWriteFifo(priv, NV_CLIP_SIZE, (uint32_t)w | ((uint32_t)h << 16)); +} + + +// ============================================================ +// nvSetCursor +// ============================================================ +// +// Upload a cursor image to VRAM and configure the PRAMDAC +// to display it. The NV hardware cursor is 64x64, 2 bits per +// pixel, stored in VRAM at the offset configured in +// NV_PRAMDAC_CURSOR_START. 
+// +// 2bpp encoding: +// 00 = cursor color 0 (background) +// 01 = cursor color 1 (foreground) +// 10 = transparent +// 11 = inverted + +static void nvSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (!image) { + nvShowCursor(drv, false); + return; + } + + nvWaitIdle(drv); + + // Write cursor image to VRAM at the reserved offset + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < NV_HW_CURSOR_SIZE; row++) { + for (int32_t byteIdx = 0; byteIdx < 16; byteIdx++) { + uint8_t val = 0xAA; // all transparent (10 pattern) + + if (row < image->height && byteIdx < (image->width + 3) / 4) { + int32_t bitOff = byteIdx * 4; + uint8_t andBits = 0; + uint8_t xorBits = 0; + + if (bitOff / 8 < (image->width + 7) / 8) { + andBits = image->andMask[row * 8 + bitOff / 8]; + xorBits = image->xorMask[row * 8 + bitOff / 8]; + } + + // Pack 4 pixels into one byte (2 bits each) + val = 0; + for (int32_t px = 0; px < 4; px++) { + int32_t srcBit = (bitOff + px) % 8; + uint8_t andBit = (andBits >> (7 - srcBit)) & 1; + uint8_t xorBit = (xorBits >> (7 - srcBit)) & 1; + uint8_t pixel; + + if (andBit && !xorBit) { + pixel = 0x02; // transparent + } else if (andBit && xorBit) { + pixel = 0x03; // inverted + } else if (!andBit && xorBit) { + pixel = 0x01; // cursor color 1 + } else { + pixel = 0x00; // cursor color 0 + } + + val |= pixel << (6 - px * 2); + } + } + + cursorMem[row * 16 + byteIdx] = val; + } + } + + // Point the PRAMDAC at the cursor image in VRAM + nvWriteMmio(priv, NV_PRAMDAC_CURSOR_START, priv->cursorOffset); +} + + +// ============================================================ +// nvSetupEngine +// ============================================================ +// +// Initialize the 2D acceleration engine. Sets the ROP to copy +// mode and prepares the FIFO subchannels for use. 
+ +static void nvSetupEngine(NvPrivateT *priv) { + // Set ROP to copy + nvWriteFifo(priv, NV_ROP_ROP, NV_ROP_COPY); +} + + +// ============================================================ +// nvShowCursor +// ============================================================ + +static void nvShowCursor(AccelDriverT *drv, bool visible) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + uint32_t cfg = nvReadMmio(priv, NV_PRAMDAC_CURSOR_CFG); + + if (visible) { + cfg |= NV_CURSOR_ENABLE; + } else { + cfg &= ~(uint32_t)NV_CURSOR_ENABLE; + } + + nvWriteMmio(priv, NV_PRAMDAC_CURSOR_CFG, cfg); +} + + +// ============================================================ +// nvShutdown +// ============================================================ + +static void nvShutdown(AccelDriverT *drv) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + nvShowCursor(drv, false); + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->lfbMapping); + dpmiUnmapFramebuffer(&priv->mmioMapping); +} + + +// ============================================================ +// nvWaitIdle +// ============================================================ +// +// Wait for the PGRAPH engine to become idle by polling the +// PGRAPH_STATUS register. + +static void nvWaitIdle(AccelDriverT *drv) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + for (int32_t i = 0; i < NV_MAX_IDLE_WAIT; i++) { + if (nvReadMmio(priv, NV_PGRAPH_STATUS) == 0) { + return; + } + } +} + + +// ============================================================ +// nvWriteFifo +// ============================================================ +// +// Write a value to the FIFO user space. The offset is relative +// to the FIFO base (BAR0 + 0x800000). 
+ +static void nvWriteFifo(NvPrivateT *priv, uint32_t offset, uint32_t val) { + priv->fifo[offset / 4] = val; +} diff --git a/pci.c b/pci.c new file mode 100644 index 0000000..d22b727 --- /dev/null +++ b/pci.c @@ -0,0 +1,307 @@ +// pci.c -- PCI configuration space access for DOS/DJGPP +// +// Implements PCI mechanism 1 (CONFIG_ADDRESS at 0xCF8, CONFIG_DATA +// at 0xCFC). This is the standard PCI configuration access method +// supported by all PCI-capable chipsets. +// +// How mechanism 1 works: +// 1. Write a 32-bit address to port 0xCF8 with bit 31 set (enable), +// bus/dev/func/register fields encoded in bits 23:0 +// 2. Read or write the 32-bit data at port 0xCFC +// 3. For sub-dword access (8/16-bit), read the full dword and +// mask/shift, or write with a read-modify-write +// +// Detection: write 0x80000000 to 0xCF8 and read back. If the value +// matches, mechanism 1 is present. This works because bit 31 is the +// enable bit -- on non-PCI systems, port 0xCF8 is either absent +// (reads back 0xFF) or belongs to a different device. 
+ +#include "pci.h" + +#include + +// PCI configuration mechanism 1 I/O ports +#define PCI_CONFIG_ADDR 0x0CF8 +#define PCI_CONFIG_DATA 0x0CFC + +// ============================================================ +// Prototypes +// ============================================================ + +uint32_t pciBuildAddress(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +bool pciDetect(void); +int32_t pciEnumerate(PciEnumCallbackT cb, void *userData); +bool pciFindDevice(uint16_t vendorId, uint16_t deviceId, PciDeviceT *dev); +bool pciFindDeviceList(const uint16_t *idPairs, PciDeviceT *dev, int32_t *matchIdx); +uint8_t pciRead8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint16_t pciRead16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint32_t pciRead32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +void pciWrite8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint8_t val); +void pciWrite16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint16_t val); +void pciWrite32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint32_t val); + +// ============================================================ +// pciBuildAddress +// ============================================================ +// +// Constructs a PCI configuration space address for mechanism 1. +// Format: [31]=enable, [23:16]=bus, [15:11]=device, [10:8]=function, +// [7:2]=register (dword-aligned), [1:0]=0 + +uint32_t pciBuildAddress(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + return 0x80000000 + | ((uint32_t)bus << 16) + | ((uint32_t)dev << 11) + | ((uint32_t)func << 8) + | ((uint32_t)reg & 0xFC); +} + + +// ============================================================ +// pciDetect +// ============================================================ +// +// Checks for PCI mechanism 1 by writing the enable bit to the +// CONFIG_ADDRESS port and reading it back. 
Saves and restores +// the original port value to avoid disturbing any in-progress +// PCI transaction. + +bool pciDetect(void) { + uint32_t saved = inportl(PCI_CONFIG_ADDR); + + outportl(PCI_CONFIG_ADDR, 0x80000000); + uint32_t readBack = inportl(PCI_CONFIG_ADDR); + + outportl(PCI_CONFIG_ADDR, saved); + + return (readBack == 0x80000000); +} + + +// ============================================================ +// pciEnumerate +// ============================================================ +// +// Scans all bus/device/function combinations for present devices. +// A device is present if its vendor ID is not 0xFFFF. Multi-function +// devices are detected by checking bit 7 of the header type register +// on function 0; single-function devices only probe function 0. + +int32_t pciEnumerate(PciEnumCallbackT cb, void *userData) { + int32_t count = 0; + + for (int32_t bus = 0; bus < PCI_MAX_BUS; bus++) { + for (int32_t dev = 0; dev < PCI_MAX_DEV; dev++) { + uint16_t vendor0 = pciRead16(bus, dev, 0, PCI_VENDOR_ID); + + if (vendor0 == 0xFFFF) { + continue; + } + + // Check if multi-function device + uint8_t headerType = pciRead8(bus, dev, 0, PCI_HEADER_TYPE); + int32_t maxFunc = (headerType & 0x80) ? 
PCI_MAX_FUNC : 1; + + for (int32_t func = 0; func < maxFunc; func++) { + uint16_t vendorId = pciRead16(bus, dev, func, PCI_VENDOR_ID); + + if (vendorId == 0xFFFF) { + continue; + } + + PciDeviceT device; + device.bus = bus; + device.dev = dev; + device.func = func; + device.vendorId = vendorId; + device.deviceId = pciRead16(bus, dev, func, PCI_DEVICE_ID); + device.revision = pciRead8(bus, dev, func, PCI_REVISION_ID); + device.baseClass = pciRead8(bus, dev, func, PCI_BASE_CLASS); + device.subClass = pciRead8(bus, dev, func, PCI_SUBCLASS); + + for (int32_t i = 0; i < 6; i++) { + device.bar[i] = pciRead32(bus, dev, func, PCI_BAR0 + i * 4); + } + + count++; + + if (cb && cb(&device, userData)) { + return count; + } + } + } + } + + return count; +} + + +// ============================================================ +// pciFindDevice +// ============================================================ + +bool pciFindDevice(uint16_t vendorId, uint16_t deviceId, PciDeviceT *dev) { + for (int32_t bus = 0; bus < PCI_MAX_BUS; bus++) { + for (int32_t d = 0; d < PCI_MAX_DEV; d++) { + uint16_t vendor0 = pciRead16(bus, d, 0, PCI_VENDOR_ID); + + if (vendor0 == 0xFFFF) { + continue; + } + + uint8_t headerType = pciRead8(bus, d, 0, PCI_HEADER_TYPE); + int32_t maxFunc = (headerType & 0x80) ? 
PCI_MAX_FUNC : 1; + + for (int32_t func = 0; func < maxFunc; func++) { + uint16_t vid = pciRead16(bus, d, func, PCI_VENDOR_ID); + uint16_t did = pciRead16(bus, d, func, PCI_DEVICE_ID); + + if (vid == vendorId && did == deviceId) { + dev->bus = bus; + dev->dev = d; + dev->func = func; + dev->vendorId = vid; + dev->deviceId = did; + dev->revision = pciRead8(bus, d, func, PCI_REVISION_ID); + dev->baseClass = pciRead8(bus, d, func, PCI_BASE_CLASS); + dev->subClass = pciRead8(bus, d, func, PCI_SUBCLASS); + + for (int32_t i = 0; i < 6; i++) { + dev->bar[i] = pciRead32(bus, d, func, PCI_BAR0 + i * 4); + } + + return true; + } + } + } + } + + return false; +} + + +// ============================================================ +// pciFindDeviceList +// ============================================================ +// +// Searches for the first PCI device matching any vendor/device pair +// in the given list. The list is an array of uint16_t pairs: +// { vendor1, device1, vendor2, device2, ..., 0, 0 } +// On match, fills dev and sets matchIdx to the pair index (0-based). + +bool pciFindDeviceList(const uint16_t *idPairs, PciDeviceT *dev, int32_t *matchIdx) { + for (int32_t bus = 0; bus < PCI_MAX_BUS; bus++) { + for (int32_t d = 0; d < PCI_MAX_DEV; d++) { + uint16_t vendor0 = pciRead16(bus, d, 0, PCI_VENDOR_ID); + + if (vendor0 == 0xFFFF) { + continue; + } + + uint8_t headerType = pciRead8(bus, d, 0, PCI_HEADER_TYPE); + int32_t maxFunc = (headerType & 0x80) ? 
PCI_MAX_FUNC : 1; + + for (int32_t func = 0; func < maxFunc; func++) { + uint16_t vid = pciRead16(bus, d, func, PCI_VENDOR_ID); + uint16_t did = pciRead16(bus, d, func, PCI_DEVICE_ID); + + if (vid == 0xFFFF) { + continue; + } + + for (int32_t idx = 0; idPairs[idx * 2] != 0; idx++) { + if (vid == idPairs[idx * 2] && did == idPairs[idx * 2 + 1]) { + dev->bus = bus; + dev->dev = d; + dev->func = func; + dev->vendorId = vid; + dev->deviceId = did; + dev->revision = pciRead8(bus, d, func, PCI_REVISION_ID); + dev->baseClass = pciRead8(bus, d, func, PCI_BASE_CLASS); + dev->subClass = pciRead8(bus, d, func, PCI_SUBCLASS); + + for (int32_t i = 0; i < 6; i++) { + dev->bar[i] = pciRead32(bus, d, func, PCI_BAR0 + i * 4); + } + + if (matchIdx) { + *matchIdx = idx; + } + + return true; + } + } + } + } + } + + return false; +} + + +// ============================================================ +// pciRead8 +// ============================================================ + +uint8_t pciRead8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + return (dword >> ((reg & 3) * 8)) & 0xFF; +} + + +// ============================================================ +// pciRead16 +// ============================================================ + +uint16_t pciRead16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + return (dword >> ((reg & 2) * 8)) & 0xFFFF; +} + + +// ============================================================ +// pciRead32 +// ============================================================ + +uint32_t pciRead32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + return inportl(PCI_CONFIG_DATA); +} + + +// ============================================================ +// 
pciWrite8 +// ============================================================ + +void pciWrite8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint8_t val) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + int32_t shift = (reg & 3) * 8; + dword = (dword & ~(0xFF << shift)) | ((uint32_t)val << shift); + outportl(PCI_CONFIG_DATA, dword); +} + + +// ============================================================ +// pciWrite16 +// ============================================================ + +void pciWrite16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint16_t val) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + int32_t shift = (reg & 2) * 8; + dword = (dword & ~(0xFFFF << shift)) | ((uint32_t)val << shift); + outportl(PCI_CONFIG_DATA, dword); +} + + +// ============================================================ +// pciWrite32 +// ============================================================ + +void pciWrite32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint32_t val) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + outportl(PCI_CONFIG_DATA, val); +} diff --git a/pci.h b/pci.h new file mode 100644 index 0000000..c461584 --- /dev/null +++ b/pci.h @@ -0,0 +1,98 @@ +// pci.h -- PCI configuration space access for DOS/DJGPP +// +// Provides functions to read/write PCI configuration registers and +// enumerate devices on the PCI bus. Uses the standard mechanism 1 +// (I/O ports 0xCF8/0xCFC) which is supported by all PCI-capable +// systems from 1993 onward. +// +// All functions operate synchronously via inportl/outportl. No BIOS +// calls (INT 1Ah) are used because mechanism 1 is faster, simpler, +// and doesn't require a DPMI real-mode callback. 
+#ifndef PCI_H +#define PCI_H + +#include +#include + +// PCI configuration space register offsets (common header) +#define PCI_VENDOR_ID 0x00 +#define PCI_DEVICE_ID 0x02 +#define PCI_COMMAND 0x04 +#define PCI_STATUS 0x06 +#define PCI_REVISION_ID 0x08 +#define PCI_CLASS_CODE 0x09 +#define PCI_SUBCLASS 0x0A +#define PCI_BASE_CLASS 0x0B +#define PCI_HEADER_TYPE 0x0E +#define PCI_BAR0 0x10 +#define PCI_BAR1 0x14 +#define PCI_BAR2 0x18 +#define PCI_BAR3 0x1C +#define PCI_BAR4 0x20 +#define PCI_BAR5 0x24 +#define PCI_SUBSYS_VENDOR 0x2C +#define PCI_SUBSYS_ID 0x2E + +// PCI command register bits +#define PCI_CMD_IO_ENABLE 0x0001 +#define PCI_CMD_MEM_ENABLE 0x0002 +#define PCI_CMD_BUS_MASTER 0x0004 + +// PCI base class for display controllers +#define PCI_CLASS_DISPLAY 0x03 + +// Maximum PCI bus/device/function values +#define PCI_MAX_BUS 256 +#define PCI_MAX_DEV 32 +#define PCI_MAX_FUNC 8 + +// PCI device descriptor returned by enumeration +typedef struct { + uint8_t bus; + uint8_t dev; + uint8_t func; + uint16_t vendorId; + uint16_t deviceId; + uint8_t revision; + uint8_t baseClass; + uint8_t subClass; + uint32_t bar[6]; +} PciDeviceT; + +// Callback for pciEnumerate(). Return true to stop enumeration. +typedef bool (*PciEnumCallbackT)(const PciDeviceT *device, void *userData); + +// ============================================================ +// Prototypes +// ============================================================ + +// Build a CONFIG_ADDRESS dword for the given bus/dev/func/register. +uint32_t pciBuildAddress(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); + +// Check whether PCI mechanism 1 is available. +bool pciDetect(void); + +// Enumerate all PCI devices. Calls cb for each device found. +// Stops early if cb returns true. Returns the number of devices found. +int32_t pciEnumerate(PciEnumCallbackT cb, void *userData); + +// Find the first PCI device matching vendorId/deviceId. +// Returns true if found (and fills out dev), false if not. 
+bool pciFindDevice(uint16_t vendorId, uint16_t deviceId, PciDeviceT *dev); + +// Find the first PCI device matching any of the given vendor/device +// pairs. The list is terminated by a {0, 0} entry. Returns true if +// found (and fills out dev and matchIdx), false if not. +bool pciFindDeviceList(const uint16_t *idPairs, PciDeviceT *dev, int32_t *matchIdx); + +// Read an 8/16/32-bit value from PCI configuration space. +uint8_t pciRead8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint16_t pciRead16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint32_t pciRead32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); + +// Write an 8/16/32-bit value to PCI configuration space. +void pciWrite8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint8_t val); +void pciWrite16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint16_t val); +void pciWrite32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint32_t val); + +#endif // PCI_H diff --git a/s3Trio.c b/s3Trio.c new file mode 100644 index 0000000..40d7fad --- /dev/null +++ b/s3Trio.c @@ -0,0 +1,1216 @@ +// s3Trio.c -- S3 Trio64/Vision864/Vision968 accelerated video driver +// +// Supports the S3 86C764 (Trio64), 86C765 (Trio64V+), 86C868 (Vision868), +// 86C864 (Vision864), 86C964 (Vision964), 86C968 (Vision968), and +// 86C732 (Trio32) chipsets. +// +// The S3 2D acceleration engine (sometimes called the "graphics engine" +// or "BitBLT engine") provides hardware-accelerated: +// - Solid rectangle fill +// - 8x8 mono/color pattern fill +// - Screen-to-screen BitBLT +// - Mono color expansion (for text rendering) +// - Bresenham line draw +// - Hardware clipping rectangle +// - 64x64 two-color hardware cursor +// +// Register access: +// The S3 extended registers are accessed through CRTC index/data +// ports (0x3D4/0x3D5) at indices 0x30-0x6D. These must be unlocked +// by writing specific key values to CR38 and CR39. 
+// +// The 2D engine registers are at I/O ports 0x82E8-0xBEE8 (legacy) +// or via MMIO at the linear framebuffer base + 0x1000000 on newer +// chips (Trio64+). We use MMIO when available (Trio64, ViRGE) for +// faster register access, falling back to I/O on older Vision chips. +// +// VESA mode setting: +// We use VBE BIOS calls for mode setting rather than programming +// CRTC timings directly. This is simpler and more reliable across +// the S3 chip variants (which have subtly different timing register +// layouts). After VESA sets the mode, we unlock the S3 extended +// registers and enable the acceleration engine. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// S3 vendor/device IDs +// ============================================================ + +#define S3_VENDOR_ID 0x5333 + +#define S3_TRIO32 0x8810 +#define S3_TRIO64 0x8811 +#define S3_TRIO64V_PLUS 0x8814 +#define S3_VISION864 0x88C0 +#define S3_VISION864P 0x88C1 +#define S3_VISION868 0x8880 +#define S3_VISION964 0x88D0 +#define S3_VISION968 0x88F0 +#define S3_VISION968_ALT 0x88F1 +#define S3_VIRGE 0x5631 +#define S3_VIRGE_VX 0x883D +#define S3_VIRGE_DX 0x8A01 +#define S3_VIRGE_GX2 0x8A10 +#define S3_VIRGE_MX 0x8C01 +#define S3_VIRGE_MXP 0x8C03 +#define S3_SAVAGE3D 0x8A20 +#define S3_SAVAGE3D_MV 0x8A21 +#define S3_SAVAGE4 0x8A22 +#define S3_SAVAGE_MX 0x8C10 +#define S3_SAVAGE_MX_MV 0x8C11 +#define S3_SAVAGE_IX 0x8C12 +#define S3_SAVAGE_IX_MV 0x8C13 +#define S3_SAVAGE_2000 0x9102 + +// Terminated by {0, 0} +static const uint16_t sS3DeviceIds[] = { + S3_VENDOR_ID, S3_TRIO32, + S3_VENDOR_ID, S3_TRIO64, + S3_VENDOR_ID, S3_TRIO64V_PLUS, + S3_VENDOR_ID, S3_VIRGE, + S3_VENDOR_ID, S3_VIRGE_VX, + S3_VENDOR_ID, S3_VIRGE_DX, + S3_VENDOR_ID, S3_VIRGE_GX2, + S3_VENDOR_ID, S3_VIRGE_MX, + S3_VENDOR_ID, S3_VIRGE_MXP, + S3_VENDOR_ID, S3_SAVAGE3D, + S3_VENDOR_ID, S3_SAVAGE3D_MV, + 
S3_VENDOR_ID, S3_SAVAGE4, + S3_VENDOR_ID, S3_SAVAGE_MX, + S3_VENDOR_ID, S3_SAVAGE_MX_MV, + S3_VENDOR_ID, S3_SAVAGE_IX, + S3_VENDOR_ID, S3_SAVAGE_IX_MV, + S3_VENDOR_ID, S3_SAVAGE_2000, + S3_VENDOR_ID, S3_VISION864, + S3_VENDOR_ID, S3_VISION864P, + S3_VENDOR_ID, S3_VISION868, + S3_VENDOR_ID, S3_VISION964, + S3_VENDOR_ID, S3_VISION968, + S3_VENDOR_ID, S3_VISION968_ALT, + 0, 0 +}; + +// ============================================================ +// S3 extended CRTC register indices +// ============================================================ + +#define S3_CR30_CHIP_ID 0x30 +#define S3_CR31_MEM_CONFIG 0x31 +#define S3_CR33_BACKWARD_COMPAT 0x33 +#define S3_CR34_BACKWARD_COMPAT 0x34 +#define S3_CR35_CRTC_LOCK 0x35 +#define S3_CR38_LOCK_1 0x38 // unlock with 0x48 +#define S3_CR39_LOCK_2 0x39 // unlock with 0xA5 +#define S3_CR40_SYS_CONFIG 0x40 +#define S3_CR40_ENGINE_ENABLE 0x01 // bit 0: enable graphics engine +#define S3_CR42_MODE_CONTROL 0x42 +#define S3_CR43_EXT_MODE 0x43 +#define S3_CR45_HW_CURSOR_MODE 0x45 +#define S3_CR46_HW_CURSOR_XHI 0x46 +#define S3_CR47_HW_CURSOR_XLO 0x47 +#define S3_CR48_HW_CURSOR_YHI 0x48 +#define S3_CR49_HW_CURSOR_YLO 0x49 +#define S3_CR4A_HW_CURSOR_FG_HI 0x4A +#define S3_CR4B_HW_CURSOR_FG_LO 0x4B +#define S3_CR4C_HW_CURSOR_ADDR_HI 0x4C +#define S3_CR4D_HW_CURSOR_ADDR_LO 0x4D +#define S3_CR4E_HW_CURSOR_BG_HI 0x4E +#define S3_CR4F_HW_CURSOR_BG_LO 0x4F +#define S3_CR50_EXT_SYS_CTRL_1 0x50 +// CR50 pixel length bits (bits 5:4) +#define S3_CR50_PIX_8BPP 0x00 +#define S3_CR50_PIX_16BPP 0x10 +#define S3_CR50_PIX_32BPP 0x30 +#define S3_CR51_EXT_SYS_CTRL_2 0x51 +#define S3_CR53_EXT_MEM_CTRL_1 0x53 +#define S3_CR54_EXT_MEM_CTRL_2 0x54 +#define S3_CR55_EXT_DAC_CTRL 0x55 +#define S3_CR58_LFB_CTRL 0x58 +#define S3_CR59_LFB_ADDR_HI 0x59 +#define S3_CR5A_LFB_ADDR_LO 0x5A +#define S3_CR5D_EXT_HCNT 0x5D +#define S3_CR5E_EXT_VCNT 0x5E +#define S3_CR67_EXT_MISC_CTRL_2 0x67 +#define S3_CR6A_EXT_MISC_CTRL_3 0x6A + +// 
============================================================ +// S3 2D engine I/O ports (legacy access) +// ============================================================ +// +// These are the standard S3 accelerator register ports. All S3 +// chips from the 928 onward support this I/O port interface. + +#define S3_CUR_Y 0x82E8 +#define S3_CUR_X 0x86E8 +#define S3_DESTY_AXSTP 0x8AE8 // destination Y / axial step +#define S3_DESTX_DIASTP 0x8EE8 // destination X / diagonal step +#define S3_ERR_TERM 0x92E8 +#define S3_MAJ_AXIS_PCNT 0x96E8 // major axis pixel count +#define S3_GP_STAT 0x9AE8 // graphics processor status +#define S3_CMD 0x9AE8 // command register (write) +#define S3_SHORT_STROKE 0x9EE8 +#define S3_BKGD_COLOR 0xA2E8 +#define S3_FRGD_COLOR 0xA6E8 +#define S3_WRT_MASK 0xAAE8 +#define S3_RD_MASK 0xAEE8 +#define S3_COLOR_CMP 0xB2E8 +#define S3_BKGD_MIX 0xB6E8 +#define S3_FRGD_MIX 0xBAE8 +#define S3_MULTIFUNC_CTRL 0xBEE8 // multi-function control register +#define S3_PIX_TRANS 0xE2E8 // pixel data transfer + +// ============================================================ +// S3 MULTIFUNC_CTRL sub-register indices +// ============================================================ +// +// The multi-function control register at 0xBEE8 is a multiplexed +// port: bits 15:12 select the sub-register, bits 11:0 are the value. 

#define S3_MF_MIN_AXIS_PCNT     0x0000  // minor axis pixel count
#define S3_MF_SCISSORS_T        0x1000  // scissors top
#define S3_MF_SCISSORS_L        0x2000  // scissors left
#define S3_MF_SCISSORS_B        0x3000  // scissors bottom
#define S3_MF_SCISSORS_R        0x4000  // scissors right
#define S3_MF_PIX_CNTL          0xA000  // pixel control
#define S3_MF_MULT_MISC_2       0xD000  // multi misc 2
#define S3_MF_MULT_MISC         0xE000  // multi misc
#define S3_MF_READ_SEL          0xF000  // read register select

// ============================================================
// S3 command register bits
// ============================================================
//
// The CMD register keeps the 8514/A layout:
//   15:13 command type      12 byte swap        10 32-bit transfer
//   9  16-bit transfer      8  wait for CPU data (PIX_TRANS)
//   7  Y positive           6  Y is major axis  5  X positive
//   4  draw                 3  line type        2  last pixel off
//   1  across-plane mode    0  write (1) / read (0)

// Command type (bits 15:13)
#define S3_CMD_NOP              0x0000
#define S3_CMD_LINE             0x2000
#define S3_CMD_RECT             0x4000
#define S3_CMD_POLY_LINE        0x6000
#define S3_CMD_NOP2             0x8000
#define S3_CMD_BITBLT           0xC000

// Draw enable (bit 4) and drawing direction (bits 7:5)
#define S3_CMD_DRAW             0x0010  // draw (vs. move)
#define S3_CMD_DIR_X_POS        0x0020  // X direction positive
#define S3_CMD_DIR_X_MAJOR      0x0000  // X is major axis
#define S3_CMD_DIR_Y_MAJOR      0x0040  // Y is major axis
#define S3_CMD_DIR_Y_POS        0x0080  // Y direction positive

// Additional command bits
#define S3_CMD_WRITE            0x0001  // write pixels (clear = read); needed by every drawing command
#define S3_CMD_PLANAR           0x0002  // across-plane mode: each CPU/pattern bit selects a mix
#define S3_CMD_WAIT_CPU         0x0100  // engine waits for CPU data on PIX_TRANS
// Bit 2 is "last pixel off": when set, the final pixel of a line is
// NOT drawn. It has no across-plane meaning (that is bit 1, above).
// NOTE(review): S3_CMD_ACROSS_PLANE keeps its historical value so the
// existing callers still build; they should migrate to S3_CMD_PLANAR
// where across-plane mode is really wanted.
#define S3_CMD_ACROSS_PLANE     0x0004
#define S3_CMD_LAST_PIXEL       0x0004
#define S3_CMD_BYTE_SWAP        0x1000  // byte swap for pixel transfer
#define S3_CMD_16BIT_IO         0x0200  // 16-bit pixel transfer
#define S3_CMD_32BIT_IO         0x0400  // 32-bit pixel transfer

// The mix selection (foreground mix vs. CPU data vs. display memory)
// is not part of the command word; it lives in the PIX_CNTL register.

// ============================================================
// S3 MIX register values
// ============================================================
//
// The foreground and background MIX registers control what source
// is used and what ROP is applied.
//
//   Bits 4:0 = ROP (raster operation)
//   Bits 6:5 = source select:
//     00 = background color register
//     01 = foreground color register
//     10 = pixel data from CPU (via PIX_TRANS)
//     11 = display memory (screen source)

#define S3_MIX_SRC_BKGD         0x00
#define S3_MIX_SRC_FRGD         0x20
#define S3_MIX_SRC_CPU          0x40
#define S3_MIX_SRC_DISPLAY      0x60

// Common raster operations (bits 4:0), listed in numeric order.
// The codes follow the 8514/A mix table; 0x06 is XNOR, 0x0B is OR and
// 0x0E is (NOT source) AND dest, so the three "NOT_SRC_AND",
// "NOT_SRC_OR" and "SRC_OR_DST" names are mapped accordingly.
#define S3_MIX_ROP_NOT_DST      0x00    // NOT dest
#define S3_MIX_ROP_ZERO         0x01    // 0
#define S3_MIX_ROP_ONE          0x02    // 1
#define S3_MIX_ROP_DST          0x03    // dest (nop)
#define S3_MIX_ROP_NOT_SRC      0x04    // NOT source
#define S3_MIX_ROP_SRC_XOR_DST  0x05    // source XOR dest
#define S3_MIX_ROP_SRC          0x07    // source (copy)
#define S3_MIX_ROP_NOT_SRC_OR   0x09    // NOT source OR dest
#define S3_MIX_ROP_SRC_OR_DST   0x0B    // source OR dest
#define S3_MIX_ROP_SRC_AND_DST  0x0C    // source AND dest
#define S3_MIX_ROP_NOT_SRC_AND  0x0E    // NOT source AND dest

// ============================================================
// S3 PIX_CNTL (pixel control) values
// ============================================================
//
// Written via MULTIFUNC_CTRL with index 0xA000.
// Controls the source of foreground/background mix selection.
//
//   Bits 7:6 = mix select:
//     00 = foreground mix always
//     01 = fixed pattern selects the mix
//     10 = CPU data (PIX_TRANS) selects the mix (color expansion)
//     11 = display memory selects the mix

#define S3_PIXCNTL_MIX_FRGD     0x0000  // always use foreground mix
#define S3_PIXCNTL_MIX_PATTERN  0x0040  // fixed pattern selects mix
#define S3_PIXCNTL_MIX_CPU      0x0080  // CPU data selects mix (color expansion)
#define S3_PIXCNTL_MIX_DISPLAY  0x00C0  // display memory selects mix

// ============================================================
// S3 GP_STAT bits
// ============================================================
//
// NOTE(review): on 8514/A-style engines a FIFO_n bit reads 1 while
// fewer than n slots are free, so "n slots available" is the bit
// being CLEAR -- confirm s3WaitFifo polls with that polarity.

#define S3_GP_STAT_BUSY         0x0200  // graphics engine busy
#define S3_GP_STAT_FIFO_EMPTY   0x0400  // all FIFO slots empty
#define S3_GP_STAT_FIFO_1       0x0080  // FIFO status for 1 slot
#define S3_GP_STAT_FIFO_2       0x0040  // FIFO status for 2 slots
#define S3_GP_STAT_FIFO_3       0x0020  // FIFO status for 3 slots
#define S3_GP_STAT_FIFO_4       0x0010  // FIFO status for 4 slots
#define S3_GP_STAT_FIFO_5       0x0008  // FIFO status for 5 slots
#define S3_GP_STAT_FIFO_6       0x0004  // FIFO status for 6 slots
#define S3_GP_STAT_FIFO_7       0x0002  // FIFO status for 7 slots
#define S3_GP_STAT_FIFO_8       0x0001  // FIFO status for 8 slots

// Hardware cursor constants
#define S3_HW_CURSOR_SIZE       64      // 64x64 pixels
#define S3_HW_CURSOR_BYTES      1024    // 64*64/8 * 2 planes = 1024 bytes

// Maximum wait iterations to prevent infinite loops on broken hardware
#define S3_MAX_IDLE_WAIT        1000000

// MMIO region offset from LFB base (Trio64/ViRGE new-style MMIO)
#define S3_MMIO_OFFSET          0x1000000
#define S3_MMIO_SIZE            0x10000  // 64KB MMIO window

// ============================================================
// S3 MMIO register offset mapping
// ============================================================
//
// The S3 "new MMIO" maps the enhanced registers into a 64KB
// window at LFB + 0x1000000.
The I/O port addresses map to +// MMIO offsets as follows: +// I/O 0x82E8 -> MMIO 0x82E8 (same offset within 64KB window) +// +// For 16-bit register access: write to offset as uint16_t +// For 32-bit register access: write to offset as uint32_t + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; // physical address of LFB + uint32_t vramSize; // total VRAM in bytes + uint32_t cursorOffset; // VRAM offset for cursor image + int32_t bytesPerPixel; + int32_t screenPitch; // bytes per scanline + bool isTrio; // true for Trio32/64/V+/ViRGE + bool useMMIO; // true if MMIO is available + volatile uint8_t *mmio; // mapped MMIO base pointer (NULL if I/O mode) + DpmiMappingT lfbMapping; + DpmiMappingT mmioMapping; +} S3PrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void s3BitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void s3ColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool s3Detect(AccelDriverT *drv); +static void s3HostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool s3Init(AccelDriverT *drv, const AccelModeRequestT *req); +static void s3LineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void s3MoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void s3RectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void s3RectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void 
s3SetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void s3SetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void s3ShowCursor(AccelDriverT *drv, bool visible); +static void s3Shutdown(AccelDriverT *drv); +static void s3UnlockRegs(void); +static void s3WaitFifo(S3PrivateT *priv, int32_t slots); +static void s3WaitIdle(AccelDriverT *drv); + +// ============================================================ +// Driver instance +// ============================================================ + +static S3PrivateT sS3Private; + +static AccelDriverT sS3Driver = { + .name = "S3 Trio64", + .chipFamily = "s3", + .caps = 0, + .privData = &sS3Private, + .detect = s3Detect, + .init = s3Init, + .shutdown = s3Shutdown, + .waitIdle = s3WaitIdle, + .setClip = s3SetClip, + .rectFill = s3RectFill, + .rectFillPat = s3RectFillPat, + .bitBlt = s3BitBlt, + .hostBlit = s3HostBlit, + .colorExpand = s3ColorExpand, + .lineDraw = s3LineDraw, + .setCursor = s3SetCursor, + .moveCursor = s3MoveCursor, + .showCursor = s3ShowCursor, +}; + +// ============================================================ +// s3RegisterDriver +// ============================================================ +// +// Called from main() to register the S3 driver with the manager. + +void s3RegisterDriver(void) { + accelRegisterDriver(&sS3Driver); +} + + +// ============================================================ +// S3 register access helpers +// ============================================================ +// +// When MMIO is available (Trio64, ViRGE, Savage), register access +// goes through the MMIO window at LFB + 0x1000000. The I/O port +// addresses map directly to MMIO offsets within the 64KB window. +// When MMIO is not available (Vision series), we fall back to +// I/O port access. 
+// +// Using MMIO is faster because: (1) memory writes can be posted +// and pipelined by the CPU, (2) no I/O port decode penalty, and +// (3) on Pentium+, memory writes are faster than I/O instructions. + +static inline void s3WriteReg16(S3PrivateT *priv, uint16_t port, uint16_t val) { + if (priv->useMMIO) { + *(volatile uint16_t *)(priv->mmio + (port & 0xFFFF)) = val; + } else { + outportw(port, val); + } +} + +static inline uint16_t s3ReadReg16(S3PrivateT *priv, uint16_t port) { + if (priv->useMMIO) { + return *(volatile uint16_t *)(priv->mmio + (port & 0xFFFF)); + } + + return inportw(port); +} + + +// ============================================================ +// s3BitBlt +// ============================================================ +// +// Screen-to-screen BitBLT using the S3 hardware engine. +// Handles overlapping source and destination by adjusting the +// blit direction. The S3 engine can blit in any of four +// directions (positive/negative X/Y). + +static void s3BitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Determine blit direction to handle overlapping regions + uint16_t cmd = S3_CMD_BITBLT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE; + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (dstX <= srcX) { + cmd |= S3_CMD_DIR_X_POS; + } else { + sx += w - 1; + dx += w - 1; + } + + if (dstY <= srcY) { + cmd |= S3_CMD_DIR_Y_POS; + } else { + sy += h - 1; + dy += h - 1; + } + + s3WaitFifo(priv, 7); + + // Foreground mix: source = display memory, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_DISPLAY | S3_MIX_ROP_SRC); + s3WriteReg16(priv, S3_WRT_MASK, 0xFFFF); + + // Pixel control: always foreground mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + // Source position + s3WriteReg16(priv, S3_CUR_X, sx); + 
s3WriteReg16(priv, S3_CUR_Y, sy); + + // Destination position + s3WriteReg16(priv, S3_DESTX_DIASTP, dx); + s3WriteReg16(priv, S3_DESTY_AXSTP, dy); + + s3WaitFifo(priv, 3); + + // Dimensions (count is pixels - 1) + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Fire + s3WriteReg16(priv, S3_CMD, cmd); +} + + +// ============================================================ +// s3ColorExpand +// ============================================================ +// +// Monochrome-to-color expansion using CPU-driven pixel transfer. +// This is used for text rendering: each byte of srcBuf contains +// 8 monochrome pixels (MSB first), which the engine expands to +// full-color using the foreground and background color registers. +// +// The S3 engine is set to CPU data mix mode: for each bit in +// the transferred data, 1 = use foreground color, 0 = use +// background color. + +static void s3ColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + s3WaitFifo(priv, 8); + + // Set colors + s3WriteReg16(priv, S3_FRGD_COLOR, fg); + s3WriteReg16(priv, S3_BKGD_COLOR, bg); + + // Foreground mix: source = foreground color, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + // Background mix: source = background color, ROP = copy + s3WriteReg16(priv, S3_BKGD_MIX, S3_MIX_SRC_BKGD | S3_MIX_ROP_SRC); + + // Pixel control: CPU data selects fg/bg mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_CPU); + + // Destination and dimensions + s3WriteReg16(priv, S3_CUR_X, dstX); + s3WriteReg16(priv, S3_CUR_Y, dstY); + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + + s3WaitFifo(priv, 2); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Command: 
rectangle, draw, left-to-right top-to-bottom, CPU data + uint16_t cmd = S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS + | S3_CMD_16BIT_IO; + s3WriteReg16(priv, S3_CMD, cmd); + + // Transfer monochrome data to the engine one scanline at a time. + // The engine expects MSB-first bit order, which matches our + // convention. Data must be written to PIX_TRANS in 16-bit words. + int32_t wordsPerRow = (w + 15) / 16; + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + s3WaitFifo(priv, 1); + + for (int32_t word = 0; word < wordsPerRow; word++) { + int32_t byteOff = word * 2; + uint8_t hi = (byteOff < srcPitch) ? rowData[byteOff] : 0; + uint8_t lo = (byteOff + 1 < srcPitch) ? rowData[byteOff + 1] : 0; + s3WriteReg16(priv, S3_PIX_TRANS, (hi << 8) | lo); + } + } +} + + +// ============================================================ +// s3Detect +// ============================================================ +// +// Scans PCI for any S3 chip in our supported list. Does not +// touch any hardware registers (detect must be side-effect-free). 
+ +static bool s3Detect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sS3DeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + // Set the driver name based on the specific chip found + switch (drv->pciDev.deviceId) { + case S3_TRIO32: + drv->name = "S3 Trio32"; + break; + case S3_TRIO64: + drv->name = "S3 Trio64"; + break; + case S3_TRIO64V_PLUS: + drv->name = "S3 Trio64V+"; + break; + case S3_VISION864: + case S3_VISION864P: + drv->name = "S3 Vision864"; + break; + case S3_VISION868: + drv->name = "S3 Vision868"; + break; + case S3_VISION964: + drv->name = "S3 Vision964"; + break; + case S3_VIRGE: + drv->name = "S3 ViRGE"; + break; + case S3_VIRGE_VX: + drv->name = "S3 ViRGE/VX"; + break; + case S3_VIRGE_DX: + drv->name = "S3 ViRGE/DX"; + break; + case S3_VIRGE_GX2: + drv->name = "S3 ViRGE/GX2"; + break; + case S3_VIRGE_MX: + case S3_VIRGE_MXP: + drv->name = "S3 ViRGE/MX"; + break; + case S3_SAVAGE3D: + case S3_SAVAGE3D_MV: + drv->name = "S3 Savage3D"; + break; + case S3_SAVAGE4: + drv->name = "S3 Savage4"; + break; + case S3_SAVAGE_MX: + case S3_SAVAGE_MX_MV: + drv->name = "S3 Savage/MX"; + break; + case S3_SAVAGE_IX: + case S3_SAVAGE_IX_MV: + drv->name = "S3 Savage/IX"; + break; + case S3_SAVAGE_2000: + drv->name = "S3 Savage 2000"; + break; + case S3_VISION968: + case S3_VISION968_ALT: + drv->name = "S3 Vision968"; + break; + default: + drv->name = "S3 (unknown)"; + break; + } + + return true; +} + + +// ============================================================ +// s3HostBlit +// ============================================================ +// +// CPU-to-screen blit via the PIX_TRANS port. Transfers packed +// pixel data from system RAM to VRAM through the engine. The +// engine handles the destination address calculation and pitch +// alignment, so the CPU just streams data. 
+ +static void s3HostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t rowBytes = w * bpp; + int32_t wordCount = (rowBytes + 1) / 2; + + s3WaitFifo(priv, 7); + + // Foreground mix: source = CPU data, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_CPU | S3_MIX_ROP_SRC); + s3WriteReg16(priv, S3_WRT_MASK, 0xFFFF); + + // Pixel control: always foreground mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + // Destination position + s3WriteReg16(priv, S3_CUR_X, dstX); + s3WriteReg16(priv, S3_CUR_Y, dstY); + + // Dimensions + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + s3WaitFifo(priv, 1); + + // Command: rectangle, draw, CPU data, left-to-right top-to-bottom + s3WriteReg16(priv, S3_CMD, S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS + | S3_CMD_16BIT_IO); + + // Transfer pixel data row by row through PIX_TRANS + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t word = 0; word < wordCount; word++) { + int32_t byteOff = word * 2; + uint8_t lo = rowData[byteOff]; + uint8_t hi = (byteOff + 1 < rowBytes) ? rowData[byteOff + 1] : 0; + s3WriteReg16(priv, S3_PIX_TRANS, (hi << 8) | lo); + } + } +} + + +// ============================================================ +// s3Init +// ============================================================ +// +// Initializes the S3 chip: sets the requested video mode via +// VESA, unlocks extended registers, enables the 2D engine, and +// maps the linear framebuffer. 
+// +// Mode setting strategy: use VESA VBE to set the mode (with LFB +// flag bit 14 set), then unlock S3 extended registers and +// configure the acceleration engine. This avoids the complexity +// of programming S3-specific CRTC timing registers while still +// getting full hardware acceleration. + +static bool s3Init(AccelDriverT *drv, const AccelModeRequestT *req) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + priv->isTrio = (drv->pciDev.deviceId == S3_TRIO32 + || drv->pciDev.deviceId == S3_TRIO64 + || drv->pciDev.deviceId == S3_TRIO64V_PLUS + || drv->pciDev.deviceId == S3_VIRGE + || drv->pciDev.deviceId == S3_VIRGE_VX + || drv->pciDev.deviceId == S3_VIRGE_DX + || drv->pciDev.deviceId == S3_VIRGE_GX2 + || drv->pciDev.deviceId == S3_VIRGE_MX + || drv->pciDev.deviceId == S3_VIRGE_MXP + || drv->pciDev.deviceId == S3_SAVAGE3D + || drv->pciDev.deviceId == S3_SAVAGE3D_MV + || drv->pciDev.deviceId == S3_SAVAGE4 + || drv->pciDev.deviceId == S3_SAVAGE_MX + || drv->pciDev.deviceId == S3_SAVAGE_MX_MV + || drv->pciDev.deviceId == S3_SAVAGE_IX + || drv->pciDev.deviceId == S3_SAVAGE_IX_MV + || drv->pciDev.deviceId == S3_SAVAGE_2000); + + // Determine VRAM size and LFB address from BAR0 + uint32_t barSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + priv->vramSize = barSize; + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + + // Unlock S3 extended registers + s3UnlockRegs(); + + // Cross-check VRAM size from CR36 on Trio chips + if (priv->isTrio) { + uint8_t cr36 = vgaCrtcRead(0x36); + uint32_t ramFromCr36; + + switch ((cr36 >> 5) & 0x07) { + case 0: ramFromCr36 = 4 * 1024 * 1024; break; + case 2: ramFromCr36 = 3 * 1024 * 1024; break; + case 4: ramFromCr36 = 2 * 1024 * 1024; break; + case 6: ramFromCr36 = 1 * 1024 * 1024; break; + default: ramFromCr36 = 1 * 1024 * 1024; break; + } + + if (barSize < 512 * 1024 || 
barSize > 64 * 1024 * 1024) { + priv->vramSize = ramFromCr36; + } + } + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + return false; + } + + // Fill in driver mode info + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Map MMIO region for Trio64/ViRGE (at LFB + 16MB) + priv->useMMIO = false; + priv->mmio = NULL; + if (priv->isTrio) { + if (dpmiMapFramebuffer(priv->lfbPhysAddr + S3_MMIO_OFFSET, S3_MMIO_SIZE, &priv->mmioMapping)) { + priv->useMMIO = true; + priv->mmio = (volatile uint8_t *)priv->mmioMapping.ptr; + } + } + + // Re-unlock after mode set (VESA may re-lock) + s3UnlockRegs(); + + // Enable the graphics engine + // CR40 bit 0 = enable graphics engine + uint8_t cr40 = vgaCrtcRead(S3_CR40_SYS_CONFIG); + vgaCrtcWrite(S3_CR40_SYS_CONFIG, cr40 | S3_CR40_ENGINE_ENABLE); + + // Set up pixel format in CR50 for the engine + uint8_t cr50 = vgaCrtcRead(S3_CR50_EXT_SYS_CTRL_1); + cr50 &= 0xC0; // clear pixel length bits + + switch (vesa.bpp) { + case 8: + cr50 |= S3_CR50_PIX_8BPP; + break; + case 15: + case 16: + cr50 |= S3_CR50_PIX_16BPP; + break; + case 32: + cr50 |= S3_CR50_PIX_32BPP; + break; + } + + vgaCrtcWrite(S3_CR50_EXT_SYS_CTRL_1, cr50); + + // Set up hardware cursor location at end of VRAM + // Cursor image is 1KB (64x64 2bpp), aligned to 1KB + priv->cursorOffset = priv->vramSize - S3_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(S3_HW_CURSOR_BYTES - 1); + + // Set capability flags + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | 
ACAP_BITBLT + | ACAP_COLOR_EXPAND + | ACAP_HOST_BLIT + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Set full-screen clip rectangle + s3SetClip(drv, 0, 0, vesa.width, vesa.height); + + // Wait for engine to be ready + s3WaitIdle(drv); + + return true; +} + + +// ============================================================ +// s3LineDraw +// ============================================================ +// +// Bresenham line drawing using the S3 hardware engine. +// The engine implements the Bresenham algorithm natively -- +// we provide the initial error term and step values. + +static void s3LineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + int32_t dx = x2 - x1; + int32_t dy = y2 - y1; + + // Determine octant and make dx/dy positive + uint16_t cmd = S3_CMD_LINE | S3_CMD_DRAW | S3_CMD_LAST_PIXEL; + + if (dx >= 0) { + cmd |= S3_CMD_DIR_X_POS; + } else { + dx = -dx; + } + + if (dy >= 0) { + cmd |= S3_CMD_DIR_Y_POS; + } else { + dy = -dy; + } + + int32_t majAxis; + int32_t minAxis; + + if (dx >= dy) { + // X is major axis + majAxis = dx; + minAxis = dy; + } else { + // Y is major axis + cmd |= S3_CMD_DIR_Y_MAJOR; + majAxis = dy; + minAxis = dx; + } + + if (majAxis == 0) { + return; + } + + // Bresenham parameters: + // axialStep = 2 * minAxis + // diagonalStep = 2 * (minAxis - majAxis) + // errorTerm = 2 * minAxis - majAxis + int32_t axialStep = 2 * minAxis; + int32_t diagStep = 2 * (minAxis - majAxis); + int32_t errTerm = 2 * minAxis - majAxis; + + s3WaitFifo(priv, 7); + + s3WriteReg16(priv, S3_FRGD_COLOR, color); + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + s3WriteReg16(priv, S3_CUR_X, x1); + s3WriteReg16(priv, S3_CUR_Y, y1); + + s3WriteReg16(priv, S3_DESTY_AXSTP, axialStep); + s3WriteReg16(priv, S3_DESTX_DIASTP, diagStep); + + s3WaitFifo(priv, 3); + + 
s3WriteReg16(priv, S3_ERR_TERM, errTerm); + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, majAxis); + s3WriteReg16(priv, S3_CMD, cmd); +} + + +// ============================================================ +// s3MoveCursor +// ============================================================ +// +// Moves the hardware cursor to the given screen position. +// The S3 cursor registers are in CRTC extended registers CR46-CR49. + +static void s3MoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + // Handle negative coordinates (cursor partially off-screen) + // by setting the cursor origin offset in the image + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + vgaCrtcWrite(S3_CR46_HW_CURSOR_XHI, (x >> 8) & 0x07); + vgaCrtcWrite(S3_CR47_HW_CURSOR_XLO, x & 0xFF); + vgaCrtcWrite(S3_CR48_HW_CURSOR_YHI, (y >> 8) & 0x07); + vgaCrtcWrite(S3_CR49_HW_CURSOR_YLO, y & 0xFF); +} + + +// ============================================================ +// s3RectFill +// ============================================================ +// +// Solid rectangle fill using the S3 hardware engine. +// Sets the foreground color, selects foreground-only mix mode +// with copy ROP, then issues a rectangle command. 
+ +static void s3RectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + s3WaitFifo(priv, 7); + + // Set foreground color + s3WriteReg16(priv, S3_FRGD_COLOR, color); + + // Foreground mix: source = foreground color, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + + // Write mask: all bits enabled + s3WriteReg16(priv, S3_WRT_MASK, 0xFFFF); + + // Pixel control: always use foreground mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + // Starting position + s3WriteReg16(priv, S3_CUR_X, x); + s3WriteReg16(priv, S3_CUR_Y, y); + + // Dimensions (count is pixels - 1) + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + + s3WaitFifo(priv, 2); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Command: rectangle, draw, positive X and Y, packed mode + s3WriteReg16(priv, S3_CMD, S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS); +} + + +// ============================================================ +// s3RectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using CPU data mix mode. The pattern is +// 8 bytes (one per row, MSB-first), tiled across the rectangle. +// 1-bits use the foreground color, 0-bits use the background. +// Data is fed through PIX_TRANS as 16-bit words, repeating the +// 8-row pattern for the full height of the rectangle. 
+ +static void s3RectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + s3WaitFifo(priv, 8); + + // Set colors + s3WriteReg16(priv, S3_FRGD_COLOR, fg); + s3WriteReg16(priv, S3_BKGD_COLOR, bg); + + // Foreground mix: source = foreground color, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + // Background mix: source = background color, ROP = copy + s3WriteReg16(priv, S3_BKGD_MIX, S3_MIX_SRC_BKGD | S3_MIX_ROP_SRC); + + // Pixel control: CPU data selects fg/bg mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_CPU); + + // Destination and dimensions + s3WriteReg16(priv, S3_CUR_X, x); + s3WriteReg16(priv, S3_CUR_Y, y); + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + + s3WaitFifo(priv, 2); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Command: rectangle, draw, left-to-right top-to-bottom, CPU data + s3WriteReg16(priv, S3_CMD, S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS + | S3_CMD_16BIT_IO); + + // Feed tiled pattern data through PIX_TRANS. + // Each row of the pattern is 1 byte (8 pixels), tiled across the width. + int32_t wordsPerRow = (w + 15) / 16; + + for (int32_t row = 0; row < h; row++) { + uint8_t patByte = pattern[row & 7]; + + s3WaitFifo(priv, 1); + + for (int32_t word = 0; word < wordsPerRow; word++) { + s3WriteReg16(priv, S3_PIX_TRANS, (patByte << 8) | patByte); + } + } +} + + +// ============================================================ +// s3SetClip +// ============================================================ +// +// Programs the hardware scissor rectangle. All subsequent +// drawing operations are clipped to this region. 
+ +static void s3SetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + s3WaitFifo(priv, 4); + + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_L | (x & 0x0FFF)); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_T | (y & 0x0FFF)); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_R | ((x + w - 1) & 0x0FFF)); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_B | ((y + h - 1) & 0x0FFF)); +} + + +// ============================================================ +// s3SetCursor +// ============================================================ +// +// Uploads a cursor image to VRAM and configures the hardware +// cursor registers. The S3 hardware cursor is 64x64 pixels, +// stored as two bit planes (AND mask and XOR mask) at the +// cursor address in VRAM. +// +// S3 cursor VRAM format: +// 1024 bytes total = 512 bytes AND + 512 bytes XOR +// Each row: 8 bytes AND mask, 8 bytes XOR mask (interleaved +// by row on some chips, or plane-sequential on others). +// For Trio64: rows are interleaved (AND row, XOR row, ...). 
+ +static void s3SetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (!image) { + s3ShowCursor(drv, false); + return; + } + + // Wait for engine idle before writing to VRAM + s3WaitIdle(drv); + + // Write cursor image to VRAM at cursorOffset + // Format: for each of 64 rows, write 8 bytes AND then 8 bytes XOR + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < S3_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + // Outside the image: transparent (AND=0xFF, XOR=0x00) + andByte = 0xFF; + xorByte = 0x00; + } + + // Interleaved format: AND row bytes, then XOR row bytes + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor address in VRAM (in units of 1KB) + uint16_t cursorAddr = priv->cursorOffset / 1024; + vgaCrtcWrite(S3_CR4C_HW_CURSOR_ADDR_HI, (cursorAddr >> 8) & 0x0F); + vgaCrtcWrite(S3_CR4D_HW_CURSOR_ADDR_LO, cursorAddr & 0xFF); +} + + +// ============================================================ +// s3ShowCursor +// ============================================================ +// +// Enables or disables the hardware cursor via CR45. + +static void s3ShowCursor(AccelDriverT *drv, bool visible) { + (void)drv; + + uint8_t cr45 = vgaCrtcRead(S3_CR45_HW_CURSOR_MODE); + + if (visible) { + cr45 |= 0x01; // enable hardware cursor + } else { + cr45 &= ~0x01; // disable hardware cursor + } + + vgaCrtcWrite(S3_CR45_HW_CURSOR_MODE, cr45); +} + + +// ============================================================ +// s3Shutdown +// ============================================================ +// +// Restores text mode and cleans up. 
The VESA/VGA BIOS text mode +// restore handles resetting all the S3-specific registers. + +static void s3Shutdown(AccelDriverT *drv) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + s3ShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->mmioMapping); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// s3UnlockRegs +// ============================================================ +// +// Unlocks S3 extended CRTC registers. Three levels: +// CR38 = 0x48 : unlock S3 VGA registers (CR30-CR3F) +// CR39 = 0xA5 : unlock S3 system registers (CR40-CR5F) +// Also unlock standard CRTC protection for timing regs. + +static void s3UnlockRegs(void) { + vgaCrtcWrite(S3_CR38_LOCK_1, 0x48); + vgaCrtcWrite(S3_CR39_LOCK_2, 0xA5); + vgaCrtcUnlock(); +} + + +// ============================================================ +// s3WaitFifo +// ============================================================ +// +// Waits until the S3 command FIFO has at least 'slots' free +// entries. The FIFO depth is 8 on Trio64. Reading GP_STAT +// returns a bitmask where bits 7:0 indicate how many slots +// are free (each bit = one more slot free, from MSB to LSB). + +static void s3WaitFifo(S3PrivateT *priv, int32_t slots) { + // Build the required mask: if we need N slots free, we need + // bit (8 - N) to be set in GP_STAT bits 7:0. + // Bits: 0x80=1free, 0x40=2free, ..., 0x01=8free + uint16_t mask = 0x0100 >> slots; + + for (int32_t i = 0; i < S3_MAX_IDLE_WAIT; i++) { + if (s3ReadReg16(priv, S3_GP_STAT) & mask) { + return; + } + } +} + + +// ============================================================ +// s3WaitIdle +// ============================================================ +// +// Waits until the S3 graphics engine is completely idle. +// The engine is idle when the BUSY bit (bit 9) of GP_STAT is clear +// AND the FIFO is empty (bit 10 is set). 

static void s3WaitIdle(AccelDriverT *drv) {
    S3PrivateT *priv = (S3PrivateT *)drv->privData;

    // Bounded poll: return as soon as BUSY reads clear.  If the bit never
    // clears the loop simply falls through after S3_MAX_IDLE_WAIT reads;
    // callers get no indication that the wait timed out.
    for (int32_t i = 0; i < S3_MAX_IDLE_WAIT; i++) {
        uint16_t stat = s3ReadReg16(priv, S3_GP_STAT);
        if (!(stat & S3_GP_STAT_BUSY)) {
            return;
        }
    }
}
diff --git a/sis.c b/sis.c
new file mode 100644
index 0000000..88ef0c1
--- /dev/null
+++ b/sis.c
@@ -0,0 +1,561 @@
// sis.c -- SiS 6326/300/305/315/330 accelerated video driver
//
// Supports the SiS 6326, 300, 305, 315, and 330 integrated graphics
// chipsets.  These share a similar 2D engine interface based on a
// queue-based command submission model:
//   - Hardware rectangle fill
//   - Screen-to-screen BitBLT
//   - CPU-to-screen blit (host blit via data port)
//   - Hardware clip rectangle
//   - 64x64 hardware cursor
//
// Register access:
//   BAR0 maps the linear framebuffer.
//   BAR1 maps 128KB of MMIO registers.  The 2D engine registers
//   live at offsets 0x8200-0x8244 within this block.  Host data
//   is written to the MMIO data port at offset 0x8300.
//
// The 2D engine uses a command register at 0x822C to specify the
// operation type and ROP, then a fire register at 0x8230 to trigger
// execution.  Engine status is polled at 0x8244.
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include + +// ============================================================ +// SiS vendor/device IDs +// ============================================================ + +#define SIS_VENDOR_ID 0x1039 + +#define SIS_6326 0x6326 +#define SIS_300 0x0300 +#define SIS_305 0x0305 +#define SIS_315 0x0315 +#define SIS_330 0x0330 + +static const uint16_t sSisDeviceIds[] = { + SIS_VENDOR_ID, SIS_6326, + SIS_VENDOR_ID, SIS_300, + SIS_VENDOR_ID, SIS_305, + SIS_VENDOR_ID, SIS_315, + SIS_VENDOR_ID, SIS_330, + 0, 0 +}; + +// ============================================================ +// 2D engine register offsets (from MMIO base) +// ============================================================ + +#define SIS_SRC_ADDR 0x8200 // source address (for blit) +#define SIS_SRC_PITCH 0x8204 // source pitch +#define SIS_SRC_YX 0x8208 // src Y<<16 | X +#define SIS_DST_YX 0x820C // dst Y<<16 | X +#define SIS_RECT_WH 0x8210 // width<<16 | height +#define SIS_FG_COLOR 0x8214 // foreground color +#define SIS_BG_COLOR 0x8218 // background color +#define SIS_MONO_PAT0 0x821C // mono pattern 0 +#define SIS_MONO_PAT1 0x8220 // mono pattern 1 +#define SIS_CLIP_LT 0x8224 // clip left<<16 | top +#define SIS_CLIP_RB 0x8228 // clip right<<16 | bottom +#define SIS_CMD 0x822C // command register +#define SIS_FIRE 0x8230 // fire trigger +#define SIS_LINE_PARAMS 0x8234 // line parameters +#define SIS_DST_ADDR 0x8238 // destination address +#define SIS_SRC_DST_PITCH 0x823C // src/dst pitch combined +#define SIS_AGP_BASE 0x8240 // AGP base (unused) + +// ============================================================ +// Engine status register +// ============================================================ + +#define SIS_ENGINE_STATUS 0x8244 // bit 0 = queues empty, bit 1 = idle + +#define SIS_STATUS_QUEUE_EMPTY 0x01 +#define SIS_STATUS_ENGINE_IDLE 0x02 +#define SIS_STATUS_ALL_IDLE (SIS_STATUS_QUEUE_EMPTY | 
SIS_STATUS_ENGINE_IDLE) + +// ============================================================ +// Host data port +// ============================================================ + +#define SIS_HOST_DATA 0x8300 // write pixel data here as dwords + +// ============================================================ +// Command register encoding +// ============================================================ + +// Bits 7:0 = ROP +#define SIS_ROP_COPY 0xCC +#define SIS_ROP_PAT_COPY 0xF0 + +// Bit 8 = X direction +#define SIS_CMD_XDIR_RIGHT (1 << 8) + +// Bit 9 = Y direction +#define SIS_CMD_YDIR_DOWN (1 << 9) + +// Bits 13:10 = command type +#define SIS_CMD_BITBLT 0x0000 +#define SIS_CMD_COLOREXP 0x0400 +#define SIS_CMD_LINEDRAW 0x0800 +#define SIS_CMD_TRAPEZOID 0x0C00 + +// Bit 14 = pattern enable +#define SIS_CMD_PAT_ENABLE (1 << 14) + +// Bit 16 = clipping enable +#define SIS_CMD_CLIP_ENABLE (1 << 16) + +// Bit 24 = source is mono +#define SIS_CMD_SRC_MONO (1 << 24) + +// ============================================================ +// Hardware cursor registers +// ============================================================ + +#define SIS_CURSOR_ENABLE 0x8500 // bit 0 = enable +#define SIS_CURSOR_X 0x8504 // cursor X position +#define SIS_CURSOR_Y 0x8508 // cursor Y position +#define SIS_CURSOR_ADDR 0x850C // cursor VRAM byte offset + +// ============================================================ +// Misc constants +// ============================================================ + +#define SIS_MMIO_SIZE 131072 // BAR1: 128KB MMIO +#define SIS_MAX_IDLE_WAIT 1000000 +#define SIS_HW_CURSOR_SIZE 64 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; + DpmiMappingT mmioMapping; + DpmiMappingT lfbMapping; +} 
SisPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void sisBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool sisDetect(AccelDriverT *drv); +static void sisHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool sisInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void sisMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void sisRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void sisSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void sisSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void sisShowCursor(AccelDriverT *drv, bool visible); +static void sisShutdown(AccelDriverT *drv); +static void sisWaitIdle(AccelDriverT *drv); + +static inline void sisWrite(SisPrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t sisRead(SisPrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static SisPrivateT sSisPrivate; + +static AccelDriverT sSisDriver = { + .name = "SiS 6326", + .chipFamily = "sis", + .caps = 0, + .privData = &sSisPrivate, + .detect = sisDetect, + .init = sisInit, + .shutdown = sisShutdown, + .waitIdle = sisWaitIdle, + .setClip = sisSetClip, + .rectFill = sisRectFill, + .rectFillPat = NULL, + .bitBlt = sisBitBlt, + .hostBlit = sisHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, + .setCursor = sisSetCursor, + .moveCursor = sisMoveCursor, + .showCursor = sisShowCursor, +}; + +// ============================================================ +// sisRegisterDriver +// 
============================================================ + +void sisRegisterDriver(void) { + accelRegisterDriver(&sSisDriver); +} + + +// ============================================================ +// sisBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. Handles overlapping regions by choosing +// the correct X/Y direction based on source and destination positions. + +static void sisBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + sisWaitIdle(drv); + + // Determine blit direction for overlapping regions + uint32_t cmd = SIS_CMD_BITBLT | SIS_ROP_COPY | SIS_CMD_CLIP_ENABLE; + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (dstX <= srcX) { + cmd |= SIS_CMD_XDIR_RIGHT; + } else { + sx += w - 1; + dx += w - 1; + } + + if (dstY <= srcY) { + cmd |= SIS_CMD_YDIR_DOWN; + } else { + sy += h - 1; + dy += h - 1; + } + + uint32_t pitch = ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch; + + sisWrite(priv, SIS_SRC_DST_PITCH, pitch); + sisWrite(priv, SIS_SRC_YX, ((uint32_t)sy << 16) | (uint32_t)sx); + sisWrite(priv, SIS_DST_YX, ((uint32_t)dy << 16) | (uint32_t)dx); + sisWrite(priv, SIS_RECT_WH, ((uint32_t)w << 16) | (uint32_t)h); + sisWrite(priv, SIS_CMD, cmd); + sisWrite(priv, SIS_FIRE, 0); +} + + +// ============================================================ +// sisDetect +// ============================================================ + +static bool sisDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sSisDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case SIS_6326: + drv->name = "SiS 6326"; + break; + case SIS_300: + drv->name = "SiS 300"; + break; + case SIS_305: + drv->name = "SiS 305"; + break; + case SIS_315: + drv->name = "SiS 
315"; + break; + case SIS_330: + drv->name = "SiS 330"; + break; + default: + drv->name = "SiS 6326/3xx"; + break; + } + + return true; +} + + +// ============================================================ +// sisHostBlit +// ============================================================ +// +// CPU-to-screen blit. Issues a BitBLT command, then feeds pixel data +// as dwords through the MMIO host data port at offset 0x8300. + +static void sisHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + sisWaitIdle(drv); + + sisWrite(priv, SIS_SRC_DST_PITCH, (uint32_t)priv->screenPitch); + sisWrite(priv, SIS_DST_YX, ((uint32_t)dstY << 16) | (uint32_t)dstX); + sisWrite(priv, SIS_RECT_WH, ((uint32_t)w << 16) | (uint32_t)h); + sisWrite(priv, SIS_FG_COLOR, 0); + sisWrite(priv, SIS_CMD, SIS_CMD_BITBLT | SIS_ROP_COPY | SIS_CMD_CLIP_ENABLE | SIS_CMD_XDIR_RIGHT | SIS_CMD_YDIR_DOWN | SIS_CMD_SRC_MONO); + sisWrite(priv, SIS_FIRE, 0); + + // Feed pixel data row by row through the host data port + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + sisWrite(priv, SIS_HOST_DATA, val); + } + } +} + + +// ============================================================ +// sisInit +// ============================================================ + +static bool sisInit(AccelDriverT *drv, const AccelModeRequestT *req) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + // Read BARs + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, 
PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->mmioPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + + // Map MMIO control registers (128KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, SIS_MMIO_SIZE, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Wait for engine idle before configuring + sisWaitIdle(drv); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + sisSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// ============================================================ +// sisMoveCursor +// ============================================================ + +static void sisMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + sisWrite(priv, SIS_CURSOR_X, (uint32_t)x); + sisWrite(priv, SIS_CURSOR_Y, (uint32_t)y); +} + + +// 
============================================================ +// sisRectFill +// ============================================================ +// +// Solid rectangle fill. Sets the foreground color, loads the +// destination coordinates and dimensions, then fires a BitBLT +// command with PAT_COPY ROP and pattern enable to fill with a +// solid color. + +static void sisRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + sisWaitIdle(drv); + + sisWrite(priv, SIS_SRC_DST_PITCH, (uint32_t)priv->screenPitch); + sisWrite(priv, SIS_FG_COLOR, color); + sisWrite(priv, SIS_MONO_PAT0, 0xFFFFFFFF); + sisWrite(priv, SIS_MONO_PAT1, 0xFFFFFFFF); + sisWrite(priv, SIS_DST_YX, ((uint32_t)y << 16) | (uint32_t)x); + sisWrite(priv, SIS_RECT_WH, ((uint32_t)w << 16) | (uint32_t)h); + sisWrite(priv, SIS_CMD, SIS_CMD_BITBLT | SIS_ROP_PAT_COPY | SIS_CMD_PAT_ENABLE | SIS_CMD_CLIP_ENABLE | SIS_CMD_XDIR_RIGHT | SIS_CMD_YDIR_DOWN); + sisWrite(priv, SIS_FIRE, 0); +} + + +// ============================================================ +// sisSetClip +// ============================================================ + +static void sisSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + sisWrite(priv, SIS_CLIP_LT, ((uint32_t)x << 16) | (uint32_t)y); + sisWrite(priv, SIS_CLIP_RB, ((uint32_t)(x + w - 1) << 16) | (uint32_t)(y + h - 1)); +} + + +// ============================================================ +// sisSetCursor +// ============================================================ +// +// Upload a 64x64 hardware cursor image to VRAM. The SiS cursor +// format is 2bpp: AND mask and XOR mask interleaved per row, +// 16 bytes per row (8 AND + 8 XOR). Total size is 1024 bytes. 
+ +static void sisSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (!image) { + sisShowCursor(drv, false); + return; + } + + sisWaitIdle(drv); + + // Store cursor image at end of VRAM (1KB aligned) + uint32_t cursorOffset = priv->vramSize - 1024; + cursorOffset &= ~0x3FF; + uint8_t *cursorMem = drv->mode.framebuffer + cursorOffset; + + // Write AND mask then XOR mask, interleaved per row + for (int32_t row = 0; row < SIS_HW_CURSOR_SIZE; row++) { + for (int32_t byteIdx = 0; byteIdx < 8; byteIdx++) { + int32_t srcIdx = row * 8 + byteIdx; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byteIdx < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; // transparent + xorByte = 0x00; + } + + cursorMem[row * 16 + byteIdx] = andByte; + cursorMem[row * 16 + byteIdx + 8] = xorByte; + } + } + + // Set cursor address register + sisWrite(priv, SIS_CURSOR_ADDR, cursorOffset); +} + + +// ============================================================ +// sisShowCursor +// ============================================================ + +static void sisShowCursor(AccelDriverT *drv, bool visible) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + sisWrite(priv, SIS_CURSOR_ENABLE, visible ? 1 : 0); +} + + +// ============================================================ +// sisShutdown +// ============================================================ + +static void sisShutdown(AccelDriverT *drv) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + sisShowCursor(drv, false); + vgaRestoreTextMode(); + + dpmiUnmapFramebuffer(&priv->lfbMapping); + dpmiUnmapFramebuffer(&priv->mmioMapping); + + priv->mmio = NULL; +} + + +// ============================================================ +// sisWaitIdle +// ============================================================ +// +// Wait until the 2D engine is completely idle. 
// Both bit 0 (queues
// empty) and bit 1 (engine idle) of the status register at 0x8244
// must be set.

static void sisWaitIdle(AccelDriverT *drv) {
    SisPrivateT *priv = (SisPrivateT *)drv->privData;

    // Bounded poll: if the engine never reports idle, the loop falls through
    // after SIS_MAX_IDLE_WAIT reads and the caller is not told about it.
    for (int32_t i = 0; i < SIS_MAX_IDLE_WAIT; i++) {
        uint32_t stat = sisRead(priv, SIS_ENGINE_STATUS);
        if ((stat & SIS_STATUS_ALL_IDLE) == SIS_STATUS_ALL_IDLE) {
            return;
        }
    }
}
diff --git a/test/86box.cfg b/test/86box.cfg
new file mode 100644
index 0000000..71c2cb4
--- /dev/null
+++ b/test/86box.cfg
@@ -0,0 +1,62 @@
# 86Box configuration for testing DOS accelerated video drivers
# Target: S3 Trio64 with 2MB VRAM

[Machine]
machine = Award 430FX
cpu_family = intel_pentium_p54c
cpu_speed = 75000000
cpu_multi = 1.5
fpu_type = internal
mem_size = 16
time_sync = local

[Video]
gfxcard = S3 Trio64
voodoo = off

[Video S3 Trio64]
memory = 2

[Input]
mouse_type = ps2

[Sound]
sndcard = Sound Blaster 16
midi_device = none
mpu401 = none
opl_type = nuked

[Floppy and CD-ROM drives]
fdd_01_type = 35_2hd
fdd_02_type = none
cdrom_01_host_drive = 0
cdrom_01_speed = 8
cdrom_01_type = 86B_CD-ROM_1.00
cdrom_01_bus_type = ide
cdrom_01_ide_channel = 1:0

[Hard disks]
hdd_01_parameters = 63, 16, 507, 0, ide, none
hdd_01_fn = dos622.img
hdd_01_ide_channel = 0:0

[Floppy images]
fdd_01_fn =
fdd_02_fn =

[Storage controllers]
hdc = IDE (PCI)
scsi_card = none

[Network]
net_type = none

[Ports (COM & LPT)]
serial1_enabled = 1
serial2_enabled = 0
lpt1_enabled = 1
lpt1_device = none

[Other peripherals]
bugger = off
postcard = off
diff --git a/test/README.txt b/test/README.txt
new file mode 100644
index 0000000..c1466bd
--- /dev/null
+++ b/test/README.txt
@@ -0,0 +1,121 @@
86Box Test Environment Setup
============================

This directory contains configuration files for testing the DOS
accelerated video driver demo under 86Box, an x86 hardware emulator.
+ +The 86box.cfg is configured for: + - Intel Pentium 75 MHz (Award 430FX chipset) + - 16 MB RAM + - S3 Trio64 with 2 MB VRAM + - Sound Blaster 16 + - IDE hard disk (504 MB image) + - 3.5" 1.44 MB floppy drive + - IDE CD-ROM + + +Step 1: Install 86Box +--------------------- +Download 86Box from https://86box.net/ and extract it to a +directory of your choice. You also need the ROM set -- place +the roms/ folder alongside the 86Box executable. + + +Step 2: Create a Hard Disk Image +-------------------------------- +Use 86Box's built-in disk creation or an external tool: + + - In 86Box: Settings > Hard Disks > New + - Create a 504 MB image named "dos622.img" + - Or use: dd if=/dev/zero of=dos622.img bs=1M count=504 + +The 86box.cfg expects the image at: + dos622.img (in the same directory as 86box.cfg) + + +Step 3: Install DOS 6.22 +------------------------- +1. Copy 86box.cfg to your 86Box working directory (or point + 86Box at this directory with the --vmpath flag). +2. Obtain MS-DOS 6.22 floppy images (disk1.img, disk2.img, disk3.img). +3. Start 86Box. Insert disk1.img in the floppy drive: + Settings > Floppy & CD-ROM > Floppy 1 > select disk1.img +4. Boot from floppy (the machine should boot from A: by default). +5. Follow the DOS setup process: + - FDISK: create a primary partition using all space, set active + - Reboot from floppy after FDISK + - FORMAT C: /S + - Run SETUP from the DOS disks +6. Swap floppy images when prompted for disk 2 and disk 3. +7. After setup completes, remove the floppy image and reboot + to verify DOS boots from the hard drive. + + +Step 4: Install CWSDPMI +----------------------- +The demo is a DJGPP (32-bit protected mode) executable and needs +a DPMI host. Download CWSDPMI from: + http://sandmann.dotster.com/cwsdpmi/ + +Copy CWSDPMI.EXE to C:\ on the disk image. DJGPP executables +will load it automatically when no other DPMI host is present. 
+ +Alternatively, you can use CWSDPR0.EXE for ring-0 operation, +which provides direct hardware access without virtualization +overhead. + + +Step 5: Copy the Demo +---------------------- +Mount the disk image and copy these files to C:\: + + demo.exe - the compiled demo executable + cwsdpmi.exe - DPMI host (see Step 4) + +You can mount the image on Linux with: + sudo mount -o loop,offset=32256 dos622.img /mnt + +Or use mtools: + mcopy -i dos622.img@@32256 demo.exe :: + mcopy -i dos622.img@@32256 cwsdpmi.exe :: + +Also copy rundemo.bat for convenience: + mcopy -i dos622.img@@32256 rundemo.bat :: + + +Step 6: Run the Demo +-------------------- +Boot the machine in 86Box and at the C:\> prompt: + + C:\>RUNDEMO + +Or run directly: + + C:\>DEMO 640 480 16 + +Other supported modes (depending on VRAM): + C:\>DEMO 800 600 16 + C:\>DEMO 640 480 32 + C:\>DEMO 1024 768 8 + +Controls: + SPACE - cycle to next demo + B - run benchmark + ESC - exit + + +Troubleshooting +--------------- +- "No supported video hardware found": Verify 86box.cfg has + the S3 Trio64 selected. Check that PCI is enabled. + +- Black screen or garbled display: The S3 driver may not support + the requested mode at the configured VRAM size. Try a lower + resolution or color depth. + +- "Load error: no DPMI": CWSDPMI.EXE is missing or not in the + PATH. Copy it to the same directory as DEMO.EXE. + +- Demo runs but acceleration looks wrong: Some 86Box versions + have incomplete S3 acceleration emulation. Try updating to + the latest 86Box release. 
diff --git a/test/rundemo.bat b/test/rundemo.bat new file mode 100644 index 0000000..b8512cd --- /dev/null +++ b/test/rundemo.bat @@ -0,0 +1,3 @@ +@ECHO OFF +REM Run the accelerated video driver demo at 640x480 16-bit color +DEMO.EXE 640 480 16 diff --git a/trident.c b/trident.c new file mode 100644 index 0000000..ac683e9 --- /dev/null +++ b/trident.c @@ -0,0 +1,630 @@ +// trident.c -- Trident TGUI9440/9660/9680 accelerated video driver +// +// Supports the Trident TGUI family: TGUI9440, TGUI9660, TGUI9680, +// ProVidia 9685, Blade3D, and CyberBlade. These were common PCI +// chips in low-cost 1990s desktop and laptop systems. +// +// The TGUI 2D engine provides: +// - Solid rectangle fill (pattern source) +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host data transfer) +// - Hardware cursor (64x64) +// +// Register access: +// The GER (Graphics Engine Register) set uses I/O ports in the +// 0x2120-0x214F range. Operations are programmed by writing +// coordinates, dimensions, ROP, and command byte, then the engine +// executes asynchronously. Status is polled at 0x2120. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Trident vendor/device IDs +// ============================================================ + +#define TRIDENT_VENDOR_ID 0x1023 + +#define TRIDENT_TGUI9440 0x9440 +#define TRIDENT_TGUI9660 0x9660 +#define TRIDENT_TGUI9680 0x9680 +#define TRIDENT_PROVIDIA 0x9685 +#define TRIDENT_BLADE3D 0x9880 +#define TRIDENT_CYBERBLADE 0x9910 + +static const uint16_t sTridentDeviceIds[] = { + TRIDENT_VENDOR_ID, TRIDENT_TGUI9440, + TRIDENT_VENDOR_ID, TRIDENT_TGUI9660, + TRIDENT_VENDOR_ID, TRIDENT_TGUI9680, + TRIDENT_VENDOR_ID, TRIDENT_PROVIDIA, + TRIDENT_VENDOR_ID, TRIDENT_BLADE3D, + TRIDENT_VENDOR_ID, TRIDENT_CYBERBLADE, + 0, 0 +}; + +// ============================================================ +// GER (Graphics Engine Register) ports +// ============================================================ + +#define GER_STATUS 0x2120 // word: bit 0 = engine busy +#define GER_OPERMODE 0x2122 // word: bits 2:0 = bpp encoding +#define GER_COMMAND 0x2124 // byte: command register +#define GER_ROP 0x2125 // byte: raster operation +#define GER_FG_COLOR 0x2128 // dword: foreground color +#define GER_BG_COLOR 0x212C // dword: background color +#define GER_PAT_ADDR 0x2130 // dword: pattern address +#define GER_SRC_X 0x2138 // word: source X +#define GER_SRC_Y 0x213A // word: source Y +#define GER_DST_X 0x213C // word: destination X +#define GER_DST_Y 0x213E // word: destination Y +#define GER_DIM_X 0x2140 // word: width - 1 +#define GER_DIM_Y 0x2142 // word: height - 1 +#define GER_STYLE 0x2144 // dword: line style/pattern +#define GER_CKEY 0x2148 // dword: color key + +// ============================================================ +// GER status bits +// ============================================================ + +#define GER_STATUS_BUSY 0x0001 + +// 
// ============================================================
// GER command byte encoding
// ============================================================
//
//   Bit 0:    X direction (0=left, 1=right)
//   Bit 1:    Y direction (0=up, 1=down)
//   Bits 3:2: source select (00=video, 01=system, 10=pattern)
//   Bit 4:    draw enable (must be set)
//   Bit 5:    mono source
//   Bits 7:6: command type (00=bitblt)

#define GER_CMD_X_RIGHT          0x01
#define GER_CMD_X_LEFT           0x00
#define GER_CMD_Y_DOWN           0x02
#define GER_CMD_Y_UP             0x00
#define GER_CMD_SRC_VIDEO        0x00
#define GER_CMD_SRC_SYSTEM       0x04
#define GER_CMD_SRC_PATTERN      0x08
#define GER_CMD_DRAW             0x10
#define GER_CMD_MONO             0x20
#define GER_CMD_BITBLT           0x00

// Composite commands
// All three draw left-to-right, top-to-bottom; they differ only in where
// the source pixels come from (pattern, video memory, or the host).
#define GER_CMD_SOLID_FILL       (GER_CMD_BITBLT | GER_CMD_SRC_PATTERN | GER_CMD_DRAW | GER_CMD_X_RIGHT | GER_CMD_Y_DOWN)
#define GER_CMD_SCRBLT_FWD       (GER_CMD_BITBLT | GER_CMD_SRC_VIDEO | GER_CMD_DRAW | GER_CMD_X_RIGHT | GER_CMD_Y_DOWN)
#define GER_CMD_HOSTBLT          (GER_CMD_BITBLT | GER_CMD_SRC_SYSTEM | GER_CMD_DRAW | GER_CMD_X_RIGHT | GER_CMD_Y_DOWN)

// ============================================================
// GER opermode bpp encoding (bits 2:0)
// ============================================================

#define GER_BPP_8                0x00
#define GER_BPP_16               0x01
#define GER_BPP_32               0x02

// ============================================================
// ROPs for GER engine
// ============================================================

#define TGUI_ROP_COPY            0xCC
#define TGUI_ROP_PAT_COPY        0xF0

// ============================================================
// Hardware cursor
// ============================================================
//
// 64x64 cursor stored at end of VRAM.  Each row is 16 bytes:
// 8 bytes AND mask followed by 8 bytes XOR mask.
// Enable via CRTC extended register 0x50 bit 7.
// Position via CRTC registers 0x40-0x43.

#define TGUI_CURSOR_SIZE         64
#define TGUI_CURSOR_BYTES        (TGUI_CURSOR_SIZE * 16)  // 1024 bytes

// ============================================================
// CRTC extended registers for cursor
// ============================================================

#define TGUI_CRTC_CURSOR_X_LO    0x40
#define TGUI_CRTC_CURSOR_X_HI    0x41
#define TGUI_CRTC_CURSOR_Y_LO    0x42
#define TGUI_CRTC_CURSOR_Y_HI    0x43
#define TGUI_CRTC_CURSOR_CTRL    0x50

// ============================================================
// Miscellaneous
// ============================================================

// Iteration cap for idle polling, judging by the name -- the loop that
// uses it is not visible in this part of the file.
#define TGUI_MAX_IDLE_WAIT       1000000

// ============================================================
// Private driver state
// ============================================================

// NOTE(review): field meanings below are inferred from the names; the
// code that fills them in is not visible in this part of the file.
typedef struct {
    uint32_t lfbPhysAddr;    // presumably the physical address of the linear framebuffer
    uint32_t vramSize;       // presumably the amount of video memory, in bytes
    uint32_t cursorOffset;   // presumably the byte offset of the cursor image in VRAM
    int32_t  bytesPerPixel;
    int32_t  screenPitch;    // presumably bytes per scanline
    uint16_t chipId;         // presumably the detected PCI device ID
} TridentPrivateT;

// ============================================================
// Prototypes
// ============================================================

static void    tgBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h);
static bool    tgDetect(AccelDriverT *drv);
static uint8_t tgGetBppMode(int32_t bytesPerPixel);
static void    tgHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h);
static bool    tgInit(AccelDriverT *drv, const AccelModeRequestT *req);
static void    tgMoveCursor(AccelDriverT *drv, int32_t x, int32_t y);
static void    tgRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color);
static void    tgSetCursor(AccelDriverT *drv, const HwCursorImageT *image);
static void    tgShowCursor(AccelDriverT *drv, bool visible);
static void    tgShutdown(AccelDriverT *drv);
static void    tgUnlockRegs(void);
static void    tgWaitIdle(AccelDriverT *drv);

============================================================ +// Driver instance +// ============================================================ + +static TridentPrivateT sTridentPrivate; + +static AccelDriverT sTridentDriver = { + .name = "Trident TGUI", + .chipFamily = "trident", + .caps = 0, + .privData = &sTridentPrivate, + .detect = tgDetect, + .init = tgInit, + .shutdown = tgShutdown, + .waitIdle = tgWaitIdle, + .setClip = NULL, + .rectFill = tgRectFill, + .rectFillPat = NULL, + .bitBlt = tgBitBlt, + .hostBlit = tgHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, + .setCursor = tgSetCursor, + .moveCursor = tgMoveCursor, + .showCursor = tgShowCursor, +}; + +// ============================================================ +// tridentRegisterDriver +// ============================================================ + +void tridentRegisterDriver(void) { + accelRegisterDriver(&sTridentDriver); +} + + +// ============================================================ +// tgBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. Direction bits are set to handle +// overlapping source/destination regions correctly. 
+ +static void tgBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + if (w <= 0 || h <= 0) { + return; + } + + tgWaitIdle(drv); + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + // Determine copy direction for overlap handling + uint8_t cmd = GER_CMD_BITBLT | GER_CMD_SRC_VIDEO | GER_CMD_DRAW; + + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (dstY > srcY || (dstY == srcY && dstX > srcX)) { + // Copy bottom-to-top, right-to-left + sx += w - 1; + sy += h - 1; + dx += w - 1; + dy += h - 1; + cmd |= GER_CMD_X_LEFT | GER_CMD_Y_UP; + } else { + // Copy top-to-bottom, left-to-right + cmd |= GER_CMD_X_RIGHT | GER_CMD_Y_DOWN; + } + + // Set operation mode (bpp) + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // ROP: copy + outportb(GER_ROP, TGUI_ROP_COPY); + + // Source coordinates + outportw(GER_SRC_X, sx); + outportw(GER_SRC_Y, sy); + + // Destination coordinates + outportw(GER_DST_X, dx); + outportw(GER_DST_Y, dy); + + // Dimensions (width - 1, height - 1) + outportw(GER_DIM_X, w - 1); + outportw(GER_DIM_Y, h - 1); + + // Fire command + outportb(GER_COMMAND, cmd); +} + + +// ============================================================ +// tgDetect +// ============================================================ + +static bool tgDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sTridentDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + priv->chipId = drv->pciDev.deviceId; + + switch (drv->pciDev.deviceId) { + case TRIDENT_TGUI9440: + drv->name = "Trident TGUI9440"; + break; + case TRIDENT_TGUI9660: + drv->name = "Trident TGUI9660"; + break; + case TRIDENT_TGUI9680: + drv->name = "Trident TGUI9680"; + break; + case TRIDENT_PROVIDIA: + drv->name = "Trident ProVidia 9685"; + break; + case TRIDENT_BLADE3D: + drv->name = "Trident Blade3D"; + break; + 
case TRIDENT_CYBERBLADE: + drv->name = "Trident CyberBlade"; + break; + default: + drv->name = "Trident TGUI"; + break; + } + + return true; +} + + +// ============================================================ +// tgGetBppMode +// ============================================================ +// +// Return the GER_OPERMODE bpp encoding for the given bytes per pixel. + +static uint8_t tgGetBppMode(int32_t bytesPerPixel) { + switch (bytesPerPixel) { + case 2: + return GER_BPP_16; + case 4: + return GER_BPP_32; + default: + return GER_BPP_8; + } +} + + +// ============================================================ +// tgHostBlit +// ============================================================ +// +// CPU-to-screen blit. Sets source select to system/CPU and feeds +// pixel data through the GER data port. Each scanline of source +// data is written as a series of 32-bit dwords. + +static void tgHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + if (w <= 0 || h <= 0) { + return; + } + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + int32_t rowBytes = w * priv->bytesPerPixel; + int32_t padBytes = (rowBytes + 3) & ~3; + int32_t dwordsPerRow = padBytes / 4; + + tgWaitIdle(drv); + + // Set operation mode (bpp) + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // ROP: copy + outportb(GER_ROP, TGUI_ROP_COPY); + + // Source coordinates (not meaningful for host data, set to 0) + outportw(GER_SRC_X, 0); + outportw(GER_SRC_Y, 0); + + // Destination coordinates + outportw(GER_DST_X, dstX); + outportw(GER_DST_Y, dstY); + + // Dimensions + outportw(GER_DIM_X, w - 1); + outportw(GER_DIM_Y, h - 1); + + // Fire host blit command + outportb(GER_COMMAND, GER_CMD_HOSTBLT); + + // Feed pixel data row by row as dwords + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t d = 0; d < dwordsPerRow; d++) { + int32_t base = d * 4; + 
uint32_t dword = 0; + + for (int32_t b = 0; b < 4; b++) { + int32_t idx = base + b; + uint8_t byte = (idx < rowBytes) ? rowData[idx] : 0; + dword |= (uint32_t)byte << (b * 8); + } + + outportl(GER_SRC_X, dword); + } + } +} + + +// ============================================================ +// tgInit +// ============================================================ + +static bool tgInit(AccelDriverT *drv, const AccelModeRequestT *req) { + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + // Get LFB physical address from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + + // Unlock Trident extended registers + tgUnlockRegs(); + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + DpmiMappingT lfbMap; + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &lfbMap)) { + vgaRestoreTextMode(); + return false; + } + + // Fill in driver mode info + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = lfbMap.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Re-unlock after mode set (VESA BIOS may re-lock) + tgUnlockRegs(); + + // Set GER operation mode for current bpp + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // Set up hardware cursor at end of VRAM + priv->cursorOffset = priv->vramSize - TGUI_CURSOR_BYTES; + priv->cursorOffset &= ~(uint32_t)(TGUI_CURSOR_BYTES - 1); + + // Set cursor start address via CRTC extended registers + // The cursor address is stored as a byte offset divided by 1024 + uint32_t 
cursorAddrReg = priv->cursorOffset / 1024; + vgaCrtcWrite(0x44, cursorAddrReg & 0xFF); + vgaCrtcWrite(0x45, (cursorAddrReg >> 8) & 0xFF); + + drv->caps = ACAP_RECT_FILL | ACAP_BITBLT | ACAP_HOST_BLIT | ACAP_HW_CURSOR; + + tgWaitIdle(drv); + return true; +} + + +// ============================================================ +// tgMoveCursor +// ============================================================ +// +// Set the hardware cursor position via CRTC extended registers +// 0x40-0x43. X is at 0x40/0x41, Y is at 0x42/0x43. + +static void tgMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + vgaCrtcWrite(TGUI_CRTC_CURSOR_X_LO, x & 0xFF); + vgaCrtcWrite(TGUI_CRTC_CURSOR_X_HI, (x >> 8) & 0x07); + vgaCrtcWrite(TGUI_CRTC_CURSOR_Y_LO, y & 0xFF); + vgaCrtcWrite(TGUI_CRTC_CURSOR_Y_HI, (y >> 8) & 0x07); +} + + +// ============================================================ +// tgRectFill +// ============================================================ +// +// Solid rectangle fill using the GER engine in pattern source mode. +// The foreground color register provides the fill color, and the +// ROP is set to pattern copy (0xF0). 
+ +static void tgRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + if (w <= 0 || h <= 0) { + return; + } + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + tgWaitIdle(drv); + + // Set operation mode (bpp) + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // Foreground color for the fill + outportl(GER_FG_COLOR, color); + + // ROP: pattern copy (solid fill uses fg color as pattern) + outportb(GER_ROP, TGUI_ROP_PAT_COPY); + + // Destination coordinates + outportw(GER_DST_X, x); + outportw(GER_DST_Y, y); + + // Dimensions (width - 1, height - 1) + outportw(GER_DIM_X, w - 1); + outportw(GER_DIM_Y, h - 1); + + // Fire solid fill command + outportb(GER_COMMAND, GER_CMD_SOLID_FILL); +} + + +// ============================================================ +// tgSetCursor +// ============================================================ +// +// Upload a cursor image to VRAM at the cursor offset. The TGUI +// cursor format is 64x64 with 16 bytes per row: 8 bytes AND mask +// followed by 8 bytes XOR mask. 
+ +static void tgSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + if (!image) { + tgShowCursor(drv, false); + return; + } + + tgWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < TGUI_CURSOR_SIZE; row++) { + for (int32_t col = 0; col < 8; col++) { + int32_t srcIdx = row * 8 + col; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && col < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + // Transparent: AND=0xFF, XOR=0x00 + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + col] = andByte; + cursorMem[row * 16 + col + 8] = xorByte; + } + } +} + + +// ============================================================ +// tgShowCursor +// ============================================================ +// +// Enable or disable the hardware cursor via CRTC extended +// register 0x50, bit 7. + +static void tgShowCursor(AccelDriverT *drv, bool visible) { + (void)drv; + + uint8_t val = vgaCrtcRead(TGUI_CRTC_CURSOR_CTRL); + + if (visible) { + val |= 0x80; + } else { + val &= ~0x80; + } + + vgaCrtcWrite(TGUI_CRTC_CURSOR_CTRL, val); +} + + +// ============================================================ +// tgShutdown +// ============================================================ + +static void tgShutdown(AccelDriverT *drv) { + tgShowCursor(drv, false); + tgWaitIdle(drv); + vgaRestoreTextMode(); + __djgpp_nearptr_disable(); +} + + +// ============================================================ +// tgUnlockRegs +// ============================================================ +// +// Unlock Trident extended registers. Reading SR0B returns the +// chip version/ID and simultaneously unlocks the extended +// sequencer and CRTC registers. Then writing 0x01 to SR0E +// enables new-mode registers on TGUI chips. 
+ +static void tgUnlockRegs(void) { + // Read SR0B to unlock extensions (returns chip ID) + outportb(VGA_SEQ_INDEX, 0x0B); + (void)inportb(VGA_SEQ_DATA); + + // Enable new-mode TGUI registers + outportb(VGA_SEQ_INDEX, 0x0E); + outportb(VGA_SEQ_DATA, 0x01); +} + + +// ============================================================ +// tgWaitIdle +// ============================================================ +// +// Wait for the GER engine to finish. Polls the status register +// at 0x2120 until bit 0 (busy) clears. + +static void tgWaitIdle(AccelDriverT *drv) { + (void)drv; + + for (int32_t i = 0; i < TGUI_MAX_IDLE_WAIT; i++) { + if (!(inportw(GER_STATUS) & GER_STATUS_BUSY)) { + return; + } + } +} diff --git a/tsengW32.c b/tsengW32.c new file mode 100644 index 0000000..e721d6f --- /dev/null +++ b/tsengW32.c @@ -0,0 +1,698 @@ +// tsengW32.c -- Tseng ET4000/W32p accelerated video driver +// +// Supports the Tseng Labs ET4000/W32 family: W32, W32i, W32p rev A/B/C/D. +// These chips were common in ISA/VLB and early PCI systems of the early +// 1990s, offering good 2D acceleration for their era. +// +// The W32 ACL (Accelerator) engine provides: +// - Solid rectangle fill +// - 8x8 pattern fill (mono and color) +// - Screen-to-screen BitBLT +// - CPU-to-screen color expansion +// - Bresenham line draw (W32p only) +// - Hardware cursor (64x64 on W32p, not on W32/W32i) +// +// Register access: +// The ACL registers are accessed via I/O ports in the 0x21xx range +// after unlocking with a key sequence. The ACL uses a different +// programming model from S3 or ATI -- operations are set up by +// writing source/destination addresses, dimensions, and mix/ROP +// to indexed registers, then triggered by writing to the +// accelerator control register. +// +// On the W32p, an MMU (Memory Management Unit) provides four +// apertures at the end of the linear address space that can be +// used for CPU-to-screen data transfer, avoiding I/O port +// overhead for host blits. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Tseng vendor/device IDs +// ============================================================ + +#define TSENG_VENDOR_ID 0x100C + +#define TSENG_W32 0x3202 +#define TSENG_W32I 0x3205 +#define TSENG_W32P_A 0x3206 +#define TSENG_W32P_B 0x3207 +#define TSENG_W32P_C 0x3208 +#define TSENG_W32P_D 0x4702 + +static const uint16_t sTsengDeviceIds[] = { + TSENG_VENDOR_ID, TSENG_W32, + TSENG_VENDOR_ID, TSENG_W32I, + TSENG_VENDOR_ID, TSENG_W32P_A, + TSENG_VENDOR_ID, TSENG_W32P_B, + TSENG_VENDOR_ID, TSENG_W32P_C, + TSENG_VENDOR_ID, TSENG_W32P_D, + 0, 0 +}; + +// ============================================================ +// Tseng ACL register ports +// ============================================================ +// +// The ACL registers are at I/O ports 0x2100-0x217F. They are +// accessed as indexed registers via a base+offset scheme. 

// Listed in ascending port order.

// Control and status
#define ET_ACL_SUSPEND_TERM     0x2100  // suspend/terminate
#define ET_ACL_OPERATION_STATE  0x2101  // operation state (read)
#define ET_ACL_SYNC_ENABLE      0x2102  // sync enable
#define ET_ACL_INT_STATUS       0x2109  // interrupt status
#define ET_ACL_INT_MASK         0x210A  // interrupt mask

// Source / pattern setup
#define ET_ACL_PATTERN_ADDR     0x2110  // pattern address (3 bytes)
#define ET_ACL_SOURCE_ADDR      0x2114  // source address (3 bytes)
#define ET_ACL_PATTERN_Y_OFF    0x2118  // pattern Y offset
#define ET_ACL_SOURCE_Y_OFF     0x211A  // source Y offset
#define ET_ACL_DEST_Y_OFF       0x211C  // destination Y offset

// Virtual bus size affects transfer granularity
#define ET_ACL_VBUS_SIZE        0x2120  // virtual bus size

// Direction, routing, mix and dimensions
#define ET_ACL_XY_DIR           0x2124  // X/Y direction
#define ET_ACL_ROUTING_CTRL     0x2126  // routing control
#define ET_ACL_MIX_CONTROL      0x2127  // foreground/background source
#define ET_ACL_X_COUNT          0x2128  // X count (width - 1, in bytes)
#define ET_ACL_Y_COUNT          0x212A  // Y count (height - 1)

// Raster operation and destination
#define ET_ACL_ROP              0x2130  // raster operation
#define ET_ACL_DEST_ADDR        0x2134  // destination address (3 bytes)

// Pixel depth control
#define ET_ACL_PIXEL_DEPTH      0x2138  // pixel depth (0=8, 1=15/16, 2=24, 3=32)

// CPU source data port (for host-to-screen)
#define ET_ACL_CPU_DATA         0x2140  // CPU data register (32-bit)

// ============================================================
// ACL direction bits (ET_ACL_XY_DIR)
// ============================================================

#define ET_DIR_X_POS            0x00
#define ET_DIR_X_NEG            0x01
#define ET_DIR_Y_POS            0x00
#define ET_DIR_Y_NEG            0x02

// ============================================================
// ACL routing control (ET_ACL_ROUTING_CTRL)
// ============================================================

#define ET_ROUTE_SRC_VRAM       0x00    // source from video memory
#define ET_ROUTE_SRC_CPU        0x02    // source from CPU
+#define ET_ROUTE_SRC_PATTERN 0x04 // source from pattern +#define ET_ROUTE_SRC_COLOR_EXP 0x06 // source is mono -> color expand +#define ET_ROUTE_DST_VRAM 0x00 // destination to video memory + +// ============================================================ +// ACL mix control (ET_ACL_MIX_CONTROL) +// ============================================================ + +#define ET_MIX_FG_SRC 0x00 // foreground from source +#define ET_MIX_FG_PATTERN 0x04 // foreground from pattern +#define ET_MIX_FG_COLOR 0x08 // foreground from foreground color reg +#define ET_MIX_BG_SRC 0x00 // background from source +#define ET_MIX_BG_PATTERN 0x10 // background from pattern +#define ET_MIX_BG_COLOR 0x20 // background from background color reg + +// ============================================================ +// ACL operation state bits +// ============================================================ + +#define ET_ACCEL_BUSY 0x02 // accelerator busy +#define ET_ACCEL_CMD_READY 0x01 // ready for next command + +// ============================================================ +// ACL suspend/terminate control +// ============================================================ + +#define ET_ACL_START 0x00 // start/continue operation +#define ET_ACL_SUSPEND 0x01 // suspend +#define ET_ACL_TERMINATE 0x02 // terminate + +// Common ROPs +#define ET_ROP_COPY 0xCC // dest = source +#define ET_ROP_PAT_COPY 0xF0 // dest = pattern +#define ET_ROP_ZERO 0x00 +#define ET_ROP_ONE 0xFF +#define ET_ROP_XOR 0x66 + +// Hardware cursor +#define ET_HW_CURSOR_SIZE 64 +#define ET_HW_CURSOR_BYTES 1024 + +// Maximum wait iterations +#define ET_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + bool isW32p; // W32p has more features than W32/W32i +} 
TsengPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void etBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool etDetect(AccelDriverT *drv); +static void etHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool etInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void etMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void etRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void etSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void etShowCursor(AccelDriverT *drv, bool visible); +static void etShutdown(AccelDriverT *drv); +static void etUnlockRegs(void); +static void etWaitIdle(AccelDriverT *drv); + +// ============================================================ +// Driver instance +// ============================================================ + +static TsengPrivateT sTsengPrivate; + +static AccelDriverT sTsengDriver = { + .name = "Tseng ET4000/W32p", + .chipFamily = "tseng", + .caps = 0, + .privData = &sTsengPrivate, + .detect = etDetect, + .init = etInit, + .shutdown = etShutdown, + .waitIdle = etWaitIdle, + .setClip = NULL, // W32 has no hardware scissors + .rectFill = etRectFill, + .rectFillPat = NULL, + .bitBlt = etBitBlt, + .hostBlit = etHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, // Line draw is complex on W32, omit for now + .setCursor = etSetCursor, + .moveCursor = etMoveCursor, + .showCursor = etShowCursor, +}; + +// ============================================================ +// etRegisterDriver +// ============================================================ + +void etRegisterDriver(void) { + accelRegisterDriver(&sTsengDriver); +} + + +// 
============================================================ +// etBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT using the ACL engine. Source and +// destination are linear byte addresses in VRAM. Direction is +// controlled to handle overlapping regions. + +static void etBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + uint32_t srcAddr = srcY * pitch + srcX * bpp; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + + uint8_t direction = ET_DIR_X_POS | ET_DIR_Y_POS; + + if (dstAddr > srcAddr) { + direction = ET_DIR_X_NEG | ET_DIR_Y_NEG; + srcAddr += (h - 1) * pitch + (w - 1) * bpp; + dstAddr += (h - 1) * pitch + (w - 1) * bpp; + } + + int32_t widthBytes = w * bpp - 1; + + etWaitIdle(drv); + + // Set pixel depth + uint8_t pixDepth = 0; + if (bpp == 2) { pixDepth = 1; } + if (bpp == 4) { pixDepth = 3; } + outportb(ET_ACL_PIXEL_DEPTH, pixDepth); + + // Source routing: VRAM to VRAM + outportb(ET_ACL_ROUTING_CTRL, ET_ROUTE_SRC_VRAM | ET_ROUTE_DST_VRAM); + + // ROP: copy + outportb(ET_ACL_ROP, ET_ROP_COPY); + + // Direction + outportb(ET_ACL_XY_DIR, direction); + + // Source Y offset (pitch) + outportw(ET_ACL_SOURCE_Y_OFF, pitch - 1); + + // Dest Y offset (pitch) + outportw(ET_ACL_DEST_Y_OFF, pitch - 1); + + // X and Y counts + outportw(ET_ACL_X_COUNT, widthBytes); + outportw(ET_ACL_Y_COUNT, h - 1); + + // Source address (24-bit) + outportb(ET_ACL_SOURCE_ADDR, srcAddr & 0xFF); + outportb(ET_ACL_SOURCE_ADDR + 1, (srcAddr >> 8) & 0xFF); + outportb(ET_ACL_SOURCE_ADDR + 2, (srcAddr >> 16) & 0xFF); + + // Destination address (triggers operation) + outportb(ET_ACL_DEST_ADDR, dstAddr & 0xFF); + outportb(ET_ACL_DEST_ADDR + 1, (dstAddr >> 8) & 0xFF); + outportb(ET_ACL_DEST_ADDR + 2, (dstAddr >> 
16) & 0xFF); + + // Start + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); +} + + +// ============================================================ +// etDetect +// ============================================================ + +static bool etDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sTsengDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + switch (drv->pciDev.deviceId) { + case TSENG_W32: + drv->name = "Tseng ET4000/W32"; + priv->isW32p = false; + break; + case TSENG_W32I: + drv->name = "Tseng ET4000/W32i"; + priv->isW32p = false; + break; + case TSENG_W32P_A: + case TSENG_W32P_B: + case TSENG_W32P_C: + case TSENG_W32P_D: + drv->name = "Tseng ET4000/W32p"; + priv->isW32p = true; + break; + default: + drv->name = "Tseng ET4000/W32"; + priv->isW32p = false; + break; + } + + return true; +} + + +// ============================================================ +// etHostBlit +// ============================================================ +// +// CPU-to-screen blit. Transfers pixel data from system memory to +// the framebuffer via the ACL engine. Source routing is set to CPU +// and data is fed as 32-bit dwords through ET_ACL_CPU_DATA. Each +// row of source pixels is packed into dwords with padding to a +// 4-byte boundary. 
+ +static void etHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + int32_t widthBytes = w * bpp - 1; + int32_t rowBytes = w * bpp; + int32_t padBytesPerRow = (rowBytes + 3) & ~3; + int32_t dwordsPerRow = padBytesPerRow / 4; + + etWaitIdle(drv); + + // Set pixel depth + uint8_t pixDepth = 0; + if (bpp == 2) { pixDepth = 1; } + if (bpp == 4) { pixDepth = 3; } + outportb(ET_ACL_PIXEL_DEPTH, pixDepth); + + // Routing: source from CPU, destination to VRAM + outportb(ET_ACL_ROUTING_CTRL, ET_ROUTE_SRC_CPU | ET_ROUTE_DST_VRAM); + + // ROP: copy + outportb(ET_ACL_ROP, ET_ROP_COPY); + + // Direction: forward + outportb(ET_ACL_XY_DIR, ET_DIR_X_POS | ET_DIR_Y_POS); + + // Dest Y offset (pitch) + outportw(ET_ACL_DEST_Y_OFF, pitch - 1); + + // X and Y counts + outportw(ET_ACL_X_COUNT, widthBytes); + outportw(ET_ACL_Y_COUNT, h - 1); + + // Destination address + outportb(ET_ACL_DEST_ADDR, dstAddr & 0xFF); + outportb(ET_ACL_DEST_ADDR + 1, (dstAddr >> 8) & 0xFF); + outportb(ET_ACL_DEST_ADDR + 2, (dstAddr >> 16) & 0xFF); + + // Start + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); + + // Feed pixel data as dwords, row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t d = 0; d < dwordsPerRow; d++) { + int32_t base = d * 4; + uint32_t dword = 0; + + for (int32_t b = 0; b < 4; b++) { + int32_t idx = base + b; + uint8_t byte = (idx < rowBytes) ? 
rowData[idx] : 0; + dword |= (uint32_t)byte << (b * 8); + } + + outportl(ET_ACL_CPU_DATA, dword); + } + } +} + + +// ============================================================ +// etInit +// ============================================================ + +static bool etInit(AccelDriverT *drv, const AccelModeRequestT *req) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + // Get LFB from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + + // Unlock Tseng extended registers + etUnlockRegs(); + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + DpmiMappingT lfbMap; + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &lfbMap)) { + vgaRestoreTextMode(); + return false; + } + + // Fill in driver mode info + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = lfbMap.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Re-unlock after mode set + etUnlockRegs(); + + // Reset the ACL engine + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_TERMINATE); + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); + + // Set up cursor at end of VRAM (W32p only) + if (priv->isW32p) { + priv->cursorOffset = priv->vramSize - ET_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(ET_HW_CURSOR_BYTES - 1); + } + + drv->caps = ACAP_RECT_FILL | ACAP_BITBLT | ACAP_HOST_BLIT; + + if (priv->isW32p) { + drv->caps |= ACAP_HW_CURSOR; + } + + etWaitIdle(drv); + return true; +} + + +// ============================================================ +// etMoveCursor +// 
============================================================ +// +// The W32p hardware cursor position is set through CRTC extended +// registers (IMA port area). Cursor X is at CRTC index 0x40/0x41, +// cursor Y at 0x42/0x43. + +static void etMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + // ET4000/W32p cursor position registers + outportb(0x217A, 0xE0); // cursor X low + outportb(0x217B, x & 0xFF); + outportb(0x217A, 0xE1); // cursor X high + outportb(0x217B, (x >> 8) & 0x07); + outportb(0x217A, 0xE2); // cursor Y low + outportb(0x217B, y & 0xFF); + outportb(0x217A, 0xE3); // cursor Y high + outportb(0x217B, (y >> 8) & 0x07); +} + + +// ============================================================ +// etRectFill +// ============================================================ +// +// Solid fill using the ACL engine. We write a single pixel of +// the fill color to an offscreen VRAM location and use it as +// the "source" for a replicated blit. 
+
+// drv   : driver instance; privData supplies bytesPerPixel,
+//         screenPitch and vramSize, mode.framebuffer the VRAM pointer
+// x, y  : top-left corner in pixels
+// w, h  : size in pixels; nothing is drawn when either is <= 0
+// color : fill color, stored low byte first
+static void etRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
+    TsengPrivateT *priv = (TsengPrivateT *)drv->privData;
+
+    if (w <= 0 || h <= 0) {
+        return;
+    }
+
+    int32_t bpp = priv->bytesPerPixel;
+    int32_t pitch = priv->screenPitch;
+
+    // The pattern source pixel is parked 64 bytes before the end of VRAM.
+    // NOTE(review): assumes nothing else (e.g. the cursor image at
+    // priv->cursorOffset) occupies those bytes -- confirm.
+    uint32_t patAddr = priv->vramSize - 64; // safe offscreen area
+    uint8_t *fb = drv->mode.framebuffer;
+
+    // The engine must be idle before its registers are reprogrammed
+    etWaitIdle(drv);
+
+    // Write pattern pixel(s) to VRAM, least significant byte first
+    for (int32_t i = 0; i < bpp; i++) {
+        fb[patAddr + i] = (color >> (i * 8)) & 0xFF;
+    }
+
+    // Byte offset of the first destination pixel; the X count is
+    // programmed in bytes minus one
+    uint32_t dstAddr = y * pitch + x * bpp;
+    int32_t widthBytes = w * bpp - 1;
+
+    // Set pixel depth: 0 for 1 byte/pixel, 1 for 2, 3 for 4.
+    // NOTE(review): 3 bytes/pixel falls through as 0 -- confirm that
+    // 24bpp modes never reach this function.
+    uint8_t pixDepth = 0;
+    if (bpp == 2) { pixDepth = 1; }
+    if (bpp == 4) { pixDepth = 3; }
+    outportb(ET_ACL_PIXEL_DEPTH, pixDepth);
+
+    // Routing: pattern fill
+    outportb(ET_ACL_ROUTING_CTRL, ET_ROUTE_SRC_PATTERN | ET_ROUTE_DST_VRAM);
+
+    // ROP: pattern copy
+    outportb(ET_ACL_ROP, ET_ROP_PAT_COPY);
+
+    // Direction: forward
+    outportb(ET_ACL_XY_DIR, ET_DIR_X_POS | ET_DIR_Y_POS);
+
+    // Pattern address (24 bits, low byte first) and Y offset
+    outportb(ET_ACL_PATTERN_ADDR, patAddr & 0xFF);
+    outportb(ET_ACL_PATTERN_ADDR + 1, (patAddr >> 8) & 0xFF);
+    outportb(ET_ACL_PATTERN_ADDR + 2, (patAddr >> 16) & 0xFF);
+    outportw(ET_ACL_PATTERN_Y_OFF, 0); // single-line pattern
+
+    // Dest Y offset (pitch minus one)
+    outportw(ET_ACL_DEST_Y_OFF, pitch - 1);
+
+    // Dimensions (both programmed as count minus one)
+    outportw(ET_ACL_X_COUNT, widthBytes);
+    outportw(ET_ACL_Y_COUNT, h - 1);
+
+    // Destination address (24 bits, low byte first).
+    // NOTE(review): this write was originally described as the one
+    // that triggers the operation, yet an explicit start follows
+    // below -- confirm which of the two actually starts the engine.
+    outportb(ET_ACL_DEST_ADDR, dstAddr & 0xFF);
+    outportb(ET_ACL_DEST_ADDR + 1, (dstAddr >> 8) & 0xFF);
+    outportb(ET_ACL_DEST_ADDR + 2, (dstAddr >> 16) & 0xFF);
+
+    // Start
+    outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START);
+}
+
+
+// ============================================================
+// etSetCursor
+// ============================================================
+//
+// Uploads a cursor image into VRAM at priv->cursorOffset and
+// points the cursor address registers at it. Does nothing unless
+// the chip is a W32p. A NULL image hides the cursor instead.
+//
+// Each of the ET_HW_CURSOR_SIZE rows takes 16 bytes in VRAM:
+// 8 bytes of AND mask followed by 8 bytes of XOR mask. The source
+// masks are read with a stride of 8 bytes per row.
+
+static void etSetCursor(AccelDriverT *drv, const HwCursorImageT *image) {
+    TsengPrivateT *priv = (TsengPrivateT *)drv->privData;
+
+    if (!priv->isW32p) {
+        return;
+    }
+
+    if (!image) {
+        etShowCursor(drv, false);
+        return;
+    }
+
+    etWaitIdle(drv);
+
+    uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset;
+
+    for (int32_t row = 0; row < ET_HW_CURSOR_SIZE; row++) {
+        for (int32_t byte = 0; byte < 8; byte++) {
+            int32_t srcIdx = row * 8 + byte;
+            uint8_t andByte;
+            uint8_t xorByte;
+
+            // Bytes outside the supplied image get AND=0xFF / XOR=0x00
+            // (presumably "transparent" -- confirm against the chip docs)
+            if (row < image->height && byte < (image->width + 7) / 8) {
+                andByte = image->andMask[srcIdx];
+                xorByte = image->xorMask[srcIdx];
+            } else {
+                andByte = 0xFF;
+                xorByte = 0x00;
+            }
+
+            cursorMem[row * 16 + byte] = andByte;
+            cursorMem[row * 16 + byte + 8] = xorByte;
+        }
+    }
+
+    // Set cursor address via IMA registers 0xE8..0xEA (20 bits)
+    uint32_t cursorAddr = priv->cursorOffset / 4; // in dword units
+    outportb(0x217A, 0xE8);
+    outportb(0x217B, cursorAddr & 0xFF);
+    outportb(0x217A, 0xE9);
+    outportb(0x217B, (cursorAddr >> 8) & 0xFF);
+    outportb(0x217A, 0xEA);
+    outportb(0x217B, (cursorAddr >> 16) & 0x0F);
+}
+
+
+// ============================================================
+// etShowCursor
+// ============================================================
+//
+// Shows or hides the hardware cursor by read-modify-writing bit 7
+// of IMA register 0xF7. Does nothing unless the chip is a W32p.
+
+static void etShowCursor(AccelDriverT *drv, bool visible) {
+    TsengPrivateT *priv = (TsengPrivateT *)drv->privData;
+
+    if (!priv->isW32p) {
+        return;
+    }
+
+    // Cursor control via IMA register 0xF7
+    outportb(0x217A, 0xF7);
+    uint8_t val = inportb(0x217B);
+
+    if (visible) {
+        val |= 0x80;
+    } else {
+        val &= ~0x80;
+    }
+
+    // Re-select the index before writing the updated value back
+    outportb(0x217A, 0xF7);
+    outportb(0x217B, val);
+}
+
+
+// ============================================================
+// etShutdown
+// ============================================================
+//
+// Hides the cursor, terminates the ACL engine, returns to text
+// mode and disables near pointers, in that order.
+
+static void etShutdown(AccelDriverT *drv) {
+    etShowCursor(drv, false);
+    outportb(ET_ACL_SUSPEND_TERM, ET_ACL_TERMINATE);
+    vgaRestoreTextMode();
+    __djgpp_nearptr_disable();
+}
+
+
+// 
============================================================
+// etUnlockRegs
+// ============================================================
+//
+// Unlock Tseng extended registers.
+// ET4000: the "key" is two writes, 0x03 to port 0x3BF followed by
+// 0xA0 to port 0x3D8.
+// This enables access to extended CRTC and attribute registers.
+
+static void etUnlockRegs(void) {
+    outportb(0x3BF, 0x03);
+    outportb(0x3D8, 0xA0);
+}
+
+
+// ============================================================
+// etWaitIdle
+// ============================================================
+//
+// Wait for the ACL engine to finish. Poll the operation state
+// register for the busy bit to clear. Gives up silently after
+// ET_MAX_IDLE_WAIT polls, so the caller cannot tell a timeout
+// from an idle engine.
+
+static void etWaitIdle(AccelDriverT *drv) {
+    (void)drv;
+
+    for (int32_t i = 0; i < ET_MAX_IDLE_WAIT; i++) {
+        if (!(inportb(ET_ACL_OPERATION_STATE) & ET_ACCEL_BUSY)) {
+            return;
+        }
+    }
+}
diff --git a/vgaCommon.c b/vgaCommon.c
new file mode 100644
index 0000000..3f8e8f2
--- /dev/null
+++ b/vgaCommon.c
@@ -0,0 +1,505 @@
+// vgaCommon.c -- Shared VGA register programming
+//
+// Implements read/write access to the five standard VGA register
+// groups. These are used by all chip-specific drivers for basic
+// mode setup before enabling acceleration.
+//
+// Important timing note: on real hardware, some registers require
+// specific sequencing (e.g. attribute controller must be reset via
+// a read of Input Status 1 before writing the index). These
+// functions handle the sequencing internally.
+
+#include "vgaCommon.h"
+#include "pci.h"
+
+// NOTE(review): the system header names were missing from this copy
+// of the patch. The seven below are inferred from the symbols this
+// file uses (__dpmi_*, __tb/_dos_ds, inportb/outportb, fprintf,
+// memset, _farpeek*/_farpoke*, __djgpp_nearptr_*) -- confirm.
+#include <dpmi.h>
+#include <go32.h>
+#include <pc.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/farptr.h>
+#include <sys/nearptr.h>
+
+// VESA mode scoring weights (same as DVX)
+#define MODE_SCORE_16BPP     100
+#define MODE_SCORE_15BPP     90
+#define MODE_SCORE_32BPP     85
+#define MODE_SCORE_8BPP      70
+#define MODE_SCORE_PREF_BPP  20
+#define MODE_SCORE_EXACT_RES 10
+
+// ============================================================
+// Prototypes
+// ============================================================
+
+bool     dpmiMapFramebuffer(uint32_t physAddr, uint32_t size, DpmiMappingT *mapping);
+void     dpmiUnmapFramebuffer(DpmiMappingT *mapping);
+uint32_t pciSizeBar(uint8_t bus, uint8_t dev, uint8_t func, uint8_t barReg);
+uint8_t  vgaAttrRead(uint8_t index);
+void     vgaAttrReset(void);
+void     vgaAttrWrite(uint8_t index, uint8_t val);
+void     vgaBlankScreen(bool blank);
+uint8_t  vgaCrtcRead(uint8_t index);
+void     vgaCrtcLock(void);
+void     vgaCrtcUnlock(void);
+void     vgaCrtcWrite(uint8_t index, uint8_t val);
+void     vgaDacReadColor(uint8_t index, uint8_t *r, uint8_t *g, uint8_t *b);
+void     vgaDacWriteColor(uint8_t index, uint8_t r, uint8_t g, uint8_t b);
+uint8_t  vgaGfxRead(uint8_t index);
+void     vgaGfxWrite(uint8_t index, uint8_t val);
+uint8_t  vgaMiscRead(void);
+void     vgaMiscWrite(uint8_t val);
+void     vgaRestoreTextMode(void);
+uint8_t  vgaSeqRead(uint8_t index);
+void     vgaSeqWrite(uint8_t index, uint8_t val);
+bool     vesaFindAndSetMode(int32_t reqW, int32_t reqH, int32_t reqBpp, VesaModeResultT *result);
+void     vgaWaitVRetrace(void);
+
+// ============================================================
+// dpmiMapFramebuffer
+// ============================================================
+//
+// Maps a physical address region into the DJGPP near pointer
+// address space via DPMI. This is the three-step process that
+// every driver needs:
+// 1. Map physical address to linear address
+// 2. Lock the pages to prevent swapping
+// 3. 
Enable near pointers for direct C pointer access
+//
+// Returns true on success. On failure, mapping->ptr is NULL and
+// any physical mapping or page lock made along the way has been
+// released again.
+
+bool dpmiMapFramebuffer(uint32_t physAddr, uint32_t size, DpmiMappingT *mapping) {
+    __dpmi_meminfo info;
+
+    memset(mapping, 0, sizeof(*mapping));
+
+    memset(&info, 0, sizeof(info));
+    info.address = physAddr;
+    info.size    = size;
+
+    if (__dpmi_physical_address_mapping(&info) != 0) {
+        fprintf(stderr, "dpmiMap: Failed to map 0x%08lX (%lu bytes)\n",
+                (unsigned long)physAddr, (unsigned long)size);
+        return false;
+    }
+
+    // info.address now holds the linear address picked by the host.
+    // Locking stays best-effort: a refusal is not treated as fatal.
+    __dpmi_meminfo lockInfo;
+    memset(&lockInfo, 0, sizeof(lockInfo));
+    lockInfo.address = info.address;
+    lockInfo.size    = size;
+    __dpmi_lock_linear_region(&lockInfo);
+
+    if (__djgpp_nearptr_enable() == 0) {
+        fprintf(stderr, "dpmiMap: Failed to enable near pointers\n");
+        // The caller gets nothing back, so do not leak the mapping
+        __dpmi_unlock_linear_region(&lockInfo);
+        __dpmi_free_physical_address_mapping(&info);
+        return false;
+    }
+
+    mapping->ptr        = (uint8_t *)(info.address + __djgpp_conventional_base);
+    mapping->linearAddr = info.address;
+    mapping->size       = size;
+
+    return true;
+}
+
+
+// ============================================================
+// dpmiUnmapFramebuffer
+// ============================================================
+//
+// Releases what dpmiMapFramebuffer set up: near pointers are
+// disabled, the pages are unlocked and the physical mapping is
+// freed. The struct is zeroed afterwards, so calling this twice
+// (or on a zeroed struct) is harmless. The DPMI results are
+// ignored; there is nothing useful to do if the host refuses.
+//
+// NOTE(review): near pointers are a process-wide switch. If a
+// driver keeps a second mapping alive (e.g. MMIO), its pointer
+// stops being usable after this call -- confirm callers unmap
+// everything together.
+
+void dpmiUnmapFramebuffer(DpmiMappingT *mapping) {
+    if (!mapping || !mapping->ptr) {
+        return;
+    }
+
+    __djgpp_nearptr_disable();
+
+    __dpmi_meminfo info;
+    memset(&info, 0, sizeof(info));
+    info.address = mapping->linearAddr;
+    info.size    = mapping->size;
+    __dpmi_unlock_linear_region(&info);
+    __dpmi_free_physical_address_mapping(&info);
+
+    memset(mapping, 0, sizeof(*mapping));
+}
+
+
+// ============================================================
+// pciSizeBar
+// ============================================================
+//
+// Determines the size of a PCI BAR by writing all 1s and reading
+// back the mask. Saves and restores the original BAR value.
+
+// bus/dev/func : PCI address of the device
+// barReg       : config-space offset of the BAR to size
+// Returns the decoded size in bytes, or 0 when no address bit of
+// the BAR is writable. Memory and I/O BARs are both handled; only
+// the low dword of a 64-bit memory BAR is looked at.
+uint32_t pciSizeBar(uint8_t bus, uint8_t dev, uint8_t func, uint8_t barReg) {
+    uint32_t saved = pciRead32(bus, dev, func, barReg);
+
+    pciWrite32(bus, dev, func, barReg, 0xFFFFFFFF);
+    uint32_t mask = pciRead32(bus, dev, func, barReg);
+    pciWrite32(bus, dev, func, barReg, saved);
+
+    if (saved & 0x01) {
+        // I/O space BAR: only bits 1:0 are type bits
+        mask &= 0xFFFFFFFC;
+
+        if (mask == 0) {
+            return 0;
+        }
+
+        // A device may hardwire the upper 16 address bits to zero;
+        // treat them as set so the size does not come out huge
+        if ((mask & 0xFFFF0000) == 0) {
+            mask |= 0xFFFF0000;
+        }
+
+        return (~mask) + 1;
+    }
+
+    // Memory BAR. Decode: invert the writable bits, add 1
+    mask &= 0xFFFFFFF0; // mask off type bits
+
+    if (mask == 0) {
+        return 0;
+    }
+
+    return (~mask) + 1;
+}
+
+
+// ============================================================
+// vesaFindAndSetMode
+// ============================================================
+//
+// Enumerates VESA VBE modes, scores them against the requested
+// resolution and bpp, sets the best match with LFB enabled, and
+// returns the mode details. This replaces ~150 lines of identical
+// code in every driver.
+//
+// A mode is only considered when the BIOS reports it as supported
+// by the hardware, it is a graphics mode with a linear framebuffer,
+// it is at least reqW x reqH, it is 8/15/16/32 bpp, and it comes
+// with a non-zero framebuffer address and pitch. Among equal
+// scores the first mode in the BIOS list wins.
+//
+// Returns true once the mode has been set. *result is zeroed on
+// entry and describes the chosen mode.
+
+bool vesaFindAndSetMode(int32_t reqW, int32_t reqH, int32_t reqBpp, VesaModeResultT *result) {
+    __dpmi_regs r;
+
+    memset(result, 0, sizeof(*result));
+
+    // Get VBE controller info ("VBE2" asks for the VBE 2.0+ block)
+    _farpokeb(_dos_ds, __tb + 0, 'V');
+    _farpokeb(_dos_ds, __tb + 1, 'B');
+    _farpokeb(_dos_ds, __tb + 2, 'E');
+    _farpokeb(_dos_ds, __tb + 3, '2');
+
+    memset(&r, 0, sizeof(r));
+    r.x.ax = 0x4F00;
+    r.x.es = __tb >> 4;
+    r.x.di = __tb & 0x0F;
+    __dpmi_int(0x10, &r);
+
+    if (r.x.ax != 0x004F) {
+        fprintf(stderr, "vesaFindAndSetMode: VBE not available\n");
+        return false;
+    }
+
+    // Copy mode list before 4F01h overwrites __tb
+    uint16_t modeListOff  = _farpeekw(_dos_ds, __tb + 14);
+    uint16_t modeListSeg  = _farpeekw(_dos_ds, __tb + 16);
+    uint32_t modeListAddr = ((uint32_t)modeListSeg << 4) + modeListOff;
+
+    uint16_t modes[256];
+    int32_t  modeCount = 0;
+
+    for (int32_t i = 0; i < 256; i++) {
+        uint16_t mode = _farpeekw(_dos_ds, modeListAddr + i * 2);
+        if (mode == 0xFFFF) {
+            break;
+        }
+        modes[modeCount++] = mode;
+    }
+
+    // Score each mode and find the best
+    uint16_t bestMode  = 0;
+    int32_t  bestScore = -1;
+
+    for (int32_t i = 0; i < modeCount; i++) {
+        memset(&r, 0, sizeof(r));
+        r.x.ax = 0x4F01;
+        r.x.cx = modes[i];
+        r.x.es = __tb >> 4;
+        r.x.di = __tb & 0x0F;
+        __dpmi_int(0x10, &r);
+
+        if (r.x.ax != 0x004F) {
+            continue;
+        }
+
+        uint16_t attr  = _farpeekw(_dos_ds, __tb + 0);
+        int32_t  w     = _farpeekw(_dos_ds, __tb + 18);
+        int32_t  h     = _farpeekw(_dos_ds, __tb + 20);
+        int32_t  bpp   = _farpeekb(_dos_ds, __tb + 25);
+        int32_t  pitch = _farpeekw(_dos_ds, __tb + 16);
+        uint32_t phys  = _farpeekl(_dos_ds, __tb + 40);
+
+        // Bit 0: the BIOS lists modes the installed hardware cannot
+        // actually run; those must never be selected
+        if (!(attr & 0x0001)) {
+            continue;
+        }
+
+        // Must have LFB and be a graphics mode
+        if (!(attr & 0x0080) || !(attr & 0x0010)) {
+            continue;
+        }
+
+        // An LFB mode without an aperture address or pitch cannot
+        // be mapped or drawn to
+        if (phys == 0 || pitch <= 0) {
+            continue;
+        }
+
+        // Must meet requested resolution
+        if (w < reqW || h < reqH) {
+            continue;
+        }
+
+        // Only 8/15/16/32 bpp
+        if (bpp != 8 && bpp != 15 && bpp != 16 && bpp != 32) {
+            continue;
+        }
+
+        int32_t score = 0;
+
+        if (bpp == 16)      { score = MODE_SCORE_16BPP; }
+        else if (bpp == 15) { score = MODE_SCORE_15BPP; }
+        else if (bpp == 32) { score = MODE_SCORE_32BPP; }
+        else                { score = MODE_SCORE_8BPP; }
+
+        if (bpp == reqBpp)          { score += MODE_SCORE_PREF_BPP; }
+        if (w == reqW && h == reqH) { score += MODE_SCORE_EXACT_RES; }
+
+        if (score > bestScore) {
+            bestScore           = score;
+            bestMode            = modes[i];
+            result->width       = w;
+            result->height      = h;
+            result->bpp         = bpp;
+            result->pitch       = pitch;
+            result->lfbPhysAddr = phys;
+        }
+    }
+
+    if (bestScore < 0) {
+        fprintf(stderr, "vesaFindAndSetMode: No suitable mode for %ldx%ldx%ld\n",
+                (long)reqW, (long)reqH, (long)reqBpp);
+        return false;
+    }
+
+    // Set the mode with LFB enabled (bit 14)
+    memset(&r, 0, sizeof(r));
+    r.x.ax = 0x4F02;
+    r.x.bx = bestMode | 0x4000; // bit 14 = enable LFB
+    __dpmi_int(0x10, &r);
+
+    if (r.x.ax != 0x004F) {
+        fprintf(stderr, "vesaFindAndSetMode: Failed to set mode 0x%04X\n", bestMode);
+        return false;
+    }
+
+    return true;
+}
+
+
+// ============================================================
+// vgaAttrRead
+// ============================================================
+//
+// The attribute controller is unusual: reading Input Status 1
+// resets its flip-flop 
so the next write to 0x3C0 is treated as
+// an index (not data). We must reset before every access.
+//
+// Selecting a register with index bit 5 clear blanks the display,
+// so once the value has been read the index is rewritten as 0x20
+// (bit 5 set) and the flip-flop is left in its index state.
+
+uint8_t vgaAttrRead(uint8_t index) {
+    inportb(VGA_INPUT_STATUS_1);
+    outportb(VGA_ATTR_INDEX, index);
+    uint8_t val = inportb(VGA_ATTR_DATA_R);
+
+    // Hand the palette back to the display
+    inportb(VGA_INPUT_STATUS_1);
+    outportb(VGA_ATTR_INDEX, 0x20);
+    inportb(VGA_INPUT_STATUS_1);
+
+    return val;
+}
+
+
+// ============================================================
+// vgaAttrReset
+// ============================================================
+//
+// Resets the attribute controller flip-flop by reading Input
+// Status 1. After this, the next write to 0x3C0 is an index write.
+
+void vgaAttrReset(void) {
+    inportb(VGA_INPUT_STATUS_1);
+}
+
+
+// ============================================================
+// vgaAttrWrite
+// ============================================================
+//
+// Writes to the attribute controller. The flip-flop mechanism
+// means we must: (1) read Input Status 1 to reset, (2) write
+// the index to 0x3C0, (3) write the data to 0x3C0.
+// Bit 5 of the index byte must be set to keep the palette
+// address source enabled (otherwise the screen goes black).
+// The caller's index is written as given, so palette registers
+// can still be reached with bit 5 clear; afterwards the index is
+// rewritten as 0x20 so the screen is never left black, and the
+// flip-flop is returned to its index state.
+
+void vgaAttrWrite(uint8_t index, uint8_t val) {
+    inportb(VGA_INPUT_STATUS_1);
+    outportb(VGA_ATTR_INDEX, index);
+    outportb(VGA_ATTR_DATA_W, val);
+
+    // Re-enable the palette address source (index bit 5)
+    inportb(VGA_INPUT_STATUS_1);
+    outportb(VGA_ATTR_INDEX, 0x20);
+    inportb(VGA_INPUT_STATUS_1);
+}
+
+
+// ============================================================
+// vgaBlankScreen
+// ============================================================
+//
+// Toggles the screen on/off by setting bit 5 of the sequencer
+// clocking mode register. Blanking prevents visible garbage
+// during mode transitions. The other bits of the register are
+// preserved (read-modify-write).
+
+void vgaBlankScreen(bool blank) {
+    uint8_t val = vgaSeqRead(VGA_SEQ_CLOCK_MODE);
+
+    if (blank) {
+        val |= VGA_SEQ_SCREEN_OFF;
+    } else {
+        val &= ~VGA_SEQ_SCREEN_OFF;
+    }
+
+    vgaSeqWrite(VGA_SEQ_CLOCK_MODE, val);
+}
+
+
+// ============================================================
+// vgaCrtcLock
+// ============================================================
+//
+// Re-enables CRTC write protection by setting bit 7 of the
+// vertical sync end register.
+
+void vgaCrtcLock(void) {
+    uint8_t vsyncEnd = vgaCrtcRead(VGA_CRTC_V_SYNC_END);
+
+    // Bit 7 is the protect bit; every other bit is written back as read
+    vsyncEnd |= 0x80;
+    vgaCrtcWrite(VGA_CRTC_V_SYNC_END, vsyncEnd);
+}
+
+
+// ============================================================
+// vgaCrtcRead
+// ============================================================
+//
+// Selects a CRTC register through the index port and returns the
+// byte read from the data port.
+
+uint8_t vgaCrtcRead(uint8_t index) {
+    uint8_t regVal;
+
+    outportb(VGA_CRTC_INDEX, index);
+    regVal = inportb(VGA_CRTC_DATA);
+
+    return regVal;
+}
+
+
+// ============================================================
+// vgaCrtcUnlock
+// ============================================================
+//
+// Lifts CRTC write protection. CRTC registers 0x00-0x07 are
+// guarded by bit 7 of the vertical sync end register (0x11);
+// with that bit cleared they become writable.
+
+void vgaCrtcUnlock(void) {
+    uint8_t vsyncEnd = vgaCrtcRead(VGA_CRTC_V_SYNC_END);
+
+    // Drop only the protect bit
+    vsyncEnd &= 0x7F;
+    vgaCrtcWrite(VGA_CRTC_V_SYNC_END, vsyncEnd);
+}
+
+
+// ============================================================
+// vgaCrtcWrite
+// ============================================================
+//
+// Selects a CRTC register through the index port, then stores
+// val through the data port.
+
+void vgaCrtcWrite(uint8_t index, uint8_t val) {
+    outportb(VGA_CRTC_INDEX, index);
+    outportb(VGA_CRTC_DATA, val);
+}
+
+
+// ============================================================
+// vgaDacReadColor
+// ============================================================
+//
+// Fetches one DAC palette entry: the index goes to 0x3C7, then
+// three consecutive reads of 0x3C9 deliver R, G and B in that
+// order. DAC values are 6-bit (0-63) on standard VGA, 8-bit on
+// some SVGA cards.
+
+void vgaDacReadColor(uint8_t index, uint8_t *r, uint8_t *g, uint8_t *b) {
+    uint8_t rgb[3];
+
+    outportb(VGA_DAC_READ_ADDR, index);
+
+    // The data port steps through the components on its own
+    for (int32_t c = 0; c < 3; c++) {
+        rgb[c] = inportb(VGA_DAC_DATA);
+    }
+
+    *r = rgb[0];
+    *g = rgb[1];
+    *b = rgb[2];
+}
+
+
+// ============================================================
+// vgaDacWriteColor
+// ============================================================
+//
+// Write one DAC palette entry. Write the starting index to 0x3C8,
+// then write three bytes (R, G, B) to 0x3C9.
+
+void vgaDacWriteColor(uint8_t index, uint8_t r, uint8_t g, uint8_t b) {
+    const uint8_t rgb[3] = { r, g, b };
+
+    outportb(VGA_DAC_WRITE_ADDR, index);
+
+    // Components go out in R, G, B order through the same data port
+    for (int32_t c = 0; c < 3; c++) {
+        outportb(VGA_DAC_DATA, rgb[c]);
+    }
+}
+
+
+// ============================================================
+// vgaGfxRead
+// ============================================================
+//
+// Selects a graphics controller register through the index port
+// and returns the byte read from the data port.
+
+uint8_t vgaGfxRead(uint8_t index) {
+    uint8_t regVal;
+
+    outportb(VGA_GFX_INDEX, index);
+    regVal = inportb(VGA_GFX_DATA);
+
+    return regVal;
+}
+
+
+// ============================================================
+// vgaGfxWrite
+// ============================================================
+//
+// Selects a graphics controller register through the index port,
+// then stores val through the data port.
+
+void vgaGfxWrite(uint8_t index, uint8_t val) {
+    outportb(VGA_GFX_INDEX, index);
+    outportb(VGA_GFX_DATA, val);
+}
+
+
+// ============================================================
+// vgaMiscRead
+// ============================================================
+//
+// Returns the miscellaneous output register via its read port.
+
+uint8_t vgaMiscRead(void) {
+    uint8_t misc = inportb(VGA_MISC_OUT_R);
+
+    return misc;
+}
+
+
+// ============================================================
+// vgaMiscWrite
+// ============================================================
+//
+// Stores val into the miscellaneous output register via its
+// write port.
+
+void vgaMiscWrite(uint8_t val) {
+    outportb(VGA_MISC_OUT_W, val);
+}
+
+
+// ============================================================
+// vgaRestoreTextMode
+// ============================================================
+//
+// Restores VGA text mode 3 (80x25, 16 color). Uses INT 10h
+// because manually reprogramming all VGA registers for text mode
+// is error-prone and varies by chipset. The BIOS handles it
+// correctly for all VGA-compatible cards.
+
+void vgaRestoreTextMode(void) {
+    __dpmi_regs regs;
+
+    // AH=00h (set video mode), AL=03h; all other registers zero
+    memset(&regs, 0, sizeof(regs));
+    regs.x.ax = 0x0003;
+    __dpmi_int(0x10, &regs);
+}
+
+
+// ============================================================
+// vgaSeqRead
+// ============================================================
+//
+// Selects a sequencer register through the index port and
+// returns the byte read from the data port.
+
+uint8_t vgaSeqRead(uint8_t index) {
+    uint8_t regVal;
+
+    outportb(VGA_SEQ_INDEX, index);
+    regVal = inportb(VGA_SEQ_DATA);
+
+    return regVal;
+}
+
+
+// ============================================================
+// vgaSeqWrite
+// ============================================================
+//
+// Selects a sequencer register through the index port, then
+// stores val through the data port.
+
+void vgaSeqWrite(uint8_t index, uint8_t val) {
+    outportb(VGA_SEQ_INDEX, index);
+    outportb(VGA_SEQ_DATA, val);
+}
+
+
+// ============================================================
+// vgaWaitVRetrace
+// ============================================================
+//
+// Blocks until the next vertical retrace begins, by polling bit 3
+// of Input Status 1 (port 0x3DA). A retrace that is already under
+// way is allowed to finish first, so the function always returns
+// at the leading edge of a retrace. There is no timeout.
+
+void vgaWaitVRetrace(void) {
+    // Phase 1: sit out a retrace that is already in progress
+    for (;;) {
+        if (!(inportb(VGA_INPUT_STATUS_1) & 0x08)) {
+            break;
+        }
+    }
+
+    // Phase 2: catch the moment the next one starts
+    for (;;) {
+        if (inportb(VGA_INPUT_STATUS_1) & 0x08) {
+            break;
+        }
+    }
+}
diff --git a/vgaCommon.h b/vgaCommon.h
new file mode 100644
index 0000000..8e9ae7a
--- /dev/null
+++ b/vgaCommon.h
@@ -0,0 +1,198 @@
+// vgaCommon.h -- Shared VGA register programming for DOS/DJGPP
+//
+// Provides low-level access to the standard VGA register sets that
+// are common across all VGA-compatible video cards. Every chipset
+// driver needs these for basic mode setup before enabling its
+// chip-specific acceleration extensions.
+//
+// The five standard VGA register groups:
+//   - Miscellaneous Output (0x3C2 write, 0x3CC read)
+//   - Sequencer (0x3C4/0x3C5)
+//   - CRTC (0x3D4/0x3D5 for color, 0x3B4/0x3B5 for mono)
+//   - Graphics Controller (0x3CE/0x3CF)
+//   - Attribute Controller (0x3C0/0x3C1, toggle via 0x3DA read)
+//
+// All functions use DJGPP's inportb/outportb for port I/O.
+#ifndef VGA_COMMON_H
+#define VGA_COMMON_H
+
+// NOTE(review): the header names were missing from this copy of the
+// patch. These two are inferred from the bool and fixed-width
+// integer types used below -- confirm.
+#include <stdbool.h>
+#include <stdint.h>
+
+// ============================================================
+// VGA I/O port addresses
+// ============================================================
+
+// Miscellaneous output register
+#define VGA_MISC_OUT_W       0x3C2 // write
+#define VGA_MISC_OUT_R       0x3CC // read
+
+// Input status registers
+#define VGA_INPUT_STATUS_0   0x3C2
+#define VGA_INPUT_STATUS_1   0x3DA // color mode
+#define VGA_INPUT_STATUS_1M  0x3BA // mono mode
+
+// Sequencer
+#define VGA_SEQ_INDEX        0x3C4
+#define VGA_SEQ_DATA         0x3C5
+
+// CRTC (color mode addresses -- we always use color)
+#define VGA_CRTC_INDEX       0x3D4
+#define VGA_CRTC_DATA        0x3D5
+
+// Graphics Controller
+#define VGA_GFX_INDEX        0x3CE
+#define VGA_GFX_DATA         0x3CF
+
+// Attribute Controller (index and data share 0x3C0)
+#define VGA_ATTR_INDEX       0x3C0
+#define VGA_ATTR_DATA_W      0x3C0
+#define VGA_ATTR_DATA_R      0x3C1
+
+// DAC (palette)
+#define VGA_DAC_READ_ADDR    0x3C7
+#define VGA_DAC_WRITE_ADDR   0x3C8
+#define VGA_DAC_DATA         0x3C9
+#define VGA_DAC_STATE        0x3C7
+
+// Feature control
+#define VGA_FEATURE_W        0x3DA // write (color mode)
+#define VGA_FEATURE_R        0x3CA // read
+
+// ============================================================
+// Sequencer register indices
+// ============================================================
+
+#define VGA_SEQ_RESET        0x00
+#define VGA_SEQ_CLOCK_MODE   0x01
+#define VGA_SEQ_PLANE_MASK   0x02
+#define VGA_SEQ_CHAR_MAP     0x03
+#define VGA_SEQ_MEM_MODE     0x04
+
+// Sequencer clock mode bits
+#define VGA_SEQ_SCREEN_OFF   0x20 // bit 5: blank the screen
+
+// 
============================================================
+// CRTC register indices
+// ============================================================
+
+#define VGA_CRTC_H_TOTAL          0x00
+#define VGA_CRTC_H_DISP_END       0x01
+#define VGA_CRTC_H_BLANK_START    0x02
+#define VGA_CRTC_H_BLANK_END      0x03
+#define VGA_CRTC_H_SYNC_START     0x04
+#define VGA_CRTC_H_SYNC_END       0x05
+#define VGA_CRTC_V_TOTAL          0x06
+#define VGA_CRTC_OVERFLOW         0x07
+#define VGA_CRTC_PRESET_ROW       0x08
+#define VGA_CRTC_MAX_SCAN         0x09
+#define VGA_CRTC_CURSOR_START     0x0A
+#define VGA_CRTC_CURSOR_END       0x0B
+#define VGA_CRTC_START_ADDR_HI    0x0C
+#define VGA_CRTC_START_ADDR_LO    0x0D
+#define VGA_CRTC_CURSOR_HI        0x0E
+#define VGA_CRTC_CURSOR_LO        0x0F
+#define VGA_CRTC_V_SYNC_START     0x10
+// Bit 7 of this register is the CRTC write-protect bit toggled by
+// vgaCrtcLock/vgaCrtcUnlock
+#define VGA_CRTC_V_SYNC_END       0x11
+#define VGA_CRTC_V_DISP_END       0x12
+#define VGA_CRTC_OFFSET           0x13
+#define VGA_CRTC_UNDERLINE        0x14
+#define VGA_CRTC_V_BLANK_START    0x15
+#define VGA_CRTC_V_BLANK_END      0x16
+#define VGA_CRTC_MODE_CTRL        0x17
+#define VGA_CRTC_LINE_COMPARE     0x18
+
+// ============================================================
+// Graphics controller register indices
+// ============================================================
+
+#define VGA_GFX_SET_RESET         0x00
+#define VGA_GFX_ENABLE_SET_RESET  0x01
+#define VGA_GFX_COLOR_COMPARE     0x02
+#define VGA_GFX_DATA_ROTATE       0x03
+#define VGA_GFX_READ_MAP_SEL      0x04
+#define VGA_GFX_MODE              0x05
+#define VGA_GFX_MISC              0x06
+#define VGA_GFX_COLOR_DONT_CARE   0x07
+#define VGA_GFX_BIT_MASK          0x08
+
+// ============================================================
+// VESA mode result (returned by vesaFindAndSetMode)
+// ============================================================
+//
+// Every field is copied from the VBE mode information block of
+// the mode that was selected.
+
+typedef struct {
+    int32_t  width;        // horizontal resolution as reported by VBE
+    int32_t  height;       // vertical resolution as reported by VBE
+    int32_t  bpp;          // bits per pixel: 8, 15, 16 or 32
+    int32_t  pitch;        // scan line pitch as reported by VBE
+    uint32_t lfbPhysAddr;  // physical address of LFB from VBE
+} VesaModeResultT;
+
+// ============================================================
+// DPMI LFB mapping result (returned by dpmiMapFramebuffer)
+// 
============================================================
+
+typedef struct {
+    uint8_t  *ptr;         // near pointer to mapped region (NULL when unmapped)
+    uint32_t linearAddr;   // linear address (for unmapping)
+    uint32_t size;         // mapped size in bytes
+} DpmiMappingT;
+
+// ============================================================
+// Prototypes
+// ============================================================
+
+// Find the best VESA VBE mode matching the requested resolution
+// and bpp, set it with LFB enabled, and return the mode details.
+// Modes smaller than reqW x reqH are never chosen. *result is
+// zeroed first. Returns true on success. This replaces ~150 lines
+// of duplicated code in every driver.
+bool vesaFindAndSetMode(int32_t reqW, int32_t reqH, int32_t reqBpp, VesaModeResultT *result);
+
+// Map a physical address region into the DJGPP near pointer space
+// via DPMI. Handles physical address mapping, page locking, and
+// near pointer enable. *mapping is zeroed first and only filled in
+// on success. Returns true on success.
+bool dpmiMapFramebuffer(uint32_t physAddr, uint32_t size, DpmiMappingT *mapping);
+
+// Unmap a previously mapped framebuffer region and disable near
+// pointers. Safe to call with a zeroed mapping struct.
+void dpmiUnmapFramebuffer(DpmiMappingT *mapping);
+
+// Size a PCI BAR by writing all 1s and reading back. Returns the
+// decoded size in bytes (0 if no address bit is writable). Saves
+// and restores the original BAR value.
+uint32_t pciSizeBar(uint8_t bus, uint8_t dev, uint8_t func, uint8_t barReg);
+
+// Read/write individual VGA register sets. Each indexed access
+// writes the index port first, then uses the data port.
+uint8_t vgaAttrRead(uint8_t index);
+void    vgaAttrReset(void);
+void    vgaAttrWrite(uint8_t index, uint8_t val);
+uint8_t vgaCrtcRead(uint8_t index);
+void    vgaCrtcWrite(uint8_t index, uint8_t val);
+uint8_t vgaGfxRead(uint8_t index);
+void    vgaGfxWrite(uint8_t index, uint8_t val);
+uint8_t vgaMiscRead(void);
+void    vgaMiscWrite(uint8_t val);
+uint8_t vgaSeqRead(uint8_t index);
+void    vgaSeqWrite(uint8_t index, uint8_t val);
+
+// CRTC register protection: some CRTC registers are write-protected
+// by bit 7 of the V_SYNC_END register. These functions unlock/lock.
+void vgaCrtcLock(void);
+void vgaCrtcUnlock(void);
+
+// Palette (DAC) operations. One entry is transferred per call as
+// three bytes in R, G, B order.
+void vgaDacReadColor(uint8_t index, uint8_t *r, uint8_t *g, uint8_t *b);
+void vgaDacWriteColor(uint8_t index, uint8_t r, uint8_t g, uint8_t b);
+
+// Restore VGA text mode (mode 3). Uses INT 10h for reliability
+// across all chipsets.
+void vgaRestoreTextMode(void);
+
+// Wait for vertical retrace. Spins on Input Status 1 bit 3 with no
+// timeout. Useful for timing-sensitive register writes and
+// tear-free updates.
+void vgaWaitVRetrace(void);
+
+// Enable/disable VGA display output by toggling sequencer clocking
+// mode bit 5. Used during mode transitions to prevent screen garbage.
+// Pass true to blank the screen, false to show it again.
+void vgaBlankScreen(bool blank);
+
+#endif // VGA_COMMON_H