commit a20c488959eba1908ca003320c4899af4fb4524a Author: Scott Duensing Date: Mon Apr 13 19:40:45 2026 -0500 Initial commit. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cbcbcb2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.claude/ +obj/ +bin/ +PLAN.md diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9ac4de9 --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ +# Makefile -- DOS Accelerated Video Driver Framework +# +# DJGPP cross-compilation build matching DVX conventions. +# Produces demo.exe as the test application. + +DJGPP_PREFIX = $(HOME)/djgpp/djgpp +CC = $(DJGPP_PREFIX)/bin/i586-pc-msdosdjgpp-gcc +CFLAGS = -O2 -Wall -Wextra -Werror -Wno-type-limits -Wno-sign-compare -Wno-format-truncation -march=i486 -mtune=i586 + +OBJDIR = obj +BINDIR = bin + +# Source files +SRCS = pci.c vgaCommon.c accelVid.c s3Trio.c cirrusGd54.c cirrusLaguna.c atiMach64.c tsengW32.c matroxMga.c banshee.c nvidia.c trident.c sis.c demo.c +OBJS = $(patsubst %.c,$(OBJDIR)/%.o,$(SRCS)) + +TARGET = $(BINDIR)/demo.exe + +.PHONY: all clean + +all: $(TARGET) + +$(TARGET): $(OBJS) | $(BINDIR) + $(CC) $(CFLAGS) -o $@ $(OBJS) + +$(OBJDIR)/%.o: %.c | $(OBJDIR) + $(CC) $(CFLAGS) -c -o $@ $< + +$(OBJDIR): + mkdir -p $(OBJDIR) + +$(BINDIR): + mkdir -p $(BINDIR) + +# Dependencies +$(OBJDIR)/pci.o: pci.c pci.h +$(OBJDIR)/vgaCommon.o: vgaCommon.c vgaCommon.h +$(OBJDIR)/accelVid.o: accelVid.c accelVid.h pci.h +$(OBJDIR)/s3Trio.o: s3Trio.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/cirrusGd54.o: cirrusGd54.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/atiMach64.o: atiMach64.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/tsengW32.o: tsengW32.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/matroxMga.o: matroxMga.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/banshee.o: banshee.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/nvidia.o: nvidia.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/trident.o: trident.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/cirrusLaguna.o: cirrusLaguna.c accelVid.h vgaCommon.h 
pci.h +$(OBJDIR)/sis.o: sis.c accelVid.h vgaCommon.h pci.h +$(OBJDIR)/demo.o: demo.c accelVid.h pci.h + +clean: + rm -rf $(OBJDIR) $(BINDIR) diff --git a/README.md b/README.md new file mode 100644 index 0000000..86fe0b8 --- /dev/null +++ b/README.md @@ -0,0 +1,399 @@ +# DOS Accelerated Video Driver Framework + +Hardware-accelerated 2D video drivers for DOS/DJGPP. Programs the +acceleration engines on PCI video cards directly -- no VESA, no BIOS +calls for rendering. A common API lets applications use acceleration +without knowing which chip is present. + +## Supported Video Cards + +### S3 (s3Trio.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Trio32 | 0x8810 | | +| Trio64 | 0x8811 | MMIO at LFB+16MB | +| Trio64V+ | 0x8814 | MMIO at LFB+16MB | +| ViRGE | 0x5631 | MMIO, 3D engine ignored | +| ViRGE/VX | 0x883D | | +| ViRGE/DX/GX | 0x8A01 | | +| ViRGE/GX2 | 0x8A10 | | +| ViRGE/MX | 0x8C01, 0x8C03 | | +| Savage3D | 0x8A20, 0x8A21 | | +| Savage4 | 0x8A22 | | +| Savage/MX | 0x8C10, 0x8C11 | | +| Savage/IX | 0x8C12, 0x8C13 | | +| Savage 2000 | 0x9102 | | +| Vision864 | 0x88C0, 0x88C1 | I/O only (no MMIO) | +| Vision868 | 0x8880 | I/O only | +| Vision964 | 0x88D0 | I/O only | +| Vision968 | 0x88F0, 0x88F1 | I/O only | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### ATI Mach64 / Rage (atiMach64.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Mach64 GX | 0x4758 | I/O only | +| Mach64 CX | 0x4358 | I/O only | +| Mach64 CT | 0x4354 | MMIO at end of aperture | +| Mach64 ET | 0x4554 | | +| Mach64 VT | 0x5654, 0x5655 | | +| 3D Rage II | 0x4754, 0x4755 | | +| Rage Pro | 0x4750, 0x4752 | | +| Rage 128 | 0x5245, 0x5246, 0x524B, 0x524C | | +| Rage 128 Pro | 0x5046, 0x5052 | | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### Matrox MGA (matroxMga.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Millennium (MGA2064W) | 
0x0519 | Separate MMIO BAR | +| Mystique (MGA1064SG) | 0x051A | | +| G100 | 0x1000, 0x1001 | | +| G200 | 0x0520, 0x0521 | | +| G400 | 0x0525 | | +| G450 | 0x2527 | | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### 3dfx (banshee.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| Banshee | 0x0003 | MMIO + launch area for data | +| Voodoo3 | 0x0005 | | + +Hardware ops: RectFill, PatFill, BitBlt, HostBlit, ColorExpand, +LineDraw, HwCursor, Clip + +### Cirrus Logic GD54xx (cirrusGd54.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| GD5434 | 0x00A0, 0x00A8 | BLT via GR registers | +| GD5436 | 0x00AC | | +| GD5446 | 0x00B8 | | +| GD5480 | 0x00BC | | + +Hardware ops: RectFill, BitBlt, HostBlit, ColorExpand, HwCursor + +### Cirrus Logic Laguna (cirrusLaguna.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| GD5462 | 0x00D0 | MMIO, different engine from GD54xx | +| GD5464 | 0x00D4 | | +| GD5465 | 0x00D6 | | + +Hardware ops: RectFill, BitBlt, HostBlit, ColorExpand, HwCursor, Clip + +### Nvidia RIVA / TNT (nvidia.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| RIVA 128 | 0x0018 | PGRAPH subchannel interface | +| RIVA 128 ZX | 0x0019 | | +| TNT | 0x0020 | | +| TNT2 | 0x0028 | | +| TNT2 Ultra | 0x0029 | | +| TNT2 M64 | 0x002D | | +| Vanta | 0x002C | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor, Clip + +### Tseng ET4000/W32 (tsengW32.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| W32 | 0x3202 | ACL engine via I/O ports | +| W32i | 0x3205 | | +| W32p rev A | 0x3206 | HwCursor on W32p only | +| W32p rev B | 0x3207 | | +| W32p rev C | 0x3208 | | +| W32p rev D | 0x4702 | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor (W32p only) + +### Trident TGUI (trident.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| TGUI9440 | 0x9440 | GER engine via I/O ports | +| TGUI9660 | 0x9660 | | +| TGUI9680 | 0x9680 | | 
+| ProVidia 9685 | 0x9685 | | +| Blade3D | 0x9880 | | +| CyberBlade | 0x9910 | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor + +### SiS (sis.c) + +| Chip | Device ID | Notes | +|------|-----------|-------| +| 6326 | 0x6326 | MMIO queue-based engine | +| 300 | 0x0300 | | +| 305 | 0x0305 | | +| 315 | 0x0315 | | +| 330 | 0x0330 | | + +Hardware ops: RectFill, BitBlt, HostBlit, HwCursor, Clip + +## Capability Matrix + +Operations not implemented in hardware get automatic software fallbacks. +Every function pointer is always callable -- callers never need to +check for NULL. + +| Operation | S3 | ATI | Matrox | 3dfx | CL 54xx | CL Laguna | Nvidia | Tseng | Trident | SiS | +|-----------|:--:|:---:|:------:|:----:|:-------:|:---------:|:------:|:-----:|:-------:|:---:| +| RectFill | HW | HW | HW | HW | HW | HW | HW | HW | HW | HW | +| PatFill | HW | HW | HW | HW | sw | sw | sw | sw | sw | sw | +| BitBlt | HW | HW | HW | HW | HW | HW | HW | HW | HW | HW | +| HostBlit | HW | HW | HW | HW | HW | HW | HW | HW | HW | HW | +| ColorExpand | HW | HW | HW | HW | HW | HW | sw | sw | sw | sw | +| LineDraw | HW | HW | HW | HW | sw | sw | sw | sw | sw | sw | +| HwCursor | HW | HW | HW | HW | HW | HW | HW | W32p | HW | HW | +| Clip | HW | HW | HW | HW | sw | HW | HW | sw | sw | HW | + +HW = hardware accelerated, sw = software fallback + +## API Usage + +### Basic Lifecycle + +```c +#include "accelVid.h" + +// Declare registration functions for the drivers you want +extern void s3RegisterDriver(void); +extern void atiRegisterDriver(void); +// ... etc + +int main(void) { + // 1. Register drivers (order = detection priority) + s3RegisterDriver(); + atiRegisterDriver(); + + // 2. Detect hardware + AccelDriverT *drv = accelDetect(); + if (!drv) { + printf("No supported video card found\n"); + return 1; + } + + // 3. 
Initialize with a video mode + AccelModeRequestT req; + req.width = 640; + req.height = 480; + req.bpp = 16; + + if (!accelInit(drv, &req)) { + printf("Failed to set video mode\n"); + return 1; + } + + // Mode info is now available + printf("Mode: %dx%dx%d pitch=%d\n", + drv->mode.width, drv->mode.height, + drv->mode.bpp, drv->mode.pitch); + + // 4. Draw + drv->rectFill(drv, 0, 0, 640, 480, 0x001F); // blue + drv->waitIdle(drv); + + // 5. Shut down + accelShutdown(drv); + return 0; +} +``` + +### Drawing Operations + +All drawing functions take the driver pointer as the first argument. +Colors are packed in the display's native pixel format. + +```c +// Solid rectangle fill +drv->rectFill(drv, x, y, w, h, color); + +// 8x8 mono pattern fill (1=fg, 0=bg, MSB first, 8 bytes) +uint8_t checkerboard[8] = { + 0xAA, 0x55, 0xAA, 0x55, + 0xAA, 0x55, 0xAA, 0x55 +}; +drv->rectFillPat(drv, x, y, w, h, checkerboard, fgColor, bgColor); + +// Screen-to-screen blit (handles overlapping regions) +drv->bitBlt(drv, srcX, srcY, dstX, dstY, w, h); + +// CPU-to-screen blit (transfer RAM buffer to VRAM) +// srcBuf = packed pixels in display format, srcPitch = byte stride +drv->hostBlit(drv, buffer, pitch, dstX, dstY, w, h); + +// Monochrome color expansion (1bpp -> full color) +// Each 1-bit becomes fg, each 0-bit becomes bg +// srcBuf = packed MSB-first mono bitmap, srcPitch = byte stride +drv->colorExpand(drv, glyphData, 1, dstX, dstY, 8, 16, fg, bg); + +// Bresenham line draw (inclusive endpoints) +drv->lineDraw(drv, x1, y1, x2, y2, color); + +// Hardware clip rectangle +drv->setClip(drv, clipX, clipY, clipW, clipH); +``` + +### Hardware Cursor + +```c +// Define a cursor image (64x64 max, AND/XOR masks) +HwCursorImageT cursor; +cursor.width = 16; +cursor.height = 16; +cursor.hotX = 0; +cursor.hotY = 0; +memset(cursor.andMask, 0xFF, sizeof(cursor.andMask)); // transparent +memset(cursor.xorMask, 0x00, sizeof(cursor.xorMask)); +// ... fill in actual cursor shape ... 
+ +// Upload and enable +drv->setCursor(drv, &cursor); +drv->showCursor(drv, true); + +// Move (call on every mouse poll) +drv->moveCursor(drv, mouseX, mouseY); + +// Hide +drv->showCursor(drv, false); +``` + +### Checking Capabilities + +The `caps` field indicates which operations are hardware-accelerated. +Software fallbacks are always installed, so you can call any operation +regardless of caps. Use caps to make optimization decisions: + +```c +if (drv->caps & ACAP_COLOR_EXPAND) { + // Use color expansion for text -- 16x less bus traffic + drv->colorExpand(drv, glyph, 1, x, y, 8, 16, fg, bg); +} else { + // Software fallback is installed but may be slow -- + // consider pre-rendering text to a RAM buffer instead + drv->colorExpand(drv, glyph, 1, x, y, 8, 16, fg, bg); +} + +if (drv->caps & ACAP_HW_CURSOR) { + // Hardware cursor eliminates cursor dirty rectangles + drv->setCursor(drv, &cursorImage); + drv->showCursor(drv, true); +} +``` + +### Synchronization + +The acceleration engine runs asynchronously. Drawing functions return +immediately after queuing the command. 
Use `waitIdle` before reading +from VRAM or when you need all pending operations to complete: + +```c +drv->rectFill(drv, 0, 0, 100, 100, color1); +drv->rectFill(drv, 50, 50, 100, 100, color2); +drv->bitBlt(drv, 0, 0, 200, 0, 150, 150); + +// Wait for everything to finish before reading VRAM +drv->waitIdle(drv); +uint16_t pixel = *(uint16_t *)(drv->mode.framebuffer + offset); +``` + +### Mode Information + +After `accelInit` succeeds, `drv->mode` contains: + +| Field | Description | +|-------|-------------| +| `width` | Horizontal resolution in pixels | +| `height` | Vertical resolution in pixels | +| `bpp` | Bits per pixel (8, 15, 16, or 32) | +| `pitch` | Bytes per scanline (may exceed width * bpp/8) | +| `framebuffer` | Direct pointer to the linear framebuffer | +| `vramSize` | Total video RAM in bytes | +| `offscreenBase` | Byte offset where offscreen VRAM begins | + +The framebuffer pointer can be used for direct pixel access when +the acceleration engine doesn't offer a suitable operation. + +## Adding a New Driver + +1. Create a new source file (e.g., `newchip.c`) +2. Include `accelVid.h`, `vgaCommon.h`, and `pci.h` +3. Define a static `AccelDriverT` with your function pointers +4. Use shared helpers for boilerplate: + - `vesaFindAndSetMode()` for VESA mode enumeration and setting + - `dpmiMapFramebuffer()` for DPMI physical address mapping + - `pciSizeBar()` for PCI BAR size detection +5. Leave unsupported operations as NULL -- the driver manager + installs software fallbacks automatically +6. Add a registration function: `void newchipRegisterDriver(void)` +7. Add the source file to the Makefile and call the registration + function from `main()` + +See `trident.c` (simplest driver) or `matroxMga.c` (most complete) +as reference implementations. + +## Building + +Requires a DJGPP cross-compiler targeting i586-pc-msdosdjgpp. 
+ +``` +make # build bin/demo.exe +make clean # remove build artifacts +``` + +The Makefile expects the DJGPP toolchain at `$HOME/djgpp/djgpp`. +Override with `make DJGPP_PREFIX=/path/to/djgpp`. + +Compiler flags: `-O2 -Wall -Wextra -Werror -march=i486 -mtune=i586` + +## Testing + +The `test/` directory contains an 86Box configuration for testing +with an emulated S3 Trio64. See `test/README.txt` for setup +instructions. + +``` +demo.exe [width height bpp] +``` + +Default mode: 640x480x16. Controls: SPACE cycles demos, B runs +benchmarks, ESC exits. + +## Project Structure + +``` +accelVid.h Driver abstraction and manager API +accelVid.c Driver manager, software fallbacks +pci.h / pci.c PCI configuration space access +vgaCommon.h / .c Shared VGA registers, VESA, DPMI helpers +s3Trio.c S3 Trio/ViRGE/Savage/Vision driver +atiMach64.c ATI Mach64 / Rage driver +matroxMga.c Matrox Millennium / Mystique / G-series driver +banshee.c 3dfx Banshee / Voodoo3 driver +cirrusGd54.c Cirrus Logic GD5434/36/46/80 driver +cirrusLaguna.c Cirrus Logic Laguna GD5462/64/65 driver +nvidia.c Nvidia RIVA 128 / TNT family driver +tsengW32.c Tseng ET4000/W32 family driver +trident.c Trident TGUI / Blade / CyberBlade driver +sis.c SiS 6326/300/315 driver +demo.c Test/demo application +Makefile DJGPP cross-compilation build +PLAN.md Architecture plan and chipset reference +test/ 86Box test configuration and setup guide +``` diff --git a/accelVid.c b/accelVid.c new file mode 100644 index 0000000..58eace1 --- /dev/null +++ b/accelVid.c @@ -0,0 +1,574 @@ +// accelVid.c -- Accelerated video driver manager +// +// Manages registration, detection, and lifecycle of hardware-specific +// video drivers. Drivers register themselves at startup, then the +// manager probes each in order to find matching hardware. +// +// After a chip driver's init() succeeds, the manager fills in +// software fallback implementations for any drawing operations +// the driver left as NULL. 
This means callers never need to +// check function pointers -- every operation is always callable. +// The fallbacks draw directly to the LFB using simple loops. + +#include "accelVid.h" + +#include +#include +#include + +// Maximum number of registered drivers. This is more than enough +// for all chip families we'll ever support. +#define MAX_DRIVERS 32 + +// ============================================================ +// Prototypes -- public API +// ============================================================ + +AccelDriverT *accelDetect(void); +uint32_t accelGetCaps(const AccelDriverT *drv); +const char *accelGetName(const AccelDriverT *drv); +bool accelInit(AccelDriverT *drv, const AccelModeRequestT *req); +void accelRegisterDriver(AccelDriverT *drv); +void accelShutdown(AccelDriverT *drv); + +// ============================================================ +// Prototypes -- software fallbacks +// ============================================================ + +static void swBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void swColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static void swHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void swLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void swRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void swRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void swSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void swWaitIdle(AccelDriverT *drv); +static void swInstallFallbacks(AccelDriverT *drv); + +// 
============================================================ +// Inline helpers for software fallbacks +// ============================================================ + +// Write a pixel at (x, y) in the framebuffer. No bounds checking +// -- the caller must clip before calling. +static inline void swPutPixel(AccelDriverT *drv, int32_t x, int32_t y, uint32_t color) { + uint8_t *fb = drv->mode.framebuffer; + int32_t bpp = (drv->mode.bpp + 7) / 8; + uint8_t *dst = fb + y * drv->mode.pitch + x * bpp; + + switch (bpp) { + case 1: + *dst = (uint8_t)color; + break; + case 2: + *(uint16_t *)dst = (uint16_t)color; + break; + case 4: + *(uint32_t *)dst = color; + break; + } +} + +// ============================================================ +// Module state +// ============================================================ + +static AccelDriverT *sDrivers[MAX_DRIVERS]; +static int32_t sDriverCount = 0; + +// Software clip rectangle (used by fallbacks when no hardware clip) +static int32_t sClipX = 0; +static int32_t sClipY = 0; +static int32_t sClipW = 0; +static int32_t sClipH = 0; + +// ============================================================ +// accelDetect +// ============================================================ +// +// Iterates all registered drivers and calls detect() on each. +// Returns the first driver that claims the hardware, or NULL +// if no supported hardware is found. +// +// Detection order matters: drivers registered first are tried +// first. This allows callers to prioritize specific drivers +// (e.g. prefer S3 over generic VESA). 
+ +AccelDriverT *accelDetect(void) { + if (!pciDetect()) { + fprintf(stderr, "accelVid: PCI bus not detected\n"); + return NULL; + } + + for (int32_t i = 0; i < sDriverCount; i++) { + if (sDrivers[i]->detect(sDrivers[i])) { + printf("accelVid: Detected %s (PCI %02X:%02X.%X, " + "vendor=%04X device=%04X)\n", + sDrivers[i]->name, + sDrivers[i]->pciDev.bus, + sDrivers[i]->pciDev.dev, + sDrivers[i]->pciDev.func, + sDrivers[i]->pciDev.vendorId, + sDrivers[i]->pciDev.deviceId); + return sDrivers[i]; + } + } + + fprintf(stderr, "accelVid: No supported video hardware found\n"); + return NULL; +} + + +// ============================================================ +// accelGetCaps +// ============================================================ + +uint32_t accelGetCaps(const AccelDriverT *drv) { + if (!drv) { + return 0; + } + + return drv->caps; +} + + +// ============================================================ +// accelGetName +// ============================================================ + +const char *accelGetName(const AccelDriverT *drv) { + if (!drv) { + return "none"; + } + + return drv->name; +} + + +// ============================================================ +// accelInit +// ============================================================ + +bool accelInit(AccelDriverT *drv, const AccelModeRequestT *req) { + if (!drv || !drv->init) { + return false; + } + + memset(&drv->mode, 0, sizeof(drv->mode)); + + if (!drv->init(drv, req)) { + fprintf(stderr, "accelVid: Failed to initialize %s\n", drv->name); + return false; + } + + printf("accelVid: Initialized %s at %ldx%ldx%ld (pitch=%ld, vram=%luKB)\n", + drv->name, + (long)drv->mode.width, + (long)drv->mode.height, + (long)drv->mode.bpp, + (long)drv->mode.pitch, + (unsigned long)(drv->mode.vramSize / 1024)); + + // Report capabilities + printf("accelVid: Capabilities:"); + + if (drv->caps & ACAP_RECT_FILL) { + printf(" RectFill"); + } + if (drv->caps & ACAP_RECT_FILL_PAT) { + printf(" PatFill"); + } + if 
(drv->caps & ACAP_BITBLT) { + printf(" BitBlt"); + } + if (drv->caps & ACAP_COLOR_EXPAND) { + printf(" ColorExpand"); + } + if (drv->caps & ACAP_LINE_DRAW) { + printf(" LineDraw"); + } + if (drv->caps & ACAP_HW_CURSOR) { + printf(" HwCursor"); + } + if (drv->caps & ACAP_HOST_BLIT) { + printf(" HostBlit"); + } + if (drv->caps & ACAP_CLIP) { + printf(" Clip"); + } + if (drv->caps & ACAP_TRANSPARENCY) { + printf(" Transparency"); + } + + printf("\n"); + + // Install software fallbacks for any operations the driver + // didn't implement in hardware + swInstallFallbacks(drv); + + return true; +} + + +// ============================================================ +// accelRegisterDriver +// ============================================================ + +void accelRegisterDriver(AccelDriverT *drv) { + if (sDriverCount >= MAX_DRIVERS) { + fprintf(stderr, "accelVid: Too many drivers registered (max %d)\n", + MAX_DRIVERS); + return; + } + + sDrivers[sDriverCount++] = drv; +} + + +// ============================================================ +// accelShutdown +// ============================================================ + +void accelShutdown(AccelDriverT *drv) { + if (!drv) { + return; + } + + if (drv->waitIdle) { + drv->waitIdle(drv); + } + + if (drv->showCursor) { + drv->showCursor(drv, false); + } + + if (drv->shutdown) { + drv->shutdown(drv); + } + + memset(&drv->mode, 0, sizeof(drv->mode)); +} + + +// ============================================================ +// Software fallback implementations +// ============================================================ +// +// These draw directly to the LFB. They're correct but slow +// (uncached PCI writes). The point isn't performance -- it's +// ensuring every operation is always callable so the caller +// never needs to check for NULL function pointers. 
+ + +// ============================================================ +// swBitBlt +// ============================================================ +// +// Screen-to-screen blit via the LFB. Handles overlapping regions +// by choosing copy direction. + +static void swBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + if (w <= 0 || h <= 0) { + return; + } + + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + int32_t rowBytes = w * bpp; + + if (dstY < srcY || (dstY == srcY && dstX <= srcX)) { + // Copy forward (top to bottom, left to right) + for (int32_t row = 0; row < h; row++) { + uint8_t *src = fb + (srcY + row) * pitch + srcX * bpp; + uint8_t *dst = fb + (dstY + row) * pitch + dstX * bpp; + memmove(dst, src, rowBytes); + } + } else { + // Copy backward (bottom to top) + for (int32_t row = h - 1; row >= 0; row--) { + uint8_t *src = fb + (srcY + row) * pitch + srcX * bpp; + uint8_t *dst = fb + (dstY + row) * pitch + dstX * bpp; + memmove(dst, src, rowBytes); + } + } +} + + +// ============================================================ +// swColorExpand +// ============================================================ +// +// Monochrome-to-color expansion via the LFB. Each 1-bit in srcBuf +// becomes the fg color, each 0-bit becomes the bg color. 
+ +static void swColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + if (w <= 0 || h <= 0) { + return; + } + + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + + for (int32_t row = 0; row < h; row++) { + const uint8_t *mono = srcBuf + row * srcPitch; + uint8_t *dst = fb + (dstY + row) * pitch + dstX * bpp; + + for (int32_t col = 0; col < w; col++) { + int32_t byteIdx = col / 8; + int32_t bitIdx = 7 - (col % 8); + uint32_t color = (mono[byteIdx] >> bitIdx) & 1 ? fg : bg; + + switch (bpp) { + case 1: + dst[col] = (uint8_t)color; + break; + case 2: + ((uint16_t *)dst)[col] = (uint16_t)color; + break; + case 4: + ((uint32_t *)dst)[col] = color; + break; + } + } + } +} + + +// ============================================================ +// swHostBlit +// ============================================================ +// +// CPU-to-screen blit via the LFB. Just a memcpy per scanline. + +static void swHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + if (w <= 0 || h <= 0) { + return; + } + + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + int32_t rowBytes = w * bpp; + + for (int32_t row = 0; row < h; row++) { + const uint8_t *src = srcBuf + row * srcPitch; + uint8_t *dst = fb + (dstY + row) * pitch + dstX * bpp; + memcpy(dst, src, rowBytes); + } +} + + +// ============================================================ +// swLineDraw +// ============================================================ +// +// Bresenham line draw via the LFB. + +static void swLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + int32_t dx = abs(x2 - x1); + int32_t dy = abs(y2 - y1); + int32_t sx = (x1 < x2) ? 1 : -1; + int32_t sy = (y1 < y2) ? 
1 : -1; + int32_t err = dx - dy; + int32_t x = x1; + int32_t y = y1; + + for (;;) { + if (x >= sClipX && x < sClipX + sClipW && + y >= sClipY && y < sClipY + sClipH) { + swPutPixel(drv, x, y, color); + } + + if (x == x2 && y == y2) { + break; + } + + int32_t e2 = 2 * err; + + if (e2 > -dy) { + err -= dy; + x += sx; + } + + if (e2 < dx) { + err += dx; + y += sy; + } + } +} + + +// ============================================================ +// swRectFill +// ============================================================ +// +// Solid rectangle fill via the LFB. + +static void swRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + if (w <= 0 || h <= 0) { + return; + } + + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + + for (int32_t row = 0; row < h; row++) { + uint8_t *dst = fb + (y + row) * pitch + x * bpp; + + switch (bpp) { + case 1: + memset(dst, (uint8_t)color, w); + break; + case 2: { + uint16_t *dst16 = (uint16_t *)dst; + for (int32_t col = 0; col < w; col++) { + dst16[col] = (uint16_t)color; + } + break; + } + case 4: { + uint32_t *dst32 = (uint32_t *)dst; + for (int32_t col = 0; col < w; col++) { + dst32[col] = color; + } + break; + } + } + } +} + + +// ============================================================ +// swRectFillPat +// ============================================================ +// +// 8x8 monochrome pattern fill via the LFB. The pattern is 8 bytes, +// one bit per pixel, MSB-first, row 0 first. Each 1-bit gets the +// fg color, each 0-bit gets the bg color. The pattern tiles across +// the destination rectangle with alignment to screen coordinates +// (so patterns line up across adjacent fills). 
+ +static void swRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + if (w <= 0 || h <= 0) { + return; + } + + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + + for (int32_t row = 0; row < h; row++) { + uint8_t patRow = pattern[(y + row) & 7]; + uint8_t *dst = fb + (y + row) * pitch + x * bpp; + + for (int32_t col = 0; col < w; col++) { + int32_t patBit = 7 - ((x + col) & 7); + uint32_t color = (patRow >> patBit) & 1 ? fg : bg; + + switch (bpp) { + case 1: + dst[col] = (uint8_t)color; + break; + case 2: + ((uint16_t *)dst)[col] = (uint16_t)color; + break; + case 4: + ((uint32_t *)dst)[col] = color; + break; + } + } + } +} + + +// ============================================================ +// swSetClip +// ============================================================ +// +// Software clip rectangle for fallback line drawing. + +static void swSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + (void)drv; + sClipX = x; + sClipY = y; + sClipW = w; + sClipH = h; +} + + +// ============================================================ +// swWaitIdle +// ============================================================ +// +// No-op -- software operations complete synchronously. + +static void swWaitIdle(AccelDriverT *drv) { + (void)drv; +} + + +// ============================================================ +// swInstallFallbacks +// ============================================================ +// +// Fills in software implementations for any NULL function +// pointers in the driver struct. Called by accelInit() after +// the chip driver's init() succeeds. This guarantees that +// every drawing operation is always callable. 
+ +static void swInstallFallbacks(AccelDriverT *drv) { + int32_t count = 0; + + if (!drv->waitIdle) { + drv->waitIdle = swWaitIdle; + } + + if (!drv->setClip) { + drv->setClip = swSetClip; + count++; + } + + if (!drv->rectFill) { + drv->rectFill = swRectFill; + count++; + } + + if (!drv->bitBlt) { + drv->bitBlt = swBitBlt; + count++; + } + + if (!drv->hostBlit) { + drv->hostBlit = swHostBlit; + count++; + } + + if (!drv->colorExpand) { + drv->colorExpand = swColorExpand; + count++; + } + + if (!drv->rectFillPat) { + drv->rectFillPat = swRectFillPat; + count++; + } + + if (!drv->lineDraw) { + drv->lineDraw = swLineDraw; + count++; + } + + // Initialize the software clip rect to full screen + sClipX = 0; + sClipY = 0; + sClipW = drv->mode.width; + sClipH = drv->mode.height; + + if (count > 0) { + printf("accelVid: %ld operation(s) using software fallback\n", + (long)count); + } +} diff --git a/accelVid.h b/accelVid.h new file mode 100644 index 0000000..4ed2d07 --- /dev/null +++ b/accelVid.h @@ -0,0 +1,257 @@ +// accelVid.h -- Accelerated video driver abstraction for DOS +// +// Defines the common interface that all hardware-specific video +// drivers implement. Each driver fills in an AccelDriverT struct +// with function pointers for its accelerated operations and sets +// capability flags indicating which operations are hardware-backed. +// +// The driver manager (accelVid.c) iterates registered drivers, +// calls detect() on each, and returns the first match. The caller +// then uses the function pointers directly -- no dispatch overhead +// beyond the initial detection. +// +// Operations that aren't hardware-accelerated on a given chip +// should be left as NULL. The caller is responsible for falling +// back to software rendering for NULL operations. Capability +// flags in AccelDriverT.caps indicate which operations are +// available so callers can check without testing each pointer. +// +// All coordinates and dimensions are in pixels. 
Colors are packed +// in the display's native pixel format (same as DVX's packColor). +#ifndef ACCEL_VID_H +#define ACCEL_VID_H + +#include +#include + +#include "pci.h" + +// ============================================================ +// Capability flags +// ============================================================ +// +// Bit flags indicating which operations are hardware-accelerated. +// A driver sets these in its caps field during detect/init. The +// caller can test (drv->caps & ACAP_xxx) to decide whether to +// use hardware or fall back to software. + +#define ACAP_RECT_FILL 0x00000001 // solid rectangle fill +#define ACAP_RECT_FILL_PAT 0x00000002 // pattern rectangle fill (8x8) +#define ACAP_BITBLT 0x00000004 // screen-to-screen blit +#define ACAP_COLOR_EXPAND 0x00000008 // mono-to-color expansion (text/glyphs) +#define ACAP_LINE_DRAW 0x00000010 // Bresenham line drawing +#define ACAP_HW_CURSOR 0x00000020 // hardware sprite cursor +#define ACAP_HOST_BLIT 0x00000040 // CPU-to-screen blit (image upload) +#define ACAP_CLIP 0x00000080 // hardware clip rectangle +#define ACAP_TRANSPARENCY 0x00000100 // transparent blit (color key) + +// ============================================================ +// Raster operation codes +// ============================================================ +// +// Standard Microsoft/GDI ROP codes used by Windows drivers. +// These map to the 256 possible ternary raster operations, but +// we only define the commonly used ones. The hardware engines +// typically support these natively. 
+ +#define ROP_COPY 0xCC // dest = src +#define ROP_PAT_COPY 0xF0 // dest = pattern +#define ROP_ZERO 0x00 // dest = 0 (black) +#define ROP_ONE 0xFF // dest = 1 (white) +#define ROP_SRC_AND 0x88 // dest = src AND dest +#define ROP_SRC_OR 0xEE // dest = src OR dest +#define ROP_SRC_XOR 0x66 // dest = src XOR dest +#define ROP_NOT 0x55 // dest = NOT dest +#define ROP_PAT_AND 0xA0 // dest = pat AND dest +#define ROP_PAT_OR 0xFA // dest = pat OR dest +#define ROP_PAT_XOR 0x5A // dest = pat XOR dest + +// ============================================================ +// Hardware cursor image format +// ============================================================ +// +// Hardware cursors use a 2-bit-per-pixel AND/XOR format: +// AND=0, XOR=0 -> cursor color 0 (background) +// AND=0, XOR=1 -> cursor color 1 (foreground) +// AND=1, XOR=0 -> transparent (screen shows through) +// AND=1, XOR=1 -> inverted (screen pixel is inverted) +// +// Most chips support 64x64 cursors (S3, Matrox, ATI, Tseng W32p). +// Older Cirrus (GD5426/28) support only 32x32. 
+ +#define HW_CURSOR_MAX_SIZE 64 + +typedef struct { + int32_t width; + int32_t height; + int32_t hotX; + int32_t hotY; + uint8_t andMask[HW_CURSOR_MAX_SIZE * HW_CURSOR_MAX_SIZE / 8]; + uint8_t xorMask[HW_CURSOR_MAX_SIZE * HW_CURSOR_MAX_SIZE / 8]; +} HwCursorImageT; + +// ============================================================ +// Video mode request / result +// ============================================================ + +typedef struct { + int32_t width; + int32_t height; + int32_t bpp; // requested bits per pixel (8, 15, 16, 32) +} AccelModeRequestT; + +typedef struct { + int32_t width; + int32_t height; + int32_t bpp; + int32_t pitch; // bytes per scanline (may be > width * bytesPerPixel) + uint8_t *framebuffer; // mapped linear framebuffer pointer + uint32_t vramSize; // total video RAM in bytes + uint32_t offscreenBase; // offset to start of offscreen VRAM (for allocations) +} AccelModeResultT; + +// ============================================================ +// Driver structure +// ============================================================ +// +// Each chip driver provides a statically-allocated AccelDriverT +// and registers it with accelRegisterDriver(). The driver manager +// calls detect() on each registered driver during accelInit(). +// +// The init() function receives a mode request and returns detailed +// mode info. It is responsible for: +// - Programming the CRTC/sequencer for the requested mode +// - Enabling the linear framebuffer +// - Unlocking the acceleration engine +// - Setting up MMIO mappings if needed +// +// All accelerated drawing functions must call waitIdle() internally +// before returning if the operation is asynchronous. The explicit +// waitIdle() in the API is for synchronization points where the +// caller needs to read back from VRAM after a series of operations. + +typedef struct AccelDriverT { + // Driver identification + const char *name; // human-readable name (e.g. 
"S3 Trio64") + const char *chipFamily; // family identifier (e.g. "s3", "cirrus") + uint32_t caps; // ACAP_xxx capability flags + + // PCI device info (filled by detect) + PciDeviceT pciDev; + + // Current mode info (filled by init) + AccelModeResultT mode; + + // -------------------------------------------------------- + // Lifecycle + // -------------------------------------------------------- + + // Probe for this chip. Returns true if this driver's hardware + // is present. Must not change any hardware state. + bool (*detect)(struct AccelDriverT *drv); + + // Initialize the chip: set the requested video mode, enable + // acceleration, map the framebuffer. Returns true on success. + bool (*init)(struct AccelDriverT *drv, const AccelModeRequestT *req); + + // Shut down: restore text mode, disable acceleration, unmap + // memory. Safe to call even if init() was never called. + void (*shutdown)(struct AccelDriverT *drv); + + // -------------------------------------------------------- + // Synchronization + // -------------------------------------------------------- + + // Wait until the acceleration engine is idle. All pending + // drawing commands must complete before this returns. + void (*waitIdle)(struct AccelDriverT *drv); + + // -------------------------------------------------------- + // Hardware clip rectangle + // -------------------------------------------------------- + + // Set the hardware clip rectangle. All subsequent drawing + // operations are clipped to this region. Pass full-screen + // dimensions to disable clipping. + void (*setClip)(struct AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); + + // -------------------------------------------------------- + // Accelerated drawing operations + // -------------------------------------------------------- + + // Solid rectangle fill. 
+ void (*rectFill)(struct AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); + + // Pattern rectangle fill (8x8 pattern, one color + transparent + // or two-color). Pattern data is 8 bytes, one bit per pixel, + // MSB-first, top row first. + void (*rectFillPat)(struct AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); + + // Screen-to-screen blit. + void (*bitBlt)(struct AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); + + // CPU-to-screen blit: transfer pixels from system RAM to VRAM. + // srcBuf points to packed pixel data in display format. + // srcPitch is the byte stride of the source buffer. + void (*hostBlit)(struct AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); + + // Monochrome color expansion: convert 1bpp bitmap data to + // full-color pixels. Used for fast text/glyph rendering. + // srcBuf is packed MSB-first, one bit per pixel. + // srcPitch is the byte stride between rows. + void (*colorExpand)(struct AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); + + // Bresenham line draw (inclusive endpoints). + void (*lineDraw)(struct AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); + + // -------------------------------------------------------- + // Hardware cursor + // -------------------------------------------------------- + + // Set the cursor image. Called when the cursor shape changes. + void (*setCursor)(struct AccelDriverT *drv, const HwCursorImageT *image); + + // Move the cursor to a screen position. Called every mouse poll. + void (*moveCursor)(struct AccelDriverT *drv, int32_t x, int32_t y); + + // Show or hide the hardware cursor. 
+ void (*showCursor)(struct AccelDriverT *drv, bool visible); + + // -------------------------------------------------------- + // Private driver data + // -------------------------------------------------------- + + // Opaque pointer for chip-specific state (MMIO base address, + // current engine state, etc.). Each driver allocates and manages + // its own private data. + void *privData; + +} AccelDriverT; + +// ============================================================ +// Driver manager API +// ============================================================ + +// Register a driver with the manager. Call once per driver at +// startup (typically from main before accelInit). Drivers are +// probed in registration order. +void accelRegisterDriver(AccelDriverT *drv); + +// Probe all registered drivers and return the first one whose +// detect() succeeds. Returns NULL if no supported hardware is found. +AccelDriverT *accelDetect(void); + +// Initialize the detected driver with the given mode. +// Returns true on success. On failure the driver is not usable. +bool accelInit(AccelDriverT *drv, const AccelModeRequestT *req); + +// Shut down the active driver and restore text mode. +void accelShutdown(AccelDriverT *drv); + +// Return the driver name string for display. +const char *accelGetName(const AccelDriverT *drv); + +// Return the capability flags for the active driver. +uint32_t accelGetCaps(const AccelDriverT *drv); + +#endif // ACCEL_VID_H diff --git a/atiMach64.c b/atiMach64.c new file mode 100644 index 0000000..83b4065 --- /dev/null +++ b/atiMach64.c @@ -0,0 +1,960 @@ +// atiMach64.c -- ATI Mach64 / Rage accelerated video driver +// +// Supports the ATI Mach64 family: GX, CX, CT, ET, VT, GT (Rage II), +// and Rage Pro. 
These were among the most capable 2D accelerators +// of the mid-1990s, with features including: +// - Solid and pattern rectangle fill +// - Screen-to-screen BitBLT +// - Host-to-screen blit (CPU data transfer) +// - Monochrome color expansion +// - Bresenham line draw +// - Trapezoid fill +// - Hardware scissor rectangle +// - 64x64 two-color hardware cursor +// +// Register access: +// The Mach64 has two register access methods: +// 1. I/O port: registers at block I/O base + offset. The base +// is typically 0x02EC for Mach64, determined by CONFIG_CHIP_ID. +// 2. MMIO: register block at end of LFB (BAR0 + aperture_size - 1KB) +// or via a dedicated BAR. +// +// We use MMIO for speed. The register block is 1KB at the end +// of the aperture (LFB base + size - 0x400 on most variants, +// or LFB base + size - 0x800 for 8MB apertures). +// +// Some early Mach64 chips (GX/CX) may not support MMIO well; +// for those we fall back to I/O port access. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include + +// ============================================================ +// ATI vendor/device IDs +// ============================================================ + +#define ATI_VENDOR_ID 0x1002 + +#define ATI_MACH64_GX 0x4758 // Mach64 GX +#define ATI_MACH64_CX 0x4358 // Mach64 CX +#define ATI_MACH64_CT 0x4354 // Mach64 CT +#define ATI_MACH64_ET 0x4554 // Mach64 ET +#define ATI_MACH64_VT 0x5654 // Mach64 VT +#define ATI_MACH64_VT_B 0x5655 // Mach64 VT-B +#define ATI_MACH64_GT 0x4754 // Mach64 GT (3D Rage II) +#define ATI_MACH64_GT_B 0x4755 // Mach64 GT-B (3D Rage II+) +#define ATI_RAGE_PRO 0x4750 // Rage Pro +#define ATI_RAGE_PRO_AGP 0x4752 // Rage Pro AGP +#define ATI_RAGE_XL_PCI 0x4752 // Rage XL PCI (shares ID with Pro AGP) +#define ATI_RAGE_128_RE 0x5245 // Rage 128 RE +#define ATI_RAGE_128_RF 0x5246 // Rage 128 RF +#define ATI_RAGE_128_RK 0x524B // Rage 128 RK +#define ATI_RAGE_128_RL 0x524C // Rage 128 RL +#define 
ATI_RAGE_128_PRO_PF 0x5046 // Rage 128 Pro PF +#define ATI_RAGE_128_PRO_PR 0x5052 // Rage 128 Pro PR +#define ATI_RAGE_FURY 0x5046 // Rage Fury (same as 128 Pro PF) + +static const uint16_t sAtiDeviceIds[] = { + ATI_VENDOR_ID, ATI_MACH64_GX, + ATI_VENDOR_ID, ATI_MACH64_CX, + ATI_VENDOR_ID, ATI_MACH64_CT, + ATI_VENDOR_ID, ATI_MACH64_ET, + ATI_VENDOR_ID, ATI_MACH64_VT, + ATI_VENDOR_ID, ATI_MACH64_VT_B, + ATI_VENDOR_ID, ATI_MACH64_GT, + ATI_VENDOR_ID, ATI_MACH64_GT_B, + ATI_VENDOR_ID, ATI_RAGE_PRO, + ATI_VENDOR_ID, ATI_RAGE_PRO_AGP, + ATI_VENDOR_ID, ATI_RAGE_128_RE, + ATI_VENDOR_ID, ATI_RAGE_128_RF, + ATI_VENDOR_ID, ATI_RAGE_128_RK, + ATI_VENDOR_ID, ATI_RAGE_128_RL, + ATI_VENDOR_ID, ATI_RAGE_128_PRO_PF, + ATI_VENDOR_ID, ATI_RAGE_128_PRO_PR, + 0, 0 +}; + +// ============================================================ +// Mach64 register offsets (from MMIO base) +// ============================================================ +// +// The Mach64 has a flat register space. For I/O access, these +// offsets are added to the I/O base port. For MMIO, they're +// byte offsets from the MMIO base address. 
+ +// Drawing engine source registers +#define ATI_SRC_OFF_PITCH 0x0000 // source offset and pitch +#define ATI_SRC_Y 0x0004 // source Y +#define ATI_SRC_X 0x0008 // source X (alias: SRC_HEIGHT1) +#define ATI_SRC_Y_X 0x000C // source Y and X combined +#define ATI_SRC_WIDTH1 0x0010 +#define ATI_SRC_HEIGHT1 0x0014 + +// Drawing engine destination registers +#define ATI_DST_OFF_PITCH 0x0040 // destination offset and pitch +#define ATI_DST_Y 0x0044 +#define ATI_DST_X 0x0048 +#define ATI_DST_Y_X 0x004C +#define ATI_DST_HEIGHT 0x0050 +#define ATI_DST_WIDTH 0x0054 +#define ATI_DST_HEIGHT_WIDTH 0x0058 // triggers blit +#define ATI_DST_X_WIDTH 0x005C +#define ATI_DST_BRES_ERR 0x0064 +#define ATI_DST_BRES_INC 0x0068 +#define ATI_DST_BRES_DEC 0x006C +#define ATI_DST_BRES_LNTH 0x0070 +#define ATI_DST_BRES_LNTH_END 0x0074 // triggers line draw + +// Host data (CPU-to-screen) +#define ATI_HOST_DATA0 0x0200 + +// Scissor registers +#define ATI_SC_LEFT 0x00A0 +#define ATI_SC_RIGHT 0x00A4 +#define ATI_SC_TOP 0x00A8 +#define ATI_SC_BOTTOM 0x00AC + +// Drawing processor registers +#define ATI_DP_BKGD_CLR 0x00B0 +#define ATI_DP_FRGD_CLR 0x00B4 +#define ATI_DP_WRITE_MASK 0x00B8 +#define ATI_DP_CHAIN_MASK 0x00BC +#define ATI_DP_PIX_WIDTH 0x00D0 +#define ATI_DP_MIX 0x00D4 +#define ATI_DP_SRC 0x00D8 + +// Clock/config +#define ATI_CLR_CMP_CNTL 0x0100 +#define ATI_GUI_TRAJ_CNTL 0x00CC +#define ATI_GUI_STAT 0x00CE // I/O only; for MMIO see below + +// FIFO and status (MMIO addresses) +#define ATI_FIFO_STAT 0x0310 +#define ATI_GUI_STAT_MMIO 0x0338 + +// Hardware cursor +#define ATI_CUR_CLR0 0x0260 +#define ATI_CUR_CLR1 0x0264 +#define ATI_CUR_OFFSET 0x0268 +#define ATI_CUR_HORZ_VERT_POSN 0x026C +#define ATI_CUR_HORZ_VERT_OFF 0x0270 +#define ATI_GEN_TEST_CNTL 0x0034 // general test/cursor control + +// Memory config +#define ATI_MEM_CNTL 0x0140 + +// I/O and MMIO constants +#define ATI_IO_BASE_DEFAULT 0x02EC // default block I/O base port +#define ATI_MMIO_SIZE 0x0400 // MMIO block size (1KB 
at end of aperture) +#define ATI_CONFIG_CHIP_ID 0x00E0 + +// ============================================================ +// Mach64 DP_MIX values +// ============================================================ +// +// The drawing processor MIX register controls the raster operation +// for foreground (bits 20:16) and background (bits 4:0). + +#define ATI_MIX_NOT_DST 0x00 +#define ATI_MIX_ZERO 0x01 +#define ATI_MIX_ONE 0x02 +#define ATI_MIX_DST 0x03 +#define ATI_MIX_NOT_SRC 0x04 +#define ATI_MIX_XOR 0x05 +#define ATI_MIX_XNOR 0x06 +#define ATI_MIX_COPY 0x07 // dest = source (most common) +#define ATI_MIX_NOT_SRC_AND 0x08 +#define ATI_MIX_SRC_AND_DST 0x0C +#define ATI_MIX_SRC_OR_DST 0x0E + +// Foreground mix is in bits 20:16, background in bits 4:0 +#define ATI_FRGD_MIX(rop) ((uint32_t)(rop) << 16) +#define ATI_BKGD_MIX(rop) ((uint32_t)(rop)) + +// ============================================================ +// Mach64 DP_SRC values +// ============================================================ + +#define ATI_SRC_BKGD_CLR 0x00 // background color register +#define ATI_SRC_FRGD_CLR 0x01 // foreground color register +#define ATI_SRC_HOST 0x02 // CPU host data +#define ATI_SRC_BLIT 0x03 // video memory (blit) +#define ATI_SRC_PATTERN 0x04 // pattern register + +// DP_SRC packs three source selects: mono src (bits 10:8), +// foreground src (bits 18:16 on some, or bits 10:8), background src +// In practice, the format is: +// bits 2:0 = background source +// bits 10:8 = foreground source +// bits 18:16 = mono source (for color expand) + +#define ATI_DP_SRC_BKGD(s) ((uint32_t)(s)) +#define ATI_DP_SRC_FRGD(s) ((uint32_t)(s) << 8) +#define ATI_DP_SRC_MONO(s) ((uint32_t)(s) << 16) + +// ============================================================ +// Mach64 DP_PIX_WIDTH values +// ============================================================ + +#define ATI_PIX_8BPP 0x02 +#define ATI_PIX_15BPP 0x03 +#define ATI_PIX_16BPP 0x04 +#define ATI_PIX_32BPP 0x06 + +// HOST 
byte/word/dword order -- use native (little-endian) +#define ATI_HOST_BYTE_ORDER 0x00 + +// GUI_TRAJ_CNTL direction bits +#define ATI_DST_X_DIR_LEFT 0x00 +#define ATI_DST_X_DIR_RIGHT 0x01 +#define ATI_DST_Y_DIR_UP 0x00 +#define ATI_DST_Y_DIR_DOWN 0x02 + +// GUI_STAT busy bit +#define ATI_GUI_STAT_BUSY 0x00000001 +#define ATI_FIFO_STAT_MASK 0x0000FFFF + +// Hardware cursor size +#define ATI_HW_CURSOR_SIZE 64 +#define ATI_HW_CURSOR_BYTES 1024 // 64*64*2bpp/8 + +// Maximum wait iterations +#define ATI_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; // mapped MMIO register base + uint32_t mmioPhysAddr; + bool useIo; // fall back to I/O on old GX/CX + uint16_t ioBase; // I/O base port for register access + DpmiMappingT lfbMapping; +} AtiPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void atiBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void atiColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool atiDetect(AccelDriverT *drv); +static void atiHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool atiInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void atiLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void atiMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void atiRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, 
uint32_t color); +static void atiRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void atiSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void atiSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void atiShowCursor(AccelDriverT *drv, bool visible); +static void atiShutdown(AccelDriverT *drv); +static void atiWaitFifo(AtiPrivateT *priv, int32_t entries); +static void atiWaitIdle(AccelDriverT *drv); +static void atiWriteReg(AtiPrivateT *priv, uint32_t reg, uint32_t val); +static uint32_t atiReadReg(AtiPrivateT *priv, uint32_t reg); + +// ============================================================ +// Driver instance +// ============================================================ + +static AtiPrivateT sAtiPrivate; + +static AccelDriverT sAtiDriver = { + .name = "ATI Mach64", + .chipFamily = "ati", + .caps = 0, + .privData = &sAtiPrivate, + .detect = atiDetect, + .init = atiInit, + .shutdown = atiShutdown, + .waitIdle = atiWaitIdle, + .setClip = atiSetClip, + .rectFill = atiRectFill, + .rectFillPat = atiRectFillPat, + .bitBlt = atiBitBlt, + .hostBlit = atiHostBlit, + .colorExpand = atiColorExpand, + .lineDraw = atiLineDraw, + .setCursor = atiSetCursor, + .moveCursor = atiMoveCursor, + .showCursor = atiShowCursor, +}; + +// ============================================================ +// atiRegisterDriver +// ============================================================ + +void atiRegisterDriver(void) { + accelRegisterDriver(&sAtiDriver); +} + + +// ============================================================ +// atiReadReg / atiWriteReg +// ============================================================ +// +// Register access abstraction. Uses MMIO when available, falls +// back to I/O port access on older chips. 
+ +static uint32_t atiReadReg(AtiPrivateT *priv, uint32_t reg) { + if (priv->useIo) { + return inportl(priv->ioBase + reg); + } + + return priv->mmio[reg / 4]; +} + +static void atiWriteReg(AtiPrivateT *priv, uint32_t reg, uint32_t val) { + if (priv->useIo) { + outportl(priv->ioBase + reg, val); + return; + } + + priv->mmio[reg / 4] = val; +} + + +// ============================================================ +// atiBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. The Mach64 engine handles overlapping +// regions automatically based on the trajectory control register. + +static void atiBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Determine blit direction + uint32_t direction = ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN; + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (srcX < dstX) { + direction &= ~ATI_DST_X_DIR_RIGHT; + sx += w - 1; + dx += w - 1; + } + if (srcY < dstY) { + direction &= ~ATI_DST_Y_DIR_DOWN; + sy += h - 1; + dy += h - 1; + } + + atiWaitFifo(priv, 7); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, direction); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_BLIT)); + atiWriteReg(priv, ATI_SRC_Y_X, ((uint32_t)sx << 16) | (uint32_t)sy); + atiWriteReg(priv, ATI_SRC_WIDTH1, w); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)dx << 16) | (uint32_t)dy); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); +} + + +// ============================================================ +// atiColorExpand +// ============================================================ +// +// Monochrome-to-color expansion via the host data path. 
+// Converts 1bpp source bitmap to full-color pixels using the +// Mach64 engine. Source data is packed MSB-first, padded to +// dword boundaries per scanline. + +static void atiColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Number of dwords per scanline of monochrome data + int32_t dwordsPerRow = (w + 31) / 32; + + // Set up color expand: mono source from host, fg/bg from color regs + atiWaitFifo(priv, 7); + atiWriteReg(priv, ATI_DP_FRGD_CLR, fg); + atiWriteReg(priv, ATI_DP_BKGD_CLR, bg); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_MONO(ATI_SRC_HOST) | ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR) | ATI_DP_SRC_BKGD(ATI_SRC_BKGD_CLR)); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)dstX << 16) | (uint32_t)dstY); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); + + // Feed monochrome data row by row through HOST_DATA0 + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + // Pack bytes into a dword (MSB-first bit order) + int32_t byteOff = dw * 4; + uint32_t data = 0; + + for (int32_t b = 0; b < 4; b++) { + uint8_t srcByte = 0; + if (byteOff + b < srcPitch) { + srcByte = rowPtr[byteOff + b]; + } + data |= (uint32_t)srcByte << (24 - b * 8); + } + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_HOST_DATA0, data); + } + } +} + + +// ============================================================ +// atiDetect +// ============================================================ + +static bool atiDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sAtiDeviceIds, &drv->pciDev, 
&matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case ATI_MACH64_GX: + drv->name = "ATI Mach64 GX"; + break; + case ATI_MACH64_CX: + drv->name = "ATI Mach64 CX"; + break; + case ATI_MACH64_CT: + drv->name = "ATI Mach64 CT"; + break; + case ATI_MACH64_ET: + drv->name = "ATI Mach64 ET"; + break; + case ATI_MACH64_VT: + case ATI_MACH64_VT_B: + drv->name = "ATI Mach64 VT"; + break; + case ATI_MACH64_GT: + case ATI_MACH64_GT_B: + drv->name = "ATI 3D Rage II"; + break; + case ATI_RAGE_PRO: + case ATI_RAGE_PRO_AGP: + drv->name = "ATI Rage Pro"; + break; + case ATI_RAGE_128_RE: + case ATI_RAGE_128_RF: + case ATI_RAGE_128_RK: + case ATI_RAGE_128_RL: + drv->name = "ATI Rage 128"; + break; + case ATI_RAGE_128_PRO_PF: + case ATI_RAGE_128_PRO_PR: + drv->name = "ATI Rage 128 Pro"; + break; + default: + drv->name = "ATI Mach64"; + break; + } + + return true; +} + + +// ============================================================ +// atiHostBlit +// ============================================================ +// +// CPU-to-screen blit. Transfers pixel data from system memory +// to VRAM through the Mach64 host data registers. 
+ +static void atiHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerPixel = priv->bytesPerPixel; + int32_t rowBytes = w * bytesPerPixel; + int32_t dwordsPerRow = (rowBytes + 3) / 4; + + // Set up host-to-screen blit + atiWaitFifo(priv, 5); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_HOST)); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)dstX << 16) | (uint32_t)dstY); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); + + // Write pixel data row by row through HOST_DATA0 + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + int32_t byteOff = dw * 4; + uint32_t data = 0; + + // Pack bytes into a dword (little-endian native order) + for (int32_t b = 0; b < 4; b++) { + if (byteOff + b < rowBytes) { + data |= (uint32_t)rowPtr[byteOff + b] << (b * 8); + } + } + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_HOST_DATA0, data); + } + } +} + + +// ============================================================ +// atiInit +// ============================================================ + +static bool atiInit(AccelDriverT *drv, const AccelModeRequestT *req) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + // Determine if this is an old GX/CX (I/O only) or newer (MMIO) + priv->useIo = (drv->pciDev.deviceId == ATI_MACH64_GX + || drv->pciDev.deviceId == ATI_MACH64_CX); + priv->ioBase = ATI_IO_BASE_DEFAULT; + + // Get LFB address and size from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + 
priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + + uint32_t barSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + + // Aperture size != VRAM size on Mach64 (aperture is typically 8MB) + // Read actual VRAM from MEM_CNTL register + uint32_t memCntl; + if (priv->useIo) { + memCntl = inportl(priv->ioBase + ATI_MEM_CNTL); + } else { + // Need a temporary MMIO mapping to read MEM_CNTL + // MMIO is at the end of the aperture + priv->mmioPhysAddr = priv->lfbPhysAddr + barSize - ATI_MMIO_SIZE; + memCntl = 0; // will determine from aperture size + } + + // Determine VRAM size + if (memCntl != 0) { + uint32_t memSize = memCntl & 0x07; + switch (memSize) { + case 0: priv->vramSize = 512 * 1024; break; + case 1: priv->vramSize = 1024 * 1024; break; + case 2: priv->vramSize = 2 * 1024 * 1024; break; + case 3: priv->vramSize = 4 * 1024 * 1024; break; + case 4: priv->vramSize = 6 * 1024 * 1024; break; + case 5: priv->vramSize = 8 * 1024 * 1024; break; + default: priv->vramSize = 2 * 1024 * 1024; break; + } + } else { + // Conservative fallback + priv->vramSize = (barSize > 8 * 1024 * 1024) ? 
4 * 1024 * 1024 : barSize; + } + + // Set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB + MMIO region (map entire aperture; MMIO is at end) + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, barSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Set up MMIO pointer at end of aperture + if (!priv->useIo) { + priv->mmio = (volatile uint32_t *)(priv->lfbMapping.ptr + barSize - ATI_MMIO_SIZE); + } + + // Configure the drawing engine pixel width + uint32_t pixWidth; + switch (vesa.bpp) { + case 8: pixWidth = ATI_PIX_8BPP; break; + case 15: pixWidth = ATI_PIX_15BPP; break; + case 16: pixWidth = ATI_PIX_16BPP; break; + case 32: pixWidth = ATI_PIX_32BPP; break; + default: pixWidth = ATI_PIX_16BPP; break; + } + + // DP_PIX_WIDTH: set all fields to the same depth + uint32_t dpPixWidth = pixWidth + | (pixWidth << 4) // host data + | (pixWidth << 8) // source + | (pixWidth << 16) // destination + | (pixWidth << 28); // default + atiWaitFifo(priv, 2); + atiWriteReg(priv, ATI_DP_PIX_WIDTH, dpPixWidth); + atiWriteReg(priv, ATI_DP_WRITE_MASK, 0xFFFFFFFF); + + // Set DST_OFF_PITCH: offset = 0, pitch in units of 8 pixels + uint32_t pitch8 = vesa.pitch / priv->bytesPerPixel / 8; + atiWriteReg(priv, ATI_DST_OFF_PITCH, pitch8 << 22); + atiWriteReg(priv, ATI_SRC_OFF_PITCH, pitch8 << 22); + + // Set up cursor at end of VRAM + priv->cursorOffset = priv->vramSize - ATI_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(ATI_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | ACAP_BITBLT + | ACAP_HOST_BLIT + | 
ACAP_COLOR_EXPAND + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + atiSetClip(drv, 0, 0, vesa.width, vesa.height); + + atiWaitIdle(drv); + return true; +} + + +// ============================================================ +// atiLineDraw +// ============================================================ +// +// Bresenham line draw using the Mach64 DST_BRES registers. + +static void atiLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + int32_t dx = x2 - x1; + int32_t dy = y2 - y1; + + uint32_t direction = ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN; + + if (dx < 0) { + dx = -dx; + direction &= ~ATI_DST_X_DIR_RIGHT; + } + if (dy < 0) { + dy = -dy; + direction &= ~ATI_DST_Y_DIR_DOWN; + } + + int32_t majAxis; + int32_t minAxis; + + if (dx >= dy) { + majAxis = dx; + minAxis = dy; + } else { + majAxis = dy; + minAxis = dx; + // Swap X/Y major + direction |= 0x04; // Y major axis select + } + + if (majAxis == 0) { + return; + } + + int32_t errTerm = 2 * minAxis - majAxis; + int32_t errInc = 2 * minAxis; + int32_t errDec = 2 * (minAxis - majAxis); + + atiWaitFifo(priv, 8); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, direction); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR)); + atiWriteReg(priv, ATI_DP_FRGD_CLR, color); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)x1 << 16) | (uint32_t)y1); + atiWriteReg(priv, ATI_DST_BRES_ERR, errTerm); + atiWriteReg(priv, ATI_DST_BRES_INC, errInc); + atiWriteReg(priv, ATI_DST_BRES_DEC, errDec); + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_DST_BRES_LNTH, majAxis + 1); +} + + +// ============================================================ +// atiMoveCursor +// ============================================================ + +static void atiMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + AtiPrivateT *priv = (AtiPrivateT 
*)drv->privData; + + uint32_t offset = 0; + + if (x < 0) { + offset |= ((-x) & 0x3F) << 16; + x = 0; + } + if (y < 0) { + offset |= (-y) & 0x3F; + y = 0; + } + + atiWriteReg(priv, ATI_CUR_HORZ_VERT_OFF, offset); + atiWriteReg(priv, ATI_CUR_HORZ_VERT_POSN, + ((uint32_t)x << 16) | (uint32_t)y); +} + + +// ============================================================ +// atiRectFill +// ============================================================ + +static void atiRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + atiWaitFifo(priv, 5); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR)); + atiWriteReg(priv, ATI_DP_FRGD_CLR, color); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)x << 16) | (uint32_t)y); + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); +} + + +// ============================================================ +// atiRectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using the host data path. The pattern is +// 8 bytes (one per row, MSB-first), tiled across the rectangle. +// 1-bits use the foreground color, 0-bits use the background. +// Data is fed through HOST_DATA0, repeating the 8-row pattern +// for the full height, with each row padded to a dword boundary. 
+ +static void atiRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Number of dwords per scanline of monochrome data + int32_t dwordsPerRow = (w + 31) / 32; + + // Set up color expand: mono source from host, fg/bg from color regs + atiWaitFifo(priv, 7); + atiWriteReg(priv, ATI_DP_FRGD_CLR, fg); + atiWriteReg(priv, ATI_DP_BKGD_CLR, bg); + atiWriteReg(priv, ATI_DP_SRC, ATI_DP_SRC_MONO(ATI_SRC_HOST) | ATI_DP_SRC_FRGD(ATI_SRC_FRGD_CLR) | ATI_DP_SRC_BKGD(ATI_SRC_BKGD_CLR)); + atiWriteReg(priv, ATI_DP_MIX, ATI_FRGD_MIX(ATI_MIX_COPY) | ATI_BKGD_MIX(ATI_MIX_COPY)); + atiWriteReg(priv, ATI_GUI_TRAJ_CNTL, ATI_DST_X_DIR_RIGHT | ATI_DST_Y_DIR_DOWN); + atiWriteReg(priv, ATI_DST_Y_X, ((uint32_t)x << 16) | (uint32_t)y); + atiWriteReg(priv, ATI_DST_HEIGHT_WIDTH, ((uint32_t)w << 16) | (uint32_t)h); + + // Feed tiled pattern data through HOST_DATA0 + for (int32_t row = 0; row < h; row++) { + uint8_t patByte = pattern[row & 7]; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + // Replicate the pattern byte across all 4 bytes of the dword. + // MSB-first bit order: place the pattern byte in the high byte. 
+ uint32_t data = ((uint32_t)patByte << 24) + | ((uint32_t)patByte << 16) + | ((uint32_t)patByte << 8) + | (uint32_t)patByte; + + atiWaitFifo(priv, 1); + atiWriteReg(priv, ATI_HOST_DATA0, data); + } + } +} + + +// ============================================================ +// atiSetClip +// ============================================================ + +static void atiSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + atiWaitFifo(priv, 4); + atiWriteReg(priv, ATI_SC_LEFT, x); + atiWriteReg(priv, ATI_SC_TOP, y); + atiWriteReg(priv, ATI_SC_RIGHT, x + w - 1); + atiWriteReg(priv, ATI_SC_BOTTOM, y + h - 1); +} + + +// ============================================================ +// atiSetCursor +// ============================================================ + +static void atiSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + if (!image) { + atiShowCursor(drv, false); + return; + } + + atiWaitIdle(drv); + + // Write cursor image to VRAM + // Mach64 cursor format: 64x64, 2bpp, rows of 16 bytes + // Bit encoding: 00=cursor color 0, 01=cursor color 1, + // 10=transparent, 11=inverted + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < ATI_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 16; byte++) { + uint8_t val = 0xAA; // all transparent (10 pattern) + + if (row < image->height && byte < (image->width + 3) / 4) { + // Convert AND/XOR to Mach64 2bpp encoding + int32_t bitOff = byte * 4; + uint8_t andBits = 0; + uint8_t xorBits = 0; + + if (bitOff / 8 < (image->width + 7) / 8) { + andBits = image->andMask[row * 8 + bitOff / 8]; + xorBits = image->xorMask[row * 8 + bitOff / 8]; + } + + // Pack 4 pixels into one byte (2 bits each) + val = 0; + for (int32_t px = 0; px < 4; px++) { + int32_t srcBit = (bitOff + px) % 8; + uint8_t andBit = (andBits >> (7 - srcBit)) & 1; + uint8_t 
xorBit = (xorBits >> (7 - srcBit)) & 1; + uint8_t pixel; + + if (andBit && !xorBit) { + pixel = 0x02; // transparent + } else if (andBit && xorBit) { + pixel = 0x03; // inverted + } else if (!andBit && xorBit) { + pixel = 0x01; // cursor color 1 + } else { + pixel = 0x00; // cursor color 0 + } + + val |= pixel << (6 - px * 2); + } + } + + cursorMem[row * 16 + byte] = val; + } + } + + // Set cursor offset (in units of 8 bytes) + atiWriteReg(priv, ATI_CUR_OFFSET, priv->cursorOffset / 8); + + // Set cursor colors (white foreground, black background) + atiWriteReg(priv, ATI_CUR_CLR0, 0x00000000); + atiWriteReg(priv, ATI_CUR_CLR1, 0x00FFFFFF); +} + + +// ============================================================ +// atiShowCursor +// ============================================================ + +static void atiShowCursor(AccelDriverT *drv, bool visible) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + uint32_t val = atiReadReg(priv, ATI_GEN_TEST_CNTL); + + if (visible) { + val |= 0x80; // enable cursor + } else { + val &= ~0x80; + } + + atiWriteReg(priv, ATI_GEN_TEST_CNTL, val); +} + + +// ============================================================ +// atiShutdown +// ============================================================ + +static void atiShutdown(AccelDriverT *drv) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + atiShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// atiWaitFifo +// ============================================================ +// +// Wait until the Mach64 FIFO has at least 'entries' free slots. +// The FIFO_STAT register indicates free entries (bits 15:0, +// value = 0x8000 means 0 free, lower values mean more free). 
+ +static void atiWaitFifo(AtiPrivateT *priv, int32_t entries) { + uint32_t mask = ATI_FIFO_STAT_MASK >> entries; + + for (int32_t i = 0; i < ATI_MAX_IDLE_WAIT; i++) { + if (!(atiReadReg(priv, ATI_FIFO_STAT) & mask)) { + return; + } + } +} + + +// ============================================================ +// atiWaitIdle +// ============================================================ + +static void atiWaitIdle(AccelDriverT *drv) { + AtiPrivateT *priv = (AtiPrivateT *)drv->privData; + + // First wait for FIFO to drain + atiWaitFifo(priv, 16); + + // Then wait for engine idle + for (int32_t i = 0; i < ATI_MAX_IDLE_WAIT; i++) { + if (!(atiReadReg(priv, ATI_GUI_STAT_MMIO) & ATI_GUI_STAT_BUSY)) { + return; + } + } +} diff --git a/banshee.c b/banshee.c new file mode 100644 index 0000000..a5aa8bd --- /dev/null +++ b/banshee.c @@ -0,0 +1,715 @@ +// banshee.c -- 3dfx Banshee/Voodoo3 accelerated video driver +// +// Supports the 3dfx Banshee and Voodoo3 2D/3D accelerators. +// The Banshee was 3dfx's first 2D/3D combo chip, and the Voodoo3 +// improved on it with higher clock speeds. Both share the same +// 2D register interface: +// - Hardware rectangle fill +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host blit via launch area) +// - Monochrome color expansion (host blit with mono source) +// - Bresenham line draw +// - Hardware clip rectangle +// - 64x64 hardware cursor +// +// Register access: +// BAR0 maps the 32KB MMIO register block. The 2D engine +// registers live at offsets 0x200-0x270 within this block. +// The status register at 0x100 provides engine busy state. +// +// For host-to-screen operations, pixel data is fed through the +// "launch area" -- a write-combining window at MMIO physical +// address + 0x80000. Data is written as 32-bit dwords. +// +// BAR1 maps the linear framebuffer. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// 3dfx vendor/device IDs +// ============================================================ + +#define TDFX_VENDOR_ID 0x121A + +#define TDFX_BANSHEE 0x0003 +#define TDFX_VOODOO3 0x0005 + +static const uint16_t sBansheeDeviceIds[] = { + TDFX_VENDOR_ID, TDFX_BANSHEE, + TDFX_VENDOR_ID, TDFX_VOODOO3, + 0, 0 +}; + +// ============================================================ +// 2D engine register offsets (from MMIO base) +// ============================================================ + +#define BAN_STATUS 0x100 // bits 0-10: busy when any set +#define BAN_INTRCTRL 0x108 // interrupt control + +#define BAN_CLIP0MIN 0x200 // clip rect 0 min (X | Y<<16) +#define BAN_CLIP0MAX 0x204 // clip rect 0 max (X | Y<<16) +#define BAN_DSTBASEADDR 0x208 // destination base address +#define BAN_DSTFORMAT 0x20C // pitch<<16 | bpp encoding +#define BAN_SRCCKMIN 0x210 // source color key min +#define BAN_SRCCKMAX 0x214 // source color key max +#define BAN_DSTCKMIN 0x218 // dest color key min +#define BAN_DSTCKMAX 0x21C // dest color key max +#define BAN_BRESERROR0 0x220 // Bresenham error 0 +#define BAN_BRESERROR1 0x224 // Bresenham error 1 +#define BAN_ROP 0x230 // raster operation (bits 7:0) +#define BAN_SRCBASEADDR 0x234 // source base address +#define BAN_COMMANDEXTRA 0x238 // command extra +#define BAN_LINESTIPPLE 0x23C // line stipple +#define BAN_LINESTYLE 0x240 // line style +#define BAN_PATTERN0 0x244 // pattern alias 0 +#define BAN_PATTERN1 0x248 // pattern alias 1 +#define BAN_CLIP1MIN 0x24C // clip rect 1 min +#define BAN_CLIP1MAX 0x250 // clip rect 1 max +#define BAN_SRCFORMAT 0x254 // pitch<<16 | bpp encoding +#define BAN_SRCSIZE 0x258 // width | height<<16 +#define BAN_SRCXY 0x25C // X | Y<<16 +#define BAN_COLORBACK 0x260 // background color +#define BAN_COLORFORE 0x264 // foreground 
color +#define BAN_DSTSIZE 0x268 // width | height<<16 +#define BAN_DSTXY 0x26C // X | Y<<16 +#define BAN_COMMAND 0x270 // command (triggers operation) + +// ============================================================ +// Command register encoding +// ============================================================ + +// Command types (bits 3:0) +#define BAN_CMD_NOP 0x00 +#define BAN_CMD_S2S_BLIT 0x01 // screen-to-screen blit +#define BAN_CMD_S2S_STRETCH 0x02 // screen-to-screen stretch blit +#define BAN_CMD_H2S_BLIT 0x03 // host-to-screen blit +#define BAN_CMD_RECTFILL 0x05 // rectangle fill +#define BAN_CMD_LINEDRAW 0x06 // line draw +#define BAN_CMD_POLYLINE 0x07 // polyline + +// Command flags +#define BAN_CMD_INITIATE (1 << 4) // must be set to start operation +#define BAN_CMD_STIPPLE (1 << 8) // stipple line +#define BAN_CMD_CLIPSEL1 (1 << 9) // use clip1 instead of clip0 +#define BAN_CMD_SRCCKENA (1 << 12) // source color key enable +#define BAN_CMD_DSTCKENA (1 << 13) // dest color key enable +#define BAN_CMD_MONOPAT (1 << 14) // mono pattern +#define BAN_CMD_SRCMONO (1 << 15) // source is monochrome + +// ============================================================ +// BPP format encodings (for srcFormat/dstFormat low bits) +// ============================================================ + +#define BAN_FMT_8BPP 1 +#define BAN_FMT_16BPP 3 +#define BAN_FMT_32BPP 5 + +// ============================================================ +// Status register +// ============================================================ + +#define BAN_STATUS_BUSY_MASK 0x7FF // bits 0-10: engine busy + +// ============================================================ +// Hardware cursor registers +// ============================================================ + +#define BAN_VIDPROCCFG 0x5C // bit 27 = cursor enable +#define BAN_CURSORLOC 0x60 // X | Y<<16 + +#define BAN_CURSOR_ENABLE (1 << 27) + +// ============================================================ +// Launch area +// 
// ============================================================

#define BAN_LAUNCH_OFFSET       0x80000     // offset from MMIO phys base
#define BAN_LAUNCH_MAP_SIZE     4096        // map 4KB of launch area

// ============================================================
// Misc constants
// ============================================================

#define BAN_MMIO_SIZE           32768       // BAR0: 32KB MMIO
#define BAN_MAX_IDLE_WAIT       1000000     // upper bound on status polls in bansheeWaitIdle
#define BAN_ROP_COPY            0xCC        // value written to BAN_ROP by every drawing routine
#define BAN_HW_CURSOR_SIZE      64          // cursor rows processed by bansheeSetCursor

// ============================================================
// Private driver state
// ============================================================
//
// One instance exists (sBansheePrivate); it is filled in by bansheeInit.

typedef struct {
    uint32_t            lfbPhysAddr;    // BAR1 with the low 4 bits cleared
    uint32_t            mmioPhysAddr;   // BAR0 with the low 4 bits cleared
    uint32_t            vramSize;       // result of pciSizeBar() on BAR1
    int32_t             bytesPerPixel;  // (bpp + 7) / 8 of the active mode
    int32_t             screenPitch;    // pitch reported for the active VESA mode
    uint32_t            bppFormat;      // bppToFormat() of the active mode's bpp
    volatile uint32_t  *mmio;           // mmioMap.ptr viewed as dwords
    volatile uint32_t  *launch;         // launchMap.ptr viewed as dwords
    DpmiMappingT        mmioMap;        // mapping of the register block
    DpmiMappingT        lfbMap;         // mapping of the linear framebuffer
    DpmiMappingT        launchMap;      // mapping of the launch area
} BansheePrivateT;

// ============================================================
// Prototypes
// ============================================================

static void bansheeBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h);
static void bansheeColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg);
static bool bansheeDetect(AccelDriverT *drv);
static void bansheeHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h);
static bool bansheeInit(AccelDriverT *drv, const AccelModeRequestT *req);
static void bansheeLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color);
static void bansheeMoveCursor(AccelDriverT *drv, int32_t x, int32_t y);
static void bansheeRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color);
static void bansheeRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg);
static void bansheeSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h);
static void bansheeSetCursor(AccelDriverT *drv, const HwCursorImageT *image);
static void bansheeShowCursor(AccelDriverT *drv, bool visible);
static void bansheeShutdown(AccelDriverT *drv);
static void bansheeWaitIdle(AccelDriverT *drv);
static uint32_t bppToFormat(int32_t bpp);

// Store val in the register at byte offset reg.  The offset is divided
// by 4 because the register block is addressed as an array of dwords.
static inline void bansheeWrite(BansheePrivateT *priv, uint32_t reg, uint32_t val) {
    priv->mmio[reg / 4] = val;
}

// Fetch the register at byte offset reg (same dword indexing as bansheeWrite).
static inline uint32_t bansheeRead(BansheePrivateT *priv, uint32_t reg) {
    return priv->mmio[reg / 4];
}

// ============================================================
// Driver instance
// ============================================================

static BansheePrivateT sBansheePrivate;

// caps starts at 0 and is filled in by bansheeInit; name is replaced by
// bansheeDetect once the PCI device ID is known.
static AccelDriverT sBansheeDriver = {
    .name        = "3dfx Banshee",
    .chipFamily  = "3dfx",
    .caps        = 0,
    .privData    = &sBansheePrivate,
    .detect      = bansheeDetect,
    .init        = bansheeInit,
    .shutdown    = bansheeShutdown,
    .waitIdle    = bansheeWaitIdle,
    .setClip     = bansheeSetClip,
    .rectFill    = bansheeRectFill,
    .rectFillPat = bansheeRectFillPat,
    .bitBlt      = bansheeBitBlt,
    .hostBlit    = bansheeHostBlit,
    .colorExpand = bansheeColorExpand,
    .lineDraw    = bansheeLineDraw,
    .setCursor   = bansheeSetCursor,
    .moveCursor  = bansheeMoveCursor,
    .showCursor  = bansheeShowCursor,
};

// ============================================================
// bansheeRegisterDriver
// ============================================================
//
// Hand the driver descriptor to the framework via accelRegisterDriver().

void bansheeRegisterDriver(void) {
    accelRegisterDriver(&sBansheeDriver);
}


// ============================================================
// bansheeBitBlt
// ============================================================
//
// Screen-to-screen BitBLT.
The Banshee engine handles overlapping +// regions automatically when srcXY and dstXY are set correctly -- +// the hardware determines the blit direction internally. + +static void bansheeBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_SRCBASEADDR, 0); + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_SRCSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_SRCXY, (uint32_t)srcX | ((uint32_t)srcY << 16)); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_S2S_BLIT | BAN_CMD_INITIATE); +} + + +// ============================================================ +// bansheeColorExpand +// ============================================================ +// +// Monochrome-to-color expansion using host-to-screen blit with +// the SRCMONO flag. Mono bitmap bits are expanded to fg/bg colors +// by the hardware. Data is fed as dwords through the launch area. 
+ +static void bansheeColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = (w + 7) / 8; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)bytesPerRow << 16) | BAN_FMT_8BPP); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, fg); + bansheeWrite(priv, BAN_COLORBACK, bg); + bansheeWrite(priv, BAN_SRCSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_H2S_BLIT | BAN_CMD_INITIATE | BAN_CMD_SRCMONO); + + // Feed mono data row by row through the launch area + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + priv->launch[0] = val; + } + } +} + + +// ============================================================ +// bansheeDetect +// ============================================================ + +static bool bansheeDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sBansheeDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case TDFX_BANSHEE: + drv->name = "3dfx Banshee"; + break; + case TDFX_VOODOO3: + drv->name = "3dfx Voodoo3"; + break; + default: + drv->name = "3dfx Banshee/Voodoo3"; + break; + } + + return 
true; +} + + +// ============================================================ +// bansheeHostBlit +// ============================================================ +// +// CPU-to-screen blit using host-to-screen command. Pixel data is +// fed as dwords through the launch area write-combining window. + +static void bansheeHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_SRCBASEADDR, 0); + bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)(w * priv->bytesPerPixel) << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_SRCSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_H2S_BLIT | BAN_CMD_INITIATE); + + // Feed pixel data row by row through the launch area + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + priv->launch[0] = val; + } + } +} + + +// ============================================================ +// bansheeInit +// ============================================================ + +static bool bansheeInit(AccelDriverT *drv, const AccelModeRequestT *req) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + // Read 
BARs + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + // Map MMIO control registers (32KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, BAN_MMIO_SIZE, &priv->mmioMap)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMap.ptr; + + // Map launch area (4KB at MMIO phys + 0x80000) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr + BAN_LAUNCH_OFFSET, BAN_LAUNCH_MAP_SIZE, &priv->launchMap)) { + dpmiUnmapFramebuffer(&priv->mmioMap); + return false; + } + priv->launch = (volatile uint32_t *)priv->launchMap.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + dpmiUnmapFramebuffer(&priv->launchMap); + dpmiUnmapFramebuffer(&priv->mmioMap); + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMap)) { + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->launchMap); + dpmiUnmapFramebuffer(&priv->mmioMap); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + priv->bppFormat = bppToFormat(vesa.bpp); + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMap.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Wait for engine idle before configuring + bansheeWaitIdle(drv); + + // Set default engine state + bansheeWrite(priv, BAN_SRCBASEADDR, 0); + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); 
+ bansheeWrite(priv, BAN_SRCFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COMMANDEXTRA, 0); + + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + bansheeSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// ============================================================ +// bansheeLineDraw +// ============================================================ +// +// Bresenham line draw with inclusive endpoints. The Banshee engine +// takes start/end XY coordinates directly via srcXY/dstXY registers. + +static void bansheeLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, color); + bansheeWrite(priv, BAN_SRCXY, (uint32_t)x1 | ((uint32_t)y1 << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)x2 | ((uint32_t)y2 << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_LINEDRAW | BAN_CMD_INITIATE); +} + + +// ============================================================ +// bansheeMoveCursor +// ============================================================ + +static void bansheeMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + bansheeWrite(priv, BAN_CURSORLOC, (uint32_t)x | ((uint32_t)y << 16)); +} + + +// ============================================================ +// bansheeRectFill +// ============================================================ +// +// Solid rectangle fill using 
the Banshee RECTFILL command. The +// foreground color is set, coordinates and dimensions are loaded, +// and the command register triggers the fill. + +static void bansheeRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, color); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)x | ((uint32_t)y << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_RECTFILL | BAN_CMD_INITIATE); +} + + +// ============================================================ +// bansheeRectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using the Banshee RECTFILL command with +// BAN_CMD_MONOPAT. The pattern is 8 bytes (one per row, MSB-first), +// written to pattern0Alias and pattern1Alias as two 32-bit values. +// 1-bits use the foreground color, 0-bits use the background. 
+ +static void bansheeRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Pack pattern rows 0-3 into PATTERN0 and rows 4-7 into PATTERN1 + uint32_t pat0 = (uint32_t)pattern[0] + | ((uint32_t)pattern[1] << 8) + | ((uint32_t)pattern[2] << 16) + | ((uint32_t)pattern[3] << 24); + uint32_t pat1 = (uint32_t)pattern[4] + | ((uint32_t)pattern[5] << 8) + | ((uint32_t)pattern[6] << 16) + | ((uint32_t)pattern[7] << 24); + + bansheeWaitIdle(drv); + + bansheeWrite(priv, BAN_DSTBASEADDR, 0); + bansheeWrite(priv, BAN_DSTFORMAT, ((uint32_t)priv->screenPitch << 16) | priv->bppFormat); + bansheeWrite(priv, BAN_ROP, BAN_ROP_COPY); + bansheeWrite(priv, BAN_COLORFORE, fg); + bansheeWrite(priv, BAN_COLORBACK, bg); + bansheeWrite(priv, BAN_PATTERN0, pat0); + bansheeWrite(priv, BAN_PATTERN1, pat1); + bansheeWrite(priv, BAN_DSTSIZE, (uint32_t)w | ((uint32_t)h << 16)); + bansheeWrite(priv, BAN_DSTXY, (uint32_t)x | ((uint32_t)y << 16)); + bansheeWrite(priv, BAN_COMMAND, BAN_CMD_RECTFILL | BAN_CMD_INITIATE | BAN_CMD_MONOPAT); +} + + +// ============================================================ +// bansheeSetClip +// ============================================================ + +static void bansheeSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + bansheeWrite(priv, BAN_CLIP0MIN, (uint32_t)x | ((uint32_t)y << 16)); + bansheeWrite(priv, BAN_CLIP0MAX, (uint32_t)(x + w) | ((uint32_t)(y + h) << 16)); +} + + +// ============================================================ +// bansheeSetCursor +// ============================================================ +// +// The Banshee hardware cursor is a 64x64 two-color cursor stored +// in VRAM. 
The format is 2 bits per pixel: AND plane followed by +// XOR plane, packed as 64x64 = 1024 bytes per plane. + +static void bansheeSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + if (!image) { + bansheeShowCursor(drv, false); + return; + } + + bansheeWaitIdle(drv); + + // Store cursor image at end of VRAM (1KB AND + 1KB XOR = 2KB) + uint32_t cursorOffset = priv->vramSize - 2048; + cursorOffset &= ~0x7FF; // align to 2KB + uint8_t *cursorMem = drv->mode.framebuffer + cursorOffset; + + // Write AND mask then XOR mask, each 64x64 / 8 = 512 bytes + for (int32_t row = 0; row < BAN_HW_CURSOR_SIZE; row++) { + for (int32_t byteIdx = 0; byteIdx < 8; byteIdx++) { + int32_t srcIdx = row * 8 + byteIdx; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byteIdx < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; // transparent + xorByte = 0x00; + } + + cursorMem[row * 16 + byteIdx] = andByte; + cursorMem[row * 16 + byteIdx + 8] = xorByte; + } + } +} + + +// ============================================================ +// bansheeShowCursor +// ============================================================ + +static void bansheeShowCursor(AccelDriverT *drv, bool visible) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + uint32_t vidProcCfg = bansheeRead(priv, BAN_VIDPROCCFG); + + if (visible) { + vidProcCfg |= BAN_CURSOR_ENABLE; + } else { + vidProcCfg &= ~BAN_CURSOR_ENABLE; + } + + bansheeWrite(priv, BAN_VIDPROCCFG, vidProcCfg); +} + + +// ============================================================ +// bansheeShutdown +// ============================================================ + +static void bansheeShutdown(AccelDriverT *drv) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + bansheeShowCursor(drv, false); + vgaRestoreTextMode(); + + dpmiUnmapFramebuffer(&priv->launchMap); + 
dpmiUnmapFramebuffer(&priv->lfbMap); + dpmiUnmapFramebuffer(&priv->mmioMap); + + priv->mmio = NULL; + priv->launch = NULL; +} + + +// ============================================================ +// bansheeWaitIdle +// ============================================================ +// +// Wait until the 2D engine is completely idle. Bits 0-10 of the +// status register must all be zero. + +static void bansheeWaitIdle(AccelDriverT *drv) { + BansheePrivateT *priv = (BansheePrivateT *)drv->privData; + + for (int32_t i = 0; i < BAN_MAX_IDLE_WAIT; i++) { + uint32_t stat = bansheeRead(priv, BAN_STATUS); + if (!(stat & BAN_STATUS_BUSY_MASK)) { + return; + } + } +} + + +// ============================================================ +// bppToFormat +// ============================================================ +// +// Convert bits-per-pixel to the Banshee srcFormat/dstFormat +// encoding for the low bits of those registers. + +static uint32_t bppToFormat(int32_t bpp) { + switch (bpp) { + case 8: + return BAN_FMT_8BPP; + case 15: + case 16: + return BAN_FMT_16BPP; + case 32: + return BAN_FMT_32BPP; + default: + return BAN_FMT_16BPP; + } +} diff --git a/cirrusGd54.c b/cirrusGd54.c new file mode 100644 index 0000000..cf096b4 --- /dev/null +++ b/cirrusGd54.c @@ -0,0 +1,732 @@ +// cirrusGd54.c -- Cirrus Logic GD5426/28/34/36/46/80 accelerated video driver +// +// Supports the Cirrus Logic GD54xx family of VGA controllers. These +// chips were extremely common in the early-to-mid 1990s, found in +// everything from budget desktops to laptops. +// +// The GD54xx BitBLT engine is accessed entirely through extended +// Graphics Controller (GR) registers at I/O ports 0x3CE/0x3CF. +// There is no MMIO option on the GD54xx series (unlike the later +// Laguna chips). 
The engine supports: +// - Screen-to-screen BitBLT +// - Solid rectangle fill +// - Color expansion (monochrome-to-color, for text) +// - 8x8 pattern fill +// - Transparent blit (color key) +// - Hardware cursor (32x32 on GD5426/28, 64x64 on GD5434+) +// +// Register unlock: +// Write 0x12 to SR6 (sequencer register 6) to unlock the Cirrus +// extended registers. Write 0x00 to re-lock. +// +// BLT engine registers (GR extended, indices 0x20-0x3F): +// All BLT parameters are set through the graphics controller +// index/data ports (0x3CE/0x3CF). Addresses are linear byte +// offsets into VRAM. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Cirrus vendor/device IDs +// ============================================================ + +#define CL_VENDOR_ID 0x1013 + +#define CL_GD5426 0x0000 // ISA/VLB only, no PCI ID -- detected via probe +#define CL_GD5428 0x0000 // ISA/VLB only +#define CL_GD5429 0x00A0 // shared with 5434 on some boards +#define CL_GD5434 0x00A0 +#define CL_GD5434_ALT 0x00A8 +#define CL_GD5436 0x00AC +#define CL_GD5446 0x00B8 +#define CL_GD5480 0x00BC + +static const uint16_t sCirrusDeviceIds[] = { + CL_VENDOR_ID, CL_GD5434, + CL_VENDOR_ID, CL_GD5434_ALT, + CL_VENDOR_ID, CL_GD5436, + CL_VENDOR_ID, CL_GD5446, + CL_VENDOR_ID, CL_GD5480, + 0, 0 +}; + +// ============================================================ +// Cirrus extended GR register indices for BLT engine +// ============================================================ + +#define CL_GR20_BLT_WIDTH_LO 0x20 +#define CL_GR21_BLT_WIDTH_HI 0x21 +#define CL_GR22_BLT_HEIGHT_LO 0x22 +#define CL_GR23_BLT_HEIGHT_HI 0x23 +#define CL_GR24_BLT_DST_PITCH_LO 0x24 +#define CL_GR25_BLT_DST_PITCH_HI 0x25 +#define CL_GR26_BLT_SRC_PITCH_LO 0x26 +#define CL_GR27_BLT_SRC_PITCH_HI 0x27 +#define CL_GR28_BLT_DST_ADDR_LO 0x28 +#define CL_GR29_BLT_DST_ADDR_MID 0x29 +#define 
CL_GR2A_BLT_DST_ADDR_HI 0x2A +#define CL_GR2C_BLT_SRC_ADDR_LO 0x2C +#define CL_GR2D_BLT_SRC_ADDR_MID 0x2D +#define CL_GR2E_BLT_SRC_ADDR_HI 0x2E +#define CL_GR30_BLT_MODE 0x30 +#define CL_GR31_BLT_STATUS 0x31 +#define CL_GR32_BLT_ROP 0x32 +#define CL_GR33_BLT_MODE_EXT 0x33 +#define CL_GR34_BLT_FGCOLOR_LO 0x34 +#define CL_GR35_BLT_FGCOLOR_HI 0x35 +#define CL_GR38_BLT_TRANS_COLOR_LO 0x38 +#define CL_GR39_BLT_TRANS_COLOR_HI 0x39 +#define CL_GR3A_BLT_TRANS_MASK_LO 0x3A +#define CL_GR3B_BLT_TRANS_MASK_HI 0x3B + +// ============================================================ +// Cirrus BLT mode bits (GR30) +// ============================================================ + +#define CL_BLT_DIR_BACKWARD 0x01 // blit direction backward +#define CL_BLT_SRC_SYSTEM 0x02 // source is system memory (CPU) +#define CL_BLT_SRC_PATTERN 0x04 // source is 8x8 pattern +#define CL_BLT_TRANSPARENT 0x08 // transparent background +#define CL_BLT_DST_SYSTEM 0x10 // destination is system memory +#define CL_BLT_COLOR_EXPAND 0x80 // monochrome color expansion + +// ============================================================ +// Cirrus BLT status bits (GR31) +// ============================================================ + +#define CL_BLT_START 0x02 // start BLT operation +#define CL_BLT_RESET 0x04 // reset BLT engine +#define CL_BLT_BUSY 0x01 // BLT engine busy (read) + +// ============================================================ +// Cirrus BLT ROP values (GR32) +// ============================================================ +// +// The Cirrus ROP encoding is different from the S3/Windows ROP +// codes. These are the Cirrus-specific values. 
+ +#define CL_ROP_COPY 0x0D // dest = source +#define CL_ROP_PAT_COPY 0x0D // dest = pattern (same as copy in fill mode) +#define CL_ROP_XOR 0x59 // dest = src XOR dest +#define CL_ROP_AND 0x05 // dest = src AND dest +#define CL_ROP_OR 0x6D // dest = src OR dest +#define CL_ROP_ZERO 0x00 // dest = 0 +#define CL_ROP_ONE 0x0B // dest = 1 + +// Cirrus sequencer unlock key +#define CL_SR6_UNLOCK 0x12 +#define CL_SR6_LOCK 0x00 + +// Hardware cursor constants +#define CL_HW_CURSOR_SIZE 64 // 64x64 on GD5434+ +#define CL_HW_CURSOR_BYTES 1024 // 64*64*2bpp / 8 = 1024 + +// Maximum wait iterations +#define CL_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + DpmiMappingT lfbMapping; + bool is5434Plus; // true for GD5434 and later (64x64 cursor) +} CirrusPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void clBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void clColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool clDetect(AccelDriverT *drv); +static void clHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool clInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void clMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void clRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void clSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void 
clSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void clShowCursor(AccelDriverT *drv, bool visible); +static void clShutdown(AccelDriverT *drv); +static void clUnlockRegs(void); +static void clWaitIdle(AccelDriverT *drv); + +// ============================================================ +// Driver instance +// ============================================================ + +static CirrusPrivateT sCirrusPrivate; + +static AccelDriverT sCirrusDriver = { + .name = "Cirrus Logic GD5434", + .chipFamily = "cirrus", + .caps = 0, + .privData = &sCirrusPrivate, + .detect = clDetect, + .init = clInit, + .shutdown = clShutdown, + .waitIdle = clWaitIdle, + .setClip = clSetClip, + .rectFill = clRectFill, + .rectFillPat = NULL, + .bitBlt = clBitBlt, + .hostBlit = clHostBlit, + .colorExpand = clColorExpand, + .lineDraw = NULL, // GD54xx has no hardware line draw + .setCursor = clSetCursor, + .moveCursor = clMoveCursor, + .showCursor = clShowCursor, +}; + +// ============================================================ +// clRegisterDriver +// ============================================================ + +void clRegisterDriver(void) { + accelRegisterDriver(&sCirrusDriver); +} + + +// ============================================================ +// clBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. The Cirrus engine uses linear VRAM +// addresses for source and destination. Direction is controlled +// by the backward bit in GR30 -- for overlapping regions where +// dst > src, we must blit backward. 
+ +static void clBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + // Calculate linear addresses + uint32_t srcAddr = srcY * pitch + srcX * bpp; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + + // Determine direction for overlapping blits + uint8_t mode = 0; + + if (dstAddr > srcAddr) { + mode |= CL_BLT_DIR_BACKWARD; + // Adjust addresses to end of blit region + srcAddr += (h - 1) * pitch + (w - 1) * bpp; + dstAddr += (h - 1) * pitch + (w - 1) * bpp; + } + + // Width in bytes minus 1 + int32_t widthBytes = w * bpp - 1; + + clWaitIdle(drv); + + // Set up BLT parameters + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR26_BLT_SRC_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR27_BLT_SRC_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + vgaGfxWrite(CL_GR2C_BLT_SRC_ADDR_LO, srcAddr & 0xFF); + vgaGfxWrite(CL_GR2D_BLT_SRC_ADDR_MID, (srcAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2E_BLT_SRC_ADDR_HI, (srcAddr >> 16) & 0x3F); + + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + vgaGfxWrite(CL_GR30_BLT_MODE, mode); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); +} + + +// ============================================================ +// clColorExpand +// ============================================================ +// +// Monochrome-to-color 
expansion. The source data is 1bpp bitmap +// in system memory, which gets transferred through the BLT engine +// with color expansion enabled. Each 1-bit becomes the foreground +// color, each 0-bit becomes the background color. +// +// The Cirrus color expand uses GR34/GR35 for the foreground color +// and the background is set by first doing a fill, or by using +// transparent mode with a pre-filled background. + +static void clColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + // First fill the destination with background color + clRectFill(drv, dstX, dstY, w, h, bg); + clWaitIdle(drv); + + // Now do a transparent color expand for the foreground + uint32_t dstAddr = dstY * pitch + dstX * bpp; + int32_t widthBytes = w * bpp - 1; + + // Set foreground color + vgaGfxWrite(CL_GR34_BLT_FGCOLOR_LO, fg & 0xFF); + vgaGfxWrite(CL_GR35_BLT_FGCOLOR_HI, (fg >> 8) & 0xFF); + + // Set up BLT parameters + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + // Source pitch for monochrome data + vgaGfxWrite(CL_GR26_BLT_SRC_PITCH_LO, srcPitch & 0xFF); + vgaGfxWrite(CL_GR27_BLT_SRC_PITCH_HI, (srcPitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + vgaGfxWrite(CL_GR30_BLT_MODE, CL_BLT_COLOR_EXPAND | 
CL_BLT_SRC_SYSTEM | CL_BLT_TRANSPARENT); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); + + // Feed monochrome data through PIX_TRANS equivalent + // On Cirrus, system-memory source data is written to the + // BLT engine via the VGA aperture at 0xA0000 (mapped via DPMI). + // Each row of monochrome data is padded to a dword boundary. + int32_t srcBytesPerRow = (w + 7) / 8; + int32_t padBytesPerRow = (srcBytesPerRow + 3) & ~3; + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t i = 0; i < padBytesPerRow; i++) { + uint8_t byte = (i < srcBytesPerRow) ? rowData[i] : 0; + outportb(0x3CF, byte); // data through GR register space + } + } +} + + +// ============================================================ +// clDetect +// ============================================================ + +static bool clDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sCirrusDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case CL_GD5434: + case CL_GD5434_ALT: + drv->name = "Cirrus Logic GD5434"; + break; + case CL_GD5436: + drv->name = "Cirrus Logic GD5436"; + break; + case CL_GD5446: + drv->name = "Cirrus Logic GD5446"; + break; + case CL_GD5480: + drv->name = "Cirrus Logic GD5480"; + break; + default: + drv->name = "Cirrus Logic GD54xx"; + break; + } + + return true; +} + + +// ============================================================ +// clHostBlit +// ============================================================ +// +// CPU-to-screen blit. Transfers pixel data from system memory to +// the framebuffer via the BLT engine with CL_BLT_SRC_SYSTEM mode. +// Source data is fed byte-by-byte through the GR data port (0x3CF), +// with each row padded to a dword (4-byte) boundary. 
+ +static void clHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + int32_t widthBytes = w * bpp - 1; + int32_t rowBytes = w * bpp; + int32_t padBytesPerRow = (rowBytes + 3) & ~3; + + clWaitIdle(drv); + + // Set up BLT parameters + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + // BLT mode: source from CPU + vgaGfxWrite(CL_GR30_BLT_MODE, CL_BLT_SRC_SYSTEM); + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); + + // Feed pixel data row by row, padded to dword boundary + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t i = 0; i < padBytesPerRow; i++) { + uint8_t byte = (i < rowBytes) ? 
rowData[i] : 0; + outportb(0x3CF, byte); + } + } +} + + +// ============================================================ +// clInit +// ============================================================ + +static bool clInit(AccelDriverT *drv, const AccelModeRequestT *req) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + priv->is5434Plus = (drv->pciDev.deviceId != CL_GD5429); + + // Get VRAM size and LFB address from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + + // Unlock Cirrus extended registers + clUnlockRegs(); + + // Detect VRAM size from SR0F if BAR sizing was unreasonable + uint8_t sr0f = vgaSeqRead(0x0F); + uint32_t ramFromSr = 0; + + switch ((sr0f >> 3) & 0x03) { + case 0: ramFromSr = 256 * 1024; break; + case 1: ramFromSr = 512 * 1024; break; + case 2: ramFromSr = 1024 * 1024; break; + case 3: ramFromSr = 2048 * 1024; break; + } + + // GD5434+ can have 4MB + if (priv->is5434Plus && (sr0f & 0x80)) { + ramFromSr = 4096 * 1024; + } + + if (priv->vramSize < 256 * 1024 || priv->vramSize > 64 * 1024 * 1024) { + priv->vramSize = ramFromSr; + } + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Re-unlock after mode set + clUnlockRegs(); + + 
// Reset BLT engine + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_RESET); + vgaGfxWrite(CL_GR31_BLT_STATUS, 0x00); + + // Set up cursor at end of VRAM + priv->cursorOffset = priv->vramSize - CL_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(CL_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_HW_CURSOR; + + return true; +} + + +// ============================================================ +// clMoveCursor +// ============================================================ +// +// Moves the hardware cursor. On Cirrus GD5434+, cursor position +// is set through sequencer extended registers SR10-SR13. + +static void clMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + vgaSeqWrite(0x10, x & 0xFF); + vgaSeqWrite(0x11, (x >> 8) & 0x07); + vgaSeqWrite(0x12, y & 0xFF); + vgaSeqWrite(0x13, (y >> 8) & 0x07); +} + + +// ============================================================ +// clRectFill +// ============================================================ +// +// Solid rectangle fill using the BLT engine. The Cirrus engine +// doesn't have a dedicated "fill" command -- instead, we set up +// a 1-pixel source and use pattern-fill mode, or we set the +// source to a single-color region. The simplest approach is to +// use the color expansion with all-ones data, but for solid fills +// the most efficient method is to use the ROP with the foreground +// color register. 
+ +static void clRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + uint32_t dstAddr = y * pitch + x * bpp; + int32_t widthBytes = w * bpp - 1; + + clWaitIdle(drv); + + // Set foreground color for fill + vgaGfxWrite(CL_GR34_BLT_FGCOLOR_LO, color & 0xFF); + vgaGfxWrite(CL_GR35_BLT_FGCOLOR_HI, (color >> 8) & 0xFF); + + vgaGfxWrite(CL_GR20_BLT_WIDTH_LO, widthBytes & 0xFF); + vgaGfxWrite(CL_GR21_BLT_WIDTH_HI, (widthBytes >> 8) & 0x1F); + + vgaGfxWrite(CL_GR22_BLT_HEIGHT_LO, (h - 1) & 0xFF); + vgaGfxWrite(CL_GR23_BLT_HEIGHT_HI, ((h - 1) >> 8) & 0x07); + + vgaGfxWrite(CL_GR24_BLT_DST_PITCH_LO, pitch & 0xFF); + vgaGfxWrite(CL_GR25_BLT_DST_PITCH_HI, (pitch >> 8) & 0x1F); + + vgaGfxWrite(CL_GR28_BLT_DST_ADDR_LO, dstAddr & 0xFF); + vgaGfxWrite(CL_GR29_BLT_DST_ADDR_MID, (dstAddr >> 8) & 0xFF); + vgaGfxWrite(CL_GR2A_BLT_DST_ADDR_HI, (dstAddr >> 16) & 0x3F); + + // Source = foreground color, color expand with all 1s + vgaGfxWrite(CL_GR32_BLT_ROP, CL_ROP_COPY); + vgaGfxWrite(CL_GR30_BLT_MODE, CL_BLT_COLOR_EXPAND | CL_BLT_SRC_SYSTEM); + + // Source pitch for monochrome data (1 byte per row of fill) + vgaGfxWrite(CL_GR26_BLT_SRC_PITCH_LO, 0); + vgaGfxWrite(CL_GR27_BLT_SRC_PITCH_HI, 0); + + // Start BLT + vgaGfxWrite(CL_GR31_BLT_STATUS, CL_BLT_START); + + // Feed all-ones data (every pixel is foreground color) + int32_t srcBytesPerRow = (w + 7) / 8; + int32_t padBytesPerRow = (srcBytesPerRow + 3) & ~3; + + for (int32_t row = 0; row < h; row++) { + for (int32_t i = 0; i < padBytesPerRow; i++) { + outportb(0x3CF, 0xFF); + } + } +} + + +// ============================================================ +// clSetClip +// ============================================================ +// +// The GD54xx BLT engine doesn't have hardware scissor registers. 
+// Clipping must be done in software by adjusting coordinates +// before issuing BLT commands. This is a no-op placeholder. + +static void clSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + (void)drv; + (void)x; + (void)y; + (void)w; + (void)h; +} + + +// ============================================================ +// clSetCursor +// ============================================================ +// +// Uploads cursor image to VRAM. Cirrus GD5434+ uses 64x64 +// 2bpp cursor stored at a 1KB-aligned VRAM address. The address +// is set via SR2D (high) and SR2C (low) in units of 256 bytes. +// Format: interleaved AND/XOR planes, 16 bytes per row +// (8 bytes AND, 8 bytes XOR). + +static void clSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + if (!image) { + clShowCursor(drv, false); + return; + } + + clWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < CL_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor address (in units of 256 bytes) + uint16_t addrUnits = priv->cursorOffset / 256; + vgaSeqWrite(0x2C, addrUnits & 0xFF); + vgaSeqWrite(0x2D, (addrUnits >> 8) & 0x3F); +} + + +// ============================================================ +// clShowCursor +// ============================================================ +// +// Enable/disable the hardware cursor via SR12 bit 0 on Cirrus. 
+ +static void clShowCursor(AccelDriverT *drv, bool visible) { + (void)drv; + + uint8_t sr12 = vgaSeqRead(0x12); + + if (visible) { + sr12 |= 0x01; + } else { + sr12 &= ~0x01; + } + + vgaSeqWrite(0x12, sr12); +} + + +// ============================================================ +// clShutdown +// ============================================================ + +static void clShutdown(AccelDriverT *drv) { + CirrusPrivateT *priv = (CirrusPrivateT *)drv->privData; + + clShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// clUnlockRegs +// ============================================================ +// +// Unlock Cirrus extended registers by writing 0x12 to SR6. + +static void clUnlockRegs(void) { + vgaSeqWrite(0x06, CL_SR6_UNLOCK); +} + + +// ============================================================ +// clWaitIdle +// ============================================================ +// +// Wait for the BLT engine to finish. Poll GR31 bit 0. + +static void clWaitIdle(AccelDriverT *drv) { + (void)drv; + + for (int32_t i = 0; i < CL_MAX_IDLE_WAIT; i++) { + if (!(vgaGfxRead(CL_GR31_BLT_STATUS) & CL_BLT_BUSY)) { + return; + } + } +} diff --git a/cirrusLaguna.c b/cirrusLaguna.c new file mode 100644 index 0000000..e60d24b --- /dev/null +++ b/cirrusLaguna.c @@ -0,0 +1,585 @@ +// cirrusLaguna.c -- Cirrus Logic Laguna GD5462/5464/5465 accelerated video driver +// +// Supports the Cirrus Logic Laguna family: GD5462, GD5464, and GD5465. +// These are MMIO-based PCI accelerators completely different from the +// older GD54xx (Alpine) series -- different register set, different +// BLT engine, and different programming model. 
+// +// The Laguna 2D engine features: +// - Solid rectangle fill +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host data window) +// - Monochrome color expansion (text/glyph rendering) +// - Hardware clip rectangle +// - 64x64 hardware cursor +// +// BAR layout: +// BAR0 = MMIO registers (4KB) +// BAR1 = linear framebuffer +// +// The 2D engine is programmed via MMIO registers starting at offset +// 0x0100. Commands are initiated by writing to the COMMAND register +// at 0x0118. Host data (for CPU-to-screen and color expand) is fed +// through a 512-byte window at MMIO + 0x0200. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Cirrus Laguna vendor/device IDs +// ============================================================ + +#define CIRRUS_VENDOR_ID 0x1013 + +#define LAGUNA_GD5462 0x00D0 +#define LAGUNA_GD5464 0x00D4 +#define LAGUNA_GD5465 0x00D6 + +static const uint16_t sLagunaDeviceIds[] = { + CIRRUS_VENDOR_ID, LAGUNA_GD5462, + CIRRUS_VENDOR_ID, LAGUNA_GD5464, + CIRRUS_VENDOR_ID, LAGUNA_GD5465, + 0, 0 +}; + +// ============================================================ +// MMIO register offsets (from BAR0) +// ============================================================ + +// 0x0000-0x00FF: VGA compatible registers (mapped) + +// 2D engine registers +#define LAG_CONTROL 0x0100 // engine control / status +#define LAG_FGCOLOR 0x0104 // foreground color +#define LAG_BGCOLOR 0x0108 // background color +#define LAG_DSTXY 0x010C // destination XY (X | Y<<16) +#define LAG_SRCXY 0x0110 // source XY (X | Y<<16) +#define LAG_DSTSIZE 0x0114 // destination size (W | H<<16) +#define LAG_COMMAND 0x0118 // command register (triggers operation) +#define LAG_PITCH 0x011C // pitch (srcPitch<<16 | dstPitch) +#define LAG_PAT0 0x0120 // 8x8 mono pattern (first 32 bits) +#define LAG_PAT1 0x0124 // 8x8 mono pattern (second 32 bits) 
+#define LAG_CLIPLT 0x0130 // clip left/top (left | top<<16) +#define LAG_CLIPRB 0x0134 // clip right/bottom (right | bottom<<16) +#define LAG_HOST_DATA 0x0200 // host data window (512 bytes) + +// Hardware cursor registers +#define LAG_CUR_CTRL 0x0300 // cursor control (bit 0 = enable) +#define LAG_CUR_X 0x0304 // cursor X position +#define LAG_CUR_Y 0x0308 // cursor Y position +#define LAG_CUR_ADDR 0x030C // cursor VRAM address + +// ============================================================ +// Status register bits +// ============================================================ + +#define LAG_STATUS_BUSY 0x01 // engine busy (bit 0 of CONTROL) + +// ============================================================ +// Command register encoding +// ============================================================ + +// Operation codes (bits 3:0) +#define LAG_CMD_NOP 0x00 +#define LAG_CMD_BITBLT 0x01 // screen-to-screen BitBlt +#define LAG_CMD_RECTFILL 0x02 // solid rectangle fill +#define LAG_CMD_HOST_BLIT 0x03 // host-to-screen blit +#define LAG_CMD_LINE 0x04 // line draw +#define LAG_CMD_COLOR_EXPAND 0x05 // mono color expansion from host + +// ROP encoding (bits 7:4) +#define LAG_CMD_ROP_SHIFT 4 + +// Direction and option bits +#define LAG_CMD_DIR_REV 0x0100 // bit 8: reverse direction +#define LAG_CMD_PAT_EN 0x0200 // bit 9: pattern enable +#define LAG_CMD_TRANS_EN 0x0400 // bit 10: transparency enable +#define LAG_CMD_COLOREXP 0x0800 // bit 11: color expand (mono source) + +// Common ROP values (shifted into bits 7:4) +#define LAG_ROP_COPY (0x0C << LAG_CMD_ROP_SHIFT) // 0xCC = dest = src +#define LAG_ROP_PAT (0x0F << LAG_CMD_ROP_SHIFT) // 0xF0 = dest = pat + +// ============================================================ +// Constants +// ============================================================ + +#define LAG_MMIO_SIZE 4096 +#define LAG_MAX_IDLE_WAIT 1000000 +#define LAG_HW_CURSOR_SIZE 64 +#define LAG_HW_CURSOR_BYTES 1024 // 64x64x2bpp / 8 = 1024 + +// 
============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; + DpmiMappingT lfbMapping; + DpmiMappingT mmioMapping; +} LagunaPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void lagBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void lagColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg); +static bool lagDetect(AccelDriverT *drv); +static void lagHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool lagInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void lagMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void lagRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void lagSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void lagSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void lagShowCursor(AccelDriverT *drv, bool visible); +static void lagShutdown(AccelDriverT *drv); +static void lagWaitIdle(AccelDriverT *drv); + +static inline void lagWrite(LagunaPrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t lagRead(LagunaPrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static LagunaPrivateT 
sLagunaPrivate; + +static AccelDriverT sLagunaDriver = { + .name = "Cirrus Logic Laguna", + .chipFamily = "cirrus-laguna", + .caps = 0, + .privData = &sLagunaPrivate, + .detect = lagDetect, + .init = lagInit, + .shutdown = lagShutdown, + .waitIdle = lagWaitIdle, + .setClip = lagSetClip, + .rectFill = lagRectFill, + .rectFillPat = NULL, + .bitBlt = lagBitBlt, + .hostBlit = lagHostBlit, + .colorExpand = lagColorExpand, + .lineDraw = NULL, + .setCursor = lagSetCursor, + .moveCursor = lagMoveCursor, + .showCursor = lagShowCursor, +}; + +// ============================================================ +// lagunaRegisterDriver +// ============================================================ + +void lagunaRegisterDriver(void) { + accelRegisterDriver(&sLagunaDriver); +} + + +// ============================================================ +// lagBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. Handles overlapping regions by +// selecting forward or reverse direction based on src/dst +// relationship. 
+ +static void lagBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + lagWaitIdle(drv); + + // Determine direction for overlapping blits + uint32_t cmd = LAG_CMD_BITBLT | LAG_ROP_COPY; + + if (dstY > srcY || (dstY == srcY && dstX > srcX)) { + // Reverse direction: start from bottom-right + cmd |= LAG_CMD_DIR_REV; + lagWrite(priv, LAG_SRCXY, (uint32_t)(srcX + w - 1) | ((uint32_t)(srcY + h - 1) << 16)); + lagWrite(priv, LAG_DSTXY, (uint32_t)(dstX + w - 1) | ((uint32_t)(dstY + h - 1) << 16)); + } else { + // Forward direction: start from top-left + lagWrite(priv, LAG_SRCXY, (uint32_t)srcX | ((uint32_t)srcY << 16)); + lagWrite(priv, LAG_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + } + + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Trigger operation + lagWrite(priv, LAG_COMMAND, cmd); +} + + +// ============================================================ +// lagColorExpand +// ============================================================ +// +// Monochrome color expansion: convert 1bpp bitmap data to +// full-color pixels using the hardware color expand engine. +// Set foreground/background colors, then feed mono data +// through the host data window at MMIO + 0x0200. 
+ +static void lagColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = (w + 7) / 8; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + lagWaitIdle(drv); + + lagWrite(priv, LAG_FGCOLOR, fg); + lagWrite(priv, LAG_BGCOLOR, bg); + lagWrite(priv, LAG_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Start color expand operation + lagWrite(priv, LAG_COMMAND, LAG_CMD_COLOR_EXPAND | LAG_ROP_COPY | LAG_CMD_COLOREXP); + + // Feed mono data row by row through host data window + volatile uint32_t *hostWin = (volatile uint32_t *)((volatile uint8_t *)priv->mmio + LAG_HOST_DATA); + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + lagWaitIdle(drv); + hostWin[0] = val; + } + } +} + + +// ============================================================ +// lagDetect +// ============================================================ + +static bool lagDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sLagunaDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case LAGUNA_GD5462: + drv->name = "Cirrus Logic Laguna GD5462"; + break; + case LAGUNA_GD5464: + drv->name = "Cirrus Logic Laguna GD5464"; + break; + case LAGUNA_GD5465: + drv->name = "Cirrus Logic Laguna GD5465"; + break; + default: + drv->name = "Cirrus Logic Laguna"; + break; + } + + return true; 
+} + + +// ============================================================ +// lagHostBlit +// ============================================================ +// +// CPU-to-screen blit: transfer pixel data from system RAM to +// VRAM through the host data window at MMIO + 0x0200. Each +// row is padded to a dword boundary. + +static void lagHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + lagWaitIdle(drv); + + lagWrite(priv, LAG_DSTXY, (uint32_t)dstX | ((uint32_t)dstY << 16)); + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Start host-to-screen blit + lagWrite(priv, LAG_COMMAND, LAG_CMD_HOST_BLIT | LAG_ROP_COPY); + + // Feed pixel data row by row through host data window + volatile uint32_t *hostWin = (volatile uint32_t *)((volatile uint8_t *)priv->mmio + LAG_HOST_DATA); + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + lagWaitIdle(drv); + hostWin[0] = val; + } + } +} + + +// ============================================================ +// lagInit +// ============================================================ + +static bool lagInit(AccelDriverT *drv, const AccelModeRequestT *req) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + // Read BARs from PCI config space + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = 
pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + // Map MMIO control registers (4KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, LAG_MMIO_SIZE, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Wait for engine idle before configuring + lagWaitIdle(drv); + + // Set up hardware cursor at end of VRAM + priv->cursorOffset = priv->vramSize - LAG_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(LAG_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Set full-screen clip rectangle + lagSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// ============================================================ +// lagMoveCursor +// ============================================================ + +static void lagMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + lagWrite(priv, LAG_CUR_X, 
(uint32_t)x); + lagWrite(priv, LAG_CUR_Y, (uint32_t)y); +} + + +// ============================================================ +// lagRectFill +// ============================================================ +// +// Solid rectangle fill using command 0x02. Sets the foreground +// color, destination position, and size, then triggers the fill. + +static void lagRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + lagWaitIdle(drv); + + lagWrite(priv, LAG_FGCOLOR, color); + lagWrite(priv, LAG_DSTXY, (uint32_t)x | ((uint32_t)y << 16)); + lagWrite(priv, LAG_DSTSIZE, (uint32_t)(w - 1) | ((uint32_t)(h - 1) << 16)); + lagWrite(priv, LAG_PITCH, ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch); + + // Trigger solid fill + lagWrite(priv, LAG_COMMAND, LAG_CMD_RECTFILL | LAG_ROP_COPY); +} + + +// ============================================================ +// lagSetClip +// ============================================================ + +static void lagSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + lagWrite(priv, LAG_CLIPLT, (uint32_t)x | ((uint32_t)y << 16)); + lagWrite(priv, LAG_CLIPRB, (uint32_t)(x + w - 1) | ((uint32_t)(y + h - 1) << 16)); +} + + +// ============================================================ +// lagSetCursor +// ============================================================ +// +// Upload a hardware cursor image to VRAM at the cursor offset. +// The Laguna uses a 64x64 2bpp AND/XOR format stored in VRAM. 
+ +static void lagSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + if (!image) { + lagShowCursor(drv, false); + return; + } + + lagWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < LAG_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; // transparent + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor VRAM address + lagWrite(priv, LAG_CUR_ADDR, priv->cursorOffset); +} + + +// ============================================================ +// lagShowCursor +// ============================================================ + +static void lagShowCursor(AccelDriverT *drv, bool visible) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + uint32_t ctrl = lagRead(priv, LAG_CUR_CTRL); + + if (visible) { + ctrl |= 0x01; + } else { + ctrl &= ~0x01; + } + + lagWrite(priv, LAG_CUR_CTRL, ctrl); +} + + +// ============================================================ +// lagShutdown +// ============================================================ + +static void lagShutdown(AccelDriverT *drv) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + lagShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->mmioMapping); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// lagWaitIdle +// ============================================================ +// +// Poll the CONTROL register until bit 0 (engine busy) clears. +// Bounded by LAG_MAX_IDLE_WAIT iterations to avoid hangs on +// hardware failure. 
+ +static void lagWaitIdle(AccelDriverT *drv) { + LagunaPrivateT *priv = (LagunaPrivateT *)drv->privData; + + for (int32_t i = 0; i < LAG_MAX_IDLE_WAIT; i++) { + uint32_t stat = lagRead(priv, LAG_CONTROL); + if (!(stat & LAG_STATUS_BUSY)) { + return; + } + } +} diff --git a/demo.c b/demo.c new file mode 100644 index 0000000..3d9c2c0 --- /dev/null +++ b/demo.c @@ -0,0 +1,869 @@ +// demo.c -- Test/demo application for accelerated video drivers +// +// Detects the video card, sets a graphics mode, exercises the +// hardware acceleration (fill rects, blit, draw lines, color +// expand), and provides a simple interactive benchmark comparing +// hardware vs software rendering speed. +// +// Usage: demo [width height bpp] +// Defaults to 640x480x16 if no arguments given. +// +// Press ESC to exit, 'b' to run benchmark, space to cycle tests. + +#include "accelVid.h" +#include "pci.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// Scancode for ESC key +#define KEY_ESC 0x01 + +// Default video mode +#define DEFAULT_WIDTH 640 +#define DEFAULT_HEIGHT 480 +#define DEFAULT_BPP 16 + +// Benchmark iteration counts +#define BENCH_FILL_COUNT 1000 +#define BENCH_BLIT_COUNT 1000 +#define BENCH_LINE_COUNT 5000 +#define BENCH_EXPAND_COUNT 500 +#define BENCH_HBLIT_COUNT 1000 +#define BENCH_PATFILL_COUNT 1000 + +// Host blit test pattern dimensions +#define HBLIT_PAT_W 100 +#define HBLIT_PAT_H 100 + +// ============================================================ +// External driver registration functions +// ============================================================ + +extern void atiRegisterDriver(void); +extern void bansheeRegisterDriver(void); +extern void clRegisterDriver(void); +extern void etRegisterDriver(void); +extern void lagunaRegisterDriver(void); +extern void mgaRegisterDriver(void); +extern void nvRegisterDriver(void); +extern void s3RegisterDriver(void); +extern void sisRegisterDriver(void); +extern void 
tridentRegisterDriver(void); + +// ============================================================ +// Prototypes +// ============================================================ + +static void demoBenchmark(AccelDriverT *drv); +static void demoBitBlt(AccelDriverT *drv); +static void demoColorExpand(AccelDriverT *drv); +static void demoFillRects(AccelDriverT *drv); +static void demoHostBlit(AccelDriverT *drv); +static void demoLines(AccelDriverT *drv); +static void demoPatternFill(AccelDriverT *drv); +static bool isKeyPressed(void); +static uint32_t packColor16(uint8_t r, uint8_t g, uint8_t b); +static uint8_t readKey(void); +static void softFillRect(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); + +// ============================================================ +// demoBenchmark +// ============================================================ +// +// Runs timed comparisons of hardware vs software rendering for +// rectangle fills and blits. Prints results to stdout after +// restoring text mode. 
+ +static void demoBenchmark(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Benchmark hardware rect fill + clock_t hwFillStart = clock(); + + for (int32_t i = 0; i < BENCH_FILL_COUNT; i++) { + int32_t x = (i * 37) % (screenW - 100); + int32_t y = (i * 53) % (screenH - 100); + drv->rectFill(drv, x, y, 100, 100, packColor16(i & 0xFF, (i >> 3) & 0xFF, (i >> 6) & 0xFF)); + } + + drv->waitIdle(drv); + clock_t hwFillEnd = clock(); + + // Benchmark software rect fill + clock_t swFillStart = clock(); + + for (int32_t i = 0; i < BENCH_FILL_COUNT; i++) { + int32_t x = (i * 37) % (screenW - 100); + int32_t y = (i * 53) % (screenH - 100); + softFillRect(drv, x, y, 100, 100, packColor16(i & 0xFF, (i >> 3) & 0xFF, (i >> 6) & 0xFF)); + } + + clock_t swFillEnd = clock(); + + // Benchmark hardware bitblt + clock_t hwBltStart = clock(); + + for (int32_t i = 0; i < BENCH_BLIT_COUNT; i++) { + int32_t sx = (i * 31) % (screenW - 100); + int32_t sy = (i * 47) % (screenH - 100); + int32_t dx = (i * 43) % (screenW - 100); + int32_t dy = (i * 59) % (screenH - 100); + drv->bitBlt(drv, sx, sy, dx, dy, 100, 100); + } + + drv->waitIdle(drv); + clock_t hwBltEnd = clock(); + + // Benchmark hardware line draw + clock_t hwLineStart = clock(); + + for (int32_t i = 0; i < BENCH_LINE_COUNT; i++) { + int32_t x1 = (i * 37) % screenW; + int32_t y1 = (i * 53) % screenH; + int32_t x2 = (i * 71) % screenW; + int32_t y2 = (i * 89) % screenH; + drv->lineDraw(drv, x1, y1, x2, y2, packColor16(255, 255, 255)); + } + + drv->waitIdle(drv); + clock_t hwLineEnd = clock(); + + // Benchmark host blit (CPU-to-screen) + int32_t bytesPerPix = (drv->mode.bpp + 7) / 8; + int32_t hblitPitch = HBLIT_PAT_W * bytesPerPix; + uint8_t *hblitBuf = (uint8_t *)malloc(hblitPitch * HBLIT_PAT_H); + clock_t hwHblitEnd = 0; + clock_t hwHblitStart = 0; + bool hblitValid = false; + + if (hblitBuf) { + // Fill buffer with a checkerboard pattern + for (int32_t row = 0; row < 
HBLIT_PAT_H; row++) { + for (int32_t col = 0; col < HBLIT_PAT_W; col++) { + uint32_t color; + + if ((row / 8 + col / 8) & 1) { + color = packColor16(255, 255, 0); + } else { + color = packColor16(0, 0, 128); + } + + if (bytesPerPix == 2) { + ((uint16_t *)(hblitBuf + row * hblitPitch))[col] = (uint16_t)color; + } else if (bytesPerPix == 4) { + ((uint32_t *)(hblitBuf + row * hblitPitch))[col] = color; + } else { + hblitBuf[row * hblitPitch + col] = (uint8_t)color; + } + } + } + + hwHblitStart = clock(); + + for (int32_t i = 0; i < BENCH_HBLIT_COUNT; i++) { + int32_t dx = (i * 37) % (screenW - HBLIT_PAT_W); + int32_t dy = (i * 53) % (screenH - HBLIT_PAT_H); + drv->hostBlit(drv, hblitBuf, hblitPitch, dx, dy, HBLIT_PAT_W, HBLIT_PAT_H); + } + + drv->waitIdle(drv); + hwHblitEnd = clock(); + hblitValid = true; + free(hblitBuf); + } + + // Benchmark pattern fill + static const uint8_t benchPattern[8] = { + 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + + clock_t hwPatStart = clock(); + + for (int32_t i = 0; i < BENCH_PATFILL_COUNT; i++) { + int32_t px = (i * 37) % (screenW - 100); + int32_t py = (i * 53) % (screenH - 100); + drv->rectFillPat(drv, px, py, 100, 100, benchPattern, packColor16(255, 255, 255), packColor16(0, 0, 0)); + } + + drv->waitIdle(drv); + clock_t hwPatEnd = clock(); + + // Calculate times in milliseconds + double hwFillMs = (double)(hwFillEnd - hwFillStart) * 1000.0 / CLOCKS_PER_SEC; + double swFillMs = (double)(swFillEnd - swFillStart) * 1000.0 / CLOCKS_PER_SEC; + double hwBltMs = (double)(hwBltEnd - hwBltStart) * 1000.0 / CLOCKS_PER_SEC; + double hwLineMs = (double)(hwLineEnd - hwLineStart) * 1000.0 / CLOCKS_PER_SEC; + double hwHblitMs = (double)(hwHblitEnd - hwHblitStart) * 1000.0 / CLOCKS_PER_SEC; + double hwPatMs = (double)(hwPatEnd - hwPatStart) * 1000.0 / CLOCKS_PER_SEC; + + // Store results, then restore text mode to print + accelShutdown(drv); + + printf("\n=== Benchmark Results ===\n\n"); + + printf("Rectangle Fill (%d x 100x100):\n", 
BENCH_FILL_COUNT); + printf(" Hardware: %.1f ms (%.0f rects/sec)\n", + hwFillMs, BENCH_FILL_COUNT * 1000.0 / hwFillMs); + printf(" Software: %.1f ms (%.0f rects/sec)\n", + swFillMs, BENCH_FILL_COUNT * 1000.0 / swFillMs); + if (swFillMs > 0) { + printf(" Speedup: %.1fx\n", swFillMs / hwFillMs); + } + + printf("\nBitBlt (%d x 100x100 screen-to-screen):\n", BENCH_BLIT_COUNT); + printf(" Hardware: %.1f ms (%.0f blits/sec)\n", + hwBltMs, BENCH_BLIT_COUNT * 1000.0 / hwBltMs); + + printf("\nLine Draw (%d lines):\n", BENCH_LINE_COUNT); + printf(" Hardware: %.1f ms (%.0f lines/sec)\n", + hwLineMs, BENCH_LINE_COUNT * 1000.0 / hwLineMs); + + if (hblitValid) { + printf("\nHost Blit (%d x %dx%d CPU-to-screen):\n", + BENCH_HBLIT_COUNT, HBLIT_PAT_W, HBLIT_PAT_H); + printf(" Hardware: %.1f ms (%.0f blits/sec)\n", + hwHblitMs, BENCH_HBLIT_COUNT * 1000.0 / hwHblitMs); + } + + printf("\nPattern Fill (%d x 100x100):\n", BENCH_PATFILL_COUNT); + printf(" Hardware: %.1f ms (%.0f fills/sec)\n", + hwPatMs, BENCH_PATFILL_COUNT * 1000.0 / hwPatMs); + + printf("\nPress any key to exit...\n"); + readKey(); +} + + +// ============================================================ +// demoBitBlt +// ============================================================ +// +// Demonstrates screen-to-screen BitBLT by filling colored +// rectangles and then copying them around the screen. 
+ +static void demoBitBlt(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, 0); + + // Draw some source rectangles + drv->rectFill(drv, 10, 10, 100, 100, packColor16(255, 0, 0)); + drv->rectFill(drv, 120, 10, 100, 100, packColor16(0, 255, 0)); + drv->rectFill(drv, 230, 10, 100, 100, packColor16(0, 0, 255)); + drv->rectFill(drv, 340, 10, 100, 100, packColor16(255, 255, 0)); + drv->waitIdle(drv); + + // Copy them diagonally across the screen + for (int32_t i = 0; i < 5; i++) { + int32_t offsetY = 120 + i * 60; + + if (offsetY + 100 > screenH) { + break; + } + + drv->bitBlt(drv, 10, 10, 10 + i * 30, offsetY, 430, 100); + } + + drv->waitIdle(drv); +} + + +// ============================================================ +// demoColorExpand +// ============================================================ +// +// Demonstrates monochrome color expansion by rendering text-like +// patterns. Creates a simple 8x16 glyph and renders it repeatedly. 
+ +static void demoColorExpand(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, packColor16(0, 0, 128)); + drv->waitIdle(drv); + + // 8x16 glyph bitmaps for several characters + static const uint8_t glyphA[16] = { + 0x00, 0x18, 0x3C, 0x66, 0x66, 0xC3, 0xC3, 0xFF, + 0xFF, 0xC3, 0xC3, 0xC3, 0xC3, 0xC3, 0x00, 0x00 + }; + + static const uint8_t glyphB[16] = { + 0x00, 0xFC, 0xC6, 0xC6, 0xC6, 0xFC, 0xC6, 0xC3, + 0xC3, 0xC3, 0xC6, 0xFC, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphC[16] = { + 0x00, 0x3E, 0x63, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, + 0xC0, 0xC0, 0x63, 0x3E, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphD[16] = { + 0x00, 0xFC, 0xC6, 0xC3, 0xC3, 0xC3, 0xC3, 0xC3, + 0xC3, 0xC3, 0xC6, 0xFC, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphE[16] = { + 0x00, 0xFF, 0xC0, 0xC0, 0xC0, 0xFE, 0xC0, 0xC0, + 0xC0, 0xC0, 0xC0, 0xFF, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t glyphF[16] = { + 0x00, 0xFF, 0xC0, 0xC0, 0xC0, 0xFE, 0xC0, 0xC0, + 0xC0, 0xC0, 0xC0, 0xC0, 0x00, 0x00, 0x00, 0x00 + }; + + static const uint8_t *glyphs[6] = { + glyphA, glyphB, glyphC, glyphD, glyphE, glyphF + }; + + #define NUM_GLYPHS 6 + + // Color pairs for different rows (foreground/background) + static const uint8_t colorPairs[][6] = { + // R G B R G B (fg, then bg) + {255, 255, 255, 0, 0, 128}, // white on dark blue + {255, 255, 0, 0, 0, 0}, // yellow on black + { 0, 255, 0, 0, 64, 0}, // green on dark green + {255, 128, 0, 64, 0, 0}, // orange on dark red + { 0, 255, 255, 0, 0, 64}, // cyan on navy + {255, 0, 255, 32, 0, 32}, // magenta on dark purple + }; + + #define NUM_COLOR_PAIRS 6 + + int32_t cols = screenW / 8; + int32_t rows = screenH / 16; + + for (int32_t row = 0; row < rows; row++) { + int32_t pairIdx = row % NUM_COLOR_PAIRS; + const uint8_t *pair = colorPairs[pairIdx]; + uint32_t fg = packColor16(pair[0], pair[1], pair[2]); + uint32_t 
bg = packColor16(pair[3], pair[4], pair[5]); + + for (int32_t col = 0; col < cols; col++) { + const uint8_t *glyph = glyphs[(row + col) % NUM_GLYPHS]; + drv->colorExpand(drv, glyph, 1, + col * 8, row * 16, 8, 16, fg, bg); + } + } + + drv->waitIdle(drv); + + #undef NUM_GLYPHS + #undef NUM_COLOR_PAIRS +} + + +// ============================================================ +// demoFillRects +// ============================================================ +// +// Demonstrates hardware rectangle fill with various colors +// and sizes. Draws a pattern of overlapping rectangles. + +static void demoFillRects(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen to dark blue + drv->rectFill(drv, 0, 0, screenW, screenH, packColor16(0, 0, 64)); + drv->waitIdle(drv); + + // Draw concentric rectangles + int32_t colors[][3] = { + {255, 0, 0}, + {0, 255, 0}, + {0, 0, 255}, + {255, 255, 0}, + {255, 0, 255}, + {0, 255, 255}, + {255, 128, 0}, + {128, 0, 255} + }; + int32_t numColors = 8; + + int32_t cx = screenW / 2; + int32_t cy = screenH / 2; + + for (int32_t i = 0; i < numColors; i++) { + int32_t size = 200 - i * 20; + if (size < 10) { + break; + } + + uint32_t color = packColor16(colors[i][0], colors[i][1], colors[i][2]); + drv->rectFill(drv, cx - size / 2, cy - size / 2, size, size, color); + } + + // Draw a grid of small rectangles + for (int32_t y = 10; y < screenH - 30; y += 25) { + for (int32_t x = 10; x < 150; x += 25) { + uint32_t color = packColor16((x * 7) & 0xFF, (y * 3) & 0xFF, ((x + y) * 5) & 0xFF); + drv->rectFill(drv, x, y, 20, 20, color); + } + } + + // Draw grid on right side too + for (int32_t y = 10; y < screenH - 30; y += 25) { + for (int32_t x = screenW - 160; x < screenW - 10; x += 25) { + uint32_t color = packColor16((x * 3) & 0xFF, (y * 7) & 0xFF, ((x + y) * 2) & 0xFF); + drv->rectFill(drv, x, y, 20, 20, color); + } + } + + drv->waitIdle(drv); +} + + +// 
============================================================ +// demoHostBlit +// ============================================================ +// +// Demonstrates CPU-to-screen blit by creating a colorful gradient +// pattern in system RAM, then tiling copies across the screen. + +static void demoHostBlit(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, 0); + drv->waitIdle(drv); + + // Create a gradient tile in system RAM + int32_t tileW = 64; + int32_t tileH = 64; + int32_t bytesPerPix = (drv->mode.bpp + 7) / 8; + int32_t tilePitch = tileW * bytesPerPix; + uint8_t *tileBuf = (uint8_t *)malloc(tilePitch * tileH); + + if (!tileBuf) { + return; + } + + // Fill tile with a radial gradient pattern + int32_t cx = tileW / 2; + int32_t cy = tileH / 2; + + for (int32_t row = 0; row < tileH; row++) { + for (int32_t col = 0; col < tileW; col++) { + int32_t dx = col - cx; + int32_t dy = row - cy; + int32_t dist = dx * dx + dy * dy; + + // Map distance to color -- creates concentric rings + uint8_t r = (dist * 7) & 0xFF; + uint8_t g = (dist * 3 + col * 4) & 0xFF; + uint8_t b = (row * 8 + col * 2) & 0xFF; + uint32_t color = packColor16(r, g, b); + + if (bytesPerPix == 2) { + ((uint16_t *)(tileBuf + row * tilePitch))[col] = (uint16_t)color; + } else if (bytesPerPix == 4) { + ((uint32_t *)(tileBuf + row * tilePitch))[col] = color; + } else { + tileBuf[row * tilePitch + col] = (uint8_t)color; + } + } + } + + // Tile the pattern across the screen + for (int32_t y = 0; y + tileH <= screenH; y += tileH) { + for (int32_t x = 0; x + tileW <= screenW; x += tileW) { + drv->hostBlit(drv, tileBuf, tilePitch, x, y, tileW, tileH); + } + } + + drv->waitIdle(drv); + + // Draw a border around each tile using rect fills for contrast + uint32_t borderColor = packColor16(255, 255, 255); + + for (int32_t y = 0; y + tileH <= screenH; y += tileH) { + drv->rectFill(drv, 0, y, screenW, 1, 
borderColor); + } + + for (int32_t x = 0; x + tileW <= screenW; x += tileW) { + drv->rectFill(drv, x, 0, 1, screenH, borderColor); + } + + drv->waitIdle(drv); + free(tileBuf); +} + + +// ============================================================ +// demoLines +// ============================================================ +// +// Demonstrates hardware line drawing with a starburst pattern. + +static void demoLines(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen + drv->rectFill(drv, 0, 0, screenW, screenH, 0); + drv->waitIdle(drv); + + int32_t cx = screenW / 2; + int32_t cy = screenH / 2; + + // Draw starburst from center + for (int32_t i = 0; i < 360; i += 3) { + // Simple integer approximation of sin/cos using a lookup + // approach. For a demo, we just use the endpoint calculation. + int32_t dx = 0; + int32_t dy = 0; + + // Approximate angle -> direction + int32_t radius = (screenH / 2) - 10; + int32_t angle = i; + + // Crude trig via quadrant decomposition + int32_t quadrant = (angle / 90) % 4; + int32_t subAngle = angle % 90; + + // Linear interpolation within each quadrant (good enough for demo) + int32_t frac = subAngle * radius / 90; + int32_t comp = radius - frac; + + switch (quadrant) { + case 0: dx = frac; dy = -comp; break; + case 1: dx = comp; dy = frac; break; + case 2: dx = -frac; dy = comp; break; + case 3: dx = -comp; dy = -frac; break; + } + + uint32_t color = packColor16( + (i * 3) & 0xFF, + (i * 5 + 100) & 0xFF, + (i * 7 + 50) & 0xFF + ); + + drv->lineDraw(drv, cx, cy, cx + dx, cy + dy, color); + } + + // Draw border rectangle with lines + uint32_t white = packColor16(255, 255, 255); + drv->lineDraw(drv, 0, 0, screenW - 1, 0, white); + drv->lineDraw(drv, screenW - 1, 0, screenW - 1, screenH - 1, white); + drv->lineDraw(drv, screenW - 1, screenH - 1, 0, screenH - 1, white); + drv->lineDraw(drv, 0, screenH - 1, 0, 0, white); + + drv->waitIdle(drv); +} + + +// 
============================================================ +// demoPatternFill +// ============================================================ +// +// Demonstrates 8x8 pattern fills with several distinct patterns +// drawn side by side in colored rectangles. + +static void demoPatternFill(AccelDriverT *drv) { + int32_t screenW = drv->mode.width; + int32_t screenH = drv->mode.height; + + // Clear screen to dark gray + drv->rectFill(drv, 0, 0, screenW, screenH, packColor16(32, 32, 32)); + drv->waitIdle(drv); + + // Define several 8x8 patterns + static const uint8_t patCheckerboard[8] = { + 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + + static const uint8_t patCrosshatch[8] = { + 0xFF, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + }; + + static const uint8_t patDiagStripes[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 + }; + + static const uint8_t patDots[8] = { + 0x00, 0x22, 0x00, 0x88, 0x00, 0x22, 0x00, 0x88 + }; + + static const uint8_t patHorzStripes[8] = { + 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 + }; + + static const uint8_t patVertStripes[8] = { + 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA + }; + + struct { + const uint8_t *pattern; + uint32_t fg; + uint32_t bg; + } patterns[] = { + {patCheckerboard, packColor16(255, 255, 255), packColor16(0, 0, 0)}, + {patCrosshatch, packColor16(255, 255, 0), packColor16(0, 0, 128)}, + {patDiagStripes, packColor16(0, 255, 0), packColor16(0, 64, 0)}, + {patDots, packColor16(255, 0, 0), packColor16(64, 0, 0)}, + {patHorzStripes, packColor16(0, 255, 255), packColor16(0, 0, 64)}, + {patVertStripes, packColor16(255, 0, 255), packColor16(64, 0, 64)}, + }; + + int32_t numPatterns = 6; + + // Arrange patterns in a 3x2 grid + int32_t margin = 20; + int32_t spacing = 10; + int32_t cellW = (screenW - 2 * margin - (3 - 1) * spacing) / 3; + int32_t cellH = (screenH - 2 * margin - (2 - 1) * spacing) / 2; + + for (int32_t i = 0; i < numPatterns; i++) { + int32_t gridCol = i % 3; + int32_t gridRow = i / 3; + 
int32_t x = margin + gridCol * (cellW + spacing); + int32_t y = margin + gridRow * (cellH + spacing); + + drv->rectFillPat(drv, x, y, cellW, cellH, + patterns[i].pattern, + patterns[i].fg, patterns[i].bg); + } + + drv->waitIdle(drv); +} + + +// ============================================================ +// isKeyPressed +// ============================================================ +// +// Non-blocking check for a keypress using BIOS INT 16h. + +static bool isKeyPressed(void) { + __dpmi_regs r; + + memset(&r, 0, sizeof(r)); + r.h.ah = 0x11; // check for extended keystroke + __dpmi_int(0x16, &r); + + return !(r.x.flags & 0x40); // ZF clear = key available +} + + +// ============================================================ +// main +// ============================================================ + +int main(int argc, char *argv[]) { + int32_t reqW = DEFAULT_WIDTH; + int32_t reqH = DEFAULT_HEIGHT; + int32_t reqBpp = DEFAULT_BPP; + + if (argc >= 4) { + reqW = atoi(argv[1]); + reqH = atoi(argv[2]); + reqBpp = atoi(argv[3]); + } + + printf("DOS Accelerated Video Driver Demo\n"); + printf("Requested mode: %ldx%ldx%ld\n\n", (long)reqW, (long)reqH, (long)reqBpp); + + // Register all available drivers + atiRegisterDriver(); + bansheeRegisterDriver(); + clRegisterDriver(); + etRegisterDriver(); + lagunaRegisterDriver(); + mgaRegisterDriver(); + nvRegisterDriver(); + s3RegisterDriver(); + sisRegisterDriver(); + tridentRegisterDriver(); + + // Detect hardware + printf("Scanning PCI bus for supported video hardware...\n"); + AccelDriverT *drv = accelDetect(); + + if (!drv) { + printf("No supported video hardware found.\n"); + printf("\nPCI video devices present:\n"); + + // Enumerate and display all VGA-class PCI devices for diagnostics + for (int32_t bus = 0; bus < 256; bus++) { + for (int32_t dev = 0; dev < 32; dev++) { + uint16_t vid = pciRead16(bus, dev, 0, PCI_VENDOR_ID); + + if (vid == 0xFFFF) { + continue; + } + + uint8_t baseClass = pciRead8(bus, dev, 0, 
PCI_BASE_CLASS); + + if (baseClass == PCI_CLASS_DISPLAY) { + uint16_t did = pciRead16(bus, dev, 0, PCI_DEVICE_ID); + printf(" %02lX:%02lX.0 vendor=%04X device=%04X\n", + (long)bus, (long)dev, vid, did); + } + } + } + + return 1; + } + + // Initialize with requested mode + AccelModeRequestT modeReq; + modeReq.width = reqW; + modeReq.height = reqH; + modeReq.bpp = reqBpp; + + if (!accelInit(drv, &modeReq)) { + printf("Failed to initialize video driver.\n"); + return 1; + } + + printf("\nDriver: %s\n", accelGetName(drv)); + printf("Mode: %ldx%ldx%ld (pitch=%ld)\n", + (long)drv->mode.width, (long)drv->mode.height, + (long)drv->mode.bpp, (long)drv->mode.pitch); + printf("VRAM: %lu KB\n", (unsigned long)(drv->mode.vramSize / 1024)); + printf("\nPress any key to start demos...\n"); + printf(" SPACE = next demo\n"); + printf(" B = benchmark\n"); + printf(" ESC = exit\n"); + readKey(); + + // Run demos in a loop + int32_t currentDemo = 0; + int32_t numDemos = 6; + bool running = true; + + while (running) { + switch (currentDemo) { + case 0: + demoFillRects(drv); + break; + case 1: + demoBitBlt(drv); + break; + case 2: + demoLines(drv); + break; + case 3: + demoColorExpand(drv); + break; + case 4: + demoHostBlit(drv); + break; + case 5: + demoPatternFill(drv); + break; + } + + // Wait for keypress + while (!isKeyPressed()) { + // spin + } + + uint8_t key = readKey(); + + switch (key) { + case 0x01: // ESC + running = false; + break; + case 0x30: // 'b' + demoBenchmark(drv); + return 0; // benchmark already shut down the driver + case 0x39: // space + currentDemo = (currentDemo + 1) % numDemos; + break; + default: + currentDemo = (currentDemo + 1) % numDemos; + break; + } + } + + accelShutdown(drv); + printf("Demo complete.\n"); + + return 0; +} + + +// ============================================================ +// packColor16 +// ============================================================ +// +// Packs an RGB triplet into 16-bit 565 format. 
+// This is a simplification -- a real integration would use the +// display's actual pixel format. For the demo, 565 is fine since +// that's what most 16-bit VESA modes use. + +static uint32_t packColor16(uint8_t r, uint8_t g, uint8_t b) { + return ((uint32_t)(r >> 3) << 11) + | ((uint32_t)(g >> 2) << 5) + | ((uint32_t)(b >> 3)); +} + + +// ============================================================ +// readKey +// ============================================================ +// +// Blocking read of one keypress via BIOS INT 16h. +// Returns the scan code. + +static uint8_t readKey(void) { + __dpmi_regs r; + + memset(&r, 0, sizeof(r)); + r.h.ah = 0x10; // read extended keystroke + __dpmi_int(0x16, &r); + + return r.h.ah; // scan code +} + + +// ============================================================ +// softFillRect +// ============================================================ +// +// Software rectangle fill for benchmark comparison. Writes +// directly to the LFB (intentionally slow due to PCI bus writes). 
+ +static void softFillRect(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + uint8_t *fb = drv->mode.framebuffer; + int32_t pitch = drv->mode.pitch; + int32_t bpp = (drv->mode.bpp + 7) / 8; + + for (int32_t row = 0; row < h; row++) { + uint8_t *dst = fb + (y + row) * pitch + x * bpp; + + if (bpp == 2) { + uint16_t *dst16 = (uint16_t *)dst; + + for (int32_t col = 0; col < w; col++) { + dst16[col] = (uint16_t)color; + } + } else if (bpp == 4) { + uint32_t *dst32 = (uint32_t *)dst; + + for (int32_t col = 0; col < w; col++) { + dst32[col] = color; + } + } else { + for (int32_t col = 0; col < w; col++) { + dst[col] = (uint8_t)color; + } + } + } +} diff --git a/matroxMga.c b/matroxMga.c new file mode 100644 index 0000000..6f16aed --- /dev/null +++ b/matroxMga.c @@ -0,0 +1,843 @@ +// matroxMga.c -- Matrox Millennium/Mystique/G200/G400 accelerated video driver +// +// Supports the Matrox MGA family: MGA2064W (Millennium), MGA1064SG +// (Mystique), G100, G200, and G400/G450. The Matrox 2D drawing engine +// is widely regarded as the best 2D accelerator of the PCI/AGP era, +// with features including: +// - Solid and pattern rectangle fill +// - Screen-to-screen BitBLT (very fast, pipelined) +// - CPU-to-screen blit with color expansion (ILOAD) +// - Bresenham line draw (antialiased on G200+) +// - Trapezoid fill +// - Hardware clip rectangle +// - 64x64 three-color hardware cursor +// +// Register access: +// The MGA register block is mapped via BAR0 (PCI) or BAR1 +// depending on the chip. It's a 16KB MMIO region. The drawing +// engine registers start at offset 0x1C00 within this block. +// +// The drawing engine uses a command-based model: you set up +// parameters (colors, coordinates, dimensions) in the setup +// registers, then write to DWGCTL to start the operation. +// Some operations auto-execute when the last parameter is +// written (e.g., LEN triggers a draw). 
+// +// FIFO: +// The MGA has a deep command FIFO (64 entries on Millennium). +// The FIFOSTATUS register indicates how many entries are free. +// On G200+, the FIFO is deeper and the STATUS register has +// a busy bit that's more reliable. + +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Matrox vendor/device IDs +// ============================================================ + +#define MATROX_VENDOR_ID 0x102B + +#define MGA_2064W 0x0519 // Millennium +#define MGA_1064SG 0x051A // Mystique +#define MGA_G100_PCI 0x1000 +#define MGA_G100_AGP 0x1001 +#define MGA_G200_PCI 0x0521 +#define MGA_G200_AGP 0x0520 +#define MGA_G400 0x0525 +#define MGA_G450 0x2527 + +static const uint16_t sMatroxDeviceIds[] = { + MATROX_VENDOR_ID, MGA_2064W, + MATROX_VENDOR_ID, MGA_1064SG, + MATROX_VENDOR_ID, MGA_G100_PCI, + MATROX_VENDOR_ID, MGA_G100_AGP, + MATROX_VENDOR_ID, MGA_G200_PCI, + MATROX_VENDOR_ID, MGA_G200_AGP, + MATROX_VENDOR_ID, MGA_G400, + MATROX_VENDOR_ID, MGA_G450, + 0, 0 +}; + +// ============================================================ +// MGA drawing engine register offsets (from MMIO base) +// ============================================================ + +// Drawing engine setup registers (0x1C00 - 0x1CFF) +#define MGA_DWGCTL 0x1C00 // drawing control +#define MGA_MACCESS 0x1C04 // memory access control +#define MGA_MCTLWTST 0x1C08 // memory control wait state +#define MGA_ZORG 0x1C0C // Z origin +#define MGA_PAT0 0x1C10 // pattern register 0 +#define MGA_PAT1 0x1C14 // pattern register 1 +#define MGA_PLNWT 0x1C1C // plane write mask +#define MGA_BCOL 0x1C20 // background color +#define MGA_FCOL 0x1C24 // foreground color +#define MGA_SRC0 0x1C30 // source data 0 (for color expand) +#define MGA_SRC1 0x1C34 +#define MGA_SRC2 0x1C38 +#define MGA_SRC3 0x1C3C +#define MGA_XYSTRT 0x1C40 // XY start (for lines) +#define MGA_XYEND 
0x1C44 // XY end (triggers line draw) +#define MGA_SHIFT 0x1C50 +#define MGA_SGN 0x1C58 // sign register +#define MGA_LEN 0x1C5C // number of lines (triggers rect ops) +#define MGA_AR0 0x1C60 // line draw parameter 0 +#define MGA_AR1 0x1C64 +#define MGA_AR2 0x1C68 +#define MGA_AR3 0x1C6C +#define MGA_AR4 0x1C70 +#define MGA_AR5 0x1C74 +#define MGA_AR6 0x1C78 +#define MGA_CXBNDRY 0x1C80 // clip X boundaries (left | right<<16) +#define MGA_FXBNDRY 0x1C84 // fill X boundaries (left | right<<16) +#define MGA_YDSTLEN 0x1C88 // Y dest and length (triggers fill) +#define MGA_PITCH 0x1C8C // destination pitch (in pixels) +#define MGA_YDST 0x1C90 // Y destination +#define MGA_YDSTORG 0x1C94 // Y destination origin (byte offset) +#define MGA_YTOP 0x1C98 // clip Y top +#define MGA_YBOT 0x1C9C // clip Y bottom +#define MGA_CXLEFT 0x1CA0 // clip X left +#define MGA_CXRIGHT 0x1CA4 // clip X right +#define MGA_FXLEFT 0x1CA8 // fill X left +#define MGA_FXRIGHT 0x1CAC // fill X right +#define MGA_XDST 0x1CB0 // X destination + +// Status registers (0x1E00 - 0x1EFF) +#define MGA_FIFOSTATUS 0x1E10 // FIFO status +#define MGA_STATUS 0x1E14 // engine status +#define MGA_ICLEAR 0x1E18 // interrupt clear +#define MGA_IEN 0x1E1C // interrupt enable + +// Source window (for BitBLT) +#define MGA_SRCORG 0x2CB4 // source origin + +// DWGSYNC for synchronization +#define MGA_DWGSYNC 0x2C4C + +// ============================================================ +// MGA DWGCTL command values +// ============================================================ +// +// The DWGCTL register is a 32-bit command word that encodes the +// operation type, drawing options, and raster operation. 

// Operation codes (bits 3:0)
// Exactly one of these is OR-ed into the value written to MGA_DWGCTL.
#define MGA_OPCOD_LINE_OPEN     0x00        // line (open)
#define MGA_OPCOD_AUTOLINE_OPEN 0x01
#define MGA_OPCOD_LINE_CLOSE    0x02        // line (closed)
#define MGA_OPCOD_AUTOLINE_CLOSE 0x03
#define MGA_OPCOD_TRAP          0x04        // trapezoid fill
#define MGA_OPCOD_TEXTURE       0x05        // texture mapping (G200+)
#define MGA_OPCOD_BITBLT        0x08        // screen-to-screen blit
#define MGA_OPCOD_ILOAD         0x09        // CPU-to-screen (image load)
#define MGA_OPCOD_IDUMP         0x0A        // screen-to-CPU

// Drawing options (bits 31:4)
// All values below are already positioned in the DWGCTL word and are
// combined with the opcode using bitwise OR.
#define MGA_ATYPE_RPL           0x0000      // replace
#define MGA_ATYPE_RSTR          0x0010      // raster
#define MGA_ATYPE_ZI            0x0030      // Z interpolate
#define MGA_ATYPE_BLK           0x0040      // block transfer
#define MGA_ATYPE_I             0x0070      // interpolate

#define MGA_ZMODE_NOZCMP        0x0000      // no Z compare
#define MGA_ZMODE_ZE            0x0200      // Z equal
#define MGA_ZMODE_ZNE           0x0300      // Z not equal

#define MGA_SOLID               0x0800      // solid fill (no pattern)
#define MGA_ARZERO              0x1000      // AR regs are zero (solid fill optimization)
#define MGA_SGNZERO             0x2000      // SGN reg is zero
#define MGA_SHFTZERO            0x4000      // SHIFT reg is zero

#define MGA_BOP_MASK            0x000F0000  // boolean operation (ROP) mask
#define MGA_BOP_SHIFT           16

// Boolean operations (ROP2, bits 19:16)
// Each value is a 4-bit code pre-shifted by MGA_BOP_SHIFT so it lands
// inside MGA_BOP_MASK.  The list is not in numeric order.
#define MGA_BOP_CLEAR           (0x0 << MGA_BOP_SHIFT)
#define MGA_BOP_NOR             (0x1 << MGA_BOP_SHIFT)
#define MGA_BOP_COPYINV         (0x3 << MGA_BOP_SHIFT)
#define MGA_BOP_AND             (0x8 << MGA_BOP_SHIFT)
#define MGA_BOP_XOR             (0x6 << MGA_BOP_SHIFT)
#define MGA_BOP_COPY            (0xC << MGA_BOP_SHIFT)
#define MGA_BOP_OR              (0xE << MGA_BOP_SHIFT)
#define MGA_BOP_SET             (0xF << MGA_BOP_SHIFT)

// Transparency
// NOTE(review): the BLTMOD values and their descriptions below could
// not be checked from this file alone -- confirm against the Matrox
// DWGCTL "bltmod" field documentation before relying on them.
#define MGA_TRANSC              0x00100000  // transparent color compare
#define MGA_BLTMOD_BFCOL        0x04000000  // BLT mode: foreground color
#define MGA_BLTMOD_BU32RGB      0x0C000000  // BLT mode: 32bpp ILOAD
#define MGA_BLTMOD_BMONOWF      0x08000000  // BLT mode: mono word expand MSB first

// Pattern
#define MGA_PATTERN             0x20000000  // enable pattern

// Linear source
#define MGA_LINEAR              0x80000000  //
linear addressing (not XY) + +// ============================================================ +// MGA MACCESS values +// ============================================================ + +#define MGA_MACCESS_8BPP 0x00 +#define MGA_MACCESS_16BPP 0x01 +#define MGA_MACCESS_32BPP 0x02 +#define MGA_MACCESS_24BPP 0x03 + +// ============================================================ +// MGA SGN register bits +// ============================================================ + +#define MGA_SGN_SCANLEFT 0x01 // scan direction left +#define MGA_SGN_SCANRIGHT 0x00 // scan direction right +#define MGA_SGN_SDY_NEG 0x02 // negative Y direction +#define MGA_SGN_SDX_NEG 0x04 // negative X direction + +// ============================================================ +// MGA STATUS register bits +// ============================================================ + +#define MGA_STATUS_BUSY 0x00010000 // drawing engine busy +#define MGA_FIFO_FULL_MASK 0x0000007F // FIFO free count + +// Maximum wait iterations +#define MGA_MAX_IDLE_WAIT 1000000 + +// Hardware cursor +#define MGA_HW_CURSOR_SIZE 64 +#define MGA_HW_CURSOR_BYTES 1024 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; // mapped MMIO base + DpmiMappingT lfbMapping; + DpmiMappingT mmioMapping; + bool isG200Plus; // G200/G400/G450 +} MatroxPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void mgaBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static void mgaColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, 
int32_t h, uint32_t fg, uint32_t bg); +static bool mgaDetect(AccelDriverT *drv); +static void mgaHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool mgaInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void mgaLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color); +static void mgaMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void mgaRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void mgaRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg); +static void mgaSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void mgaSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void mgaShowCursor(AccelDriverT *drv, bool visible); +static void mgaShutdown(AccelDriverT *drv); +static void mgaWaitFifo(MatroxPrivateT *priv, int32_t entries); +static void mgaWaitIdle(AccelDriverT *drv); + +static inline void mgaWrite(MatroxPrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t mgaRead(MatroxPrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static MatroxPrivateT sMatroxPrivate; + +static AccelDriverT sMatroxDriver = { + .name = "Matrox Millennium", + .chipFamily = "matrox", + .caps = 0, + .privData = &sMatroxPrivate, + .detect = mgaDetect, + .init = mgaInit, + .shutdown = mgaShutdown, + .waitIdle = mgaWaitIdle, + .setClip = mgaSetClip, + .rectFill = mgaRectFill, + .rectFillPat = mgaRectFillPat, + .bitBlt = mgaBitBlt, + .hostBlit = mgaHostBlit, + .colorExpand = mgaColorExpand, + .lineDraw = mgaLineDraw, + .setCursor = mgaSetCursor, + .moveCursor = mgaMoveCursor, + 
.showCursor = mgaShowCursor, +}; + +// ============================================================ +// mgaRegisterDriver +// ============================================================ + +void mgaRegisterDriver(void) { + accelRegisterDriver(&sMatroxDriver); +} + + +// ============================================================ +// mgaBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT using the MGA BITBLT opcode. +// The MGA engine uses pixel coordinates and pitch, with the +// sign register controlling direction for overlapping blits. + +static void mgaBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Determine direction + uint32_t sgn = 0; + int32_t startX; + int32_t endX; + int32_t startY = dstY; + uint32_t srcOrg = srcY * priv->screenPitch + srcX * priv->bytesPerPixel; + + if (dstX <= srcX) { + // Left to right + startX = dstX; + endX = dstX + w - 1; + } else { + // Right to left + startX = dstX + w - 1; + endX = dstX; + sgn |= MGA_SGN_SCANLEFT; + srcOrg = srcY * priv->screenPitch + (srcX + w - 1) * priv->bytesPerPixel; + } + + if (dstY > srcY) { + // Bottom to top + sgn |= MGA_SGN_SDY_NEG; + startY = dstY + h - 1; + srcOrg = (srcY + h - 1) * priv->screenPitch + srcX * priv->bytesPerPixel; + if (sgn & MGA_SGN_SCANLEFT) { + srcOrg = (srcY + h - 1) * priv->screenPitch + (srcX + w - 1) * priv->bytesPerPixel; + } + } + + mgaWaitFifo(priv, 8); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_BITBLT | MGA_ATYPE_BLK | MGA_BOP_COPY | MGA_SHFTZERO); + mgaWrite(priv, MGA_SGN, sgn); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + mgaWrite(priv, MGA_SRCORG, srcOrg); + mgaWrite(priv, MGA_AR5, (sgn & MGA_SGN_SDY_NEG) ? 
-(priv->screenPitch / priv->bytesPerPixel) : (priv->screenPitch / priv->bytesPerPixel)); + + // Set boundaries and trigger + mgaWrite(priv, MGA_FXBNDRY, ((uint32_t)endX << 16) | (uint32_t)(startX & 0xFFFF)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)startY << 16) | (uint32_t)h); +} + + +// ============================================================ +// mgaColorExpand +// ============================================================ +// +// CPU-to-screen monochrome color expansion using the MGA ILOAD +// opcode with BLTMOD_BMONOWF. Monochrome bitmap bits are expanded +// to foreground/background colors by the hardware. Data is fed +// as dwords through MGA_SRC0. + +static void mgaColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = (w + 7) / 8; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + mgaWaitFifo(priv, 6); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_ILOAD | MGA_ATYPE_RPL | MGA_BOP_COPY + | MGA_BLTMOD_BMONOWF | MGA_SHFTZERO | MGA_SGNZERO); + mgaWrite(priv, MGA_FCOL, fg); + mgaWrite(priv, MGA_BCOL, bg); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + mgaWrite(priv, MGA_FXBNDRY, (uint32_t)dstX | ((uint32_t)(dstX + w) << 16)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)dstY << 16) | (uint32_t)h); + + // Feed monochrome data row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + mgaWaitFifo(priv, 1); + mgaWrite(priv, MGA_SRC0, val); + } + } +} + + +// ============================================================ +// mgaDetect +// 
============================================================ + +static bool mgaDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sMatroxDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + switch (drv->pciDev.deviceId) { + case MGA_2064W: + drv->name = "Matrox Millennium"; + priv->isG200Plus = false; + break; + case MGA_1064SG: + drv->name = "Matrox Mystique"; + priv->isG200Plus = false; + break; + case MGA_G100_PCI: + case MGA_G100_AGP: + drv->name = "Matrox G100"; + priv->isG200Plus = true; + break; + case MGA_G200_PCI: + case MGA_G200_AGP: + drv->name = "Matrox G200"; + priv->isG200Plus = true; + break; + case MGA_G400: + drv->name = "Matrox G400"; + priv->isG200Plus = true; + break; + case MGA_G450: + drv->name = "Matrox G450"; + priv->isG200Plus = true; + break; + default: + drv->name = "Matrox MGA"; + priv->isG200Plus = false; + break; + } + + return true; +} + + +// ============================================================ +// mgaHostBlit +// ============================================================ +// +// CPU-to-screen blit using the MGA ILOAD opcode. Pixel data is +// written from host memory to the framebuffer through the MMIO +// window via MGA_SRC0. Each row is padded to a dword boundary. 
+ +static void mgaHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + mgaWaitFifo(priv, 5); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_ILOAD | MGA_ATYPE_RPL | MGA_BOP_COPY + | MGA_SHFTZERO | MGA_SGNZERO); + mgaWrite(priv, MGA_FCOL, 0xFFFFFFFF); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + mgaWrite(priv, MGA_FXBNDRY, (uint32_t)dstX | ((uint32_t)(dstX + w) << 16)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)dstY << 16) | (uint32_t)h); + + // Feed pixel data row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + mgaWaitFifo(priv, 1); + mgaWrite(priv, MGA_SRC0, val); + } + } +} + + +// ============================================================ +// mgaInit +// ============================================================ + +static bool mgaInit(AccelDriverT *drv, const AccelModeRequestT *req) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + // BAR layout depends on chip: + // Millennium (2064W): BAR0 = control regs (16KB), BAR1 = framebuffer + // Mystique+: BAR0 = control regs (16KB), BAR1 = framebuffer + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + // Map MMIO 
control registers (16KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, 16384, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + vgaRestoreTextMode(); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Configure MACCESS for pixel depth + uint32_t maccess; + switch (vesa.bpp) { + case 8: maccess = MGA_MACCESS_8BPP; break; + case 15: + case 16: maccess = MGA_MACCESS_16BPP; break; + case 32: maccess = MGA_MACCESS_32BPP; break; + default: maccess = MGA_MACCESS_16BPP; break; + } + + mgaWaitIdle(drv); + mgaWrite(priv, MGA_MACCESS, maccess); + + // Set pitch (in pixels) + mgaWrite(priv, MGA_PITCH, vesa.pitch / priv->bytesPerPixel); + + // Set YDSTORG to 0 (framebuffer starts at beginning of VRAM) + mgaWrite(priv, MGA_YDSTORG, 0); + + // Plane write mask: all bits + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + + // Set up cursor at end of VRAM + priv->cursorOffset = priv->vramSize - MGA_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(MGA_HW_CURSOR_BYTES - 1); + + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_COLOR_EXPAND + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + mgaSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// 
============================================================ +// mgaLineDraw +// ============================================================ +// +// Line drawing using the MGA AUTOLINE opcode. The MGA engine +// takes start XY and end XY coordinates directly (no Bresenham +// parameter computation needed on the CPU side). + +static void mgaLineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + mgaWaitFifo(priv, 5); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_AUTOLINE_CLOSE | MGA_ATYPE_RPL | MGA_SOLID + | MGA_BOP_COPY | MGA_SHFTZERO | MGA_SGNZERO | MGA_ARZERO); + mgaWrite(priv, MGA_FCOL, color); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + + // Start coordinate + mgaWrite(priv, MGA_XYSTRT, ((uint32_t)(y1 & 0xFFFF) << 16) | (uint32_t)(x1 & 0xFFFF)); + + // End coordinate (triggers draw) + mgaWrite(priv, MGA_XYEND, ((uint32_t)(y2 & 0xFFFF) << 16) | (uint32_t)(x2 & 0xFFFF)); +} + + +// ============================================================ +// mgaMoveCursor +// ============================================================ +// +// Matrox cursor position is set via RAMDAC registers. +// On Millennium: TVP3026 RAMDAC external registers. +// On Mystique+: integrated RAMDAC at MMIO offset 0x3C00+. + +static void mgaMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + // Cursor position via DAC registers (Mystique/G200+ integrated DAC) + // CURPOS register at MMIO + 0x3C0C + mgaWrite(priv, 0x3C0C, ((uint32_t)(y & 0xFFF) << 16) | (uint32_t)(x & 0xFFF)); +} + + +// ============================================================ +// mgaRectFill +// ============================================================ +// +// Solid rectangle fill using the MGA TRAP opcode with the SOLID +// bit set. 
This is the fastest path for solid fills -- the +// engine fills with the foreground color using the ARZERO and +// SGNZERO hints to skip setup of unused registers. + +static void mgaRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + mgaWaitFifo(priv, 5); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_TRAP | MGA_ATYPE_BLK | MGA_SOLID + | MGA_BOP_COPY | MGA_ARZERO | MGA_SGNZERO | MGA_SHFTZERO); + mgaWrite(priv, MGA_FCOL, color); + + // Set X boundaries + mgaWrite(priv, MGA_FXBNDRY, ((uint32_t)(x + w) << 16) | (uint32_t)(x & 0xFFFF)); + + // Set Y destination and length (triggers fill) + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)(y & 0xFFFF) << 16) | (uint32_t)(h & 0xFFFF)); +} + + +// ============================================================ +// mgaRectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using the MGA TRAP opcode with the +// MGA_PATTERN bit set. The pattern is 8 bytes (one per row, +// MSB-first), loaded into PAT0 (rows 0-3) and PAT1 (rows 4-7). +// 1-bits use the foreground color, 0-bits use the background. 
+ +static void mgaRectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Pack pattern rows 0-3 into PAT0 and rows 4-7 into PAT1 + uint32_t pat0 = (uint32_t)pattern[0] + | ((uint32_t)pattern[1] << 8) + | ((uint32_t)pattern[2] << 16) + | ((uint32_t)pattern[3] << 24); + uint32_t pat1 = (uint32_t)pattern[4] + | ((uint32_t)pattern[5] << 8) + | ((uint32_t)pattern[6] << 16) + | ((uint32_t)pattern[7] << 24); + + mgaWaitFifo(priv, 8); + + mgaWrite(priv, MGA_DWGCTL, + MGA_OPCOD_TRAP | MGA_ATYPE_RPL | MGA_PATTERN + | MGA_BOP_COPY | MGA_ARZERO | MGA_SGNZERO | MGA_SHFTZERO); + mgaWrite(priv, MGA_FCOL, fg); + mgaWrite(priv, MGA_BCOL, bg); + mgaWrite(priv, MGA_PAT0, pat0); + mgaWrite(priv, MGA_PAT1, pat1); + mgaWrite(priv, MGA_PLNWT, 0xFFFFFFFF); + + // Set X boundaries and trigger fill + mgaWrite(priv, MGA_FXBNDRY, ((uint32_t)(x + w) << 16) | (uint32_t)(x & 0xFFFF)); + mgaWrite(priv, MGA_YDSTLEN, ((uint32_t)(y & 0xFFFF) << 16) | (uint32_t)(h & 0xFFFF)); +} + + +// ============================================================ +// mgaSetClip +// ============================================================ + +static void mgaSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + mgaWaitFifo(priv, 3); + mgaWrite(priv, MGA_CXBNDRY, ((uint32_t)(x + w - 1) << 16) | (uint32_t)(x & 0xFFFF)); + mgaWrite(priv, MGA_YTOP, y * (priv->screenPitch / priv->bytesPerPixel)); + mgaWrite(priv, MGA_YBOT, (y + h - 1) * (priv->screenPitch / priv->bytesPerPixel)); +} + + +// ============================================================ +// mgaSetCursor +// ============================================================ + +static void mgaSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + 
+ if (!image) { + mgaShowCursor(drv, false); + return; + } + + mgaWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < MGA_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor base address via DAC register + // CURBASE at MMIO + 0x3C04 + mgaWrite(priv, 0x3C04, priv->cursorOffset); +} + + +// ============================================================ +// mgaShowCursor +// ============================================================ + +static void mgaShowCursor(AccelDriverT *drv, bool visible) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + // CURCTL at MMIO + 0x3C00 + uint32_t curCtl = mgaRead(priv, 0x3C00); + + if (visible) { + curCtl |= 0x01; // enable cursor + } else { + curCtl &= ~0x01; + } + + mgaWrite(priv, 0x3C00, curCtl); +} + + +// ============================================================ +// mgaShutdown +// ============================================================ + +static void mgaShutdown(AccelDriverT *drv) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + mgaShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->mmioMapping); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// mgaWaitFifo +// ============================================================ +// +// Wait until the MGA FIFO has enough free entries. +// FIFOSTATUS bits 6:0 indicate the number of free slots. 
+ +static void mgaWaitFifo(MatroxPrivateT *priv, int32_t entries) { + for (int32_t i = 0; i < MGA_MAX_IDLE_WAIT; i++) { + uint32_t stat = mgaRead(priv, MGA_FIFOSTATUS); + int32_t free = stat & MGA_FIFO_FULL_MASK; + + if (free >= entries) { + return; + } + } +} + + +// ============================================================ +// mgaWaitIdle +// ============================================================ + +static void mgaWaitIdle(AccelDriverT *drv) { + MatroxPrivateT *priv = (MatroxPrivateT *)drv->privData; + + for (int32_t i = 0; i < MGA_MAX_IDLE_WAIT; i++) { + uint32_t stat = mgaRead(priv, MGA_STATUS); + if (!(stat & MGA_STATUS_BUSY)) { + return; + } + } +} diff --git a/nvidia.c b/nvidia.c new file mode 100644 index 0000000..109b174 --- /dev/null +++ b/nvidia.c @@ -0,0 +1,677 @@ +// nvidia.c -- Nvidia RIVA 128/TNT/TNT2 accelerated video driver +// +// Supports the Nvidia RIVA family: RIVA 128, RIVA 128 ZX, TNT, +// TNT2, TNT2 Ultra, TNT2 M64, and Vanta. These were high- +// performance 2D/3D accelerators of the late 1990s featuring: +// - Solid rectangle fill +// - Screen-to-screen BitBLT +// - Host-to-screen blit (CPU data transfer) +// - Hardware clip rectangle +// - 64x64 two-color hardware cursor via PRAMDAC +// +// Register access: +// The NV architecture uses memory-mapped I/O via BAR0 (16MB +// MMIO register space) and BAR1 (framebuffer). The 2D engine +// is accessed through the FIFO user space at BAR0 + 0x800000, +// which provides subchannel-based access to graphics objects. +// +// Subchannel layout: +// Sub 0 (0x0000): ROP +// Sub 1 (0x2000): Clip +// Sub 2 (0x4000): Pattern +// Sub 3 (0x6000): GdiRectangle (solid fill) +// Sub 4 (0x8000): ScreenScreenBlt +// Sub 5 (0xA000): ImageFromCpu +// +// Each subchannel has methods starting at +0x0100 within +// its range. The PGRAPH_STATUS register at 0x400700 indicates +// engine busy status (0 = idle). 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Nvidia vendor/device IDs +// ============================================================ + +#define NV_VENDOR_ID 0x10DE + +#define NV_RIVA_128 0x0018 // RIVA 128 +#define NV_RIVA_128_ZX 0x0019 // RIVA 128 ZX +#define NV_TNT 0x0020 // RIVA TNT +#define NV_TNT2 0x0028 // RIVA TNT2 +#define NV_TNT2_ULTRA 0x0029 // RIVA TNT2 Ultra +#define NV_TNT2_M64 0x002D // RIVA TNT2 M64 +#define NV_VANTA 0x002C // Vanta + +static const uint16_t sNvDeviceIds[] = { + NV_VENDOR_ID, NV_RIVA_128, + NV_VENDOR_ID, NV_RIVA_128_ZX, + NV_VENDOR_ID, NV_TNT, + NV_VENDOR_ID, NV_TNT2, + NV_VENDOR_ID, NV_TNT2_ULTRA, + NV_VENDOR_ID, NV_TNT2_M64, + NV_VENDOR_ID, NV_VANTA, + 0, 0 +}; + +// ============================================================ +// MMIO register offsets (from BAR0) +// ============================================================ + +// PGRAPH status +#define NV_PGRAPH_STATUS 0x400700 // 0 = idle + +// PRAMDAC hardware cursor +#define NV_PRAMDAC_CURSOR_CFG 0x680300 // bit 0 = enable, bits 2:1 = color mode +#define NV_PRAMDAC_CURSOR_POS 0x680320 // cursor X/Y position + +// PRAMIN area -- cursor image storage offset in VRAM +// The cursor image lives at the top of VRAM, 1KB for 32x32 or 4KB for 64x64. +// PRAMDAC fetches it from the address configured in NV_PRAMDAC_CURSOR_START. +#define NV_PRAMDAC_CURSOR_START 0x680324 // cursor image VRAM offset + +// PFB -- framebuffer config (for reading VRAM size) +#define NV_PFB_BOOT_0 0x100000 // boot config (NV3) +#define NV_PFB_CFG_0 0x100200 // framebuffer config (NV4/NV5) + +// ============================================================ +// FIFO user space offsets (from BAR0 + 0x800000) +// ============================================================ +// +// Subchannel base addresses within the user FIFO area. 
+ +#define NV_FIFO_BASE 0x800000 + +// Subchannel 0: ROP +#define NV_ROP_SUBCHAN 0x0000 +#define NV_ROP_ROP 0x0300 // raster operation + +// Subchannel 1: Clip +#define NV_CLIP_SUBCHAN 0x2000 +#define NV_CLIP_POINT 0x2300 // x | y<<16 +#define NV_CLIP_SIZE 0x2304 // w | h<<16 + +// Subchannel 3: GdiRectangle (solid fill) +#define NV_RECT_SUBCHAN 0x6000 +#define NV_RECT_COLOR 0x62FC // fill color +#define NV_RECT_POINT 0x6300 // x | y<<16 +#define NV_RECT_SIZE 0x6304 // w | h<<16 (triggers fill) + +// Subchannel 4: ScreenScreenBlt +#define NV_BLIT_SUBCHAN 0x8000 +#define NV_BLIT_POINT_IN 0x8300 // srcX | srcY<<16 +#define NV_BLIT_POINT_OUT 0x8304 // dstX | dstY<<16 +#define NV_BLIT_SIZE 0x8308 // w | h<<16 + +// Subchannel 5: ImageFromCpu +#define NV_IMAGE_SUBCHAN 0xA000 +#define NV_IMAGE_POINT 0xA300 // dstX | dstY<<16 +#define NV_IMAGE_SIZE_OUT 0xA304 // w | h<<16 +#define NV_IMAGE_SIZE_IN 0xA308 // srcW | srcH<<16 +#define NV_IMAGE_DATA 0xA400 // color data (dwords) + +// ============================================================ +// Constants +// ============================================================ + +#define NV_ROP_COPY 0xCC // dest = src +#define NV_MMIO_SIZE 0x1000000 // 16MB MMIO region +#define NV_MAX_IDLE_WAIT 1000000 +#define NV_HW_CURSOR_SIZE 64 +#define NV_HW_CURSOR_BYTES (NV_HW_CURSOR_SIZE * NV_HW_CURSOR_SIZE * 2 / 8) + +// Cursor config bits +#define NV_CURSOR_ENABLE 0x01 +#define NV_CURSOR_MODE_2COLOR 0x00 // 2-color mode (bits 2:1 = 0) + +// RIVA 128 (NV3) vs TNT (NV4/NV5) detection +#define NV_ARCH_NV3 3 +#define NV_ARCH_NV4 4 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + volatile uint32_t *mmio; // mapped MMIO base (BAR0) + volatile uint32_t *fifo; // FIFO user space (BAR0 + 0x800000) + uint32_t mmioPhysAddr; + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; // cursor image offset in 
VRAM + int32_t bytesPerPixel; + int32_t screenPitch; + int32_t arch; // NV_ARCH_NV3 or NV_ARCH_NV4 + DpmiMappingT mmioMapping; + DpmiMappingT lfbMapping; +} NvPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void nvBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool nvDetect(AccelDriverT *drv); +static uint32_t nvDetectVram(NvPrivateT *priv); +static void nvHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool nvInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void nvMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void nvRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void nvSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void nvSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void nvSetupEngine(NvPrivateT *priv); +static void nvShowCursor(AccelDriverT *drv, bool visible); +static void nvShutdown(AccelDriverT *drv); +static void nvWaitIdle(AccelDriverT *drv); +static void nvWriteFifo(NvPrivateT *priv, uint32_t offset, uint32_t val); +static uint32_t nvReadMmio(NvPrivateT *priv, uint32_t offset); +static void nvWriteMmio(NvPrivateT *priv, uint32_t offset, uint32_t val); + +// ============================================================ +// Driver instance +// ============================================================ + +static NvPrivateT sNvPrivate; + +static AccelDriverT sNvDriver = { + .name = "Nvidia RIVA", + .chipFamily = "nvidia", + .caps = 0, + .privData = &sNvPrivate, + .detect = nvDetect, + .init = nvInit, + .shutdown = nvShutdown, + .waitIdle = nvWaitIdle, + .setClip = nvSetClip, + .rectFill = nvRectFill, + .rectFillPat = NULL, + .bitBlt = nvBitBlt, + .hostBlit = 
nvHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, + .setCursor = nvSetCursor, + .moveCursor = nvMoveCursor, + .showCursor = nvShowCursor, +}; + +// ============================================================ +// nvRegisterDriver +// ============================================================ + +void nvRegisterDriver(void) { + accelRegisterDriver(&sNvDriver); +} + + +// ============================================================ +// nvBitBlt +// ============================================================ +// +// Screen-to-screen blit via the ScreenScreenBlt subchannel. +// The NV engine handles overlapping source/destination regions +// internally when the blit direction is set appropriately. + +static void nvBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + nvWaitIdle(drv); + + nvWriteFifo(priv, NV_BLIT_POINT_IN, (uint32_t)srcX | ((uint32_t)srcY << 16)); + nvWriteFifo(priv, NV_BLIT_POINT_OUT, (uint32_t)dstX | ((uint32_t)dstY << 16)); + nvWriteFifo(priv, NV_BLIT_SIZE, (uint32_t)w | ((uint32_t)h << 16)); +} + + +// ============================================================ +// nvDetect +// ============================================================ + +static bool nvDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sNvDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case NV_RIVA_128: + drv->name = "Nvidia RIVA 128"; + break; + case NV_RIVA_128_ZX: + drv->name = "Nvidia RIVA 128 ZX"; + break; + case NV_TNT: + drv->name = "Nvidia RIVA TNT"; + break; + case NV_TNT2: + drv->name = "Nvidia RIVA TNT2"; + break; + case NV_TNT2_ULTRA: + drv->name = "Nvidia RIVA TNT2 Ultra"; + break; + case NV_TNT2_M64: + drv->name = "Nvidia RIVA TNT2 M64"; + break; + case NV_VANTA: + drv->name = "Nvidia Vanta"; + break; + default: + drv->name = "Nvidia RIVA"; 
+ break; + } + + return true; +} + + +// ============================================================ +// nvDetectVram +// ============================================================ +// +// Read VRAM size from the PFB registers. NV3 (RIVA 128) uses +// PFB_BOOT_0, while NV4/NV5 (TNT/TNT2) use PFB_CFG_0. + +static uint32_t nvDetectVram(NvPrivateT *priv) { + if (priv->arch == NV_ARCH_NV3) { + // NV3: PFB_BOOT_0 bits 1:0 encode VRAM size + uint32_t boot0 = nvReadMmio(priv, NV_PFB_BOOT_0); + uint32_t sizeIdx = boot0 & 0x03; + + switch (sizeIdx) { + case 0: return 8 * 1024 * 1024; + case 1: return 2 * 1024 * 1024; + case 2: return 4 * 1024 * 1024; + default: return 4 * 1024 * 1024; + } + } + + // NV4/NV5: PFB_CFG_0 bits 1:0 encode VRAM size + uint32_t cfg0 = nvReadMmio(priv, NV_PFB_CFG_0); + uint32_t sizeIdx = cfg0 & 0x03; + + switch (sizeIdx) { + case 0: return 32 * 1024 * 1024; + case 1: return 4 * 1024 * 1024; + case 2: return 8 * 1024 * 1024; + case 3: return 16 * 1024 * 1024; + default: return 4 * 1024 * 1024; + } +} + + +// ============================================================ +// nvHostBlit +// ============================================================ +// +// CPU-to-screen blit via the ImageFromCpu subchannel. Transfers +// pixel data from system memory to VRAM through the FIFO. 
+ +static void nvHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t rowBytes = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (rowBytes + 3) / 4; + + nvWaitIdle(drv); + + // Set up the image transfer + nvWriteFifo(priv, NV_IMAGE_POINT, (uint32_t)dstX | ((uint32_t)dstY << 16)); + nvWriteFifo(priv, NV_IMAGE_SIZE_OUT, (uint32_t)w | ((uint32_t)h << 16)); + nvWriteFifo(priv, NV_IMAGE_SIZE_IN, (uint32_t)w | ((uint32_t)h << 16)); + + // Write pixel data row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + int32_t byteOff = dw * 4; + uint32_t data = 0; + + // Pack bytes into a dword (little-endian native order) + for (int32_t b = 0; b < 4; b++) { + if (byteOff + b < rowBytes) { + data |= (uint32_t)rowPtr[byteOff + b] << (b * 8); + } + } + + // Write to the color data area; each dword goes to the + // next sequential offset starting at NV_IMAGE_DATA. 
+ nvWriteFifo(priv, NV_IMAGE_DATA + (uint32_t)(dw * 4), data); + } + + // Wait for engine between rows to avoid FIFO overflow + nvWaitIdle(drv); + } +} + + +// ============================================================ +// nvInit +// ============================================================ + +static bool nvInit(AccelDriverT *drv, const AccelModeRequestT *req) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + // Determine architecture (NV3 vs NV4/NV5) + if (drv->pciDev.deviceId == NV_RIVA_128 || drv->pciDev.deviceId == NV_RIVA_128_ZX) { + priv->arch = NV_ARCH_NV3; + } else { + priv->arch = NV_ARCH_NV4; + } + + // Get BAR0 (MMIO) and BAR1 (framebuffer) addresses + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR1); + + priv->mmioPhysAddr = bar0 & 0xFFFFFFF0; + priv->lfbPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + uint32_t lfbBarSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_BAR1); + + // Enable bus mastering and memory space access + uint16_t pciCmd = pciRead16(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_COMMAND); + pciCmd |= PCI_CMD_MEM_ENABLE | PCI_CMD_BUS_MASTER; + pciWrite16(drv->pciDev.bus, drv->pciDev.dev, drv->pciDev.func, PCI_COMMAND, pciCmd); + + // Map MMIO region (BAR0, 16MB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, NV_MMIO_SIZE, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + priv->fifo = (volatile uint32_t *)(priv->mmioMapping.ptr + NV_FIFO_BASE); + + // Detect VRAM size + priv->vramSize = nvDetectVram(priv); + + // Use whichever is smaller: the BAR size or detected VRAM + if (lfbBarSize < priv->vramSize) { + priv->vramSize = lfbBarSize; + } + + // Set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + 
dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + // Map framebuffer (BAR1) + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Reserve space for hardware cursor at end of VRAM + priv->cursorOffset = priv->vramSize - NV_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(uint32_t)(NV_HW_CURSOR_BYTES - 1); + + // Initialize the 2D engine + nvSetupEngine(priv); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Set full-screen clip + nvSetClip(drv, 0, 0, vesa.width, vesa.height); + + nvWaitIdle(drv); + return true; +} + + +// ============================================================ +// nvMoveCursor +// ============================================================ + +static void nvMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + // PRAMDAC cursor position: bits 15:0 = X, bits 31:16 = Y + // Negative values are handled by clamping to 0; the cursor + // offset register could be used for sub-pixel adjustment but + // that is not needed for typical use. + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + nvWriteMmio(priv, NV_PRAMDAC_CURSOR_POS, (uint32_t)x | ((uint32_t)y << 16)); +} + + +// ============================================================ +// nvReadMmio / nvWriteMmio +// ============================================================ +// +// Direct MMIO register access via BAR0. 
+ +static uint32_t nvReadMmio(NvPrivateT *priv, uint32_t offset) { + return priv->mmio[offset / 4]; +} + + +static void nvWriteMmio(NvPrivateT *priv, uint32_t offset, uint32_t val) { + priv->mmio[offset / 4] = val; +} + + +// ============================================================ +// nvRectFill +// ============================================================ +// +// Solid rectangle fill via the GdiRectangle subchannel. + +static void nvRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + nvWaitIdle(drv); + + nvWriteFifo(priv, NV_RECT_COLOR, color); + nvWriteFifo(priv, NV_RECT_POINT, (uint32_t)x | ((uint32_t)y << 16)); + nvWriteFifo(priv, NV_RECT_SIZE, (uint32_t)w | ((uint32_t)h << 16)); +} + + +// ============================================================ +// nvSetClip +// ============================================================ +// +// Set the hardware clip rectangle via the Clip subchannel. + +static void nvSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + nvWaitIdle(drv); + + nvWriteFifo(priv, NV_CLIP_POINT, (uint32_t)x | ((uint32_t)y << 16)); + nvWriteFifo(priv, NV_CLIP_SIZE, (uint32_t)w | ((uint32_t)h << 16)); +} + + +// ============================================================ +// nvSetCursor +// ============================================================ +// +// Upload a cursor image to VRAM and configure the PRAMDAC +// to display it. The NV hardware cursor is 64x64, 2 bits per +// pixel, stored in VRAM at the offset configured in +// NV_PRAMDAC_CURSOR_START. 
+// +// 2bpp encoding: +// 00 = cursor color 0 (background) +// 01 = cursor color 1 (foreground) +// 10 = transparent +// 11 = inverted + +static void nvSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + if (!image) { + nvShowCursor(drv, false); + return; + } + + nvWaitIdle(drv); + + // Write cursor image to VRAM at the reserved offset + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < NV_HW_CURSOR_SIZE; row++) { + for (int32_t byteIdx = 0; byteIdx < 16; byteIdx++) { + uint8_t val = 0xAA; // all transparent (10 pattern) + + if (row < image->height && byteIdx < (image->width + 3) / 4) { + int32_t bitOff = byteIdx * 4; + uint8_t andBits = 0; + uint8_t xorBits = 0; + + if (bitOff / 8 < (image->width + 7) / 8) { + andBits = image->andMask[row * 8 + bitOff / 8]; + xorBits = image->xorMask[row * 8 + bitOff / 8]; + } + + // Pack 4 pixels into one byte (2 bits each) + val = 0; + for (int32_t px = 0; px < 4; px++) { + int32_t srcBit = (bitOff + px) % 8; + uint8_t andBit = (andBits >> (7 - srcBit)) & 1; + uint8_t xorBit = (xorBits >> (7 - srcBit)) & 1; + uint8_t pixel; + + if (andBit && !xorBit) { + pixel = 0x02; // transparent + } else if (andBit && xorBit) { + pixel = 0x03; // inverted + } else if (!andBit && xorBit) { + pixel = 0x01; // cursor color 1 + } else { + pixel = 0x00; // cursor color 0 + } + + val |= pixel << (6 - px * 2); + } + } + + cursorMem[row * 16 + byteIdx] = val; + } + } + + // Point the PRAMDAC at the cursor image in VRAM + nvWriteMmio(priv, NV_PRAMDAC_CURSOR_START, priv->cursorOffset); +} + + +// ============================================================ +// nvSetupEngine +// ============================================================ +// +// Initialize the 2D acceleration engine. Sets the ROP to copy +// mode and prepares the FIFO subchannels for use. 
+ +static void nvSetupEngine(NvPrivateT *priv) { + // Set ROP to copy + nvWriteFifo(priv, NV_ROP_ROP, NV_ROP_COPY); +} + + +// ============================================================ +// nvShowCursor +// ============================================================ + +static void nvShowCursor(AccelDriverT *drv, bool visible) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + uint32_t cfg = nvReadMmio(priv, NV_PRAMDAC_CURSOR_CFG); + + if (visible) { + cfg |= NV_CURSOR_ENABLE; + } else { + cfg &= ~(uint32_t)NV_CURSOR_ENABLE; + } + + nvWriteMmio(priv, NV_PRAMDAC_CURSOR_CFG, cfg); +} + + +// ============================================================ +// nvShutdown +// ============================================================ + +static void nvShutdown(AccelDriverT *drv) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + nvShowCursor(drv, false); + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->lfbMapping); + dpmiUnmapFramebuffer(&priv->mmioMapping); +} + + +// ============================================================ +// nvWaitIdle +// ============================================================ +// +// Wait for the PGRAPH engine to become idle by polling the +// PGRAPH_STATUS register. + +static void nvWaitIdle(AccelDriverT *drv) { + NvPrivateT *priv = (NvPrivateT *)drv->privData; + + for (int32_t i = 0; i < NV_MAX_IDLE_WAIT; i++) { + if (nvReadMmio(priv, NV_PGRAPH_STATUS) == 0) { + return; + } + } +} + + +// ============================================================ +// nvWriteFifo +// ============================================================ +// +// Write a value to the FIFO user space. The offset is relative +// to the FIFO base (BAR0 + 0x800000). 
+ +static void nvWriteFifo(NvPrivateT *priv, uint32_t offset, uint32_t val) { + priv->fifo[offset / 4] = val; +} diff --git a/pci.c b/pci.c new file mode 100644 index 0000000..d22b727 --- /dev/null +++ b/pci.c @@ -0,0 +1,307 @@ +// pci.c -- PCI configuration space access for DOS/DJGPP +// +// Implements PCI mechanism 1 (CONFIG_ADDRESS at 0xCF8, CONFIG_DATA +// at 0xCFC). This is the standard PCI configuration access method +// supported by all PCI-capable chipsets. +// +// How mechanism 1 works: +// 1. Write a 32-bit address to port 0xCF8 with bit 31 set (enable), +// bus/dev/func/register fields encoded in bits 23:0 +// 2. Read or write the 32-bit data at port 0xCFC +// 3. For sub-dword access (8/16-bit), read the full dword and +// mask/shift, or write with a read-modify-write +// +// Detection: write 0x80000000 to 0xCF8 and read back. If the value +// matches, mechanism 1 is present. This works because bit 31 is the +// enable bit -- on non-PCI systems, port 0xCF8 is either absent +// (reads back 0xFF) or belongs to a different device. 
+ +#include "pci.h" + +#include + +// PCI configuration mechanism 1 I/O ports +#define PCI_CONFIG_ADDR 0x0CF8 +#define PCI_CONFIG_DATA 0x0CFC + +// ============================================================ +// Prototypes +// ============================================================ + +uint32_t pciBuildAddress(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +bool pciDetect(void); +int32_t pciEnumerate(PciEnumCallbackT cb, void *userData); +bool pciFindDevice(uint16_t vendorId, uint16_t deviceId, PciDeviceT *dev); +bool pciFindDeviceList(const uint16_t *idPairs, PciDeviceT *dev, int32_t *matchIdx); +uint8_t pciRead8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint16_t pciRead16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint32_t pciRead32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +void pciWrite8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint8_t val); +void pciWrite16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint16_t val); +void pciWrite32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint32_t val); + +// ============================================================ +// pciBuildAddress +// ============================================================ +// +// Constructs a PCI configuration space address for mechanism 1. +// Format: [31]=enable, [23:16]=bus, [15:11]=device, [10:8]=function, +// [7:2]=register (dword-aligned), [1:0]=0 + +uint32_t pciBuildAddress(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + return 0x80000000 + | ((uint32_t)bus << 16) + | ((uint32_t)dev << 11) + | ((uint32_t)func << 8) + | ((uint32_t)reg & 0xFC); +} + + +// ============================================================ +// pciDetect +// ============================================================ +// +// Checks for PCI mechanism 1 by writing the enable bit to the +// CONFIG_ADDRESS port and reading it back. 
Saves and restores +// the original port value to avoid disturbing any in-progress +// PCI transaction. + +bool pciDetect(void) { + uint32_t saved = inportl(PCI_CONFIG_ADDR); + + outportl(PCI_CONFIG_ADDR, 0x80000000); + uint32_t readBack = inportl(PCI_CONFIG_ADDR); + + outportl(PCI_CONFIG_ADDR, saved); + + return (readBack == 0x80000000); +} + + +// ============================================================ +// pciEnumerate +// ============================================================ +// +// Scans all bus/device/function combinations for present devices. +// A device is present if its vendor ID is not 0xFFFF. Multi-function +// devices are detected by checking bit 7 of the header type register +// on function 0; single-function devices only probe function 0. + +int32_t pciEnumerate(PciEnumCallbackT cb, void *userData) { + int32_t count = 0; + + for (int32_t bus = 0; bus < PCI_MAX_BUS; bus++) { + for (int32_t dev = 0; dev < PCI_MAX_DEV; dev++) { + uint16_t vendor0 = pciRead16(bus, dev, 0, PCI_VENDOR_ID); + + if (vendor0 == 0xFFFF) { + continue; + } + + // Check if multi-function device + uint8_t headerType = pciRead8(bus, dev, 0, PCI_HEADER_TYPE); + int32_t maxFunc = (headerType & 0x80) ? 
PCI_MAX_FUNC : 1; + + for (int32_t func = 0; func < maxFunc; func++) { + uint16_t vendorId = pciRead16(bus, dev, func, PCI_VENDOR_ID); + + if (vendorId == 0xFFFF) { + continue; + } + + PciDeviceT device; + device.bus = bus; + device.dev = dev; + device.func = func; + device.vendorId = vendorId; + device.deviceId = pciRead16(bus, dev, func, PCI_DEVICE_ID); + device.revision = pciRead8(bus, dev, func, PCI_REVISION_ID); + device.baseClass = pciRead8(bus, dev, func, PCI_BASE_CLASS); + device.subClass = pciRead8(bus, dev, func, PCI_SUBCLASS); + + for (int32_t i = 0; i < 6; i++) { + device.bar[i] = pciRead32(bus, dev, func, PCI_BAR0 + i * 4); + } + + count++; + + if (cb && cb(&device, userData)) { + return count; + } + } + } + } + + return count; +} + + +// ============================================================ +// pciFindDevice +// ============================================================ + +bool pciFindDevice(uint16_t vendorId, uint16_t deviceId, PciDeviceT *dev) { + for (int32_t bus = 0; bus < PCI_MAX_BUS; bus++) { + for (int32_t d = 0; d < PCI_MAX_DEV; d++) { + uint16_t vendor0 = pciRead16(bus, d, 0, PCI_VENDOR_ID); + + if (vendor0 == 0xFFFF) { + continue; + } + + uint8_t headerType = pciRead8(bus, d, 0, PCI_HEADER_TYPE); + int32_t maxFunc = (headerType & 0x80) ? 
PCI_MAX_FUNC : 1; + + for (int32_t func = 0; func < maxFunc; func++) { + uint16_t vid = pciRead16(bus, d, func, PCI_VENDOR_ID); + uint16_t did = pciRead16(bus, d, func, PCI_DEVICE_ID); + + if (vid == vendorId && did == deviceId) { + dev->bus = bus; + dev->dev = d; + dev->func = func; + dev->vendorId = vid; + dev->deviceId = did; + dev->revision = pciRead8(bus, d, func, PCI_REVISION_ID); + dev->baseClass = pciRead8(bus, d, func, PCI_BASE_CLASS); + dev->subClass = pciRead8(bus, d, func, PCI_SUBCLASS); + + for (int32_t i = 0; i < 6; i++) { + dev->bar[i] = pciRead32(bus, d, func, PCI_BAR0 + i * 4); + } + + return true; + } + } + } + } + + return false; +} + + +// ============================================================ +// pciFindDeviceList +// ============================================================ +// +// Searches for the first PCI device matching any vendor/device pair +// in the given list. The list is an array of uint16_t pairs: +// { vendor1, device1, vendor2, device2, ..., 0, 0 } +// On match, fills dev and sets matchIdx to the pair index (0-based). + +bool pciFindDeviceList(const uint16_t *idPairs, PciDeviceT *dev, int32_t *matchIdx) { + for (int32_t bus = 0; bus < PCI_MAX_BUS; bus++) { + for (int32_t d = 0; d < PCI_MAX_DEV; d++) { + uint16_t vendor0 = pciRead16(bus, d, 0, PCI_VENDOR_ID); + + if (vendor0 == 0xFFFF) { + continue; + } + + uint8_t headerType = pciRead8(bus, d, 0, PCI_HEADER_TYPE); + int32_t maxFunc = (headerType & 0x80) ? 
PCI_MAX_FUNC : 1; + + for (int32_t func = 0; func < maxFunc; func++) { + uint16_t vid = pciRead16(bus, d, func, PCI_VENDOR_ID); + uint16_t did = pciRead16(bus, d, func, PCI_DEVICE_ID); + + if (vid == 0xFFFF) { + continue; + } + + for (int32_t idx = 0; idPairs[idx * 2] != 0; idx++) { + if (vid == idPairs[idx * 2] && did == idPairs[idx * 2 + 1]) { + dev->bus = bus; + dev->dev = d; + dev->func = func; + dev->vendorId = vid; + dev->deviceId = did; + dev->revision = pciRead8(bus, d, func, PCI_REVISION_ID); + dev->baseClass = pciRead8(bus, d, func, PCI_BASE_CLASS); + dev->subClass = pciRead8(bus, d, func, PCI_SUBCLASS); + + for (int32_t i = 0; i < 6; i++) { + dev->bar[i] = pciRead32(bus, d, func, PCI_BAR0 + i * 4); + } + + if (matchIdx) { + *matchIdx = idx; + } + + return true; + } + } + } + } + } + + return false; +} + + +// ============================================================ +// pciRead8 +// ============================================================ + +uint8_t pciRead8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + return (dword >> ((reg & 3) * 8)) & 0xFF; +} + + +// ============================================================ +// pciRead16 +// ============================================================ + +uint16_t pciRead16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + return (dword >> ((reg & 2) * 8)) & 0xFFFF; +} + + +// ============================================================ +// pciRead32 +// ============================================================ + +uint32_t pciRead32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + return inportl(PCI_CONFIG_DATA); +} + + +// ============================================================ +// 
pciWrite8 +// ============================================================ + +void pciWrite8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint8_t val) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + int32_t shift = (reg & 3) * 8; + dword = (dword & ~(0xFF << shift)) | ((uint32_t)val << shift); + outportl(PCI_CONFIG_DATA, dword); +} + + +// ============================================================ +// pciWrite16 +// ============================================================ + +void pciWrite16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint16_t val) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + uint32_t dword = inportl(PCI_CONFIG_DATA); + int32_t shift = (reg & 2) * 8; + dword = (dword & ~(0xFFFF << shift)) | ((uint32_t)val << shift); + outportl(PCI_CONFIG_DATA, dword); +} + + +// ============================================================ +// pciWrite32 +// ============================================================ + +void pciWrite32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint32_t val) { + outportl(PCI_CONFIG_ADDR, pciBuildAddress(bus, dev, func, reg)); + outportl(PCI_CONFIG_DATA, val); +} diff --git a/pci.h b/pci.h new file mode 100644 index 0000000..c461584 --- /dev/null +++ b/pci.h @@ -0,0 +1,98 @@ +// pci.h -- PCI configuration space access for DOS/DJGPP +// +// Provides functions to read/write PCI configuration registers and +// enumerate devices on the PCI bus. Uses the standard mechanism 1 +// (I/O ports 0xCF8/0xCFC) which is supported by all PCI-capable +// systems from 1993 onward. +// +// All functions operate synchronously via inportl/outportl. No BIOS +// calls (INT 1Ah) are used because mechanism 1 is faster, simpler, +// and doesn't require a DPMI real-mode callback. 
+#ifndef PCI_H +#define PCI_H + +#include +#include + +// PCI configuration space register offsets (common header) +#define PCI_VENDOR_ID 0x00 +#define PCI_DEVICE_ID 0x02 +#define PCI_COMMAND 0x04 +#define PCI_STATUS 0x06 +#define PCI_REVISION_ID 0x08 +#define PCI_CLASS_CODE 0x09 +#define PCI_SUBCLASS 0x0A +#define PCI_BASE_CLASS 0x0B +#define PCI_HEADER_TYPE 0x0E +#define PCI_BAR0 0x10 +#define PCI_BAR1 0x14 +#define PCI_BAR2 0x18 +#define PCI_BAR3 0x1C +#define PCI_BAR4 0x20 +#define PCI_BAR5 0x24 +#define PCI_SUBSYS_VENDOR 0x2C +#define PCI_SUBSYS_ID 0x2E + +// PCI command register bits +#define PCI_CMD_IO_ENABLE 0x0001 +#define PCI_CMD_MEM_ENABLE 0x0002 +#define PCI_CMD_BUS_MASTER 0x0004 + +// PCI base class for display controllers +#define PCI_CLASS_DISPLAY 0x03 + +// Maximum PCI bus/device/function values +#define PCI_MAX_BUS 256 +#define PCI_MAX_DEV 32 +#define PCI_MAX_FUNC 8 + +// PCI device descriptor returned by enumeration +typedef struct { + uint8_t bus; + uint8_t dev; + uint8_t func; + uint16_t vendorId; + uint16_t deviceId; + uint8_t revision; + uint8_t baseClass; + uint8_t subClass; + uint32_t bar[6]; +} PciDeviceT; + +// Callback for pciEnumerate(). Return true to stop enumeration. +typedef bool (*PciEnumCallbackT)(const PciDeviceT *device, void *userData); + +// ============================================================ +// Prototypes +// ============================================================ + +// Build a CONFIG_ADDRESS dword for the given bus/dev/func/register. +uint32_t pciBuildAddress(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); + +// Check whether PCI mechanism 1 is available. +bool pciDetect(void); + +// Enumerate all PCI devices. Calls cb for each device found. +// Stops early if cb returns true. Returns the number of devices found. +int32_t pciEnumerate(PciEnumCallbackT cb, void *userData); + +// Find the first PCI device matching vendorId/deviceId. +// Returns true if found (and fills out dev), false if not. 
+bool pciFindDevice(uint16_t vendorId, uint16_t deviceId, PciDeviceT *dev); + +// Find the first PCI device matching any of the given vendor/device +// pairs. The list is terminated by a {0, 0} entry. Returns true if +// found (and fills out dev and matchIdx), false if not. +bool pciFindDeviceList(const uint16_t *idPairs, PciDeviceT *dev, int32_t *matchIdx); + +// Read an 8/16/32-bit value from PCI configuration space. +uint8_t pciRead8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint16_t pciRead16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); +uint32_t pciRead32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg); + +// Write an 8/16/32-bit value to PCI configuration space. +void pciWrite8(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint8_t val); +void pciWrite16(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint16_t val); +void pciWrite32(uint8_t bus, uint8_t dev, uint8_t func, uint8_t reg, uint32_t val); + +#endif // PCI_H diff --git a/s3Trio.c b/s3Trio.c new file mode 100644 index 0000000..40d7fad --- /dev/null +++ b/s3Trio.c @@ -0,0 +1,1216 @@ +// s3Trio.c -- S3 Trio64/Vision864/Vision968 accelerated video driver +// +// Supports the S3 86C764 (Trio64), 86C765 (Trio64V+), 86C868 (Vision868), +// 86C864 (Vision864), 86C964 (Vision964), 86C968 (Vision968), and +// 86C732 (Trio32) chipsets. +// +// The S3 2D acceleration engine (sometimes called the "graphics engine" +// or "BitBLT engine") provides hardware-accelerated: +// - Solid rectangle fill +// - 8x8 mono/color pattern fill +// - Screen-to-screen BitBLT +// - Mono color expansion (for text rendering) +// - Bresenham line draw +// - Hardware clipping rectangle +// - 64x64 two-color hardware cursor +// +// Register access: +// The S3 extended registers are accessed through CRTC index/data +// ports (0x3D4/0x3D5) at indices 0x30-0x6D. These must be unlocked +// by writing specific key values to CR38 and CR39. 
//
// The 2D engine registers are at I/O ports 0x82E8-0xBEE8 (legacy)
// or via MMIO at the linear framebuffer base + 0x1000000 on newer
// chips (Trio64+). We use MMIO when available (Trio64, ViRGE) for
// faster register access, falling back to I/O on older Vision chips.
//
// VESA mode setting:
//   We use VBE BIOS calls for mode setting rather than programming
//   CRTC timings directly. This is simpler and more reliable across
//   the S3 chip variants (which have subtly different timing register
//   layouts). After VESA sets the mode, we unlock the S3 extended
//   registers and enable the acceleration engine.

#include "accelVid.h"
#include "vgaCommon.h"
#include "pci.h"

// NOTE(review): the names of the five system headers below are missing
// from this copy of the file (the <...> part was lost). Restore them from
// the original source before building.
#include
#include
#include
#include
#include

// ============================================================
// S3 vendor/device IDs
// ============================================================

#define S3_VENDOR_ID        0x5333

#define S3_TRIO32           0x8810
#define S3_TRIO64           0x8811
#define S3_TRIO64V_PLUS     0x8814
#define S3_VISION864        0x88C0
#define S3_VISION864P       0x88C1
#define S3_VISION868        0x8880
#define S3_VISION964        0x88D0
#define S3_VISION968        0x88F0
#define S3_VISION968_ALT    0x88F1
#define S3_VIRGE            0x5631
#define S3_VIRGE_VX         0x883D
#define S3_VIRGE_DX         0x8A01
#define S3_VIRGE_GX2        0x8A10
#define S3_VIRGE_MX         0x8C01
#define S3_VIRGE_MXP        0x8C03
#define S3_SAVAGE3D         0x8A20
#define S3_SAVAGE3D_MV      0x8A21
#define S3_SAVAGE4          0x8A22
#define S3_SAVAGE_MX        0x8C10
#define S3_SAVAGE_MX_MV     0x8C11
#define S3_SAVAGE_IX        0x8C12
#define S3_SAVAGE_IX_MV     0x8C13
#define S3_SAVAGE_2000      0x9102

// Flat list of (vendor ID, device ID) pairs handed to pciFindDeviceList()
// by s3Detect(). Terminated by a {0, 0} pair.
static const uint16_t sS3DeviceIds[] = {
    S3_VENDOR_ID, S3_TRIO32,
    S3_VENDOR_ID, S3_TRIO64,
    S3_VENDOR_ID, S3_TRIO64V_PLUS,
    S3_VENDOR_ID, S3_VIRGE,
    S3_VENDOR_ID, S3_VIRGE_VX,
    S3_VENDOR_ID, S3_VIRGE_DX,
    S3_VENDOR_ID, S3_VIRGE_GX2,
    S3_VENDOR_ID, S3_VIRGE_MX,
    S3_VENDOR_ID, S3_VIRGE_MXP,
    S3_VENDOR_ID, S3_SAVAGE3D,
    S3_VENDOR_ID, S3_SAVAGE3D_MV,
    S3_VENDOR_ID, S3_SAVAGE4,
    S3_VENDOR_ID, S3_SAVAGE_MX,
    S3_VENDOR_ID, S3_SAVAGE_MX_MV,
    S3_VENDOR_ID, S3_SAVAGE_IX,
    S3_VENDOR_ID, S3_SAVAGE_IX_MV,
    S3_VENDOR_ID, S3_SAVAGE_2000,
    S3_VENDOR_ID, S3_VISION864,
    S3_VENDOR_ID, S3_VISION864P,
    S3_VENDOR_ID, S3_VISION868,
    S3_VENDOR_ID, S3_VISION964,
    S3_VENDOR_ID, S3_VISION968,
    S3_VENDOR_ID, S3_VISION968_ALT,
    0, 0
};

// ============================================================
// S3 extended CRTC register indices
// ============================================================
//
// Indices into the CRTC index/data port pair; accessed in this file
// through vgaCrtcRead()/vgaCrtcWrite().

#define S3_CR30_CHIP_ID             0x30
#define S3_CR31_MEM_CONFIG          0x31
#define S3_CR33_BACKWARD_COMPAT     0x33
#define S3_CR34_BACKWARD_COMPAT     0x34
#define S3_CR35_CRTC_LOCK           0x35
#define S3_CR38_LOCK_1              0x38  // unlock with 0x48
#define S3_CR39_LOCK_2              0x39  // unlock with 0xA5
#define S3_CR40_SYS_CONFIG          0x40
#define S3_CR40_ENGINE_ENABLE       0x01  // bit 0: enable graphics engine
#define S3_CR42_MODE_CONTROL        0x42
#define S3_CR43_EXT_MODE            0x43
#define S3_CR45_HW_CURSOR_MODE      0x45
#define S3_CR46_HW_CURSOR_XHI       0x46
#define S3_CR47_HW_CURSOR_XLO       0x47
#define S3_CR48_HW_CURSOR_YHI       0x48
#define S3_CR49_HW_CURSOR_YLO       0x49
#define S3_CR4A_HW_CURSOR_FG_HI     0x4A
#define S3_CR4B_HW_CURSOR_FG_LO     0x4B
#define S3_CR4C_HW_CURSOR_ADDR_HI   0x4C
#define S3_CR4D_HW_CURSOR_ADDR_LO   0x4D
#define S3_CR4E_HW_CURSOR_BG_HI     0x4E
#define S3_CR4F_HW_CURSOR_BG_LO     0x4F
#define S3_CR50_EXT_SYS_CTRL_1      0x50
// CR50 pixel length bits (bits 5:4)
#define S3_CR50_PIX_8BPP            0x00
#define S3_CR50_PIX_16BPP           0x10
#define S3_CR50_PIX_32BPP           0x30
#define S3_CR51_EXT_SYS_CTRL_2      0x51
#define S3_CR53_EXT_MEM_CTRL_1      0x53
#define S3_CR54_EXT_MEM_CTRL_2      0x54
#define S3_CR55_EXT_DAC_CTRL        0x55
#define S3_CR58_LFB_CTRL            0x58
#define S3_CR59_LFB_ADDR_HI         0x59
#define S3_CR5A_LFB_ADDR_LO         0x5A
#define S3_CR5D_EXT_HCNT            0x5D
#define S3_CR5E_EXT_VCNT            0x5E
#define S3_CR67_EXT_MISC_CTRL_2     0x67
#define S3_CR6A_EXT_MISC_CTRL_3     0x6A

//
// ============================================================
// S3 2D engine I/O ports (legacy access)
// ============================================================
//
// These are the standard S3 accelerator register ports. All S3
// chips from the 928 onward support this I/O port interface.
//
// S3_GP_STAT and S3_CMD deliberately share port 0x9AE8: this file
// reads status from it and writes commands to it.

#define S3_CUR_Y            0x82E8
#define S3_CUR_X            0x86E8
#define S3_DESTY_AXSTP      0x8AE8  // destination Y / axial step
#define S3_DESTX_DIASTP     0x8EE8  // destination X / diagonal step
#define S3_ERR_TERM         0x92E8
#define S3_MAJ_AXIS_PCNT    0x96E8  // major axis pixel count
#define S3_GP_STAT          0x9AE8  // graphics processor status
#define S3_CMD              0x9AE8  // command register (write)
#define S3_SHORT_STROKE     0x9EE8
#define S3_BKGD_COLOR       0xA2E8
#define S3_FRGD_COLOR       0xA6E8
#define S3_WRT_MASK         0xAAE8
#define S3_RD_MASK          0xAEE8
#define S3_COLOR_CMP        0xB2E8
#define S3_BKGD_MIX         0xB6E8
#define S3_FRGD_MIX         0xBAE8
#define S3_MULTIFUNC_CTRL   0xBEE8  // multi-function control register
#define S3_PIX_TRANS        0xE2E8  // pixel data transfer

// ============================================================
// S3 MULTIFUNC_CTRL sub-register indices
// ============================================================
//
// The multi-function control register at 0xBEE8 is a multiplexed
// port: bits 15:12 select the sub-register, bits 11:0 are the value.
#define S3_MF_MIN_AXIS_PCNT 0x0000  // minor axis pixel count
#define S3_MF_SCISSORS_T    0x1000  // scissors top
#define S3_MF_SCISSORS_L    0x2000  // scissors left
#define S3_MF_SCISSORS_B    0x3000  // scissors bottom
#define S3_MF_SCISSORS_R    0x4000  // scissors right
#define S3_MF_PIX_CNTL      0xA000  // pixel control
#define S3_MF_MULT_MISC_2   0xD000  // multi misc 2
#define S3_MF_READ_SEL      0xE000  // read register select

// ============================================================
// S3 command register bits
// ============================================================

// Command type (bits 15:13 for Trio64)
#define S3_CMD_NOP          0x0000
#define S3_CMD_LINE         0x2000
#define S3_CMD_RECT         0x4000
#define S3_CMD_POLY_LINE    0x6000
#define S3_CMD_NOP2         0x8000
#define S3_CMD_BITBLT       0xC000

// Draw enable (bit 4) and drawing direction (bits 7:5).
//
// The direction field follows the 8514/A command layout:
//   bit 5 = X positive, bit 6 = Y is the major axis, bit 7 = Y positive.
// The Y-positive and Y-major values were previously swapped, which made
// every "top-to-bottom" operation select Y-major/upwards instead.
#define S3_CMD_DRAW         0x0010  // draw (vs. move)
#define S3_CMD_DIR_X_POS    0x0020  // X direction positive
#define S3_CMD_DIR_Y_MAJOR  0x0040  // Y is major axis
#define S3_CMD_DIR_X_MAJOR  0x0000  // X is major axis (bit 6 clear)
#define S3_CMD_DIR_Y_POS    0x0080  // Y direction positive

// Additional command bits
#define S3_CMD_PLANAR       0x0002  // planar mode
// Bit 2 is used under two names depending on command type:
//   For RECT/BITBLT: across-plane (packed pixel) mode
//   For LINE:        include last pixel
// NOTE(review): the S3 data book appears to place the across-plane
// select at bit 1 (S3_CMD_PLANAR above) and to define bit 2 as "last
// pixel OFF" -- confirm both names against the data book before relying
// on them. Values are left unchanged here because callers depend on them.
#define S3_CMD_ACROSS_PLANE 0x0004
#define S3_CMD_LAST_PIXEL   0x0004
#define S3_CMD_BYTE_SWAP    0x1000  // byte swap for pixel transfer
#define S3_CMD_16BIT_IO     0x0200  // 16-bit pixel transfer
#define S3_CMD_32BIT_IO     0x0400  // 32-bit pixel transfer

// ============================================================
// S3 MIX register values
// ============================================================
//
// The foreground and background MIX registers control what source
// is used and what ROP is applied.
//
// Bits 4:0 = ROP (raster operation)
// Bits 6:5 = source select:
//   00 = background color register
//   01 = foreground color register
//   10 = pixel data from CPU (via PIX_TRANS)
//   11 = display memory (screen source)

#define S3_MIX_SRC_BKGD         0x00
#define S3_MIX_SRC_FRGD         0x20
#define S3_MIX_SRC_CPU          0x40
#define S3_MIX_SRC_DISPLAY      0x60

// Common raster operations (bits 4:0).
//
// Values follow the 8514/A-style mix table used by the S3 engine:
//   0=~D 1=0 2=1 3=D 4=~S 5=S^D 6=~(S^D) 7=S 8=~S|~D 9=~S|D
//   A=S|~D B=S|D C=S&D D=S&~D E=~S&D F=~S&~D
// Three entries previously carried codes belonging to other operations
// (0x06 is XNOR, 0x0B is OR, 0x0E is ~S&D) and are corrected here.
#define S3_MIX_ROP_NOT_DST      0x00  // NOT dest
#define S3_MIX_ROP_ZERO         0x01  // 0
#define S3_MIX_ROP_ONE          0x02  // 1
#define S3_MIX_ROP_DST          0x03  // dest (nop)
#define S3_MIX_ROP_NOT_SRC      0x04  // NOT source
#define S3_MIX_ROP_SRC_XOR_DST  0x05  // source XOR dest
#define S3_MIX_ROP_NOT_SRC_AND  0x0E  // NOT source AND dest
#define S3_MIX_ROP_SRC_AND_DST  0x0C  // source AND dest
#define S3_MIX_ROP_SRC          0x07  // source (copy)
#define S3_MIX_ROP_NOT_SRC_OR   0x09  // NOT source OR dest
#define S3_MIX_ROP_SRC_OR_DST   0x0B  // source OR dest

// ============================================================
// S3 PIX_CNTL (pixel control) values
// ============================================================
//
// Written via MULTIFUNC_CTRL with index 0xA000.
// Controls the source of foreground/background mix selection.
//
// Bits 7:6 = mix select:
//   00 = foreground mix always
//   01 = reserved
//   10 = CPU data determines mix (mono color expansion)
//   11 = display memory determines mix
//
// The CPU-data and display-memory selects were previously encoded one
// step too low (0x0040 / 0x0080), so color expansion selected the
// reserved 01 encoding.

#define S3_PIXCNTL_MIX_FRGD     0x0000  // always use foreground mix
#define S3_PIXCNTL_MIX_CPU      0x0080  // CPU data selects mix (color expansion)
#define S3_PIXCNTL_MIX_DISPLAY  0x00C0  // display memory selects mix

// ============================================================
// S3 GP_STAT bits
// ============================================================
//
// FIFO status bits 7:0 are "slot NOT available" flags: bit (8 - N) reads
// 0 once at least N FIFO slots are free, and 1 while fewer than N are
// free. Wait loops must therefore spin while the bit is SET.

#define S3_GP_STAT_BUSY         0x0200  // graphics engine busy
#define S3_GP_STAT_FIFO_EMPTY   0x0400  // all FIFO slots empty
#define S3_GP_STAT_FIFO_1       0x0080  // set = fewer than 1 FIFO slot free
#define S3_GP_STAT_FIFO_2       0x0040  // set = fewer than 2 FIFO slots free
#define S3_GP_STAT_FIFO_3       0x0020  // set = fewer than 3 FIFO slots free
#define S3_GP_STAT_FIFO_4       0x0010  // set = fewer than 4 FIFO slots free
#define S3_GP_STAT_FIFO_5       0x0008  // set = fewer than 5 FIFO slots free
#define S3_GP_STAT_FIFO_6       0x0004  // set = fewer than 6 FIFO slots free
#define S3_GP_STAT_FIFO_7       0x0002  // set = fewer than 7 FIFO slots free
#define S3_GP_STAT_FIFO_8       0x0001  // set = fewer than 8 FIFO slots free

// Hardware cursor constants
#define S3_HW_CURSOR_SIZE       64      // 64x64 pixels
#define S3_HW_CURSOR_BYTES      1024    // 64*64/8 * 2 planes = 1024 bytes

// Maximum wait iterations to prevent infinite loops on broken hardware
#define S3_MAX_IDLE_WAIT        1000000

// MMIO region offset from LFB base (Trio64/ViRGE new-style MMIO)
#define S3_MMIO_OFFSET          0x1000000
#define S3_MMIO_SIZE            0x10000  // 64KB MMIO window

// ============================================================
// S3 MMIO register offset mapping
// ============================================================
//
// The S3 "new MMIO" maps the enhanced registers into a 64KB
// window at LFB + 0x1000000.
// The I/O port addresses map to MMIO offsets as follows:
//   I/O 0x82E8 -> MMIO 0x82E8 (same offset within 64KB window)
//
// For 16-bit register access: write to offset as uint16_t
// For 32-bit register access: write to offset as uint32_t

// ============================================================
// Private driver state
// ============================================================
//
// One instance (sS3Private) backs the single registered driver. It is
// zeroed and filled in by s3Init().

typedef struct {
    uint32_t          lfbPhysAddr;    // physical address of LFB
    uint32_t          vramSize;       // total VRAM in bytes
    uint32_t          cursorOffset;   // VRAM offset for cursor image
    int32_t           bytesPerPixel;
    int32_t           screenPitch;    // bytes per scanline
    bool              isTrio;         // true for Trio32/64/V+/ViRGE
    bool              useMMIO;        // true if MMIO is available
    volatile uint8_t *mmio;           // mapped MMIO base pointer (NULL if I/O mode)
    DpmiMappingT      lfbMapping;
    DpmiMappingT      mmioMapping;
} S3PrivateT;

// ============================================================
// Prototypes
// ============================================================

static void s3BitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h);
static void s3ColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg);
static bool s3Detect(AccelDriverT *drv);
static void s3HostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h);
static bool s3Init(AccelDriverT *drv, const AccelModeRequestT *req);
static void s3LineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color);
static void s3MoveCursor(AccelDriverT *drv, int32_t x, int32_t y);
static void s3RectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color);
static void s3RectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg);
static void s3SetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h);
static void s3SetCursor(AccelDriverT *drv, const HwCursorImageT *image);
static void s3ShowCursor(AccelDriverT *drv, bool visible);
static void s3Shutdown(AccelDriverT *drv);
static void s3UnlockRegs(void);
static void s3WaitFifo(S3PrivateT *priv, int32_t slots);
static void s3WaitIdle(AccelDriverT *drv);

// ============================================================
// Driver instance
// ============================================================

static S3PrivateT sS3Private;

// Driver descriptor handed to accelRegisterDriver(). .name is a default
// that s3Detect() overwrites with the specific chip name; .caps starts
// at 0 and is filled in by s3Init().
static AccelDriverT sS3Driver = {
    .name        = "S3 Trio64",
    .chipFamily  = "s3",
    .caps        = 0,
    .privData    = &sS3Private,
    .detect      = s3Detect,
    .init        = s3Init,
    .shutdown    = s3Shutdown,
    .waitIdle    = s3WaitIdle,
    .setClip     = s3SetClip,
    .rectFill    = s3RectFill,
    .rectFillPat = s3RectFillPat,
    .bitBlt      = s3BitBlt,
    .hostBlit    = s3HostBlit,
    .colorExpand = s3ColorExpand,
    .lineDraw    = s3LineDraw,
    .setCursor   = s3SetCursor,
    .moveCursor  = s3MoveCursor,
    .showCursor  = s3ShowCursor,
};

// ============================================================
// s3RegisterDriver
// ============================================================
//
// Called from main() to register the S3 driver with the manager.

void s3RegisterDriver(void) {
    accelRegisterDriver(&sS3Driver);
}


// ============================================================
// S3 register access helpers
// ============================================================
//
// When MMIO is available (Trio64, ViRGE, Savage), register access
// goes through the MMIO window at LFB + 0x1000000. The I/O port
// addresses map directly to MMIO offsets within the 64KB window.
// When MMIO is not available (Vision series), we fall back to
// I/O port access.
+// +// Using MMIO is faster because: (1) memory writes can be posted +// and pipelined by the CPU, (2) no I/O port decode penalty, and +// (3) on Pentium+, memory writes are faster than I/O instructions. + +static inline void s3WriteReg16(S3PrivateT *priv, uint16_t port, uint16_t val) { + if (priv->useMMIO) { + *(volatile uint16_t *)(priv->mmio + (port & 0xFFFF)) = val; + } else { + outportw(port, val); + } +} + +static inline uint16_t s3ReadReg16(S3PrivateT *priv, uint16_t port) { + if (priv->useMMIO) { + return *(volatile uint16_t *)(priv->mmio + (port & 0xFFFF)); + } + + return inportw(port); +} + + +// ============================================================ +// s3BitBlt +// ============================================================ +// +// Screen-to-screen BitBLT using the S3 hardware engine. +// Handles overlapping source and destination by adjusting the +// blit direction. The S3 engine can blit in any of four +// directions (positive/negative X/Y). + +static void s3BitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + // Determine blit direction to handle overlapping regions + uint16_t cmd = S3_CMD_BITBLT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE; + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (dstX <= srcX) { + cmd |= S3_CMD_DIR_X_POS; + } else { + sx += w - 1; + dx += w - 1; + } + + if (dstY <= srcY) { + cmd |= S3_CMD_DIR_Y_POS; + } else { + sy += h - 1; + dy += h - 1; + } + + s3WaitFifo(priv, 7); + + // Foreground mix: source = display memory, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_DISPLAY | S3_MIX_ROP_SRC); + s3WriteReg16(priv, S3_WRT_MASK, 0xFFFF); + + // Pixel control: always foreground mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + // Source position + s3WriteReg16(priv, S3_CUR_X, sx); + 
s3WriteReg16(priv, S3_CUR_Y, sy); + + // Destination position + s3WriteReg16(priv, S3_DESTX_DIASTP, dx); + s3WriteReg16(priv, S3_DESTY_AXSTP, dy); + + s3WaitFifo(priv, 3); + + // Dimensions (count is pixels - 1) + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Fire + s3WriteReg16(priv, S3_CMD, cmd); +} + + +// ============================================================ +// s3ColorExpand +// ============================================================ +// +// Monochrome-to-color expansion using CPU-driven pixel transfer. +// This is used for text rendering: each byte of srcBuf contains +// 8 monochrome pixels (MSB first), which the engine expands to +// full-color using the foreground and background color registers. +// +// The S3 engine is set to CPU data mix mode: for each bit in +// the transferred data, 1 = use foreground color, 0 = use +// background color. + +static void s3ColorExpand(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h, uint32_t fg, uint32_t bg) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + s3WaitFifo(priv, 8); + + // Set colors + s3WriteReg16(priv, S3_FRGD_COLOR, fg); + s3WriteReg16(priv, S3_BKGD_COLOR, bg); + + // Foreground mix: source = foreground color, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + // Background mix: source = background color, ROP = copy + s3WriteReg16(priv, S3_BKGD_MIX, S3_MIX_SRC_BKGD | S3_MIX_ROP_SRC); + + // Pixel control: CPU data selects fg/bg mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_CPU); + + // Destination and dimensions + s3WriteReg16(priv, S3_CUR_X, dstX); + s3WriteReg16(priv, S3_CUR_Y, dstY); + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + + s3WaitFifo(priv, 2); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Command: 
rectangle, draw, left-to-right top-to-bottom, CPU data + uint16_t cmd = S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS + | S3_CMD_16BIT_IO; + s3WriteReg16(priv, S3_CMD, cmd); + + // Transfer monochrome data to the engine one scanline at a time. + // The engine expects MSB-first bit order, which matches our + // convention. Data must be written to PIX_TRANS in 16-bit words. + int32_t wordsPerRow = (w + 15) / 16; + + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + s3WaitFifo(priv, 1); + + for (int32_t word = 0; word < wordsPerRow; word++) { + int32_t byteOff = word * 2; + uint8_t hi = (byteOff < srcPitch) ? rowData[byteOff] : 0; + uint8_t lo = (byteOff + 1 < srcPitch) ? rowData[byteOff + 1] : 0; + s3WriteReg16(priv, S3_PIX_TRANS, (hi << 8) | lo); + } + } +} + + +// ============================================================ +// s3Detect +// ============================================================ +// +// Scans PCI for any S3 chip in our supported list. Does not +// touch any hardware registers (detect must be side-effect-free). 
+ +static bool s3Detect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sS3DeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + // Set the driver name based on the specific chip found + switch (drv->pciDev.deviceId) { + case S3_TRIO32: + drv->name = "S3 Trio32"; + break; + case S3_TRIO64: + drv->name = "S3 Trio64"; + break; + case S3_TRIO64V_PLUS: + drv->name = "S3 Trio64V+"; + break; + case S3_VISION864: + case S3_VISION864P: + drv->name = "S3 Vision864"; + break; + case S3_VISION868: + drv->name = "S3 Vision868"; + break; + case S3_VISION964: + drv->name = "S3 Vision964"; + break; + case S3_VIRGE: + drv->name = "S3 ViRGE"; + break; + case S3_VIRGE_VX: + drv->name = "S3 ViRGE/VX"; + break; + case S3_VIRGE_DX: + drv->name = "S3 ViRGE/DX"; + break; + case S3_VIRGE_GX2: + drv->name = "S3 ViRGE/GX2"; + break; + case S3_VIRGE_MX: + case S3_VIRGE_MXP: + drv->name = "S3 ViRGE/MX"; + break; + case S3_SAVAGE3D: + case S3_SAVAGE3D_MV: + drv->name = "S3 Savage3D"; + break; + case S3_SAVAGE4: + drv->name = "S3 Savage4"; + break; + case S3_SAVAGE_MX: + case S3_SAVAGE_MX_MV: + drv->name = "S3 Savage/MX"; + break; + case S3_SAVAGE_IX: + case S3_SAVAGE_IX_MV: + drv->name = "S3 Savage/IX"; + break; + case S3_SAVAGE_2000: + drv->name = "S3 Savage 2000"; + break; + case S3_VISION968: + case S3_VISION968_ALT: + drv->name = "S3 Vision968"; + break; + default: + drv->name = "S3 (unknown)"; + break; + } + + return true; +} + + +// ============================================================ +// s3HostBlit +// ============================================================ +// +// CPU-to-screen blit via the PIX_TRANS port. Transfers packed +// pixel data from system RAM to VRAM through the engine. The +// engine handles the destination address calculation and pitch +// alignment, so the CPU just streams data. 
+ +static void s3HostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t rowBytes = w * bpp; + int32_t wordCount = (rowBytes + 1) / 2; + + s3WaitFifo(priv, 7); + + // Foreground mix: source = CPU data, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_CPU | S3_MIX_ROP_SRC); + s3WriteReg16(priv, S3_WRT_MASK, 0xFFFF); + + // Pixel control: always foreground mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + // Destination position + s3WriteReg16(priv, S3_CUR_X, dstX); + s3WriteReg16(priv, S3_CUR_Y, dstY); + + // Dimensions + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + s3WaitFifo(priv, 1); + + // Command: rectangle, draw, CPU data, left-to-right top-to-bottom + s3WriteReg16(priv, S3_CMD, S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS + | S3_CMD_16BIT_IO); + + // Transfer pixel data row by row through PIX_TRANS + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t word = 0; word < wordCount; word++) { + int32_t byteOff = word * 2; + uint8_t lo = rowData[byteOff]; + uint8_t hi = (byteOff + 1 < rowBytes) ? rowData[byteOff + 1] : 0; + s3WriteReg16(priv, S3_PIX_TRANS, (hi << 8) | lo); + } + } +} + + +// ============================================================ +// s3Init +// ============================================================ +// +// Initializes the S3 chip: sets the requested video mode via +// VESA, unlocks extended registers, enables the 2D engine, and +// maps the linear framebuffer. 
+// +// Mode setting strategy: use VESA VBE to set the mode (with LFB +// flag bit 14 set), then unlock S3 extended registers and +// configure the acceleration engine. This avoids the complexity +// of programming S3-specific CRTC timing registers while still +// getting full hardware acceleration. + +static bool s3Init(AccelDriverT *drv, const AccelModeRequestT *req) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + memset(priv, 0, sizeof(*priv)); + + priv->isTrio = (drv->pciDev.deviceId == S3_TRIO32 + || drv->pciDev.deviceId == S3_TRIO64 + || drv->pciDev.deviceId == S3_TRIO64V_PLUS + || drv->pciDev.deviceId == S3_VIRGE + || drv->pciDev.deviceId == S3_VIRGE_VX + || drv->pciDev.deviceId == S3_VIRGE_DX + || drv->pciDev.deviceId == S3_VIRGE_GX2 + || drv->pciDev.deviceId == S3_VIRGE_MX + || drv->pciDev.deviceId == S3_VIRGE_MXP + || drv->pciDev.deviceId == S3_SAVAGE3D + || drv->pciDev.deviceId == S3_SAVAGE3D_MV + || drv->pciDev.deviceId == S3_SAVAGE4 + || drv->pciDev.deviceId == S3_SAVAGE_MX + || drv->pciDev.deviceId == S3_SAVAGE_MX_MV + || drv->pciDev.deviceId == S3_SAVAGE_IX + || drv->pciDev.deviceId == S3_SAVAGE_IX_MV + || drv->pciDev.deviceId == S3_SAVAGE_2000); + + // Determine VRAM size and LFB address from BAR0 + uint32_t barSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + priv->vramSize = barSize; + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + + // Unlock S3 extended registers + s3UnlockRegs(); + + // Cross-check VRAM size from CR36 on Trio chips + if (priv->isTrio) { + uint8_t cr36 = vgaCrtcRead(0x36); + uint32_t ramFromCr36; + + switch ((cr36 >> 5) & 0x07) { + case 0: ramFromCr36 = 4 * 1024 * 1024; break; + case 2: ramFromCr36 = 3 * 1024 * 1024; break; + case 4: ramFromCr36 = 2 * 1024 * 1024; break; + case 6: ramFromCr36 = 1 * 1024 * 1024; break; + default: ramFromCr36 = 1 * 1024 * 1024; break; + } + + if (barSize < 512 * 1024 || 
barSize > 64 * 1024 * 1024) { + priv->vramSize = ramFromCr36; + } + } + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + return false; + } + + // Fill in driver mode info + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Map MMIO region for Trio64/ViRGE (at LFB + 16MB) + priv->useMMIO = false; + priv->mmio = NULL; + if (priv->isTrio) { + if (dpmiMapFramebuffer(priv->lfbPhysAddr + S3_MMIO_OFFSET, S3_MMIO_SIZE, &priv->mmioMapping)) { + priv->useMMIO = true; + priv->mmio = (volatile uint8_t *)priv->mmioMapping.ptr; + } + } + + // Re-unlock after mode set (VESA may re-lock) + s3UnlockRegs(); + + // Enable the graphics engine + // CR40 bit 0 = enable graphics engine + uint8_t cr40 = vgaCrtcRead(S3_CR40_SYS_CONFIG); + vgaCrtcWrite(S3_CR40_SYS_CONFIG, cr40 | S3_CR40_ENGINE_ENABLE); + + // Set up pixel format in CR50 for the engine + uint8_t cr50 = vgaCrtcRead(S3_CR50_EXT_SYS_CTRL_1); + cr50 &= 0xC0; // clear pixel length bits + + switch (vesa.bpp) { + case 8: + cr50 |= S3_CR50_PIX_8BPP; + break; + case 15: + case 16: + cr50 |= S3_CR50_PIX_16BPP; + break; + case 32: + cr50 |= S3_CR50_PIX_32BPP; + break; + } + + vgaCrtcWrite(S3_CR50_EXT_SYS_CTRL_1, cr50); + + // Set up hardware cursor location at end of VRAM + // Cursor image is 1KB (64x64 2bpp), aligned to 1KB + priv->cursorOffset = priv->vramSize - S3_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(S3_HW_CURSOR_BYTES - 1); + + // Set capability flags + drv->caps = ACAP_RECT_FILL + | ACAP_RECT_FILL_PAT + | 
ACAP_BITBLT + | ACAP_COLOR_EXPAND + | ACAP_HOST_BLIT + | ACAP_LINE_DRAW + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Set full-screen clip rectangle + s3SetClip(drv, 0, 0, vesa.width, vesa.height); + + // Wait for engine to be ready + s3WaitIdle(drv); + + return true; +} + + +// ============================================================ +// s3LineDraw +// ============================================================ +// +// Bresenham line drawing using the S3 hardware engine. +// The engine implements the Bresenham algorithm natively -- +// we provide the initial error term and step values. + +static void s3LineDraw(AccelDriverT *drv, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + int32_t dx = x2 - x1; + int32_t dy = y2 - y1; + + // Determine octant and make dx/dy positive + uint16_t cmd = S3_CMD_LINE | S3_CMD_DRAW | S3_CMD_LAST_PIXEL; + + if (dx >= 0) { + cmd |= S3_CMD_DIR_X_POS; + } else { + dx = -dx; + } + + if (dy >= 0) { + cmd |= S3_CMD_DIR_Y_POS; + } else { + dy = -dy; + } + + int32_t majAxis; + int32_t minAxis; + + if (dx >= dy) { + // X is major axis + majAxis = dx; + minAxis = dy; + } else { + // Y is major axis + cmd |= S3_CMD_DIR_Y_MAJOR; + majAxis = dy; + minAxis = dx; + } + + if (majAxis == 0) { + return; + } + + // Bresenham parameters: + // axialStep = 2 * minAxis + // diagonalStep = 2 * (minAxis - majAxis) + // errorTerm = 2 * minAxis - majAxis + int32_t axialStep = 2 * minAxis; + int32_t diagStep = 2 * (minAxis - majAxis); + int32_t errTerm = 2 * minAxis - majAxis; + + s3WaitFifo(priv, 7); + + s3WriteReg16(priv, S3_FRGD_COLOR, color); + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + s3WriteReg16(priv, S3_CUR_X, x1); + s3WriteReg16(priv, S3_CUR_Y, y1); + + s3WriteReg16(priv, S3_DESTY_AXSTP, axialStep); + s3WriteReg16(priv, S3_DESTX_DIASTP, diagStep); + + s3WaitFifo(priv, 3); + + 
s3WriteReg16(priv, S3_ERR_TERM, errTerm); + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, majAxis); + s3WriteReg16(priv, S3_CMD, cmd); +} + + +// ============================================================ +// s3MoveCursor +// ============================================================ +// +// Moves the hardware cursor to the given screen position. +// The S3 cursor registers are in CRTC extended registers CR46-CR49. + +static void s3MoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + // Handle negative coordinates (cursor partially off-screen) + // by setting the cursor origin offset in the image + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + vgaCrtcWrite(S3_CR46_HW_CURSOR_XHI, (x >> 8) & 0x07); + vgaCrtcWrite(S3_CR47_HW_CURSOR_XLO, x & 0xFF); + vgaCrtcWrite(S3_CR48_HW_CURSOR_YHI, (y >> 8) & 0x07); + vgaCrtcWrite(S3_CR49_HW_CURSOR_YLO, y & 0xFF); +} + + +// ============================================================ +// s3RectFill +// ============================================================ +// +// Solid rectangle fill using the S3 hardware engine. +// Sets the foreground color, selects foreground-only mix mode +// with copy ROP, then issues a rectangle command. 
+ +static void s3RectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + s3WaitFifo(priv, 7); + + // Set foreground color + s3WriteReg16(priv, S3_FRGD_COLOR, color); + + // Foreground mix: source = foreground color, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + + // Write mask: all bits enabled + s3WriteReg16(priv, S3_WRT_MASK, 0xFFFF); + + // Pixel control: always use foreground mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_FRGD); + + // Starting position + s3WriteReg16(priv, S3_CUR_X, x); + s3WriteReg16(priv, S3_CUR_Y, y); + + // Dimensions (count is pixels - 1) + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + + s3WaitFifo(priv, 2); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Command: rectangle, draw, positive X and Y, packed mode + s3WriteReg16(priv, S3_CMD, S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS); +} + + +// ============================================================ +// s3RectFillPat +// ============================================================ +// +// 8x8 mono pattern fill using CPU data mix mode. The pattern is +// 8 bytes (one per row, MSB-first), tiled across the rectangle. +// 1-bits use the foreground color, 0-bits use the background. +// Data is fed through PIX_TRANS as 16-bit words, repeating the +// 8-row pattern for the full height of the rectangle. 
+ +static void s3RectFillPat(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, const uint8_t *pattern, uint32_t fg, uint32_t bg) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + s3WaitFifo(priv, 8); + + // Set colors + s3WriteReg16(priv, S3_FRGD_COLOR, fg); + s3WriteReg16(priv, S3_BKGD_COLOR, bg); + + // Foreground mix: source = foreground color, ROP = copy + s3WriteReg16(priv, S3_FRGD_MIX, S3_MIX_SRC_FRGD | S3_MIX_ROP_SRC); + // Background mix: source = background color, ROP = copy + s3WriteReg16(priv, S3_BKGD_MIX, S3_MIX_SRC_BKGD | S3_MIX_ROP_SRC); + + // Pixel control: CPU data selects fg/bg mix + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_PIX_CNTL | S3_PIXCNTL_MIX_CPU); + + // Destination and dimensions + s3WriteReg16(priv, S3_CUR_X, x); + s3WriteReg16(priv, S3_CUR_Y, y); + s3WriteReg16(priv, S3_MAJ_AXIS_PCNT, w - 1); + + s3WaitFifo(priv, 2); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_MIN_AXIS_PCNT | (h - 1)); + + // Command: rectangle, draw, left-to-right top-to-bottom, CPU data + s3WriteReg16(priv, S3_CMD, S3_CMD_RECT | S3_CMD_DRAW | S3_CMD_ACROSS_PLANE + | S3_CMD_DIR_X_POS | S3_CMD_DIR_Y_POS + | S3_CMD_16BIT_IO); + + // Feed tiled pattern data through PIX_TRANS. + // Each row of the pattern is 1 byte (8 pixels), tiled across the width. + int32_t wordsPerRow = (w + 15) / 16; + + for (int32_t row = 0; row < h; row++) { + uint8_t patByte = pattern[row & 7]; + + s3WaitFifo(priv, 1); + + for (int32_t word = 0; word < wordsPerRow; word++) { + s3WriteReg16(priv, S3_PIX_TRANS, (patByte << 8) | patByte); + } + } +} + + +// ============================================================ +// s3SetClip +// ============================================================ +// +// Programs the hardware scissor rectangle. All subsequent +// drawing operations are clipped to this region. 
+ +static void s3SetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + s3WaitFifo(priv, 4); + + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_L | (x & 0x0FFF)); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_T | (y & 0x0FFF)); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_R | ((x + w - 1) & 0x0FFF)); + s3WriteReg16(priv, S3_MULTIFUNC_CTRL, S3_MF_SCISSORS_B | ((y + h - 1) & 0x0FFF)); +} + + +// ============================================================ +// s3SetCursor +// ============================================================ +// +// Uploads a cursor image to VRAM and configures the hardware +// cursor registers. The S3 hardware cursor is 64x64 pixels, +// stored as two bit planes (AND mask and XOR mask) at the +// cursor address in VRAM. +// +// S3 cursor VRAM format: +// 1024 bytes total = 512 bytes AND + 512 bytes XOR +// Each row: 8 bytes AND mask, 8 bytes XOR mask (interleaved +// by row on some chips, or plane-sequential on others). +// For Trio64: rows are interleaved (AND row, XOR row, ...). 
+ +static void s3SetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + if (!image) { + s3ShowCursor(drv, false); + return; + } + + // Wait for engine idle before writing to VRAM + s3WaitIdle(drv); + + // Write cursor image to VRAM at cursorOffset + // Format: for each of 64 rows, write 8 bytes AND then 8 bytes XOR + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < S3_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + // Outside the image: transparent (AND=0xFF, XOR=0x00) + andByte = 0xFF; + xorByte = 0x00; + } + + // Interleaved format: AND row bytes, then XOR row bytes + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor address in VRAM (in units of 1KB) + uint16_t cursorAddr = priv->cursorOffset / 1024; + vgaCrtcWrite(S3_CR4C_HW_CURSOR_ADDR_HI, (cursorAddr >> 8) & 0x0F); + vgaCrtcWrite(S3_CR4D_HW_CURSOR_ADDR_LO, cursorAddr & 0xFF); +} + + +// ============================================================ +// s3ShowCursor +// ============================================================ +// +// Enables or disables the hardware cursor via CR45. + +static void s3ShowCursor(AccelDriverT *drv, bool visible) { + (void)drv; + + uint8_t cr45 = vgaCrtcRead(S3_CR45_HW_CURSOR_MODE); + + if (visible) { + cr45 |= 0x01; // enable hardware cursor + } else { + cr45 &= ~0x01; // disable hardware cursor + } + + vgaCrtcWrite(S3_CR45_HW_CURSOR_MODE, cr45); +} + + +// ============================================================ +// s3Shutdown +// ============================================================ +// +// Restores text mode and cleans up. 
The VESA/VGA BIOS text mode +// restore handles resetting all the S3-specific registers. + +static void s3Shutdown(AccelDriverT *drv) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + s3ShowCursor(drv, false); + dpmiUnmapFramebuffer(&priv->mmioMapping); + dpmiUnmapFramebuffer(&priv->lfbMapping); + vgaRestoreTextMode(); +} + + +// ============================================================ +// s3UnlockRegs +// ============================================================ +// +// Unlocks S3 extended CRTC registers. Three levels: +// CR38 = 0x48 : unlock S3 VGA registers (CR30-CR3F) +// CR39 = 0xA5 : unlock S3 system registers (CR40-CR5F) +// Also unlock standard CRTC protection for timing regs. + +static void s3UnlockRegs(void) { + vgaCrtcWrite(S3_CR38_LOCK_1, 0x48); + vgaCrtcWrite(S3_CR39_LOCK_2, 0xA5); + vgaCrtcUnlock(); +} + + +// ============================================================ +// s3WaitFifo +// ============================================================ +// +// Waits until the S3 command FIFO has at least 'slots' free +// entries. The FIFO depth is 8 on Trio64. Reading GP_STAT +// returns a bitmask where bits 7:0 indicate how many slots +// are free (each bit = one more slot free, from MSB to LSB). + +static void s3WaitFifo(S3PrivateT *priv, int32_t slots) { + // Build the required mask: if we need N slots free, we need + // bit (8 - N) to be set in GP_STAT bits 7:0. + // Bits: 0x80=1free, 0x40=2free, ..., 0x01=8free + uint16_t mask = 0x0100 >> slots; + + for (int32_t i = 0; i < S3_MAX_IDLE_WAIT; i++) { + if (s3ReadReg16(priv, S3_GP_STAT) & mask) { + return; + } + } +} + + +// ============================================================ +// s3WaitIdle +// ============================================================ +// +// Waits until the S3 graphics engine is completely idle. +// The engine is idle when the BUSY bit (bit 9) of GP_STAT is clear +// AND the FIFO is empty (bit 10 is set). 
+ +static void s3WaitIdle(AccelDriverT *drv) { + S3PrivateT *priv = (S3PrivateT *)drv->privData; + + for (int32_t i = 0; i < S3_MAX_IDLE_WAIT; i++) { + uint16_t stat = s3ReadReg16(priv, S3_GP_STAT); + if (!(stat & S3_GP_STAT_BUSY)) { + return; + } + } +} diff --git a/sis.c b/sis.c new file mode 100644 index 0000000..88ef0c1 --- /dev/null +++ b/sis.c @@ -0,0 +1,561 @@ +// sis.c -- SiS 6326/300/305/315/330 accelerated video driver +// +// Supports the SiS 6326, 300, 305, 315, and 330 integrated graphics +// chipsets. These share a similar 2D engine interface based on a +// queue-based command submission model: +// - Hardware rectangle fill +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host blit via data port) +// - Hardware clip rectangle +// - 64x64 hardware cursor +// +// Register access: +// BAR0 maps the linear framebuffer. +// BAR1 maps 128KB of MMIO registers. The 2D engine registers +// live at offsets 0x8200-0x8244 within this block. Host data +// is written to the MMIO data port at offset 0x8300. +// +// The 2D engine uses a command register at 0x822C to specify the +// operation type and ROP, then a fire register at 0x8230 to trigger +// execution. Engine status is polled at 0x8244. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include + +// ============================================================ +// SiS vendor/device IDs +// ============================================================ + +#define SIS_VENDOR_ID 0x1039 + +#define SIS_6326 0x6326 +#define SIS_300 0x0300 +#define SIS_305 0x0305 +#define SIS_315 0x0315 +#define SIS_330 0x0330 + +static const uint16_t sSisDeviceIds[] = { + SIS_VENDOR_ID, SIS_6326, + SIS_VENDOR_ID, SIS_300, + SIS_VENDOR_ID, SIS_305, + SIS_VENDOR_ID, SIS_315, + SIS_VENDOR_ID, SIS_330, + 0, 0 +}; + +// ============================================================ +// 2D engine register offsets (from MMIO base) +// ============================================================ + +#define SIS_SRC_ADDR 0x8200 // source address (for blit) +#define SIS_SRC_PITCH 0x8204 // source pitch +#define SIS_SRC_YX 0x8208 // src Y<<16 | X +#define SIS_DST_YX 0x820C // dst Y<<16 | X +#define SIS_RECT_WH 0x8210 // width<<16 | height +#define SIS_FG_COLOR 0x8214 // foreground color +#define SIS_BG_COLOR 0x8218 // background color +#define SIS_MONO_PAT0 0x821C // mono pattern 0 +#define SIS_MONO_PAT1 0x8220 // mono pattern 1 +#define SIS_CLIP_LT 0x8224 // clip left<<16 | top +#define SIS_CLIP_RB 0x8228 // clip right<<16 | bottom +#define SIS_CMD 0x822C // command register +#define SIS_FIRE 0x8230 // fire trigger +#define SIS_LINE_PARAMS 0x8234 // line parameters +#define SIS_DST_ADDR 0x8238 // destination address +#define SIS_SRC_DST_PITCH 0x823C // src/dst pitch combined +#define SIS_AGP_BASE 0x8240 // AGP base (unused) + +// ============================================================ +// Engine status register +// ============================================================ + +#define SIS_ENGINE_STATUS 0x8244 // bit 0 = queues empty, bit 1 = idle + +#define SIS_STATUS_QUEUE_EMPTY 0x01 +#define SIS_STATUS_ENGINE_IDLE 0x02 +#define SIS_STATUS_ALL_IDLE (SIS_STATUS_QUEUE_EMPTY | 
SIS_STATUS_ENGINE_IDLE) + +// ============================================================ +// Host data port +// ============================================================ + +#define SIS_HOST_DATA 0x8300 // write pixel data here as dwords + +// ============================================================ +// Command register encoding +// ============================================================ + +// Bits 7:0 = ROP +#define SIS_ROP_COPY 0xCC +#define SIS_ROP_PAT_COPY 0xF0 + +// Bit 8 = X direction +#define SIS_CMD_XDIR_RIGHT (1 << 8) + +// Bit 9 = Y direction +#define SIS_CMD_YDIR_DOWN (1 << 9) + +// Bits 13:10 = command type +#define SIS_CMD_BITBLT 0x0000 +#define SIS_CMD_COLOREXP 0x0400 +#define SIS_CMD_LINEDRAW 0x0800 +#define SIS_CMD_TRAPEZOID 0x0C00 + +// Bit 14 = pattern enable +#define SIS_CMD_PAT_ENABLE (1 << 14) + +// Bit 16 = clipping enable +#define SIS_CMD_CLIP_ENABLE (1 << 16) + +// Bit 24 = source is mono +#define SIS_CMD_SRC_MONO (1 << 24) + +// ============================================================ +// Hardware cursor registers +// ============================================================ + +#define SIS_CURSOR_ENABLE 0x8500 // bit 0 = enable +#define SIS_CURSOR_X 0x8504 // cursor X position +#define SIS_CURSOR_Y 0x8508 // cursor Y position +#define SIS_CURSOR_ADDR 0x850C // cursor VRAM byte offset + +// ============================================================ +// Misc constants +// ============================================================ + +#define SIS_MMIO_SIZE 131072 // BAR1: 128KB MMIO +#define SIS_MAX_IDLE_WAIT 1000000 +#define SIS_HW_CURSOR_SIZE 64 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t mmioPhysAddr; + uint32_t vramSize; + int32_t bytesPerPixel; + int32_t screenPitch; + volatile uint32_t *mmio; + DpmiMappingT mmioMapping; + DpmiMappingT lfbMapping; +} 
SisPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void sisBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool sisDetect(AccelDriverT *drv); +static void sisHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool sisInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void sisMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void sisRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void sisSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h); +static void sisSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void sisShowCursor(AccelDriverT *drv, bool visible); +static void sisShutdown(AccelDriverT *drv); +static void sisWaitIdle(AccelDriverT *drv); + +static inline void sisWrite(SisPrivateT *priv, uint32_t reg, uint32_t val) { + priv->mmio[reg / 4] = val; +} + +static inline uint32_t sisRead(SisPrivateT *priv, uint32_t reg) { + return priv->mmio[reg / 4]; +} + +// ============================================================ +// Driver instance +// ============================================================ + +static SisPrivateT sSisPrivate; + +static AccelDriverT sSisDriver = { + .name = "SiS 6326", + .chipFamily = "sis", + .caps = 0, + .privData = &sSisPrivate, + .detect = sisDetect, + .init = sisInit, + .shutdown = sisShutdown, + .waitIdle = sisWaitIdle, + .setClip = sisSetClip, + .rectFill = sisRectFill, + .rectFillPat = NULL, + .bitBlt = sisBitBlt, + .hostBlit = sisHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, + .setCursor = sisSetCursor, + .moveCursor = sisMoveCursor, + .showCursor = sisShowCursor, +}; + +// ============================================================ +// sisRegisterDriver +// 
============================================================ + +void sisRegisterDriver(void) { + accelRegisterDriver(&sSisDriver); +} + + +// ============================================================ +// sisBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. Handles overlapping regions by choosing +// the correct X/Y direction based on source and destination positions. + +static void sisBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + sisWaitIdle(drv); + + // Determine blit direction for overlapping regions + uint32_t cmd = SIS_CMD_BITBLT | SIS_ROP_COPY | SIS_CMD_CLIP_ENABLE; + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (dstX <= srcX) { + cmd |= SIS_CMD_XDIR_RIGHT; + } else { + sx += w - 1; + dx += w - 1; + } + + if (dstY <= srcY) { + cmd |= SIS_CMD_YDIR_DOWN; + } else { + sy += h - 1; + dy += h - 1; + } + + uint32_t pitch = ((uint32_t)priv->screenPitch << 16) | (uint32_t)priv->screenPitch; + + sisWrite(priv, SIS_SRC_DST_PITCH, pitch); + sisWrite(priv, SIS_SRC_YX, ((uint32_t)sy << 16) | (uint32_t)sx); + sisWrite(priv, SIS_DST_YX, ((uint32_t)dy << 16) | (uint32_t)dx); + sisWrite(priv, SIS_RECT_WH, ((uint32_t)w << 16) | (uint32_t)h); + sisWrite(priv, SIS_CMD, cmd); + sisWrite(priv, SIS_FIRE, 0); +} + + +// ============================================================ +// sisDetect +// ============================================================ + +static bool sisDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sSisDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + switch (drv->pciDev.deviceId) { + case SIS_6326: + drv->name = "SiS 6326"; + break; + case SIS_300: + drv->name = "SiS 300"; + break; + case SIS_305: + drv->name = "SiS 305"; + break; + case SIS_315: + drv->name = "SiS 
315"; + break; + case SIS_330: + drv->name = "SiS 330"; + break; + default: + drv->name = "SiS 6326/3xx"; + break; + } + + return true; +} + + +// ============================================================ +// sisHostBlit +// ============================================================ +// +// CPU-to-screen blit. Issues a BitBLT command, then feeds pixel data +// as dwords through the MMIO host data port at offset 0x8300. + +static void sisHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bytesPerRow = w * priv->bytesPerPixel; + int32_t dwordsPerRow = (bytesPerRow + 3) / 4; + + sisWaitIdle(drv); + + sisWrite(priv, SIS_SRC_DST_PITCH, (uint32_t)priv->screenPitch); + sisWrite(priv, SIS_DST_YX, ((uint32_t)dstY << 16) | (uint32_t)dstX); + sisWrite(priv, SIS_RECT_WH, ((uint32_t)w << 16) | (uint32_t)h); + sisWrite(priv, SIS_FG_COLOR, 0); + sisWrite(priv, SIS_CMD, SIS_CMD_BITBLT | SIS_ROP_COPY | SIS_CMD_CLIP_ENABLE | SIS_CMD_XDIR_RIGHT | SIS_CMD_YDIR_DOWN | SIS_CMD_SRC_MONO); + sisWrite(priv, SIS_FIRE, 0); + + // Feed pixel data row by row through the host data port + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowPtr = srcBuf + row * srcPitch; + + for (int32_t dw = 0; dw < dwordsPerRow; dw++) { + uint32_t val = 0; + int32_t offset = dw * 4; + + for (int32_t b = 0; b < 4; b++) { + if (offset + b < bytesPerRow) { + val |= (uint32_t)rowPtr[offset + b] << (b * 8); + } + } + + sisWrite(priv, SIS_HOST_DATA, val); + } + } +} + + +// ============================================================ +// sisInit +// ============================================================ + +static bool sisInit(AccelDriverT *drv, const AccelModeRequestT *req) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + // Read BARs + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, 
PCI_BAR0); + uint32_t bar1 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR1); + + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->mmioPhysAddr = bar1 & 0xFFFFFFF0; + + // Size the framebuffer BAR + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + + // Map MMIO control registers (128KB) + if (!dpmiMapFramebuffer(priv->mmioPhysAddr, SIS_MMIO_SIZE, &priv->mmioMapping)) { + return false; + } + priv->mmio = (volatile uint32_t *)priv->mmioMapping.ptr; + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + // Map framebuffer + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &priv->lfbMapping)) { + vgaRestoreTextMode(); + dpmiUnmapFramebuffer(&priv->mmioMapping); + return false; + } + + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = priv->lfbMapping.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Wait for engine idle before configuring + sisWaitIdle(drv); + + drv->caps = ACAP_RECT_FILL + | ACAP_BITBLT + | ACAP_HOST_BLIT + | ACAP_HW_CURSOR + | ACAP_CLIP; + + // Full screen clip + sisSetClip(drv, 0, 0, vesa.width, vesa.height); + + return true; +} + + +// ============================================================ +// sisMoveCursor +// ============================================================ + +static void sisMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + + sisWrite(priv, SIS_CURSOR_X, (uint32_t)x); + sisWrite(priv, SIS_CURSOR_Y, (uint32_t)y); +} + + +// 
============================================================ +// sisRectFill +// ============================================================ +// +// Solid rectangle fill. Sets the foreground color, loads the +// destination coordinates and dimensions, then fires a BitBLT +// command with PAT_COPY ROP and pattern enable to fill with a +// solid color. + +static void sisRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + sisWaitIdle(drv); + + sisWrite(priv, SIS_SRC_DST_PITCH, (uint32_t)priv->screenPitch); + sisWrite(priv, SIS_FG_COLOR, color); + sisWrite(priv, SIS_MONO_PAT0, 0xFFFFFFFF); + sisWrite(priv, SIS_MONO_PAT1, 0xFFFFFFFF); + sisWrite(priv, SIS_DST_YX, ((uint32_t)y << 16) | (uint32_t)x); + sisWrite(priv, SIS_RECT_WH, ((uint32_t)w << 16) | (uint32_t)h); + sisWrite(priv, SIS_CMD, SIS_CMD_BITBLT | SIS_ROP_PAT_COPY | SIS_CMD_PAT_ENABLE | SIS_CMD_CLIP_ENABLE | SIS_CMD_XDIR_RIGHT | SIS_CMD_YDIR_DOWN); + sisWrite(priv, SIS_FIRE, 0); +} + + +// ============================================================ +// sisSetClip +// ============================================================ + +static void sisSetClip(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + sisWrite(priv, SIS_CLIP_LT, ((uint32_t)x << 16) | (uint32_t)y); + sisWrite(priv, SIS_CLIP_RB, ((uint32_t)(x + w - 1) << 16) | (uint32_t)(y + h - 1)); +} + + +// ============================================================ +// sisSetCursor +// ============================================================ +// +// Upload a 64x64 hardware cursor image to VRAM. The SiS cursor +// format is 2bpp: AND mask and XOR mask interleaved per row, +// 16 bytes per row (8 AND + 8 XOR). Total size is 1024 bytes. 
+ +static void sisSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + if (!image) { + sisShowCursor(drv, false); + return; + } + + sisWaitIdle(drv); + + // Store cursor image at end of VRAM (1KB aligned) + uint32_t cursorOffset = priv->vramSize - 1024; + cursorOffset &= ~0x3FF; + uint8_t *cursorMem = drv->mode.framebuffer + cursorOffset; + + // Write AND mask then XOR mask, interleaved per row + for (int32_t row = 0; row < SIS_HW_CURSOR_SIZE; row++) { + for (int32_t byteIdx = 0; byteIdx < 8; byteIdx++) { + int32_t srcIdx = row * 8 + byteIdx; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byteIdx < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; // transparent + xorByte = 0x00; + } + + cursorMem[row * 16 + byteIdx] = andByte; + cursorMem[row * 16 + byteIdx + 8] = xorByte; + } + } + + // Set cursor address register + sisWrite(priv, SIS_CURSOR_ADDR, cursorOffset); +} + + +// ============================================================ +// sisShowCursor +// ============================================================ + +static void sisShowCursor(AccelDriverT *drv, bool visible) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + sisWrite(priv, SIS_CURSOR_ENABLE, visible ? 1 : 0); +} + + +// ============================================================ +// sisShutdown +// ============================================================ + +static void sisShutdown(AccelDriverT *drv) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + sisShowCursor(drv, false); + vgaRestoreTextMode(); + + dpmiUnmapFramebuffer(&priv->lfbMapping); + dpmiUnmapFramebuffer(&priv->mmioMapping); + + priv->mmio = NULL; +} + + +// ============================================================ +// sisWaitIdle +// ============================================================ +// +// Wait until the 2D engine is completely idle. 
Both bit 0 (queues +// empty) and bit 1 (engine idle) of the status register at 0x8244 +// must be set. + +static void sisWaitIdle(AccelDriverT *drv) { + SisPrivateT *priv = (SisPrivateT *)drv->privData; + + for (int32_t i = 0; i < SIS_MAX_IDLE_WAIT; i++) { + uint32_t stat = sisRead(priv, SIS_ENGINE_STATUS); + if ((stat & SIS_STATUS_ALL_IDLE) == SIS_STATUS_ALL_IDLE) { + return; + } + } +} diff --git a/test/86box.cfg b/test/86box.cfg new file mode 100644 index 0000000..71c2cb4 --- /dev/null +++ b/test/86box.cfg @@ -0,0 +1,62 @@ +# 86Box configuration for testing DOS accelerated video drivers +# Target: S3 Trio64 with 2MB VRAM + +[Machine] +machine = Award 430FX +cpu_family = intel_pentium_p54c +cpu_speed = 75000000 +cpu_multi = 1.5 +fpu_type = internal +mem_size = 16 +time_sync = local + +[Video] +gfxcard = S3 Trio64 +voodoo = off + +[Video S3 Trio64] +memory = 2 + +[Input] +mouse_type = ps2 + +[Sound] +sndcard = Sound Blaster 16 +midi_device = none +mpu401 = none +opl_type = nuked + +[Floppy and CD-ROM drives] +fdd_01_type = 35_2hd +fdd_02_type = none +cdrom_01_host_drive = 0 +cdrom_01_speed = 8 +cdrom_01_type = 86B_CD-ROM_1.00 +cdrom_01_bus_type = ide +cdrom_01_ide_channel = 1:0 + +[Hard disks] +hdd_01_parameters = 63, 16, 507, 0, ide, none +hdd_01_fn = dos622.img +hdd_01_ide_channel = 0:0 + +[Floppy images] +fdd_01_fn = +fdd_02_fn = + +[Storage controllers] +hdc = IDE (PCI) +scsi_card = none + +[Network] +net_type = none + +[Ports (COM & LPT)] +serial1_enabled = 1 +serial2_enabled = 0 +lpt1_enabled = 1 +lpt1_device = none + +[Other peripherals] +bugger = off +postcard = off diff --git a/test/README.txt b/test/README.txt new file mode 100644 index 0000000..c1466bd --- /dev/null +++ b/test/README.txt @@ -0,0 +1,121 @@ +86Box Test Environment Setup +============================ + +This directory contains configuration files for testing the DOS +accelerated video driver demo under 86Box, an x86 hardware emulator. 
+ +The 86box.cfg is configured for: + - Intel Pentium 75 MHz (Award 430FX chipset) + - 16 MB RAM + - S3 Trio64 with 2 MB VRAM + - Sound Blaster 16 + - IDE hard disk (504 MB image) + - 3.5" 1.44 MB floppy drive + - IDE CD-ROM + + +Step 1: Install 86Box +--------------------- +Download 86Box from https://86box.net/ and extract it to a +directory of your choice. You also need the ROM set -- place +the roms/ folder alongside the 86Box executable. + + +Step 2: Create a Hard Disk Image +-------------------------------- +Use 86Box's built-in disk creation or an external tool: + + - In 86Box: Settings > Hard Disks > New + - Create a 504 MB image named "dos622.img" + - Or use: dd if=/dev/zero of=dos622.img bs=1M count=504 + +The 86box.cfg expects the image at: + dos622.img (in the same directory as 86box.cfg) + + +Step 3: Install DOS 6.22 +------------------------- +1. Copy 86box.cfg to your 86Box working directory (or point + 86Box at this directory with the --vmpath flag). +2. Obtain MS-DOS 6.22 floppy images (disk1.img, disk2.img, disk3.img). +3. Start 86Box. Insert disk1.img in the floppy drive: + Settings > Floppy & CD-ROM > Floppy 1 > select disk1.img +4. Boot from floppy (the machine should boot from A: by default). +5. Follow the DOS setup process: + - FDISK: create a primary partition using all space, set active + - Reboot from floppy after FDISK + - FORMAT C: /S + - Run SETUP from the DOS disks +6. Swap floppy images when prompted for disk 2 and disk 3. +7. After setup completes, remove the floppy image and reboot + to verify DOS boots from the hard drive. + + +Step 4: Install CWSDPMI +----------------------- +The demo is a DJGPP (32-bit protected mode) executable and needs +a DPMI host. Download CWSDPMI from: + http://sandmann.dotster.com/cwsdpmi/ + +Copy CWSDPMI.EXE to C:\ on the disk image. DJGPP executables +will load it automatically when no other DPMI host is present. 
+ +Alternatively, you can use CWSDPR0.EXE for ring-0 operation, +which provides direct hardware access without virtualization +overhead. + + +Step 5: Copy the Demo +---------------------- +Mount the disk image and copy these files to C:\: + + demo.exe - the compiled demo executable + cwsdpmi.exe - DPMI host (see Step 4) + +You can mount the image on Linux with: + sudo mount -o loop,offset=32256 dos622.img /mnt + +Or use mtools: + mcopy -i dos622.img@@32256 demo.exe :: + mcopy -i dos622.img@@32256 cwsdpmi.exe :: + +Also copy rundemo.bat for convenience: + mcopy -i dos622.img@@32256 rundemo.bat :: + + +Step 6: Run the Demo +-------------------- +Boot the machine in 86Box and at the C:\> prompt: + + C:\>RUNDEMO + +Or run directly: + + C:\>DEMO 640 480 16 + +Other supported modes (depending on VRAM): + C:\>DEMO 800 600 16 + C:\>DEMO 640 480 32 + C:\>DEMO 1024 768 8 + +Controls: + SPACE - cycle to next demo + B - run benchmark + ESC - exit + + +Troubleshooting +--------------- +- "No supported video hardware found": Verify 86box.cfg has + the S3 Trio64 selected. Check that PCI is enabled. + +- Black screen or garbled display: The S3 driver may not support + the requested mode at the configured VRAM size. Try a lower + resolution or color depth. + +- "Load error: no DPMI": CWSDPMI.EXE is missing or not in the + PATH. Copy it to the same directory as DEMO.EXE. + +- Demo runs but acceleration looks wrong: Some 86Box versions + have incomplete S3 acceleration emulation. Try updating to + the latest 86Box release. 
diff --git a/test/rundemo.bat b/test/rundemo.bat new file mode 100644 index 0000000..b8512cd --- /dev/null +++ b/test/rundemo.bat @@ -0,0 +1,3 @@ +@ECHO OFF +REM Run the accelerated video driver demo at 640x480 16-bit color +DEMO.EXE 640 480 16 diff --git a/trident.c b/trident.c new file mode 100644 index 0000000..ac683e9 --- /dev/null +++ b/trident.c @@ -0,0 +1,630 @@ +// trident.c -- Trident TGUI9440/9660/9680 accelerated video driver +// +// Supports the Trident TGUI family: TGUI9440, TGUI9660, TGUI9680, +// ProVidia 9685, Blade3D, and CyberBlade. These were common PCI +// chips in low-cost 1990s desktop and laptop systems. +// +// The TGUI 2D engine provides: +// - Solid rectangle fill (pattern source) +// - Screen-to-screen BitBLT +// - CPU-to-screen blit (host data transfer) +// - Hardware cursor (64x64) +// +// Register access: +// The GER (Graphics Engine Register) set uses I/O ports in the +// 0x2120-0x214F range. Operations are programmed by writing +// coordinates, dimensions, ROP, and command byte, then the engine +// executes asynchronously. Status is polled at 0x2120. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Trident vendor/device IDs +// ============================================================ + +#define TRIDENT_VENDOR_ID 0x1023 + +#define TRIDENT_TGUI9440 0x9440 +#define TRIDENT_TGUI9660 0x9660 +#define TRIDENT_TGUI9680 0x9680 +#define TRIDENT_PROVIDIA 0x9685 +#define TRIDENT_BLADE3D 0x9880 +#define TRIDENT_CYBERBLADE 0x9910 + +static const uint16_t sTridentDeviceIds[] = { + TRIDENT_VENDOR_ID, TRIDENT_TGUI9440, + TRIDENT_VENDOR_ID, TRIDENT_TGUI9660, + TRIDENT_VENDOR_ID, TRIDENT_TGUI9680, + TRIDENT_VENDOR_ID, TRIDENT_PROVIDIA, + TRIDENT_VENDOR_ID, TRIDENT_BLADE3D, + TRIDENT_VENDOR_ID, TRIDENT_CYBERBLADE, + 0, 0 +}; + +// ============================================================ +// GER (Graphics Engine Register) ports +// ============================================================ + +#define GER_STATUS 0x2120 // word: bit 0 = engine busy +#define GER_OPERMODE 0x2122 // word: bits 2:0 = bpp encoding +#define GER_COMMAND 0x2124 // byte: command register +#define GER_ROP 0x2125 // byte: raster operation +#define GER_FG_COLOR 0x2128 // dword: foreground color +#define GER_BG_COLOR 0x212C // dword: background color +#define GER_PAT_ADDR 0x2130 // dword: pattern address +#define GER_SRC_X 0x2138 // word: source X +#define GER_SRC_Y 0x213A // word: source Y +#define GER_DST_X 0x213C // word: destination X +#define GER_DST_Y 0x213E // word: destination Y +#define GER_DIM_X 0x2140 // word: width - 1 +#define GER_DIM_Y 0x2142 // word: height - 1 +#define GER_STYLE 0x2144 // dword: line style/pattern +#define GER_CKEY 0x2148 // dword: color key + +// ============================================================ +// GER status bits +// ============================================================ + +#define GER_STATUS_BUSY 0x0001 + +// 
============================================================ +// GER command byte encoding +// ============================================================ +// +// Bit 0: X direction (0=left, 1=right) +// Bit 1: Y direction (0=up, 1=down) +// Bits 3:2: source select (00=video, 01=system, 10=pattern) +// Bit 4: draw enable (must be set) +// Bit 5: mono source +// Bits 7:6: command type (00=bitblt) + +#define GER_CMD_X_RIGHT 0x01 +#define GER_CMD_X_LEFT 0x00 +#define GER_CMD_Y_DOWN 0x02 +#define GER_CMD_Y_UP 0x00 +#define GER_CMD_SRC_VIDEO 0x00 +#define GER_CMD_SRC_SYSTEM 0x04 +#define GER_CMD_SRC_PATTERN 0x08 +#define GER_CMD_DRAW 0x10 +#define GER_CMD_MONO 0x20 +#define GER_CMD_BITBLT 0x00 + +// Composite commands +#define GER_CMD_SOLID_FILL (GER_CMD_BITBLT | GER_CMD_SRC_PATTERN | GER_CMD_DRAW | GER_CMD_X_RIGHT | GER_CMD_Y_DOWN) +#define GER_CMD_SCRBLT_FWD (GER_CMD_BITBLT | GER_CMD_SRC_VIDEO | GER_CMD_DRAW | GER_CMD_X_RIGHT | GER_CMD_Y_DOWN) +#define GER_CMD_HOSTBLT (GER_CMD_BITBLT | GER_CMD_SRC_SYSTEM | GER_CMD_DRAW | GER_CMD_X_RIGHT | GER_CMD_Y_DOWN) + +// ============================================================ +// GER opermode bpp encoding (bits 2:0) +// ============================================================ + +#define GER_BPP_8 0x00 +#define GER_BPP_16 0x01 +#define GER_BPP_32 0x02 + +// ============================================================ +// ROPs for GER engine +// ============================================================ + +#define TGUI_ROP_COPY 0xCC +#define TGUI_ROP_PAT_COPY 0xF0 + +// ============================================================ +// Hardware cursor +// ============================================================ +// +// 64x64 cursor stored at end of VRAM. Each row is 16 bytes: +// 8 bytes AND mask followed by 8 bytes XOR mask. +// Enable via CRTC extended register 0x50 bit 7. +// Position via CRTC registers 0x40-0x43. 
+ +#define TGUI_CURSOR_SIZE 64 +#define TGUI_CURSOR_BYTES (TGUI_CURSOR_SIZE * 16) // 1024 bytes + +// ============================================================ +// CRTC extended registers for cursor +// ============================================================ + +#define TGUI_CRTC_CURSOR_X_LO 0x40 +#define TGUI_CRTC_CURSOR_X_HI 0x41 +#define TGUI_CRTC_CURSOR_Y_LO 0x42 +#define TGUI_CRTC_CURSOR_Y_HI 0x43 +#define TGUI_CRTC_CURSOR_CTRL 0x50 + +// ============================================================ +// Miscellaneous +// ============================================================ + +#define TGUI_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + uint16_t chipId; +} TridentPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void tgBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool tgDetect(AccelDriverT *drv); +static uint8_t tgGetBppMode(int32_t bytesPerPixel); +static void tgHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool tgInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void tgMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void tgRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void tgSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void tgShowCursor(AccelDriverT *drv, bool visible); +static void tgShutdown(AccelDriverT *drv); +static void tgUnlockRegs(void); +static void tgWaitIdle(AccelDriverT *drv); + +// 
============================================================ +// Driver instance +// ============================================================ + +static TridentPrivateT sTridentPrivate; + +static AccelDriverT sTridentDriver = { + .name = "Trident TGUI", + .chipFamily = "trident", + .caps = 0, + .privData = &sTridentPrivate, + .detect = tgDetect, + .init = tgInit, + .shutdown = tgShutdown, + .waitIdle = tgWaitIdle, + .setClip = NULL, + .rectFill = tgRectFill, + .rectFillPat = NULL, + .bitBlt = tgBitBlt, + .hostBlit = tgHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, + .setCursor = tgSetCursor, + .moveCursor = tgMoveCursor, + .showCursor = tgShowCursor, +}; + +// ============================================================ +// tridentRegisterDriver +// ============================================================ + +void tridentRegisterDriver(void) { + accelRegisterDriver(&sTridentDriver); +} + + +// ============================================================ +// tgBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT. Direction bits are set to handle +// overlapping source/destination regions correctly. 
+ +static void tgBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + if (w <= 0 || h <= 0) { + return; + } + + tgWaitIdle(drv); + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + // Determine copy direction for overlap handling + uint8_t cmd = GER_CMD_BITBLT | GER_CMD_SRC_VIDEO | GER_CMD_DRAW; + + int32_t sx = srcX; + int32_t sy = srcY; + int32_t dx = dstX; + int32_t dy = dstY; + + if (dstY > srcY || (dstY == srcY && dstX > srcX)) { + // Copy bottom-to-top, right-to-left + sx += w - 1; + sy += h - 1; + dx += w - 1; + dy += h - 1; + cmd |= GER_CMD_X_LEFT | GER_CMD_Y_UP; + } else { + // Copy top-to-bottom, left-to-right + cmd |= GER_CMD_X_RIGHT | GER_CMD_Y_DOWN; + } + + // Set operation mode (bpp) + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // ROP: copy + outportb(GER_ROP, TGUI_ROP_COPY); + + // Source coordinates + outportw(GER_SRC_X, sx); + outportw(GER_SRC_Y, sy); + + // Destination coordinates + outportw(GER_DST_X, dx); + outportw(GER_DST_Y, dy); + + // Dimensions (width - 1, height - 1) + outportw(GER_DIM_X, w - 1); + outportw(GER_DIM_Y, h - 1); + + // Fire command + outportb(GER_COMMAND, cmd); +} + + +// ============================================================ +// tgDetect +// ============================================================ + +static bool tgDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sTridentDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + priv->chipId = drv->pciDev.deviceId; + + switch (drv->pciDev.deviceId) { + case TRIDENT_TGUI9440: + drv->name = "Trident TGUI9440"; + break; + case TRIDENT_TGUI9660: + drv->name = "Trident TGUI9660"; + break; + case TRIDENT_TGUI9680: + drv->name = "Trident TGUI9680"; + break; + case TRIDENT_PROVIDIA: + drv->name = "Trident ProVidia 9685"; + break; + case TRIDENT_BLADE3D: + drv->name = "Trident Blade3D"; + break; + 
case TRIDENT_CYBERBLADE: + drv->name = "Trident CyberBlade"; + break; + default: + drv->name = "Trident TGUI"; + break; + } + + return true; +} + + +// ============================================================ +// tgGetBppMode +// ============================================================ +// +// Return the GER_OPERMODE bpp encoding for the given bytes per pixel. + +static uint8_t tgGetBppMode(int32_t bytesPerPixel) { + switch (bytesPerPixel) { + case 2: + return GER_BPP_16; + case 4: + return GER_BPP_32; + default: + return GER_BPP_8; + } +} + + +// ============================================================ +// tgHostBlit +// ============================================================ +// +// CPU-to-screen blit. Sets source select to system/CPU and feeds +// pixel data through the GER data port. Each scanline of source +// data is written as a series of 32-bit dwords. + +static void tgHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + if (w <= 0 || h <= 0) { + return; + } + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + int32_t rowBytes = w * priv->bytesPerPixel; + int32_t padBytes = (rowBytes + 3) & ~3; + int32_t dwordsPerRow = padBytes / 4; + + tgWaitIdle(drv); + + // Set operation mode (bpp) + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // ROP: copy + outportb(GER_ROP, TGUI_ROP_COPY); + + // Source coordinates (not meaningful for host data, set to 0) + outportw(GER_SRC_X, 0); + outportw(GER_SRC_Y, 0); + + // Destination coordinates + outportw(GER_DST_X, dstX); + outportw(GER_DST_Y, dstY); + + // Dimensions + outportw(GER_DIM_X, w - 1); + outportw(GER_DIM_Y, h - 1); + + // Fire host blit command + outportb(GER_COMMAND, GER_CMD_HOSTBLT); + + // Feed pixel data row by row as dwords + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t d = 0; d < dwordsPerRow; d++) { + int32_t base = d * 4; + 
uint32_t dword = 0; + + for (int32_t b = 0; b < 4; b++) { + int32_t idx = base + b; + uint8_t byte = (idx < rowBytes) ? rowData[idx] : 0; + dword |= (uint32_t)byte << (b * 8); + } + + outportl(GER_SRC_X, dword); + } + } +} + + +// ============================================================ +// tgInit +// ============================================================ + +static bool tgInit(AccelDriverT *drv, const AccelModeRequestT *req) { + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + // Get LFB physical address from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + + // Unlock Trident extended registers + tgUnlockRegs(); + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + DpmiMappingT lfbMap; + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &lfbMap)) { + vgaRestoreTextMode(); + return false; + } + + // Fill in driver mode info + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = lfbMap.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Re-unlock after mode set (VESA BIOS may re-lock) + tgUnlockRegs(); + + // Set GER operation mode for current bpp + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // Set up hardware cursor at end of VRAM + priv->cursorOffset = priv->vramSize - TGUI_CURSOR_BYTES; + priv->cursorOffset &= ~(uint32_t)(TGUI_CURSOR_BYTES - 1); + + // Set cursor start address via CRTC extended registers + // The cursor address is stored as a byte offset divided by 1024 + uint32_t 
cursorAddrReg = priv->cursorOffset / 1024; + vgaCrtcWrite(0x44, cursorAddrReg & 0xFF); + vgaCrtcWrite(0x45, (cursorAddrReg >> 8) & 0xFF); + + drv->caps = ACAP_RECT_FILL | ACAP_BITBLT | ACAP_HOST_BLIT | ACAP_HW_CURSOR; + + tgWaitIdle(drv); + return true; +} + + +// ============================================================ +// tgMoveCursor +// ============================================================ +// +// Set the hardware cursor position via CRTC extended registers +// 0x40-0x43. X is at 0x40/0x41, Y is at 0x42/0x43. + +static void tgMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + vgaCrtcWrite(TGUI_CRTC_CURSOR_X_LO, x & 0xFF); + vgaCrtcWrite(TGUI_CRTC_CURSOR_X_HI, (x >> 8) & 0x07); + vgaCrtcWrite(TGUI_CRTC_CURSOR_Y_LO, y & 0xFF); + vgaCrtcWrite(TGUI_CRTC_CURSOR_Y_HI, (y >> 8) & 0x07); +} + + +// ============================================================ +// tgRectFill +// ============================================================ +// +// Solid rectangle fill using the GER engine in pattern source mode. +// The foreground color register provides the fill color, and the +// ROP is set to pattern copy (0xF0). 
+ +static void tgRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + if (w <= 0 || h <= 0) { + return; + } + + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + tgWaitIdle(drv); + + // Set operation mode (bpp) + outportw(GER_OPERMODE, tgGetBppMode(priv->bytesPerPixel)); + + // Foreground color for the fill + outportl(GER_FG_COLOR, color); + + // ROP: pattern copy (solid fill uses fg color as pattern) + outportb(GER_ROP, TGUI_ROP_PAT_COPY); + + // Destination coordinates + outportw(GER_DST_X, x); + outportw(GER_DST_Y, y); + + // Dimensions (width - 1, height - 1) + outportw(GER_DIM_X, w - 1); + outportw(GER_DIM_Y, h - 1); + + // Fire solid fill command + outportb(GER_COMMAND, GER_CMD_SOLID_FILL); +} + + +// ============================================================ +// tgSetCursor +// ============================================================ +// +// Upload a cursor image to VRAM at the cursor offset. The TGUI +// cursor format is 64x64 with 16 bytes per row: 8 bytes AND mask +// followed by 8 bytes XOR mask. 
+ +static void tgSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + TridentPrivateT *priv = (TridentPrivateT *)drv->privData; + + if (!image) { + tgShowCursor(drv, false); + return; + } + + tgWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < TGUI_CURSOR_SIZE; row++) { + for (int32_t col = 0; col < 8; col++) { + int32_t srcIdx = row * 8 + col; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && col < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + // Transparent: AND=0xFF, XOR=0x00 + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + col] = andByte; + cursorMem[row * 16 + col + 8] = xorByte; + } + } +} + + +// ============================================================ +// tgShowCursor +// ============================================================ +// +// Enable or disable the hardware cursor via CRTC extended +// register 0x50, bit 7. + +static void tgShowCursor(AccelDriverT *drv, bool visible) { + (void)drv; + + uint8_t val = vgaCrtcRead(TGUI_CRTC_CURSOR_CTRL); + + if (visible) { + val |= 0x80; + } else { + val &= ~0x80; + } + + vgaCrtcWrite(TGUI_CRTC_CURSOR_CTRL, val); +} + + +// ============================================================ +// tgShutdown +// ============================================================ + +static void tgShutdown(AccelDriverT *drv) { + tgShowCursor(drv, false); + tgWaitIdle(drv); + vgaRestoreTextMode(); + __djgpp_nearptr_disable(); +} + + +// ============================================================ +// tgUnlockRegs +// ============================================================ +// +// Unlock Trident extended registers. Reading SR0B returns the +// chip version/ID and simultaneously unlocks the extended +// sequencer and CRTC registers. Then writing 0x01 to SR0E +// enables new-mode registers on TGUI chips. 
+ +static void tgUnlockRegs(void) { + // Read SR0B to unlock extensions (returns chip ID) + outportb(VGA_SEQ_INDEX, 0x0B); + // The value read back is discarded; only the read itself is wanted. + (void)inportb(VGA_SEQ_DATA); + + // Enable new-mode TGUI registers + outportb(VGA_SEQ_INDEX, 0x0E); + outportb(VGA_SEQ_DATA, 0x01); +} + + +// ============================================================ +// tgWaitIdle +// ============================================================ +// +// Wait for the GER engine to finish. Polls the status register +// at 0x2120 until bit 0 (busy) clears. The poll is bounded: after +// TGUI_MAX_IDLE_WAIT reads the function returns anyway, and the +// caller is not told that the engine is still busy. + +static void tgWaitIdle(AccelDriverT *drv) { + // drv is unused; the parameter is there because this function is + // installed as the driver's waitIdle hook. + (void)drv; + + for (int32_t i = 0; i < TGUI_MAX_IDLE_WAIT; i++) { + if (!(inportw(GER_STATUS) & GER_STATUS_BUSY)) { + return; + } + } +} diff --git a/tsengW32.c b/tsengW32.c new file mode 100644 index 0000000..e721d6f --- /dev/null +++ b/tsengW32.c @@ -0,0 +1,698 @@ +// tsengW32.c -- Tseng ET4000/W32p accelerated video driver +// +// Supports the Tseng Labs ET4000/W32 family: W32, W32i, W32p rev A/B/C/D. +// These chips were common in ISA/VLB and early PCI systems of the early +// 1990s, offering good 2D acceleration for their era. +// +// The W32 ACL (Accelerator) engine provides: +// - Solid rectangle fill +// - 8x8 pattern fill (mono and color) +// - Screen-to-screen BitBLT +// - CPU-to-screen color expansion +// - Bresenham line draw (W32p only) +// - Hardware cursor (64x64 on W32p, not on W32/W32i) +// +// Register access: +// The ACL registers are accessed via I/O ports in the 0x21xx range +// after unlocking with a key sequence. The ACL uses a different +// programming model from S3 or ATI -- operations are set up by +// writing source/destination addresses, dimensions, and mix/ROP +// to indexed registers, then triggered by writing to the +// accelerator control register. +// +// On the W32p, an MMU (Memory Management Unit) provides four +// apertures at the end of the linear address space that can be +// used for CPU-to-screen data transfer, avoiding I/O port +// overhead for host blits. 
+ +#include "accelVid.h" +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include + +// ============================================================ +// Tseng vendor/device IDs +// ============================================================ + +#define TSENG_VENDOR_ID 0x100C + +#define TSENG_W32 0x3202 +#define TSENG_W32I 0x3205 +#define TSENG_W32P_A 0x3206 +#define TSENG_W32P_B 0x3207 +#define TSENG_W32P_C 0x3208 +#define TSENG_W32P_D 0x4702 + +static const uint16_t sTsengDeviceIds[] = { + TSENG_VENDOR_ID, TSENG_W32, + TSENG_VENDOR_ID, TSENG_W32I, + TSENG_VENDOR_ID, TSENG_W32P_A, + TSENG_VENDOR_ID, TSENG_W32P_B, + TSENG_VENDOR_ID, TSENG_W32P_C, + TSENG_VENDOR_ID, TSENG_W32P_D, + 0, 0 +}; + +// ============================================================ +// Tseng ACL register ports +// ============================================================ +// +// The ACL registers are at I/O ports 0x2100-0x217F. They are +// accessed as indexed registers via a base+offset scheme. 
+ +#define ET_ACL_SUSPEND_TERM 0x2100 // suspend/terminate +#define ET_ACL_OPERATION_STATE 0x2101 // operation state (read) +#define ET_ACL_SYNC_ENABLE 0x2102 // sync enable +#define ET_ACL_INT_STATUS 0x2109 // interrupt status +#define ET_ACL_INT_MASK 0x210A // interrupt mask + +// ACL setup registers +#define ET_ACL_PATTERN_ADDR 0x2110 // pattern address (3 bytes) +#define ET_ACL_SOURCE_ADDR 0x2114 // source address (3 bytes) +#define ET_ACL_PATTERN_Y_OFF 0x2118 // pattern Y offset +#define ET_ACL_SOURCE_Y_OFF 0x211A // source Y offset +#define ET_ACL_DEST_Y_OFF 0x211C // destination Y offset + +// Virtual bus size affects transfer granularity +#define ET_ACL_VBUS_SIZE 0x2120 // virtual bus size + +// X/Y count (dimensions) +#define ET_ACL_XY_DIR 0x2124 // X/Y direction +#define ET_ACL_X_COUNT 0x2128 // X count (width - 1, in bytes) +#define ET_ACL_Y_COUNT 0x212A // Y count (height - 1) + +// Routing control +#define ET_ACL_ROUTING_CTRL 0x2126 // routing control + +// Mix/ROP registers +#define ET_ACL_MIX_CONTROL 0x2127 // foreground/background source +#define ET_ACL_ROP 0x2130 // raster operation + +// Destination address +#define ET_ACL_DEST_ADDR 0x2134 // destination address (3 bytes) + +// Pixel depth control +#define ET_ACL_PIXEL_DEPTH 0x2138 // pixel depth (0=8, 1=15/16, 2=24, 3=32) + +// CPU source data port (for host-to-screen) +#define ET_ACL_CPU_DATA 0x2140 // CPU data register (32-bit) + +// ============================================================ +// ACL direction bits (ET_ACL_XY_DIR) +// ============================================================ + +#define ET_DIR_X_POS 0x00 +#define ET_DIR_X_NEG 0x01 +#define ET_DIR_Y_POS 0x00 +#define ET_DIR_Y_NEG 0x02 + +// ============================================================ +// ACL routing control (ET_ACL_ROUTING_CTRL) +// ============================================================ + +#define ET_ROUTE_SRC_VRAM 0x00 // source from video memory +#define ET_ROUTE_SRC_CPU 0x02 // source from CPU 
+#define ET_ROUTE_SRC_PATTERN 0x04 // source from pattern +#define ET_ROUTE_SRC_COLOR_EXP 0x06 // source is mono -> color expand +#define ET_ROUTE_DST_VRAM 0x00 // destination to video memory + +// ============================================================ +// ACL mix control (ET_ACL_MIX_CONTROL) +// ============================================================ + +#define ET_MIX_FG_SRC 0x00 // foreground from source +#define ET_MIX_FG_PATTERN 0x04 // foreground from pattern +#define ET_MIX_FG_COLOR 0x08 // foreground from foreground color reg +#define ET_MIX_BG_SRC 0x00 // background from source +#define ET_MIX_BG_PATTERN 0x10 // background from pattern +#define ET_MIX_BG_COLOR 0x20 // background from background color reg + +// ============================================================ +// ACL operation state bits +// ============================================================ + +#define ET_ACCEL_BUSY 0x02 // accelerator busy +#define ET_ACCEL_CMD_READY 0x01 // ready for next command + +// ============================================================ +// ACL suspend/terminate control +// ============================================================ + +#define ET_ACL_START 0x00 // start/continue operation +#define ET_ACL_SUSPEND 0x01 // suspend +#define ET_ACL_TERMINATE 0x02 // terminate + +// Common ROPs +#define ET_ROP_COPY 0xCC // dest = source +#define ET_ROP_PAT_COPY 0xF0 // dest = pattern +#define ET_ROP_ZERO 0x00 +#define ET_ROP_ONE 0xFF +#define ET_ROP_XOR 0x66 + +// Hardware cursor +#define ET_HW_CURSOR_SIZE 64 +#define ET_HW_CURSOR_BYTES 1024 + +// Maximum wait iterations +#define ET_MAX_IDLE_WAIT 1000000 + +// ============================================================ +// Private driver state +// ============================================================ + +typedef struct { + uint32_t lfbPhysAddr; + uint32_t vramSize; + uint32_t cursorOffset; + int32_t bytesPerPixel; + int32_t screenPitch; + bool isW32p; // W32p has more features than W32/W32i +} 
TsengPrivateT; + +// ============================================================ +// Prototypes +// ============================================================ + +static void etBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool etDetect(AccelDriverT *drv); +static void etHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h); +static bool etInit(AccelDriverT *drv, const AccelModeRequestT *req); +static void etMoveCursor(AccelDriverT *drv, int32_t x, int32_t y); +static void etRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color); +static void etSetCursor(AccelDriverT *drv, const HwCursorImageT *image); +static void etShowCursor(AccelDriverT *drv, bool visible); +static void etShutdown(AccelDriverT *drv); +static void etUnlockRegs(void); +static void etWaitIdle(AccelDriverT *drv); + +// ============================================================ +// Driver instance +// ============================================================ + +static TsengPrivateT sTsengPrivate; + +static AccelDriverT sTsengDriver = { + .name = "Tseng ET4000/W32p", + .chipFamily = "tseng", + .caps = 0, + .privData = &sTsengPrivate, + .detect = etDetect, + .init = etInit, + .shutdown = etShutdown, + .waitIdle = etWaitIdle, + .setClip = NULL, // W32 has no hardware scissors + .rectFill = etRectFill, + .rectFillPat = NULL, + .bitBlt = etBitBlt, + .hostBlit = etHostBlit, + .colorExpand = NULL, + .lineDraw = NULL, // Line draw is complex on W32, omit for now + .setCursor = etSetCursor, + .moveCursor = etMoveCursor, + .showCursor = etShowCursor, +}; + +// ============================================================ +// etRegisterDriver +// ============================================================ + +void etRegisterDriver(void) { + accelRegisterDriver(&sTsengDriver); +} + + +// 
============================================================ +// etBitBlt +// ============================================================ +// +// Screen-to-screen BitBLT using the ACL engine. Source and +// destination are linear byte addresses in VRAM. Direction is +// controlled to handle overlapping regions. + +static void etBitBlt(AccelDriverT *drv, int32_t srcX, int32_t srcY, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + uint32_t srcAddr = srcY * pitch + srcX * bpp; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + + uint8_t direction = ET_DIR_X_POS | ET_DIR_Y_POS; + + if (dstAddr > srcAddr) { + direction = ET_DIR_X_NEG | ET_DIR_Y_NEG; + srcAddr += (h - 1) * pitch + (w - 1) * bpp; + dstAddr += (h - 1) * pitch + (w - 1) * bpp; + } + + int32_t widthBytes = w * bpp - 1; + + etWaitIdle(drv); + + // Set pixel depth + uint8_t pixDepth = 0; + if (bpp == 2) { pixDepth = 1; } + if (bpp == 4) { pixDepth = 3; } + outportb(ET_ACL_PIXEL_DEPTH, pixDepth); + + // Source routing: VRAM to VRAM + outportb(ET_ACL_ROUTING_CTRL, ET_ROUTE_SRC_VRAM | ET_ROUTE_DST_VRAM); + + // ROP: copy + outportb(ET_ACL_ROP, ET_ROP_COPY); + + // Direction + outportb(ET_ACL_XY_DIR, direction); + + // Source Y offset (pitch) + outportw(ET_ACL_SOURCE_Y_OFF, pitch - 1); + + // Dest Y offset (pitch) + outportw(ET_ACL_DEST_Y_OFF, pitch - 1); + + // X and Y counts + outportw(ET_ACL_X_COUNT, widthBytes); + outportw(ET_ACL_Y_COUNT, h - 1); + + // Source address (24-bit) + outportb(ET_ACL_SOURCE_ADDR, srcAddr & 0xFF); + outportb(ET_ACL_SOURCE_ADDR + 1, (srcAddr >> 8) & 0xFF); + outportb(ET_ACL_SOURCE_ADDR + 2, (srcAddr >> 16) & 0xFF); + + // Destination address (triggers operation) + outportb(ET_ACL_DEST_ADDR, dstAddr & 0xFF); + outportb(ET_ACL_DEST_ADDR + 1, (dstAddr >> 8) & 0xFF); + outportb(ET_ACL_DEST_ADDR + 2, (dstAddr >> 
16) & 0xFF); + + // Start + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); +} + + +// ============================================================ +// etDetect +// ============================================================ + +static bool etDetect(AccelDriverT *drv) { + int32_t matchIdx; + + if (!pciFindDeviceList(sTsengDeviceIds, &drv->pciDev, &matchIdx)) { + return false; + } + + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + switch (drv->pciDev.deviceId) { + case TSENG_W32: + drv->name = "Tseng ET4000/W32"; + priv->isW32p = false; + break; + case TSENG_W32I: + drv->name = "Tseng ET4000/W32i"; + priv->isW32p = false; + break; + case TSENG_W32P_A: + case TSENG_W32P_B: + case TSENG_W32P_C: + case TSENG_W32P_D: + drv->name = "Tseng ET4000/W32p"; + priv->isW32p = true; + break; + default: + drv->name = "Tseng ET4000/W32"; + priv->isW32p = false; + break; + } + + return true; +} + + +// ============================================================ +// etHostBlit +// ============================================================ +// +// CPU-to-screen blit. Transfers pixel data from system memory to +// the framebuffer via the ACL engine. Source routing is set to CPU +// and data is fed as 32-bit dwords through ET_ACL_CPU_DATA. Each +// row of source pixels is packed into dwords with padding to a +// 4-byte boundary. 
+ +static void etHostBlit(AccelDriverT *drv, const uint8_t *srcBuf, int32_t srcPitch, int32_t dstX, int32_t dstY, int32_t w, int32_t h) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + uint32_t dstAddr = dstY * pitch + dstX * bpp; + int32_t widthBytes = w * bpp - 1; + int32_t rowBytes = w * bpp; + int32_t padBytesPerRow = (rowBytes + 3) & ~3; + int32_t dwordsPerRow = padBytesPerRow / 4; + + etWaitIdle(drv); + + // Set pixel depth + uint8_t pixDepth = 0; + if (bpp == 2) { pixDepth = 1; } + if (bpp == 4) { pixDepth = 3; } + outportb(ET_ACL_PIXEL_DEPTH, pixDepth); + + // Routing: source from CPU, destination to VRAM + outportb(ET_ACL_ROUTING_CTRL, ET_ROUTE_SRC_CPU | ET_ROUTE_DST_VRAM); + + // ROP: copy + outportb(ET_ACL_ROP, ET_ROP_COPY); + + // Direction: forward + outportb(ET_ACL_XY_DIR, ET_DIR_X_POS | ET_DIR_Y_POS); + + // Dest Y offset (pitch) + outportw(ET_ACL_DEST_Y_OFF, pitch - 1); + + // X and Y counts + outportw(ET_ACL_X_COUNT, widthBytes); + outportw(ET_ACL_Y_COUNT, h - 1); + + // Destination address + outportb(ET_ACL_DEST_ADDR, dstAddr & 0xFF); + outportb(ET_ACL_DEST_ADDR + 1, (dstAddr >> 8) & 0xFF); + outportb(ET_ACL_DEST_ADDR + 2, (dstAddr >> 16) & 0xFF); + + // Start + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); + + // Feed pixel data as dwords, row by row + for (int32_t row = 0; row < h; row++) { + const uint8_t *rowData = srcBuf + row * srcPitch; + + for (int32_t d = 0; d < dwordsPerRow; d++) { + int32_t base = d * 4; + uint32_t dword = 0; + + for (int32_t b = 0; b < 4; b++) { + int32_t idx = base + b; + uint8_t byte = (idx < rowBytes) ? 
rowData[idx] : 0; + dword |= (uint32_t)byte << (b * 8); + } + + outportl(ET_ACL_CPU_DATA, dword); + } + } +} + + +// ============================================================ +// etInit +// ============================================================ + +static bool etInit(AccelDriverT *drv, const AccelModeRequestT *req) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + // Get LFB from PCI BAR0 + uint32_t bar0 = pciRead32(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + priv->lfbPhysAddr = bar0 & 0xFFFFFFF0; + priv->vramSize = pciSizeBar(drv->pciDev.bus, drv->pciDev.dev, + drv->pciDev.func, PCI_BAR0); + + // Unlock Tseng extended registers + etUnlockRegs(); + + // Find and set VESA mode + VesaModeResultT vesa; + if (!vesaFindAndSetMode(req->width, req->height, req->bpp, &vesa)) { + return false; + } + + // Map LFB via DPMI + DpmiMappingT lfbMap; + if (!dpmiMapFramebuffer(priv->lfbPhysAddr, priv->vramSize, &lfbMap)) { + vgaRestoreTextMode(); + return false; + } + + // Fill in driver mode info + priv->bytesPerPixel = (vesa.bpp + 7) / 8; + priv->screenPitch = vesa.pitch; + + drv->mode.width = vesa.width; + drv->mode.height = vesa.height; + drv->mode.bpp = vesa.bpp; + drv->mode.pitch = vesa.pitch; + drv->mode.framebuffer = lfbMap.ptr; + drv->mode.vramSize = priv->vramSize; + drv->mode.offscreenBase = vesa.pitch * vesa.height; + + // Re-unlock after mode set + etUnlockRegs(); + + // Reset the ACL engine + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_TERMINATE); + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); + + // Set up cursor at end of VRAM (W32p only) + if (priv->isW32p) { + priv->cursorOffset = priv->vramSize - ET_HW_CURSOR_BYTES; + priv->cursorOffset &= ~(ET_HW_CURSOR_BYTES - 1); + } + + drv->caps = ACAP_RECT_FILL | ACAP_BITBLT | ACAP_HOST_BLIT; + + if (priv->isW32p) { + drv->caps |= ACAP_HW_CURSOR; + } + + etWaitIdle(drv); + return true; +} + + +// ============================================================ +// etMoveCursor +// 
============================================================ +// +// The W32p hardware cursor position is set through CRTC extended +// registers (IMA port area). Cursor X is at CRTC index 0x40/0x41, +// cursor Y at 0x42/0x43. + +static void etMoveCursor(AccelDriverT *drv, int32_t x, int32_t y) { + (void)drv; + + if (x < 0) { x = 0; } + if (y < 0) { y = 0; } + + // ET4000/W32p cursor position registers + outportb(0x217A, 0xE0); // cursor X low + outportb(0x217B, x & 0xFF); + outportb(0x217A, 0xE1); // cursor X high + outportb(0x217B, (x >> 8) & 0x07); + outportb(0x217A, 0xE2); // cursor Y low + outportb(0x217B, y & 0xFF); + outportb(0x217A, 0xE3); // cursor Y high + outportb(0x217B, (y >> 8) & 0x07); +} + + +// ============================================================ +// etRectFill +// ============================================================ +// +// Solid fill using the ACL engine. We write a single pixel of +// the fill color to an offscreen VRAM location and use it as +// the "source" for a replicated blit. 
+ +static void etRectFill(AccelDriverT *drv, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (w <= 0 || h <= 0) { + return; + } + + int32_t bpp = priv->bytesPerPixel; + int32_t pitch = priv->screenPitch; + + // Write the fill color to an offscreen VRAM location for pattern source + // Use just past the visible screen area + uint32_t patAddr = priv->vramSize - 64; // safe offscreen area + uint8_t *fb = drv->mode.framebuffer; + + etWaitIdle(drv); + + // Write pattern pixel(s) to VRAM + for (int32_t i = 0; i < bpp; i++) { + fb[patAddr + i] = (color >> (i * 8)) & 0xFF; + } + + uint32_t dstAddr = y * pitch + x * bpp; + int32_t widthBytes = w * bpp - 1; + + // Set pixel depth + uint8_t pixDepth = 0; + if (bpp == 2) { pixDepth = 1; } + if (bpp == 4) { pixDepth = 3; } + outportb(ET_ACL_PIXEL_DEPTH, pixDepth); + + // Routing: pattern fill + outportb(ET_ACL_ROUTING_CTRL, ET_ROUTE_SRC_PATTERN | ET_ROUTE_DST_VRAM); + + // ROP: pattern copy + outportb(ET_ACL_ROP, ET_ROP_PAT_COPY); + + // Direction: forward + outportb(ET_ACL_XY_DIR, ET_DIR_X_POS | ET_DIR_Y_POS); + + // Pattern address and Y offset + outportb(ET_ACL_PATTERN_ADDR, patAddr & 0xFF); + outportb(ET_ACL_PATTERN_ADDR + 1, (patAddr >> 8) & 0xFF); + outportb(ET_ACL_PATTERN_ADDR + 2, (patAddr >> 16) & 0xFF); + outportw(ET_ACL_PATTERN_Y_OFF, 0); // single-line pattern + + // Dest Y offset + outportw(ET_ACL_DEST_Y_OFF, pitch - 1); + + // Dimensions + outportw(ET_ACL_X_COUNT, widthBytes); + outportw(ET_ACL_Y_COUNT, h - 1); + + // Destination address (triggers operation) + outportb(ET_ACL_DEST_ADDR, dstAddr & 0xFF); + outportb(ET_ACL_DEST_ADDR + 1, (dstAddr >> 8) & 0xFF); + outportb(ET_ACL_DEST_ADDR + 2, (dstAddr >> 16) & 0xFF); + + // Start + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_START); +} + + +// ============================================================ +// etSetCursor +// ============================================================ + +static 
void etSetCursor(AccelDriverT *drv, const HwCursorImageT *image) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (!priv->isW32p) { + return; + } + + if (!image) { + etShowCursor(drv, false); + return; + } + + etWaitIdle(drv); + + uint8_t *cursorMem = drv->mode.framebuffer + priv->cursorOffset; + + for (int32_t row = 0; row < ET_HW_CURSOR_SIZE; row++) { + for (int32_t byte = 0; byte < 8; byte++) { + int32_t srcIdx = row * 8 + byte; + uint8_t andByte; + uint8_t xorByte; + + if (row < image->height && byte < (image->width + 7) / 8) { + andByte = image->andMask[srcIdx]; + xorByte = image->xorMask[srcIdx]; + } else { + andByte = 0xFF; + xorByte = 0x00; + } + + cursorMem[row * 16 + byte] = andByte; + cursorMem[row * 16 + byte + 8] = xorByte; + } + } + + // Set cursor address via IMA registers + uint32_t cursorAddr = priv->cursorOffset / 4; // in dword units + outportb(0x217A, 0xE8); + outportb(0x217B, cursorAddr & 0xFF); + outportb(0x217A, 0xE9); + outportb(0x217B, (cursorAddr >> 8) & 0xFF); + outportb(0x217A, 0xEA); + outportb(0x217B, (cursorAddr >> 16) & 0x0F); +} + + +// ============================================================ +// etShowCursor +// ============================================================ + +static void etShowCursor(AccelDriverT *drv, bool visible) { + TsengPrivateT *priv = (TsengPrivateT *)drv->privData; + + if (!priv->isW32p) { + return; + } + + // Cursor control via IMA register 0xF7 + outportb(0x217A, 0xF7); + uint8_t val = inportb(0x217B); + + if (visible) { + val |= 0x80; + } else { + val &= ~0x80; + } + + outportb(0x217A, 0xF7); + outportb(0x217B, val); +} + + +// ============================================================ +// etShutdown +// ============================================================ + +static void etShutdown(AccelDriverT *drv) { + etShowCursor(drv, false); + outportb(ET_ACL_SUSPEND_TERM, ET_ACL_TERMINATE); + vgaRestoreTextMode(); + __djgpp_nearptr_disable(); +} + + +// 
============================================================ +// etUnlockRegs +// ============================================================ +// +// Unlock Tseng extended registers. +// ET4000: write 0x03 to the "key" register at 0x3BF/0x3D8. +// This enables access to extended CRTC and attribute registers. + +static void etUnlockRegs(void) { + outportb(0x3BF, 0x03); + outportb(0x3D8, 0xA0); +} + + +// ============================================================ +// etWaitIdle +// ============================================================ +// +// Wait for the ACL engine to finish. Poll the operation state +// register for the busy bit to clear. + +static void etWaitIdle(AccelDriverT *drv) { + (void)drv; + + for (int32_t i = 0; i < ET_MAX_IDLE_WAIT; i++) { + if (!(inportb(ET_ACL_OPERATION_STATE) & ET_ACCEL_BUSY)) { + return; + } + } +} diff --git a/vgaCommon.c b/vgaCommon.c new file mode 100644 index 0000000..3f8e8f2 --- /dev/null +++ b/vgaCommon.c @@ -0,0 +1,505 @@ +// vgaCommon.c -- Shared VGA register programming +// +// Implements read/write access to the five standard VGA register +// groups. These are used by all chip-specific drivers for basic +// mode setup before enabling acceleration. +// +// Important timing note: on real hardware, some registers require +// specific sequencing (e.g. attribute controller must be reset via +// a read of Input Status 1 before writing the index). These +// functions handle the sequencing internally. 
+ +#include "vgaCommon.h" +#include "pci.h" + +#include +#include +#include +#include +#include +#include +#include + +// VESA mode scoring weights (same as DVX) +#define MODE_SCORE_16BPP 100 +#define MODE_SCORE_15BPP 90 +#define MODE_SCORE_32BPP 85 +#define MODE_SCORE_8BPP 70 +#define MODE_SCORE_PREF_BPP 20 +#define MODE_SCORE_EXACT_RES 10 + +// ============================================================ +// Prototypes +// ============================================================ + +bool dpmiMapFramebuffer(uint32_t physAddr, uint32_t size, DpmiMappingT *mapping); +void dpmiUnmapFramebuffer(DpmiMappingT *mapping); +uint32_t pciSizeBar(uint8_t bus, uint8_t dev, uint8_t func, uint8_t barReg); +uint8_t vgaAttrRead(uint8_t index); +void vgaAttrReset(void); +void vgaAttrWrite(uint8_t index, uint8_t val); +void vgaBlankScreen(bool blank); +uint8_t vgaCrtcRead(uint8_t index); +void vgaCrtcLock(void); +void vgaCrtcUnlock(void); +void vgaCrtcWrite(uint8_t index, uint8_t val); +void vgaDacReadColor(uint8_t index, uint8_t *r, uint8_t *g, uint8_t *b); +void vgaDacWriteColor(uint8_t index, uint8_t r, uint8_t g, uint8_t b); +uint8_t vgaGfxRead(uint8_t index); +void vgaGfxWrite(uint8_t index, uint8_t val); +uint8_t vgaMiscRead(void); +void vgaMiscWrite(uint8_t val); +void vgaRestoreTextMode(void); +uint8_t vgaSeqRead(uint8_t index); +void vgaSeqWrite(uint8_t index, uint8_t val); +bool vesaFindAndSetMode(int32_t reqW, int32_t reqH, int32_t reqBpp, VesaModeResultT *result); +void vgaWaitVRetrace(void); + +// ============================================================ +// dpmiMapFramebuffer +// ============================================================ +// +// Maps a physical address region into the DJGPP near pointer +// address space via DPMI. This is the three-step process that +// every driver needs: +// 1. Map physical address to linear address +// 2. Lock the pages to prevent swapping +// 3. 
Enable near pointers for direct C pointer access +// +// Returns true on success. On failure, mapping->ptr is NULL. + +bool dpmiMapFramebuffer(uint32_t physAddr, uint32_t size, DpmiMappingT *mapping) { + __dpmi_meminfo info; + + memset(mapping, 0, sizeof(*mapping)); + + info.address = physAddr; + info.size = size; + + if (__dpmi_physical_address_mapping(&info) != 0) { + fprintf(stderr, "dpmiMap: Failed to map 0x%08lX (%lu bytes)\n", + (unsigned long)physAddr, (unsigned long)size); + return false; + } + + __dpmi_meminfo lockInfo; + lockInfo.address = info.address; + lockInfo.size = size; + __dpmi_lock_linear_region(&lockInfo); + + if (__djgpp_nearptr_enable() == 0) { + fprintf(stderr, "dpmiMap: Failed to enable near pointers\n"); + return false; + } + + mapping->ptr = (uint8_t *)(info.address + __djgpp_conventional_base); + mapping->linearAddr = info.address; + mapping->size = size; + + return true; +} + + +// ============================================================ +// dpmiUnmapFramebuffer +// ============================================================ + +void dpmiUnmapFramebuffer(DpmiMappingT *mapping) { + if (mapping->ptr) { + __djgpp_nearptr_disable(); + mapping->ptr = NULL; + } +} + + +// ============================================================ +// pciSizeBar +// ============================================================ +// +// Determines the size of a PCI BAR by writing all 1s and reading +// back the mask. Saves and restores the original BAR value. 
+ +uint32_t pciSizeBar(uint8_t bus, uint8_t dev, uint8_t func, uint8_t barReg) { + uint32_t saved = pciRead32(bus, dev, func, barReg); + + pciWrite32(bus, dev, func, barReg, 0xFFFFFFFF); + uint32_t mask = pciRead32(bus, dev, func, barReg); + pciWrite32(bus, dev, func, barReg, saved); + + // Decode: invert the writable bits, add 1 + mask &= 0xFFFFFFF0; // mask off type bits + + if (mask == 0) { + return 0; + } + + return (~mask) + 1; +} + + +// ============================================================ +// vesaFindAndSetMode +// ============================================================ +// +// Enumerates VESA VBE modes, scores them against the requested +// resolution and bpp, sets the best match with LFB enabled, and +// returns the mode details. This replaces ~150 lines of identical +// code in every driver. + +bool vesaFindAndSetMode(int32_t reqW, int32_t reqH, int32_t reqBpp, VesaModeResultT *result) { + __dpmi_regs r; + + memset(result, 0, sizeof(*result)); + + // Get VBE controller info + _farpokeb(_dos_ds, __tb + 0, 'V'); + _farpokeb(_dos_ds, __tb + 1, 'B'); + _farpokeb(_dos_ds, __tb + 2, 'E'); + _farpokeb(_dos_ds, __tb + 3, '2'); + + memset(&r, 0, sizeof(r)); + r.x.ax = 0x4F00; + r.x.es = __tb >> 4; + r.x.di = __tb & 0x0F; + __dpmi_int(0x10, &r); + + if (r.x.ax != 0x004F) { + fprintf(stderr, "vesaFindAndSetMode: VBE not available\n"); + return false; + } + + // Copy mode list before 4F01h overwrites __tb + uint16_t modeListOff = _farpeekw(_dos_ds, __tb + 14); + uint16_t modeListSeg = _farpeekw(_dos_ds, __tb + 16); + uint32_t modeListAddr = ((uint32_t)modeListSeg << 4) + modeListOff; + + uint16_t modes[256]; + int32_t modeCount = 0; + + for (int32_t i = 0; i < 256; i++) { + uint16_t mode = _farpeekw(_dos_ds, modeListAddr + i * 2); + if (mode == 0xFFFF) { + break; + } + modes[modeCount++] = mode; + } + + // Score each mode and find the best + uint16_t bestMode = 0; + int32_t bestScore = -1; + + for (int32_t i = 0; i < modeCount; i++) { + memset(&r, 0, 
sizeof(r)); + r.x.ax = 0x4F01; + r.x.cx = modes[i]; + r.x.es = __tb >> 4; + r.x.di = __tb & 0x0F; + __dpmi_int(0x10, &r); + + if (r.x.ax != 0x004F) { + continue; + } + + uint16_t attr = _farpeekw(_dos_ds, __tb + 0); + int32_t w = _farpeekw(_dos_ds, __tb + 18); + int32_t h = _farpeekw(_dos_ds, __tb + 20); + int32_t bpp = _farpeekb(_dos_ds, __tb + 25); + int32_t pitch = _farpeekw(_dos_ds, __tb + 16); + uint32_t phys = _farpeekl(_dos_ds, __tb + 40); + + // Must have LFB and be a graphics mode + if (!(attr & 0x0080) || !(attr & 0x0010)) { + continue; + } + + // Must meet requested resolution + if (w < reqW || h < reqH) { + continue; + } + + // Only 8/15/16/32 bpp + if (bpp != 8 && bpp != 15 && bpp != 16 && bpp != 32) { + continue; + } + + int32_t score = 0; + + if (bpp == 16) { score = MODE_SCORE_16BPP; } + else if (bpp == 15) { score = MODE_SCORE_15BPP; } + else if (bpp == 32) { score = MODE_SCORE_32BPP; } + else { score = MODE_SCORE_8BPP; } + + if (bpp == reqBpp) { score += MODE_SCORE_PREF_BPP; } + if (w == reqW && h == reqH) { score += MODE_SCORE_EXACT_RES; } + + if (score > bestScore) { + bestScore = score; + bestMode = modes[i]; + result->width = w; + result->height = h; + result->bpp = bpp; + result->pitch = pitch; + result->lfbPhysAddr = phys; + } + } + + if (bestScore < 0) { + fprintf(stderr, "vesaFindAndSetMode: No suitable mode for %ldx%ldx%ld\n", + (long)reqW, (long)reqH, (long)reqBpp); + return false; + } + + // Set the mode with LFB enabled (bit 14) + memset(&r, 0, sizeof(r)); + r.x.ax = 0x4F02; + r.x.bx = bestMode | 0x4000; // bit 14 = enable LFB + __dpmi_int(0x10, &r); + + if (r.x.ax != 0x004F) { + fprintf(stderr, "vesaFindAndSetMode: Failed to set mode 0x%04X\n", bestMode); + return false; + } + + return true; +} + + +// ============================================================ +// vgaAttrRead +// ============================================================ +// +// The attribute controller is unusual: reading Input Status 1 +// resets its flip-flop 
so the next write to 0x3C0 is treated as +// an index (not data). We must reset before every access. + +uint8_t vgaAttrRead(uint8_t index) { + inportb(VGA_INPUT_STATUS_1); + outportb(VGA_ATTR_INDEX, index); + return inportb(VGA_ATTR_DATA_R); +} + + +// ============================================================ +// vgaAttrReset +// ============================================================ +// +// Resets the attribute controller flip-flop by reading Input +// Status 1. After this, the next write to 0x3C0 is an index write. + +void vgaAttrReset(void) { + inportb(VGA_INPUT_STATUS_1); +} + + +// ============================================================ +// vgaAttrWrite +// ============================================================ +// +// Writes to the attribute controller. The flip-flop mechanism +// means we must: (1) read Input Status 1 to reset, (2) write +// the index to 0x3C0, (3) write the data to 0x3C0. +// Bit 5 of the index byte must be set to keep the palette +// address source enabled (otherwise the screen goes black). + +void vgaAttrWrite(uint8_t index, uint8_t val) { + inportb(VGA_INPUT_STATUS_1); + outportb(VGA_ATTR_INDEX, index); + outportb(VGA_ATTR_DATA_W, val); +} + + +// ============================================================ +// vgaBlankScreen +// ============================================================ +// +// Toggles the screen on/off by setting bit 5 of the sequencer +// clocking mode register. Blanking prevents visible garbage +// during mode transitions. + +void vgaBlankScreen(bool blank) { + uint8_t val = vgaSeqRead(VGA_SEQ_CLOCK_MODE); + + if (blank) { + val |= VGA_SEQ_SCREEN_OFF; + } else { + val &= ~VGA_SEQ_SCREEN_OFF; + } + + vgaSeqWrite(VGA_SEQ_CLOCK_MODE, val); +} + + +// ============================================================ +// vgaCrtcLock +// ============================================================ +// +// Re-enables CRTC write protection by setting bit 7 of the +// vertical sync end register. 
+ +void vgaCrtcLock(void) { + uint8_t val = vgaCrtcRead(VGA_CRTC_V_SYNC_END); + vgaCrtcWrite(VGA_CRTC_V_SYNC_END, val | 0x80); +} + + +// ============================================================ +// vgaCrtcRead +// ============================================================ + +uint8_t vgaCrtcRead(uint8_t index) { + outportb(VGA_CRTC_INDEX, index); + return inportb(VGA_CRTC_DATA); +} + + +// ============================================================ +// vgaCrtcUnlock +// ============================================================ +// +// Disables CRTC write protection. Registers 0x00-0x07 of the +// CRTC are protected by bit 7 of the vertical sync end register +// (0x11). Clearing this bit allows writing to those registers. + +void vgaCrtcUnlock(void) { + uint8_t val = vgaCrtcRead(VGA_CRTC_V_SYNC_END); + vgaCrtcWrite(VGA_CRTC_V_SYNC_END, val & 0x7F); +} + + +// ============================================================ +// vgaCrtcWrite +// ============================================================ + +void vgaCrtcWrite(uint8_t index, uint8_t val) { + outportb(VGA_CRTC_INDEX, index); + outportb(VGA_CRTC_DATA, val); +} + + +// ============================================================ +// vgaDacReadColor +// ============================================================ +// +// Read one DAC palette entry. Write the index to 0x3C7, then +// read three bytes (R, G, B) from 0x3C9. DAC values are 6-bit +// (0-63) on standard VGA, 8-bit on some SVGA cards. + +void vgaDacReadColor(uint8_t index, uint8_t *r, uint8_t *g, uint8_t *b) { + outportb(VGA_DAC_READ_ADDR, index); + *r = inportb(VGA_DAC_DATA); + *g = inportb(VGA_DAC_DATA); + *b = inportb(VGA_DAC_DATA); +} + + +// ============================================================ +// vgaDacWriteColor +// ============================================================ +// +// Write one DAC palette entry. Write the starting index to 0x3C8, +// then write three bytes (R, G, B) to 0x3C9. 
+ +void vgaDacWriteColor(uint8_t index, uint8_t r, uint8_t g, uint8_t b) { + outportb(VGA_DAC_WRITE_ADDR, index); + outportb(VGA_DAC_DATA, r); + outportb(VGA_DAC_DATA, g); + outportb(VGA_DAC_DATA, b); +} + + +// ============================================================ +// vgaGfxRead +// ============================================================ + +uint8_t vgaGfxRead(uint8_t index) { + outportb(VGA_GFX_INDEX, index); + return inportb(VGA_GFX_DATA); +} + + +// ============================================================ +// vgaGfxWrite +// ============================================================ + +void vgaGfxWrite(uint8_t index, uint8_t val) { + outportb(VGA_GFX_INDEX, index); + outportb(VGA_GFX_DATA, val); +} + + +// ============================================================ +// vgaMiscRead +// ============================================================ + +uint8_t vgaMiscRead(void) { + return inportb(VGA_MISC_OUT_R); +} + + +// ============================================================ +// vgaMiscWrite +// ============================================================ + +void vgaMiscWrite(uint8_t val) { + outportb(VGA_MISC_OUT_W, val); +} + + +// ============================================================ +// vgaRestoreTextMode +// ============================================================ +// +// Restores VGA text mode 3 (80x25, 16 color). Uses INT 10h +// because manually reprogramming all VGA registers for text mode +// is error-prone and varies by chipset. The BIOS handles it +// correctly for all VGA-compatible cards. 
+ +void vgaRestoreTextMode(void) { + __dpmi_regs r; + + memset(&r, 0, sizeof(r)); + r.x.ax = 0x0003; + __dpmi_int(0x10, &r); +} + + +// ============================================================ +// vgaSeqRead +// ============================================================ + +uint8_t vgaSeqRead(uint8_t index) { + outportb(VGA_SEQ_INDEX, index); + return inportb(VGA_SEQ_DATA); +} + + +// ============================================================ +// vgaSeqWrite +// ============================================================ + +void vgaSeqWrite(uint8_t index, uint8_t val) { + outportb(VGA_SEQ_INDEX, index); + outportb(VGA_SEQ_DATA, val); +} + + +// ============================================================ +// vgaWaitVRetrace +// ============================================================ +// +// Waits for the start of the next vertical retrace by spinning +// on bit 3 of Input Status 1 (port 0x3DA). First waits for bit +// to clear (if we're currently in retrace), then waits for it +// to set (start of next retrace). + +void vgaWaitVRetrace(void) { + // Wait for any current retrace to end + while (inportb(VGA_INPUT_STATUS_1) & 0x08) { + // spin + } + + // Wait for next retrace to start + while (!(inportb(VGA_INPUT_STATUS_1) & 0x08)) { + // spin + } +} diff --git a/vgaCommon.h b/vgaCommon.h new file mode 100644 index 0000000..8e9ae7a --- /dev/null +++ b/vgaCommon.h @@ -0,0 +1,198 @@ +// vgaCommon.h -- Shared VGA register programming for DOS/DJGPP +// +// Provides low-level access to the standard VGA register sets that +// are common across all VGA-compatible video cards. Every chipset +// driver needs these for basic mode setup before enabling its +// chip-specific acceleration extensions. 
+// +// The five standard VGA register groups: +// - Miscellaneous Output (0x3C2 write, 0x3CC read) +// - Sequencer (0x3C4/0x3C5) +// - CRTC (0x3D4/0x3D5 for color, 0x3B4/0x3B5 for mono) +// - Graphics Controller (0x3CE/0x3CF) +// - Attribute Controller (0x3C0/0x3C1, toggle via 0x3DA read) +// +// All functions use DJGPP's inportb/outportb for port I/O. +#ifndef VGA_COMMON_H +#define VGA_COMMON_H + +#include +#include + +// ============================================================ +// VGA I/O port addresses +// ============================================================ + +// Miscellaneous output register +#define VGA_MISC_OUT_W 0x3C2 // write +#define VGA_MISC_OUT_R 0x3CC // read + +// Input status registers +#define VGA_INPUT_STATUS_0 0x3C2 +#define VGA_INPUT_STATUS_1 0x3DA // color mode +#define VGA_INPUT_STATUS_1M 0x3BA // mono mode + +// Sequencer +#define VGA_SEQ_INDEX 0x3C4 +#define VGA_SEQ_DATA 0x3C5 + +// CRTC (color mode addresses -- we always use color) +#define VGA_CRTC_INDEX 0x3D4 +#define VGA_CRTC_DATA 0x3D5 + +// Graphics Controller +#define VGA_GFX_INDEX 0x3CE +#define VGA_GFX_DATA 0x3CF + +// Attribute Controller (index and data share 0x3C0) +#define VGA_ATTR_INDEX 0x3C0 +#define VGA_ATTR_DATA_W 0x3C0 +#define VGA_ATTR_DATA_R 0x3C1 + +// DAC (palette) +#define VGA_DAC_READ_ADDR 0x3C7 +#define VGA_DAC_WRITE_ADDR 0x3C8 +#define VGA_DAC_DATA 0x3C9 +#define VGA_DAC_STATE 0x3C7 + +// Feature control +#define VGA_FEATURE_W 0x3DA // write (color mode) +#define VGA_FEATURE_R 0x3CA // read + +// ============================================================ +// Sequencer register indices +// ============================================================ + +#define VGA_SEQ_RESET 0x00 +#define VGA_SEQ_CLOCK_MODE 0x01 +#define VGA_SEQ_PLANE_MASK 0x02 +#define VGA_SEQ_CHAR_MAP 0x03 +#define VGA_SEQ_MEM_MODE 0x04 + +// Sequencer clock mode bits +#define VGA_SEQ_SCREEN_OFF 0x20 // bit 5: blank the screen + +// 
// ============================================================
// CRTC register indices
// ============================================================
//
// Values written to VGA_CRTC_INDEX to select a CRTC register.

#define VGA_CRTC_H_TOTAL 0x00
#define VGA_CRTC_H_DISP_END 0x01
#define VGA_CRTC_H_BLANK_START 0x02
#define VGA_CRTC_H_BLANK_END 0x03
#define VGA_CRTC_H_SYNC_START 0x04
#define VGA_CRTC_H_SYNC_END 0x05
#define VGA_CRTC_V_TOTAL 0x06
#define VGA_CRTC_OVERFLOW 0x07
#define VGA_CRTC_PRESET_ROW 0x08
#define VGA_CRTC_MAX_SCAN 0x09
#define VGA_CRTC_CURSOR_START 0x0A
#define VGA_CRTC_CURSOR_END 0x0B
#define VGA_CRTC_START_ADDR_HI 0x0C
#define VGA_CRTC_START_ADDR_LO 0x0D
#define VGA_CRTC_CURSOR_HI 0x0E
#define VGA_CRTC_CURSOR_LO 0x0F
#define VGA_CRTC_V_SYNC_START 0x10
#define VGA_CRTC_V_SYNC_END 0x11
#define VGA_CRTC_V_DISP_END 0x12
#define VGA_CRTC_OFFSET 0x13
#define VGA_CRTC_UNDERLINE 0x14
#define VGA_CRTC_V_BLANK_START 0x15
#define VGA_CRTC_V_BLANK_END 0x16
#define VGA_CRTC_MODE_CTRL 0x17
#define VGA_CRTC_LINE_COMPARE 0x18

// ============================================================
// Graphics controller register indices
// ============================================================
//
// Values written to VGA_GFX_INDEX to select a graphics controller
// register.

#define VGA_GFX_SET_RESET 0x00
#define VGA_GFX_ENABLE_SET_RESET 0x01
#define VGA_GFX_COLOR_COMPARE 0x02
#define VGA_GFX_DATA_ROTATE 0x03
#define VGA_GFX_READ_MAP_SEL 0x04
#define VGA_GFX_MODE 0x05
#define VGA_GFX_MISC 0x06
#define VGA_GFX_COLOR_DONT_CARE 0x07
#define VGA_GFX_BIT_MASK 0x08

// ============================================================
// VESA mode result (returned by vesaFindAndSetMode)
// ============================================================
//
// Describes the VBE mode that was selected. All fields are copied
// from the VBE mode information block of the chosen mode.

typedef struct {
    int32_t width; // horizontal resolution of the selected mode
    int32_t height; // vertical resolution of the selected mode
    int32_t bpp; // bits per pixel (8, 15, 16 or 32)
    int32_t pitch; // scan line pitch as reported by VBE -- presumably bytes; TODO confirm
    uint32_t lfbPhysAddr; // physical address of LFB from VBE
} VesaModeResultT;

// ============================================================
// DPMI LFB mapping result (returned by dpmiMapFramebuffer)
//
// ============================================================
//
// ptr is NULL when no mapping is active.

typedef struct {
    uint8_t *ptr; // near pointer to mapped region
    uint32_t linearAddr; // linear address (for unmapping)
    uint32_t size; // mapped size in bytes
} DpmiMappingT;

// ============================================================
// Prototypes
// ============================================================

// Find the best VESA VBE mode matching the requested resolution
// and bpp, set it with LFB enabled, and return the mode details.
// Returns true on success; on failure a message is written to
// stderr and false is returned. This replaces ~150 lines of
// duplicated code in every driver.
bool vesaFindAndSetMode(int32_t reqW, int32_t reqH, int32_t reqBpp, VesaModeResultT *result);

// Map a physical address region into the DJGPP near pointer space
// via DPMI. Handles physical address mapping, page locking, and
// near pointer enable. Returns true on success.
bool dpmiMapFramebuffer(uint32_t physAddr, uint32_t size, DpmiMappingT *mapping);

// Unmap a previously mapped framebuffer region and disable near
// pointers. Safe to call with a zeroed mapping struct.
void dpmiUnmapFramebuffer(DpmiMappingT *mapping);

// Size a PCI BAR by writing all 1s and reading back. Returns the
// decoded size in bytes. Saves and restores the original BAR value.
uint32_t pciSizeBar(uint8_t bus, uint8_t dev, uint8_t func, uint8_t barReg);

// Read/write individual VGA register sets. Each indexed access
// selects the register through the group's index port first.
uint8_t vgaAttrRead(uint8_t index);
void vgaAttrReset(void);
void vgaAttrWrite(uint8_t index, uint8_t val);
uint8_t vgaCrtcRead(uint8_t index);
void vgaCrtcWrite(uint8_t index, uint8_t val);
uint8_t vgaGfxRead(uint8_t index);
void vgaGfxWrite(uint8_t index, uint8_t val);
uint8_t vgaMiscRead(void);
void vgaMiscWrite(uint8_t val);
uint8_t vgaSeqRead(uint8_t index);
void vgaSeqWrite(uint8_t index, uint8_t val);

// CRTC register protection: some CRTC registers are write-protected
// by bit 7 of the V_SYNC_END register. These functions unlock/lock.
void vgaCrtcLock(void);
void vgaCrtcUnlock(void);

// Palette (DAC) operations. r, g and b must be valid pointers for
// the read variant; they are written unconditionally.
void vgaDacReadColor(uint8_t index, uint8_t *r, uint8_t *g, uint8_t *b);
void vgaDacWriteColor(uint8_t index, uint8_t r, uint8_t g, uint8_t b);

// Restore VGA text mode (mode 3). Uses INT 10h for reliability
// across all chipsets.
void vgaRestoreTextMode(void);

// Wait for vertical retrace. Spins on Input Status 1 bit 3.
// Useful for timing-sensitive register writes and tear-free updates.
void vgaWaitVRetrace(void);

// Enable/disable VGA display output by toggling sequencer clocking
// mode bit 5. Used during mode transitions to prevent screen garbage.
void vgaBlankScreen(bool blank);

#endif // VGA_COMMON_H