DVX_GUI/dvx/dvxDraw.c

1358 lines
49 KiB
C

// dvxDraw.c -- Layer 2: Drawing primitives for DVX GUI (optimized)
//
// This is the second layer of the DVX compositor stack, sitting on top
// of dvxVideo (layer 1) and below dvxComp (layer 3). It provides all
// rasterization primitives: filled rects, buffer copies, beveled
// frames, bitmap font text, masked bitmaps (cursors/icons), and
// single-pixel operations.
//
// Every function here draws into the system-RAM backbuffer (d->backBuf),
// never directly to the LFB. The compositor layer is responsible for
// flushing changed regions to the hardware framebuffer via rep movsd.
// This separation means draw operations benefit from CPU cache (the
// backbuffer lives in cacheable system RAM) while LFB writes are
// batched into large sequential bursts.
//
// Performance strategy overview:
//
// The core tension on 486/Pentium is between generality and speed.
// The draw layer resolves this with a two-tier approach:
//
// 1) Span operations (spanFill/spanCopy) are dispatched through
// function pointers in BlitOpsT, set once at init based on bpp.
// The platform implementations use rep stosl/rep movsd inline asm
// for maximum throughput (the 486 executes rep stosl at 1 dword
// per clock after startup; the Pentium pairs it in the U-pipe).
// Using function pointers here costs one indirect call per span
// but avoids a bpp switch in the inner loop of rectFill, which
// would otherwise be a branch per scanline.
//
// 2) Character rendering (drawChar, drawTextN, drawTermRow) uses
// explicit if/else chains on bpp rather than function pointers.
// This is deliberate: the per-pixel work inside glyph rendering
// is a tight bit-test loop where an indirect call per pixel would
// be catastrophic, and the bpp branch is taken once per glyph row
// (hoisted out of the pixel loop). The compiler can also inline
// the pixel store when the bpp is a compile-time constant within
// each branch.
//
// 3) For the most critical glyph paths (unclipped 32bpp and 16bpp),
// the pixel loops are fully unrolled into 8 direct array stores
// with literal bit masks. This eliminates the sGlyphBit[] table
// lookup, the loop counter, and the loop branch -- saving ~3 cycles
// per pixel on a 486. The clipped path falls back to the table.
//
// Clip rectangle handling: All draw functions clip against
// d->clipX/Y/W/H (set by setClipRect in layer 1). The clipRect()
// helper is marked static inline so it compiles to straight-line
// compare-and-clamp code at each call site with no function call
// overhead. __builtin_expect hints mark the clipping branches as
// unlikely, helping the branch predictor on Pentium and later.
#include "dvxDraw.h"
#include "platform/dvxPlatform.h"
#include <string.h>
// ============================================================
// Prototypes
// ============================================================
// Externally visible: extracts the '&' accelerator character from a label.
char accelParse(const char *text);
// File-local: intersects the rect (*x, *y, *w, *h) with d's clip rect, in place.
static inline void clipRect(const DisplayT *d, int32_t *x, int32_t *y, int32_t *w, int32_t *h);
// File-local: presumably stores a single pixel of 'bpp' bytes at dst.
// NOTE(review): the definition is not in this part of the file -- confirm
// which bpp values it accepts.
static inline void putPixel(uint8_t *dst, uint32_t color, int32_t bpp);
// Bit lookup tables for glyph and mask rendering. On a 486, a variable
// shift (1 << (7 - col)) costs 4 cycles per bit position; a table
// lookup is a fixed 1-cycle load from L1. The 8-entry sGlyphBit table
// maps column index 0..7 to the corresponding bit mask in a 1bpp glyph
// byte (MSB-first, matching standard VGA/bitmap font layout). The
// 16-entry sMaskBit table does the same for 16-pixel-wide cursor/icon
// masks.
// Column -> bit mask for one 8-pixel glyph row (column 0 is bit 7).
static const uint8_t sGlyphBit[8] = {
    1u << 7, 1u << 6, 1u << 5, 1u << 4,
    1u << 3, 1u << 2, 1u << 1, 1u << 0
};

// Column -> bit mask for one 16-pixel cursor/icon row (column 0 is bit 15).
static const uint16_t sMaskBit[16] = {
    1u << 15, 1u << 14, 1u << 13, 1u << 12,
    1u << 11, 1u << 10, 1u << 9,  1u << 8,
    1u << 7,  1u << 6,  1u << 5,  1u << 4,
    1u << 3,  1u << 2,  1u << 1,  1u << 0
};
// ============================================================
// accelParse
// ============================================================
//
// Scans a menu/button label for the & accelerator marker and returns
// the character after it (lowercased). Follows the Windows/Motif
// convention: "&File" means Alt+F activates it, "&&" is a literal &.
// Returns 0 if no accelerator is found. The result is always
// lowercased so the WM can do a single case-insensitive compare
// against incoming Alt+key events.
char accelParse(const char *text) {
    if (text == NULL) {
        return 0;
    }

    for (const char *p = text; *p != '\0'; p++) {
        if (*p != '&') {
            continue;
        }

        const char next = p[1];

        if (next == '&') {
            // "&&" is an escaped literal ampersand: step over the pair
            // (the loop increment consumes the second '&').
            p++;
            continue;
        }

        if (next == '\0') {
            // A trailing '&' marks nothing.
            break;
        }

        // First real accelerator wins. Only ASCII upper-case letters are
        // folded to lower case; every other character is returned as-is.
        return (next >= 'A' && next <= 'Z') ? (char)(next + 32) : next;
    }

    return 0;
}
// ============================================================
// clipRect
// ============================================================
//
// Intersects a rectangle with the display's current clip rect,
// modifying the rect in place. If the rect is fully outside the
// clip region, w or h will be <= 0 and callers bail out.
//
// Marked static inline because this is called on every rectFill,
// rectCopy, and indirectly on every glyph -- it must compile to
// straight-line clamp instructions with zero call overhead.
// __builtin_expect(..., 0) marks clipping as unlikely; in the
// common case windows are fully within the clip rect and all
// four branches fall through untaken. On Pentium this keeps the
// branch predictor happy (static not-taken prediction for forward
// branches), and on 486 it at least avoids the taken-branch penalty.
static inline void clipRect(const DisplayT *d, int32_t *x, int32_t *y, int32_t *w, int32_t *h) {
    // Clip region as half-open edges [clipLeft, clipRight) x [clipTop, clipBottom).
    const int32_t clipLeft   = d->clipX;
    const int32_t clipTop    = d->clipY;
    const int32_t clipRight  = clipLeft + d->clipW;
    const int32_t clipBottom = clipTop + d->clipH;

    // Caller's rect in the same edge form.
    int32_t left   = *x;
    int32_t top    = *y;
    int32_t right  = left + *w;
    int32_t bottom = top + *h;

    // Clamp each edge inward; every clamp is hinted as the unlikely case.
    left   = __builtin_expect(left < clipLeft, 0)     ? clipLeft   : left;
    top    = __builtin_expect(top < clipTop, 0)       ? clipTop    : top;
    right  = __builtin_expect(right > clipRight, 0)   ? clipRight  : right;
    bottom = __builtin_expect(bottom > clipBottom, 0) ? clipBottom : bottom;

    // Back to origin + size. A rect with no overlap comes out with a
    // non-positive width and/or height.
    *x = left;
    *y = top;
    *w = right - left;
    *h = bottom - top;
}
// ============================================================
// drawBevel
// ============================================================
//
// Draws a Motif/DESQview-style beveled rectangular frame. The bevel
// creates the illusion of a raised or sunken 3D surface by drawing
// lighter "highlight" edges on the top and left, and darker "shadow"
// edges on the bottom and right. Swapping highlight and shadow gives
// a sunken appearance (see BEVEL_RAISED/BEVEL_SUNKEN macros in
// dvxTypes.h).
//
// BevelStyleT.width controls the border thickness. DV/X uses 2px
// bevels for most window chrome (matching the original DESQview/X
// and Motif look), 1px for inner borders and scrollbar elements.
//
// The implementation has special-cased fast paths for bw==2 and bw==1
// that emit exact spans via rectFill rather than looping. This
// matters because drawBevel is called for every window frame, button,
// menu, and scrollbar element on every repaint -- the loop overhead
// and extra rectFill calls in the general case add up. Each rectFill
// call already handles clipping internally, so the bevels clip
// correctly even when a window is partially off-screen.
//
// face==0 means "don't fill the interior", which is used for frame-only
// bevels where the content area is painted separately by a callback.
void drawBevel(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, const BevelStyleT *style) {
    const int32_t bw = style->width;
    // Outermost right column and bottom row of the frame.
    const int32_t xr = x + w - 1;
    const int32_t yb = y + h - 1;

    // Interior first; a face value of 0 means "frame only".
    if (style->face != 0) {
        rectFill(d, ops, x + bw, y + bw, w - 2 * bw, h - 2 * bw, style->face);
    }

    // Widths 1 and 2 are written out explicitly; the rectFill calls are
    // the same ones, in the same order, that the generic loops would make.
    switch (bw) {
        case 1:
            rectFill(d, ops, x,  y,     w, 1,     style->highlight);   // top
            rectFill(d, ops, x,  y + 1, 1, h - 1, style->highlight);   // left
            rectFill(d, ops, x,  yb,    w, 1,     style->shadow);      // bottom
            rectFill(d, ops, xr, y + 1, 1, h - 2, style->shadow);      // right
            break;

        case 2:
            // Top: outer row, then inner row
            rectFill(d, ops, x,     y,     w,     1, style->highlight);
            rectFill(d, ops, x + 1, y + 1, w - 2, 1, style->highlight);
            // Left: outer column, then inner column
            rectFill(d, ops, x,     y + 1, 1, h - 1, style->highlight);
            rectFill(d, ops, x + 1, y + 2, 1, h - 3, style->highlight);
            // Bottom: outer row, then inner row
            rectFill(d, ops, x,     yb,     w,     1, style->shadow);
            rectFill(d, ops, x + 1, yb - 1, w - 2, 1, style->shadow);
            // Right: outer column, then inner column
            rectFill(d, ops, xr,     y + 1, 1, h - 2, style->shadow);
            rectFill(d, ops, xr - 1, y + 2, 1, h - 4, style->shadow);
            break;

        default:
            // Any other width: one pass per edge, stepping one ring inward
            // per iteration. A width <= 0 draws no frame at all.
            for (int32_t ring = 0; ring < bw; ring++) {
                rectFill(d, ops, x + ring, y + ring, w - 2 * ring, 1, style->highlight);
            }

            for (int32_t ring = 0; ring < bw; ring++) {
                rectFill(d, ops, x + ring, y + ring + 1, 1, h - 2 * ring - 1, style->highlight);
            }

            for (int32_t ring = 0; ring < bw; ring++) {
                rectFill(d, ops, x + ring, yb - ring, w - 2 * ring, 1, style->shadow);
            }

            for (int32_t ring = 0; ring < bw; ring++) {
                rectFill(d, ops, xr - ring, y + ring + 1, 1, h - 2 * ring - 2, style->shadow);
            }
            break;
    }
}
// ============================================================
// drawChar
// ============================================================
//
// Renders a single fixed-width bitmap font character into the
// backbuffer. Returns the character advance width (always
// font->charWidth) so callers can accumulate cursor position.
//
// Font format: each glyph is charHeight bytes of 1bpp data, MSB-first
// (bit 7 = leftmost pixel). This is the standard VGA/PC BIOS font
// format. We use 8-pixel-wide glyphs exclusively because 8 bits fit
// in one byte per scanline, making the inner loop a single byte load
// plus 8 bit tests -- no multi-byte glyph row assembly needed.
//
// The function has six specialized code paths (3 bpp x 2 modes),
// chosen with if/else chains rather than function pointers. On 486
// and Pentium, an indirect call through a function pointer stalls the
// pipeline (no branch target buffer for indirect calls on 486, and
// a mandatory bubble on Pentium). The if/else chain resolves at the
// outer loop level (once per glyph, not per pixel), so the per-pixel
// inner code is branch-free within each path.
//
// Opaque vs transparent mode:
// opaque=true: Fills the entire character cell (bg then fg). Used
// for normal text where the background must overwrite
// whatever was previously in the cell.
// opaque=false: Only writes foreground pixels; background shows
// through. Used for overlay text on existing content.
//
// The "unclipped fast path" (colStart==0, colEnd==cw) avoids the
// sGlyphBit[] table lookup by testing literal bit masks directly.
// This matters because the table lookup involves an indexed load
// (base + index * element_size), while the literal mask is an
// immediate operand in the compare instruction. At 8 pixels per row
// and 14-16 rows per glyph, saving even 1 cycle per pixel adds up
// across a full screen of text (~6400 characters at 80x80).
int32_t drawChar(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, char ch, uint32_t fg, uint32_t bg, bool opaque) {
// Draws one glyph with its top-left corner at (x, y) in d->backBuf,
// clipped to d's clip rect, and returns font->charWidth as the advance
// (also when nothing was drawn).
//   ch     - character code; converted to uint8_t before the glyph lookup.
//   fg/bg  - pixel values; truncated to uint16_t at 2 bytes/pixel and to
//            uint8_t at 1 byte/pixel.
//   opaque - true: clear glyph bits are written as bg;
//            false: clear glyph bits leave the destination untouched.
// NOTE(review): the unrolled paths always store 8 pixels per row and
// sGlyphBit[] has 8 entries, so this code relies on font->charWidth == 8.
// Nothing here checks that -- confirm against the font loader.
int32_t cw = font->charWidth;
int32_t chh = font->charHeight;
// Quick reject: entirely outside clip rect
if (__builtin_expect(x + cw <= d->clipX || x >= d->clipX + d->clipW || y + chh <= d->clipY || y >= d->clipY + d->clipH, 0)) {
return cw;
}
// Map the character code to a glyph index. Codes the font does not
// cover draw as a blank cell: bg-filled when opaque, nothing otherwise.
int32_t idx = (uint8_t)ch - font->firstChar;
if (__builtin_expect(idx < 0 || idx >= font->numChars, 0)) {
if (opaque) {
rectFill(d, ops, x, y, cw, chh, bg);
}
return cw;
}
// One byte per glyph scanline, chh scanlines per glyph.
const uint8_t *glyph = font->glyphData + idx * chh;
int32_t bpp = ops->bytesPerPixel;
int32_t pitch = d->pitch;
// Calculate clipped row/col bounds once
int32_t clipX1 = d->clipX;
int32_t clipX2 = d->clipX + d->clipW;
int32_t clipY1 = d->clipY;
int32_t clipY2 = d->clipY + d->clipH;
// [rowStart, rowEnd) and [colStart, colEnd) are the visible part of the
// cell, in glyph-local coordinates. The quick reject above guarantees
// both ranges are non-empty.
int32_t rowStart = 0;
int32_t rowEnd = chh;
if (y < clipY1) { rowStart = clipY1 - y; }
if (y + chh > clipY2) { rowEnd = clipY2 - y; }
int32_t colStart = 0;
int32_t colEnd = cw;
if (x < clipX1) { colStart = clipX1 - x; }
if (x + cw > clipX2) { colEnd = clipX2 - x; }
// Unclipped fast path: when the character cell is horizontally inside
// the clip rect we can skip per-pixel clip checks and use the fully
// unrolled 8-store sequences below. This is the hot path for all
// text that isn't at the edge of a window. "Unclipped" is horizontal
// only -- vertical clipping is still applied through rowStart/rowEnd.
bool unclipped = (colStart == 0 && colEnd == cw);
if (opaque) {
// Opaque mode: every pixel in the cell gets written (fg or bg).
// The unclipped 32bpp and 16bpp paths write each row as 8 direct
// ternary stores with literal bit masks -- no loop, no table lookup.
// NOTE(review): whether these compile to branch-free code depends on
// the compiler and target (CMOV does not exist before the P6 family).
if (unclipped && bpp == 4) {
for (int32_t row = rowStart; row < rowEnd; row++) {
uint32_t *dst32 = (uint32_t *)(d->backBuf + (y + row) * pitch + x * 4);
uint8_t bits = glyph[row];
dst32[0] = (bits & 0x80) ? fg : bg;
dst32[1] = (bits & 0x40) ? fg : bg;
dst32[2] = (bits & 0x20) ? fg : bg;
dst32[3] = (bits & 0x10) ? fg : bg;
dst32[4] = (bits & 0x08) ? fg : bg;
dst32[5] = (bits & 0x04) ? fg : bg;
dst32[6] = (bits & 0x02) ? fg : bg;
dst32[7] = (bits & 0x01) ? fg : bg;
}
} else if (unclipped && bpp == 2) {
uint16_t fg16 = (uint16_t)fg;
uint16_t bg16 = (uint16_t)bg;
for (int32_t row = rowStart; row < rowEnd; row++) {
uint16_t *dst16 = (uint16_t *)(d->backBuf + (y + row) * pitch + x * 2);
uint8_t bits = glyph[row];
dst16[0] = (bits & 0x80) ? fg16 : bg16;
dst16[1] = (bits & 0x40) ? fg16 : bg16;
dst16[2] = (bits & 0x20) ? fg16 : bg16;
dst16[3] = (bits & 0x10) ? fg16 : bg16;
dst16[4] = (bits & 0x08) ? fg16 : bg16;
dst16[5] = (bits & 0x04) ? fg16 : bg16;
dst16[6] = (bits & 0x02) ? fg16 : bg16;
dst16[7] = (bits & 0x01) ? fg16 : bg16;
}
} else {
// Clipped path or 8bpp: spanFill the visible columns with bg,
// then iterate those columns with the sGlyphBit[] table to set
// fg pixels. 8bpp always lands here because there is no
// unrolled 8bpp variant.
for (int32_t row = rowStart; row < rowEnd; row++) {
int32_t py = y + row;
// bg fill starts at the first visible column...
uint8_t *dst = d->backBuf + py * pitch + (x + colStart) * bpp;
ops->spanFill(dst, bg, colEnd - colStart);
uint8_t bits = glyph[row];
if (bits == 0) {
continue;
}
// ...while fg stores index from the cell origin, so 'col'
// is both the glyph column and the pixel offset.
dst = d->backBuf + py * pitch + x * bpp;
if (bpp == 2) {
uint16_t fg16 = (uint16_t)fg;
for (int32_t col = colStart; col < colEnd; col++) {
if (bits & sGlyphBit[col]) {
*(uint16_t *)(dst + col * 2) = fg16;
}
}
} else if (bpp == 4) {
for (int32_t col = colStart; col < colEnd; col++) {
if (bits & sGlyphBit[col]) {
*(uint32_t *)(dst + col * 4) = fg;
}
}
} else {
uint8_t fg8 = (uint8_t)fg;
for (int32_t col = colStart; col < colEnd; col++) {
if (bits & sGlyphBit[col]) {
dst[col] = fg8;
}
}
}
}
}
} else {
// Transparent mode: only fg pixels are written; bg is untouched.
// Rows whose glyph byte is 0 are skipped outright, since there is
// nothing to write. (Opaque mode cannot skip them: blank rows
// still need their bg fill.)
if (unclipped && bpp == 4) {
for (int32_t row = rowStart; row < rowEnd; row++) {
uint8_t bits = glyph[row];
if (bits == 0) {
continue;
}
uint32_t *dst32 = (uint32_t *)(d->backBuf + (y + row) * pitch + x * 4);
if (bits & 0x80) { dst32[0] = fg; }
if (bits & 0x40) { dst32[1] = fg; }
if (bits & 0x20) { dst32[2] = fg; }
if (bits & 0x10) { dst32[3] = fg; }
if (bits & 0x08) { dst32[4] = fg; }
if (bits & 0x04) { dst32[5] = fg; }
if (bits & 0x02) { dst32[6] = fg; }
if (bits & 0x01) { dst32[7] = fg; }
}
} else if (unclipped && bpp == 2) {
uint16_t fg16 = (uint16_t)fg;
for (int32_t row = rowStart; row < rowEnd; row++) {
uint8_t bits = glyph[row];
if (bits == 0) {
continue;
}
uint16_t *dst16 = (uint16_t *)(d->backBuf + (y + row) * pitch + x * 2);
if (bits & 0x80) { dst16[0] = fg16; }
if (bits & 0x40) { dst16[1] = fg16; }
if (bits & 0x20) { dst16[2] = fg16; }
if (bits & 0x10) { dst16[3] = fg16; }
if (bits & 0x08) { dst16[4] = fg16; }
if (bits & 0x04) { dst16[5] = fg16; }
if (bits & 0x02) { dst16[6] = fg16; }
if (bits & 0x01) { dst16[7] = fg16; }
}
} else {
// Clipped path or 8bpp: table-driven, visible columns only.
for (int32_t row = rowStart; row < rowEnd; row++) {
uint8_t bits = glyph[row];
if (bits == 0) {
continue;
}
int32_t py = y + row;
uint8_t *dst = d->backBuf + py * pitch + x * bpp;
if (bpp == 2) {
uint16_t fg16 = (uint16_t)fg;
for (int32_t col = colStart; col < colEnd; col++) {
if (bits & sGlyphBit[col]) {
*(uint16_t *)(dst + col * 2) = fg16;
}
}
} else if (bpp == 4) {
for (int32_t col = colStart; col < colEnd; col++) {
if (bits & sGlyphBit[col]) {
*(uint32_t *)(dst + col * 4) = fg;
}
}
} else {
uint8_t fg8 = (uint8_t)fg;
for (int32_t col = colStart; col < colEnd; col++) {
if (bits & sGlyphBit[col]) {
dst[col] = fg8;
}
}
}
}
}
}
return cw;
}
// ============================================================
// drawTextN
// ============================================================
//
// Renders exactly 'count' characters from a buffer in one pass.
// Same idea as drawTermRow but for uniform fg/bg text runs.
// Avoids per-character function call overhead, redundant clip
// calculation, and spanFill startup costs.
//
// The key optimization over calling drawChar() in a loop is the
// bg fill strategy: in opaque mode, instead of calling spanFill
// once per character cell per row (count * charHeight spanFill
// calls), we fill the entire visible span's background in one
// spanFill per scanline (just charHeight calls total). Then we
// overlay only the fg glyph pixels. For an 80-column line this
// reduces spanFill calls from 80*16=1280 to just 16. Each
// spanFill maps to a single rep stosl, so we're also getting
// better write-combine utilization from the larger sequential
// stores.
//
// Horizontal clipping is done at the character level (firstChar/
// lastChar) to avoid iterating invisible characters, with per-pixel
// edge clipping only for the partially visible first and last chars.
void drawTextN(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, int32_t count, uint32_t fg, uint32_t bg, bool opaque) {
    if (count <= 0) {
        return;
    }

    const int32_t glyphW = font->charWidth;
    const int32_t glyphH = font->charHeight;
    const int32_t clipL  = d->clipX;
    const int32_t clipT  = d->clipY;
    const int32_t clipR  = clipL + d->clipW;
    const int32_t clipB  = clipT + d->clipH;
    const int32_t runW   = count * glyphW;

    // Nothing to do when the run misses the clip rect entirely.
    if (y >= clipB || y + glyphH <= clipT) {
        return;
    }

    if (x >= clipR || x + runW <= clipL) {
        return;
    }

    const int32_t bytesPP = ops->bytesPerPixel;
    const int32_t stride  = d->pitch;

    // Visible glyph scanlines: [topRow, endRow).
    const int32_t topRow = (y < clipT) ? clipT - y : 0;
    const int32_t endRow = (y + glyphH > clipB) ? clipB - y : glyphH;

    // Visible characters: [firstVis, endVis). Both ends may be cells that
    // are only partly inside the clip rect.
    const int32_t firstVis = (x < clipL) ? (clipL - x) / glyphW : 0;
    int32_t       endVis   = count;

    if (x + runW > clipR) {
        endVis = (clipR - x + glyphW - 1) / glyphW;

        if (endVis > count) {
            endVis = count;
        }
    }

    // Pixel columns of the first visible cell hidden behind the left clip edge.
    const int32_t firstCellX = x + firstVis * glyphW;
    const int32_t leadSkip   = (firstCellX < clipL) ? clipL - firstCellX : 0;

    if (opaque) {
        // One background span per scanline covers every visible cell at
        // once; the glyph pass below then only has to set fg pixels.
        int32_t bgX1 = firstCellX;
        int32_t bgX2 = x + endVis * glyphW;

        if (bgX1 < clipL) {
            bgX1 = clipL;
        }

        if (bgX2 > clipR) {
            bgX2 = clipR;
        }

        const int32_t bgW = bgX2 - bgX1;

        if (bgW > 0) {
            for (int32_t row = topRow; row < endRow; row++) {
                ops->spanFill(d->backBuf + (y + row) * stride + bgX1 * bytesPP, bg, bgW);
            }
        }
    }

    // Foreground pass, one character cell at a time.
    for (int32_t ci = firstVis; ci < endVis; ci++) {
        const int32_t glyphIdx = (uint8_t)text[ci] - font->firstChar;

        // Characters the font does not cover contribute no fg pixels.
        if (glyphIdx < 0 || glyphIdx >= font->numChars) {
            continue;
        }

        const uint8_t *glyph  = font->glyphData + glyphIdx * glyphH;
        const int32_t  cellX  = x + ci * glyphW;
        const int32_t  pxFrom = (ci == firstVis) ? leadSkip : 0;
        const int32_t  pxTo   = (cellX + glyphW > clipR) ? clipR - cellX : glyphW;

        // Pixel size is chosen once per glyph, outside the row/pixel loops.
        switch (bytesPP) {
            case 2: {
                const uint16_t ink = (uint16_t)fg;

                for (int32_t row = topRow; row < endRow; row++) {
                    const uint8_t bits = glyph[row];

                    if (bits != 0) {
                        uint16_t *line = (uint16_t *)(d->backBuf + (y + row) * stride + cellX * 2);

                        for (int32_t p = pxFrom; p < pxTo; p++) {
                            if (bits & sGlyphBit[p]) {
                                line[p] = ink;
                            }
                        }
                    }
                }
                break;
            }

            case 4: {
                for (int32_t row = topRow; row < endRow; row++) {
                    const uint8_t bits = glyph[row];

                    if (bits != 0) {
                        uint32_t *line = (uint32_t *)(d->backBuf + (y + row) * stride + cellX * 4);

                        for (int32_t p = pxFrom; p < pxTo; p++) {
                            if (bits & sGlyphBit[p]) {
                                line[p] = fg;
                            }
                        }
                    }
                }
                break;
            }

            default: {
                // Every other pixel size is handled as 1 byte per pixel.
                const uint8_t ink = (uint8_t)fg;

                for (int32_t row = topRow; row < endRow; row++) {
                    const uint8_t bits = glyph[row];

                    if (bits != 0) {
                        uint8_t *line = d->backBuf + (y + row) * stride + cellX;

                        for (int32_t p = pxFrom; p < pxTo; p++) {
                            if (bits & sGlyphBit[p]) {
                                line[p] = ink;
                            }
                        }
                    }
                }
                break;
            }
        }
    }
}
// ============================================================
// drawFocusRect
// ============================================================
//
// Draws a dotted (every-other-pixel) rectangle to indicate keyboard
// focus, matching the Windows/Motif convention. Uses putPixel per
// dot rather than spanFill because the alternating pattern can't be
// expressed as a span fill (which writes uniform color).
//
// The parity calculations on the bottom and right edges ensure the
// dot pattern is visually continuous around corners -- the starting
// pixel of each edge is offset so dots don't double up or gap at
// the corner where two edges meet.
//
// This is not performance-critical; focus rects are drawn at most
// once per focused widget per repaint.
void drawFocusRect(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
    // Plots the dotted outline of the w-by-h rect at (x, y) in 'color',
    // one putPixel per dot, skipping dots outside d's clip rect.
    //
    // An empty or inverted rect has no outline. Without this guard the
    // edge loops below would still plot stray dots: h == 0 would draw a
    // row at y and another at y - 1, and w <= 0 would draw columns at x
    // and x + w - 1.
    if (w <= 0 || h <= 0) {
        return;
    }

    int32_t bpp    = ops->bytesPerPixel;
    int32_t pitch  = d->pitch;
    int32_t clipX1 = d->clipX;
    int32_t clipX2 = d->clipX + d->clipW;
    int32_t clipY1 = d->clipY;
    int32_t clipY2 = d->clipY + d->clipH;

    // Inclusive right/bottom pixel of the outline.
    int32_t x2 = x + w - 1;
    int32_t y2 = y + h - 1;

    // Every edge follows the same checkerboard rule: a dot is plotted
    // where (px - x) + (py - y) is even. That is what keeps the pattern
    // continuous around the corners.

    // Top edge
    if (y >= clipY1 && y < clipY2) {
        for (int32_t px = x; px <= x2; px += 2) {
            if (px >= clipX1 && px < clipX2) {
                putPixel(d->backBuf + y * pitch + px * bpp, color, bpp);
            }
        }
    }

    // Bottom edge (skipped for a 1-pixel-tall rect, where it is the top edge)
    if (y2 >= clipY1 && y2 < clipY2 && y2 != y) {
        int32_t parity = (y2 - y) & 1;

        for (int32_t px = x + parity; px <= x2; px += 2) {
            if (px >= clipX1 && px < clipX2) {
                putPixel(d->backBuf + y2 * pitch + px * bpp, color, bpp);
            }
        }
    }

    // Left edge (rows strictly between top and bottom; the corner rows
    // belong to the horizontal edges)
    if (x >= clipX1 && x < clipX2) {
        for (int32_t py = y + 2; py < y2; py += 2) {
            if (py >= clipY1 && py < clipY2) {
                putPixel(d->backBuf + py * pitch + x * bpp, color, bpp);
            }
        }
    }

    // Right edge (skipped for a 1-pixel-wide rect, where it is the left edge)
    if (x2 >= clipX1 && x2 < clipX2 && x2 != x) {
        int32_t parity = (x2 - x) & 1;

        for (int32_t py = y + 2 - parity; py < y2; py += 2) {
            if (py >= clipY1 && py < clipY2) {
                putPixel(d->backBuf + py * pitch + x2 * bpp, color, bpp);
            }
        }
    }
}
// ============================================================
// drawHLine
// ============================================================
//
// Thin convenience wrapper -- a horizontal line is just a 1px-tall rect.
// Delegates to rectFill which handles clipping and uses spanFill (rep
// stosl) for the actual write.
void drawHLine(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, uint32_t color) {
    // A horizontal line is a w-wide rect exactly one scanline tall;
    // rectFill does the rest.
    const int32_t oneScanline = 1;

    rectFill(d, ops, x, y, w, oneScanline, color);
}
// ============================================================
// drawInit
// ============================================================
//
// Wires up the BlitOpsT function pointers to the correct
// platform-specific span operations for the active pixel format.
// Called once during startup after videoInit determines the bpp.
//
// The span ops are the only place where function pointers are used
// in the draw layer. This is a deliberate performance tradeoff:
// spanFill and spanCopy are called per-scanline (not per-pixel),
// so the indirect call overhead (~5 cycles on Pentium for the
// mispredicted first call, then predicted afterward) is amortized
// over an entire row of pixels. The alternative -- a switch inside
// rectFill's inner loop -- would branch every scanline for no gain.
//
// The platform implementations (dvxPlatformDos.c) use inline asm:
// spanFill8/16/32 -> rep stosl (fills 4 bytes per clock)
// spanCopy8/16/32 -> rep movsd (copies 4 bytes per clock)
// These are the fastest bulk memory operations available on 486/
// Pentium without SSE. The 8-bit and 16-bit variants handle
// alignment preambles to get to dword boundaries, then use
// rep stosl/movsd for the bulk.
void drawInit(BlitOpsT *ops, const DisplayT *d) {
    // Copy the display geometry the span users need.
    ops->bytesPerPixel = d->format.bytesPerPixel;
    ops->pitch         = d->pitch;

    // Bind the span routines that match the pixel size. Any size other
    // than 2 or 4 bytes gets the 8-bit routines, the same pair used for
    // 1 byte per pixel.
    if (d->format.bytesPerPixel == 4) {
        ops->spanFill = platformSpanFill32;
        ops->spanCopy = platformSpanCopy32;
    } else if (d->format.bytesPerPixel == 2) {
        ops->spanFill = platformSpanFill16;
        ops->spanCopy = platformSpanCopy16;
    } else {
        ops->spanFill = platformSpanFill8;
        ops->spanCopy = platformSpanCopy8;
    }
}
// ============================================================
// drawMaskedBitmap
// ============================================================
//
// Renders a 1-bit masked bitmap (used for mouse cursors and icons).
// The two-plane format mirrors the hardware cursor format used by
// VGA and early SVGA cards:
//
// andMask bit=1, xorData bit=X -> transparent (pixel unchanged)
// andMask bit=0, xorData bit=0 -> bgColor
// andMask bit=0, xorData bit=1 -> fgColor
//
// Each row is a uint16_t (supporting up to 16 pixels wide), stored
// MSB-first. This is sufficient for standard 16x16 mouse cursors.
//
// The colMask optimization pre-computes which bits in each row fall
// within the visible (clipped) columns. For fully transparent rows
// (all visible bits have andMask=1), the entire row is skipped with
// a single bitwise AND + compare -- no per-pixel iteration needed.
void drawMaskedBitmap(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, const uint16_t *andMask, const uint16_t *xorData, uint32_t fgColor, uint32_t bgColor) {
    // Draws an AND/XOR masked 1-bit bitmap with its top-left at (x, y),
    // clipped to d's clip rect.
    //   w, h             - bitmap size in pixels; h is also the number of
    //                      entries read from andMask[] and xorData[].
    //   andMask, xorData - one uint16_t per row, leftmost pixel in bit 15.
    //   fgColor, bgColor - pixel values for opaque pixels; truncated to
    //                      the destination pixel width.
    //
    // A row holds exactly 16 bits and sMaskBit[] has exactly 16 entries,
    // so columns past 15 carry no data. Widths above 16 are clamped here;
    // previously they indexed sMaskBit[] out of bounds.
    const int32_t maxCols = 16;

    int32_t bpp   = ops->bytesPerPixel;
    int32_t pitch = d->pitch;

    // Pre-clip row/col bounds
    int32_t clipX1 = d->clipX;
    int32_t clipX2 = d->clipX + d->clipW;
    int32_t clipY1 = d->clipY;
    int32_t clipY2 = d->clipY + d->clipH;

    int32_t rowStart = 0;
    int32_t rowEnd   = h;

    if (y < clipY1) { rowStart = clipY1 - y; }
    if (y + h > clipY2) { rowEnd = clipY2 - y; }

    int32_t colStart = 0;
    int32_t colEnd   = (w > maxCols) ? maxCols : w;

    if (x < clipX1) { colStart = clipX1 - x; }
    if (x + colEnd > clipX2) { colEnd = clipX2 - x; }

    // Fully clipped, or an empty/negative size.
    if (colStart >= colEnd || rowStart >= rowEnd) {
        return;
    }

    // Pre-compute the mask of visible columns once (loop-invariant)
    uint16_t colMask = 0;

    for (int32_t col = colStart; col < colEnd; col++) {
        colMask |= sMaskBit[col];
    }

    for (int32_t row = rowStart; row < rowEnd; row++) {
        uint16_t mask = andMask[row];
        uint16_t data = xorData[row];

        // Skip rows whose visible pixels are all transparent
        if ((mask & colMask) == colMask) {
            continue;
        }

        int32_t  py  = y + row;
        uint8_t *dst = d->backBuf + py * pitch + x * bpp;

        // andMask bit set   -> transparent, destination untouched
        // andMask bit clear -> xorData bit selects fg (1) or bg (0)
        if (bpp == 2) {
            uint16_t fg16 = (uint16_t)fgColor;
            uint16_t bg16 = (uint16_t)bgColor;

            for (int32_t col = colStart; col < colEnd; col++) {
                uint16_t bit = sMaskBit[col];

                if (!(mask & bit)) {
                    *(uint16_t *)(dst + col * 2) = (data & bit) ? fg16 : bg16;
                }
            }
        } else if (bpp == 4) {
            for (int32_t col = colStart; col < colEnd; col++) {
                uint16_t bit = sMaskBit[col];

                if (!(mask & bit)) {
                    *(uint32_t *)(dst + col * 4) = (data & bit) ? fgColor : bgColor;
                }
            }
        } else {
            uint8_t fg8 = (uint8_t)fgColor;
            uint8_t bg8 = (uint8_t)bgColor;

            for (int32_t col = colStart; col < colEnd; col++) {
                uint16_t bit = sMaskBit[col];

                if (!(mask & bit)) {
                    dst[col] = (data & bit) ? fg8 : bg8;
                }
            }
        }
    }
}
// ============================================================
// drawTermRow
// ============================================================
//
// Renders an entire row of terminal character cells in one pass.
// lineData points to (ch, attr) pairs. palette is a 16-entry
// packed-color table. This avoids per-character function call
// overhead, redundant clip calculation, and spanFill startup
// costs that make drawChar expensive when called 80x per row.
//
// This is the primary rendering function for the terminal emulator.
// The attribute byte uses the standard CGA/VGA format:
// bits 0-3: foreground color (0-15)
// bits 4-6: background color (0-7)
// bit 7: blink flag
//
// Unlike drawTextN (which handles uniform fg/bg), every cell here
// can have a different fg/bg pair, so the bg can't be filled in a
// single bulk pass. Instead each cell is rendered individually,
// always in opaque mode (every pixel gets a write). The bpp branch
// is still hoisted outside the per-pixel loop -- each cell selects
// its bpp path once, then iterates scanlines and pixels within it.
//
// blinkVisible controls the blink phase: when false, fg is replaced
// with bg for characters that have bit 7 set, effectively hiding them.
// cursorCol specifies which cell (if any) should be drawn with
// inverted fg/bg to show the text cursor.
void drawTermRow(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, int32_t cols, const uint8_t *lineData, const uint32_t *palette, bool blinkVisible, int32_t cursorCol) {
    const int32_t cellW = font->charWidth;
    const int32_t cellH = font->charHeight;
    const int32_t clipL = d->clipX;
    const int32_t clipT = d->clipY;
    const int32_t clipR = clipL + d->clipW;
    const int32_t clipB = clipT + d->clipH;
    const int32_t spanW = cols * cellW;

    // Nothing to do when the row misses the clip rect entirely.
    if (y >= clipB || y + cellH <= clipT) {
        return;
    }

    if (x >= clipR || x + spanW <= clipL) {
        return;
    }

    const int32_t bytesPP = ops->bytesPerPixel;
    const int32_t stride  = d->pitch;

    // Visible glyph scanlines: [topRow, endRow).
    const int32_t topRow = (y < clipT) ? clipT - y : 0;
    const int32_t endRow = (y + cellH > clipB) ? clipB - y : cellH;

    // Visible cells: [firstCell, endCell).
    const int32_t firstCell = (x < clipL) ? (clipL - x) / cellW : 0;
    int32_t       endCell   = cols;

    if (x + spanW > clipR) {
        endCell = (clipR - x + cellW - 1) / cellW;

        if (endCell > cols) {
            endCell = cols;
        }
    }

    // Pixel columns of the first visible cell hidden behind the left clip edge.
    const int32_t firstCellX = x + firstCell * cellW;
    const int32_t leadSkip   = (firstCellX < clipL) ? clipL - firstCellX : 0;

    for (int32_t col = firstCell; col < endCell; col++) {
        // Each cell is a (character, attribute) byte pair.
        const uint8_t *cell = &lineData[col * 2];
        const uint8_t  code = cell[0];
        const uint8_t  attr = cell[1];

        // Attribute: low nibble = fg index, bits 4-6 = bg index, bit 7 = blink.
        const uint32_t palFg  = palette[attr & 0x0F];
        const uint32_t palBg  = palette[(attr >> 4) & 0x07];
        const bool     hidden = (attr & 0x80) && !blinkVisible;

        // A blinking cell in its "off" phase draws its glyph in the
        // background colour.
        const uint32_t inkBase = hidden ? palBg : palFg;

        // The cursor cell swaps the two colours (after blink handling).
        const bool     atCursor = (col == cursorCol);
        const uint32_t fg       = atCursor ? palBg : inkBase;
        const uint32_t bg       = atCursor ? inkBase : palBg;

        const int32_t cellX  = x + col * cellW;
        const int32_t pxFrom = (col == firstCell) ? leadSkip : 0;
        const int32_t pxTo   = (cellX + cellW > clipR) ? clipR - cellX : cellW;

        // Codes the font does not cover render as a blank (all-bg) cell.
        const int32_t  glyphIdx = (uint8_t)code - font->firstChar;
        const uint8_t *glyph    = (glyphIdx >= 0 && glyphIdx < font->numChars)
                                      ? font->glyphData + glyphIdx * cellH
                                      : NULL;

        // Pixel size is chosen once per cell; every visible pixel of the
        // cell is written, either fg or bg.
        switch (bytesPP) {
            case 2: {
                const uint16_t ink   = (uint16_t)fg;
                const uint16_t paper = (uint16_t)bg;

                for (int32_t row = topRow; row < endRow; row++) {
                    uint16_t     *line = (uint16_t *)(d->backBuf + (y + row) * stride + cellX * 2);
                    const uint8_t bits = glyph ? glyph[row] : 0;

                    for (int32_t p = pxFrom; p < pxTo; p++) {
                        line[p] = (bits & sGlyphBit[p]) ? ink : paper;
                    }
                }
                break;
            }

            case 4: {
                for (int32_t row = topRow; row < endRow; row++) {
                    uint32_t     *line = (uint32_t *)(d->backBuf + (y + row) * stride + cellX * 4);
                    const uint8_t bits = glyph ? glyph[row] : 0;

                    for (int32_t p = pxFrom; p < pxTo; p++) {
                        line[p] = (bits & sGlyphBit[p]) ? fg : bg;
                    }
                }
                break;
            }

            default: {
                // Every other pixel size is handled as 1 byte per pixel.
                const uint8_t ink   = (uint8_t)fg;
                const uint8_t paper = (uint8_t)bg;

                for (int32_t row = topRow; row < endRow; row++) {
                    uint8_t      *line = d->backBuf + (y + row) * stride + cellX;
                    const uint8_t bits = glyph ? glyph[row] : 0;

                    for (int32_t p = pxFrom; p < pxTo; p++) {
                        line[p] = (bits & sGlyphBit[p]) ? ink : paper;
                    }
                }
                break;
            }
        }
    }
}
// ============================================================
// drawText
// ============================================================
//
// Draws a NUL-terminated string one glyph at a time through drawChar.
// Every drawChar call repeats its own clipping and bpp dispatch, so
// this routine is intended for short labels and one-off strings;
// bulk text (editor buffers, list views, ...) should go through
// drawTextN instead.
//
// Horizontal culling happens here, before drawChar is reached:
//   - a cell that ends at or before d->clipX is skipped and the pen
//     just advances by font->charWidth;
//   - once the pen reaches the right clip edge the walk stops,
//     because nothing further along the string can be visible.
// Both situations are hinted as rare with __builtin_expect; the
// expected case is text that sits fully inside the clip rect.
//
// For glyphs that are drawn, the pen advances by drawChar's return
// value.
void drawText(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, uint32_t fg, uint32_t bg, bool opaque) {
    const int32_t glyphW    = font->charWidth;
    const int32_t clipRight = d->clipX + d->clipW;

    for (const char *p = text; *p != '\0'; p++) {
        // Pen is at or beyond the right clip edge: we are done.
        if (__builtin_expect(x >= clipRight, 0)) {
            return;
        }

        if (__builtin_expect(x + glyphW > d->clipX, 1)) {
            // At least part of the cell is right of the left clip edge.
            x += drawChar(d, ops, font, x, y, *p, fg, bg, opaque);
        } else {
            // Cell is entirely left of the clip rect: advance without drawing.
            x += glyphW;
        }
    }
}
// ============================================================
// drawTextAccel
// ============================================================
//
// drawText variant that understands '&' accelerator markers, following
// the Windows/Motif convention for menu and button labels:
//
//   "&x"  draws x and underlines it (the marker itself is not drawn)
//   "&&"  draws one literal '&', no underline
//   "&"   as the very last character ends the string; nothing is drawn
//
// The underline is a 1px drawHLine in the fg color on the bottom row
// of the character cell (y + charHeight - 1), one cell wide.
//
// Every drawn cell advances the pen by font->charWidth. Cells that end
// at or before d->clipX are skipped (pen still advances), and the walk
// stops as soon as the pen reaches the right clip edge.
void drawTextAccel(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, uint32_t fg, uint32_t bg, bool opaque) {
    const int32_t glyphW    = font->charWidth;
    const int32_t clipRight = d->clipX + d->clipW;

    while (*text != '\0') {
        if (__builtin_expect(x >= clipRight, 0)) {
            return;
        }

        char glyph     = *text++;
        bool underline = false;

        if (glyph == '&') {
            // The marker consumes itself; what follows decides the cell.
            glyph = *text;
            if (glyph == '\0') {
                // Dangling '&' at end of string: nothing left to draw.
                return;
            }
            text++;
            // "&&" is an escaped ampersand; anything else is the accelerator.
            underline = (glyph != '&');
        }

        if (x + glyphW > d->clipX) {
            drawChar(d, ops, font, x, y, glyph, fg, bg, opaque);
            if (underline) {
                drawHLine(d, ops, x, y + font->charHeight - 1, glyphW, fg);
            }
        }

        x += glyphW;
    }
}
// ============================================================
// drawVLine
// ============================================================
//
// Draws a 1px-wide vertical line covering rows [y, y + h) in column x,
// clipped to the display clip rect.
//
// Each pixel of a vertical line lives on a different scanline, so a
// contiguous span fill is of no use here. The line is written one
// pixel at a time, stepping the destination pointer by d->pitch, with
// the pixel size selected once before the loop rather than per pixel.
//
// ops is accepted only so the signature matches the rest of the draw
// layer; it is deliberately unused.
void drawVLine(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t h, uint32_t color) {
    (void)ops;

    // Column outside the horizontal clip window: nothing to draw.
    if (__builtin_expect(x < d->clipX || x >= d->clipX + d->clipW, 0)) {
        return;
    }

    // Clamp the half-open row range [top, bottom) to the vertical clip window.
    int32_t top    = y;
    int32_t bottom = y + h;

    if (top < d->clipY) {
        top = d->clipY;
    }

    if (bottom > d->clipY + d->clipH) {
        bottom = d->clipY + d->clipH;
    }

    if (top >= bottom) {
        return;
    }

    const int32_t bytesPerPx = d->format.bytesPerPixel;
    uint8_t      *px         = d->backBuf + top * d->pitch + x * bytesPerPx;
    const int32_t stride     = d->pitch;
    int32_t       rowsLeft   = bottom - top;

    switch (bytesPerPx) {
        case 2: {
            const uint16_t c16 = (uint16_t)color;

            while (rowsLeft-- > 0) {
                *(uint16_t *)px = c16;
                px += stride;
            }
            break;
        }

        case 4:
            while (rowsLeft-- > 0) {
                *(uint32_t *)px = color;
                px += stride;
            }
            break;

        default: {
            // Any other depth is treated as one byte per pixel.
            const uint8_t c8 = (uint8_t)color;

            while (rowsLeft-- > 0) {
                *px = c8;
                px += stride;
            }
            break;
        }
    }
}
// ============================================================
// putPixel
// ============================================================
//
// Stores one pixel at a destination address the caller has already
// computed. bpp is the pixel size in bytes: 2 stores the low 16 bits
// of color, 4 stores all 32 bits, and any other value stores just the
// low byte.
//
// static inline so the store can be emitted directly at the call site.
// The file notes drawFocusRect (alternating dot pattern) as its only
// user; the per-call bpp dispatch is acceptable there because focus
// rects are drawn infrequently.
static inline void putPixel(uint8_t *dst, uint32_t color, int32_t bpp) {
    switch (bpp) {
        case 2:
            *(uint16_t *)dst = (uint16_t)color;
            break;

        case 4:
            *(uint32_t *)dst = color;
            break;

        default:
            *dst = (uint8_t)color;
            break;
    }
}
// ============================================================
// rectCopy
// ============================================================
//
// Blits a w x h pixel rectangle from a caller-supplied buffer into the
// display backbuffer. The compositor uses this to paste per-window
// content buffers (win->contentBuf) into the shared backbuffer during
// the composite pass.
//
// The destination rect is clipped against the display clip rect; the
// source origin is then shifted by exactly the amount the destination
// origin moved, so the surviving pixels still line up with the right
// source pixels. The source rect itself is not bounds-checked here.
//
// When a row's byte count equals both the source and destination
// pitch, the rows are back-to-back in memory on both sides and the
// whole block goes out in one memcpy. Otherwise each row is copied
// separately.
//
// memcpy is used, so source and destination must not overlap. That
// holds for the intended use: a window content buffer is never the
// backbuffer.
void rectCopy(DisplayT *d, const BlitOpsT *ops, int32_t dstX, int32_t dstY, const uint8_t *srcBuf, int32_t srcPitch, int32_t srcX, int32_t srcY, int32_t w, int32_t h) {
    const int32_t bytesPerPx = ops->bytesPerPixel;

    // Remember the requested origin so the clip displacement can be measured.
    const int32_t reqX = dstX;
    const int32_t reqY = dstY;

    clipRect(d, &dstX, &dstY, &w, &h);

    if (__builtin_expect(w <= 0 || h <= 0, 0)) {
        return;
    }

    // Shift the source origin by however far clipping moved the destination.
    srcX += dstX - reqX;
    srcY += dstY - reqY;

    const uint8_t *from      = srcBuf + srcY * srcPitch + srcX * bytesPerPx;
    uint8_t       *to        = d->backBuf + dstY * d->pitch + dstX * bytesPerPx;
    const int32_t  spanBytes = w * bytesPerPx;
    const int32_t  dstStride = d->pitch;

    // Rows are contiguous on both sides: one bulk copy covers everything.
    if (spanBytes == dstStride && spanBytes == srcPitch) {
        memcpy(to, from, spanBytes * h);
        return;
    }

    for (int32_t rowsLeft = h; rowsLeft > 0; rowsLeft--) {
        memcpy(to, from, spanBytes);
        from += srcPitch;
        to   += dstStride;
    }
}
// ============================================================
// rectFill
// ============================================================
//
// Solid-fills a rectangle in the backbuffer. The rect is clipped to
// the display clip rect, and each surviving scanline is handed to the
// bpp-specific ops->spanFill routine (rep stosl on DOS).
//
// This is the most heavily used primitive in the draw layer: besides
// direct callers it sits underneath drawHLine, the interior fills of
// drawBevel, and the background fill of opaque text.
//
// Clip-then-bail is the shared shape of every draw function. A rect
// that clips away to nothing is hinted as the unlikely outcome so the
// visible case does not pay for a taken branch.
void rectFill(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
    clipRect(d, &x, &y, &w, &h);

    if (__builtin_expect(w <= 0 || h <= 0, 0)) {
        return;
    }

    uint8_t      *span   = d->backBuf + y * d->pitch + x * d->format.bytesPerPixel;
    const int32_t stride = d->pitch;

    // One span fill per scanline, stepping down by the backbuffer pitch.
    for (int32_t rowsLeft = h; rowsLeft > 0; rowsLeft--, span += stride) {
        ops->spanFill(span, color, w);
    }
}
// ============================================================
// textWidth
// ============================================================
//
// Pixel width of a NUL-terminated string. Fonts are fixed-width, so
// the result is simply (number of characters) * font->charWidth. The
// width is accumulated during a single walk over the string instead
// of calling strlen and multiplying afterwards.
//
// Used heavily by layout code (centering button text, sizing menu
// popups, and so on).
int32_t textWidth(const BitmapFontT *font, const char *text) {
    int32_t total = 0;

    for (const char *p = text; *p != '\0'; p++) {
        total += font->charWidth;
    }

    return total;
}
// ============================================================
// textWidthAccel
// ============================================================
//
// Pixel width of a label that may contain '&' accelerator markers,
// i.e. the width drawTextAccel will actually occupy:
//
//   "&x"  one cell  (the marker is not rendered, x is)
//   "&&"  one cell  (escaped literal '&')
//   "&"   at the very end of the string contributes nothing and ends
//         the measurement
//
// Every other character is one cell of font->charWidth pixels. Used to
// size menu items and button labels that carry accelerator markers.
int32_t textWidthAccel(const BitmapFontT *font, const char *text) {
    int32_t     total = 0;
    const char *p     = text;

    while (*p != '\0') {
        if (*p == '&') {
            // The marker has no width of its own; look at what follows.
            p++;
            if (*p == '\0') {
                // Dangling '&' at end of string.
                break;
            }
        }

        // Plain character, accelerator character, or escaped '&': one cell.
        total += font->charWidth;
        p++;
    }

    return total;
}