// dvx_draw.c -- Layer 2: Drawing primitives for DVX GUI (optimized) // // This is the second layer of the DVX compositor stack, sitting on top // of dvxVideo (layer 1) and below dvxComp (layer 3). It provides all // rasterization primitives: filled rects, buffer copies, beveled // frames, bitmap font text, masked bitmaps (cursors/icons), and // single-pixel operations. // // Every function here draws into the system-RAM backbuffer (d->backBuf), // never directly to the LFB. The compositor layer is responsible for // flushing changed regions to the hardware framebuffer via rep movsd. // This separation means draw operations benefit from CPU cache (the // backbuffer lives in cacheable system RAM) while LFB writes are // batched into large sequential bursts. // // Performance strategy overview: // // The core tension on 486/Pentium is between generality and speed. // The draw layer resolves this with a two-tier approach: // // 1) Span operations (spanFill/spanCopy) are dispatched through // function pointers in BlitOpsT, set once at init based on bpp. // The platform implementations use rep stosl/rep movsd inline asm // for maximum throughput (the 486 executes rep stosl at 1 dword // per clock after startup; the Pentium pairs it in the U-pipe). // Using function pointers here costs one indirect call per span // but avoids a bpp switch in the inner loop of rectFill, which // would otherwise be a branch per scanline. // // 2) Character rendering (drawChar, drawTextN, drawTermRow) uses // explicit if/else chains on bpp rather than function pointers. // This is deliberate: the per-pixel work inside glyph rendering // is a tight bit-test loop where an indirect call per pixel would // be catastrophic, and the bpp branch is taken once per glyph row // (hoisted out of the pixel loop). The compiler can also inline // the pixel store when the bpp is a compile-time constant within // each branch. 
// // 3) For the most critical glyph paths (unclipped 32bpp and 16bpp), // the pixel loops are fully unrolled into 8 direct array stores // with literal bit masks. This eliminates the sGlyphBit[] table // lookup, the loop counter, and the loop branch -- saving ~3 cycles // per pixel on a 486. The clipped path falls back to the table. // // Clip rectangle handling: All draw functions clip against // d->clipX/Y/W/H (set by setClipRect in layer 1). The clipRect() // helper is marked static inline so it compiles to straight-line // compare-and-clamp code at each call site with no function call // overhead. __builtin_expect hints mark the clipping branches as // unlikely, helping the branch predictor on Pentium and later. #include "dvxDraw.h" #include "platform/dvxPlatform.h" #include // ============================================================ // Prototypes // ============================================================ char accelParse(const char *text); static inline void clipRect(const DisplayT *d, int32_t *x, int32_t *y, int32_t *w, int32_t *h); static inline void putPixel(uint8_t *dst, uint32_t color, int32_t bpp); // Bit lookup tables for glyph and mask rendering. On a 486, a variable // shift (1 << (7 - col)) costs 4 cycles per bit position; a table // lookup is a fixed 1-cycle load from L1. The 8-entry sGlyphBit table // maps column index 0..7 to the corresponding bit mask in a 1bpp glyph // byte (MSB-first, matching standard VGA/bitmap font layout). The // 16-entry sMaskBit table does the same for 16-pixel-wide cursor/icon // masks. 
static const uint8_t sGlyphBit[8] = {
    0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
};

static const uint16_t sMaskBit[16] = {
    0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
    0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001
};


// ============================================================
// accelParse
// ============================================================
//
// Finds the keyboard accelerator in a label that uses the '&' marker
// convention ("&File" -> 'f'). A doubled "&&" stands for a literal
// ampersand and is skipped without being treated as a marker.
//
// Returns the character that follows the first unescaped '&'.
// Uppercase ASCII letters are folded to lowercase; every other
// character (digits, punctuation, spaces, ...) is returned unchanged.
// Returns 0 when text is NULL, when the label contains no marker, or
// when the marker is the very last character of the label.

char accelParse(const char *text) {
    if (!text) {
        return 0;
    }

    for (const char *p = text; *p != '\0'; p++) {
        if (*p != '&') {
            continue;
        }

        char next = p[1];

        if (next == '&') {
            // Escaped "&&": step onto the second '&'; the loop
            // increment then moves past it.
            p++;
            continue;
        }

        if (next == '\0') {
            // Dangling '&' at the end of the label -- no accelerator.
            return 0;
        }

        if (next >= 'A' && next <= 'Z') {
            return (char)(next + 32);
        }

        return next;
    }

    return 0;
}


// ============================================================
// clipRect
// ============================================================
//
// Intersects a rectangle with the display's current clip rect,
// modifying the rect in place. If the rect is fully outside the
// clip region, w or h will be <= 0 and callers bail out.
//
// Marked static inline because this is called on every rectFill,
// rectCopy, and indirectly on every glyph -- it must compile to
// straight-line clamp instructions with zero call overhead.
// __builtin_expect(..., 0) marks clipping as unlikely; in the
// common case windows are fully within the clip rect and all
// four branches fall through untaken.
On Pentium this keeps the // branch predictor happy (static not-taken prediction for forward // branches), and on 486 it at least avoids the taken-branch penalty. static inline void clipRect(const DisplayT *d, int32_t *x, int32_t *y, int32_t *w, int32_t *h) { int32_t cx2 = d->clipX + d->clipW; int32_t cy2 = d->clipY + d->clipH; int32_t rx1 = *x; int32_t ry1 = *y; int32_t rx2 = rx1 + *w; int32_t ry2 = ry1 + *h; if (__builtin_expect(rx1 < d->clipX, 0)) { rx1 = d->clipX; } if (__builtin_expect(ry1 < d->clipY, 0)) { ry1 = d->clipY; } if (__builtin_expect(rx2 > cx2, 0)) { rx2 = cx2; } if (__builtin_expect(ry2 > cy2, 0)) { ry2 = cy2; } *x = rx1; *y = ry1; *w = rx2 - rx1; *h = ry2 - ry1; } // ============================================================ // drawBevel // ============================================================ // // Draws a Motif/DESQview-style beveled rectangular frame. The bevel // creates the illusion of a raised or sunken 3D surface by drawing // lighter "highlight" edges on the top and left, and darker "shadow" // edges on the bottom and right. Swapping highlight and shadow gives // a sunken appearance (see BEVEL_RAISED/BEVEL_SUNKEN macros in // dvxTypes.h). // // BevelStyleT.width controls the border thickness. DV/X uses 2px // bevels for most window chrome (matching the original DESQview/X // and Motif look), 1px for inner borders and scrollbar elements. // // The implementation has special-cased fast paths for bw==2 and bw==1 // that emit exact spans via rectFill rather than looping. This // matters because drawBevel is called for every window frame, button, // menu, and scrollbar element on every repaint -- the loop overhead // and extra rectFill calls in the general case add up. Each rectFill // call already handles clipping internally, so the bevels clip // correctly even when a window is partially off-screen. 
// // face==0 means "don't fill the interior", which is used for frame-only // bevels where the content area is painted separately by a callback. void drawBevel(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, const BevelStyleT *style) { int32_t bw = style->width; // Fill interior if requested if (style->face != 0) { rectFill(d, ops, x + bw, y + bw, w - bw * 2, h - bw * 2, style->face); } // Fast path for the common bevel widths (1 and 2) // Directly emit spans instead of calling drawHLine->rectFill->clipRect per line if (bw == 2) { // Top 2 highlight lines rectFill(d, ops, x, y, w, 1, style->highlight); rectFill(d, ops, x + 1, y + 1, w - 2, 1, style->highlight); // Left 2 highlight columns rectFill(d, ops, x, y + 1, 1, h - 1, style->highlight); rectFill(d, ops, x + 1, y + 2, 1, h - 3, style->highlight); // Bottom 2 shadow lines rectFill(d, ops, x, y + h - 1, w, 1, style->shadow); rectFill(d, ops, x + 1, y + h - 2, w - 2, 1, style->shadow); // Right 2 shadow columns rectFill(d, ops, x + w - 1, y + 1, 1, h - 2, style->shadow); rectFill(d, ops, x + w - 2, y + 2, 1, h - 4, style->shadow); } else if (bw == 1) { rectFill(d, ops, x, y, w, 1, style->highlight); rectFill(d, ops, x, y + 1, 1, h - 1, style->highlight); rectFill(d, ops, x, y + h - 1, w, 1, style->shadow); rectFill(d, ops, x + w - 1, y + 1, 1, h - 2, style->shadow); } else { for (int32_t i = 0; i < bw; i++) { rectFill(d, ops, x + i, y + i, w - i * 2, 1, style->highlight); } for (int32_t i = 0; i < bw; i++) { rectFill(d, ops, x + i, y + i + 1, 1, h - i * 2 - 1, style->highlight); } for (int32_t i = 0; i < bw; i++) { rectFill(d, ops, x + i, y + h - 1 - i, w - i * 2, 1, style->shadow); } for (int32_t i = 0; i < bw; i++) { rectFill(d, ops, x + w - 1 - i, y + i + 1, 1, h - i * 2 - 2, style->shadow); } } } // ============================================================ // drawChar // ============================================================ // // Renders a single fixed-width bitmap 
font character into the // backbuffer. Returns the character advance width (always // font->charWidth) so callers can accumulate cursor position. // // Font format: each glyph is charHeight bytes of 1bpp data, MSB-first // (bit 7 = leftmost pixel). This is the standard VGA/PC BIOS font // format. We use 8-pixel-wide glyphs exclusively because 8 bits fit // in one byte per scanline, making the inner loop a single byte load // plus 8 bit tests -- no multi-byte glyph row assembly needed. // // The function has six specialized code paths (3 bpp x 2 modes), // chosen with if/else chains rather than function pointers. On 486 // and Pentium, an indirect call through a function pointer stalls the // pipeline (no branch target buffer for indirect calls on 486, and // a mandatory bubble on Pentium). The if/else chain resolves at the // outer loop level (once per glyph, not per pixel), so the per-pixel // inner code is branch-free within each path. // // Opaque vs transparent mode: // opaque=true: Fills the entire character cell (bg then fg). Used // for normal text where the background must overwrite // whatever was previously in the cell. // opaque=false: Only writes foreground pixels; background shows // through. Used for overlay text on existing content. // // The "unclipped fast path" (colStart==0, colEnd==cw) avoids the // sGlyphBit[] table lookup by testing literal bit masks directly. // This matters because the table lookup involves an indexed load // (base + index * element_size), while the literal mask is an // immediate operand in the compare instruction. At 8 pixels per row // and 14-16 rows per glyph, saving even 1 cycle per pixel adds up // across a full screen of text (~6400 characters at 80x80). 
int32_t drawChar(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, char ch, uint32_t fg, uint32_t bg, bool opaque) { int32_t cw = font->charWidth; int32_t chh = font->charHeight; // Quick reject: entirely outside clip rect if (__builtin_expect(x + cw <= d->clipX || x >= d->clipX + d->clipW || y + chh <= d->clipY || y >= d->clipY + d->clipH, 0)) { return cw; } int32_t idx = (uint8_t)ch - font->firstChar; if (__builtin_expect(idx < 0 || idx >= font->numChars, 0)) { if (opaque) { rectFill(d, ops, x, y, cw, chh, bg); } return cw; } const uint8_t *glyph = font->glyphData + idx * chh; int32_t bpp = ops->bytesPerPixel; int32_t pitch = d->pitch; // Calculate clipped row/col bounds once int32_t clipX1 = d->clipX; int32_t clipX2 = d->clipX + d->clipW; int32_t clipY1 = d->clipY; int32_t clipY2 = d->clipY + d->clipH; int32_t rowStart = 0; int32_t rowEnd = chh; if (y < clipY1) { rowStart = clipY1 - y; } if (y + chh > clipY2) { rowEnd = clipY2 - y; } int32_t colStart = 0; int32_t colEnd = cw; if (x < clipX1) { colStart = clipX1 - x; } if (x + cw > clipX2) { colEnd = clipX2 - x; } // Unclipped fast path: when the character cell is fully within the // clip rect we can skip per-pixel clip checks and use the fully // unrolled 8-store sequences below. This is the hot path for all // text that isn't at the edge of a window. bool unclipped = (colStart == 0 && colEnd == cw); if (opaque) { // Opaque mode: every pixel in the cell gets written (fg or bg). // The unclipped 32bpp and 16bpp paths use branchless ternary // stores -- the compiler emits cmov or conditional-set sequences // that avoid branch misprediction penalties. Each row is 8 // direct array stores with no loop, no table lookup. if (unclipped && bpp == 4) { for (int32_t row = rowStart; row < rowEnd; row++) { uint32_t *dst32 = (uint32_t *)(d->backBuf + (y + row) * pitch + x * 4); uint8_t bits = glyph[row]; dst32[0] = (bits & 0x80) ? fg : bg; dst32[1] = (bits & 0x40) ? 
fg : bg; dst32[2] = (bits & 0x20) ? fg : bg; dst32[3] = (bits & 0x10) ? fg : bg; dst32[4] = (bits & 0x08) ? fg : bg; dst32[5] = (bits & 0x04) ? fg : bg; dst32[6] = (bits & 0x02) ? fg : bg; dst32[7] = (bits & 0x01) ? fg : bg; } } else if (unclipped && bpp == 2) { uint16_t fg16 = (uint16_t)fg; uint16_t bg16 = (uint16_t)bg; for (int32_t row = rowStart; row < rowEnd; row++) { uint16_t *dst16 = (uint16_t *)(d->backBuf + (y + row) * pitch + x * 2); uint8_t bits = glyph[row]; dst16[0] = (bits & 0x80) ? fg16 : bg16; dst16[1] = (bits & 0x40) ? fg16 : bg16; dst16[2] = (bits & 0x20) ? fg16 : bg16; dst16[3] = (bits & 0x10) ? fg16 : bg16; dst16[4] = (bits & 0x08) ? fg16 : bg16; dst16[5] = (bits & 0x04) ? fg16 : bg16; dst16[6] = (bits & 0x02) ? fg16 : bg16; dst16[7] = (bits & 0x01) ? fg16 : bg16; } } else { // Clipped path or 8bpp: use spanFill for bg (leveraging // rep stosl), then iterate visible columns with sGlyphBit[] // table for fg. 8bpp always takes this path because 8-bit // stores can't be branchlessly ternary'd as efficiently -- // the compiler can't cmov into a byte store. for (int32_t row = rowStart; row < rowEnd; row++) { int32_t py = y + row; uint8_t *dst = d->backBuf + py * pitch + (x + colStart) * bpp; ops->spanFill(dst, bg, colEnd - colStart); uint8_t bits = glyph[row]; if (bits == 0) { continue; } dst = d->backBuf + py * pitch + x * bpp; if (bpp == 2) { uint16_t fg16 = (uint16_t)fg; for (int32_t col = colStart; col < colEnd; col++) { if (bits & sGlyphBit[col]) { *(uint16_t *)(dst + col * 2) = fg16; } } } else if (bpp == 4) { for (int32_t col = colStart; col < colEnd; col++) { if (bits & sGlyphBit[col]) { *(uint32_t *)(dst + col * 4) = fg; } } } else { uint8_t fg8 = (uint8_t)fg; for (int32_t col = colStart; col < colEnd; col++) { if (bits & sGlyphBit[col]) { dst[col] = fg8; } } } } } } else { // Transparent mode: only fg pixels are written; bg is untouched. 
// The "bits == 0" early-out per row is important here: blank // rows in the glyph (common in the top/bottom padding of most // characters) skip all pixel work entirely. In opaque mode // blank rows still need the bg fill so we can't skip them. if (unclipped && bpp == 4) { for (int32_t row = rowStart; row < rowEnd; row++) { uint8_t bits = glyph[row]; if (bits == 0) { continue; } uint32_t *dst32 = (uint32_t *)(d->backBuf + (y + row) * pitch + x * 4); if (bits & 0x80) { dst32[0] = fg; } if (bits & 0x40) { dst32[1] = fg; } if (bits & 0x20) { dst32[2] = fg; } if (bits & 0x10) { dst32[3] = fg; } if (bits & 0x08) { dst32[4] = fg; } if (bits & 0x04) { dst32[5] = fg; } if (bits & 0x02) { dst32[6] = fg; } if (bits & 0x01) { dst32[7] = fg; } } } else if (unclipped && bpp == 2) { uint16_t fg16 = (uint16_t)fg; for (int32_t row = rowStart; row < rowEnd; row++) { uint8_t bits = glyph[row]; if (bits == 0) { continue; } uint16_t *dst16 = (uint16_t *)(d->backBuf + (y + row) * pitch + x * 2); if (bits & 0x80) { dst16[0] = fg16; } if (bits & 0x40) { dst16[1] = fg16; } if (bits & 0x20) { dst16[2] = fg16; } if (bits & 0x10) { dst16[3] = fg16; } if (bits & 0x08) { dst16[4] = fg16; } if (bits & 0x04) { dst16[5] = fg16; } if (bits & 0x02) { dst16[6] = fg16; } if (bits & 0x01) { dst16[7] = fg16; } } } else { // Clipped path or 8bpp for (int32_t row = rowStart; row < rowEnd; row++) { uint8_t bits = glyph[row]; if (bits == 0) { continue; } int32_t py = y + row; uint8_t *dst = d->backBuf + py * pitch + x * bpp; if (bpp == 2) { uint16_t fg16 = (uint16_t)fg; for (int32_t col = colStart; col < colEnd; col++) { if (bits & sGlyphBit[col]) { *(uint16_t *)(dst + col * 2) = fg16; } } } else if (bpp == 4) { for (int32_t col = colStart; col < colEnd; col++) { if (bits & sGlyphBit[col]) { *(uint32_t *)(dst + col * 4) = fg; } } } else { uint8_t fg8 = (uint8_t)fg; for (int32_t col = colStart; col < colEnd; col++) { if (bits & sGlyphBit[col]) { dst[col] = fg8; } } } } } } return cw; } // 
============================================================ // drawTextN // ============================================================ // // Renders exactly 'count' characters from a buffer in one pass. // Same idea as drawTermRow but for uniform fg/bg text runs. // Avoids per-character function call overhead, redundant clip // calculation, and spanFill startup costs. // // The key optimization over calling drawChar() in a loop is the // bg fill strategy: in opaque mode, instead of calling spanFill // once per character cell per row (count * charHeight spanFill // calls), we fill the entire visible span's background in one // spanFill per scanline (just charHeight calls total). Then we // overlay only the fg glyph pixels. For an 80-column line this // reduces spanFill calls from 80*16=1280 to just 16. Each // spanFill maps to a single rep stosl, so we're also getting // better write-combine utilization from the larger sequential // stores. // // Horizontal clipping is done at the character level (firstChar/ // lastChar) to avoid iterating invisible characters, with per-pixel // edge clipping only for the partially visible first and last chars. 
void drawTextN(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, int32_t count, uint32_t fg, uint32_t bg, bool opaque) {
    if (count <= 0) {
        return;
    }

    const int32_t cellW = font->charWidth;
    const int32_t cellH = font->charHeight;
    const int32_t bpp   = ops->bytesPerPixel;
    const int32_t pitch = d->pitch;

    const int32_t clipLeft   = d->clipX;
    const int32_t clipTop    = d->clipY;
    const int32_t clipRight  = clipLeft + d->clipW;
    const int32_t clipBottom = clipTop + d->clipH;

    // Reject runs that miss the clip rect vertically...
    if (y + cellH <= clipTop || y >= clipBottom) {
        return;
    }

    // ...or horizontally
    const int32_t runW = count * cellW;

    if (x + runW <= clipLeft || x >= clipRight) {
        return;
    }

    // Visible glyph scanlines, relative to the top of the cell
    const int32_t rowStart = (y < clipTop) ? clipTop - y : 0;
    const int32_t rowEnd   = (y + cellH > clipBottom) ? clipBottom - y : cellH;

    // Visible characters as a half-open index range [firstVis, endVis)
    int32_t firstVis = 0;
    int32_t endVis   = count;

    if (x < clipLeft) {
        firstVis = (clipLeft - x) / cellW;
    }

    if (x + runW > clipRight) {
        endVis = (clipRight - x + cellW - 1) / cellW;

        if (endVis > count) {
            endVis = count;
        }
    }

    // Number of leading pixel columns of the first visible character
    // that fall left of the clip rect
    const int32_t firstCellX = x + firstVis * cellW;
    int32_t       firstSkip  = 0;

    if (firstCellX < clipLeft) {
        firstSkip = clipLeft - firstCellX;
    }

    if (opaque) {
        // One background span per scanline covers every visible cell;
        // the glyph pass below then only has to store fg pixels.
        int32_t bgLeft  = firstCellX;
        int32_t bgRight = x + endVis * cellW;

        if (bgLeft < clipLeft) {
            bgLeft = clipLeft;
        }

        if (bgRight > clipRight) {
            bgRight = clipRight;
        }

        const int32_t bgW = bgRight - bgLeft;

        if (bgW > 0) {
            for (int32_t row = rowStart; row < rowEnd; row++) {
                ops->spanFill(d->backBuf + (y + row) * pitch + bgLeft * bpp, bg, bgW);
            }
        }
    }

    // Foreground pass, one character at a time
    for (int32_t ci = firstVis; ci < endVis; ci++) {
        const int32_t cellX = x + ci * cellW;

        // Only the first visible cell can be clipped on the left; any
        // cell reaching past the right clip edge is trimmed there.
        const int32_t pStart = (ci == firstVis) ? firstSkip : 0;
        const int32_t pEnd   = (cellX + cellW > clipRight) ? clipRight - cellX : cellW;

        const int32_t  idx   = (uint8_t)text[ci] - font->firstChar;
        const uint8_t *glyph = (idx >= 0 && idx < font->numChars) ? font->glyphData + idx * cellH : NULL;

        // Characters outside the font have no foreground pixels
        if (glyph == NULL) {
            continue;
        }

        switch (bpp) {
            case 2: {
                const uint16_t fg16 = (uint16_t)fg;

                for (int32_t row = rowStart; row < rowEnd; row++) {
                    const uint8_t bits = glyph[row];

                    if (bits != 0) {
                        uint16_t *px = (uint16_t *)(d->backBuf + (y + row) * pitch + cellX * 2);

                        for (int32_t p = pStart; p < pEnd; p++) {
                            if (bits & sGlyphBit[p]) {
                                px[p] = fg16;
                            }
                        }
                    }
                }
                break;
            }

            case 4: {
                for (int32_t row = rowStart; row < rowEnd; row++) {
                    const uint8_t bits = glyph[row];

                    if (bits != 0) {
                        uint32_t *px = (uint32_t *)(d->backBuf + (y + row) * pitch + cellX * 4);

                        for (int32_t p = pStart; p < pEnd; p++) {
                            if (bits & sGlyphBit[p]) {
                                px[p] = fg;
                            }
                        }
                    }
                }
                break;
            }

            default: {
                const uint8_t fg8 = (uint8_t)fg;

                for (int32_t row = rowStart; row < rowEnd; row++) {
                    const uint8_t bits = glyph[row];

                    if (bits != 0) {
                        uint8_t *px = d->backBuf + (y + row) * pitch + cellX;

                        for (int32_t p = pStart; p < pEnd; p++) {
                            if (bits & sGlyphBit[p]) {
                                px[p] = fg8;
                            }
                        }
                    }
                }
                break;
            }
        }
    }
}


// ============================================================
// drawFocusRect
// ============================================================
//
// Draws a dotted (every-other-pixel) rectangle to indicate keyboard
// focus, matching the Windows/Motif convention. Uses putPixel per
// dot rather than spanFill because the alternating pattern can't be
// expressed as a span fill (which writes uniform color).
//
// The parity calculations on the bottom and right edges ensure the
// dot pattern is visually continuous around corners -- the starting
// pixel of each edge is offset so dots don't double up or gap at
// the corner where two edges meet.
//
// This is not performance-critical; focus rects are drawn at most
// once per focused widget per repaint.
void drawFocusRect(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
    const int32_t bpp   = ops->bytesPerPixel;
    const int32_t pitch = d->pitch;

    const int32_t clipLeft   = d->clipX;
    const int32_t clipTop    = d->clipY;
    const int32_t clipRight  = clipLeft + d->clipW;
    const int32_t clipBottom = clipTop + d->clipH;

    // Inclusive coordinates of the far edges
    const int32_t xLast = x + w - 1;
    const int32_t yLast = y + h - 1;

    // Top edge: dots at x, x+2, x+4, ...
    if (y >= clipTop && y < clipBottom) {
        uint8_t *rowBase = d->backBuf + y * pitch;

        for (int32_t px = x; px <= xLast; px += 2) {
            if (px >= clipLeft && px < clipRight) {
                putPixel(rowBase + px * bpp, color, bpp);
            }
        }
    }

    // Bottom edge (absent when the rect is a single row). The start is
    // shifted by one pixel when the height distance is odd so the dots
    // stay on the same checkerboard as the top edge.
    if (yLast != y && yLast >= clipTop && yLast < clipBottom) {
        uint8_t      *rowBase = d->backBuf + yLast * pitch;
        const int32_t shift   = (yLast - y) & 1;

        for (int32_t px = x + shift; px <= xLast; px += 2) {
            if (px >= clipLeft && px < clipRight) {
                putPixel(rowBase + px * bpp, color, bpp);
            }
        }
    }

    // Left edge, strictly between the top and bottom rows
    if (x >= clipLeft && x < clipRight) {
        for (int32_t py = y + 2; py < yLast; py += 2) {
            if (py >= clipTop && py < clipBottom) {
                putPixel(d->backBuf + py * pitch + x * bpp, color, bpp);
            }
        }
    }

    // Right edge (absent when the rect is a single column), with the
    // same parity shift applied to its starting row.
    if (xLast != x && xLast >= clipLeft && xLast < clipRight) {
        const int32_t shift = (xLast - x) & 1;

        for (int32_t py = y + 2 - shift; py < yLast; py += 2) {
            if (py >= clipTop && py < clipBottom) {
                putPixel(d->backBuf + py * pitch + xLast * bpp, color, bpp);
            }
        }
    }
}


// ============================================================
// drawHLine
// ============================================================
//
// Horizontal line of width w starting at (x, y), expressed as a
// one-pixel-tall rectFill. Clipping and the actual span write are
// whatever rectFill provides.

void drawHLine(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, uint32_t color) {
    const int32_t lineHeight = 1;

    rectFill(d, ops, x, y, w, lineHeight, color);
}


// ============================================================
// drawInit
// ============================================================
//
// Wires up the BlitOpsT function pointers to the correct
// platform-specific span operations for the active pixel format.
// Called once during startup after videoInit determines the bpp. // // The span ops are the only place where function pointers are used // in the draw layer. This is a deliberate performance tradeoff: // spanFill and spanCopy are called per-scanline (not per-pixel), // so the indirect call overhead (~5 cycles on Pentium for the // mispredicted first call, then predicted afterward) is amortized // over an entire row of pixels. The alternative -- a switch inside // rectFill's inner loop -- would branch every scanline for no gain. // // The platform implementations (dvxPlatformDos.c) use inline asm: // spanFill8/16/32 -> rep stosl (fills 4 bytes per clock) // spanCopy8/16/32 -> rep movsd (copies 4 bytes per clock) // These are the fastest bulk memory operations available on 486/ // Pentium without SSE. The 8-bit and 16-bit variants handle // alignment preambles to get to dword boundaries, then use // rep stosl/movsd for the bulk. void drawInit(BlitOpsT *ops, const DisplayT *d) { ops->bytesPerPixel = d->format.bytesPerPixel; ops->pitch = d->pitch; switch (d->format.bytesPerPixel) { case 1: ops->spanFill = platformSpanFill8; ops->spanCopy = platformSpanCopy8; break; case 2: ops->spanFill = platformSpanFill16; ops->spanCopy = platformSpanCopy16; break; case 4: ops->spanFill = platformSpanFill32; ops->spanCopy = platformSpanCopy32; break; default: ops->spanFill = platformSpanFill8; ops->spanCopy = platformSpanCopy8; break; } } // ============================================================ // drawMaskedBitmap // ============================================================ // // Renders a 1-bit masked bitmap (used for mouse cursors and icons). // The two-plane format mirrors the hardware cursor format used by // VGA and early SVGA cards: // // andMask bit=1, xorData bit=X -> transparent (pixel unchanged) // andMask bit=0, xorData bit=0 -> bgColor // andMask bit=0, xorData bit=1 -> fgColor // // Each row is a uint16_t (supporting up to 16 pixels wide), stored // MSB-first. 
This is sufficient for standard 16x16 mouse cursors. // // The colMask optimization pre-computes which bits in each row fall // within the visible (clipped) columns. For fully transparent rows // (all visible bits have andMask=1), the entire row is skipped with // a single bitwise AND + compare -- no per-pixel iteration needed. void drawMaskedBitmap(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, const uint16_t *andMask, const uint16_t *xorData, uint32_t fgColor, uint32_t bgColor) { int32_t bpp = ops->bytesPerPixel; int32_t pitch = d->pitch; // Pre-clip row/col bounds int32_t clipX1 = d->clipX; int32_t clipX2 = d->clipX + d->clipW; int32_t clipY1 = d->clipY; int32_t clipY2 = d->clipY + d->clipH; int32_t rowStart = 0; int32_t rowEnd = h; if (y < clipY1) { rowStart = clipY1 - y; } if (y + h > clipY2) { rowEnd = clipY2 - y; } int32_t colStart = 0; int32_t colEnd = w; if (x < clipX1) { colStart = clipX1 - x; } if (x + w > clipX2) { colEnd = clipX2 - x; } if (colStart >= colEnd || rowStart >= rowEnd) { return; } // Pre-compute column mask once (loop-invariant) uint16_t colMask = 0; for (int32_t col = colStart; col < colEnd; col++) { colMask |= sMaskBit[col]; } for (int32_t row = rowStart; row < rowEnd; row++) { uint16_t mask = andMask[row]; uint16_t data = xorData[row]; // Skip fully transparent rows if ((mask & colMask) == colMask) { continue; } int32_t py = y + row; uint8_t *dst = d->backBuf + py * pitch + x * bpp; if (bpp == 2) { uint16_t fg16 = (uint16_t)fgColor; uint16_t bg16 = (uint16_t)bgColor; for (int32_t col = colStart; col < colEnd; col++) { uint16_t bit = sMaskBit[col]; if (!(mask & bit)) { *(uint16_t *)(dst + col * 2) = (data & bit) ? fg16 : bg16; } } } else if (bpp == 4) { for (int32_t col = colStart; col < colEnd; col++) { uint16_t bit = sMaskBit[col]; if (!(mask & bit)) { *(uint32_t *)(dst + col * 4) = (data & bit) ? 
fgColor : bgColor; } } } else { uint8_t fg8 = (uint8_t)fgColor; uint8_t bg8 = (uint8_t)bgColor; for (int32_t col = colStart; col < colEnd; col++) { uint16_t bit = sMaskBit[col]; if (!(mask & bit)) { dst[col] = (data & bit) ? fg8 : bg8; } } } } } // ============================================================ // drawTermRow // ============================================================ // // Renders an entire row of terminal character cells in one pass. // lineData points to (ch, attr) pairs. palette is a 16-entry // packed-color table. This avoids per-character function call // overhead, redundant clip calculation, and spanFill startup // costs that make drawChar expensive when called 80x per row. // // This is the primary rendering function for the terminal emulator. // The attribute byte uses the standard CGA/VGA format: // bits 0-3: foreground color (0-15) // bits 4-6: background color (0-7) // bit 7: blink flag // // Unlike drawTextN (which handles uniform fg/bg), every cell here // can have a different fg/bg pair, so the bg can't be filled in a // single bulk pass. Instead each cell is rendered individually, // always in opaque mode (every pixel gets a write). The bpp branch // is still hoisted outside the per-pixel loop -- the outer loop // selects the bpp path once, then iterates cells within it. // // blinkVisible controls the blink phase: when false, fg is replaced // with bg for characters that have bit 7 set, effectively hiding them. // cursorCol specifies which cell (if any) should be drawn with // inverted fg/bg to show the text cursor. 
void drawTermRow(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, int32_t cols, const uint8_t *lineData, const uint32_t *palette, bool blinkVisible, int32_t cursorCol) {
    const int32_t cellW = font->charWidth;
    const int32_t cellH = font->charHeight;
    const int32_t bpp   = ops->bytesPerPixel;
    const int32_t pitch = d->pitch;

    const int32_t clipLeft   = d->clipX;
    const int32_t clipTop    = d->clipY;
    const int32_t clipRight  = clipLeft + d->clipW;
    const int32_t clipBottom = clipTop + d->clipH;

    // Reject rows that miss the clip rect vertically or horizontally
    if (y + cellH <= clipTop || y >= clipBottom) {
        return;
    }

    const int32_t rowW = cols * cellW;

    if (x + rowW <= clipLeft || x >= clipRight) {
        return;
    }

    // Visible glyph scanlines, relative to the top of the cell
    const int32_t rowStart = (y < clipTop) ? clipTop - y : 0;
    const int32_t rowEnd   = (y + cellH > clipBottom) ? clipBottom - y : cellH;

    // Visible cells as a half-open column range [firstCol, endCol)
    int32_t firstCol = 0;
    int32_t endCol   = cols;

    if (x < clipLeft) {
        firstCol = (clipLeft - x) / cellW;
    }

    if (x + rowW > clipRight) {
        endCol = (clipRight - x + cellW - 1) / cellW;

        if (endCol > cols) {
            endCol = cols;
        }
    }

    // Leading pixel columns of the first visible cell that are clipped
    int32_t firstSkip = 0;

    if (x + firstCol * cellW < clipLeft) {
        firstSkip = clipLeft - (x + firstCol * cellW);
    }

    for (int32_t col = firstCol; col < endCol; col++) {
        // Each cell is a (character, attribute) byte pair
        const uint8_t *cell = lineData + col * 2;
        const uint8_t  code = cell[0];
        const uint8_t  attr = cell[1];

        uint32_t fg = palette[attr & 0x0F];
        uint32_t bg = palette[(attr >> 4) & 0x07];

        // Blink-off phase: draw the glyph in the background color
        if ((attr & 0x80) && !blinkVisible) {
            fg = bg;
        }

        // Cursor cell: exchange fg and bg (applied after the blink step)
        if (col == cursorCol) {
            const uint32_t swap = fg;

            fg = bg;
            bg = swap;
        }

        const int32_t cellX = x + col * cellW;

        // Only the first visible cell can be clipped on the left; any
        // cell reaching past the right clip edge is trimmed there.
        const int32_t pStart = (col == firstCol) ? firstSkip : 0;
        const int32_t pEnd   = (cellX + cellW > clipRight) ? clipRight - cellX : cellW;

        // Characters outside the font render as an all-background cell
        const int32_t  idx   = (uint8_t)code - font->firstChar;
        const uint8_t *glyph = (idx >= 0 && idx < font->numChars) ? font->glyphData + idx * cellH : NULL;

        switch (bpp) {
            case 2: {
                const uint16_t fg16 = (uint16_t)fg;
                const uint16_t bg16 = (uint16_t)bg;

                for (int32_t row = rowStart; row < rowEnd; row++) {
                    uint16_t     *px   = (uint16_t *)(d->backBuf + (y + row) * pitch + cellX * 2);
                    const uint8_t bits = glyph ? glyph[row] : 0;

                    for (int32_t p = pStart; p < pEnd; p++) {
                        px[p] = (bits & sGlyphBit[p]) ? fg16 : bg16;
                    }
                }
                break;
            }

            case 4: {
                for (int32_t row = rowStart; row < rowEnd; row++) {
                    uint32_t     *px   = (uint32_t *)(d->backBuf + (y + row) * pitch + cellX * 4);
                    const uint8_t bits = glyph ? glyph[row] : 0;

                    for (int32_t p = pStart; p < pEnd; p++) {
                        px[p] = (bits & sGlyphBit[p]) ? fg : bg;
                    }
                }
                break;
            }

            default: {
                const uint8_t fg8 = (uint8_t)fg;
                const uint8_t bg8 = (uint8_t)bg;

                for (int32_t row = rowStart; row < rowEnd; row++) {
                    uint8_t      *px   = d->backBuf + (y + row) * pitch + cellX;
                    const uint8_t bits = glyph ? glyph[row] : 0;

                    for (int32_t p = pStart; p < pEnd; p++) {
                        px[p] = (bits & sGlyphBit[p]) ? fg8 : bg8;
                    }
                }
                break;
            }
        }
    }
}


// ============================================================
// drawText
// ============================================================
//
// Renders a null-terminated string by calling drawChar per character.
// Simpler than drawTextN but slower for long runs because each
// drawChar call independently clips, computes row bounds, and
// dispatches on bpp. Used for short labels and ad-hoc text where
// the call overhead doesn't matter; drawTextN is preferred for
// bulk text (editor buffers, list views, etc.).
//
// The left-of-clip skip avoids calling drawChar for characters that
// are entirely to the left of the visible area. The right-of-clip
// early-out breaks the loop as soon as we've passed the right edge.
// These are both marked unlikely (__builtin_expect) because the
// common case is text fully within the clip rect.
void drawText(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, uint32_t fg, uint32_t bg, bool opaque) {
    const int32_t glyphW    = font->charWidth;
    const int32_t rightEdge = d->clipX + d->clipW;

    for (; *text != '\0'; text++) {
        // Everything from here on is right of the clip rect: stop.
        if (__builtin_expect(x >= rightEdge, 0)) {
            break;
        }

        if (__builtin_expect(x + glyphW <= d->clipX, 0)) {
            // Cell lies wholly left of the clip rect: advance the pen
            // without drawing.
            x += glyphW;
        } else {
            // drawChar reports how far to advance the pen.
            x += drawChar(d, ops, font, x, y, *text, fg, bg, opaque);
        }
    }
}


// ============================================================
// drawTextAccel
// ============================================================
//
// Like drawText but interprets & markers in the string: the character
// following & is drawn with an underline to indicate it's the keyboard
// accelerator (e.g. "&File" draws "File" with F underlined). "&&"
// draws a literal &. This matches the Windows/Motif convention for
// menu and button labels.
//
// The underline is drawn as a 1px horizontal line at the bottom of
// the character cell (y + charHeight - 1), which is the standard
// placement for accelerator underlines.

void drawTextAccel(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, uint32_t fg, uint32_t bg, bool opaque) {
    const int32_t glyphW    = font->charWidth;
    const int32_t rightEdge = d->clipX + d->clipW;

    while (*text != '\0') {
        // Stop once the pen has passed the right clip edge.
        if (__builtin_expect(x >= rightEdge, 0)) {
            break;
        }

        char shown     = *text;
        bool underline = false;

        if (shown == '&') {
            // Marker: the character after it decides what is drawn.
            text++;
            shown = *text;

            if (shown == '\0') {
                // A lone '&' at the end of the string draws nothing.
                break;
            }

            // "&&" is a literal ampersand; anything else is the
            // accelerator character and gets underlined.
            underline = (shown != '&');
        }

        // Only draw cells that are not wholly left of the clip rect.
        if (x + glyphW > d->clipX) {
            drawChar(d, ops, font, x, y, shown, fg, bg, opaque);

            if (underline) {
                drawHLine(d, ops, x, y + font->charHeight - 1, glyphW, fg);
            }
        }

        x += glyphW;
        text++;
    }
}


// ============================================================
// drawVLine
// ============================================================
//
// Draws a vertical line pixel-by-pixel. Unlike drawHLine (which
// delegates to rectFill -> spanFill for a single-row span), a
// vertical line can't use spanFill because each pixel is on a
// different scanline. Instead we advance by d->pitch per pixel
// and write directly, branching on bpp once at the top.
//
// The ops parameter is unused (suppressed with (void)ops) because
// spanFill operates on contiguous horizontal runs and is useless
// for vertical lines. We keep the parameter for API consistency
// with the rest of the draw layer.

void drawVLine(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t h, uint32_t color) {
    (void)ops;

    // A column outside the horizontal clip range draws nothing.
    if (__builtin_expect(x < d->clipX || x >= d->clipX + d->clipW, 0)) {
        return;
    }

    // Clamp the vertical extent [y, y + h) to the clip rect.
    const int32_t clipBottom = d->clipY + d->clipH;
    const int32_t top        = (y < d->clipY) ? d->clipY : y;
    const int32_t bottom     = (y + h > clipBottom) ? clipBottom : y + h;

    if (bottom <= top) {
        return;
    }

    const int32_t pixBytes  = d->format.bytesPerPixel;
    const int32_t stride    = d->pitch;
    uint8_t      *px        = d->backBuf + top * stride + x * pixBytes;
    int32_t       remaining = bottom - top;

    // One store per scanline, stepping down by the buffer pitch.
    switch (pixBytes) {
        case 2: {
            const uint16_t color16 = (uint16_t)color;

            for (; remaining > 0; remaining--) {
                *(uint16_t *)px = color16;
                px += stride;
            }
            break;
        }

        case 4: {
            for (; remaining > 0; remaining--) {
                *(uint32_t *)px = color;
                px += stride;
            }
            break;
        }

        default: {
            const uint8_t color8 = (uint8_t)color;

            for (; remaining > 0; remaining--) {
                *px = color8;
                px += stride;
            }
            break;
        }
    }
}


// ============================================================
// putPixel
// ============================================================
//
// Stores one pixel at a buffer address the caller has already
// computed. bpp selects the store width: 2 writes the low 16 bits
// of color, 4 writes all 32 bits, and any other value writes the
// low 8 bits. Only used by drawFocusRect for its alternating dot
// pattern; the per-call bpp dispatch is acceptable because focus
// rect drawing is infrequent.

static inline void putPixel(uint8_t *dst, uint32_t color, int32_t bpp) {
    switch (bpp) {
        case 2:
            *(uint16_t *)dst = (uint16_t)color;
            break;

        case 4:
            *(uint32_t *)dst = color;
            break;

        default:
            *dst = (uint8_t)color;
            break;
    }
}


// ============================================================
// rectCopy
// ============================================================
//
// Copies a rectangular region from an arbitrary source buffer into
// the display backbuffer. Used by the compositor to blit per-window
// content buffers (win->contentBuf) into the shared backbuffer during
// the composite pass.
// // Clipping adjusts both the destination and source positions by the // same delta so the visible portion maps to the correct source pixels. // When the source and destination pitches match and equal the row byte // count, the entire block is copied in a single memcpy (which the // compiler/libc can optimize to rep movsd). Otherwise it falls back // to per-row memcpy. // // This function does NOT handle overlapping source and destination // regions (no memmove). That's fine because the source is always a // per-window content buffer and the destination is the shared // backbuffer -- they never overlap. void rectCopy(DisplayT *d, const BlitOpsT *ops, int32_t dstX, int32_t dstY, const uint8_t *srcBuf, int32_t srcPitch, int32_t srcX, int32_t srcY, int32_t w, int32_t h) { int32_t bpp = ops->bytesPerPixel; // Clip to display clip rect int32_t origDstX = dstX; int32_t origDstY = dstY; clipRect(d, &dstX, &dstY, &w, &h); if (__builtin_expect(w <= 0 || h <= 0, 0)) { return; } // Adjust source position by the amount we clipped srcX += dstX - origDstX; srcY += dstY - origDstY; const uint8_t *srcRow = srcBuf + srcY * srcPitch + srcX * bpp; uint8_t *dstRow = d->backBuf + dstY * d->pitch + dstX * bpp; int32_t rowBytes = w * bpp; int32_t dstPitch = d->pitch; // For full-width copies aligned to pitch, use memcpy (may optimize to rep movsd) if (rowBytes == dstPitch && rowBytes == srcPitch) { memcpy(dstRow, srcRow, rowBytes * h); } else { for (int32_t i = 0; i < h; i++) { memcpy(dstRow, srcRow, rowBytes); srcRow += srcPitch; dstRow += dstPitch; } } } // ============================================================ // rectFill // ============================================================ // // The workhorse fill primitive. Clips to the display clip rect, // then fills one scanline at a time via the spanFill function // pointer (which routes to rep stosl on DOS). 
This is the most // frequently called function in the draw layer -- it backs rectFill // directly, plus drawHLine, drawBevel interior fills, and the bg // fill in opaque text rendering. // // The clip + early-out pattern (clipRect then check w/h <= 0) is // the same in every draw function. The __builtin_expect marks the // zero-size case as unlikely to avoid a taken-branch penalty in the // common case where the rect is visible after clipping. void rectFill(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) { clipRect(d, &x, &y, &w, &h); if (__builtin_expect(w <= 0 || h <= 0, 0)) { return; } uint8_t *row = d->backBuf + y * d->pitch + x * d->format.bytesPerPixel; int32_t pitch = d->pitch; for (int32_t i = 0; i < h; i++) { ops->spanFill(row, color, w); row += pitch; } } // ============================================================ // textWidth // ============================================================ // // Returns the pixel width of a null-terminated string. Because all // fonts are fixed-width, this is just strlen * charWidth -- but we // iterate manually rather than calling strlen to avoid a second pass // over the string. This is used heavily for layout calculations // (centering text in buttons, sizing menu popups, etc.). int32_t textWidth(const BitmapFontT *font, const char *text) { int32_t w = 0; while (*text) { w += font->charWidth; text++; } return w; } // ============================================================ // textWidthAccel // ============================================================ // // Like textWidth but accounts for & accelerator markers: a single & // is not rendered (it just marks the next character as the accelerator), // so it doesn't contribute to width. "&&" renders as one "&" character. // Used to compute the correct pixel width for menu items and button // labels that contain accelerator markers. 
int32_t textWidthAccel(const BitmapFontT *font, const char *text) { int32_t w = 0; while (*text) { if (*text == '&') { text++; if (*text == '&') { // Escaped && -- counts as one character w += font->charWidth; text++; continue; } if (*text) { // Accelerator character -- counts as one character, & is skipped w += font->charWidth; text++; continue; } break; } w += font->charWidth; text++; } return w; }