// dvx_draw.c  -- Layer 2: Drawing primitives for DVX GUI (optimized)
//
// This is the second layer of the DVX compositor stack, sitting on top
// of dvxVideo (layer 1) and below dvxComp (layer 3).  It provides all
// rasterization primitives: filled rects, buffer copies, beveled
// frames, bitmap font text, masked bitmaps (cursors/icons), and
// single-pixel operations.
//
// Every function here draws into the system-RAM backbuffer (d->backBuf),
// never directly to the LFB.  The compositor layer is responsible for
// flushing changed regions to the hardware framebuffer via rep movsd.
// This separation means draw operations benefit from CPU cache (the
// backbuffer lives in cacheable system RAM) while LFB writes are
// batched into large sequential bursts.
//
// Performance strategy overview:
//
// The core tension on 486/Pentium is between generality and speed.
// The draw layer resolves this with a three-part approach:
//
// 1) Span operations (spanFill/spanCopy) are dispatched through
//    function pointers in BlitOpsT, set once at init based on bpp.
//    The platform implementations use rep stosl/rep movsd inline asm
//    for maximum throughput (the 486 executes rep stosl at 1 dword
//    per clock after startup; the Pentium pairs it in the U-pipe).
//    Using function pointers here costs one indirect call per span
//    but avoids a bpp switch in the inner loop of rectFill, which
//    would otherwise be a branch per scanline.
//
// 2) Character rendering (drawChar, drawTextN, drawTermRow) uses
//    explicit if/else chains on bpp rather than function pointers.
//    This is deliberate: the per-pixel work inside glyph rendering
//    is a tight bit-test loop where an indirect call per pixel would
//    be catastrophic, and the bpp branch is taken once per glyph row
//    (hoisted out of the pixel loop).  The compiler can also inline
//    the pixel store when the bpp is a compile-time constant within
//    each branch.
//
// 3) For the most critical glyph paths (unclipped 32bpp and 16bpp),
//    the pixel loops are fully unrolled into 8 direct array stores
//    with literal bit masks.  This eliminates the sGlyphBit[] table
//    lookup, the loop counter, and the loop branch  -- saving ~3 cycles
//    per pixel on a 486.  The clipped path falls back to the table.
//
// Clip rectangle handling:  All draw functions clip against
// d->clipX/Y/W/H (set by setClipRect in layer 1).  The clipRect()
// helper is marked static inline so it compiles to straight-line
// compare-and-clamp code at each call site with no function call
// overhead.  __builtin_expect hints mark the clipping branches as
// unlikely, helping the branch predictor on Pentium and later.

#include "dvxDraw.h"
#include "platform/dvxPlatform.h"

#include <string.h>

// ============================================================
// Prototypes
// ============================================================

char accelParse(const char *text);
static inline void clipRect(const DisplayT *d, int32_t *x, int32_t *y, int32_t *w, int32_t *h);
static inline void putPixel(uint8_t *dst, uint32_t color, int32_t bpp);

// Bit lookup tables for glyph and mask rendering.  On a 486, a variable
// shift (1 << (7 - col)) costs 4 cycles per bit position; a table
// lookup is a fixed 1-cycle load from L1.  The 8-entry sGlyphBit table
// maps column index 0..7 to the corresponding bit mask in a 1bpp glyph
// byte (MSB-first, matching standard VGA/bitmap font layout).  The
// 16-entry sMaskBit table does the same for 16-pixel-wide cursor/icon
// masks.
// Column -> bit mask, MSB-first: entry 0 is the leftmost pixel (bit 7 of a
// glyph byte, bit 15 of a cursor/icon mask word).
static const uint8_t sGlyphBit[8] = {
    1u << 7, 1u << 6, 1u << 5, 1u << 4,
    1u << 3, 1u << 2, 1u << 1, 1u << 0
};
static const uint16_t sMaskBit[16] = {
    1u << 15, 1u << 14, 1u << 13, 1u << 12,
    1u << 11, 1u << 10, 1u << 9,  1u << 8,
    1u << 7,  1u << 6,  1u << 5,  1u << 4,
    1u << 3,  1u << 2,  1u << 1,  1u << 0
};


// ============================================================
// accelParse
// ============================================================
//
// Scans a menu/button label for the & accelerator marker and returns
// the character after it (lowercased).  Follows the Windows/Motif
// convention: "&File" means Alt+F activates it, "&&" is a literal &.
// Returns 0 if no accelerator is found.  The result is always
// lowercased so the WM can do a single case-insensitive compare
// against incoming Alt+key events.

char accelParse(const char *text) {
    // Returns the accelerator character of a label: the character that
    // follows the first unescaped '&'.  'A'..'Z' are folded to lowercase;
    // every other character is returned unchanged.  Returns 0 for a NULL
    // label, a label with no accelerator, or a dangling '&' at the end.
    if (!text) {
        return 0;
    }

    for (const char *p = text; *p != '\0'; p++) {
        if (*p != '&') {
            continue;
        }

        char next = p[1];

        if (next == '&') {
            // "&&" is an escaped literal ampersand: step over the pair
            // (the loop increment consumes the second one).
            p++;
            continue;
        }

        if (next == '\0') {
            // '&' was the last character; there is nothing to mark.
            return 0;
        }

        // First real accelerator wins.  Only uppercase ASCII needs folding.
        return (next >= 'A' && next <= 'Z') ? (char)(next + 32) : next;
    }

    return 0;
}


// ============================================================
// clipRect
// ============================================================
//
// Intersects a rectangle with the display's current clip rect,
// modifying the rect in place.  If the rect is fully outside the
// clip region, w or h will be <= 0 and callers bail out.
//
// Marked static inline because this is called on every rectFill,
// rectCopy, and indirectly on every glyph  -- it must compile to
// straight-line clamp instructions with zero call overhead.
// __builtin_expect(..., 0) marks clipping as unlikely; in the
// common case windows are fully within the clip rect and all
// four branches fall through untaken.  On Pentium this keeps the
// branch predictor happy (static not-taken prediction for forward
// branches), and on 486 it at least avoids the taken-branch penalty.

static inline void clipRect(const DisplayT *d, int32_t *x, int32_t *y, int32_t *w, int32_t *h) {
    // Clip rectangle as half-open edges: [clipLeft, clipRight) x [clipTop, clipBottom)
    const int32_t clipLeft   = d->clipX;
    const int32_t clipTop    = d->clipY;
    const int32_t clipRight  = clipLeft + d->clipW;
    const int32_t clipBottom = clipTop + d->clipH;

    // Caller's rectangle converted to the same edge form
    int32_t left   = *x;
    int32_t top    = *y;
    int32_t right  = left + *w;
    int32_t bottom = top + *h;

    // Pull each edge inward only when it pokes outside the clip rect;
    // every one of these is hinted as the unlikely case.
    if (__builtin_expect(left < clipLeft, 0)) {
        left = clipLeft;
    }
    if (__builtin_expect(top < clipTop, 0)) {
        top = clipTop;
    }
    if (__builtin_expect(right > clipRight, 0)) {
        right = clipRight;
    }
    if (__builtin_expect(bottom > clipBottom, 0)) {
        bottom = clipBottom;
    }

    // Back to origin + size.  A rect with no overlap ends up with a
    // width or height <= 0.
    *x = left;
    *y = top;
    *w = right - left;
    *h = bottom - top;
}


// ============================================================
// drawBevel
// ============================================================
//
// Draws a Motif/DESQview-style beveled rectangular frame.  The bevel
// creates the illusion of a raised or sunken 3D surface by drawing
// lighter "highlight" edges on the top and left, and darker "shadow"
// edges on the bottom and right.  Swapping highlight and shadow gives
// a sunken appearance (see BEVEL_RAISED/BEVEL_SUNKEN macros in
// dvxTypes.h).
//
// BevelStyleT.width controls the border thickness.  DV/X uses 2px
// bevels for most window chrome (matching the original DESQview/X
// and Motif look), 1px for inner borders and scrollbar elements.
//
// The implementation has special-cased fast paths for bw==2 and bw==1
// that emit their spans as straight-line rectFill calls rather than
// looping.  This matters because drawBevel is called for every window
// frame, button, menu, and scrollbar element on every repaint  -- the
// loop overhead of the general case adds up (the number of rectFill
// calls is the same either way).  Each rectFill call already handles
// clipping internally, so the bevels clip correctly even when a window
// is partially off-screen.
//
// face==0 means "don't fill the interior", which is used for frame-only
// bevels where the content area is painted separately by a callback.

void drawBevel(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, const BevelStyleT *style) {
    const int32_t bw     = style->width;
    const int32_t right  = x + w - 1;  // outermost right column
    const int32_t bottom = y + h - 1;  // outermost bottom row

    // Interior goes down first; a face value of 0 means "leave it alone".
    if (style->face != 0) {
        rectFill(d, ops, x + bw, y + bw, w - bw * 2, h - bw * 2, style->face);
    }

    // Edge order is always top, left (highlight) then bottom, right
    // (shadow), so shadow wins wherever the two overlap.  Widths 1 and 2
    // are written out as straight-line calls; anything else uses the
    // per-ring loops, which issue the same spans for those widths.
    switch (bw) {
        case 1:
            rectFill(d, ops, x, y, w, 1, style->highlight);
            rectFill(d, ops, x, y + 1, 1, h - 1, style->highlight);
            rectFill(d, ops, x, bottom, w, 1, style->shadow);
            rectFill(d, ops, right, y + 1, 1, h - 2, style->shadow);
            break;

        case 2:
            // Highlight: two top rows, then two left columns
            rectFill(d, ops, x, y, w, 1, style->highlight);
            rectFill(d, ops, x + 1, y + 1, w - 2, 1, style->highlight);
            rectFill(d, ops, x, y + 1, 1, h - 1, style->highlight);
            rectFill(d, ops, x + 1, y + 2, 1, h - 3, style->highlight);
            // Shadow: two bottom rows, then two right columns
            rectFill(d, ops, x, bottom, w, 1, style->shadow);
            rectFill(d, ops, x + 1, bottom - 1, w - 2, 1, style->shadow);
            rectFill(d, ops, right, y + 1, 1, h - 2, style->shadow);
            rectFill(d, ops, right - 1, y + 2, 1, h - 4, style->shadow);
            break;

        default: {
            int32_t ring;

            // Each ring is inset one pixel further on every side.
            for (ring = 0; ring < bw; ring++) {
                rectFill(d, ops, x + ring, y + ring, w - ring * 2, 1, style->highlight);
            }
            for (ring = 0; ring < bw; ring++) {
                rectFill(d, ops, x + ring, y + ring + 1, 1, h - ring * 2 - 1, style->highlight);
            }
            for (ring = 0; ring < bw; ring++) {
                rectFill(d, ops, x + ring, bottom - ring, w - ring * 2, 1, style->shadow);
            }
            for (ring = 0; ring < bw; ring++) {
                rectFill(d, ops, right - ring, y + ring + 1, 1, h - ring * 2 - 2, style->shadow);
            }
            break;
        }
    }
}


// ============================================================
// drawChar
// ============================================================
//
// Renders a single fixed-width bitmap font character into the
// backbuffer.  Returns the character advance width (always
// font->charWidth) so callers can accumulate cursor position.
//
// Font format: each glyph is charHeight bytes of 1bpp data, MSB-first
// (bit 7 = leftmost pixel).  This is the standard VGA/PC BIOS font
// format.  We use 8-pixel-wide glyphs exclusively because 8 bits fit
// in one byte per scanline, making the inner loop a single byte load
// plus 8 bit tests  -- no multi-byte glyph row assembly needed.
//
// The function has six specialized code paths (3 bpp x 2 modes),
// chosen with if/else chains rather than function pointers.  On 486
// and Pentium, an indirect call through a function pointer stalls the
// pipeline (no branch target buffer for indirect calls on 486, and
// a mandatory bubble on Pentium).  The if/else chain resolves at the
// outer loop level (once per glyph, not per pixel), so the per-pixel
// inner code is branch-free within each path.
//
// Opaque vs transparent mode:
//   opaque=true:  Fills the entire character cell (bg then fg).  Used
//                 for normal text where the background must overwrite
//                 whatever was previously in the cell.
//   opaque=false: Only writes foreground pixels; background shows
//                 through.  Used for overlay text on existing content.
//
// The "unclipped fast path" (colStart==0, colEnd==cw) avoids the
// sGlyphBit[] table lookup by testing literal bit masks directly.
// This matters because the table lookup involves an indexed load
// (base + index * element_size), while the literal mask is an
// immediate operand in the compare instruction.  At 8 pixels per row
// and 14-16 rows per glyph, saving even 1 cycle per pixel adds up
// across a full screen of text (~6400 characters at 80x80).

int32_t drawChar(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, char ch, uint32_t fg, uint32_t bg, bool opaque) {
    // Draws the glyph for 'ch' with its top-left corner at (x, y), clipped
    // to the display's clip rect.  Always returns font->charWidth, even
    // when nothing was drawn, so callers can advance their pen position.
    int32_t cw  = font->charWidth;
    int32_t chh = font->charHeight;

    // Quick reject: entirely outside clip rect
    if (__builtin_expect(x + cw <= d->clipX || x >= d->clipX + d->clipW || y + chh <= d->clipY || y >= d->clipY + d->clipH, 0)) {
        return cw;
    }

    // Characters the font does not cover have no glyph: opaque mode still
    // clears the cell to bg, transparent mode draws nothing.
    int32_t idx = (uint8_t)ch - font->firstChar;
    if (__builtin_expect(idx < 0 || idx >= font->numChars, 0)) {
        if (opaque) {
            rectFill(d, ops, x, y, cw, chh, bg);
        }
        return cw;
    }

    // One byte of glyph data per scanline, charHeight bytes per glyph
    const uint8_t *glyph = font->glyphData + idx * chh;
    int32_t        bpp   = ops->bytesPerPixel;
    int32_t        pitch = d->pitch;

    // Calculate clipped row/col bounds once
    int32_t clipX1 = d->clipX;
    int32_t clipX2 = d->clipX + d->clipW;
    int32_t clipY1 = d->clipY;
    int32_t clipY2 = d->clipY + d->clipH;

    int32_t rowStart = 0;
    int32_t rowEnd   = chh;
    if (y < clipY1) { rowStart = clipY1 - y; }
    if (y + chh > clipY2) { rowEnd = clipY2 - y; }

    int32_t colStart = 0;
    int32_t colEnd   = cw;
    if (x < clipX1) { colStart = clipX1 - x; }
    if (x + cw > clipX2) { colEnd = clipX2 - x; }

    // A glyph row is a single byte, so only columns 0..7 can carry glyph
    // bits and sGlyphBit[] has exactly 8 entries.  Bound the bit-test
    // range so a font reporting a wider cell never indexes past the table.
    const int32_t glyphCols = 8;
    int32_t       bitEnd    = colEnd;
    if (bitEnd > glyphCols) { bitEnd = glyphCols; }

    // Unclipped fast path: the cell is not cut horizontally AND is exactly
    // 8 pixels wide.  The unrolled sequences below always store 8 pixels
    // per row, so a narrower cell must not take them (it would write past
    // the cell and possibly past the clip rect).  Vertical clipping is
    // still honoured through rowStart/rowEnd.
    bool unclipped = (colStart == 0 && colEnd == cw && cw == glyphCols);

    if (opaque) {
        // Opaque mode: every pixel in the cell gets written (fg or bg).
        // The unclipped 32bpp and 16bpp paths write each row as 8 direct
        // stores selected by literal bit masks: no loop, no table lookup.
        if (unclipped && bpp == 4) {
            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint32_t *dst32 = (uint32_t *)(d->backBuf + (y + row) * pitch + x * 4);
                uint8_t   bits  = glyph[row];

                dst32[0] = (bits & 0x80) ? fg : bg;
                dst32[1] = (bits & 0x40) ? fg : bg;
                dst32[2] = (bits & 0x20) ? fg : bg;
                dst32[3] = (bits & 0x10) ? fg : bg;
                dst32[4] = (bits & 0x08) ? fg : bg;
                dst32[5] = (bits & 0x04) ? fg : bg;
                dst32[6] = (bits & 0x02) ? fg : bg;
                dst32[7] = (bits & 0x01) ? fg : bg;
            }
        } else if (unclipped && bpp == 2) {
            uint16_t fg16 = (uint16_t)fg;
            uint16_t bg16 = (uint16_t)bg;

            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint16_t *dst16 = (uint16_t *)(d->backBuf + (y + row) * pitch + x * 2);
                uint8_t   bits  = glyph[row];

                dst16[0] = (bits & 0x80) ? fg16 : bg16;
                dst16[1] = (bits & 0x40) ? fg16 : bg16;
                dst16[2] = (bits & 0x20) ? fg16 : bg16;
                dst16[3] = (bits & 0x10) ? fg16 : bg16;
                dst16[4] = (bits & 0x08) ? fg16 : bg16;
                dst16[5] = (bits & 0x04) ? fg16 : bg16;
                dst16[6] = (bits & 0x02) ? fg16 : bg16;
                dst16[7] = (bits & 0x01) ? fg16 : bg16;
            }
        } else {
            // General path (clipped cell, 8bpp, or non-8-wide font):
            // spanFill the visible part of the row with bg, then overlay
            // fg pixels for the visible columns using the sGlyphBit[] table.
            for (int32_t row = rowStart; row < rowEnd; row++) {
                int32_t  py  = y + row;
                uint8_t *dst = d->backBuf + py * pitch + (x + colStart) * bpp;

                ops->spanFill(dst, bg, colEnd - colStart);

                uint8_t bits = glyph[row];
                if (bits == 0) {
                    // Blank glyph row: the bg fill above is all it needs
                    continue;
                }

                // Rebase to the cell's left edge so 'col' indexes both the
                // glyph bit and the destination pixel.
                dst = d->backBuf + py * pitch + x * bpp;

                if (bpp == 2) {
                    uint16_t fg16 = (uint16_t)fg;
                    for (int32_t col = colStart; col < bitEnd; col++) {
                        if (bits & sGlyphBit[col]) {
                            *(uint16_t *)(dst + col * 2) = fg16;
                        }
                    }
                } else if (bpp == 4) {
                    for (int32_t col = colStart; col < bitEnd; col++) {
                        if (bits & sGlyphBit[col]) {
                            *(uint32_t *)(dst + col * 4) = fg;
                        }
                    }
                } else {
                    uint8_t fg8 = (uint8_t)fg;
                    for (int32_t col = colStart; col < bitEnd; col++) {
                        if (bits & sGlyphBit[col]) {
                            dst[col] = fg8;
                        }
                    }
                }
            }
        }
    } else {
        // Transparent mode: only fg pixels are written; bg is untouched.
        // Rows whose glyph byte is 0 are skipped entirely, which is safe
        // here because there is no background fill to perform.
        if (unclipped && bpp == 4) {
            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t bits = glyph[row];
                if (bits == 0) {
                    continue;
                }

                uint32_t *dst32 = (uint32_t *)(d->backBuf + (y + row) * pitch + x * 4);

                if (bits & 0x80) { dst32[0] = fg; }
                if (bits & 0x40) { dst32[1] = fg; }
                if (bits & 0x20) { dst32[2] = fg; }
                if (bits & 0x10) { dst32[3] = fg; }
                if (bits & 0x08) { dst32[4] = fg; }
                if (bits & 0x04) { dst32[5] = fg; }
                if (bits & 0x02) { dst32[6] = fg; }
                if (bits & 0x01) { dst32[7] = fg; }
            }
        } else if (unclipped && bpp == 2) {
            uint16_t fg16 = (uint16_t)fg;

            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t bits = glyph[row];
                if (bits == 0) {
                    continue;
                }

                uint16_t *dst16 = (uint16_t *)(d->backBuf + (y + row) * pitch + x * 2);

                if (bits & 0x80) { dst16[0] = fg16; }
                if (bits & 0x40) { dst16[1] = fg16; }
                if (bits & 0x20) { dst16[2] = fg16; }
                if (bits & 0x10) { dst16[3] = fg16; }
                if (bits & 0x08) { dst16[4] = fg16; }
                if (bits & 0x04) { dst16[5] = fg16; }
                if (bits & 0x02) { dst16[6] = fg16; }
                if (bits & 0x01) { dst16[7] = fg16; }
            }
        } else {
            // General path (clipped cell, 8bpp, or non-8-wide font)
            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t bits = glyph[row];
                if (bits == 0) {
                    continue;
                }

                int32_t  py  = y + row;
                uint8_t *dst = d->backBuf + py * pitch + x * bpp;

                if (bpp == 2) {
                    uint16_t fg16 = (uint16_t)fg;
                    for (int32_t col = colStart; col < bitEnd; col++) {
                        if (bits & sGlyphBit[col]) {
                            *(uint16_t *)(dst + col * 2) = fg16;
                        }
                    }
                } else if (bpp == 4) {
                    for (int32_t col = colStart; col < bitEnd; col++) {
                        if (bits & sGlyphBit[col]) {
                            *(uint32_t *)(dst + col * 4) = fg;
                        }
                    }
                } else {
                    uint8_t fg8 = (uint8_t)fg;
                    for (int32_t col = colStart; col < bitEnd; col++) {
                        if (bits & sGlyphBit[col]) {
                            dst[col] = fg8;
                        }
                    }
                }
            }
        }
    }

    return cw;
}


// ============================================================
// drawTextN
// ============================================================
//
// Renders exactly 'count' characters from a buffer in one pass.
// Same idea as drawTermRow but for uniform fg/bg text runs.
// Avoids per-character function call overhead, redundant clip
// calculation, and spanFill startup costs.
//
// The key optimization over calling drawChar() in a loop is the
// bg fill strategy: in opaque mode, instead of calling spanFill
// once per character cell per row (count * charHeight spanFill
// calls), we fill the entire visible span's background in one
// spanFill per scanline (just charHeight calls total).  Then we
// overlay only the fg glyph pixels.  For an 80-column line this
// reduces spanFill calls from 80*16=1280 to just 16.  Each
// spanFill maps to a single rep stosl, so we're also getting
// better write-combine utilization from the larger sequential
// stores.
//
// Horizontal clipping is done at the character level (firstChar/
// lastChar) to avoid iterating invisible characters, with per-pixel
// edge clipping only for the partially visible first and last chars.

void drawTextN(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, int32_t count, uint32_t fg, uint32_t bg, bool opaque) {
    // Draws text[0..count-1] as one horizontal run of fixed-width cells
    // starting at (x, y), clipped to the display's clip rect.  In opaque
    // mode the visible background is filled first (one spanFill per
    // scanline), then fg glyph pixels are overlaid.  Characters outside
    // the font's range draw no fg pixels but still occupy a cell.
    if (count <= 0) {
        return;
    }

    int32_t cw    = font->charWidth;
    int32_t ch    = font->charHeight;
    int32_t bpp   = ops->bytesPerPixel;
    int32_t pitch = d->pitch;

    // A glyph row is a single byte and sGlyphBit[] has exactly 8 entries,
    // so at most 8 columns of a cell can be bit-tested.  For fonts up to
    // 8 pixels wide this is simply the cell width.
    const int32_t glyphCols = 8;
    const int32_t maxCols   = (cw < glyphCols) ? cw : glyphCols;

    // Row-level clip: reject if entirely outside vertically
    int32_t clipX1 = d->clipX;
    int32_t clipX2 = d->clipX + d->clipW;
    int32_t clipY1 = d->clipY;
    int32_t clipY2 = d->clipY + d->clipH;

    if (y + ch <= clipY1 || y >= clipY2) {
        return;
    }

    int32_t totalW = count * cw;

    // Reject if the whole run is left or right of the clip rect
    if (x + totalW <= clipX1 || x >= clipX2) {
        return;
    }

    // Vertical clip for glyph scanlines
    int32_t rowStart = 0;
    int32_t rowEnd   = ch;
    if (y < clipY1) { rowStart = clipY1 - y; }
    if (y + ch > clipY2) { rowEnd = clipY2 - y; }

    // Horizontal clip: find first and last visible column (character index).
    // firstChar rounds down and lastChar rounds up, so partially visible
    // edge cells are included.
    int32_t firstChar = 0;
    int32_t lastChar  = count;

    if (x < clipX1) {
        firstChar = (clipX1 - x) / cw;
    }

    if (x + totalW > clipX2) {
        lastChar = (clipX2 - x + cw - 1) / cw;
        if (lastChar > count) { lastChar = count; }
    }

    // Per-pixel clip for partially visible edge characters: number of
    // leading columns of the first visible cell that lie left of clipX1.
    int32_t edgeColStart = 0;

    if (x + firstChar * cw < clipX1) {
        edgeColStart = clipX1 - (x + firstChar * cw);
    }

    if (opaque) {
        // Opaque: fill background for the entire visible span once per scanline,
        // then overlay foreground glyph pixels
        int32_t fillX1 = x + firstChar * cw;
        int32_t fillX2 = x + lastChar * cw;

        if (fillX1 < clipX1) { fillX1 = clipX1; }
        if (fillX2 > clipX2) { fillX2 = clipX2; }

        int32_t fillW = fillX2 - fillX1;

        if (fillW > 0) {
            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t *dst = d->backBuf + (y + row) * pitch + fillX1 * bpp;
                ops->spanFill(dst, bg, fillW);
            }
        }
    }

    // Render glyph foreground pixels
    for (int32_t ci = firstChar; ci < lastChar; ci++) {
        int32_t cx = x + ci * cw;

        // Visible, bit-testable column range [cStart, cEnd) of this cell
        int32_t cStart = 0;
        int32_t cEnd   = maxCols;

        if (ci == firstChar) {
            cStart = edgeColStart;
        }

        if (cx + cw > clipX2) {
            // Cell is cut by the right clip edge
            int32_t visible = clipX2 - cx;
            if (visible < cEnd) { cEnd = visible; }
        }

        int32_t idx = (uint8_t)text[ci] - font->firstChar;
        const uint8_t *glyph = NULL;

        if (idx >= 0 && idx < font->numChars) {
            glyph = font->glyphData + idx * ch;
        }

        if (!glyph) {
            // No glyph for this character: leave the cell as it is
            continue;
        }

        // The bpp branch is resolved once per character; blank glyph
        // rows (bits == 0) are skipped without touching the backbuffer.
        if (bpp == 2) {
            uint16_t fg16 = (uint16_t)fg;

            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t bits = glyph[row];
                if (bits == 0) { continue; }

                uint16_t *dst = (uint16_t *)(d->backBuf + (y + row) * pitch + cx * 2);

                for (int32_t p = cStart; p < cEnd; p++) {
                    if (bits & sGlyphBit[p]) {
                        dst[p] = fg16;
                    }
                }
            }
        } else if (bpp == 4) {
            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t bits = glyph[row];
                if (bits == 0) { continue; }

                uint32_t *dst = (uint32_t *)(d->backBuf + (y + row) * pitch + cx * 4);

                for (int32_t p = cStart; p < cEnd; p++) {
                    if (bits & sGlyphBit[p]) {
                        dst[p] = fg;
                    }
                }
            }
        } else {
            uint8_t fg8 = (uint8_t)fg;

            for (int32_t row = rowStart; row < rowEnd; row++) {
                uint8_t bits = glyph[row];
                if (bits == 0) { continue; }

                uint8_t *dst = d->backBuf + (y + row) * pitch + cx;

                for (int32_t p = cStart; p < cEnd; p++) {
                    if (bits & sGlyphBit[p]) {
                        dst[p] = fg8;
                    }
                }
            }
        }
    }
}


// ============================================================
// drawFocusRect
// ============================================================
//
// Draws a dotted (every-other-pixel) rectangle to indicate keyboard
// focus, matching the Windows/Motif convention.  Uses putPixel per
// dot rather than spanFill because the alternating pattern can't be
// expressed as a span fill (which writes uniform color).
//
// The parity calculations on the bottom and right edges ensure the
// dot pattern is visually continuous around corners  -- the starting
// pixel of each edge is offset so dots don't double up or gap at
// the corner where two edges meet.
//
// This is not performance-critical; focus rects are drawn at most
// once per focused widget per repaint.

void drawFocusRect(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
    // Outlines the w x h rectangle at (x, y) with every other pixel set to
    // 'color'.  A pixel on the outline is plotted when (px - x) + (py - y)
    // is even, which keeps the dot pattern continuous around all four
    // corners.  Each pixel is tested against the clip rect individually.

    // Nothing to outline for an empty or inverted rectangle.  Without this
    // guard the edge code below would still plot pixels: h == 0 puts the
    // "bottom" edge on row y - 1, and w == 0 puts the "right" edge on
    // column x - 1.
    if (w <= 0 || h <= 0) {
        return;
    }

    int32_t bpp   = ops->bytesPerPixel;
    int32_t pitch = d->pitch;

    int32_t clipX1 = d->clipX;
    int32_t clipX2 = d->clipX + d->clipW;
    int32_t clipY1 = d->clipY;
    int32_t clipY2 = d->clipY + d->clipH;

    // Inclusive right/bottom coordinates of the outline
    int32_t x2 = x + w - 1;
    int32_t y2 = y + h - 1;

    // Top edge
    if (y >= clipY1 && y < clipY2) {
        for (int32_t px = x; px <= x2; px += 2) {
            if (px >= clipX1 && px < clipX2) {
                putPixel(d->backBuf + y * pitch + px * bpp, color, bpp);
            }
        }
    }

    // Bottom edge (skipped for a 1-pixel-tall rect, where it is the top edge)
    if (y2 >= clipY1 && y2 < clipY2 && y2 != y) {
        // Shift the start by one pixel when the bottom row is an odd
        // distance below the top row.
        int32_t parity = (y2 - y) & 1;

        for (int32_t px = x + parity; px <= x2; px += 2) {
            if (px >= clipX1 && px < clipX2) {
                putPixel(d->backBuf + y2 * pitch + px * bpp, color, bpp);
            }
        }
    }

    // Left edge (skip corners already drawn)
    if (x >= clipX1 && x < clipX2) {
        for (int32_t py = y + 2; py < y2; py += 2) {
            if (py >= clipY1 && py < clipY2) {
                putPixel(d->backBuf + py * pitch + x * bpp, color, bpp);
            }
        }
    }

    // Right edge (skip corners already drawn; skipped for a 1-pixel-wide
    // rect, where it is the left edge)
    if (x2 >= clipX1 && x2 < clipX2 && x2 != x) {
        // When the right column is an odd distance from the left column,
        // its dots fall on the odd rows, so start one row higher.
        int32_t parity = (x2 - x) & 1;

        for (int32_t py = y + 2 - parity; py < y2; py += 2) {
            if (py >= clipY1 && py < clipY2) {
                putPixel(d->backBuf + py * pitch + x2 * bpp, color, bpp);
            }
        }
    }
}


// ============================================================
// drawHLine
// ============================================================
//
// Thin convenience wrapper  -- a horizontal line is just a 1px-tall rect.
// Delegates to rectFill which handles clipping and uses spanFill (rep
// stosl) for the actual write.

void drawHLine(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, uint32_t color) {
    // A horizontal line is a filled rectangle exactly one scanline tall.
    const int32_t lineHeight = 1;

    rectFill(d, ops, x, y, w, lineHeight, color);
}


// ============================================================
// drawInit
// ============================================================
//
// Wires up the BlitOpsT function pointers to the correct
// platform-specific span operations for the active pixel format.
// Called once during startup after videoInit determines the bpp.
//
// The span ops are the only place where function pointers are used
// in the draw layer.  This is a deliberate performance tradeoff:
// spanFill and spanCopy are called per-scanline (not per-pixel),
// so the indirect call overhead (~5 cycles on Pentium for the
// mispredicted first call, then predicted afterward) is amortized
// over an entire row of pixels.  The alternative  -- a switch inside
// rectFill's inner loop  -- would branch every scanline for no gain.
//
// The platform implementations (dvxPlatformDos.c) use inline asm:
//   spanFill8/16/32  -> rep stosl (fills 4 bytes per clock)
//   spanCopy8/16/32  -> rep movsd (copies 4 bytes per clock)
// These are the fastest bulk memory operations available on 486/
// Pentium without SSE.  The 8-bit and 16-bit variants handle
// alignment preambles to get to dword boundaries, then use
// rep stosl/movsd for the bulk.

void drawInit(BlitOpsT *ops, const DisplayT *d) {
    // Copy the display geometry the span routines depend on.
    ops->bytesPerPixel = d->format.bytesPerPixel;
    ops->pitch         = d->pitch;

    // Pick the fill/copy pair matching the pixel size.  Anything other
    // than 2 or 4 bytes per pixel (including 1) gets the 8-bit routines.
    if (d->format.bytesPerPixel == 4) {
        ops->spanFill = platformSpanFill32;
        ops->spanCopy = platformSpanCopy32;
    } else if (d->format.bytesPerPixel == 2) {
        ops->spanFill = platformSpanFill16;
        ops->spanCopy = platformSpanCopy16;
    } else {
        ops->spanFill = platformSpanFill8;
        ops->spanCopy = platformSpanCopy8;
    }
}


// ============================================================
// drawMaskedBitmap
// ============================================================
//
// Renders a 1-bit masked bitmap (used for mouse cursors and icons).
// The two-plane format mirrors the hardware cursor format used by
// VGA and early SVGA cards:
//
//   andMask bit=1, xorData bit=X  -> transparent (pixel unchanged)
//   andMask bit=0, xorData bit=0  -> bgColor
//   andMask bit=0, xorData bit=1  -> fgColor
//
// Each row is a uint16_t (supporting up to 16 pixels wide), stored
// MSB-first.  This is sufficient for standard 16x16 mouse cursors.
//
// The colMask optimization pre-computes which bits in each row fall
// within the visible (clipped) columns.  For fully transparent rows
// (all visible bits have andMask=1), the entire row is skipped with
// a single bitwise AND + compare  -- no per-pixel iteration needed.

void drawMaskedBitmap(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, const uint16_t *andMask, const uint16_t *xorData, uint32_t fgColor, uint32_t bgColor) {
    // Draws the w x h masked bitmap with its top-left corner at (x, y),
    // clipped to the display clip rect.
    //   andMask / xorData : one uint16_t per bitmap row (indexed by row)
    //   fgColor / bgColor : packed colors, truncated to the pixel size
    //
    // A row is a single uint16_t, so only columns 0..15 carry mask data.
    // Wider requests are clamped below rather than indexing sMaskBit
    // past the 16 bit positions a row can hold.
    const int32_t maxCols = 16;

    int32_t bpp   = ops->bytesPerPixel;
    int32_t pitch = d->pitch;

    // Pre-clip row/col bounds
    int32_t clipX1 = d->clipX;
    int32_t clipX2 = d->clipX + d->clipW;
    int32_t clipY1 = d->clipY;
    int32_t clipY2 = d->clipY + d->clipH;

    int32_t rowStart = 0;
    int32_t rowEnd   = h;
    if (y < clipY1) { rowStart = clipY1 - y; }
    if (y + h > clipY2) { rowEnd = clipY2 - y; }

    int32_t colStart = 0;
    int32_t colEnd   = w;
    if (x < clipX1) { colStart = clipX1 - x; }
    if (x + w > clipX2) { colEnd = clipX2 - x; }

    // Never walk past the last bit a mask row can represent.
    if (colEnd > maxCols) { colEnd = maxCols; }

    // Nothing visible (also covers w <= 0 and h <= 0).
    if (colStart >= colEnd || rowStart >= rowEnd) {
        return;
    }

    // Pre-compute column mask once (loop-invariant): the set of row bits
    // that fall inside the visible column range.
    uint16_t colMask = 0;
    for (int32_t col = colStart; col < colEnd; col++) {
        colMask |= sMaskBit[col];
    }

    for (int32_t row = rowStart; row < rowEnd; row++) {
        uint16_t mask = andMask[row];
        uint16_t data = xorData[row];

        // Skip fully transparent rows: every visible bit has andMask=1.
        if ((mask & colMask) == colMask) {
            continue;
        }

        // dst addresses column 0 of this bitmap row; only columns in
        // [colStart, colEnd) are written below.
        int32_t  py  = y + row;
        uint8_t *dst = d->backBuf + py * pitch + x * bpp;

        if (bpp == 2) {
            uint16_t fg16 = (uint16_t)fgColor;
            uint16_t bg16 = (uint16_t)bgColor;
            for (int32_t col = colStart; col < colEnd; col++) {
                uint16_t bit = sMaskBit[col];
                // andMask bit clear -> opaque pixel; xorData picks fg/bg.
                if (!(mask & bit)) {
                    *(uint16_t *)(dst + col * 2) = (data & bit) ? fg16 : bg16;
                }
            }
        } else if (bpp == 4) {
            for (int32_t col = colStart; col < colEnd; col++) {
                uint16_t bit = sMaskBit[col];
                if (!(mask & bit)) {
                    *(uint32_t *)(dst + col * 4) = (data & bit) ? fgColor : bgColor;
                }
            }
        } else {
            // Any other pixel size is treated as one byte per pixel.
            uint8_t fg8 = (uint8_t)fgColor;
            uint8_t bg8 = (uint8_t)bgColor;
            for (int32_t col = colStart; col < colEnd; col++) {
                uint16_t bit = sMaskBit[col];
                if (!(mask & bit)) {
                    dst[col] = (data & bit) ? fg8 : bg8;
                }
            }
        }
    }
}


// ============================================================
// drawTermRow
// ============================================================
//
// Renders an entire row of terminal character cells in one pass.
// lineData points to (ch, attr) pairs.  palette is a 16-entry
// packed-color table.  This avoids per-character function call
// overhead, redundant clip calculation, and spanFill startup
// costs that make drawChar expensive when called 80x per row.
//
// This is the primary rendering function for the terminal emulator.
// The attribute byte uses the standard CGA/VGA format:
//   bits 0-3: foreground color (0-15)
//   bits 4-6: background color (0-7)
//   bit 7:    blink flag
//
// Unlike drawTextN (which handles uniform fg/bg), every cell here
// can have a different fg/bg pair, so the bg can't be filled in a
// single bulk pass.  Instead each cell is rendered individually,
// always in opaque mode (every pixel gets a write).  The bpp branch
// is still hoisted outside the per-pixel loop  -- it is taken once
// per cell, and the scanline and pixel loops run inside that path.
//
// blinkVisible controls the blink phase: when false, fg is replaced
// with bg for characters that have bit 7 set, effectively hiding them.
// cursorCol specifies which cell (if any) should be drawn with
// inverted fg/bg to show the text cursor.

void drawTermRow(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, int32_t cols, const uint8_t *lineData, const uint32_t *palette, bool blinkVisible, int32_t cursorCol) {
    const int32_t cellW  = font->charWidth;
    const int32_t cellH  = font->charHeight;
    const int32_t bpp    = ops->bytesPerPixel;
    const int32_t stride = d->pitch;

    const int32_t clipLeft   = d->clipX;
    const int32_t clipRight  = d->clipX + d->clipW;
    const int32_t clipTop    = d->clipY;
    const int32_t clipBottom = d->clipY + d->clipH;

    // Reject rows that lie completely above or below the clip rect.
    if (y + cellH <= clipTop || y >= clipBottom) {
        return;
    }

    // Range of glyph scanlines that survive vertical clipping.
    const int32_t scanFirst = (y < clipTop) ? clipTop - y : 0;
    const int32_t scanEnd   = (y + cellH > clipBottom) ? clipBottom - y : cellH;

    // Reject rows that lie completely left or right of the clip rect.
    const int32_t spanW = cols * cellW;

    if (x + spanW <= clipLeft || x >= clipRight) {
        return;
    }

    // Range of cells that are at least partially visible.
    const int32_t firstCell = (x < clipLeft) ? (clipLeft - x) / cellW : 0;
    int32_t       endCell   = cols;

    if (x + spanW > clipRight) {
        endCell = (clipRight - x + cellW - 1) / cellW;
        if (endCell > cols) {
            endCell = cols;
        }
    }

    // Pixels to skip at the left of the first visible cell when the
    // clip edge cuts through it.
    const int32_t firstCellX = x + firstCell * cellW;
    const int32_t leftSkip   = (firstCellX < clipLeft) ? clipLeft - firstCellX : 0;

    for (int32_t cell = firstCell; cell < endCell; cell++) {
        // Each cell is a (character, attribute) byte pair.
        const uint8_t *pair = lineData + cell * 2;
        const uint8_t  code = pair[0];
        const uint8_t  attr = pair[1];

        uint32_t ink   = palette[attr & 0x0F];
        uint32_t paper = palette[(attr >> 4) & 0x07];

        // Blink off-phase: paint the glyph in the background color.
        if ((attr & 0x80) && !blinkVisible) {
            ink = paper;
        }

        // Cursor cell: swap foreground and background.
        if (cell == cursorCol) {
            const uint32_t swapped = ink;
            ink   = paper;
            paper = swapped;
        }

        // Horizontal pixel range of this cell after clipping.
        const int32_t cellX   = x + cell * cellW;
        const int32_t pxFirst = (cell == firstCell) ? leftSkip : 0;
        const int32_t pxEnd   = (cellX + cellW > clipRight) ? clipRight - cellX : cellW;

        // Glyph rows for this character; NULL (drawn as all-background)
        // when the code is outside the font's range.
        const int32_t  glyphIdx = (uint8_t)code - font->firstChar;
        const uint8_t *glyph    = NULL;

        if (glyphIdx >= 0 && glyphIdx < font->numChars) {
            glyph = font->glyphData + glyphIdx * cellH;
        }

        switch (bpp) {
            case 2: {
                const uint16_t ink16   = (uint16_t)ink;
                const uint16_t paper16 = (uint16_t)paper;

                for (int32_t scan = scanFirst; scan < scanEnd; scan++) {
                    uint16_t     *out  = (uint16_t *)(d->backBuf + (y + scan) * stride + cellX * 2);
                    const uint8_t bits = glyph ? glyph[scan] : 0;

                    for (int32_t px = pxFirst; px < pxEnd; px++) {
                        out[px] = (bits & sGlyphBit[px]) ? ink16 : paper16;
                    }
                }
                break;
            }

            case 4: {
                for (int32_t scan = scanFirst; scan < scanEnd; scan++) {
                    uint32_t     *out  = (uint32_t *)(d->backBuf + (y + scan) * stride + cellX * 4);
                    const uint8_t bits = glyph ? glyph[scan] : 0;

                    for (int32_t px = pxFirst; px < pxEnd; px++) {
                        out[px] = (bits & sGlyphBit[px]) ? ink : paper;
                    }
                }
                break;
            }

            default: {
                // Every other pixel size is written one byte per pixel.
                const uint8_t ink8   = (uint8_t)ink;
                const uint8_t paper8 = (uint8_t)paper;

                for (int32_t scan = scanFirst; scan < scanEnd; scan++) {
                    uint8_t      *out  = d->backBuf + (y + scan) * stride + cellX;
                    const uint8_t bits = glyph ? glyph[scan] : 0;

                    for (int32_t px = pxFirst; px < pxEnd; px++) {
                        out[px] = (bits & sGlyphBit[px]) ? ink8 : paper8;
                    }
                }
                break;
            }
        }
    }
}


// ============================================================
// drawText
// ============================================================
//
// Renders a null-terminated string by calling drawChar per character.
// Simpler than drawTextN but slower for long runs because each
// drawChar call independently clips, computes row bounds, and
// dispatches on bpp.  Used for short labels and ad-hoc text where
// the call overhead doesn't matter; drawTextN is preferred for
// bulk text (editor buffers, list views, etc.).
//
// The left-of-clip skip avoids calling drawChar for characters that
// are entirely to the left of the visible area.  The right-of-clip
// early-out breaks the loop as soon as we've passed the right edge.
// These are both marked unlikely (__builtin_expect) because the
// common case is text fully within the clip rect.

void drawText(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, uint32_t fg, uint32_t bg, bool opaque) {
    const int32_t advance   = font->charWidth;
    const int32_t clipRight = d->clipX + d->clipW;

    for (; *text != '\0'; text++) {
        // Everything from here on is past the right clip edge.
        if (__builtin_expect(x >= clipRight, 0)) {
            break;
        }

        // Cell is wholly left of the clip rect: advance without drawing.
        if (__builtin_expect(x + advance <= d->clipX, 0)) {
            x += advance;
            continue;
        }

        // drawChar reports how far to advance the pen.
        x += drawChar(d, ops, font, x, y, *text, fg, bg, opaque);
    }
}


// ============================================================
// drawTextAccel
// ============================================================
//
// Like drawText but interprets & markers in the string: the character
// following & is drawn with an underline to indicate it's the keyboard
// accelerator (e.g. "&File" draws "File" with F underlined).  "&&"
// draws a literal &.  This matches the Windows/Motif convention for
// menu and button labels.
//
// The underline is drawn as a 1px horizontal line at the bottom of
// the character cell (y + charHeight - 1), which is the standard
// placement for accelerator underlines.

void drawTextAccel(DisplayT *d, const BlitOpsT *ops, const BitmapFontT *font, int32_t x, int32_t y, const char *text, uint32_t fg, uint32_t bg, bool opaque) {
    const int32_t advance   = font->charWidth;
    const int32_t clipRight = d->clipX + d->clipW;

    while (*text != '\0') {
        // Stop once the pen has passed the right clip edge.
        if (__builtin_expect(x >= clipRight, 0)) {
            break;
        }

        char glyph     = *text++;
        bool underline = false;

        if (glyph == '&') {
            // The character after the marker decides what is drawn:
            //   '&'  -> literal ampersand, no underline
            //   NUL  -> dangling marker, nothing more to draw
            //   else -> accelerator character, underlined
            glyph = *text;

            if (glyph == '\0') {
                break;
            }

            text++;
            underline = (glyph != '&');
        }

        // Skip drawing for cells entirely left of the clip rect.
        if (x + advance > d->clipX) {
            drawChar(d, ops, font, x, y, glyph, fg, bg, opaque);

            if (underline) {
                // 1px line along the bottom scanline of the cell.
                drawHLine(d, ops, x, y + font->charHeight - 1, advance, fg);
            }
        }

        x += advance;
    }
}


// ============================================================
// drawVLine
// ============================================================
//
// Draws a vertical line pixel-by-pixel.  Unlike drawHLine (which
// delegates to rectFill -> spanFill for a single-row span), a
// vertical line can't use spanFill because each pixel is on a
// different scanline.  Instead we advance by d->pitch per pixel
// and write directly, branching on bpp once at the top.
//
// The ops parameter is unused (suppressed with (void)ops) because
// spanFill operates on contiguous horizontal runs and is useless
// for vertical lines.  We keep the parameter for API consistency
// with the rest of the draw layer.

void drawVLine(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t h, uint32_t color) {
    (void)ops;

    // A column outside the clip rect has nothing to draw.
    if (__builtin_expect(x < d->clipX || x >= d->clipX + d->clipW, 0)) {
        return;
    }

    // Clamp the vertical extent [top, bottom) to the clip rect.
    const int32_t clipBottom = d->clipY + d->clipH;
    const int32_t top        = (y < d->clipY) ? d->clipY : y;
    const int32_t bottom     = (y + h > clipBottom) ? clipBottom : y + h;

    if (top >= bottom) {
        return;
    }

    const int32_t bpp    = d->format.bytesPerPixel;
    const int32_t stride = d->pitch;
    uint8_t      *cursor = d->backBuf + top * stride + x * bpp;
    int32_t       left   = bottom - top;

    // One store per scanline, stepping down by the pitch each time.
    switch (bpp) {
        case 2: {
            const uint16_t c16 = (uint16_t)color;
            while (left-- > 0) {
                *(uint16_t *)cursor = c16;
                cursor += stride;
            }
            break;
        }

        case 4:
            while (left-- > 0) {
                *(uint32_t *)cursor = color;
                cursor += stride;
            }
            break;

        default: {
            const uint8_t c8 = (uint8_t)color;
            while (left-- > 0) {
                *cursor = c8;
                cursor += stride;
            }
            break;
        }
    }
}


// ============================================================
// putPixel
// ============================================================
//
// Writes a single pixel at an already-computed buffer address.
// Only used by drawFocusRect for its alternating dot pattern.
// Marked static inline so it compiles to a direct store at the
// call site with no function call overhead.  The bpp chain here
// is acceptable because focus rect drawing is infrequent.

static inline void putPixel(uint8_t *dst, uint32_t color, int32_t bpp) {
    // Store the low bytes of color at dst, sized by bpp.  Anything that
    // is not 2 or 4 bytes per pixel is written as a single byte.
    switch (bpp) {
        case 2:
            *(uint16_t *)dst = (uint16_t)color;
            break;

        case 4:
            *(uint32_t *)dst = color;
            break;

        default:
            *dst = (uint8_t)color;
            break;
    }
}


// ============================================================
// rectCopy
// ============================================================
//
// Copies a rectangular region from an arbitrary source buffer into
// the display backbuffer.  Used by the compositor to blit per-window
// content buffers (win->contentBuf) into the shared backbuffer during
// the composite pass.
//
// Clipping adjusts both the destination and source positions by the
// same delta so the visible portion maps to the correct source pixels.
// When the source and destination pitches match and equal the row byte
// count, the entire block is copied in a single memcpy (which the
// compiler/libc can optimize to rep movsd).  Otherwise it falls back
// to per-row memcpy.
//
// This function does NOT handle overlapping source and destination
// regions (no memmove).  That's fine because the source is always a
// per-window content buffer and the destination is the shared
// backbuffer  -- they never overlap.

void rectCopy(DisplayT *d, const BlitOpsT *ops, int32_t dstX, int32_t dstY, const uint8_t *srcBuf, int32_t srcPitch, int32_t srcX, int32_t srcY, int32_t w, int32_t h) {
    const int32_t bpp = ops->bytesPerPixel;

    // Remember the requested origin so the clip delta can be measured.
    const int32_t reqX = dstX;
    const int32_t reqY = dstY;

    clipRect(d, &dstX, &dstY, &w, &h);

    if (__builtin_expect(w <= 0 || h <= 0, 0)) {
        return;
    }

    // Shift the source origin by however far clipping moved the
    // destination origin, so source and destination stay aligned.
    const int32_t clippedSrcX = srcX + (dstX - reqX);
    const int32_t clippedSrcY = srcY + (dstY - reqY);

    const int32_t  dstPitch = d->pitch;
    const int32_t  rowBytes = w * bpp;
    const uint8_t *from     = srcBuf + clippedSrcY * srcPitch + clippedSrcX * bpp;
    uint8_t       *to       = d->backBuf + dstY * dstPitch + dstX * bpp;

    // Rows are back-to-back in both buffers: copy the block in one call.
    if (rowBytes == dstPitch && rowBytes == srcPitch) {
        memcpy(to, from, rowBytes * h);
        return;
    }

    // Otherwise copy one row at a time, stepping each buffer by its pitch.
    for (int32_t rowsLeft = h; rowsLeft > 0; rowsLeft--) {
        memcpy(to, from, rowBytes);
        from += srcPitch;
        to   += dstPitch;
    }
}


// ============================================================
// rectFill
// ============================================================
//
// The workhorse fill primitive.  Clips to the display clip rect,
// then fills one scanline at a time via the spanFill function
// pointer (which routes to rep stosl on DOS).  This is the most
// frequently called function in the draw layer  -- besides direct
// calls, it backs drawHLine, drawBevel interior fills, and the bg
// fill in opaque text rendering.
//
// The clip + early-out pattern (clipRect then check w/h <= 0) is
// the same in every draw function.  The __builtin_expect marks the
// zero-size case as unlikely to avoid a taken-branch penalty in the
// common case where the rect is visible after clipping.

void rectFill(DisplayT *d, const BlitOpsT *ops, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color) {
    // Trim the rectangle to the clip rect; bail out if nothing is left.
    clipRect(d, &x, &y, &w, &h);

    if (__builtin_expect(w <= 0 || h <= 0, 0)) {
        return;
    }

    const int32_t stride = d->pitch;
    uint8_t      *line   = d->backBuf + y * stride + x * d->format.bytesPerPixel;

    // One span fill per scanline, top to bottom.
    for (int32_t rowsLeft = h; rowsLeft > 0; rowsLeft--) {
        ops->spanFill(line, color, w);
        line += stride;
    }
}


// ============================================================
// textWidth
// ============================================================
//
// Returns the pixel width of a null-terminated string.  Because all
// fonts are fixed-width, this is just strlen * charWidth, computed
// here in a single manual pass over the string.  This is used
// heavily for layout calculations (centering text in buttons,
// sizing menu popups, etc.).

int32_t textWidth(const BitmapFontT *font, const char *text) {
    const int32_t advance = font->charWidth;
    int32_t       count   = 0;

    // Count characters up to the terminator; each one is one cell wide.
    for (const char *p = text; *p != '\0'; p++) {
        count++;
    }

    return count * advance;
}


// ============================================================
// textWidthAccel
// ============================================================
//
// Like textWidth but accounts for & accelerator markers: a single &
// is not rendered (it just marks the next character as the accelerator),
// so it doesn't contribute to width.  "&&" renders as one "&" character.
// Used to compute the correct pixel width for menu items and button
// labels that contain accelerator markers.

int32_t textWidthAccel(const BitmapFontT *font, const char *text) {
    const int32_t advance = font->charWidth;
    int32_t       cells   = 0;

    while (*text != '\0') {
        const char c = *text++;

        if (c == '&') {
            // A marker consumes the character after it: "&&" and "&x"
            // each occupy one cell.  A marker at the very end of the
            // string contributes nothing.
            if (*text == '\0') {
                break;
            }

            text++;
        }

        cells++;
    }

    return cells * advance;
}