// dvx_comp.c  -- Layer 3: Dirty rectangle compositor for DVX GUI (optimized)
//
// This layer implements dirty rectangle tracking and merging. The compositor
// avoids full-screen redraws, which would be prohibitively expensive on the
// target 486/Pentium hardware over ISA bus VESA LFB. A full 640x480x16bpp
// framebuffer is ~600KB  -- at ISA's ~8MB/s theoretical peak, a blind full
// flush costs ~75ms (>4 frames at 60Hz). By tracking which rectangles have
// actually changed and flushing only those regions from the system RAM
// backbuffer to the LFB, the bandwidth consumed per frame scales with the
// amount of visual change rather than the screen resolution.
//
// The compositing loop lives in dvxApp.c (compositeAndFlush). For each dirty
// rect, it repaints the desktop, then walks the window stack bottom-to-top
// painting chrome, content, scrollbars, popup menus, and the cursor  -- all
// clipped to the dirty rect. Only then is the dirty rect flushed to the LFB.
// This means each pixel in a dirty region is written to system RAM potentially
// multiple times (painter's algorithm), but the expensive LFB write happens
// exactly once per pixel per frame.

#include "dvxComp.h"
#include "platform/dvxPlatform.h"

#include <string.h>

// Rects within this many pixels of each other get merged even if they don't
// overlap. A small gap tolerance absorbs jitter from mouse movement and
// closely-spaced UI invalidations (e.g. title bar + content during a drag)
// without bloating merged rects excessively. The value 4 was chosen to match
// the chrome border width  -- adjacent chrome/content invalidations merge
// naturally.
#define DIRTY_MERGE_GAP 4

// ============================================================
// Prototypes
// ============================================================

static inline bool rectsOverlapOrAdjacent(const RectT *a, const RectT *b, int32_t gap);
static inline void rectUnion(const RectT *a, const RectT *b, RectT *result);


// ============================================================
// dirtyListAdd
// ============================================================
//
// Appends a dirty rect to the list. Uses a fixed-size array (MAX_DIRTY_RECTS
// = 128) rather than a dynamic allocation  -- this is called on every UI
// mutation (drag, repaint, focus change) so allocation overhead must be zero.
//
// When the list fills up, an eager merge pass tries to consolidate rects.
// If the list is STILL full after merging (pathological scatter), the
// nuclear option collapses everything into one bounding box. This guarantees
// the list never overflows, at the cost of potentially over-painting a large
// rect. In practice the merge pass almost always frees enough slots because
// GUI mutations tend to cluster spatially.

void dirtyListAdd(DirtyListT *dl, int32_t x, int32_t y, int32_t w, int32_t h) {
    // Rects with no area contribute no pixels; drop them. Hinted as the
    // unlikely path.
    bool hasNoArea = (w <= 0) || (h <= 0);

    if (__builtin_expect(hasNoArea, 0)) {
        return;
    }

    if (__builtin_expect(dl->count >= MAX_DIRTY_RECTS, 0)) {
        // No free slot: give the merge pass a chance to free one first.
        dirtyListMerge(dl);

        if (dl->count >= MAX_DIRTY_RECTS) {
            // Merging freed nothing. Fold every stored rect, and then the
            // incoming one, into slot 0 so the list shrinks to a single
            // bounding box. rectUnion computes its result before writing,
            // so accumulating directly into slot 0 is safe.
            RectT  *bounds   = &dl->rects[0];
            RectT   incoming = {x, y, w, h};
            int32_t idx      = 1;

            while (idx < dl->count) {
                rectUnion(bounds, &dl->rects[idx], bounds);
                idx++;
            }

            rectUnion(bounds, &incoming, bounds);
            dl->count = 1;
            return;
        }
    }

    // Normal path: write the rect into the next free slot.
    RectT *slot = &dl->rects[dl->count];

    slot->x = x;
    slot->y = y;
    slot->w = w;
    slot->h = h;
    dl->count++;
}


// ============================================================
// dirtyListClear
// ============================================================

// Empties the list by resetting the entry count to zero. The rects array
// itself is not touched, so stale entries remain in memory until overwritten.
void dirtyListClear(DirtyListT *dl) {
    dl->count = 0;
}


// ============================================================
// dirtyListInit
// ============================================================

// Puts the list into its empty state by setting the entry count to zero.
// Only the count is written; the rects array is left uninitialized.
void dirtyListInit(DirtyListT *dl) {
    dl->count = 0;
}


// ============================================================
// dirtyListMerge
// ============================================================
//
// Coalesces overlapping or nearby dirty rects to reduce the number of
// composite+flush passes. The trade-off: merging two rects into their
// bounding box may add "clean" pixels that get needlessly repainted, but
// this is far cheaper than the per-rect overhead of an extra composite
// pass (clip setup, window-stack walk, LFB flush). On 486/Pentium ISA,
// the LFB write latency per-rect dominates, so fewer larger rects win.
//
// Algorithm: O(N^2) pairwise sweep with bounded restarts. For each rect i,
// scan all rects j>i and merge any that overlap or are within DIRTY_MERGE_GAP
// pixels. When a merge happens, rect i grows and may now overlap rects that
// it previously missed, so the inner scan restarts  -- but each slot gets at
// most 3 scan passes (the initial scan plus 2 restarts) to prevent O(N^3)
// cascading in pathological layouts (e.g. a diagonal scatter of tiny rects).
// The cap of 3 was chosen empirically:
// typical GUI operations produce clustered invalidations that converge in
// 1-2 passes; 3 gives a safety margin without measurable overhead.
//
// Merged-away rects are removed by swap-with-last (O(1) removal from an
// unordered list), which is why the rects array is not kept sorted.

void dirtyListMerge(DirtyListT *dl) {
    // Upper bound on scan passes per anchor slot (initial scan included).
    const int32_t maxPasses = 3;

    // Nothing to coalesce with fewer than two entries.
    if (dl->count < 2) {
        return;
    }

    for (int32_t i = 0; i < dl->count; i++) {
        RectT *anchor = &dl->rects[i];

        for (int32_t pass = 0; pass < maxPasses; pass++) {
            bool    grew = false;
            int32_t j    = i + 1;

            while (j < dl->count) {
                RectT *candidate = &dl->rects[j];

                if (!rectsOverlapOrAdjacent(anchor, candidate, DIRTY_MERGE_GAP)) {
                    j++;
                    continue;
                }

                // Absorb the candidate into the anchor, then fill its slot
                // with the current last entry and shrink the list. j is not
                // advanced so the entry just moved into slot j is examined
                // on the next iteration.
                rectUnion(anchor, candidate, anchor);
                dl->count--;
                *candidate = dl->rects[dl->count];
                grew = true;
            }

            // A pass that merged nothing means the anchor is stable.
            if (!grew) {
                break;
            }
        }
    }
}


// ============================================================
// flushRect
// ============================================================
//
// Copies one dirty rect from the system RAM backbuffer to the VESA LFB.
// This is the single most bandwidth-sensitive operation in the entire GUI:
// the LFB lives behind the ISA/PCI bus, so every byte written here is a
// bus transaction. The platform layer (platformFlushRect) uses rep movsd
// on 486+ to move aligned 32-bit words, maximizing bus utilization.
//
// Crucially, we flush per dirty rect AFTER all painting for that rect is
// complete. This avoids visible tearing  -- the LFB is never in a half-painted
// state for any given region.

void flushRect(DisplayT *d, const RectT *r) {
    // Pure pass-through: both arguments are forwarded unchanged, and no
    // clipping or validation of the rect is performed at this layer.
    platformFlushRect(d, r);
}


// ============================================================
// rectIntersect
// ============================================================
//
// Used heavily in the compositing loop to test whether a window overlaps
// a dirty rect before painting it. The branch hint marks the non-overlapping
// case as unlikely because the compositing loop already does a coarse AABB
// check before calling this  -- when we get here, intersection is expected.
// The min/max ternaries are simple enough for the compiler to lower without
// branches where the target allows it.

bool rectIntersect(const RectT *a, const RectT *b, RectT *result) {
    // Exclusive far edges of each input.
    int32_t aRight  = a->x + a->w;
    int32_t bRight  = b->x + b->w;
    int32_t aBottom = a->y + a->h;
    int32_t bBottom = b->y + b->h;

    // Overlap region: larger of the near edges, smaller of the far edges.
    int32_t left = a->x;
    if (b->x > left) {
        left = b->x;
    }

    int32_t top = a->y;
    if (b->y > top) {
        top = b->y;
    }

    int32_t right = bRight;
    if (aRight < bRight) {
        right = aRight;
    }

    int32_t bottom = bBottom;
    if (aBottom < bBottom) {
        bottom = aBottom;
    }

    // A zero or negative extent on either axis means no overlap; *result is
    // left untouched in that case. Hinted as the unlikely outcome.
    bool overlaps = (left < right) && (top < bottom);

    if (__builtin_expect(!overlaps, 0)) {
        return false;
    }

    // Every input was read above, so result may alias a or b.
    result->x = left;
    result->y = top;
    result->w = right - left;
    result->h = bottom - top;

    return true;
}


// ============================================================
// rectIsEmpty
// ============================================================

bool rectIsEmpty(const RectT *r) {
    // A rect covers pixels only when both extents are strictly positive.
    bool hasArea = (r->w > 0) && (r->h > 0);

    return !hasArea;
}


// ============================================================
// rectsOverlapOrAdjacent
// ============================================================
//
// Separating-axis test with a gap tolerance. Two rects merge if they
// overlap OR if the gap between them is <= DIRTY_MERGE_GAP pixels.
// The gap tolerance is the key tuning parameter for the merge algorithm:
// too small and you get many tiny rects (expensive per-rect flush overhead);
// too large and you merge distant rects into one huge bounding box
// (wasted repaint of clean pixels). The early-out on each axis makes this
// very cheap for non-overlapping rects, which is the common case during
// the inner loop of dirtyListMerge.

static inline bool rectsOverlapOrAdjacent(const RectT *a, const RectT *b, int32_t gap) {
    // X axis: each rect's far edge, padded by gap, must reach the other's
    // near edge. Short-circuit evaluation keeps the early-out behavior.
    bool nearOnX = (a->x + a->w + gap >= b->x) && (b->x + b->w + gap >= a->x);

    if (!nearOnX) {
        return false;
    }

    // Y axis: same test; the rects qualify only if both axes pass.
    return (a->y + a->h + gap >= b->y) && (b->y + b->h + gap >= a->y);
}


// ============================================================
// rectUnion
// ============================================================
//
// Axis-aligned bounding box of two rects. Supports in-place operation
// (result == a) for the merge loop. Note that this may grow the rect
// substantially if the two inputs are far apart  -- this is the inherent
// cost of bounding-box merging vs. maintaining a true region (list of
// non-overlapping rects). Bounding-box was chosen because the merge
// list is bounded to 128 entries and the extra repaint cost of a few
// clean pixels is negligible compared to the complexity of a proper
// region algebra on 486-class hardware.

static inline void rectUnion(const RectT *a, const RectT *b, RectT *result) {
    // Exclusive far edges of each input.
    int32_t aRight  = a->x + a->w;
    int32_t bRight  = b->x + b->w;
    int32_t aBottom = a->y + a->h;
    int32_t bBottom = b->y + b->h;

    // Bounding box: smaller of the near edges, larger of the far edges.
    int32_t left = b->x;
    if (a->x < b->x) {
        left = a->x;
    }

    int32_t top = b->y;
    if (a->y < b->y) {
        top = a->y;
    }

    int32_t right = bRight;
    if (aRight > bRight) {
        right = aRight;
    }

    int32_t bottom = bBottom;
    if (aBottom > bBottom) {
        bottom = aBottom;
    }

    // All reads of a and b happen above, so result may alias either input.
    result->x = left;
    result->y = top;
    result->w = right - left;
    result->h = bottom - top;
}