Replace per-pixel branching with nibble lookup table and inline ASM

Rewrite RenderRow inner loop: split each glyph byte into two nibbles,
look up 4 pre-resolved palette bytes per nibble from a 64-byte table,
and write as word stores — zero branching in the hot path. Replace 25
per-row GlobalAlloc buffers with a single reusable buffer and move glyph
data into a GlobalAlloc'd block shared with the nibble table. All
arithmetic is 16-bit Word (no Longint). Uses mini-frame technique to
safely access local variables from inline ASM after DS/SI/DI clobber.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Scott Duensing 2026-02-27 18:33:59 -06:00
parent 0ff633f605
commit c5d31ca930

View file

@ -7,10 +7,10 @@ unit KPAnsi;
{ Renders incoming data using standard ANSI/VT100 escape sequences for }
{ cursor positioning, color attributes, and screen manipulation. }
{ }
{ Rendering uses a font atlas with direct pixel writes into 8bpp DIB row }
{ buffers, minimizing GDI calls to a single SetDIBitsToDevice per dirty }
{ row. This eliminates the per-call overhead of TextOut/SetTextColor/ }
{ SetBkColor that dominates rendering time on Win16. }
{ Rendering uses a font atlas with a nibble lookup table and inline ASM }
{ to expand glyph bitmaps directly into a reusable 8bpp DIB pixel buffer. }
{ This eliminates per-pixel branching and 32-bit arithmetic from the inner }
{ loop, with one SetDIBitsToDevice call per dirty row. }
{ }
{ Installs to the "KP" palette tab alongside TKPComm. }
@ -78,11 +78,14 @@ type
FDirtyRow: array[0..255] of Boolean;
FAllDirty: Boolean;
FTextBlinkOn: Boolean;
FGlyphBits: array[0..255, 0..31] of Byte;
FRowBufH: array[0..255] of THandle;
FRowBuf: array[0..255] of Pointer;
FGlyphBufH: THandle;
FGlyphBuf: Pointer;
FRowBufH: THandle;
FRowBuf: Pointer;
FDibInfo: TDibInfo;
FRowBufSize: Integer;
FNibbleFG: Byte;
FNibbleBG: Byte;
procedure AllocLine(Line: PTermLine);
procedure BuildAtlas;
procedure ClearLine(Line: PTermLine);
@ -266,10 +269,11 @@ end;
procedure TKPAnsi.BuildAtlas;
{ Render all 256 CP437 characters into a monochrome bitmap, then extract }
{ per-glyph pixel masks into FGlyphBits. Each entry FGlyphBits[ch, row] }
{ is an 8-bit mask: MSB = leftmost pixel, 1 = foreground, 0 = background. }
{ This is a one-time GDI cost at startup; after extraction, the bitmap }
{ and DC are deleted and never needed again. }
{ per-glyph pixel masks into the glyph block at offset 64. Each glyph }
{ byte is an 8-bit mask: MSB = leftmost pixel, 1 = FG, 0 = BG. The }
{ nibble lookup table at offset 0..63 is built at render time. This is a }
{ one-time GDI cost at startup; after extraction, the bitmap and DC are }
{ deleted and never needed again. }
var
AtlasBmp: HBitmap;
AtlasDC: HDC;
@ -279,17 +283,38 @@ var
Row: Integer;
RawH: THandle;
RawPtr: PPixelBuf;
GlyphPtr: PPixelBuf;
Stride: Longint;
BmpSize: Longint;
Ch: Char;
begin
FillChar(FGlyphBits, SizeOf(FGlyphBits), 0);
{ Free old glyph block }
if FGlyphBufH <> 0 then
begin
GlobalUnlock(FGlyphBufH);
GlobalFree(FGlyphBufH);
FGlyphBufH := 0;
FGlyphBuf := nil;
end;
if FPaintFont = 0 then
Exit;
if (FCellWidth < 1) or (FCellHeight < 1) or (FCellHeight > 32) then
Exit;
{ Allocate glyph block: 64 bytes nibble table + 256*32 glyph data }
FGlyphBufH := GlobalAlloc(GMEM_FIXED or GMEM_ZEROINIT, 8256);
if FGlyphBufH = 0 then
Exit;
FGlyphBuf := GlobalLock(FGlyphBufH);
if FGlyphBuf = nil then
begin
GlobalFree(FGlyphBufH);
FGlyphBufH := 0;
Exit;
end;
GlyphPtr := PPixelBuf(FGlyphBuf);
{ Create monochrome bitmap: 256 chars side-by-side, FCellHeight tall }
AtlasBmp := CreateBitmap(256 * FCellWidth, FCellHeight, 1, 1, nil);
if AtlasBmp = 0 then
@ -325,13 +350,14 @@ begin
if RawPtr <> nil then
begin
GetBitmapBits(AtlasBmp, BmpSize, RawPtr);
{ Extract per-glyph bytes. For 8-pixel-wide fonts each glyph is }
{ exactly one byte per scan line, aligned to byte boundaries. }
{ Extract per-glyph bytes into glyph block at offset 64. }
{ For 8-pixel-wide fonts each glyph is exactly one byte per scan }
{ line, aligned to byte boundaries. }
for I := 0 to 255 do
begin
for Row := 0 to FCellHeight - 1 do
begin
FGlyphBits[I, Row] := RawPtr^[Row * Stride + I];
GlyphPtr^[64 + I * 32 + Row] := RawPtr^[Row * Stride + I];
end;
end;
GlobalUnlock(RawH);
@ -443,6 +469,12 @@ begin
FAllDirty := True;
FTextBlinkOn := True;
FRowBufSize := 0;
FGlyphBufH := 0;
FGlyphBuf := nil;
FRowBufH := 0;
FRowBuf := nil;
FNibbleFG := 255;
FNibbleBG := 255;
{ Set a monospace font -- OEM charset selected in CreatePaintFont }
Font.Name := 'Terminal';
@ -516,21 +548,26 @@ end;
procedure TKPAnsi.CreateRowBuffers;
var
I: Integer;
begin
DestroyRowBuffers;
{ Free old row buffer (glyph block is managed by BuildAtlas) }
if FRowBufH <> 0 then
begin
GlobalUnlock(FRowBufH);
GlobalFree(FRowBufH);
FRowBufH := 0;
FRowBuf := nil;
end;
FRowBufSize := FCols * FCellWidth * FCellHeight;
if FRowBufSize < 1 then
Exit;
for I := 0 to FRows - 1 do
begin
FRowBufH[I] := GlobalAlloc(GMEM_FIXED or GMEM_ZEROINIT, FRowBufSize);
if FRowBufH[I] <> 0 then
FRowBuf[I] := GlobalLock(FRowBufH[I])
{ Single reusable buffer for one terminal row }
FRowBufH := GlobalAlloc(GMEM_FIXED or GMEM_ZEROINIT, FRowBufSize);
if FRowBufH <> 0 then
FRowBuf := GlobalLock(FRowBufH)
else
FRowBuf[I] := nil;
end;
FRowBuf := nil;
end;
@ -606,18 +643,20 @@ end;
procedure TKPAnsi.DestroyRowBuffers;
var
I: Integer;
begin
for I := 0 to 255 do
if FRowBufH <> 0 then
begin
if FRowBufH[I] <> 0 then
begin
GlobalUnlock(FRowBufH[I]);
GlobalFree(FRowBufH[I]);
FRowBufH[I] := 0;
FRowBuf[I] := nil;
GlobalUnlock(FRowBufH);
GlobalFree(FRowBufH);
FRowBufH := 0;
FRowBuf := nil;
end;
if FGlyphBufH <> 0 then
begin
GlobalUnlock(FGlyphBufH);
GlobalFree(FGlyphBufH);
FGlyphBufH := 0;
FGlyphBuf := nil;
end;
end;
@ -1157,8 +1196,9 @@ end;
procedure TKPAnsi.FlipToScreen;
{ Render dirty rows into 8bpp DIB buffers, then blast to screen via }
{ SetDIBitsToDevice. One GDI call per dirty row, zero for rendering. }
{ Render dirty rows into the shared 8bpp DIB buffer, blasting each to the }
{ screen via SetDIBitsToDevice immediately after rendering. One GDI call }
{ per dirty row, zero for the pixel expansion itself. }
var
DC: HDC;
Row: Integer;
@ -1167,9 +1207,9 @@ var
begin
if not HandleAllocated then
Exit;
if FRowBuf[0] = nil then
if FRowBuf = nil then
RecalcCellSize;
if FRowBuf[0] = nil then
if FRowBuf = nil then
Exit;
{ Scrollback view: force full redraw, ignore pending scroll }
@ -1179,8 +1219,7 @@ begin
FPendingScroll := 0;
end;
{ Deferred scroll: shift existing screen pixels up. Row buffers are }
{ per-row so they don't need scrolling; only the screen DC is shifted. }
{ Deferred scroll: shift existing screen pixels up }
if (FPendingScroll > 0) and not FAllDirty then
begin
R.Left := 0;
@ -1194,32 +1233,21 @@ begin
end;
FPendingScroll := 0;
{ Render dirty rows into per-row pixel buffers (pure memory, zero GDI) }
for Row := 0 to FRows - 1 do
begin
if (FAllDirty or FDirtyRow[Row]) and (FRowBuf[Row] <> nil) then
begin
RenderRow(Row);
end;
end;
{ Blast dirty rows to screen }
{ Interleaved render + blast: single buffer is reused per row }
DC := GetDC(Handle);
for Row := 0 to FRows - 1 do
begin
if FAllDirty or FDirtyRow[Row] then
begin
if FRowBuf[Row] <> nil then
begin
RenderRow(Row);
SetDIBitsToDevice(DC,
0, Row * FCellHeight,
FCols * FCellWidth, FCellHeight,
0, 0,
0, FCellHeight,
FRowBuf[Row],
FRowBuf,
PBitmapInfo(@FDibInfo)^,
0); { DIB_RGB_COLORS }
end;
FDirtyRow[Row] := False;
end;
end;
@ -1414,36 +1442,26 @@ procedure TKPAnsi.Paint;
var
Row: Integer;
begin
if FRowBuf[0] = nil then
if FRowBuf = nil then
RecalcCellSize;
if FRowBuf[0] = nil then
if FRowBuf = nil then
Exit;
{ Full repaint: render all rows into buffers, then blast to canvas }
{ Full repaint: render each row into the shared buffer and blast it }
FPendingScroll := 0;
FAllDirty := True;
for Row := 0 to FRows - 1 do
begin
if FRowBuf[Row] <> nil then
begin
RenderRow(Row);
end;
end;
for Row := 0 to FRows - 1 do
begin
if FRowBuf[Row] <> nil then
begin
SetDIBitsToDevice(Canvas.Handle,
0, Row * FCellHeight,
FCols * FCellWidth, FCellHeight,
0, 0,
0, FCellHeight,
FRowBuf[Row],
FRowBuf,
PBitmapInfo(@FDibInfo)^,
0); { DIB_RGB_COLORS }
end;
FDirtyRow[Row] := False;
end;
FAllDirty := False;
@ -1775,31 +1793,56 @@ end;
procedure TKPAnsi.RenderRow(Row: Integer);
{ Core atlas renderer. For each cell in the row, look up the glyph in }
{ the monochrome atlas and write FG/BG palette indices directly into the }
{ 8bpp row buffer. Zero GDI calls. Cursor overlay is also rendered here }
{ by swapping FG/BG for the cursor cell. }
{ Core atlas renderer with nibble lookup table and inline ASM. For each }
{ cell in the row, the Pascal outer loop resolves colors and rebuilds the }
{ 16-entry nibble table on color change. The inline ASM inner loop }
{ expands one glyph (all scanlines) by splitting each glyph byte into }
{ high and low nibbles, looking up 4 pre-resolved pixels per nibble, and }
{ writing them as word stores. Zero branching in the inner loop. }
{ }
{ Register allocation in ASM block: }
{ DS = glyph block segment (table at 0..63, glyph data at 64+) }
{ SI = glyph data offset (increments through scanlines) }
{ ES = pixel buffer segment }
{ DI = pixel buffer offset (decrements by Stride for bottom-up DIB) }
{ BX = table index (BH=0, BL = nibble * 4) }
{ CX = scanline counter }
{ AX/DX = temporaries }
{ }
{ Critical: Delphi 1.0 may allocate local variables to SI/DI as register }
{ variables. The ASM block clobbers SI/DI for its own purposes, so ALL }
{ local variable values are pushed to an explicit mini-frame (via PUSH) }
{ BEFORE any register clobber, then accessed via BP-relative offsets. }
{ BP-relative addressing defaults to SS segment, safe after DS change. }
var
Line: PTermLine;
Pix: PPixelBuf;
Stride: Longint;
Col: Integer;
GlyphRow: Integer;
BufScanline: Integer;
Bits: Byte;
Offset: Longint;
Pixel: Integer;
FGIdx: Byte;
BGIdx: Byte;
CharCode: Integer;
SbkCount: Integer;
VisRow: Integer;
TabPtr: PPixelBuf;
I: Integer;
Ofs: Integer;
GlyphSeg: Word;
PixSeg: Word;
GlyphOfs: Word;
PixOfs: Word;
Stride: Word;
CellH: Word;
begin
if FRowBuf[Row] = nil then
if FRowBuf = nil then
Exit;
if FGlyphBuf = nil then
Exit;
Pix := FRowBuf[Row];
Stride := Longint(FCols) * FCellWidth;
Stride := Word(FCols) * Word(FCellWidth);
CellH := FCellHeight;
{ Extract segments from far pointers -- pure Pascal, no register clobber }
GlyphSeg := Seg(PPixelBuf(FGlyphBuf)^);
PixSeg := Seg(PPixelBuf(FRowBuf)^);
{ Determine which line to render (handles scrollback view) }
if FScrollPos <> 0 then
@ -1826,10 +1869,14 @@ begin
if Line = nil then
begin
{ Blank row: fill with background color 0 (black) }
FillChar(Pix^, FRowBufSize, 0);
FillChar(PPixelBuf(FRowBuf)^, FRowBufSize, 0);
Exit;
end;
{ Force nibble table rebuild on first cell }
FNibbleFG := 255;
FNibbleBG := 255;
for Col := 0 to FCols - 1 do
begin
{ Determine effective colors }
@ -1840,44 +1887,189 @@ begin
BGIdx := Line^.Cells[Col].BG;
CharCode := Ord(Line^.Cells[Col].Ch);
{ Render glyph into buffer -- bottom-up for DIB format }
for GlyphRow := 0 to FCellHeight - 1 do
{ Rebuild nibble table on color change: 16 entries x 4 bytes }
if (FGIdx <> FNibbleFG) or (BGIdx <> FNibbleBG) then
begin
BufScanline := FCellHeight - 1 - GlyphRow;
Bits := FGlyphBits[CharCode, GlyphRow];
Offset := Longint(BufScanline) * Stride + Longint(Col) * FCellWidth;
for Pixel := 0 to FCellWidth - 1 do
TabPtr := PPixelBuf(FGlyphBuf);
for I := 0 to 15 do
begin
if (Bits and $80) <> 0 then
Pix^[Offset + Pixel] := FGIdx
else
Pix^[Offset + Pixel] := BGIdx;
Bits := Bits shl 1;
Ofs := I * 4;
if (I and 8) <> 0 then TabPtr^[Ofs] := FGIdx
else TabPtr^[Ofs] := BGIdx;
if (I and 4) <> 0 then TabPtr^[Ofs + 1] := FGIdx
else TabPtr^[Ofs + 1] := BGIdx;
if (I and 2) <> 0 then TabPtr^[Ofs + 2] := FGIdx
else TabPtr^[Ofs + 2] := BGIdx;
if (I and 1) <> 0 then TabPtr^[Ofs + 3] := FGIdx
else TabPtr^[Ofs + 3] := BGIdx;
end;
FNibbleFG := FGIdx;
FNibbleBG := BGIdx;
end;
{ Compute offsets -- all 16-bit, no Longint }
GlyphOfs := 64 + Word(CharCode) shl 5;
PixOfs := Word(CellH - 1) * Stride + Word(Col) * 8;
asm
{ Push all values to explicit mini-frame BEFORE any register }
{ clobber. BASM reads register variables from SI/DI correctly }
{ here since nothing has been overwritten yet. }
push Stride
push CellH
push PixSeg
push GlyphSeg
push PixOfs
push GlyphOfs
push bp
mov bp, sp
{ Mini-frame layout (all accessed via SS:[BP+n]): }
{ [bp] = saved original BP }
{ [bp+2] = GlyphOfs }
{ [bp+4] = PixOfs }
{ [bp+6] = GlyphSeg }
{ [bp+8] = PixSeg }
{ [bp+10] = CellH }
{ [bp+12] = Stride }
push ds
push bx
push si
push di
mov si, [bp+2]
mov es, [bp+8]
mov di, [bp+4]
mov cx, [bp+10]
xor bh, bh
mov ds, [bp+6]
@rowloop:
mov al, [si] { load glyph byte from DS:SI }
inc si
mov ah, al { save copy }
{ High nibble -> 4 pixels }
and al, $F0
shr al, 1
shr al, 1 { AL = high_nibble * 4 }
mov bl, al
mov dx, [bx] { 2 table bytes (DS:BX, table at offset 0) }
mov es:[di], dx
mov dx, [bx+2] { 2 more table bytes }
mov es:[di+2], dx
{ Low nibble -> 4 pixels }
mov al, ah
and al, $0F
shl al, 1
shl al, 1 { AL = low_nibble * 4 }
mov bl, al
mov dx, [bx]
mov es:[di+4], dx
mov dx, [bx+2]
mov es:[di+6], dx
sub di, [bp+12] { Stride via SS:[BP+12] -- safe after DS change }
dec cx
jnz @rowloop
pop di
pop si
pop bx
pop ds
pop bp
add sp, 12 { remove 6 mini-frame words }
end;
end;
{ Cursor overlay: if cursor is on this row and visible, re-render the }
{ cursor cell with swapped FG/BG using the same atlas lookup. }
{ cursor cell with swapped FG/BG using the same ASM inner loop. }
if FCursorVisible and FBlinkOn and (FScrollPos = 0) and
(Row = FCursorRow) and (FCursorCol >= 0) and (FCursorCol < FCols) then
begin
FGIdx := Line^.Cells[FCursorCol].BG;
BGIdx := Line^.Cells[FCursorCol].FG;
CharCode := Ord(Line^.Cells[FCursorCol].Ch);
for GlyphRow := 0 to FCellHeight - 1 do
{ Rebuild nibble table for cursor colors }
TabPtr := PPixelBuf(FGlyphBuf);
for I := 0 to 15 do
begin
BufScanline := FCellHeight - 1 - GlyphRow;
Bits := FGlyphBits[CharCode, GlyphRow];
Offset := Longint(BufScanline) * Stride + Longint(FCursorCol) * FCellWidth;
for Pixel := 0 to FCellWidth - 1 do
begin
if (Bits and $80) <> 0 then
Pix^[Offset + Pixel] := FGIdx
else
Pix^[Offset + Pixel] := BGIdx;
Bits := Bits shl 1;
Ofs := I * 4;
if (I and 8) <> 0 then TabPtr^[Ofs] := FGIdx
else TabPtr^[Ofs] := BGIdx;
if (I and 4) <> 0 then TabPtr^[Ofs + 1] := FGIdx
else TabPtr^[Ofs + 1] := BGIdx;
if (I and 2) <> 0 then TabPtr^[Ofs + 2] := FGIdx
else TabPtr^[Ofs + 2] := BGIdx;
if (I and 1) <> 0 then TabPtr^[Ofs + 3] := FGIdx
else TabPtr^[Ofs + 3] := BGIdx;
end;
FNibbleFG := FGIdx;
FNibbleBG := BGIdx;
GlyphOfs := 64 + Word(CharCode) shl 5;
PixOfs := Word(CellH - 1) * Stride + Word(FCursorCol) * 8;
asm
push Stride
push CellH
push PixSeg
push GlyphSeg
push PixOfs
push GlyphOfs
push bp
mov bp, sp
push ds
push bx
push si
push di
mov si, [bp+2]
mov es, [bp+8]
mov di, [bp+4]
mov cx, [bp+10]
xor bh, bh
mov ds, [bp+6]
@curloop:
mov al, [si]
inc si
mov ah, al
and al, $F0
shr al, 1
shr al, 1
mov bl, al
mov dx, [bx]
mov es:[di], dx
mov dx, [bx+2]
mov es:[di+2], dx
mov al, ah
and al, $0F
shl al, 1
shl al, 1
mov bl, al
mov dx, [bx]
mov es:[di+4], dx
mov dx, [bx+2]
mov es:[di+6], dx
sub di, [bp+12]
dec cx
jnz @curloop
pop di
pop si
pop bx
pop ds
pop bp
add sp, 12
end;
end;
end;