Replace per-pixel branching with nibble lookup table and inline ASM

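Rewrite RenderRow inner loop: split each glyph byte into two nibbles,
look up 4 pre-resolved palette bytes per nibble from a 64-byte table,
and write them as word stores — no per-pixel branching in the hot path
(only the per-scanline loop jump remains). Replace 25 per-row GlobalAlloc
buffers with a single reusable buffer and move glyph data into a
GlobalAlloc'd block shared with the nibble table. All render-path
arithmetic is 16-bit Word (no Longint); the one-time atlas extraction in
BuildAtlas still uses Longint. Uses a mini-frame technique to safely
access local variables from inline ASM after the DS/SI/DI clobber. Note:
the ASM path writes exactly 8 pixels per glyph scanline and computes the
column offset as Col * 8, so it assumes 8-pixel-wide cells.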

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Scott Duensing 2026-02-27 18:33:59 -06:00
parent 0ff633f605
commit c5d31ca930

View file

@ -7,10 +7,10 @@ unit KPAnsi;
{ Renders incoming data using standard ANSI/VT100 escape sequences for } { Renders incoming data using standard ANSI/VT100 escape sequences for }
{ cursor positioning, color attributes, and screen manipulation. } { cursor positioning, color attributes, and screen manipulation. }
{ } { }
{ Rendering uses a font atlas with direct pixel writes into 8bpp DIB row } { Rendering uses a font atlas with a nibble lookup table and inline ASM }
{ buffers, minimizing GDI calls to a single SetDIBitsToDevice per dirty } { to expand glyph bitmaps directly into a reusable 8bpp DIB pixel buffer. }
{ row. This eliminates the per-call overhead of TextOut/SetTextColor/ } { This eliminates per-pixel branching and 32-bit arithmetic from the inner }
{ SetBkColor that dominates rendering time on Win16. } { loop, with one SetDIBitsToDevice call per dirty row. }
{ } { }
{ Installs to the "KP" palette tab alongside TKPComm. } { Installs to the "KP" palette tab alongside TKPComm. }
@ -78,11 +78,14 @@ type
FDirtyRow: array[0..255] of Boolean; FDirtyRow: array[0..255] of Boolean;
FAllDirty: Boolean; FAllDirty: Boolean;
FTextBlinkOn: Boolean; FTextBlinkOn: Boolean;
FGlyphBits: array[0..255, 0..31] of Byte; FGlyphBufH: THandle;
FRowBufH: array[0..255] of THandle; FGlyphBuf: Pointer;
FRowBuf: array[0..255] of Pointer; FRowBufH: THandle;
FRowBuf: Pointer;
FDibInfo: TDibInfo; FDibInfo: TDibInfo;
FRowBufSize: Integer; FRowBufSize: Integer;
FNibbleFG: Byte;
FNibbleBG: Byte;
procedure AllocLine(Line: PTermLine); procedure AllocLine(Line: PTermLine);
procedure BuildAtlas; procedure BuildAtlas;
procedure ClearLine(Line: PTermLine); procedure ClearLine(Line: PTermLine);
@ -266,10 +269,11 @@ end;
procedure TKPAnsi.BuildAtlas; procedure TKPAnsi.BuildAtlas;
{ Render all 256 CP437 characters into a monochrome bitmap, then extract } { Render all 256 CP437 characters into a monochrome bitmap, then extract }
{ per-glyph pixel masks into FGlyphBits. Each entry FGlyphBits[ch, row] } { per-glyph pixel masks into the glyph block at offset 64. Each glyph }
{ is an 8-bit mask: MSB = leftmost pixel, 1 = foreground, 0 = background. } { byte is an 8-bit mask: MSB = leftmost pixel, 1 = FG, 0 = BG. The }
{ This is a one-time GDI cost at startup; after extraction, the bitmap } { nibble lookup table at offset 0..63 is built at render time. This is a }
{ and DC are deleted and never needed again. } { one-time GDI cost at startup; after extraction, the bitmap and DC are }
{ deleted and never needed again. }
var var
AtlasBmp: HBitmap; AtlasBmp: HBitmap;
AtlasDC: HDC; AtlasDC: HDC;
@ -279,17 +283,38 @@ var
Row: Integer; Row: Integer;
RawH: THandle; RawH: THandle;
RawPtr: PPixelBuf; RawPtr: PPixelBuf;
GlyphPtr: PPixelBuf;
Stride: Longint; Stride: Longint;
BmpSize: Longint; BmpSize: Longint;
Ch: Char; Ch: Char;
begin begin
FillChar(FGlyphBits, SizeOf(FGlyphBits), 0); { Free old glyph block }
if FGlyphBufH <> 0 then
begin
GlobalUnlock(FGlyphBufH);
GlobalFree(FGlyphBufH);
FGlyphBufH := 0;
FGlyphBuf := nil;
end;
if FPaintFont = 0 then if FPaintFont = 0 then
Exit; Exit;
if (FCellWidth < 1) or (FCellHeight < 1) or (FCellHeight > 32) then if (FCellWidth < 1) or (FCellHeight < 1) or (FCellHeight > 32) then
Exit; Exit;
{ Allocate glyph block: 64 bytes nibble table + 256*32 glyph data }
FGlyphBufH := GlobalAlloc(GMEM_FIXED or GMEM_ZEROINIT, 8256);
if FGlyphBufH = 0 then
Exit;
FGlyphBuf := GlobalLock(FGlyphBufH);
if FGlyphBuf = nil then
begin
GlobalFree(FGlyphBufH);
FGlyphBufH := 0;
Exit;
end;
GlyphPtr := PPixelBuf(FGlyphBuf);
{ Create monochrome bitmap: 256 chars side-by-side, FCellHeight tall } { Create monochrome bitmap: 256 chars side-by-side, FCellHeight tall }
AtlasBmp := CreateBitmap(256 * FCellWidth, FCellHeight, 1, 1, nil); AtlasBmp := CreateBitmap(256 * FCellWidth, FCellHeight, 1, 1, nil);
if AtlasBmp = 0 then if AtlasBmp = 0 then
@ -325,13 +350,14 @@ begin
if RawPtr <> nil then if RawPtr <> nil then
begin begin
GetBitmapBits(AtlasBmp, BmpSize, RawPtr); GetBitmapBits(AtlasBmp, BmpSize, RawPtr);
{ Extract per-glyph bytes. For 8-pixel-wide fonts each glyph is } { Extract per-glyph bytes into glyph block at offset 64. }
{ exactly one byte per scan line, aligned to byte boundaries. } { For 8-pixel-wide fonts each glyph is exactly one byte per scan }
{ line, aligned to byte boundaries. }
for I := 0 to 255 do for I := 0 to 255 do
begin begin
for Row := 0 to FCellHeight - 1 do for Row := 0 to FCellHeight - 1 do
begin begin
FGlyphBits[I, Row] := RawPtr^[Row * Stride + I]; GlyphPtr^[64 + I * 32 + Row] := RawPtr^[Row * Stride + I];
end; end;
end; end;
GlobalUnlock(RawH); GlobalUnlock(RawH);
@ -443,6 +469,12 @@ begin
FAllDirty := True; FAllDirty := True;
FTextBlinkOn := True; FTextBlinkOn := True;
FRowBufSize := 0; FRowBufSize := 0;
FGlyphBufH := 0;
FGlyphBuf := nil;
FRowBufH := 0;
FRowBuf := nil;
FNibbleFG := 255;
FNibbleBG := 255;
{ Set a monospace font -- OEM charset selected in CreatePaintFont } { Set a monospace font -- OEM charset selected in CreatePaintFont }
Font.Name := 'Terminal'; Font.Name := 'Terminal';
@ -516,21 +548,26 @@ end;
procedure TKPAnsi.CreateRowBuffers; procedure TKPAnsi.CreateRowBuffers;
var
I: Integer;
begin begin
DestroyRowBuffers; { Free old row buffer (glyph block is managed by BuildAtlas) }
if FRowBufH <> 0 then
begin
GlobalUnlock(FRowBufH);
GlobalFree(FRowBufH);
FRowBufH := 0;
FRowBuf := nil;
end;
FRowBufSize := FCols * FCellWidth * FCellHeight; FRowBufSize := FCols * FCellWidth * FCellHeight;
if FRowBufSize < 1 then if FRowBufSize < 1 then
Exit; Exit;
for I := 0 to FRows - 1 do
begin { Single reusable buffer for one terminal row }
FRowBufH[I] := GlobalAlloc(GMEM_FIXED or GMEM_ZEROINIT, FRowBufSize); FRowBufH := GlobalAlloc(GMEM_FIXED or GMEM_ZEROINIT, FRowBufSize);
if FRowBufH[I] <> 0 then if FRowBufH <> 0 then
FRowBuf[I] := GlobalLock(FRowBufH[I]) FRowBuf := GlobalLock(FRowBufH)
else else
FRowBuf[I] := nil; FRowBuf := nil;
end;
end; end;
@ -606,18 +643,20 @@ end;
procedure TKPAnsi.DestroyRowBuffers; procedure TKPAnsi.DestroyRowBuffers;
var
I: Integer;
begin begin
for I := 0 to 255 do if FRowBufH <> 0 then
begin begin
if FRowBufH[I] <> 0 then GlobalUnlock(FRowBufH);
begin GlobalFree(FRowBufH);
GlobalUnlock(FRowBufH[I]); FRowBufH := 0;
GlobalFree(FRowBufH[I]); FRowBuf := nil;
FRowBufH[I] := 0; end;
FRowBuf[I] := nil; if FGlyphBufH <> 0 then
end; begin
GlobalUnlock(FGlyphBufH);
GlobalFree(FGlyphBufH);
FGlyphBufH := 0;
FGlyphBuf := nil;
end; end;
end; end;
@ -1157,8 +1196,9 @@ end;
procedure TKPAnsi.FlipToScreen; procedure TKPAnsi.FlipToScreen;
{ Render dirty rows into 8bpp DIB buffers, then blast to screen via } { Render dirty rows into the shared 8bpp DIB buffer, blasting each to the }
{ SetDIBitsToDevice. One GDI call per dirty row, zero for rendering. } { screen via SetDIBitsToDevice immediately after rendering. One GDI call }
{ per dirty row, zero for the pixel expansion itself. }
var var
DC: HDC; DC: HDC;
Row: Integer; Row: Integer;
@ -1167,9 +1207,9 @@ var
begin begin
if not HandleAllocated then if not HandleAllocated then
Exit; Exit;
if FRowBuf[0] = nil then if FRowBuf = nil then
RecalcCellSize; RecalcCellSize;
if FRowBuf[0] = nil then if FRowBuf = nil then
Exit; Exit;
{ Scrollback view: force full redraw, ignore pending scroll } { Scrollback view: force full redraw, ignore pending scroll }
@ -1179,8 +1219,7 @@ begin
FPendingScroll := 0; FPendingScroll := 0;
end; end;
{ Deferred scroll: shift existing screen pixels up. Row buffers are } { Deferred scroll: shift existing screen pixels up }
{ per-row so they don't need scrolling; only the screen DC is shifted. }
if (FPendingScroll > 0) and not FAllDirty then if (FPendingScroll > 0) and not FAllDirty then
begin begin
R.Left := 0; R.Left := 0;
@ -1194,32 +1233,21 @@ begin
end; end;
FPendingScroll := 0; FPendingScroll := 0;
{ Render dirty rows into per-row pixel buffers (pure memory, zero GDI) } { Interleaved render + blast: single buffer is reused per row }
for Row := 0 to FRows - 1 do
begin
if (FAllDirty or FDirtyRow[Row]) and (FRowBuf[Row] <> nil) then
begin
RenderRow(Row);
end;
end;
{ Blast dirty rows to screen }
DC := GetDC(Handle); DC := GetDC(Handle);
for Row := 0 to FRows - 1 do for Row := 0 to FRows - 1 do
begin begin
if FAllDirty or FDirtyRow[Row] then if FAllDirty or FDirtyRow[Row] then
begin begin
if FRowBuf[Row] <> nil then RenderRow(Row);
begin SetDIBitsToDevice(DC,
SetDIBitsToDevice(DC, 0, Row * FCellHeight,
0, Row * FCellHeight, FCols * FCellWidth, FCellHeight,
FCols * FCellWidth, FCellHeight, 0, 0,
0, 0, 0, FCellHeight,
0, FCellHeight, FRowBuf,
FRowBuf[Row], PBitmapInfo(@FDibInfo)^,
PBitmapInfo(@FDibInfo)^, 0); { DIB_RGB_COLORS }
0); { DIB_RGB_COLORS }
end;
FDirtyRow[Row] := False; FDirtyRow[Row] := False;
end; end;
end; end;
@ -1414,36 +1442,26 @@ procedure TKPAnsi.Paint;
var var
Row: Integer; Row: Integer;
begin begin
if FRowBuf[0] = nil then if FRowBuf = nil then
RecalcCellSize; RecalcCellSize;
if FRowBuf[0] = nil then if FRowBuf = nil then
Exit; Exit;
{ Full repaint: render all rows into buffers, then blast to canvas } { Full repaint: render each row into the shared buffer and blast it }
FPendingScroll := 0; FPendingScroll := 0;
FAllDirty := True; FAllDirty := True;
for Row := 0 to FRows - 1 do for Row := 0 to FRows - 1 do
begin begin
if FRowBuf[Row] <> nil then RenderRow(Row);
begin SetDIBitsToDevice(Canvas.Handle,
RenderRow(Row); 0, Row * FCellHeight,
end; FCols * FCellWidth, FCellHeight,
end; 0, 0,
0, FCellHeight,
for Row := 0 to FRows - 1 do FRowBuf,
begin PBitmapInfo(@FDibInfo)^,
if FRowBuf[Row] <> nil then 0); { DIB_RGB_COLORS }
begin
SetDIBitsToDevice(Canvas.Handle,
0, Row * FCellHeight,
FCols * FCellWidth, FCellHeight,
0, 0,
0, FCellHeight,
FRowBuf[Row],
PBitmapInfo(@FDibInfo)^,
0); { DIB_RGB_COLORS }
end;
FDirtyRow[Row] := False; FDirtyRow[Row] := False;
end; end;
FAllDirty := False; FAllDirty := False;
@ -1775,31 +1793,56 @@ end;
procedure TKPAnsi.RenderRow(Row: Integer); procedure TKPAnsi.RenderRow(Row: Integer);
{ Core atlas renderer. For each cell in the row, look up the glyph in } { Core atlas renderer with nibble lookup table and inline ASM. For each }
{ the monochrome atlas and write FG/BG palette indices directly into the } { cell in the row, the Pascal outer loop resolves colors and rebuilds the }
{ 8bpp row buffer. Zero GDI calls. Cursor overlay is also rendered here } { 16-entry nibble table on color change. The inline ASM inner loop }
{ by swapping FG/BG for the cursor cell. } { expands one glyph (all scanlines) by splitting each glyph byte into }
{ high and low nibbles, looking up 4 pre-resolved pixels per nibble, and }
{ writing them as word stores. Zero branching in the inner loop. }
{ }
{ Register allocation in ASM block: }
{ DS = glyph block segment (table at 0..63, glyph data at 64+) }
{ SI = glyph data offset (increments through scanlines) }
{ ES = pixel buffer segment }
{ DI = pixel buffer offset (decrements by Stride for bottom-up DIB) }
{ BX = table index (BH=0, BL = nibble * 4) }
{ CX = scanline counter }
{ AX/DX = temporaries }
{ }
{ Critical: Delphi 1.0 may allocate local variables to SI/DI as register }
{ variables. The ASM block clobbers SI/DI for its own purposes, so ALL }
{ local variable values are pushed to an explicit mini-frame (via PUSH) }
{ BEFORE any register clobber, then accessed via BP-relative offsets. }
{ BP-relative addressing defaults to SS segment, safe after DS change. }
var var
Line: PTermLine; Line: PTermLine;
Pix: PPixelBuf; Col: Integer;
Stride: Longint; FGIdx: Byte;
Col: Integer; BGIdx: Byte;
GlyphRow: Integer; CharCode: Integer;
BufScanline: Integer; SbkCount: Integer;
Bits: Byte; VisRow: Integer;
Offset: Longint; TabPtr: PPixelBuf;
Pixel: Integer; I: Integer;
FGIdx: Byte; Ofs: Integer;
BGIdx: Byte; GlyphSeg: Word;
CharCode: Integer; PixSeg: Word;
SbkCount: Integer; GlyphOfs: Word;
VisRow: Integer; PixOfs: Word;
Stride: Word;
CellH: Word;
begin begin
if FRowBuf[Row] = nil then if FRowBuf = nil then
Exit;
if FGlyphBuf = nil then
Exit; Exit;
Pix := FRowBuf[Row]; Stride := Word(FCols) * Word(FCellWidth);
Stride := Longint(FCols) * FCellWidth; CellH := FCellHeight;
{ Extract segments from far pointers -- pure Pascal, no register clobber }
GlyphSeg := Seg(PPixelBuf(FGlyphBuf)^);
PixSeg := Seg(PPixelBuf(FRowBuf)^);
{ Determine which line to render (handles scrollback view) } { Determine which line to render (handles scrollback view) }
if FScrollPos <> 0 then if FScrollPos <> 0 then
@ -1826,10 +1869,14 @@ begin
if Line = nil then if Line = nil then
begin begin
{ Blank row: fill with background color 0 (black) } { Blank row: fill with background color 0 (black) }
FillChar(Pix^, FRowBufSize, 0); FillChar(PPixelBuf(FRowBuf)^, FRowBufSize, 0);
Exit; Exit;
end; end;
{ Force nibble table rebuild on first cell }
FNibbleFG := 255;
FNibbleBG := 255;
for Col := 0 to FCols - 1 do for Col := 0 to FCols - 1 do
begin begin
{ Determine effective colors } { Determine effective colors }
@ -1840,44 +1887,189 @@ begin
BGIdx := Line^.Cells[Col].BG; BGIdx := Line^.Cells[Col].BG;
CharCode := Ord(Line^.Cells[Col].Ch); CharCode := Ord(Line^.Cells[Col].Ch);
{ Render glyph into buffer -- bottom-up for DIB format } { Rebuild nibble table on color change: 16 entries x 4 bytes }
for GlyphRow := 0 to FCellHeight - 1 do if (FGIdx <> FNibbleFG) or (BGIdx <> FNibbleBG) then
begin begin
BufScanline := FCellHeight - 1 - GlyphRow; TabPtr := PPixelBuf(FGlyphBuf);
Bits := FGlyphBits[CharCode, GlyphRow]; for I := 0 to 15 do
Offset := Longint(BufScanline) * Stride + Longint(Col) * FCellWidth;
for Pixel := 0 to FCellWidth - 1 do
begin begin
if (Bits and $80) <> 0 then Ofs := I * 4;
Pix^[Offset + Pixel] := FGIdx if (I and 8) <> 0 then TabPtr^[Ofs] := FGIdx
else else TabPtr^[Ofs] := BGIdx;
Pix^[Offset + Pixel] := BGIdx; if (I and 4) <> 0 then TabPtr^[Ofs + 1] := FGIdx
Bits := Bits shl 1; else TabPtr^[Ofs + 1] := BGIdx;
if (I and 2) <> 0 then TabPtr^[Ofs + 2] := FGIdx
else TabPtr^[Ofs + 2] := BGIdx;
if (I and 1) <> 0 then TabPtr^[Ofs + 3] := FGIdx
else TabPtr^[Ofs + 3] := BGIdx;
end; end;
FNibbleFG := FGIdx;
FNibbleBG := BGIdx;
end;
{ Compute offsets -- all 16-bit, no Longint }
GlyphOfs := 64 + Word(CharCode) shl 5;
PixOfs := Word(CellH - 1) * Stride + Word(Col) * 8;
asm
{ Push all values to explicit mini-frame BEFORE any register }
{ clobber. BASM reads register variables from SI/DI correctly }
{ here since nothing has been overwritten yet. }
push Stride
push CellH
push PixSeg
push GlyphSeg
push PixOfs
push GlyphOfs
push bp
mov bp, sp
{ Mini-frame layout (all accessed via SS:[BP+n]): }
{ [bp] = saved original BP }
{ [bp+2] = GlyphOfs }
{ [bp+4] = PixOfs }
{ [bp+6] = GlyphSeg }
{ [bp+8] = PixSeg }
{ [bp+10] = CellH }
{ [bp+12] = Stride }
push ds
push bx
push si
push di
mov si, [bp+2]
mov es, [bp+8]
mov di, [bp+4]
mov cx, [bp+10]
xor bh, bh
mov ds, [bp+6]
@rowloop:
mov al, [si] { load glyph byte from DS:SI }
inc si
mov ah, al { save copy }
{ High nibble -> 4 pixels }
and al, $F0
shr al, 1
shr al, 1 { AL = high_nibble * 4 }
mov bl, al
mov dx, [bx] { 2 table bytes (DS:BX, table at offset 0) }
mov es:[di], dx
mov dx, [bx+2] { 2 more table bytes }
mov es:[di+2], dx
{ Low nibble -> 4 pixels }
mov al, ah
and al, $0F
shl al, 1
shl al, 1 { AL = low_nibble * 4 }
mov bl, al
mov dx, [bx]
mov es:[di+4], dx
mov dx, [bx+2]
mov es:[di+6], dx
sub di, [bp+12] { Stride via SS:[BP+12] -- safe after DS change }
dec cx
jnz @rowloop
pop di
pop si
pop bx
pop ds
pop bp
add sp, 12 { remove 6 mini-frame words }
end; end;
end; end;
{ Cursor overlay: if cursor is on this row and visible, re-render the } { Cursor overlay: if cursor is on this row and visible, re-render the }
{ cursor cell with swapped FG/BG using the same atlas lookup. } { cursor cell with swapped FG/BG using the same ASM inner loop. }
if FCursorVisible and FBlinkOn and (FScrollPos = 0) and if FCursorVisible and FBlinkOn and (FScrollPos = 0) and
(Row = FCursorRow) and (FCursorCol >= 0) and (FCursorCol < FCols) then (Row = FCursorRow) and (FCursorCol >= 0) and (FCursorCol < FCols) then
begin begin
FGIdx := Line^.Cells[FCursorCol].BG; FGIdx := Line^.Cells[FCursorCol].BG;
BGIdx := Line^.Cells[FCursorCol].FG; BGIdx := Line^.Cells[FCursorCol].FG;
CharCode := Ord(Line^.Cells[FCursorCol].Ch); CharCode := Ord(Line^.Cells[FCursorCol].Ch);
for GlyphRow := 0 to FCellHeight - 1 do
{ Rebuild nibble table for cursor colors }
TabPtr := PPixelBuf(FGlyphBuf);
for I := 0 to 15 do
begin begin
BufScanline := FCellHeight - 1 - GlyphRow; Ofs := I * 4;
Bits := FGlyphBits[CharCode, GlyphRow]; if (I and 8) <> 0 then TabPtr^[Ofs] := FGIdx
Offset := Longint(BufScanline) * Stride + Longint(FCursorCol) * FCellWidth; else TabPtr^[Ofs] := BGIdx;
for Pixel := 0 to FCellWidth - 1 do if (I and 4) <> 0 then TabPtr^[Ofs + 1] := FGIdx
begin else TabPtr^[Ofs + 1] := BGIdx;
if (Bits and $80) <> 0 then if (I and 2) <> 0 then TabPtr^[Ofs + 2] := FGIdx
Pix^[Offset + Pixel] := FGIdx else TabPtr^[Ofs + 2] := BGIdx;
else if (I and 1) <> 0 then TabPtr^[Ofs + 3] := FGIdx
Pix^[Offset + Pixel] := BGIdx; else TabPtr^[Ofs + 3] := BGIdx;
Bits := Bits shl 1; end;
end; FNibbleFG := FGIdx;
FNibbleBG := BGIdx;
GlyphOfs := 64 + Word(CharCode) shl 5;
PixOfs := Word(CellH - 1) * Stride + Word(FCursorCol) * 8;
asm
push Stride
push CellH
push PixSeg
push GlyphSeg
push PixOfs
push GlyphOfs
push bp
mov bp, sp
push ds
push bx
push si
push di
mov si, [bp+2]
mov es, [bp+8]
mov di, [bp+4]
mov cx, [bp+10]
xor bh, bh
mov ds, [bp+6]
@curloop:
mov al, [si]
inc si
mov ah, al
and al, $F0
shr al, 1
shr al, 1
mov bl, al
mov dx, [bx]
mov es:[di], dx
mov dx, [bx+2]
mov es:[di+2], dx
mov al, ah
and al, $0F
shl al, 1
shl al, 1
mov bl, al
mov dx, [bx]
mov es:[di+4], dx
mov dx, [bx+2]
mov es:[di+6], dx
sub di, [bp+12]
dec cx
jnz @curloop
pop di
pop si
pop bx
pop ds
pop bp
add sp, 12
end; end;
end; end;
end; end;