Inline CSI parsing, ASM nibble table, cursor dedup, reccom block copy

Four performance optimizations targeting the hottest paths:

- Parse CSI params (P1/P2) as integers during scan-ahead loop,
  eliminating ParseParamBuf call from ExecuteCSI (~200 cycles/seq)
- Replace 16-iteration Pascal nibble table rebuild (64 branch+store)
  with 32 straight-line word MOVs using precomputed BGBG/BGFG/FGBG/FGFG
- Integrate cursor FG/BG swap into main RenderRow column loop,
  removing duplicate nibble rebuild + ASM glyph expansion overlay pass
- Replace byte-at-a-time reccom loop with _fmemcpy block copy split
  at ring buffer wrap point, reducing far-pointer overhead from per-byte
  (O(n)) to at most two block copies (O(1))

Also includes previously uncommitted space fast-path in RenderRow and
inlined escape sequence handling in ParseDataBuf.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Scott Duensing 2026-03-02 18:20:51 -06:00
parent 8e3bad86e3
commit c378abc9e5
2 changed files with 429 additions and 271 deletions

View file

@ -66,6 +66,9 @@ type
FParseState: TParseState; { Current parser state machine position } FParseState: TParseState; { Current parser state machine position }
FParamBuf: array[0..31] of Char; { CSI parameter digits/semicolons } FParamBuf: array[0..31] of Char; { CSI parameter digits/semicolons }
FParamLen: Integer; { Current length of FParamBuf } FParamLen: Integer; { Current length of FParamBuf }
FCSIParam1: Integer; { First CSI param, parsed inline during scan }
FCSIParam2: Integer; { Second CSI param, parsed inline during scan }
FCSIParamIdx: Integer; { Which param we're accumulating (0=P1, 1=P2) }
FMusicStr: string; { Accumulated ANSI music string (ESC[M..^N) } FMusicStr: string; { Accumulated ANSI music string (ESC[M..^N) }
{ Font metrics (measured from OEM charset paint font) } { Font metrics (measured from OEM charset paint font) }
@ -484,6 +487,9 @@ begin
FAttrReverse := False; FAttrReverse := False;
FParseState := psNormal; FParseState := psNormal;
FParamLen := 0; FParamLen := 0;
FCSIParam1 := 0;
FCSIParam2 := 0;
FCSIParamIdx := 0;
FMusicStr := ''; FMusicStr := '';
FCellWidth := 8; FCellWidth := 8;
FCellHeight := 16; FCellHeight := 16;
@ -892,22 +898,15 @@ end;
procedure TKPAnsi.ExecuteCSI(FinalCh: Char); procedure TKPAnsi.ExecuteCSI(FinalCh: Char);
{ Uses FCSIParam1/FCSIParam2 parsed inline during CSI scan-ahead. }
{ No ParseParamBuf call needed -- saves ~200 cycles per CSI sequence. }
{ ParseSGR still uses FParamBuf for variable-count parameters. }
var var
Params: array[0..15] of Integer;
Count: Integer;
P1: Integer; P1: Integer;
P2: Integer; P2: Integer;
begin begin
ParseParamBuf(@FParamBuf[0], FParamLen, Params, Count); P1 := FCSIParam1;
P2 := FCSIParam2;
if Count > 0 then
P1 := Params[0]
else
P1 := 0;
if Count > 1 then
P2 := Params[1]
else
P2 := 0;
case FinalCh of case FinalCh of
'A': { CUU - Cursor Up } 'A': { CUU - Cursor Up }
@ -1565,14 +1564,19 @@ end;
procedure TKPAnsi.ParseDataBuf(Buf: PChar; Len: Integer); procedure TKPAnsi.ParseDataBuf(Buf: PChar; Len: Integer);
{ Process incoming data from a PChar buffer (no string allocation needed). } { Process incoming data from a PChar buffer (no string allocation needed). }
{ Fast path batches runs of printable characters: colors are computed once } { }
{ per run, and cells are filled in a tight loop without per-character state } { Three inlined fast paths eliminate ProcessChar method call overhead: }
{ checks. Run length is bounded by end of input, end of current row, or } { 1. Printable text runs: batch fill cells, one color computation per run }
{ next non-printable character -- whichever comes first. } { 2. CSI parameter accumulation: scan-ahead loop for digits/semicolons }
{ 3. Common control chars: ESC, CR, LF handled inline }
{ }
{ Uncommon states (psCSIQuestion, psMusic) and rare control chars (TAB, }
{ BS, BEL, ENQ) still delegate to ProcessChar. }
{ } { }
{ Does NOT call FlipToScreen -- the caller handles rendering. } { Does NOT call FlipToScreen -- the caller handles rendering. }
var var
I: Integer; I: Integer;
Ch: Char;
Line: PTermLine; Line: PTermLine;
FGIdx: Byte; FGIdx: Byte;
BGIdx: Byte; BGIdx: Byte;
@ -1584,10 +1588,12 @@ begin
while I < Len do while I < Len do
begin begin
{ Fast path: printable character in normal state } case FParseState of
if (FParseState = psNormal) and (Buf[I] >= ' ') then psNormal:
begin begin
{ Handle wrap at right margin } if Buf[I] >= ' ' then
begin
{ Fast path: batch printable characters }
if FCursorCol >= FCols then if FCursorCol >= FCols then
begin begin
if FWrapMode then if FWrapMode then
@ -1622,7 +1628,7 @@ begin
(RunEnd - I < Remaining) do (RunEnd - I < Remaining) do
Inc(RunEnd); Inc(RunEnd);
{ Fill cells in tight loop -- no per-character state/wrap checks } { Fill cells in tight loop }
if FAttrReverse then if FAttrReverse then
begin begin
while I < RunEnd do while I < RunEnd do
@ -1651,15 +1657,116 @@ begin
end; end;
FDirtyRow[FCursorRow] := True; FDirtyRow[FCursorRow] := True;
end end
else if Buf[I] = #27 then
begin
{ ESC: start escape sequence }
FParseState := psEscape;
Line := nil;
Inc(I);
end
else if Buf[I] = #10 then
begin
{ LF: line feed }
Inc(FCursorRow);
if FCursorRow >= FRows then
begin
FCursorRow := FRows - 1;
DoScrollUp;
end;
Line := nil;
Inc(I);
end
else if Buf[I] = #13 then
begin
{ CR: carriage return }
FCursorCol := 0;
Inc(I);
end
else else
begin begin
{ Slow path: control chars, escape sequences } { Uncommon control chars: BS, TAB, BEL, ENQ }
Line := nil; Line := nil;
ProcessChar(Buf[I]); ProcessChar(Buf[I]);
Inc(I); Inc(I);
end; end;
end; end;
psEscape:
begin
if Buf[I] = '[' then
begin
FParamLen := 0;
FCSIParam1 := 0;
FCSIParam2 := 0;
FCSIParamIdx := 0;
FParseState := psCSI;
end
else
FParseState := psNormal;
Inc(I);
end;
psCSI:
begin
{ Scan ahead: parse integers inline while accumulating FParamBuf. }
{ FCSIParam1/FCSIParam2 are built digit-by-digit during the scan }
{ so ExecuteCSI can use them directly without ParseParamBuf. }
{ FParamBuf is still maintained for ParseSGR (variable param count). }
while (I < Len) and
((Buf[I] >= '0') and (Buf[I] <= '9') or (Buf[I] = ';')) do
begin
if Buf[I] = ';' then
begin
Inc(FCSIParamIdx);
end
else if FCSIParamIdx = 0 then
begin
FCSIParam1 := FCSIParam1 * 10 + (Ord(Buf[I]) - 48);
end
else if FCSIParamIdx = 1 then
begin
FCSIParam2 := FCSIParam2 * 10 + (Ord(Buf[I]) - 48);
end;
if FParamLen < 32 then
begin
FParamBuf[FParamLen] := Buf[I];
Inc(FParamLen);
end;
Inc(I);
end;
{ Process final command byte if available }
if I < Len then
begin
Ch := Buf[I];
if Ch = '?' then
begin
FParseState := psCSIQuestion;
end
else if (Ch = 'M') and (FParamLen = 0) then
begin
FMusicStr := '';
FParseState := psMusic;
end
else
begin
ExecuteCSI(Ch);
FParseState := psNormal;
end;
Inc(I);
end;
end;
else
begin
{ psCSIQuestion, psMusic: delegate to ProcessChar }
Line := nil;
ProcessChar(Buf[I]);
Inc(I);
end;
end;
end;
{ Deferred scrollback trim -- batched from DoScrollUp } { Deferred scrollback trim -- batched from DoScrollUp }
TrimScrollback; TrimScrollback;
@ -1829,6 +1936,9 @@ begin
'[': '[':
begin begin
FParamLen := 0; FParamLen := 0;
FCSIParam1 := 0;
FCSIParam2 := 0;
FCSIParamIdx := 0;
FParseState := psCSI; FParseState := psCSI;
end; end;
else else
@ -1844,6 +1954,12 @@ begin
case Ch of case Ch of
'0'..'9', ';': '0'..'9', ';':
begin begin
if Ch = ';' then
Inc(FCSIParamIdx)
else if FCSIParamIdx = 0 then
FCSIParam1 := FCSIParam1 * 10 + (Ord(Ch) - 48)
else if FCSIParamIdx = 1 then
FCSIParam2 := FCSIParam2 * 10 + (Ord(Ch) - 48);
if FParamLen < 32 then if FParamLen < 32 then
begin begin
FParamBuf[FParamLen] := Ch; FParamBuf[FParamLen] := Ch;
@ -1996,8 +2112,10 @@ procedure TKPAnsi.RenderRow(Row: Integer);
var var
Line: PTermLine; Line: PTermLine;
Col: Integer; Col: Integer;
CurCol: Integer; { Cursor column on this row, or -1 if no cursor }
FGIdx: Byte; FGIdx: Byte;
BGIdx: Byte; BGIdx: Byte;
TmpIdx: Byte;
CharCode: Integer; CharCode: Integer;
SbkCount: Integer; SbkCount: Integer;
VisRow: Integer; VisRow: Integer;
@ -2052,6 +2170,15 @@ begin
Exit; Exit;
end; end;
{ Determine cursor column for this row (-1 if cursor not on this row). }
{ The cursor swap is integrated into the main column loop, eliminating }
{ the separate cursor overlay pass (saves nibble rebuild + ASM per cell). }
if FCursorVisible and FBlinkOn and (FScrollPos = 0) and
(Row = FCursorRow) and (FCursorCol >= 0) and (FCursorCol < FCols) then
CurCol := FCursorCol
else
CurCol := -1;
{ Force nibble table rebuild on first cell } { Force nibble table rebuild on first cell }
FNibbleFG := 255; FNibbleFG := 255;
FNibbleBG := 255; FNibbleBG := 255;
@ -2080,21 +2207,111 @@ begin
BGIdx := Line^.Cells[Col].BG; BGIdx := Line^.Cells[Col].BG;
CharCode := Ord(Line^.Cells[Col].Ch); CharCode := Ord(Line^.Cells[Col].Ch);
{ Rebuild nibble table on color change: 16 entries x 4 bytes } { Cursor: swap FG/BG inline -- no separate overlay pass needed }
if Col = CurCol then
begin
TmpIdx := FGIdx;
FGIdx := BGIdx;
BGIdx := TmpIdx;
end;
if CharCode = 32 then
begin
{ Space fast path: solid background fill, no glyph expansion. }
{ Skips nibble table rebuild and ASM glyph loop entirely. }
{ 4 word stores per scanline vs full nibble lookup + expansion. }
PixOfs := Word(CellH - 1) * Stride + Word(Col) * 8;
asm
push di
mov es, PixSeg
mov di, PixOfs
mov al, BGIdx
mov ah, al { AX = BGIdx:BGIdx }
mov cx, CellH
@spfill:
mov es:[di], ax
mov es:[di+2], ax
mov es:[di+4], ax
mov es:[di+6], ax
sub di, Stride
dec cx
jnz @spfill
pop di
end;
end
else
begin
{ Rebuild nibble table on color change: 16 entries x 4 bytes. }
{ Pre-compute 4 word values (BGBG, BGFG, FGBG, FGFG) in AX/BX/CX/DX }
{ and write all 32 words directly. Replaces 64 branch+store Pascal }
{ operations with 32 straight-line MOV instructions. }
if (FGIdx <> FNibbleFG) or (BGIdx <> FNibbleBG) then if (FGIdx <> FNibbleFG) or (BGIdx <> FNibbleBG) then
begin begin
TabPtr := PPixelBuf(FGlyphBuf); asm
for I := 0 to 15 do push di
begin push bx
Ofs := I * 4; push es
if (I and 8) <> 0 then TabPtr^[Ofs] := FGIdx les di, FGlyphBuf
else TabPtr^[Ofs] := BGIdx; mov al, BGIdx
if (I and 4) <> 0 then TabPtr^[Ofs + 1] := FGIdx mov ah, al { AX = BG:BG }
else TabPtr^[Ofs + 1] := BGIdx; mov dl, FGIdx
if (I and 2) <> 0 then TabPtr^[Ofs + 2] := FGIdx mov dh, dl { DX = FG:FG }
else TabPtr^[Ofs + 2] := BGIdx; mov bl, al
if (I and 1) <> 0 then TabPtr^[Ofs + 3] := FGIdx mov bh, dl { BX = BG:FG (lo=BG, hi=FG) }
else TabPtr^[Ofs + 3] := BGIdx; mov cl, dl
mov ch, al { CX = FG:BG (lo=FG, hi=BG) }
{ Entry 0 (0000): BG BG BG BG }
mov es:[di+ 0], ax
mov es:[di+ 2], ax
{ Entry 1 (0001): BG BG BG FG }
mov es:[di+ 4], ax
mov es:[di+ 6], bx
{ Entry 2 (0010): BG BG FG BG }
mov es:[di+ 8], ax
mov es:[di+10], cx
{ Entry 3 (0011): BG BG FG FG }
mov es:[di+12], ax
mov es:[di+14], dx
{ Entry 4 (0100): BG FG BG BG }
mov es:[di+16], bx
mov es:[di+18], ax
{ Entry 5 (0101): BG FG BG FG }
mov es:[di+20], bx
mov es:[di+22], bx
{ Entry 6 (0110): BG FG FG BG }
mov es:[di+24], bx
mov es:[di+26], cx
{ Entry 7 (0111): BG FG FG FG }
mov es:[di+28], bx
mov es:[di+30], dx
{ Entry 8 (1000): FG BG BG BG }
mov es:[di+32], cx
mov es:[di+34], ax
{ Entry 9 (1001): FG BG BG FG }
mov es:[di+36], cx
mov es:[di+38], bx
{ Entry 10 (1010): FG BG FG BG }
mov es:[di+40], cx
mov es:[di+42], cx
{ Entry 11 (1011): FG BG FG FG }
mov es:[di+44], cx
mov es:[di+46], dx
{ Entry 12 (1100): FG FG BG BG }
mov es:[di+48], dx
mov es:[di+50], ax
{ Entry 13 (1101): FG FG BG FG }
mov es:[di+52], dx
mov es:[di+54], bx
{ Entry 14 (1110): FG FG FG BG }
mov es:[di+56], dx
mov es:[di+58], cx
{ Entry 15 (1111): FG FG FG FG }
mov es:[di+60], dx
mov es:[di+62], dx
pop es
pop bx
pop di
end; end;
FNibbleFG := FGIdx; FNibbleFG := FGIdx;
FNibbleBG := BGIdx; FNibbleBG := BGIdx;
@ -2170,91 +2387,6 @@ begin
add sp, 4 { remove per-cell GlyphOfs + PixOfs only } add sp, 4 { remove per-cell GlyphOfs + PixOfs only }
end; end;
end; end;
{ Cursor overlay: if cursor is on this row and visible, re-render the }
{ cursor cell with swapped FG/BG using the same ASM inner loop. }
{ Constants are still on the stack from above -- reused here. }
if FCursorVisible and FBlinkOn and (FScrollPos = 0) and
(Row = FCursorRow) and (FCursorCol >= 0) and (FCursorCol < FCols) then
begin
FGIdx := Line^.Cells[FCursorCol].BG;
BGIdx := Line^.Cells[FCursorCol].FG;
CharCode := Ord(Line^.Cells[FCursorCol].Ch);
{ Rebuild nibble table for cursor colors }
TabPtr := PPixelBuf(FGlyphBuf);
for I := 0 to 15 do
begin
Ofs := I * 4;
if (I and 8) <> 0 then TabPtr^[Ofs] := FGIdx
else TabPtr^[Ofs] := BGIdx;
if (I and 4) <> 0 then TabPtr^[Ofs + 1] := FGIdx
else TabPtr^[Ofs + 1] := BGIdx;
if (I and 2) <> 0 then TabPtr^[Ofs + 2] := FGIdx
else TabPtr^[Ofs + 2] := BGIdx;
if (I and 1) <> 0 then TabPtr^[Ofs + 3] := FGIdx
else TabPtr^[Ofs + 3] := BGIdx;
end;
FNibbleFG := FGIdx;
FNibbleBG := BGIdx;
GlyphOfs := 64 + Word(CharCode) shl 5;
PixOfs := Word(CellH - 1) * Stride + Word(FCursorCol) * 8;
asm
push PixOfs
push GlyphOfs
push bp
mov bp, sp
push ds
push bx
push si
push di
mov si, [bp+2]
mov es, [bp+8]
mov di, [bp+4]
mov cx, [bp+10]
xor bh, bh
mov ds, [bp+6]
@curloop:
mov al, [si]
inc si
mov ah, al
and al, $F0
shr al, 1
shr al, 1
mov bl, al
mov dx, [bx]
mov es:[di], dx
mov dx, [bx+2]
mov es:[di+2], dx
mov al, ah
and al, $0F
shl al, 1
shl al, 1
mov bl, al
mov dx, [bx]
mov es:[di+4], dx
mov dx, [bx+2]
mov es:[di+6], dx
sub di, [bp+12]
dec cx
jnz @curloop
pop di
pop si
pop bx
pop ds
pop bp
add sp, 4
end;
end; end;
{ Remove constant mini-frame words pushed before the column loop } { Remove constant mini-frame words pushed before the column loop }

View file

@ -1256,15 +1256,41 @@ int16_t FAR PASCAL _export reccom(int16_t commId, void FAR *buf, int16_t len)
dst = (uint8_t FAR *)buf; dst = (uint8_t FAR *)buf;
bytesRead = 0; bytesRead = 0;
// Block copy from ring buffer, splitting at wrap point.
// Two _fmemcpy calls replace per-byte loop with far pointer overhead.
_disable(); _disable();
while (bytesRead < len && port->rxCount > 0) { {
*dst++ = port->rxBuf[port->rxTail]; uint16_t avail;
port->rxTail++; uint16_t chunk;
if (port->rxTail >= port->rxSize) {
port->rxTail = 0; avail = port->rxCount;
if (avail > (uint16_t)len) {
avail = (uint16_t)len;
}
if (avail > 0) {
// First chunk: tail to end of buffer (or avail, whichever is smaller)
chunk = port->rxSize - port->rxTail;
if (chunk > avail) {
chunk = avail;
}
_fmemcpy(dst, port->rxBuf + port->rxTail, chunk);
dst += chunk;
bytesRead = chunk;
// Second chunk: wrap around to start of buffer
if (bytesRead < avail) {
chunk = avail - bytesRead;
_fmemcpy(dst, port->rxBuf, chunk);
bytesRead += chunk;
}
port->rxTail += bytesRead;
if (port->rxTail >= port->rxSize) {
port->rxTail -= port->rxSize;
}
port->rxCount -= bytesRead;
} }
port->rxCount--;
bytesRead++;
} }
_enable(); _enable();