PEI slam and dirty tracking!

2026-04-30 13:08:53 -05:00 · 2026-04-30 13:08:53 -05:00 · 065be89bff
commit 065be89bff
parent af366e7e81
3 changed files with 654 additions and 204 deletions
--- a/src/core/draw.c
+++ b/src/core/draw.c
@ -188,13 +188,13 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
        if (y < 0 || y >= SURFACE_HEIGHT || x < 0 || x >= SURFACE_WIDTH) {
            continue;
        }
-        row = &s->pixels[y * SURFACE_BYTES_PER_ROW];

        // Highest-tier asm fast path: seed-test + walk-left + walk-right
        // + 1-row fill + scan-above + scan-below + push, all in one
        // cross-segment call. The asm caches row addr / match decoder
        // across every sub-operation. C just pops and dispatches; this
-        // path completes the entire per-seed work.
+        // path completes the entire per-seed work and computes the row
+        // address itself, so we don't pay y*160 in C unless we fall back.
        {
            bool seedMatched;
            if (halFastFloodWalkAndScans(s->pixels, x, y,
@ -206,6 +206,10 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
            }
        }

+        // Fallback path needs row; compute it here so the asm path
+        // above doesn't pay for an unused y*160 multiply on every iter.
+        row = &s->pixels[y * SURFACE_BYTES_PER_ROW];
+
        // Tier-2 asm fast path: combined seed test + walk-left +
        // walk-right in one cross-segment call. Falls back to the
        // pure-C walks below on ports without an asm implementation.
--- a/src/port/iigs/hal.c
+++ b/src/port/iigs/hal.c
@ -164,13 +164,13 @@ static uint8_t  gCachedScb    [SURFACE_HEIGHT];
 static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
 static bool     gCacheValid = false;

-// PEI slam scratch shared with src/port/iigs/peislam.asm. File-scope
-// non-static so the asm can `ext` them; all accesses inside the slam
-// use long-mode addressing so they bypass the //e RAMRD redirect the
-// slam turns on for the duration of the run.
+// PEI slam scratch. File-scope non-static so the asm can `ext` them;
+// all accesses inside the slam use long-mode `>` addressing so they
+// bypass the //e RAMRD redirect the slam turns on for its duration.
 volatile uint16_t gPeiOrigSp;
 volatile uint8_t  gPeiOrigShadow;
 volatile uint16_t gPeiTempRowBase;
+volatile uint16_t gPeiCurRow;            // row counter saved across slam (stack is hijacked)

 // Defined in src/port/iigs/peislam.asm, in its own load segment
 // (DRAWPRIMS) so the GS/OS loader places it in a different bank from
--- a/src/port/iigs/joeyDraw.asm
+++ b/src/port/iigs/joeyDraw.asm
@ -1362,82 +1362,243 @@ dcLoopBody      anop
                lda     >gRowOffsetLut,x     ; A = y*160
                sta     >dcRowXN

-* 8 octant plots. dcPlotPx wants A=col, X=rowBase. LDX has no long-
-* absolute mode, so for each plot we stash col, load row via LDA/TAX,
-* then reload col into A.
-* Octants 1-4 use the y-row pair (cx +/- x, cy +/- y).
+* 8 octant plots, fully inlined. Each plot:
+*   1. col = (acx +/- dcX|dcY)  -> A
+*   2. save col -> dcSavedCol (for parity test)
+*   3. byteIdx = col >> 1; byte addr = byteIdx + rowBase  -> Y
+*   4. test col & 1; do high or low nibble RMW
+* Skips the JSR/RTS to dcPlotPx (~12 cyc) and the row-via-X hand-off
+* (lda row + tax, then sta dcMulTmp + txa inside dcPlotPx); dcSavedCol
+* is still stored and reloaded for the parity test. Per plot: ~60 cyc
+* vs ~80 with JSR. ~20 cyc/plot x ~2560 plots (demo's 4 circles) = ~18 ms.
+*
+* Each plot has its own dcOddN / dcDoneN labels (ORCA/M needs unique labels).
+
+* Octants 1-4: y-row pair (cx +/- x, cy +/- y).
+* Plot 1: (cx+x, cy+y)
                lda     acx
                clc
                adc     >dcX
                sta     >dcSavedCol
-                lda     >dcRowYP
-                tax
+                lsr     a
+                clc
+                adc     >dcRowYP
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx+x, cy+y)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd1
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone1
+dcOdd1          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone1         anop
+                rep     #$20
+                LONGA   ON

+* Plot 2: (cx-x, cy+y)
                lda     acx
                sec
                sbc     >dcX
                sta     >dcSavedCol
-                lda     >dcRowYP
-                tax
+                lsr     a
+                clc
+                adc     >dcRowYP
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx-x, cy+y)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd2
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone2
+dcOdd2          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone2         anop
+                rep     #$20
+                LONGA   ON

+* Plot 3: (cx+x, cy-y)
                lda     acx
                clc
                adc     >dcX
                sta     >dcSavedCol
-                lda     >dcRowYN
-                tax
+                lsr     a
+                clc
+                adc     >dcRowYN
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx+x, cy-y)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd3
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone3
+dcOdd3          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone3         anop
+                rep     #$20
+                LONGA   ON

+* Plot 4: (cx-x, cy-y)
                lda     acx
                sec
                sbc     >dcX
                sta     >dcSavedCol
-                lda     >dcRowYN
-                tax
+                lsr     a
+                clc
+                adc     >dcRowYN
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx-x, cy-y)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd4
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone4
+dcOdd4          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone4         anop
+                rep     #$20
+                LONGA   ON

-* Octants 5-8 use the x-row pair (cx +/- y, cy +/- x).
+* Octants 5-8: x-row pair (cx +/- y, cy +/- x).
+* Plot 5: (cx+y, cy+x)
                lda     acx
                clc
                adc     >dcY
                sta     >dcSavedCol
-                lda     >dcRowXP
-                tax
+                lsr     a
+                clc
+                adc     >dcRowXP
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx+y, cy+x)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd5
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone5
+dcOdd5          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone5         anop
+                rep     #$20
+                LONGA   ON

+* Plot 6: (cx-y, cy+x)
                lda     acx
                sec
                sbc     >dcY
                sta     >dcSavedCol
-                lda     >dcRowXP
-                tax
+                lsr     a
+                clc
+                adc     >dcRowXP
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx-y, cy+x)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd6
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone6
+dcOdd6          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone6         anop
+                rep     #$20
+                LONGA   ON

+* Plot 7: (cx+y, cy-x)
                lda     acx
                clc
                adc     >dcY
                sta     >dcSavedCol
-                lda     >dcRowXN
-                tax
+                lsr     a
+                clc
+                adc     >dcRowXN
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx+y, cy-x)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd7
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone7
+dcOdd7          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone7         anop
+                rep     #$20
+                LONGA   ON

+* Plot 8: (cx-y, cy-x)
                lda     acx
                sec
                sbc     >dcY
                sta     >dcSavedCol
-                lda     >dcRowXN
-                tax
+                lsr     a
+                clc
+                adc     >dcRowXN
+                tay
                lda     >dcSavedCol
-                jsr     dcPlotPx             ; (cx-y, cy-x)
+                sep     #$20
+                LONGA   OFF
+                and     #1
+                bne     dcOdd8
+                lda     [pix],y
+                and     #$0F
+                ora     >dcNibHi
+                sta     [pix],y
+                bra     dcDone8
+dcOdd8          anop
+                lda     [pix],y
+                and     #$F0
+                ora     >dcNibLo
+                sta     [pix],y
+dcDone8         anop
+                rep     #$20
+                LONGA   ON

 * Update Bresenham: y++; if err <= 0: err += 2y+1; else x--; err += 2(y-x)+1.
                lda     >dcY
@ -1483,44 +1644,9 @@ dcExit          anop

 * dcMul160 deleted -- callers now expand the y160lut macro inline.

-****************************************************************
-* dcPlotPx: plot a pixel at column A, with row-base offset in X.
-* M=16, X=16 on entry. Trashes A, X, Y, P. D and B preserved.
-* Switches to M=8 for the byte RMW then back to M=16 for caller.
-****************************************************************
-dcPlotPx        anop
-                lsr     a                    ; A = col>>1, C = col & 1
-                bcs     dcPlotOdd
-* Even column: high nibble.
-                sta     >dcMulTmp
-                txa
-                clc
-                adc     >dcMulTmp
-                tay
-                sep     #$20
-                LONGA   OFF
-                lda     [pix],y
-                and     #$0F
-                ora     >dcNibHi
-                sta     [pix],y
-                rep     #$20
-                LONGA   ON
-                rts
-dcPlotOdd       anop
-                sta     >dcMulTmp
-                txa
-                clc
-                adc     >dcMulTmp
-                tay
-                sep     #$20
-                LONGA   OFF
-                lda     [pix],y
-                and     #$F0
-                ora     >dcNibLo
-                sta     [pix],y
-                rep     #$20
-                LONGA   ON
-                rts
+* dcPlotPx deleted -- the 8 octant plot sites now expand the plot
+* logic inline (no JSR/RTS, no row-base TAX/TXA or dcMulTmp
+* round-trip per plot; dcSavedCol is still used for the parity test).
                end


@ -2224,39 +2350,205 @@ bpal            equ    4
                adc     #8
                tcd

-* 1. Pixel blit (DBR ends up = $E1 after MVN).
-                ldx     #$2000
-                ldy     #$2000
-                lda     #31999
-                mvn     $010000,$E10000
-
-* 2. SCB upload (200 bytes). DBR = $E1, so sta abs,Y -> $E1:abs+Y.
-                ldy     #0
+* 1. SCB upload (200 bytes) via MVN. Done BEFORE the PEI-slam so the
+* SEI window only spans the slam itself (~38 ms). Source bank is
+* runtime-patched into the MVN instruction (encoding: $54 dst src,
+* so byte +2 is src).
                sep     #$20
                LONGA   OFF
-bscbLoop        anop
-                cpy     #200
-                beq     bscbDone
-                lda     [bscb],y
-                sta     $9D00,y
-                iny
-                bra     bscbLoop
-bscbDone        anop
+                lda     bscb+2
+                sta     >mvnScbInst+2
+                rep     #$20
+                LONGA   ON
+                lda     bscb
+                tax
+                ldy     #$9D00
+                lda     #199
+mvnScbInst      mvn     $000000,$E10000
+
+* 2. Palette upload (512 bytes) via MVN. Same trick.
+                sep     #$20
+                LONGA   OFF
+                lda     bpal+2
+                sta     >mvnPalInst+2
+                rep     #$20
+                LONGA   ON
+                lda     bpal
+                tax
+                ldy     #$9E00
+                lda     #511
+mvnPalInst      mvn     $000000,$E10000
+
+* 3. Pixel blit via PEI-slam, with per-row dirty skip.
+* PEI-slam: SP hijacked into the SHR shadow region of bank $01, AUXWRITE
+* + RAMRD remap bank-$00 stack pushes to bank $01, SHR shadow mirrors
+* bank-$01 writes to $E1. Result: PEI dp pushes from DP=$01:row_start
+* land at $E1:row_start (160 bytes / row at ~6 cyc per 2 bytes).
+*   ~480 cyc/row vs MVN's ~1120 cyc/row -- 2.3x faster per row.
+* SEI for the duration: soft-switch state and stack hijack would
+* corrupt any IRQ handler that touches bank-0 globals. ~38 ms SEI
+* total for a full 200-row slam; chunk later if audio glitches.
+* Dirty skip: rows where gStageMinWord[y] > gStageMaxWord[y] are
+* clean and not slammed. Saves big on sparse-update demos; for
+* full-screen presents (DRAW), every row slams.
+
+                tsc
+                sta     >gPeiOrigSp
+                sep     #$20
+                LONGA   OFF
+                lda     >$00C035
+                sta     >gPeiOrigShadow
                rep     #$20
                LONGA   ON

-* 3. Palette upload (512 bytes).
-                ldy     #0
+                sei
+
                sep     #$20
                LONGA   OFF
-bpalLoop        anop
-                cpy     #512
-                beq     bpalDone
-                lda     [bpal],y
-                sta     $9E00,y
-                iny
-                bra     bpalLoop
-bpalDone        anop
+                lda     >gPeiOrigShadow
+                and     #$F1                 ; clear bits 1,2,3 -> SHR shadow ON
+                sta     >$00C035
+                lda     #0
+                sta     >$00C005             ; AUXWRITE on
+                sta     >$00C003             ; RAMRD on
+                rep     #$20
+                LONGA   ON
+
+                ldx     #0                   ; X = row counter (need X because
+*                                            long-abs,Y doesn't exist on 65816 --
+*                                            only long-abs,X does, so the dirty-
+*                                            check `lda >gStageMinWord,x` works.)
+peiRowLoop      anop
+                cpx     #200
+                bcc     peiRowCheck
+                brl     peiRowsDone
+peiRowCheck     anop
+                sep     #$20
+                LONGA   OFF
+                lda     >gStageMinWord,x
+                cmp     >gStageMaxWord,x
+                rep     #$20
+                LONGA   ON
+                bcc     peiSlamRow           ; min < max -> dirty
+                beq     peiSlamRow           ; min == max -> 1-word dirty
+                inx                          ; clean row, skip
+                brl     peiRowLoop
+
+peiSlamRow      anop
+* Save X in long-mode scratch: SP is hijacked into the SHR shadow region (bank $01, mirrored to $E1), so PHX is unusable.
+                txa
+                sta     >gPeiCurRow
+                asl     a                    ; A = y*2 (LUT byte offset)
+                tax
+                lda     >gRowOffsetLut,x     ; A = y*160
+                clc
+                adc     #$2000               ; A = row_start
+                tcd                          ; D = row_start (PEI dp base)
+                clc
+                adc     #159
+                tcs                          ; SP = row_start + 159
+
+* 80 PEIs from DP+$9E down to DP+$00. Each pushes 2 bytes; SP decreases
+* by 2 each PEI. Final SP = row_start - 1. Bytes land at $E1:row_start
+* through $E1:row_start+159 (in correct memory order because we walk
+* DP offsets backwards).
+                pei     $9E
+                pei     $9C
+                pei     $9A
+                pei     $98
+                pei     $96
+                pei     $94
+                pei     $92
+                pei     $90
+                pei     $8E
+                pei     $8C
+                pei     $8A
+                pei     $88
+                pei     $86
+                pei     $84
+                pei     $82
+                pei     $80
+                pei     $7E
+                pei     $7C
+                pei     $7A
+                pei     $78
+                pei     $76
+                pei     $74
+                pei     $72
+                pei     $70
+                pei     $6E
+                pei     $6C
+                pei     $6A
+                pei     $68
+                pei     $66
+                pei     $64
+                pei     $62
+                pei     $60
+                pei     $5E
+                pei     $5C
+                pei     $5A
+                pei     $58
+                pei     $56
+                pei     $54
+                pei     $52
+                pei     $50
+                pei     $4E
+                pei     $4C
+                pei     $4A
+                pei     $48
+                pei     $46
+                pei     $44
+                pei     $42
+                pei     $40
+                pei     $3E
+                pei     $3C
+                pei     $3A
+                pei     $38
+                pei     $36
+                pei     $34
+                pei     $32
+                pei     $30
+                pei     $2E
+                pei     $2C
+                pei     $2A
+                pei     $28
+                pei     $26
+                pei     $24
+                pei     $22
+                pei     $20
+                pei     $1E
+                pei     $1C
+                pei     $1A
+                pei     $18
+                pei     $16
+                pei     $14
+                pei     $12
+                pei     $10
+                pei     $0E
+                pei     $0C
+                pei     $0A
+                pei     $08
+                pei     $06
+                pei     $04
+                pei     $02
+                pei     $00
+
+                lda     >gPeiCurRow
+                tax
+                inx
+                brl     peiRowLoop
+
+peiRowsDone     anop
+* Restore SP, soft-switches.
+                lda     >gPeiOrigSp
+                tcs
+                sep     #$20
+                LONGA   OFF
+                lda     >gPeiOrigShadow
+                sta     >$00C035
+                lda     #0
+                sta     >$00C004             ; AUXWRITE off
+                sta     >$00C002             ; RAMRD off
                rep     #$20
                LONGA   ON

@ -2264,7 +2556,7 @@ bpalDone        anop
                LONGI   OFF
                pld
                plb
-                plp
+                plp                          ; restores I (pre-SEI value)
                rtl
                end

@ -2889,47 +3181,148 @@ wsScanCurHit    equ    27      ; alias wsMaxSp.hi, 8-bit
                brl     wsWalkBndEntry

 ***** EQUAL MODE WALK *****
+* Seed + walk-left + walk-right with the pixel test inlined (kills
+* JSR/RTS overhead per pixel) and per-iter long-mode access to
+* gFloodLeftX/RightX replaced by DP-relative <wsScanCurX. Pattern at
+* the seed test (the walk loops reuse a cached byte where they can):
+*   lsr a           ; byteIdx + parity
+*   tay
+*   sep #$20 / lda [wsRow],y / nibble extract / cmp >wsMatchByte / rep #$20
 wsWalkEqEntry   anop
-* Seed test at wsX (inline eq).
+* --- SEED TEST EQ (inline) ---
                lda     wsX
-                jsr     wsTestEq             ; A = 1 if pix == matchByte
+                lsr     a
+                tay
+                sep     #$20
+                LONGA   OFF
+                bcs     wsSeedEqOdd
+                lda     [wsRow],y
+                lsr     a
+                lsr     a
+                lsr     a
+                lsr     a
+                bra     wsSeedEqHave
+wsSeedEqOdd     anop
+                lda     [wsRow],y
+                and     #$0F
+wsSeedEqHave    anop
+                cmp     >wsMatchByte
+                rep     #$20
+                LONGA   ON
+                bne     wsSeedEqMiss
+                lda     #1
+                sta     >gFloodSeedMatch
+                bra     wsWalkEqSeedOk
+wsSeedEqMiss    anop
+                lda     #0
                sta     >gFloodSeedMatch
-                cmp     #0
-                bne     wsWalkEqSeedOk
                brl     wsExit
 wsWalkEqSeedOk  anop

-* Walk left: leftX = wsX; while leftX > 0 and pixel(leftX-1) matches,
-* leftX--. Holds leftX in <wsScanCurX (DP) for the loop.
+* --- WALK LEFT EQ (byte-cached: 1 byte read per 2 walked pixels) ---
+* Splits the loop into evenEntry / oddEntry paths.
+*   evenEntry: currentX even -> test column C-1 (odd, low nib of byte Y-1).
+*              Read NEW byte at Y-1; cache; test low nib.
+*   oddEntry:  currentX odd  -> test column C-1 (even, high nib of byte Y).
+*              Reuse CACHED byte from previous iter; extract high nib.
+* Initial: peel parity once to seed the cache and pick entry point.
                lda     wsX
                sta     <wsScanCurX
-wsLeftEqLoop    anop
+                lsr     a
+                tay                          ; Y = byteIdx
+                bcc     wsLEqEvenEntry       ; parity 0: enter even path
+* parity 1 (odd): seed cache then drop into odd path.
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                rep     #$20
+                LONGA   ON
+                bra     wsLEqOddEntry
+
+wsLEqEvenEntry  anop
                lda     <wsScanCurX
                beq     wsLeftEqDone
-                dec     a
-                jsr     wsTestEq
-                cmp     #0
-                beq     wsLeftEqDone
+                dey
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                and     #$0F
+                cmp     >wsMatchByte
+                rep     #$20
+                LONGA   ON
+                bne     wsLeftEqDone
                dec     <wsScanCurX
-                brl     wsLeftEqLoop
+wsLEqOddEntry   anop
+                lda     <wsScanCurX
+                beq     wsLeftEqDone
+                sep     #$20
+                LONGA   OFF
+                lda     <wsScanByte
+                lsr     a
+                lsr     a
+                lsr     a
+                lsr     a
+                cmp     >wsMatchByte
+                rep     #$20
+                LONGA   ON
+                bne     wsLeftEqDone
+                dec     <wsScanCurX
+                bra     wsLEqEvenEntry
 wsLeftEqDone    anop
                lda     <wsScanCurX
                sta     >gFloodLeftX

-* Walk right: rightX = wsX; while rightX < 319 and pixel(rightX+1)
-* matches, rightX++. Holds rightX in <wsScanCurX.
+* --- WALK RIGHT EQ (byte-cached) ---
+*   evenEntry: currentX even -> test C+1 (odd, low nib of CACHED byte at Y).
+*   oddEntry:  currentX odd  -> test C+1 (even, high nib of byte Y+1).
+*              Inc Y, read NEW byte, cache, test high nib.
                lda     wsX
                sta     <wsScanCurX
-wsRightEqLoop   anop
+                lsr     a
+                tay
+                bcs     wsREqOddEntry
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                rep     #$20
+                LONGA   ON
+                bra     wsREqEvenEntry
+
+wsREqEvenEntry  anop
                lda     <wsScanCurX
                cmp     #319
                bcs     wsRightEqDone
-                inc     a
-                jsr     wsTestEq
-                cmp     #0
-                beq     wsRightEqDone
+                sep     #$20
+                LONGA   OFF
+                lda     <wsScanByte
+                and     #$0F
+                cmp     >wsMatchByte
+                rep     #$20
+                LONGA   ON
+                bne     wsRightEqDone
                inc     <wsScanCurX
-                brl     wsRightEqLoop
+wsREqOddEntry   anop
+                lda     <wsScanCurX
+                cmp     #319
+                bcs     wsRightEqDone
+                iny
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                lsr     a
+                lsr     a
+                lsr     a
+                lsr     a
+                cmp     >wsMatchByte
+                rep     #$20
+                LONGA   ON
+                bne     wsRightEqDone
+                inc     <wsScanCurX
+                bra     wsREqEvenEntry
 wsRightEqDone   anop
                lda     <wsScanCurX
                sta     >gFloodRightX
@ -2937,41 +3330,160 @@ wsRightEqDone   anop

 ***** BOUNDARY MODE WALK *****
 wsWalkBndEntry  anop
+* --- SEED TEST BND (inline) ---
                lda     wsX
-                jsr     wsTestBnd
+                lsr     a
+                tay
+                sep     #$20
+                LONGA   OFF
+                bcs     wsSeedBndOdd
+                lda     [wsRow],y
+                lsr     a
+                lsr     a
+                lsr     a
+                lsr     a
+                bra     wsSeedBndHave
+wsSeedBndOdd    anop
+                lda     [wsRow],y
+                and     #$0F
+wsSeedBndHave   anop
+                cmp     >wsMatchByte
+                beq     wsSeedBndMiss
+                cmp     >wsNewByte
+                beq     wsSeedBndMiss
+                rep     #$20
+                LONGA   ON
+                lda     #1
+                sta     >gFloodSeedMatch
+                bra     wsWalkBndSeedOk
+wsSeedBndMiss   anop
+                rep     #$20
+                LONGA   ON
+                lda     #0
                sta     >gFloodSeedMatch
-                cmp     #0
-                bne     wsWalkBndSeedOk
                brl     wsExit
 wsWalkBndSeedOk anop

+* --- WALK LEFT BND (byte-cached) ---
                lda     wsX
                sta     <wsScanCurX
-wsLeftBndLoop   anop
+                lsr     a
+                tay
+                bcc     wsLBndEvenEntry
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                rep     #$20
+                LONGA   ON
+                bra     wsLBndOddEntry
+
+wsLBndEvenEntry anop
                lda     <wsScanCurX
                beq     wsLeftBndDone
-                dec     a
-                jsr     wsTestBnd
-                cmp     #0
-                beq     wsLeftBndDone
+                dey
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                and     #$0F
+                cmp     >wsMatchByte
+                beq     wsLBndStop
+                cmp     >wsNewByte
+                beq     wsLBndStop
+                rep     #$20
+                LONGA   ON
                dec     <wsScanCurX
-                brl     wsLeftBndLoop
+                bra     wsLBndOddEntry2
+wsLBndStop      anop
+                rep     #$20
+                LONGA   ON
+                bra     wsLeftBndDone
+wsLBndOddEntry2 anop
+wsLBndOddEntry  anop
+                lda     <wsScanCurX
+                beq     wsLeftBndDone
+                sep     #$20
+                LONGA   OFF
+                lda     <wsScanByte
+                lsr     a
+                lsr     a
+                lsr     a
+                lsr     a
+                cmp     >wsMatchByte
+                beq     wsLBndStop2
+                cmp     >wsNewByte
+                beq     wsLBndStop2
+                rep     #$20
+                LONGA   ON
+                dec     <wsScanCurX
+                bra     wsLBndEvenEntry
+wsLBndStop2     anop
+                rep     #$20
+                LONGA   ON
 wsLeftBndDone   anop
                lda     <wsScanCurX
                sta     >gFloodLeftX

+* --- WALK RIGHT BND (byte-cached) ---
                lda     wsX
                sta     <wsScanCurX
-wsRightBndLoop  anop
+                lsr     a
+                tay
+                bcs     wsRBndOddEntry
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                rep     #$20
+                LONGA   ON
+                bra     wsRBndEvenEntry
+
+wsRBndEvenEntry anop
                lda     <wsScanCurX
                cmp     #319
                bcs     wsRightBndDone
-                inc     a
-                jsr     wsTestBnd
-                cmp     #0
-                beq     wsRightBndDone
+                sep     #$20
+                LONGA   OFF
+                lda     <wsScanByte
+                and     #$0F
+                cmp     >wsMatchByte
+                beq     wsRBndStop
+                cmp     >wsNewByte
+                beq     wsRBndStop
+                rep     #$20
+                LONGA   ON
                inc     <wsScanCurX
-                brl     wsRightBndLoop
+                bra     wsRBndOddEntry2
+wsRBndStop      anop
+                rep     #$20
+                LONGA   ON
+                bra     wsRightBndDone
+wsRBndOddEntry2 anop
+wsRBndOddEntry  anop
+                lda     <wsScanCurX
+                cmp     #319
+                bcs     wsRightBndDone
+                iny
+                sep     #$20
+                LONGA   OFF
+                lda     [wsRow],y
+                sta     <wsScanByte
+                lsr     a
+                lsr     a
+                lsr     a
+                lsr     a
+                cmp     >wsMatchByte
+                beq     wsRBndStop2
+                cmp     >wsNewByte
+                beq     wsRBndStop2
+                rep     #$20
+                LONGA   ON
+                inc     <wsScanCurX
+                bra     wsRBndEvenEntry
+wsRBndStop2     anop
+                rep     #$20
+                LONGA   ON
 wsRightBndDone  anop
                lda     <wsScanCurX
                sta     >gFloodRightX
@ -3108,75 +3620,9 @@ wsExit          anop
                plp
                rtl

-* wsTestEq: test pixel at column A against matchByte (eq mode).
-* In:  A = column (M=16). Reads from wsRow.
-* Out: A = 1 if pix == matchByte, 0 otherwise. M=16 on exit.
-* Trashes A, Y, P. Preserves X, D, B.
-wsTestEq        anop
-                lsr     a                    ; A = byteIdx; C = column & 1
-                tay
-                bcs     wsTeqOdd
-                sep     #$20
-                LONGA   OFF
-                lda     [wsRow],y
-                lsr     a
-                lsr     a
-                lsr     a
-                lsr     a
-                bra     wsTeqHave
-wsTeqOdd        anop
-                sep     #$20
-                LONGA   OFF
-                lda     [wsRow],y
-                and     #$0F
-wsTeqHave       anop
-                cmp     >wsMatchByte
-                rep     #$20
-                LONGA   ON
-                bne     wsTeqNo
-                lda     #1
-                rts
-wsTeqNo         anop
-                lda     #0
-                rts
-
-
-* wsTestBnd: test pixel at column A against (matchByte, newByte) for
-* boundary-mode flood: returns 1 iff pix != matchByte AND pix != newByte.
-* In:  A = column (M=16). Reads from wsRow.
-* Out: A = 1 if qualifies, 0 otherwise. M=16 on exit.
-* Trashes A, Y, P. Preserves X, D, B.
-wsTestBnd       anop
-                lsr     a
-                tay
-                bcs     wsTbnOdd
-                sep     #$20
-                LONGA   OFF
-                lda     [wsRow],y
-                lsr     a
-                lsr     a
-                lsr     a
-                lsr     a
-                bra     wsTbnHave
-wsTbnOdd        anop
-                sep     #$20
-                LONGA   OFF
-                lda     [wsRow],y
-                and     #$0F
-wsTbnHave       anop
-                cmp     >wsMatchByte
-                beq     wsTbnNo
-                cmp     >wsNewByte
-                beq     wsTbnNo
-                rep     #$20
-                LONGA   ON
-                lda     #1
-                rts
-wsTbnNo         anop
-                rep     #$20
-                LONGA   ON
-                lda     #0
-                rts
+* wsTestEq / wsTestBnd helpers deleted -- the seed test and walk-out
+* loops now expand the test inline (no JSR/RTS overhead per walked
+* pixel; saves ~12 cyc/pixel x ~3600 walked pixels in the demo).


 * wsScanAndPush: walk wsScanRow[wsLeftX..wsRightX] for run-edge