PEI slam and dirty tracking!

This commit is contained in:
Scott Duensing 2026-04-30 13:08:53 -05:00
parent af366e7e81
commit 065be89bff
3 changed files with 654 additions and 204 deletions

View file

@ -188,13 +188,13 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
if (y < 0 || y >= SURFACE_HEIGHT || x < 0 || x >= SURFACE_WIDTH) {
continue;
}
row = &s->pixels[y * SURFACE_BYTES_PER_ROW];
// Highest-tier asm fast path: seed-test + walk-left + walk-right
// + 1-row fill + scan-above + scan-below + push, all in one
// cross-segment call. The asm caches row addr / match decoder
// across every sub-operation. C just pops and dispatches; this
// path completes the entire per-seed work.
// path completes the entire per-seed work and computes the row
// address itself, so we don't pay y*160 in C unless we fall back.
{
bool seedMatched;
if (halFastFloodWalkAndScans(s->pixels, x, y,
@ -206,6 +206,10 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8
}
}
// Fallback path needs row; compute it here so the asm path
// above doesn't pay for an unused y*160 multiply on every iter.
row = &s->pixels[y * SURFACE_BYTES_PER_ROW];
// Tier-2 asm fast path: combined seed test + walk-left +
// walk-right in one cross-segment call. Falls back to the
// pure-C walks below on ports without an asm implementation.

View file

@ -164,13 +164,13 @@ static uint8_t gCachedScb [SURFACE_HEIGHT];
static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE];
static bool gCacheValid = false;
// PEI slam scratch shared with src/port/iigs/peislam.asm. File-scope
// non-static so the asm can `ext` them; all accesses inside the slam
// use long-mode addressing so they bypass the //e RAMRD redirect the
// slam turns on for the duration of the run.
// PEI slam scratch. File-scope non-static so the asm can `ext` them;
// all accesses inside the slam use long-mode `>` addressing so they
// bypass the //e RAMRD redirect the slam turns on for its duration.
volatile uint16_t gPeiOrigSp;
volatile uint8_t gPeiOrigShadow;
volatile uint16_t gPeiTempRowBase;
volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked)
// Defined in src/port/iigs/peislam.asm, in its own load segment
// (DRAWPRIMS) so the GS/OS loader places it in a different bank from

View file

@ -1362,82 +1362,243 @@ dcLoopBody anop
lda >gRowOffsetLut,x ; A = y*160
sta >dcRowXN
* 8 octant plots. dcPlotPx wants A=col, X=rowBase. LDX has no long-
* absolute mode, so for each plot we stash col, load row via LDA/TAX,
* then reload col into A.
* Octants 1-4 use the y-row pair (cx +/- x, cy +/- y).
* 8 octant plots, fully inlined. Each plot:
* 1. col = (acx +/- dcX|dcY) -> A
* 2. save col -> dcSavedCol (for parity test)
* 3. byteIdx = col >> 1; byte addr = byteIdx + rowBase -> Y
* 4. test col & 1; do high or low nibble RMW
* Skips the JSR/RTS to dcPlotPx (~12 cyc) and the row-base hand-off
* through X (lda row + tax at the call site, txa inside the helper).
* col is still parked in dcSavedCol and reloaded for the parity test.
* Per plot: ~60 cyc vs ~80 cyc with JSR. ~20 cyc/plot x ~2560 plots
* in the demo's 4 circles = ~18 ms.
*
* Each plot has its own dcOddN / dcDoneN labels (ORCA/M needs every label to be unique).
* Octants 1-4: y-row pair (cx +/- x, cy +/- y).
* Plot 1: (cx+x, cy+y)
lda acx
clc
adc >dcX
sta >dcSavedCol
lda >dcRowYP
tax
lsr a
clc
adc >dcRowYP
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx+x, cy+y)
sep #$20
LONGA OFF
and #1
bne dcOdd1
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone1
dcOdd1 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone1 anop
rep #$20
LONGA ON
* Plot 2: (cx-x, cy+y)
lda acx
sec
sbc >dcX
sta >dcSavedCol
lda >dcRowYP
tax
lsr a
clc
adc >dcRowYP
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx-x, cy+y)
sep #$20
LONGA OFF
and #1
bne dcOdd2
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone2
dcOdd2 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone2 anop
rep #$20
LONGA ON
* Plot 3: (cx+x, cy-y)
lda acx
clc
adc >dcX
sta >dcSavedCol
lda >dcRowYN
tax
lsr a
clc
adc >dcRowYN
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx+x, cy-y)
sep #$20
LONGA OFF
and #1
bne dcOdd3
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone3
dcOdd3 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone3 anop
rep #$20
LONGA ON
* Plot 4: (cx-x, cy-y)
lda acx
sec
sbc >dcX
sta >dcSavedCol
lda >dcRowYN
tax
lsr a
clc
adc >dcRowYN
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx-x, cy-y)
sep #$20
LONGA OFF
and #1
bne dcOdd4
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone4
dcOdd4 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone4 anop
rep #$20
LONGA ON
* Octants 5-8 use the x-row pair (cx +/- y, cy +/- x).
* Octants 5-8: x-row pair (cx +/- y, cy +/- x).
* Plot 5: (cx+y, cy+x)
lda acx
clc
adc >dcY
sta >dcSavedCol
lda >dcRowXP
tax
lsr a
clc
adc >dcRowXP
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx+y, cy+x)
sep #$20
LONGA OFF
and #1
bne dcOdd5
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone5
dcOdd5 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone5 anop
rep #$20
LONGA ON
* Plot 6: (cx-y, cy+x)
lda acx
sec
sbc >dcY
sta >dcSavedCol
lda >dcRowXP
tax
lsr a
clc
adc >dcRowXP
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx-y, cy+x)
sep #$20
LONGA OFF
and #1
bne dcOdd6
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone6
dcOdd6 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone6 anop
rep #$20
LONGA ON
* Plot 7: (cx+y, cy-x)
lda acx
clc
adc >dcY
sta >dcSavedCol
lda >dcRowXN
tax
lsr a
clc
adc >dcRowXN
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx+y, cy-x)
sep #$20
LONGA OFF
and #1
bne dcOdd7
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone7
dcOdd7 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone7 anop
rep #$20
LONGA ON
* Plot 8: (cx-y, cy-x)
lda acx
sec
sbc >dcY
sta >dcSavedCol
lda >dcRowXN
tax
lsr a
clc
adc >dcRowXN
tay
lda >dcSavedCol
jsr dcPlotPx ; (cx-y, cy-x)
sep #$20
LONGA OFF
and #1
bne dcOdd8
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
bra dcDone8
dcOdd8 anop
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
dcDone8 anop
rep #$20
LONGA ON
* Update Bresenham: y++; if err <= 0: err += 2y+1; else x--; err += 2(y-x)+1.
lda >dcY
@ -1483,44 +1644,9 @@ dcExit anop
* dcMul160 deleted -- callers now expand the y160lut macro inline.
****************************************************************
* dcPlotPx: plot a pixel at column A, with row-base offset in X.
* M=16, X=16 on entry. Trashes A, X, Y, P. D and B preserved.
* Switches to M=8 for the byte RMW then back to M=16 for caller.
****************************************************************
dcPlotPx anop
lsr a ; A = col>>1, C = col & 1
bcs dcPlotOdd
* Even column: high nibble.
sta >dcMulTmp
txa
clc
adc >dcMulTmp
tay
sep #$20
LONGA OFF
lda [pix],y
and #$0F
ora >dcNibHi
sta [pix],y
rep #$20
LONGA ON
rts
dcPlotOdd anop
sta >dcMulTmp
txa
clc
adc >dcMulTmp
tay
sep #$20
LONGA OFF
lda [pix],y
and #$F0
ora >dcNibLo
sta [pix],y
rep #$20
LONGA ON
rts
* dcPlotPx deleted -- the 8 octant plot sites now expand the plot
* logic inline (no JSR/RTS and no row-base LDA/TAX hand-off per
* plot; col is still saved to dcSavedCol for the parity test).
end
@ -2224,39 +2350,205 @@ bpal equ 4
adc #8
tcd
* 1. Pixel blit (DBR ends up = $E1 after MVN).
ldx #$2000
ldy #$2000
lda #31999
mvn $010000,$E10000
* 2. SCB upload (200 bytes). DBR = $E1, so sta abs,Y -> $E1:abs+Y.
ldy #0
* 1. SCB upload (200 bytes) via MVN. Done BEFORE the PEI-slam so the
* SEI window only spans the slam itself (~38 ms). Source bank is
* runtime-patched into the MVN instruction (encoding: $54 dst src,
* so byte +2 is src).
sep #$20
LONGA OFF
bscbLoop anop
cpy #200
beq bscbDone
lda [bscb],y
sta $9D00,y
iny
bra bscbLoop
bscbDone anop
lda bscb+2
sta >mvnScbInst+2
rep #$20
LONGA ON
lda bscb
tax
ldy #$9D00
lda #199
mvnScbInst mvn $000000,$E10000
* 2. Palette upload (512 bytes) via MVN. Same trick.
sep #$20
LONGA OFF
lda bpal+2
sta >mvnPalInst+2
rep #$20
LONGA ON
lda bpal
tax
ldy #$9E00
lda #511
mvnPalInst mvn $000000,$E10000
* 3. Pixel blit via PEI-slam, with per-row dirty skip.
* PEI-slam: SP hijacked into the SHR shadow region of bank $01, AUXWRITE
* + RAMRD remap bank-$00 stack pushes to bank $01, SHR shadow mirrors
* bank-$01 writes to $E1. Result: PEI dp pushes from DP=$01:row_start
* land at $E1:row_start (160 bytes / row at ~6 cyc per 2 bytes).
* ~480 cyc/row vs MVN's ~1120 cyc/row -- 2.3x faster per row.
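* SEI for the duration: an IRQ taken mid-slam would push its return
* frame through the hijacked SP (i.e. into pixel data), and any
* handler that touches bank-0 globals would run with the slam's
* soft-switch state still in effect. ~38 ms SEI total for a full
* 200-row slam; chunk later if audio glitches.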
* Dirty skip: rows where gStageMinWord[y] > gStageMaxWord[y] are
* clean and not slammed. Saves big on sparse-update demos; for
* full-screen presents (DRAW), every row slams.
tsc
sta >gPeiOrigSp
sep #$20
LONGA OFF
lda >$00C035
sta >gPeiOrigShadow
rep #$20
LONGA ON
* 3. Palette upload (512 bytes).
ldy #0
sei
sep #$20
LONGA OFF
bpalLoop anop
cpy #512
beq bpalDone
lda [bpal],y
sta $9E00,y
iny
bra bpalLoop
bpalDone anop
lda >gPeiOrigShadow
and #$F1 ; clear bits 1,2,3 -> SHR shadow ON
sta >$00C035
lda #0
sta >$00C005 ; AUXWRITE on
sta >$00C003 ; RAMRD on
rep #$20
LONGA ON
ldx #0 ; X = row counter (need X because
* long-abs,Y doesn't exist on 65816 --
* only long-abs,X does, so the dirty-
* check `lda >gStageMinWord,x` works.)
peiRowLoop anop
cpx #200
bcc peiRowCheck
brl peiRowsDone
peiRowCheck anop
sep #$20
LONGA OFF
lda >gStageMinWord,x
cmp >gStageMaxWord,x
rep #$20
LONGA ON
bcc peiSlamRow ; min < max -> dirty
beq peiSlamRow ; min == max -> 1-word dirty
inx ; clean row, skip
brl peiRowLoop
peiSlamRow anop
* Save X into long-mode scratch (SP is hijacked into the SHR pixel
* region, so a PHX would write into the frame buffer).
txa
sta >gPeiCurRow
asl a ; A = y*2 (LUT byte offset)
tax
lda >gRowOffsetLut,x ; A = y*160
clc
adc #$2000 ; A = row_start
tcd ; D = row_start (PEI dp base)
clc
adc #159
tcs ; SP = row_start + 159
* 80 PEIs from DP+$9E down to DP+$00. Each pushes 2 bytes; SP decreases
* by 2 each PEI. Final SP = row_start - 1. Bytes land at $E1:row_start
* through $E1:row_start+159 (in correct memory order because we walk
* DP offsets backwards).
pei $9E
pei $9C
pei $9A
pei $98
pei $96
pei $94
pei $92
pei $90
pei $8E
pei $8C
pei $8A
pei $88
pei $86
pei $84
pei $82
pei $80
pei $7E
pei $7C
pei $7A
pei $78
pei $76
pei $74
pei $72
pei $70
pei $6E
pei $6C
pei $6A
pei $68
pei $66
pei $64
pei $62
pei $60
pei $5E
pei $5C
pei $5A
pei $58
pei $56
pei $54
pei $52
pei $50
pei $4E
pei $4C
pei $4A
pei $48
pei $46
pei $44
pei $42
pei $40
pei $3E
pei $3C
pei $3A
pei $38
pei $36
pei $34
pei $32
pei $30
pei $2E
pei $2C
pei $2A
pei $28
pei $26
pei $24
pei $22
pei $20
pei $1E
pei $1C
pei $1A
pei $18
pei $16
pei $14
pei $12
pei $10
pei $0E
pei $0C
pei $0A
pei $08
pei $06
pei $04
pei $02
pei $00
lda >gPeiCurRow
tax
inx
brl peiRowLoop
peiRowsDone anop
* Restore SP, soft-switches.
lda >gPeiOrigSp
tcs
sep #$20
LONGA OFF
lda >gPeiOrigShadow
sta >$00C035
lda #0
sta >$00C004 ; AUXWRITE off
sta >$00C002 ; RAMRD off
rep #$20
LONGA ON
@ -2264,7 +2556,7 @@ bpalDone anop
LONGI OFF
pld
plb
plp
plp ; restores I (pre-SEI value)
rtl
end
@ -2889,47 +3181,148 @@ wsScanCurHit equ 27 ; alias wsMaxSp.hi, 8-bit
brl wsWalkBndEntry
***** EQUAL MODE WALK *****
* Seed + walk-left + walk-right with the pixel test inlined (kills
* JSR/RTS overhead per pixel) and per-iter long-mode access to
* gFloodLeftX/RightX replaced by DP-relative <wsScanCurX. Pattern at
* every test site:
* lsr a ; byteIdx + parity
* tay
* sep #$20 / lda [wsRow],y / nibble extract / cmp >wsMatchByte / rep #$20
wsWalkEqEntry anop
* Seed test at wsX (inline eq).
* --- SEED TEST EQ (inline) ---
lda wsX
jsr wsTestEq ; A = 1 if pix == matchByte
lsr a
tay
sep #$20
LONGA OFF
bcs wsSeedEqOdd
lda [wsRow],y
lsr a
lsr a
lsr a
lsr a
bra wsSeedEqHave
wsSeedEqOdd anop
lda [wsRow],y
and #$0F
wsSeedEqHave anop
cmp >wsMatchByte
rep #$20
LONGA ON
bne wsSeedEqMiss
lda #1
sta >gFloodSeedMatch
bra wsWalkEqSeedOk
wsSeedEqMiss anop
lda #0
sta >gFloodSeedMatch
cmp #0
bne wsWalkEqSeedOk
brl wsExit
wsWalkEqSeedOk anop
* Walk left: leftX = wsX; while leftX > 0 and pixel(leftX-1) matches,
* leftX--. Holds leftX in <wsScanCurX (DP) for the loop.
* --- WALK LEFT EQ (byte-cached: 1 byte read per 2 walked pixels) ---
* Splits the loop into evenEntry / oddEntry paths.
* evenEntry: currentX even -> test column C-1 (odd, low nib of byte Y-1).
* Read NEW byte at Y-1; cache; test low nib.
* oddEntry: currentX odd -> test column C-1 (even, high nib of byte Y).
* Reuse CACHED byte from previous iter; extract high nib.
* Initial: peel parity once to seed the cache and pick entry point.
lda wsX
sta <wsScanCurX
wsLeftEqLoop anop
lsr a
tay ; Y = byteIdx
bcc wsLEqEvenEntry ; parity 0: enter even path
* parity 1 (odd): seed cache then drop into odd path.
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
rep #$20
LONGA ON
bra wsLEqOddEntry
wsLEqEvenEntry anop
lda <wsScanCurX
beq wsLeftEqDone
dec a
jsr wsTestEq
cmp #0
beq wsLeftEqDone
dey
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
and #$0F
cmp >wsMatchByte
rep #$20
LONGA ON
bne wsLeftEqDone
dec <wsScanCurX
brl wsLeftEqLoop
wsLEqOddEntry anop
lda <wsScanCurX
beq wsLeftEqDone
sep #$20
LONGA OFF
lda <wsScanByte
lsr a
lsr a
lsr a
lsr a
cmp >wsMatchByte
rep #$20
LONGA ON
bne wsLeftEqDone
dec <wsScanCurX
bra wsLEqEvenEntry
wsLeftEqDone anop
lda <wsScanCurX
sta >gFloodLeftX
* Walk right: rightX = wsX; while rightX < 319 and pixel(rightX+1)
* matches, rightX++. Holds rightX in <wsScanCurX.
* --- WALK RIGHT EQ (byte-cached) ---
* evenEntry: currentX even -> test C+1 (odd, low nib of CACHED byte at Y).
* oddEntry: currentX odd -> test C+1 (even, high nib of byte Y+1).
* Inc Y, read NEW byte, cache, test high nib.
lda wsX
sta <wsScanCurX
wsRightEqLoop anop
lsr a
tay
bcs wsREqOddEntry
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
rep #$20
LONGA ON
bra wsREqEvenEntry
wsREqEvenEntry anop
lda <wsScanCurX
cmp #319
bcs wsRightEqDone
inc a
jsr wsTestEq
cmp #0
beq wsRightEqDone
sep #$20
LONGA OFF
lda <wsScanByte
and #$0F
cmp >wsMatchByte
rep #$20
LONGA ON
bne wsRightEqDone
inc <wsScanCurX
brl wsRightEqLoop
wsREqOddEntry anop
lda <wsScanCurX
cmp #319
bcs wsRightEqDone
iny
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
lsr a
lsr a
lsr a
lsr a
cmp >wsMatchByte
rep #$20
LONGA ON
bne wsRightEqDone
inc <wsScanCurX
bra wsREqEvenEntry
wsRightEqDone anop
lda <wsScanCurX
sta >gFloodRightX
@ -2937,41 +3330,160 @@ wsRightEqDone anop
***** BOUNDARY MODE WALK *****
wsWalkBndEntry anop
* --- SEED TEST BND (inline) ---
lda wsX
jsr wsTestBnd
lsr a
tay
sep #$20
LONGA OFF
bcs wsSeedBndOdd
lda [wsRow],y
lsr a
lsr a
lsr a
lsr a
bra wsSeedBndHave
wsSeedBndOdd anop
lda [wsRow],y
and #$0F
wsSeedBndHave anop
cmp >wsMatchByte
beq wsSeedBndMiss
cmp >wsNewByte
beq wsSeedBndMiss
rep #$20
LONGA ON
lda #1
sta >gFloodSeedMatch
bra wsWalkBndSeedOk
wsSeedBndMiss anop
rep #$20
LONGA ON
lda #0
sta >gFloodSeedMatch
cmp #0
bne wsWalkBndSeedOk
brl wsExit
wsWalkBndSeedOk anop
* --- WALK LEFT BND (byte-cached) ---
lda wsX
sta <wsScanCurX
wsLeftBndLoop anop
lsr a
tay
bcc wsLBndEvenEntry
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
rep #$20
LONGA ON
bra wsLBndOddEntry
wsLBndEvenEntry anop
lda <wsScanCurX
beq wsLeftBndDone
dec a
jsr wsTestBnd
cmp #0
beq wsLeftBndDone
dey
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
and #$0F
cmp >wsMatchByte
beq wsLBndStop
cmp >wsNewByte
beq wsLBndStop
rep #$20
LONGA ON
dec <wsScanCurX
brl wsLeftBndLoop
bra wsLBndOddEntry2
wsLBndStop anop
rep #$20
LONGA ON
bra wsLeftBndDone
wsLBndOddEntry2 anop
wsLBndOddEntry anop
lda <wsScanCurX
beq wsLeftBndDone
sep #$20
LONGA OFF
lda <wsScanByte
lsr a
lsr a
lsr a
lsr a
cmp >wsMatchByte
beq wsLBndStop2
cmp >wsNewByte
beq wsLBndStop2
rep #$20
LONGA ON
dec <wsScanCurX
bra wsLBndEvenEntry
wsLBndStop2 anop
rep #$20
LONGA ON
wsLeftBndDone anop
lda <wsScanCurX
sta >gFloodLeftX
* --- WALK RIGHT BND (byte-cached) ---
lda wsX
sta <wsScanCurX
wsRightBndLoop anop
lsr a
tay
bcs wsRBndOddEntry
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
rep #$20
LONGA ON
bra wsRBndEvenEntry
wsRBndEvenEntry anop
lda <wsScanCurX
cmp #319
bcs wsRightBndDone
inc a
jsr wsTestBnd
cmp #0
beq wsRightBndDone
sep #$20
LONGA OFF
lda <wsScanByte
and #$0F
cmp >wsMatchByte
beq wsRBndStop
cmp >wsNewByte
beq wsRBndStop
rep #$20
LONGA ON
inc <wsScanCurX
brl wsRightBndLoop
bra wsRBndOddEntry2
wsRBndStop anop
rep #$20
LONGA ON
bra wsRightBndDone
wsRBndOddEntry2 anop
wsRBndOddEntry anop
lda <wsScanCurX
cmp #319
bcs wsRightBndDone
iny
sep #$20
LONGA OFF
lda [wsRow],y
sta <wsScanByte
lsr a
lsr a
lsr a
lsr a
cmp >wsMatchByte
beq wsRBndStop2
cmp >wsNewByte
beq wsRBndStop2
rep #$20
LONGA ON
inc <wsScanCurX
bra wsRBndEvenEntry
wsRBndStop2 anop
rep #$20
LONGA ON
wsRightBndDone anop
lda <wsScanCurX
sta >gFloodRightX
@ -3108,75 +3620,9 @@ wsExit anop
plp
rtl
* wsTestEq: test pixel at column A against matchByte (eq mode).
* In: A = column (M=16). Reads from wsRow.
* Out: A = 1 if pix == matchByte, 0 otherwise. M=16 on exit.
* Trashes A, Y, P. Preserves X, D, B.
wsTestEq anop
lsr a ; A = byteIdx; C = column & 1
tay
bcs wsTeqOdd
sep #$20
LONGA OFF
lda [wsRow],y
lsr a
lsr a
lsr a
lsr a
bra wsTeqHave
wsTeqOdd anop
sep #$20
LONGA OFF
lda [wsRow],y
and #$0F
wsTeqHave anop
cmp >wsMatchByte
rep #$20
LONGA ON
bne wsTeqNo
lda #1
rts
wsTeqNo anop
lda #0
rts
* wsTestBnd: test pixel at column A against (matchByte, newByte) for
* boundary-mode flood: returns 1 iff pix != matchByte AND pix != newByte.
* In: A = column (M=16). Reads from wsRow.
* Out: A = 1 if qualifies, 0 otherwise. M=16 on exit.
* Trashes A, Y, P. Preserves X, D, B.
wsTestBnd anop
lsr a
tay
bcs wsTbnOdd
sep #$20
LONGA OFF
lda [wsRow],y
lsr a
lsr a
lsr a
lsr a
bra wsTbnHave
wsTbnOdd anop
sep #$20
LONGA OFF
lda [wsRow],y
and #$0F
wsTbnHave anop
cmp >wsMatchByte
beq wsTbnNo
cmp >wsNewByte
beq wsTbnNo
rep #$20
LONGA ON
lda #1
rts
wsTbnNo anop
rep #$20
LONGA ON
lda #0
rts
* wsTestEq / wsTestBnd helpers deleted -- the seed test and walk-out
* loops now expand the test inline (no JSR/RTS overhead per walked
* pixel; saves ~12 cyc/pixel x ~3600 walked pixels in the demo).
* wsScanAndPush: walk wsScanRow[wsLeftX..wsRightX] for run-edge