diff --git a/src/core/draw.c b/src/core/draw.c index 290e23f..91c220a 100644 --- a/src/core/draw.c +++ b/src/core/draw.c @@ -188,13 +188,13 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 if (y < 0 || y >= SURFACE_HEIGHT || x < 0 || x >= SURFACE_WIDTH) { continue; } - row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; // Highest-tier asm fast path: seed-test + walk-left + walk-right // + 1-row fill + scan-above + scan-below + push, all in one // cross-segment call. The asm caches row addr / match decoder // across every sub-operation. C just pops and dispatches; this - // path completes the entire per-seed work. + // path completes the entire per-seed work and computes the row + // address itself, so we don't pay y*160 in C unless we fall back. { bool seedMatched; if (halFastFloodWalkAndScans(s->pixels, x, y, @@ -206,6 +206,10 @@ static void floodFillInternal(SurfaceT *s, int16_t startX, int16_t startY, uint8 } } + // Fallback path needs row; compute it here so the asm path + // above doesn't pay for an unused y*160 multiply on every iter. + row = &s->pixels[y * SURFACE_BYTES_PER_ROW]; + // Tier-2 asm fast path: combined seed test + walk-left + // walk-right in one cross-segment call. Falls back to the // pure-C walks below on ports without an asm implementation. diff --git a/src/port/iigs/hal.c b/src/port/iigs/hal.c index cb58d48..a198ff0 100644 --- a/src/port/iigs/hal.c +++ b/src/port/iigs/hal.c @@ -164,13 +164,13 @@ static uint8_t gCachedScb [SURFACE_HEIGHT]; static uint16_t gCachedPalette[SURFACE_PALETTE_COUNT][SURFACE_COLORS_PER_PALETTE]; static bool gCacheValid = false; -// PEI slam scratch shared with src/port/iigs/peislam.asm. File-scope -// non-static so the asm can `ext` them; all accesses inside the slam -// use long-mode addressing so they bypass the //e RAMRD redirect the -// slam turns on for the duration of the run. +// PEI slam scratch. 
File-scope non-static so the asm can `ext` them; +// all accesses inside the slam use long-mode `>` addressing so they +// bypass the //e RAMRD redirect the slam turns on for its duration. volatile uint16_t gPeiOrigSp; volatile uint8_t gPeiOrigShadow; volatile uint16_t gPeiTempRowBase; +volatile uint16_t gPeiCurRow; // row counter saved across slam (stack is hijacked) // Defined in src/port/iigs/peislam.asm, in its own load segment // (DRAWPRIMS) so the GS/OS loader places it in a different bank from diff --git a/src/port/iigs/joeyDraw.asm b/src/port/iigs/joeyDraw.asm index c1dd426..7a74fb0 100644 --- a/src/port/iigs/joeyDraw.asm +++ b/src/port/iigs/joeyDraw.asm @@ -1362,82 +1362,243 @@ dcLoopBody anop lda >gRowOffsetLut,x ; A = y*160 sta >dcRowXN -* 8 octant plots. dcPlotPx wants A=col, X=rowBase. LDX has no long- -* absolute mode, so for each plot we stash col, load row via LDA/TAX, -* then reload col into A. -* Octants 1-4 use the y-row pair (cx +/- x, cy +/- y). +* 8 octant plots, fully inlined. Each plot: +* 1. col = (acx +/- dcX|dcY) -> A +* 2. save col -> dcSavedCol (for parity test) +* 3. byteIdx = col >> 1; byte addr = byteIdx + rowBase -> Y +* 4. test col & 1; do high or low nibble RMW +* Skips the JSR/RTS to dcPlotPx (~12 cyc) and the row-base-via-X +* dance (lda row + tax, then sta dcMulTmp + txa + adc). Per plot: ~60 cyc vs ~80 +* cyc with JSR. ~20 cyc/plot x ~2560 plots in the demo's 4 circles +* = ~18 ms. +* +* Each plot has its own dcOddN / dcDoneN labels (ORCA/M needs unique labels). + +* Octants 1-4: y-row pair (cx +/- x, cy +/- y). 
+* Plot 1: (cx+x, cy+y) lda acx clc adc >dcX sta >dcSavedCol - lda >dcRowYP - tax + lsr a + clc + adc >dcRowYP + tay lda >dcSavedCol - jsr dcPlotPx ; (cx+x, cy+y) + sep #$20 + LONGA OFF + and #1 + bne dcOdd1 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone1 +dcOdd1 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone1 anop + rep #$20 + LONGA ON +* Plot 2: (cx-x, cy+y) lda acx sec sbc >dcX sta >dcSavedCol - lda >dcRowYP - tax + lsr a + clc + adc >dcRowYP + tay lda >dcSavedCol - jsr dcPlotPx ; (cx-x, cy+y) + sep #$20 + LONGA OFF + and #1 + bne dcOdd2 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone2 +dcOdd2 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone2 anop + rep #$20 + LONGA ON +* Plot 3: (cx+x, cy-y) lda acx clc adc >dcX sta >dcSavedCol - lda >dcRowYN - tax + lsr a + clc + adc >dcRowYN + tay lda >dcSavedCol - jsr dcPlotPx ; (cx+x, cy-y) + sep #$20 + LONGA OFF + and #1 + bne dcOdd3 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone3 +dcOdd3 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone3 anop + rep #$20 + LONGA ON +* Plot 4: (cx-x, cy-y) lda acx sec sbc >dcX sta >dcSavedCol - lda >dcRowYN - tax + lsr a + clc + adc >dcRowYN + tay lda >dcSavedCol - jsr dcPlotPx ; (cx-x, cy-y) + sep #$20 + LONGA OFF + and #1 + bne dcOdd4 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone4 +dcOdd4 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone4 anop + rep #$20 + LONGA ON -* Octants 5-8 use the x-row pair (cx +/- y, cy +/- x). +* Octants 5-8: x-row pair (cx +/- y, cy +/- x). 
+* Plot 5: (cx+y, cy+x) lda acx clc adc >dcY sta >dcSavedCol - lda >dcRowXP - tax + lsr a + clc + adc >dcRowXP + tay lda >dcSavedCol - jsr dcPlotPx ; (cx+y, cy+x) + sep #$20 + LONGA OFF + and #1 + bne dcOdd5 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone5 +dcOdd5 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone5 anop + rep #$20 + LONGA ON +* Plot 6: (cx-y, cy+x) lda acx sec sbc >dcY sta >dcSavedCol - lda >dcRowXP - tax + lsr a + clc + adc >dcRowXP + tay lda >dcSavedCol - jsr dcPlotPx ; (cx-y, cy+x) + sep #$20 + LONGA OFF + and #1 + bne dcOdd6 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone6 +dcOdd6 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone6 anop + rep #$20 + LONGA ON +* Plot 7: (cx+y, cy-x) lda acx clc adc >dcY sta >dcSavedCol - lda >dcRowXN - tax + lsr a + clc + adc >dcRowXN + tay lda >dcSavedCol - jsr dcPlotPx ; (cx+y, cy-x) + sep #$20 + LONGA OFF + and #1 + bne dcOdd7 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone7 +dcOdd7 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone7 anop + rep #$20 + LONGA ON +* Plot 8: (cx-y, cy-x) lda acx sec sbc >dcY sta >dcSavedCol - lda >dcRowXN - tax + lsr a + clc + adc >dcRowXN + tay lda >dcSavedCol - jsr dcPlotPx ; (cx-y, cy-x) + sep #$20 + LONGA OFF + and #1 + bne dcOdd8 + lda [pix],y + and #$0F + ora >dcNibHi + sta [pix],y + bra dcDone8 +dcOdd8 anop + lda [pix],y + and #$F0 + ora >dcNibLo + sta [pix],y +dcDone8 anop + rep #$20 + LONGA ON * Update Bresenham: y++; if err <= 0: err += 2y+1; else x--; err += 2(y-x)+1. lda >dcY @@ -1483,44 +1644,9 @@ dcExit anop * dcMul160 deleted -- callers now expand the y160lut macro inline. -**************************************************************** -* dcPlotPx: plot a pixel at column A, with row-base offset in X. -* M=16, X=16 on entry. Trashes A, X, Y, P. D and B preserved. -* Switches to M=8 for the byte RMW then back to M=16 for caller. 
-**************************************************************** -dcPlotPx anop - lsr a ; A = col>>1, C = col & 1 - bcs dcPlotOdd -* Even column: high nibble. - sta >dcMulTmp - txa - clc - adc >dcMulTmp - tay - sep #$20 - LONGA OFF - lda [pix],y - and #$0F - ora >dcNibHi - sta [pix],y - rep #$20 - LONGA ON - rts -dcPlotOdd anop - sta >dcMulTmp - txa - clc - adc >dcMulTmp - tay - sep #$20 - LONGA OFF - lda [pix],y - and #$F0 - ora >dcNibLo - sta [pix],y - rep #$20 - LONGA ON - rts +* dcPlotPx deleted -- the 8 octant plot sites now expand the plot +* logic inline (no JSR/RTS, no row-base TAX/TXA or dcMulTmp +* round-trip per plot). end @@ -2224,39 +2350,205 @@ bpal equ 4 adc #8 tcd -* 1. Pixel blit (DBR ends up = $E1 after MVN). - ldx #$2000 - ldy #$2000 - lda #31999 - mvn $010000,$E10000 - -* 2. SCB upload (200 bytes). DBR = $E1, so sta abs,Y -> $E1:abs+Y. - ldy #0 +* 1. SCB upload (200 bytes) via MVN. Done BEFORE the PEI-slam so the +* SEI window only spans the slam itself (~38 ms). Source bank is +* runtime-patched into the MVN instruction (encoding: $54 dst src, +* so byte +2 is src). sep #$20 LONGA OFF -bscbLoop anop - cpy #200 - beq bscbDone - lda [bscb],y - sta $9D00,y - iny - bra bscbLoop -bscbDone anop + lda bscb+2 + sta >mvnScbInst+2 + rep #$20 + LONGA ON + lda bscb + tax + ldy #$9D00 + lda #199 +mvnScbInst mvn $000000,$E10000 + +* 2. Palette upload (512 bytes) via MVN. Same trick. + sep #$20 + LONGA OFF + lda bpal+2 + sta >mvnPalInst+2 + rep #$20 + LONGA ON + lda bpal + tax + ldy #$9E00 + lda #511 +mvnPalInst mvn $000000,$E10000 + +* 3. Pixel blit via PEI-slam, with per-row dirty skip. +* PEI-slam: SP hijacked into the SHR shadow region of bank $01, AUXWRITE +* + RAMRD remap bank-$00 stack pushes to bank $01, SHR shadow mirrors +* bank-$01 writes to $E1. Result: PEI dp pushes from DP=$01:row_start +* land at $E1:row_start (160 bytes / row at ~6 cyc per 2 bytes). +* ~480 cyc/row vs MVN's ~1120 cyc/row -- 2.3x faster per row. 
+* SEI for the duration: soft-switch state and stack hijack would +* corrupt any IRQ handler that touches bank-0 globals. ~38 ms SEI +* total for a full 200-row slam; chunk later if audio glitches. +* Dirty skip: rows where gStageMinWord[y] > gStageMaxWord[y] are +* clean and not slammed. Saves big on sparse-update demos; for +* full-screen presents (DRAW), every row slams. + + tsc + sta >gPeiOrigSp + sep #$20 + LONGA OFF + lda >$00C035 + sta >gPeiOrigShadow rep #$20 LONGA ON -* 3. Palette upload (512 bytes). - ldy #0 + sei + sep #$20 LONGA OFF -bpalLoop anop - cpy #512 - beq bpalDone - lda [bpal],y - sta $9E00,y - iny - bra bpalLoop -bpalDone anop + lda >gPeiOrigShadow + and #$F1 ; clear bits 1,2,3 -> SHR shadow ON + sta >$00C035 + lda #0 + sta >$00C005 ; AUXWRITE on + sta >$00C003 ; RAMRD on + rep #$20 + LONGA ON + + ldx #0 ; X = row counter (need X because +* long-abs,Y doesn't exist on 65816 -- +* only long-abs,X does, so the dirty- +* check `lda >gStageMinWord,x` works.) +peiRowLoop anop + cpx #200 + bcc peiRowCheck + brl peiRowsDone +peiRowCheck anop + sep #$20 + LONGA OFF + lda >gStageMinWord,x + cmp >gStageMaxWord,x + rep #$20 + LONGA ON + bcc peiSlamRow ; min < max -> dirty + beq peiSlamRow ; min == max -> 1-word dirty + inx ; clean row, skip + brl peiRowLoop + +peiSlamRow anop +* Save X into long-mode scratch (stack is hijacked into $E1, can't PHX). + txa + sta >gPeiCurRow + asl a ; A = y*2 (LUT byte offset) + tax + lda >gRowOffsetLut,x ; A = y*160 + clc + adc #$2000 ; A = row_start + tcd ; D = row_start (PEI dp base) + clc + adc #159 + tcs ; SP = row_start + 159 + +* 80 PEIs from DP+$9E down to DP+$00. Each pushes 2 bytes; SP decreases +* by 2 each PEI. Final SP = row_start - 1. Bytes land at $E1:row_start +* through $E1:row_start+159 (in correct memory order because we walk +* DP offsets backwards). 
+ pei $9E + pei $9C + pei $9A + pei $98 + pei $96 + pei $94 + pei $92 + pei $90 + pei $8E + pei $8C + pei $8A + pei $88 + pei $86 + pei $84 + pei $82 + pei $80 + pei $7E + pei $7C + pei $7A + pei $78 + pei $76 + pei $74 + pei $72 + pei $70 + pei $6E + pei $6C + pei $6A + pei $68 + pei $66 + pei $64 + pei $62 + pei $60 + pei $5E + pei $5C + pei $5A + pei $58 + pei $56 + pei $54 + pei $52 + pei $50 + pei $4E + pei $4C + pei $4A + pei $48 + pei $46 + pei $44 + pei $42 + pei $40 + pei $3E + pei $3C + pei $3A + pei $38 + pei $36 + pei $34 + pei $32 + pei $30 + pei $2E + pei $2C + pei $2A + pei $28 + pei $26 + pei $24 + pei $22 + pei $20 + pei $1E + pei $1C + pei $1A + pei $18 + pei $16 + pei $14 + pei $12 + pei $10 + pei $0E + pei $0C + pei $0A + pei $08 + pei $06 + pei $04 + pei $02 + pei $00 + + lda >gPeiCurRow + tax + inx + brl peiRowLoop + +peiRowsDone anop +* Restore SP, soft-switches. + lda >gPeiOrigSp + tcs + sep #$20 + LONGA OFF + lda >gPeiOrigShadow + sta >$00C035 + lda #0 + sta >$00C004 ; AUXWRITE off + sta >$00C002 ; RAMRD off rep #$20 LONGA ON @@ -2264,7 +2556,7 @@ bpalDone anop LONGI OFF pld plb - plp + plp ; restores I (pre-SEI value) rtl end @@ -2889,47 +3181,148 @@ wsScanCurHit equ 27 ; alias wsMaxSp.hi, 8-bit brl wsWalkBndEntry ***** EQUAL MODE WALK ***** +* Seed + walk-left + walk-right with the pixel test inlined (kills +* JSR/RTS overhead per pixel) and per-iter long-mode access to +* gFloodLeftX/RightX replaced by DP-relative wsMatchByte / rep #$20 wsWalkEqEntry anop -* Seed test at wsX (inline eq). 
+* --- SEED TEST EQ (inline) --- lda wsX - jsr wsTestEq ; A = 1 if pix == matchByte + lsr a + tay + sep #$20 + LONGA OFF + bcs wsSeedEqOdd + lda [wsRow],y + lsr a + lsr a + lsr a + lsr a + bra wsSeedEqHave +wsSeedEqOdd anop + lda [wsRow],y + and #$0F +wsSeedEqHave anop + cmp >wsMatchByte + rep #$20 + LONGA ON + bne wsSeedEqMiss + lda #1 + sta >gFloodSeedMatch + bra wsWalkEqSeedOk +wsSeedEqMiss anop + lda #0 sta >gFloodSeedMatch - cmp #0 - bne wsWalkEqSeedOk brl wsExit wsWalkEqSeedOk anop -* Walk left: leftX = wsX; while leftX > 0 and pixel(leftX-1) matches, -* leftX--. Holds leftX in test column C-1 (odd, low nib of byte Y-1). +* Read NEW byte at Y-1; cache; test low nib. +* oddEntry: currentX odd -> test column C-1 (even, high nib of byte Y). +* Reuse CACHED byte from previous iter; extract high nib. +* Initial: peel parity once to seed the cache and pick entry point. lda wsX sta wsMatchByte + rep #$20 + LONGA ON + bne wsLeftEqDone dec wsMatchByte + rep #$20 + LONGA ON + bne wsLeftEqDone + dec gFloodLeftX -* Walk right: rightX = wsX; while rightX < 319 and pixel(rightX+1) -* matches, rightX++. Holds rightX in test C+1 (odd, low nib of CACHED byte at Y). +* oddEntry: currentX odd -> test C+1 (even, high nib of byte Y+1). +* Inc Y, read NEW byte, cache, test high nib. 
lda wsX sta wsMatchByte + rep #$20 + LONGA ON + bne wsRightEqDone inc wsMatchByte + rep #$20 + LONGA ON + bne wsRightEqDone + inc gFloodRightX @@ -2937,41 +3330,160 @@ wsRightEqDone anop ***** BOUNDARY MODE WALK ***** wsWalkBndEntry anop +* --- SEED TEST BND (inline) --- lda wsX - jsr wsTestBnd + lsr a + tay + sep #$20 + LONGA OFF + bcs wsSeedBndOdd + lda [wsRow],y + lsr a + lsr a + lsr a + lsr a + bra wsSeedBndHave +wsSeedBndOdd anop + lda [wsRow],y + and #$0F +wsSeedBndHave anop + cmp >wsMatchByte + beq wsSeedBndMiss + cmp >wsNewByte + beq wsSeedBndMiss + rep #$20 + LONGA ON + lda #1 + sta >gFloodSeedMatch + bra wsWalkBndSeedOk +wsSeedBndMiss anop + rep #$20 + LONGA ON + lda #0 sta >gFloodSeedMatch - cmp #0 - bne wsWalkBndSeedOk brl wsExit wsWalkBndSeedOk anop +* --- WALK LEFT BND (byte-cached) --- lda wsX sta wsMatchByte + beq wsLBndStop + cmp >wsNewByte + beq wsLBndStop + rep #$20 + LONGA ON dec wsMatchByte + beq wsLBndStop2 + cmp >wsNewByte + beq wsLBndStop2 + rep #$20 + LONGA ON + dec gFloodLeftX +* --- WALK RIGHT BND (byte-cached) --- lda wsX sta wsMatchByte + beq wsRBndStop + cmp >wsNewByte + beq wsRBndStop + rep #$20 + LONGA ON inc wsMatchByte + beq wsRBndStop2 + cmp >wsNewByte + beq wsRBndStop2 + rep #$20 + LONGA ON + inc gFloodRightX @@ -3108,75 +3620,9 @@ wsExit anop plp rtl -* wsTestEq: test pixel at column A against matchByte (eq mode). -* In: A = column (M=16). Reads from wsRow. -* Out: A = 1 if pix == matchByte, 0 otherwise. M=16 on exit. -* Trashes A, Y, P. Preserves X, D, B. 
-wsTestEq anop - lsr a ; A = byteIdx; C = column & 1 - tay - bcs wsTeqOdd - sep #$20 - LONGA OFF - lda [wsRow],y - lsr a - lsr a - lsr a - lsr a - bra wsTeqHave -wsTeqOdd anop - sep #$20 - LONGA OFF - lda [wsRow],y - and #$0F -wsTeqHave anop - cmp >wsMatchByte - rep #$20 - LONGA ON - bne wsTeqNo - lda #1 - rts -wsTeqNo anop - lda #0 - rts - - -* wsTestBnd: test pixel at column A against (matchByte, newByte) for -* boundary-mode flood: returns 1 iff pix != matchByte AND pix != newByte. -* In: A = column (M=16). Reads from wsRow. -* Out: A = 1 if qualifies, 0 otherwise. M=16 on exit. -* Trashes A, Y, P. Preserves X, D, B. -wsTestBnd anop - lsr a - tay - bcs wsTbnOdd - sep #$20 - LONGA OFF - lda [wsRow],y - lsr a - lsr a - lsr a - lsr a - bra wsTbnHave -wsTbnOdd anop - sep #$20 - LONGA OFF - lda [wsRow],y - and #$0F -wsTbnHave anop - cmp >wsMatchByte - beq wsTbnNo - cmp >wsNewByte - beq wsTbnNo - rep #$20 - LONGA ON - lda #1 - rts -wsTbnNo anop - rep #$20 - LONGA ON - lda #0 - rts +* wsTestEq / wsTestBnd helpers deleted -- the seed test and walk-out +* loops now expand the test inline (no JSR/RTS overhead per walked +* pixel; saves ~12 cyc/pixel x ~3600 walked pixels in the demo). * wsScanAndPush: walk wsScanRow[wsLeftX..wsRightX] for run-edge