joeylib2/src/port/atarist/circle.s

282 lines
10 KiB
ArmAsm

| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled.
|
| Mirrors src/port/amiga/circle.s in spirit but for ST's single
| word-interleaved planar buffer:
| * Per scanline: 20 groups of 8 bytes; each group is 4 plane
| words back-to-back (p0_word, p1_word, p2_word, p3_word).
| * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15).
| * Plane N's word at row y, group g: base + y*160 + g*8 + N*2.
|
| 16-way color dispatch + per-iteration precompute (4 xp records + 4
| yp*160 row-offset words) gives a branchless 4-plane RMW per pixel.
| All 8 octant plots are inlined per Bresenham iteration; no bsr.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStCircleOutline(uint8_t *base,
| uint16_t cx, uint16_t cy,
| uint16_t r, uint8_t color);
|
| Register allocation:
| d2.w = bx (Bresenham)
| d3.w = by (Bresenham)
| d4.w = err (Bresenham)
| d5.w = cx (cached)
| a4 = cy (cached, sign-extended)
| a3 = base
| a5 = bitMaskWordLut
| a2 = plane-word pointer while plotting (scratch)
| a6 = color dispatch table base (entry only)
| d0,d1,d6,d7 = scratch
|
| Scratch block (40 bytes) at sp+0..39.
|
| The layout differs from the Amiga version: the ST needs a WORD bit
| mask, not a byte. Each xp record occupies an 8-byte slot:
| +0: groupOff word = (x >> 4) * 8 (byte offset of group within row)
| +2: bitMask word = 1 << (15 - (x & 15))
| +4..7: unused padding (the inverted mask is derived from bitMask
| at plot time rather than stored)
|
| sp+0..7: xp1 record (cx + bx)
| sp+8..15: xp2 record (cx - bx)
| sp+16..23: xp3 record (cx + by)
| sp+24..31: xp4 record (cx - by)
| sp+32..33: yp1_off (cy + by) * 160
| sp+34..35: yp2_off (cy - by) * 160
| sp+36..37: yp3_off (cy + bx) * 160
| sp+38..39: yp4_off (cy - bx) * 160
| Total: 40 bytes.
.text
| ---- Bit mask word: 1 << (15 - (x & 15)) ---------
| There is no separate macro for this; XP_REC performs the lookup
| inline via a 16-entry table (a5 holds base). Cheaper than variable
| shift on 68000 (which is 8 + 2n cyc). Table is 32 bytes (16 words).
| ---- XP_REC: fill the xp record at sp+slot for xp = cx <signOp> <xreg>
|   slot:   0, 8, 16, or 24 (byte offset of the record in the scratch block)
|   signOp: add or sub
|   xreg:   %d2 (bx) or %d3 (by)
| Stores:  slot+0 = groupOff word = (xp >> 4) * 8
|          slot+2 = bitMask word  = table entry for (xp & 15)
| Expects: d5.w = cx, a5 = base of the 16-word bit mask table.
| Trashes: d6, d7 (nothing else is touched).
.macro XP_REC slot, signOp, xreg
        move.w  %d5,%d7                         | d7 = cx
        \signOp\().w \xreg,%d7                  | d7 = xp
        moveq   #15,%d6
        and.w   %d7,%d6                         | d6 = xp & 15
        add.w   %d6,%d6                         | d6 = byte index into word table
        move.w  (%a5,%d6.w),\slot+2(%sp)        | bitMask word, table -> record
        lsr.w   #1,%d7                          | xp / 2 ...
        and.w   #-8,%d7                         | ... masked with 0xFFF8 = (xp >> 4) * 8
        move.w  %d7,\slot(%sp)                  | groupOff word
.endm
| ---- YP_REC: store the row byte offset (yp * 160) as a word at sp+slot ----
|   slot:   32, 34, 36, or 38
|   signOp: add or sub
|   yreg:   %d2 (bx) or %d3 (by)
| yp = cy <signOp> <yreg>, with cy taken from the low word of a4.
| 160 = 32 * 5, so the product is built as (yp << 5) + 4 * (yp << 5).
| Trashes: d0, d6.
.macro YP_REC slot, signOp, yreg
        move.w  %a4,%d0                 | d0.w = cy
        \signOp\().w \yreg,%d0          | d0.w = yp
        lsl.w   #5,%d0                  | d0 = yp * 32
        move.w  %d0,%d6                 | keep the *32 term
        add.w   %d0,%d0                 | yp * 64
        add.w   %d0,%d0                 | yp * 128
        add.w   %d6,%d0                 | yp * 160
        move.w  %d0,\slot(%sp)
.endm
| ---- PLOT_FIXED: write one pixel in an assemble-time constant color ----
|   slotYp: 32, 34, 36, or 38 (row-offset word in the scratch block)
|   slotXp: 0, 8, 16, or 24   (xp record in the scratch block)
|   color:  literal 0..15; bit N selects set/clear for plane N
| Expects: a3 = framebuffer base; the two scratch slots already filled in.
| Each plane word of the group is read-modify-written once: OR with the
| bit mask where the color bit is 1, AND with its complement where it is 0.
| Trashes: d0, d1, d7, a2
.macro PLOT_FIXED slotYp, slotXp, color
        move.w  \slotXp+2(%sp),%d1      | d1 = set mask (single pixel bit)
        move.w  %d1,%d7
        not.w   %d7                     | d7 = clear mask
        move.w  \slotXp(%sp),%d0        | d0 = group offset within the row
        add.w   \slotYp(%sp),%d0        | d0 += row offset
        lea     0(%a3,%d0.w),%a2        | a2 -> plane 0 word of the group
        | Post-increment steps a2 through plane 0, 1, 2, 3 in turn.
        .ifeq ((\color) & 1)
        and.w   %d7,(%a2)+              | plane 0: clear
        .else
        or.w    %d1,(%a2)+              | plane 0: set
        .endif
        .ifeq ((\color) & 2)
        and.w   %d7,(%a2)+              | plane 1: clear
        .else
        or.w    %d1,(%a2)+              | plane 1: set
        .endif
        .ifeq ((\color) & 4)
        and.w   %d7,(%a2)+              | plane 2: clear
        .else
        or.w    %d1,(%a2)+              | plane 2: set
        .endif
        .ifeq ((\color) & 8)
        and.w   %d7,(%a2)+              | plane 3: clear
        .else
        or.w    %d1,(%a2)+              | plane 3: set
        .endif
.endm
| ---- PLOT_8: plot the 8 symmetric points of the current step ----
|   color: literal 0..15, forwarded to PLOT_FIXED.
| Every plot writes the same color, so the order of the eight plots does
| not affect the resulting image; they are grouped here by xp record.
| Trashes whatever PLOT_FIXED trashes.
.macro PLOT_8 color
        | xp1 = cx+bx
        PLOT_FIXED 32, 0, \color        | (cx+bx, cy+by)
        PLOT_FIXED 34, 0, \color        | (cx+bx, cy-by)
        | xp2 = cx-bx
        PLOT_FIXED 32, 8, \color        | (cx-bx, cy+by)
        PLOT_FIXED 34, 8, \color        | (cx-bx, cy-by)
        | xp3 = cx+by
        PLOT_FIXED 36, 16, \color       | (cx+by, cy+bx)
        PLOT_FIXED 38, 16, \color       | (cx+by, cy-bx)
        | xp4 = cx-by
        PLOT_FIXED 36, 24, \color       | (cx-by, cy+bx)
        PLOT_FIXED 38, 24, \color       | (cx-by, cy-bx)
.endm
| ---- CO_BODY: one Bresenham iteration for an assemble-time color ----
|   color: literal 0..15 (also used to make the local label unique).
| Steps:
|   1. precompute the four row offsets and four xp records for (bx, by);
|   2. plot the eight symmetric pixels;
|   3. by += 1; if the old err was > 0 then bx -= 1 and err -= 2*bx;
|      in both cases err += 2*by + 1 (all arithmetic is 16-bit);
|   4. branch back to the loop head .LcoStLoop_<color>.
| Live across the body: d2 = bx, d3 = by, d4 = err.
.macro CO_BODY color
        | Row offsets and x records occupy disjoint scratch slots.
        YP_REC 32, add, %d3             | yp1 = (cy+by)*160
        YP_REC 34, sub, %d3             | yp2 = (cy-by)*160
        YP_REC 36, add, %d2             | yp3 = (cy+bx)*160
        YP_REC 38, sub, %d2             | yp4 = (cy-bx)*160
        XP_REC 0, add, %d2              | xp1 = cx+bx
        XP_REC 8, sub, %d2              | xp2 = cx-bx
        XP_REC 16, add, %d3             | xp3 = cx+by
        XP_REC 24, sub, %d3             | xp4 = cx-by
        PLOT_8 \color
        addq.w  #1,%d3                  | by += 1
        tst.w   %d4
        ble     .LcoStKeepX_\color      | err <= 0: bx stays
        subq.w  #1,%d2                  | bx -= 1
        sub.w   %d2,%d4
        sub.w   %d2,%d4                 | err -= 2*bx
.LcoStKeepX_\color:
        add.w   %d3,%d4
        add.w   %d3,%d4                 | err += 2*by
        addq.w  #1,%d4                  | err += 1
        bra.w   .LcoStLoop_\color
.endm
| ---- CO_LOOP_HDR: loop head + body for one assemble-time color ----
|   color: literal 0..15.
| Defines .LcoStLoop_<color>, leaves through .LcoStDone once bx < by,
| otherwise runs one CO_BODY iteration (which branches back here).
|
| The exit test is SIGNED on purpose. With r = 0 the first iteration
| starts with err = 1, so bx is stepped from 0 to -1 (0xFFFF). An unsigned
| test would see 0xFFFF >= by and keep iterating, plotting at wild
| offsets; the signed test stops right after the center pixel.
| Radii >= 0x8000 read as negative and therefore draw nothing.
.macro CO_LOOP_HDR color
.LcoStLoop_\color:
        cmp.w   %d3,%d2                 | flags from bx - by
        blt.w   .LcoStDone              | bx < by (signed): all octants done
        CO_BODY \color
.endm
| ---- surface68kStCircleOutline: entry, dispatch, and exit ----
| Stack layout once the prologue (movem.l of 11 regs + lea) has run:
|   sp+0..39   scratch block (SP_LOCAL bytes)
|   sp+40..83  saved d2-d7/a2-a6 (SP_SAVED bytes)
|   sp+84..87  return PC
|   sp+88      base  (uint8_t *)
|   sp+92      cx    (4-byte slot, word read at +2)
|   sp+96      cy    (4-byte slot, word read at +2)
|   sp+100     r     (4-byte slot, word read at +2)
|   sp+104     color (4-byte slot, byte read at +3)
| Only the low 4 bits of color are used; they select one of 16 loop
| copies, each specialized for that color at assembly time.
.equ SP_SAVED, 44
.equ SP_LOCAL, 40
.equ SP_OFF, (SP_SAVED + 4 + SP_LOCAL)
.equ SP_BASE, SP_OFF + 0
.equ SP_CX, SP_OFF + 4 + 2
.equ SP_CY, SP_OFF + 8 + 2
.equ SP_R, SP_OFF + 12 + 2
.equ SP_COLOR, SP_OFF + 16 + 3
.globl _surface68kStCircleOutline
_surface68kStCircleOutline:
        | Prologue: save callee-saved registers, reserve the scratch block.
        movem.l %d2-%d7/%a2-%a6,-(%sp)
        lea     -SP_LOCAL(%sp),%sp

        | Long-lived pointers and cached coordinates.
        movea.l SP_BASE(%sp),%a3                | a3 = base
        lea     bitMaskWordLut(%pc),%a5         | a5 = bit mask table
        move.w  SP_CX(%sp),%d5                  | d5.w = cx
        movea.w SP_CY(%sp),%a4                  | a4 = cy (movea.w sign-extends)

        | Bresenham state: bx = r, by = 0, err = 1 - r.
        moveq   #0,%d3
        move.w  SP_R(%sp),%d2
        moveq   #1,%d4
        sub.w   %d2,%d4

        | Jump through the table: entry = (color & 15) * 4, one bra.w each.
        moveq   #0x0F,%d6
        and.b   SP_COLOR(%sp),%d6               | d6 = color & 15
        lsl.w   #2,%d6                          | * 4 bytes per bra.w
        lea     .LcoStTable(%pc),%a6
        jmp     0(%a6,%d6.w)
.LcoStTable:
        .irp    clr, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        bra.w   .LcoStLoop_\clr
        .endr

        | Sixteen specialized loops, one per color, in table order.
        .irp    clr, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        CO_LOOP_HDR \clr
        .endr

.LcoStDone:
        | Epilogue: drop the scratch block, restore registers, return.
        lea     SP_LOCAL(%sp),%sp
        movem.l (%sp)+,%d2-%d7/%a2-%a6
        rts
.align 2
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
bitMaskWordLut:
.word 0x8000, 0x4000, 0x2000, 0x1000
.word 0x0800, 0x0400, 0x0200, 0x0100
.word 0x0080, 0x0040, 0x0020, 0x0010
.word 0x0008, 0x0004, 0x0002, 0x0001