282 lines
10 KiB
ArmAsm
282 lines
10 KiB
ArmAsm
| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled.
|
|
|
|
|
| Mirrors src/port/amiga/circle.s in spirit but for ST's single
|
|
| word-interleaved planar buffer:
|
|
| * Per scanline: 20 groups of 8 bytes; each group is 4 plane
|
|
| words back-to-back (p0_word, p1_word, p2_word, p3_word).
|
|
| * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15).
|
|
| * Plane N's word at row y, group g: base + y*160 + g*8 + N*2.
|
|
|
|
|
| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp40
|
|
| words) gives a branchless 4-plane RMW per pixel. 8 octants are
|
|
| inlined per Bresenham iter; no bsr.
|
|
|
|
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
|
|
|
|
| void surface68kStCircleOutline(uint8_t *base,
|
|
| uint16_t cx, uint16_t cy,
|
|
| uint16_t r, uint8_t color);
|
|
|
|
|
| Register allocation:
|
|
| d2.w = bx (Bresenham)
|
|
| d3.w = by (Bresenham)
|
|
| d4.w = err (Bresenham)
|
|
| d5.w = cx (cached)
|
|
| a4 = cy (cached, sign-extended)
|
|
| a3 = base
|
|
| a5 = bitMaskWordLut
|
|
| d0,d1,d6,d7 = scratch
|
|
|
|
|
| Scratch block (40 bytes) at sp+0..39.
|
| Unlike the Amiga version, the ST needs a WORD bit mask rather than a
| byte mask, so each xp record occupies an 8-byte slot:
|   +0..1: groupOff word = (x >> 4) * 8 (byte offset of group within row)
|   +2..3: bitMask word  = 1 << (15 - (x & 15))
|   +4..7: unused (reserved for a notMask word + pad; the code derives
|          notMask with not.w at plot time instead of storing it)
|
|
|
|
|
| sp+0..7: xp1 record (cx + bx)
|
|
| sp+8..15: xp2 record (cx - bx)
|
|
| sp+16..23: xp3 record (cx + by)
|
|
| sp+24..31: xp4 record (cx - by)
|
|
| sp+32..33: yp1_off (cy + by) * 160
|
|
| sp+34..35: yp2_off (cy - by) * 160
|
|
| sp+36..37: yp3_off (cy + bx) * 160
|
|
| sp+38..39: yp4_off (cy - bx) * 160
|
|
| Total: 40 bytes.
|
|
|
|
.text
|
|
|
|
|
|
| ---- Bit mask word: 1 << (15 - (x & 15)) ---------
| There is no separate macro for this: XP_REC looks the mask up inline in
| a 16-entry word table (a5 holds its base; 32 bytes = 16 words). That is
| cheaper than a variable shift on the 68000, where a word-sized register
| shift costs 6 + 2n cycles for a shift count of n.
|
|
|
|
| ---- XP_REC: precompute one x-position record ----
| Computes xp = cx <signOp> <xreg> and writes two words to the scratch
| block:
|   slot+0(sp): groupOff = (xp >> 4) * 8  (byte offset of the 8-byte group)
|   slot+2(sp): bitMask  = word fetched from the table at a5, index xp & 15
| Arguments:
|   slot:   0, 8, 16, or 24
|   signOp: add or sub
|   xreg:   %d2 (bx) or %d3 (by)
| Expects: d5.w = cx, a5 = base of the 16-entry mask word table.
| Trashes: d6, d7
.macro XP_REC slot, signOp, xreg
        move.w  %d5,%d6
        \signOp\().w \xreg,%d6          | d6 = xp
        move.w  %d6,%d7                 | keep a copy for the group offset
        and.w   #15,%d6                 | pixel index inside the group word
        add.w   %d6,%d6                 | scale index to a word-table offset
        move.w  (%a5,%d6.w),%d6         | d6 = bitMask word
        lsr.w   #4,%d7                  | d7 = group index
        lsl.w   #3,%d7                  | d7 = group index * 8 bytes
        move.w  %d7,\slot(%sp)          | record +0: groupOff
        move.w  %d6,\slot+2(%sp)        | record +2: bitMask
.endm
|
|
|
|
|
|
| ---- YP_REC: precompute one row byte offset ----
| Computes yp = cy <signOp> <yreg> and stores the word yp * 160 at
| slot(sp). 160 is the byte length of one scanline (20 groups of 8 bytes).
| The product is formed as yp*32 + yp*128 using shifts only.
| Expects: a4 = cy (sign-extended).
| Trashes: d0, d6
.macro YP_REC slot, signOp, yreg
        move.l  %a4,%d6
        \signOp\().w \yreg,%d6          | d6.w = yp
        lsl.w   #5,%d6                  | d6 = yp * 32
        move.w  %d6,%d0
        lsl.w   #2,%d0                  | d0 = yp * 128
        add.w   %d6,%d0                 | d0 = yp * 160
        move.w  %d0,\slot(%sp)
.endm
|
|
|
|
|
|
| ---- PLOT_FIXED: write one pixel in a compile-time color ----
| Arguments:
|   slotYp: 32, 34, 36, or 38 (scratch slot holding a row byte offset)
|   slotXp: 0, 8, 16, or 24   (scratch slot holding an xp record)
|   color:  literal 0..15
| For each of the four plane words of the addressed group, the pixel bit
| is set with or.w when the matching color bit is 1 and cleared with and.w
| of the inverted mask when it is 0. The choice is made at assembly time,
| so no run-time branch is emitted.
| Expects: a3 = base of the screen buffer.
| Trashes: d0, d1, d7, a2
.macro PLOT_FIXED slotYp, slotXp, color
        move.w  \slotYp(%sp),%d0        | row byte offset
        add.w   \slotXp(%sp),%d0        | plus group byte offset
        lea     0(%a3,%d0.w),%a2        | a2 -> plane 0 word of the group
        move.w  \slotXp+2(%sp),%d1      | d1 = bitMask (sets the pixel bit)
        move.w  %d1,%d7
        not.w   %d7                     | d7 = inverted mask (clears it)
        | One read-modify-write per plane; postincrement walks p0..p3.
        .irp    planeBit, 1, 2, 4, 8
        .if ((\color) & \planeBit)
        or.w    %d1,(%a2)+
        .else
        and.w   %d7,(%a2)+
        .endif
        .endr
.endm
|
|
|
|
|
|
| ---- PLOT_PAIR: two pixels on one row (plus-x record, then minus-x) ----
.macro PLOT_PAIR slotYp, slotXpPlus, slotXpMinus, color
        PLOT_FIXED \slotYp, \slotXpPlus, \color
        PLOT_FIXED \slotYp, \slotXpMinus, \color
.endm

| ---- PLOT_8: the eight symmetric pixels of one Bresenham step ----
| Uses the scratch slots filled by XP_REC and YP_REC:
|   xp records  0: cx+bx    8: cx-bx   16: cx+by   24: cx-by
|   row offsets 32: cy+by  34: cy-by   36: cy+bx   38: cy-bx
.macro PLOT_8 color
        PLOT_PAIR 32,  0,  8, \color    | row cy+by: x = cx+bx, cx-bx
        PLOT_PAIR 34,  0,  8, \color    | row cy-by: x = cx+bx, cx-bx
        PLOT_PAIR 36, 16, 24, \color    | row cy+bx: x = cx+by, cx-by
        PLOT_PAIR 38, 16, 24, \color    | row cy-bx: x = cx+by, cx-by
.endm
|
|
|
|
|
|
| ---- CO_BODY: one Bresenham iteration for a compile-time color ----
| 1. Fill the four xp records and four row offsets in the scratch block.
| 2. Plot the eight symmetric pixels.
| 3. Advance: by += 1; if err was > 0, also bx -= 1 and err -= 2*bx;
|    then in both cases err += 2*by + 1.
| 4. Branch back to .LcoStLoop_<color>.
| Registers: d2.w = bx, d3.w = by, d4.w = err.
.macro CO_BODY color
        | x-position records
        XP_REC   0, add, %d2            | cx + bx
        XP_REC   8, sub, %d2            | cx - bx
        XP_REC  16, add, %d3            | cx + by
        XP_REC  24, sub, %d3            | cx - by
        | row byte offsets
        YP_REC  32, add, %d3            | (cy + by) * 160
        YP_REC  34, sub, %d3            | (cy - by) * 160
        YP_REC  36, add, %d2            | (cy + bx) * 160
        YP_REC  38, sub, %d2            | (cy - bx) * 160

        PLOT_8  \color

        addq.w  #1,%d3                  | by += 1
        tst.w   %d4
        ble     .LcoStKeepX_\color      | err <= 0: bx stays
        subq.w  #1,%d2                  | err > 0: bx -= 1
        sub.w   %d2,%d4
        sub.w   %d2,%d4                 | err -= 2 * bx
.LcoStKeepX_\color:
        add.w   %d3,%d4
        add.w   %d3,%d4                 | err += 2 * by
        addq.w  #1,%d4                  | err += 1
        bra.w   .LcoStLoop_\color
.endm
|
|
|
|
|
|
| ---- CO_LOOP_HDR: loop head plus body for one compile-time color ----
| Emits the label .LcoStLoop_<color>, the termination test, and CO_BODY.
| The loop runs while bx >= by (d2.w >= d3.w).
|
| The comparison must be SIGNED. With r == 0 the first iteration takes the
| err > 0 path and steps bx from 0 to -1 (0xFFFF). An unsigned test would
| treat that as 65535 >= 1 and keep iterating, plotting far outside the
| screen buffer. With a signed test the loop ends after that first
| iteration, which has already plotted the single centre pixel.
.macro CO_LOOP_HDR color
.LcoStLoop_\color:
        cmp.w   %d3,%d2                 | flags from bx - by
        blt.w   .LcoStDone              | bx < by (signed): outline complete
        CO_BODY \color
.endm
|
|
|
|
|
|
| ---- Function entry ----
|
|
| Stack on entry (after movem.l of 11 regs + lea):
|
|
| sp+0..39: scratch (40 bytes)
|
|
| sp+40..83: movem (44 bytes)
|
|
| sp+84..87: return PC
|
|
| sp+88+0: base (uint8_t *)
|
|
| sp+88+4: cx (int promoted, .w at +88+4+2)
|
|
| sp+88+8: cy (int promoted, .w at +88+8+2)
|
|
| sp+88+12: r (int promoted, .w at +88+12+2)
|
|
| sp+88+16: color (int promoted, byte at +88+16+3)
|
|
|
|
| Stack-frame offsets, all relative to sp after the prologue.
| Arguments occupy 4-byte slots; narrower values are read from the low
| end of their slot (big-endian), hence the +2 / +3 adjustments.
        .equ    SP_SAVED, (11 * 4)              | movem.l d2-d7/a2-a6: 11 longs
        .equ    SP_LOCAL, 40                    | scratch block size
        .equ    SP_OFF,   (SP_LOCAL + SP_SAVED + 4)     | +4 skips return PC
        .equ    SP_BASE,  (SP_OFF)              | long: base pointer
        .equ    SP_CX,    (SP_OFF + 6)          | word: slot +4, low word
        .equ    SP_CY,    (SP_OFF + 10)         | word: slot +8, low word
        .equ    SP_R,     (SP_OFF + 14)         | word: slot +12, low word
        .equ    SP_COLOR, (SP_OFF + 19)         | byte: slot +16, low byte
|
|
|
|
.globl _surface68kStCircleOutline
|
|
|
|
_surface68kStCircleOutline:
|
|
movem.l %d2-%d7/%a2-%a6,-(%sp)
|
|
lea -SP_LOCAL(%sp),%sp
|
|
|
|
| Load base (a3) and bitMaskLut (a5).
|
|
move.l SP_BASE(%sp),%a3
|
|
lea bitMaskWordLut(%pc),%a5
|
|
|
|
| Cache cx in d5, cy (sign-extended) in a4.
|
|
move.w SP_CX(%sp),%d5
|
|
move.w SP_CY(%sp),%d6
|
|
ext.l %d6
|
|
movea.l %d6,%a4
|
|
|
|
| Bresenham init.
|
|
move.w SP_R(%sp),%d2 | bx = r
|
|
moveq #0,%d3 | by = 0
|
|
moveq #1,%d4
|
|
sub.w %d2,%d4 | err = 1 - bx
|
|
|
|
| Dispatch on color (low 4 bits) -> one of 16 main loops.
|
|
moveq #0,%d6
|
|
move.b SP_COLOR(%sp),%d6
|
|
and.w #0x0F,%d6
|
|
add.w %d6,%d6
|
|
add.w %d6,%d6 | * 4 for bra.w table
|
|
lea .LcoStTable(%pc),%a6
|
|
jmp 0(%a6,%d6.w)
|
|
|
|
.LcoStTable:
|
|
bra.w .LcoStLoop_0
|
|
bra.w .LcoStLoop_1
|
|
bra.w .LcoStLoop_2
|
|
bra.w .LcoStLoop_3
|
|
bra.w .LcoStLoop_4
|
|
bra.w .LcoStLoop_5
|
|
bra.w .LcoStLoop_6
|
|
bra.w .LcoStLoop_7
|
|
bra.w .LcoStLoop_8
|
|
bra.w .LcoStLoop_9
|
|
bra.w .LcoStLoop_10
|
|
bra.w .LcoStLoop_11
|
|
bra.w .LcoStLoop_12
|
|
bra.w .LcoStLoop_13
|
|
bra.w .LcoStLoop_14
|
|
bra.w .LcoStLoop_15
|
|
|
|
CO_LOOP_HDR 0
|
|
CO_LOOP_HDR 1
|
|
CO_LOOP_HDR 2
|
|
CO_LOOP_HDR 3
|
|
CO_LOOP_HDR 4
|
|
CO_LOOP_HDR 5
|
|
CO_LOOP_HDR 6
|
|
CO_LOOP_HDR 7
|
|
CO_LOOP_HDR 8
|
|
CO_LOOP_HDR 9
|
|
CO_LOOP_HDR 10
|
|
CO_LOOP_HDR 11
|
|
CO_LOOP_HDR 12
|
|
CO_LOOP_HDR 13
|
|
CO_LOOP_HDR 14
|
|
CO_LOOP_HDR 15
|
|
|
|
.LcoStDone:
|
|
lea SP_LOCAL(%sp),%sp
|
|
movem.l (%sp)+,%d2-%d7/%a2-%a6
|
|
rts
|
|
|
|
|
|
.align 2
|
|
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
|
|
bitMaskWordLut:
|
|
.word 0x8000, 0x4000, 0x2000, 0x1000
|
|
.word 0x0800, 0x0400, 0x0200, 0x0100
|
|
.word 0x0080, 0x0040, 0x0020, 0x0010
|
|
.word 0x0008, 0x0004, 0x0002, 0x0001
|