| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled.
|
| Mirrors src/port/amiga/circle.s in spirit but for ST's single
| word-interleaved planar buffer:
|   * Per scanline: 20 groups of 8 bytes; each group is 4 plane
|     words back-to-back (p0_word, p1_word, p2_word, p3_word).
|   * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15).
|   * Plane N's word at row y, group g: base + y*160 + g*8 + N*2.
|
| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp*160
| words) gives a branchless 4-plane RMW per pixel. 8 octants are
| inlined per Bresenham iter; no bsr.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
|   void surface68kStCircleOutline(uint8_t *base,
|                                  uint16_t cx, uint16_t cy,
|                                  uint16_t r, uint8_t color);
|
| Register allocation:
|   d2.w = bx  (Bresenham)
|   d3.w = by  (Bresenham)
|   d4.w = err (Bresenham)
|   d5.w = cx  (cached)
|   a4   = cy  (cached, sign-extended)
|   a3   = base
|   a5   = bitMaskWordLut
|   a2   = per-pixel pointer to the group being modified
|   a6   = color dispatch table base (entry only)
|   d0,d1,d6,d7 = scratch
|
| Scratch block (40 bytes) at sp+0..39.
|
| The layout differs from the Amiga version: ST needs a WORD bit mask,
| not a byte. Each xp record is 8 bytes:
|   +0  groupOff word = (x >> 4) * 8  (byte offset of group within row)
|   +2  bitMask word  = 1 << (15 - (x & 15))
|   +4  reserved for a notMask word (not stored at present; the plot
|       code derives it from bitMask with not.w)
|   +6  pad
|
|   sp+0..7:    xp1 record (cx + bx)
|   sp+8..15:   xp2 record (cx - bx)
|   sp+16..23:  xp3 record (cx + by)
|   sp+24..31:  xp4 record (cx - by)
|   sp+32..33:  yp1_off (cy + by) * 160
|   sp+34..35:  yp2_off (cy - by) * 160
|   sp+36..37:  yp3_off (cy + bx) * 160
|   sp+38..39:  yp4_off (cy - bx) * 160
|   Total: 40 bytes.

        .text

| ---- BIT_MASK_WORD: build 1 << (15 - (x & 15)) ---------
| Looked up in a 16-entry table (a5 holds its base) instead of being
| computed with a variable shift, whose cost on the 68000 grows by
| 2 cycles per bit shifted. The table is 32 bytes (16 words).
| The lookup is inlined in XP_REC, which leaves the mask word in d6.
| ---- XP_REC: build the x-position record for xp = cx +/- xreg ----
|   slot:   0, 8, 16, or 24 (byte offset of the record from sp)
|   signOp: add or sub
|   xreg:   %d2 (bx) or %d3 (by)
| Expects: d5.w = cx, a5 = bitMaskWordLut.
| Writes:  slot+0 = groupOff word = (xp >> 4) * 8
|          slot+2 = bitMask word  = bitMaskWordLut[xp & 15]
| Trashes: d6, d7
        .macro  XP_REC slot, signOp, xreg
        move.w  %d5,%d6                 | d6 = cx
        \signOp\().w \xreg,%d6          | d6 = xp
        move.w  %d6,%d7
        lsr.w   #4,%d7                  | d7 = group
        lsl.w   #3,%d7                  | d7 = group * 8 (byte offset)
        and.w   #15,%d6                 | d6 = xp & 15 (0..15)
        add.w   %d6,%d6                 | d6 *= 2 (word index into LUT)
        move.w  (%a5,%d6.w),%d6         | d6 = bitMask word
        move.w  %d7,\slot(%sp)          | store groupOff word
        move.w  %d6,\slot+2(%sp)        | store bitMask word
        .endm

| ---- YP_REC: store (cy +/- yreg) * 160 at sp+slot ----
|   slot:   32, 34, 36, or 38
|   signOp: add or sub
|   yreg:   %d3 (by) or %d2 (bx)
| Expects: a4 = cy.
| The product is built as yp*32 + yp*128 using word arithmetic, so it
| is only meaningful while yp*160 fits in 16 bits.
| Trashes: d0, d6
        .macro  YP_REC slot, signOp, yreg
        move.l  %a4,%d6                 | d6 = cy
        \signOp\().w \yreg,%d6          | d6.w = yp
        move.w  %d6,%d0
        lsl.w   #5,%d6                  | d6 = yp * 32
        lsl.w   #7,%d0                  | d0 = yp * 128
        add.w   %d6,%d0                 | d0 = yp * 160
        move.w  %d0,\slot(%sp)
        .endm

| ---- PLOT_FIXED: plot one pixel with a hardcoded 4-bit color ----
|   slotYp: 32, 34, 36, or 38 (yp_off word slot)
|   slotXp: 0, 8, 16, or 24   (xp record slot)
|   color:  literal 0..15; bit N selects set (or.w) or clear (and.w)
|           for plane N, resolved at assembly time.
| Expects: a3 = base. The combined row + group offset is used as a
| sign-extended 16-bit index, so it must stay below 32768.
| Trashes: d0, d1, d7, a2
        .macro  PLOT_FIXED slotYp, slotXp, color
        move.w  \slotYp(%sp),%d0        | d0 = yp_off
        add.w   \slotXp(%sp),%d0        | d0 += groupOff
        move.w  \slotXp+2(%sp),%d1      | d1 = bitMask word
        move.w  %d1,%d7
        not.w   %d7                     | d7 = notMask
        lea     0(%a3,%d0.w),%a2        | a2 = base + byteOff (group ptr)
        | 4 plane word RMWs at (a2)+; postincrement walks p0->p1->p2->p3.
        .if ((\color) & 1)
        or.w    %d1,(%a2)+
        .else
        and.w   %d7,(%a2)+
        .endif
        .if ((\color) & 2)
        or.w    %d1,(%a2)+
        .else
        and.w   %d7,(%a2)+
        .endif
        .if ((\color) & 4)
        or.w    %d1,(%a2)+
        .else
        and.w   %d7,(%a2)+
        .endif
        .if ((\color) & 8)
        or.w    %d1,(%a2)+
        .else
        and.w   %d7,(%a2)+
        .endif
        .endm

| ---- PLOT_8: the 8 octant pixels for a hardcoded color ----
| Uses the records prepared by XP_REC / YP_REC for this iteration.
        .macro  PLOT_8 color
        PLOT_FIXED 32, 0,  \color       | (cx+bx, cy+by)
        PLOT_FIXED 32, 8,  \color       | (cx-bx, cy+by)
        PLOT_FIXED 34, 0,  \color       | (cx+bx, cy-by)
        PLOT_FIXED 34, 8,  \color       | (cx-bx, cy-by)
        PLOT_FIXED 36, 16, \color       | (cx+by, cy+bx)
        PLOT_FIXED 36, 24, \color       | (cx-by, cy+bx)
        PLOT_FIXED 38, 16, \color       | (cx+by, cy-bx)
        PLOT_FIXED 38, 24, \color       | (cx-by, cy-bx)
        .endm

| ---- CO_BODY: one full Bresenham iteration for a hardcoded color ----
| Precomputes the 4 xp records and 4 yp offsets, plots 8 pixels, then
| steps (bx, by, err) and jumps back to the loop head of this color.
        .macro  CO_BODY color
        XP_REC  0,  add, %d2            | xp1 = cx+bx
        XP_REC  8,  sub, %d2            | xp2 = cx-bx
        XP_REC  16, add, %d3            | xp3 = cx+by
        XP_REC  24, sub, %d3            | xp4 = cx-by
        YP_REC  32, add, %d3            | yp1 = (cy+by)*160
        YP_REC  34, sub, %d3            | yp2 = (cy-by)*160
        YP_REC  36, add, %d2            | yp3 = (cy+bx)*160
        YP_REC  38, sub, %d2            | yp4 = (cy-bx)*160
        PLOT_8  \color

        addq.w  #1,%d3                  | by++
        tst.w   %d4
        bgt     .LcoStDecX_\color       | err > 0: also step bx inward
        add.w   %d3,%d4
        add.w   %d3,%d4
        addq.w  #1,%d4                  | err += 2*by + 1
        bra.w   .LcoStLoop_\color
.LcoStDecX_\color:
        subq.w  #1,%d2                  | bx--
        add.w   %d3,%d4
        add.w   %d3,%d4
        sub.w   %d2,%d4
        sub.w   %d2,%d4
        addq.w  #1,%d4                  | err += 2*by - 2*bx + 1
        bra.w   .LcoStLoop_\color
        .endm

| ---- CO_LOOP_HDR: loop head + body for one color ----
| The loop runs while bx >= by. The comparison is SIGNED on purpose:
| with r == 0 the first iteration decrements bx to -1, and an unsigned
| test would read that as 0xFFFF and keep iterating for tens of
| thousands of steps, writing far outside the circle. With the signed
| test r == 0 plots the centre pixel and stops. Radii of 0x8000 and
| above are treated as negative and draw nothing.
        .macro  CO_LOOP_HDR color
.LcoStLoop_\color:
        cmp.w   %d3,%d2                 | flags from bx - by
        blt.w   .LcoStDone              | bx < by (signed): finished
        CO_BODY \color
        .endm

| ---- Function entry ----
| void surface68kStCircleOutline(uint8_t *base, uint16_t cx, uint16_t cy,
|                                uint16_t r, uint8_t color);
|
| Draws a circle outline of radius r centred on (cx, cy) in the given
| 4-bit color (only the low 4 bits of color are used). No clipping is
| performed: every plotted pixel must lie inside the buffer at base.
|
| Stack after the movem.l of 11 regs and the lea (offsets assume each
| argument occupies a 4-byte slot, i.e. promoted to 32-bit int):
|   sp+0..39:   scratch (40 bytes)
|   sp+40..83:  movem (44 bytes)
|   sp+84..87:  return PC
|   sp+88+0:    base  (uint8_t *)
|   sp+88+4:    cx    (.w at +88+4+2)
|   sp+88+8:    cy    (.w at +88+8+2)
|   sp+88+12:   r     (.w at +88+12+2)
|   sp+88+16:   color (byte at +88+16+3)
        .equ    SP_SAVED, 44
        .equ    SP_LOCAL, 40
        .equ    SP_OFF,   (SP_SAVED + 4 + SP_LOCAL)
        .equ    SP_BASE,  SP_OFF + 0
        .equ    SP_CX,    SP_OFF + 4 + 2
        .equ    SP_CY,    SP_OFF + 8 + 2
        .equ    SP_R,     SP_OFF + 12 + 2
        .equ    SP_COLOR, SP_OFF + 16 + 3

        .globl  _surface68kStCircleOutline
_surface68kStCircleOutline:
        movem.l %d2-%d7/%a2-%a6,-(%sp)
        lea     -SP_LOCAL(%sp),%sp      | reserve the scratch block

        | Load base (a3) and bitMaskLut (a5).
        move.l  SP_BASE(%sp),%a3
        lea     bitMaskWordLut(%pc),%a5

        | Cache cx in d5 and cy in a4. movea.w sign-extends the word to
        | 32 bits, so no separate ext.l is needed.
        move.w  SP_CX(%sp),%d5
        movea.w SP_CY(%sp),%a4

        | Bresenham init.
        move.w  SP_R(%sp),%d2           | bx = r
        moveq   #0,%d3                  | by = 0
        moveq   #1,%d4
        sub.w   %d2,%d4                 | err = 1 - bx

        | Dispatch on color (low 4 bits) -> one of 16 main loops.
        moveq   #0,%d6
        move.b  SP_COLOR(%sp),%d6
        and.w   #0x0F,%d6
        add.w   %d6,%d6
        add.w   %d6,%d6                 | * 4: each bra.w entry is 4 bytes
        lea     .LcoStTable(%pc),%a6
        jmp     0(%a6,%d6.w)

.LcoStTable:
        bra.w   .LcoStLoop_0
        bra.w   .LcoStLoop_1
        bra.w   .LcoStLoop_2
        bra.w   .LcoStLoop_3
        bra.w   .LcoStLoop_4
        bra.w   .LcoStLoop_5
        bra.w   .LcoStLoop_6
        bra.w   .LcoStLoop_7
        bra.w   .LcoStLoop_8
        bra.w   .LcoStLoop_9
        bra.w   .LcoStLoop_10
        bra.w   .LcoStLoop_11
        bra.w   .LcoStLoop_12
        bra.w   .LcoStLoop_13
        bra.w   .LcoStLoop_14
        bra.w   .LcoStLoop_15

        | One specialised loop per color; each ends by branching back
        | to its own head, and leaves through .LcoStDone.
        CO_LOOP_HDR 0
        CO_LOOP_HDR 1
        CO_LOOP_HDR 2
        CO_LOOP_HDR 3
        CO_LOOP_HDR 4
        CO_LOOP_HDR 5
        CO_LOOP_HDR 6
        CO_LOOP_HDR 7
        CO_LOOP_HDR 8
        CO_LOOP_HDR 9
        CO_LOOP_HDR 10
        CO_LOOP_HDR 11
        CO_LOOP_HDR 12
        CO_LOOP_HDR 13
        CO_LOOP_HDR 14
        CO_LOOP_HDR 15

.LcoStDone:
        lea     SP_LOCAL(%sp),%sp       | drop the scratch block
        movem.l (%sp)+,%d2-%d7/%a2-%a6
        rts

        .balign 2                       | word-align the table
| 16 word entries: bitMaskWordLut[i] = 1 << (15 - i), for i in 0..15.
| Kept in .text so it can be reached PC-relative from the entry code.
bitMaskWordLut:
        .word   0x8000, 0x4000, 0x2000, 0x1000
        .word   0x0800, 0x0400, 0x0200, 0x0100
        .word   0x0080, 0x0040, 0x0020, 0x0010
        .word   0x0008, 0x0004, 0x0002, 0x0001