| joeylib2/src/port/atarist/circle.s

| Atari ST word-interleaved planar circle outline -- 68000 hand-rolled.
|
| Mirrors src/port/amiga/circle.s in spirit but for ST's single
| word-interleaved planar buffer:
|   * Per scanline: 20 groups of 8 bytes; each group is 4 plane
|     words back-to-back (p0_word, p1_word, p2_word, p3_word).
|   * Pixel x: group = x >> 4; bit position within word = 15 - (x & 15).
|   * Plane N's word at row y, group g: base + y*160 + g*8 + N*2.
|
| 16-way color dispatch + per-iter precompute (4 xp records + 4 yp*160
| row-offset words) gives a branchless 4-plane RMW per pixel. 8 octants
| are inlined per Bresenham iter; no bsr.
|
| ABI: cdecl. d2-d7/a2-a6 callee-save.
|
| void surface68kStCircleOutline(uint8_t *base,
|                                uint16_t cx, uint16_t cy,
|                                uint16_t r,  uint8_t  color);
|
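| Register allocation:
|   d2.w   = bx (Bresenham)
|   d3.w   = by (Bresenham)
|   d4.w   = err (Bresenham)
|   d5.w   = cx (cached)
|   a4     = cy (cached, sign-extended)
|   a3     = base
|   a5     = bitMaskWordLut
|   a6     = color dispatch table base (used at entry only)
|   a2     = scratch: plane-word pointer while plotting a pixel
|   d0,d1,d6,d7 = scratch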
|
| Scratch block (40 bytes) at sp+0..39.
|
| The layout differs from the Amiga version: the ST needs a WORD bit
| mask, not a byte mask. Each xp record occupies an 8-byte slot:
|   +0..1:  groupOff word = (x >> 4) * 8 (byte offset of the group
|           within its row)
|   +2..3:  bitMask word  = 1 << (15 - (x & 15))
|   +4..7:  unused (the inverted mask is recomputed at plot time
|           rather than stored)
|
|   sp+0..7:    xp1 record (cx + bx)
|   sp+8..15:   xp2 record (cx - bx)
|   sp+16..23:  xp3 record (cx + by)
|   sp+24..31:  xp4 record (cx - by)
|   sp+32..33:  yp1_off (cy + by) * 160
|   sp+34..35:  yp2_off (cy - by) * 160
|   sp+36..37:  yp3_off (cy + bx) * 160
|   sp+38..39:  yp4_off (cy - bx) * 160
| Total: 40 bytes.

                .text


| ---- Bit mask lookup: 1 << (15 - (x & 15)) ---------
| There is no separate macro for this; XP_REC does the lookup inline
| through a 16-entry word table (a5 holds its base). Cheaper than a
| variable shift on 68000 (which is 8 + 2n cyc). Table is 32 bytes
| (16 words).

| ---- XP_REC: precompute one x-position record at sp+slot ----------
| Forms xp = cx <signOp> <xreg> and stores two words:
|   slot+0: (xp >> 4) * 8            byte offset of xp's group in a row
|   slot+2: bitMaskWordLut[xp & 15]  single-bit plane mask for xp
| signOp: add or sub
| xreg:   %d2 (bx) or %d3 (by)
| slot:   0, 8, 16, or 24
| In:      d5.w = cx, a5 = bitMaskWordLut
| Trashes: d6, d7 (d0 and d1 are left untouched)
| xp is handled as a plain 16-bit value; nothing here range-checks it.

                .macro  XP_REC  slot, signOp, xreg
                move.w  %d5,%d7
                \signOp\().w \xreg,%d7         | d7 = xp
                moveq   #15,%d6
                and.w   %d7,%d6                | d6 = xp & 15 (0..15)
                add.w   %d6,%d6                | word index -> byte index
                move.w  (%a5,%d6.w),\slot+2(%sp) | bitMask word
                lsr.w   #1,%d7
                and.w   #0xFFF8,%d7            | d7 = (xp >> 4) * 8
                move.w  %d7,\slot(%sp)         | groupOff word
                .endm


| ---- YP_REC: precompute one row byte offset at sp+slot ---------
| Forms yp = cy <signOp> <yreg> and stores the word (yp * 160), i.e.
| yp times the 160-byte row pitch, truncated to 16 bits.
| signOp: add or sub
| yreg:   %d2 (bx) or %d3 (by)
| slot:   32, 34, 36, or 38
| In:      a4 = cy
| Trashes: d0, d6

                .macro  YP_REC  slot, signOp, yreg
                move.l  %a4,%d6
                \signOp\().w \yreg,%d6         | d6.w = yp
                move.w  %d6,%d0
                add.w   %d0,%d0
                add.w   %d0,%d0                | d0 = yp * 4
                add.w   %d6,%d0                | d0 = yp * 5
                lsl.w   #5,%d0                 | d0 = yp * 5 * 32 = yp * 160
                move.w  %d0,\slot(%sp)
                .endm


| ---- ST_PLANE_RMW: set or clear the pixel bit in one plane word ----
| Assembles to an OR with the mask when <color> has <planeBit> set,
| otherwise to an AND with the inverted mask. Either way a2 is
| post-incremented to the next plane word of the group.
| In: d1.w = bitMask, d7.w = inverted bitMask, a2 = plane word pointer

                .macro  ST_PLANE_RMW  color, planeBit
                .if  ((\color) & (\planeBit))
                or.w    %d1,(%a2)+
                .else
                and.w   %d7,(%a2)+
                .endif
                .endm


| ---- PLOT_FIXED: plot one pixel with an assemble-time 4-bit color ----
| slotYp:  32, 34, 36, or 38 (row-offset word slot)
| slotXp:  0, 8, 16, or 24    (xp record slot)
| color:   literal 0..15; bit N selects set/clear for plane N
| In:      a3 = base, scratch block at sp already filled in
| Trashes: d0, d1, d7, a2

                .macro  PLOT_FIXED  slotYp, slotXp, color
                move.w  \slotXp+2(%sp),%d1     | d1 = bitMask word
                move.w  %d1,%d7
                not.w   %d7                    | d7 = all bits except the pixel
                move.w  \slotYp(%sp),%d0       | d0 = row offset
                add.w   \slotXp(%sp),%d0       | d0 = row offset + groupOff
                lea     0(%a3,%d0.w),%a2       | a2 -> plane 0 word of the group
                | The four plane words sit back to back, so the post-increment
                | in each RMW steps p0 -> p1 -> p2 -> p3.
                ST_PLANE_RMW  \color, 1
                ST_PLANE_RMW  \color, 2
                ST_PLANE_RMW  \color, 4
                ST_PLANE_RMW  \color, 8
                .endm


| ---- PLOT_8: the 8 symmetric pixels for an assemble-time color ----
| Pairs each row-offset slot with the two xp records that belong to it,
| in this order:
|   slot 32 (cy+by) and slot 34 (cy-by) with xp slots 0 (cx+bx), 8 (cx-bx)
|   slot 36 (cy+bx) and slot 38 (cy-bx) with xp slots 16 (cx+by), 24 (cx-by)
| Trashes whatever PLOT_FIXED trashes.

                .macro  PLOT_8  color
                .irp    yslot, 32, 34
                .irp    xslot, 0, 8
                PLOT_FIXED  \yslot, \xslot, \color
                .endr
                .endr
                .irp    yslot, 36, 38
                .irp    xslot, 16, 24
                PLOT_FIXED  \yslot, \xslot, \color
                .endr
                .endr
                .endm


| ---- CO_BODY: one Bresenham iteration for an assemble-time color ----
| Refills the scratch block from the current (bx, by), plots the eight
| symmetric pixels, then advances the state (all 16-bit arithmetic):
|     by  += 1
|     if (err > 0) { bx -= 1; err -= 2 * bx; }
|     err += 2 * by + 1
| and jumps back to .LcoStLoop_<color>.
| In/out:  d2.w = bx, d3.w = by, d4.w = err
| Trashes: d0, d1, d6, d7, a2

                .macro  CO_BODY  color
                XP_REC   0, add, %d2           | xp1 = cx+bx
                XP_REC   8, sub, %d2           | xp2 = cx-bx
                XP_REC  16, add, %d3           | xp3 = cx+by
                XP_REC  24, sub, %d3           | xp4 = cx-by
                YP_REC  32, add, %d3           | yp1 = (cy+by)*160
                YP_REC  34, sub, %d3           | yp2 = (cy-by)*160
                YP_REC  36, add, %d2           | yp3 = (cy+bx)*160
                YP_REC  38, sub, %d2           | yp4 = (cy-bx)*160

                PLOT_8  \color

                addq.w  #1,%d3                 | by++
                tst.w   %d4                    | decide on err before changing it
                ble     .LcoStKeepX_\color     | err <= 0: bx stays
                subq.w  #1,%d2                 | bx--
                sub.w   %d2,%d4
                sub.w   %d2,%d4                | err -= 2*bx (new bx)
.LcoStKeepX_\color:
                add.w   %d3,%d4
                add.w   %d3,%d4                | err += 2*by (new by)
                addq.w  #1,%d4                 | err += 1
                bra.w   .LcoStLoop_\color
                .endm


                | ---- CO_LOOP_HDR: loop head + body for one assemble-time color ----
                | Defines .LcoStLoop_<color> (the dispatch-table target and the
                | back-edge of CO_BODY) and exits to .LcoStDone once bx < by.
                |
                | The comparison is SIGNED on purpose. With r == 0 the first
                | iteration sees err == 1 and steps bx from 0 to -1; an unsigned
                | test (bcs) reads that as 65535 >= by and keeps iterating,
                | writing pixels at meaningless addresses. For every radius
                | 1..32767 bx never goes negative, so signed and unsigned agree.
                | Radii of 0x8000 and above are negative as signed words and
                | therefore plot nothing.
                | In: d2.w = bx, d3.w = by.
                .macro  CO_LOOP_HDR  color
.LcoStLoop_\color:
                cmp.w   %d3,%d2
                blt.w   .LcoStDone
                CO_BODY \color
                .endm


| ---- Stack frame layout ----
| Offsets are relative to sp once the prologue has pushed 11 registers
| with movem.l and then reserved the scratch block:
|   sp+0..39   scratch block (SP_LOCAL bytes)
|   sp+40..83  saved d2-d7/a2-a6 (SP_SAVED bytes)
|   sp+84..87  return PC
|   sp+88..    arguments, one 32-bit slot each (SP_OFF = 88):
|                +0   base  (pointer)
|                +4   cx    (low word at +6)
|                +8   cy    (low word at +10)
|                +12  r     (low word at +14)
|                +16  color (low byte at +19)
| The SP_* symbols for cx/cy/r/color address the low word / low byte of
| the promoted slot directly.

                .equ    SP_LOCAL, 40
                .equ    SP_SAVED, 11 * 4
                .equ    SP_OFF,         (SP_LOCAL + SP_SAVED + 4)
                .equ    SP_BASE,    SP_OFF
                .equ    SP_CX,      SP_OFF + 4 + 2
                .equ    SP_CY,      SP_OFF + 8 + 2
                .equ    SP_R,       SP_OFF + 12 + 2
                .equ    SP_COLOR,   SP_OFF + 16 + 3
                .globl  _surface68kStCircleOutline

| void surface68kStCircleOutline(uint8_t *base, uint16_t cx, uint16_t cy,
|                                uint16_t r, uint8_t color);
| ABI: cdecl, arguments on the stack; d2-d7/a2-a6 are saved and restored.
|
| Sets up the long-lived registers (a3 = base, a5 = bitMaskWordLut,
| d5.w = cx, a4 = cy, d2.w = bx = r, d3.w = by = 0, d4.w = err = 1 - r)
| and jumps to one of 16 copies of the loop, each specialised for one
| value of (color & 15). Every copy leaves through .LcoStDone.
| This routine does no clipping: every computed pixel address is written.

_surface68kStCircleOutline:
                movem.l %d2-%d7/%a2-%a6,-(%sp) | 11 registers = SP_SAVED bytes
                lea     -SP_LOCAL(%sp),%sp     | reserve the scratch block

                | Long-lived pointers and centre coordinates.
                lea     bitMaskWordLut(%pc),%a5
                movea.l SP_BASE(%sp),%a3
                move.w  SP_CX(%sp),%d5
                movea.w SP_CY(%sp),%a4         | movea.w sign-extends cy to 32 bits

                | Bresenham start state.
                moveq   #0,%d3                 | by = 0
                move.w  SP_R(%sp),%d2          | bx = r
                moveq   #1,%d4
                sub.w   %d2,%d4                | err = 1 - bx

                | Select the loop copy: each table entry is a 4-byte bra.w,
                | so the index is (color & 15) * 4.
                moveq   #0x0F,%d6
                and.b   SP_COLOR(%sp),%d6      | d6 = color & 15, upper bits clear
                add.w   %d6,%d6
                add.w   %d6,%d6
                lea     .LcoStTable(%pc),%a6
                jmp     0(%a6,%d6.w)

.LcoStTable:
                .irp    col, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
                bra.w   .LcoStLoop_\col
                .endr

                | One complete loop (head + body) per color value.
                .irp    col, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
                CO_LOOP_HDR  \col
                .endr

.LcoStDone:
                lea     SP_LOCAL(%sp),%sp      | drop the scratch block
                movem.l (%sp)+,%d2-%d7/%a2-%a6
                rts


                .align  2
| bitMaskWordLut[i] = 1 << (15 - i) for i in 0..15: entry 0 is the
| most significant bit of a plane word, entry 15 the least significant.
| 16 words = 32 bytes. Lives in .text so the entry code can reach it
| PC-relative.
bitMaskWordLut:
                .irp    bit, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
                .word   1 << \bit
                .endr