// Mandelbrot tile in 16.16 fixed-point — exercises i32 multiply
// (__mulsi3 / __umulhisi3) and conditional control flow.  Pure
// integer math: doesn't pull in soft-double.
//
// Rasterizes a tiny GRID x GRID grid (4x4 as currently configured) over
// the complex plane and sums per-pixel iteration counts.  Returns the sum
// so dead-code-elim doesn't strip the loop.

// Fixed-point scalar with FP_SHIFT fractional bits.
// NOTE(review): "16.16" presumes long is 32 bits on the target — confirm.
typedef long fp_t;          // 16.16 fixed-point
// Number of fractional bits in an fp_t.
#define FP_SHIFT 16
// The value 1.0 expressed as an fp_t.
#define FP_ONE   (1L << FP_SHIFT)
// The value 4.0 expressed as an fp_t; mandTile compares |z|^2 against it.
#define FP_FOUR  (4L << FP_SHIFT)

// Samples per axis of the tile (mandTile visits GRID * GRID points).
#define GRID     4
// Upper bound on iterations spent on any one sample.
#define MAX_ITER 8


// Multiply two fixed-point values: returns (a * b) >> FP_SHIFT.
//
// Both operands are widened to long long so the full product is formed
// before the fractional bits are shifted back out; the shifted result is
// then narrowed to fp_t.
//
// NOTE(review): the earlier comment on this function described replacing
// the widening multiply with partial products on 16-bit halves (to reach
// __umulhisi3 instead of __muldi3), but the body still performs the
// long long multiply.  Confirm which form is intended.
static fp_t fpMul(fp_t a, fp_t b) {
    long long wide = (long long)a;
    wide *= (long long)b;
    wide >>= FP_SHIFT;
    return (fp_t)wide;
}


unsigned long mandTile(void) {
    unsigned long sum = 0;
    // c-plane window: [-2, 1] x [-1, 1].  At GRID=8, step = 3/8 in x,
    // 2/8 in y.  Express as 16.16 increments.
    fp_t stepX = (fp_t)((3L * FP_ONE) / GRID);
    fp_t stepY = (fp_t)((2L * FP_ONE) / GRID);
    fp_t baseX = -(2L * FP_ONE);
    fp_t baseY = -FP_ONE;
    for (short j = 0; j < GRID; j++) {
        fp_t cy = baseY + (fp_t)j * stepY;
        for (short i = 0; i < GRID; i++) {
            fp_t cx = baseX + (fp_t)i * stepX;
            fp_t x = 0;
            fp_t y = 0;
            short iter;
            for (iter = 0; iter < MAX_ITER; iter++) {
                fp_t xx = fpMul(x, x);
                fp_t yy = fpMul(y, y);
                if (xx + yy > FP_FOUR) {
                    break;
                }
                fp_t xy = fpMul(x, y);
                y = (fp_t)(xy + xy + cy);    // 2*x*y + cy
                x = (fp_t)(xx - yy + cx);
            }
            sum += (unsigned long)(unsigned short)iter;
        }
    }
    return sum;
}