// Real double-precision IEEE 754 soft-float for the W65816. Treats
// a `double` as `unsigned long long` (64-bit) and operates on its
// bit pattern. Results are returned by value in the i64 ABI:
// A:X:Y:DP[$F0].
//
// Earlier attempts crashed the Register Coalescer; the greedy
// regalloc landing fixed the underlying register pressure problem.
// Each routine is broken into small helpers to keep frames shallow.

// Local typedefs (no stdint.h — clang's host stdint pulls glibc).
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
typedef unsigned char u8;

#define DSIGN_BIT  0x8000000000000000ULL
#define DEXP_MASK  0x7FF0000000000000ULL
#define DMANT_MASK 0x000FFFFFFFFFFFFFULL
#define DMANT_LEAD 0x0010000000000000ULL
#define DEXP_SHIFT 52
#define DEXP_BIAS  1023

// noinline: keeps register pressure in the callers (esp. __muldf3)
// low enough for greedy regalloc to allocate at -O2. Without this,
// __muldf3 fails with "ran out of registers during register
// allocation" — too many concurrent u64 lifetimes (sa, sb, ma, mb,
// sr, mr) and the dpack inline blew it past the spill capacity.
__attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) {
  if (mant == 0)
    return sign;
  // Bias in signed arithmetic and test underflow before overflow: a
  // deeply negative exponent cast straight to u64 would wrap around
  // and satisfy the >= 2047 overflow test, turning underflow into
  // infinity instead of zero.
  s16 e = (s16)(exp + DEXP_BIAS);
  if (e <= 0) {
    // Underflow → zero (flush-to-zero, no subnormals).
    return sign;
  }
  if (e >= 2047) {
    // Overflow → infinity.
    return sign | DEXP_MASK;
  }
  return sign | ((u64)e << DEXP_SHIFT) | (mant & DMANT_MASK);
}

// Decompose `x` into sign / unbiased-exp / mantissa-with-leading-bit.
// Returns the class: 0=zero, 1=normal, 2=infinity, 3=NaN.
// noinline reduces register pressure in __muldf3/__divdf3/__adddf3
// — without it, greedy regalloc runs out of registers in __muldf3
// at -O2. Now safe because pointer-arg writes lower to STBptr/STAptr
// which use [$E0],Y indirect-long with the bank byte forced to 0
// (DBR-independent). See `feedback_dbr_ptr_deref_spill.md`.
__attribute__((noinline)) static u16 dclass(u64 x, u64 *out_sign,
                                            s16 *out_exp, u64 *out_mant) {
  *out_sign = x & DSIGN_BIT;
  s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
  u64 m = x & DMANT_MASK;
  if (e == 0) {
    // Zero or subnormal; subnormals are flushed to zero.
    *out_exp = 0;
    *out_mant = 0;
    return 0;
  }
  if (e == 0x7FF) {
    *out_exp = 0x7FF;
    *out_mant = m;
    return (m == 0) ? 2 : 3;
  }
  *out_exp = e - DEXP_BIAS;
  *out_mant = m | DMANT_LEAD;
  return 1;
}

u64 __adddf3(u64 a, u64 b) {
  u64 sa, sb, ma, mb;
  s16 ea, eb;
  u16 ca = dclass(a, &sa, &ea, &ma);
  u16 cb = dclass(b, &sb, &eb, &mb);
  if (ca == 0) return b;
  if (cb == 0) return a;
  // Inf/NaN operands are not special-cased here (consistent with the
  // no-NaN, flush-to-zero policy of the rest of this file).
  //
  // Shift mantissas left by 3 to reserve guard / round / sticky bits
  // below position 0. The lead bit is now at position 55 instead of
  // 52. The sticky bit is preserved by ORing it into the LSB whenever
  // a significant bit would otherwise be shifted off the right side
  // (during alignment or post-add normalization). At the end, RNE
  // rounds based on bits 2..0 (guard, round, sticky) and shifts back.
  ma <<= 3;
  mb <<= 3;
  // Align mantissas to a common exponent. The smaller-exp operand is
  // shifted right; bits shifted past position 0 become sticky.
  if (ea > eb) {
    s16 d = ea - eb;
    if (d > 56) return a;
    u64 sticky = 0;
    if (d > 3) {
      // Only bits below the 3 GRS positions can be lost; for d <= 3
      // the low bits are still zero from the << 3 above. d <= 56
      // here, so the shift amounts are always in range.
      u64 mask = (1ULL << d) - 1;
      sticky = (mb & mask) ? 1 : 0;
    }
    mb >>= d;
    mb |= sticky;
    eb = ea;
  } else if (eb > ea) {
    s16 d = eb - ea;
    if (d > 56) return b;
    u64 sticky = 0;
    if (d > 3) {
      u64 mask = (1ULL << d) - 1;
      sticky = (ma & mask) ? 1 : 0;
    }
    ma >>= d;
    ma |= sticky;
    ea = eb;
  }
  u64 mr;
  u64 sr;
  if (sa == sb) {
    mr = ma + mb;
    sr = sa;
  } else {
    if (ma >= mb) {
      mr = ma - mb;
      sr = sa;
    } else {
      mr = mb - ma;
      sr = sb;
    }
  }
  if (mr == 0) return 0;
  // Renormalize. The lead bit should land at position 55 (= 52 + 3
  // GRS bits). Right-shift first to bring an over-wide sum back in
  // range; then left-shift if subtraction left the lead below 55.
  // The reverse order would shift an over-wide value out of u64
  // range entirely.
  // Use `if` + `do-while` because a plain `while (cond) body`
  // triggers a ptr32 backend bug: the PHP/PLP wrap pass
  // mis-identifies the loop's pre-test LDA reload as flag corruption
  // and wraps the wrong range, so the BEQ tests stale flags and the
  // loop body never fires. `do { } while (cond)` is unaffected
  // (test-after-body).
  if (mr & ~((1ULL << 56) - 1)) {
    do {
      u64 sticky_bit = mr & 1;
      mr = (mr >> 1) | sticky_bit;
      ea++;
    } while (mr & ~((1ULL << 56) - 1));
  }
  if ((mr & (1ULL << 55)) == 0) {
    do {
      mr <<= 1;
      ea--;
    } while ((mr & (1ULL << 55)) == 0);
  }
  // Round to nearest, ties to even. Bits 1..0 are round+sticky, bit 2
  // is the guard, bit 3 is the mantissa LSB.
  int guard = (int)((mr >> 2) & 1);
  int sticky = (int)(mr & 0x3);
  int lsb = (int)((mr >> 3) & 1);
  mr >>= 3; // drop GRS bits to get the 53-bit mantissa
  if (guard && (sticky || lsb)) {
    mr++;
    if (mr & (1ULL << 53)) {
      // Rounding carried out of the mantissa; renormalize.
      mr >>= 1;
      ea++;
    }
  }
  return dpack(sr, ea, mr);
}
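// A minimal host-side sanity sketch for __adddf3 (illustrative, not
// part of the library; the SOFTDOUBLE_HOST_TEST guard is a
// hypothetical addition). The bit patterns are standard IEEE 754
// encodings: 1.5 + 2.25 exercises alignment of unequal exponents,
// and 1.0 + (-1.0) exercises exact cancellation to zero.
#ifdef SOFTDOUBLE_HOST_TEST
static int test_adddf3(void) {
  // 1.5 = 0x3FF8000000000000, 2.25 = 0x4002000000000000,
  // 3.75 = 0x400E000000000000.
  u64 sum = __adddf3(0x3FF8000000000000ULL, 0x4002000000000000ULL);
  u64 diff = __adddf3(0x3FF0000000000000ULL,
                      0x3FF0000000000000ULL ^ DSIGN_BIT);
  return sum == 0x400E000000000000ULL && diff == 0;
}
#endif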
u64 __subdf3(u64 a, u64 b) { return __adddf3(a, b ^ DSIGN_BIT); }

u64 __negdf2(u64 a) { return a ^ DSIGN_BIT; }

// 64x64 -> 128-bit product via four 32x32 partial products, with the
// 128-bit result held as `prod_hi` / `prod_lo`. The full product of
// two 53-bit mantissas has its leading bit at position 104 or 105;
// returning the 128-bit pair is not possible at this ABI, so the
// product is aligned in-line and the mantissa returned with its lead
// at position 52. *out_carry is 1 in the lead-at-105 case (caller
// must increment the exponent). Splitting this out of __muldf3 keeps
// register pressure low enough for greedy regalloc on the single-A
// W65816.
//
// Inlinable on purpose: passing a pointer to a stack local across a
// noinline boundary lowers to `sta (d,s),y` which uses DBR-relative
// addressing — broken under DBR != 0 (e.g. after a bank switch).
// Keeping these inline keeps the stores within the caller's frame.
//
// out_round encodes the rounding bits as (guard << 1) | sticky. The
// caller uses these for round-to-nearest-even.
static inline u64 mulhi64Aligned(u64 ma, u64 mb, u16 *out_carry,
                                 u16 *out_round) {
  u32 alo = (u32)ma;
  u32 ahi = (u32)(ma >> 32);
  u32 blo = (u32)mb;
  u32 bhi = (u32)(mb >> 32);
  u64 ll = (u64)alo * (u64)blo;
  u64 lh = (u64)alo * (u64)bhi;
  u64 hl = (u64)ahi * (u64)blo;
  u64 hh = (u64)ahi * (u64)bhi;
  // Safe from carry loss: ma/mb are 53-bit mantissas, so ahi and bhi
  // are at most 21 bits, lh and hl are each below 2^53, and the sum
  // below stays well inside 64 bits. (A general 64x64 multiply would
  // need explicit carry handling here.)
  u64 mid = lh + hl + (ll >> 32);
  u64 prod_hi = hh + (mid >> 32);
  u64 prod_lo = (ll & 0xFFFFFFFFULL) | ((mid & 0xFFFFFFFFULL) << 32);
  if (prod_hi & (1ULL << 41)) {
    // Lead-at-105 case (bit 105 of the product is bit 41 of
    // prod_hi): shift right 53 within the full product. The bit at
    // prod_lo position 52 is the guard; bits 0..51 are sticky.
    *out_carry = 1;
    u16 guard = (u16)((prod_lo >> 52) & 1);
    u16 sticky = (u16)((prod_lo & ((1ULL << 52) - 1)) != 0);
    *out_round = (guard << 1) | sticky;
    return (prod_hi << 11) | (prod_lo >> 53);
  }
  // Lead-at-104 case: shift right 52. Guard at prod_lo bit 51,
  // sticky = OR of bits 0..50.
  *out_carry = 0;
  u16 guard = (u16)((prod_lo >> 51) & 1);
  u16 sticky = (u16)((prod_lo & ((1ULL << 51) - 1)) != 0);
  *out_round = (guard << 1) | sticky;
  return (prod_hi << 12) | (prod_lo >> 52);
}
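// Worked example for mulhi64Aligned (illustrative; the
// SOFTDOUBLE_HOST_TEST guard is a hypothetical addition). Squaring
// the mantissa of 1.5 (1.1b << 52) gives the 128-bit product
// 9 * 2^102, whose lead bit sits at position 105, so the function
// reports carry = 1, round = 0, and returns the mantissa of 2.25
// (1.001b << 52): 1.5 * 1.5 = 2.25 = 1.125 * 2^1.
#ifdef SOFTDOUBLE_HOST_TEST
static int test_mulhi64Aligned(void) {
  u16 carry, round_bits;
  u64 m = mulhi64Aligned(0x0018000000000000ULL, 0x0018000000000000ULL,
                         &carry, &round_bits);
  return m == 0x0012000000000000ULL && carry == 1 && round_bits == 0;
}
#endif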
u64 __muldf3(u64 a, u64 b) {
  u64 sa, sb, ma, mb;
  s16 ea, eb;
  u16 ca = dclass(a, &sa, &ea, &ma);
  u16 cb = dclass(b, &sb, &eb, &mb);
  u64 sr = sa ^ sb;
  if (ca == 0 || cb == 0) return sr; // 0 * x → (signed) zero
  u16 carry;
  u16 round_bits;
  u64 mr = mulhi64Aligned(ma, mb, &carry, &round_bits);
  s16 er = ea + eb + (s16)carry;
  // Round to nearest, ties to even.
  int guard = (round_bits >> 1) & 1;
  int sticky = round_bits & 1;
  if (guard && (sticky || (mr & 1))) {
    mr++;
    if (mr & (1ULL << 53)) {
      mr >>= 1;
      er++;
    }
  }
  return dpack(sr, er, mr);
}

u64 __divdf3(u64 a, u64 b) {
  u64 sa, sb, ma, mb;
  s16 ea, eb;
  u16 ca = dclass(a, &sa, &ea, &ma);
  u16 cb = dclass(b, &sb, &eb, &mb);
  u64 sr = sa ^ sb;
  if (ca == 0) return sr;
  if (cb == 0) return sr | DEXP_MASK; // div-by-zero → inf
  // Long division: handle the leading quotient bit explicitly (since
  // we need to "consume" the dividend's leading 1 by subtracting),
  // then generate 52 more fractional bits by shifting r left and
  // testing. The previous shift-and-test-only loop over-counted
  // when r == mb after subtraction (e.g. 2.0/1.0 returned ~4.0).
  s16 er = ea - eb;
  // Normalize so the dividend is in [mb, 2*mb). This ensures the
  // leading quotient bit lands at position 52 below.
  if (ma < mb) {
    ma <<= 1;
    er--;
  }
  // Handle the leading quotient bit explicitly.
  u64 q = DMANT_LEAD;
  u64 r = ma - mb;
  // `volatile vmb` forces mb to be re-read from memory inside the
  // loop. Without it, the W65816 codegen miscompiles `r >= mb` and
  // `r -= mb` when this is the 3rd+ chained __divdf3 after prior
  // softDouble libcalls (sqrt3 Newton iteration — the 3rd iteration
  // returned 0.0 instead of 1.41421). Adding `volatile` to either
  // `r` or `mb` alone fixes it, suggesting the compiler keeps one of
  // them in registers across loop iterations and a JSL inside the
  // loop (__ashlsi3 for `r <<= 1`) clobbers the held value. The real
  // fix lives in the W65816 backend's u64-shift lowering; volatile
  // here is the conservative workaround.
  volatile u64 vmb = mb;
  // Compute 52 more fractional bits via standard shift-test-subtract.
  for (int i = 51; i >= 0; i--) {
    r <<= 1;
    if (r >= vmb) {
      r -= vmb;
      q |= (1ULL << i);
    }
  }
  mb = vmb; // resync: the rounding code below reads mb again
  // Round to nearest, ties to even. Generate one extra quotient bit
  // (the "guard"), check the remainder for a non-zero "sticky" tail,
  // and round q up when guard=1 and (sticky || (q & 1)). Without
  // this the result would truncate toward zero, off by 1 ULP from
  // gcc's RNE result on cases like 1.5/2.5.
  r <<= 1;
  int guard = (r >= mb) ? 1 : 0;
  if (guard) r -= mb;
  int sticky = (r != 0) ? 1 : 0;
  if (guard && (sticky || (q & 1))) {
    q++;
    if (q & (1ULL << 53)) {
      // Mantissa overflow into bit 53: renormalize.
      q >>= 1;
      er++;
    }
  }
  return dpack(sr, er, q);
}

s16 __cmpdf2(u64 a, u64 b) {
  u64 sa = a & DSIGN_BIT;
  u64 sb = b & DSIGN_BIT;
  if (sa != sb) {
    // Negative < positive, unless both are zeros (+0 == -0); the
    // shifted OR clears both sign bits in one test.
    if (((a | b) << 1) == 0) return 0;
    return sa ? -1 : 1;
  }
  if (a == b) return 0;
  // Same sign: for negatives the larger bit pattern is the more
  // negative value, so the comparison flips.
  if (sa) return a < b ? 1 : -1;
  return a < b ? -1 : 1;
}
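// A minimal host-side check for __divdf3 (illustrative; the
// SOFTDOUBLE_HOST_TEST guard is a hypothetical addition). 2.0 / 1.0
// exercises the explicit leading-quotient-bit path the comment above
// calls out, and 1.0 / 3.0 exercises guard/sticky rounding: the
// quotient's repeating 01 fraction rounds down under RNE to the
// standard encoding 0x3FD5555555555555.
#ifdef SOFTDOUBLE_HOST_TEST
static int test_divdf3(void) {
  u64 two = __divdf3(0x4000000000000000ULL, 0x3FF0000000000000ULL);
  u64 third = __divdf3(0x3FF0000000000000ULL, 0x4008000000000000ULL);
  return two == 0x4000000000000000ULL && third == 0x3FD5555555555555ULL;
}
#endif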
s16 __unorddf2(u64 a, u64 b) {
  // Returns nonzero if either argument is NaN.
  u64 ea = (a >> DEXP_SHIFT) & 0x7FF;
  u64 eb = (b >> DEXP_SHIFT) & 0x7FF;
  if (ea == 0x7FF && (a & DMANT_MASK) != 0) return 1;
  if (eb == 0x7FF && (b & DMANT_MASK) != 0) return 1;
  return 0;
}

// Comparison predicates. __eqdf2/__nedf2 return zero iff the
// operands are equal, matching the libgcc convention (callers test
// against zero). The relational helpers return plain 0/1 booleans;
// note that stock libgcc callers test the sign of these results
// (e.g. `__ltdf2(a, b) < 0`), so this shape assumes the W65816
// backend tests them against zero instead. NaNs are not considered;
// clang emits __unorddf2 separately for unordered checks.
s16 __eqdf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __nedf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __ltdf2(u64 a, u64 b) { return __cmpdf2(a, b) < 0; }
s16 __ledf2(u64 a, u64 b) { return __cmpdf2(a, b) <= 0; }
s16 __gtdf2(u64 a, u64 b) { return __cmpdf2(a, b) > 0; }
s16 __gedf2(u64 a, u64 b) { return __cmpdf2(a, b) >= 0; }

// double <-> float conversions.
u64 __extendsfdf2(u32 x) {
  u64 sign = ((u64)x & 0x80000000UL) << 32;
  s16 e = (s16)((x >> 23) & 0xFF);
  u32 m = x & 0x7FFFFFUL;
  if (e == 0) return sign; // zero or subnormal (flushed)
  if (e == 0xFF) {
    // Inf/NaN: widen the payload into the double's mantissa field.
    return sign | DEXP_MASK | ((u64)m << 29);
  }
  s16 unbiased = e - 127;
  return dpack(sign, unbiased, ((u64)m << 29) | DMANT_LEAD);
}

u32 __truncdfsf2(u64 x) {
  u64 sign = (x & DSIGN_BIT) >> 32;
  s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
  u64 m = x & DMANT_MASK;
  if (e == 0) return (u32)sign;
  if (e == 0x7FF) {
    return (u32)sign | 0x7F800000UL | (u32)(m >> 29);
  }
  s16 unbiased = e - DEXP_BIAS;
  s16 fexp = unbiased + 127;
  if (fexp >= 255) return (u32)sign | 0x7F800000UL; // overflow → inf
  if (fexp <= 0) return (u32)sign;                  // underflow → zero
  // Truncate the low 29 mantissa bits (no rounding on narrowing).
  return (u32)sign | ((u32)fexp << 23) | (u32)((m >> 29) & 0x7FFFFFUL);
}

// double <-> integer conversions.
u64 __floatsidf(s32 x) {
  if (x == 0) return 0;
  u64 sign = (x < 0) ? DSIGN_BIT : 0;
  // Take the magnitude in unsigned arithmetic so x == INT32_MIN does
  // not overflow the signed negation.
  u32 mag = (x < 0) ? (0UL - (u32)x) : (u32)x;
  u64 m = mag;
  s16 e = 0;
  while ((m & DMANT_LEAD) == 0) {
    m <<= 1;
    e--;
  }
  // The loop left e = -(52 - i) for an input whose leading bit was
  // at position i; adding 52 makes e the true binary exponent i.
  e += 52;
  return dpack(sign, e, m);
}

u64 __floatunsidf(u32 x) {
  if (x == 0) return 0;
  u64 m = (u64)x;
  s16 e = 0;
  while ((m & DMANT_LEAD) == 0) {
    m <<= 1;
    e--;
  }
  e += 52;
  return dpack(0, e, m);
}

s32 __fixdfsi(u64 x) {
  u64 sign = x & DSIGN_BIT;
  s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
  if (e == 0) return 0;
  if (e == 0x7FF) return sign ? (s32)0x80000000 : 0x7FFFFFFF;
  s16 unbiased = e - DEXP_BIAS;
  if (unbiased < 0) return 0; // |x| < 1 truncates to 0
  if (unbiased > 30) return sign ? (s32)0x80000000 : 0x7FFFFFFF; // saturate
  u64 m = (x & DMANT_MASK) | DMANT_LEAD;
  // unbiased <= 30 here, so this always shifts right (by >= 22).
  m >>= (52 - unbiased);
  return sign ? -(s32)m : (s32)m;
}

// __fixunsdfsi: double → uint32. Saturates to 0 for negative inputs
// and to 0xFFFFFFFF for inputs >= 2^32. Used by clang when casting
// double values to unsigned integer types.
u32 __fixunsdfsi(u64 x) {
  if (x & DSIGN_BIT) return 0; // negative → 0
  u16 e = (u16)((x >> DEXP_SHIFT) & 0x7FF);
  if (e == 0) return 0;
  if (e == 0x7FF) return 0xFFFFFFFF;
  s16 unbiased = (s16)e - DEXP_BIAS;
  if (unbiased < 0) return 0;
  if (unbiased > 31) return 0xFFFFFFFF;
  u64 m = (x & DMANT_MASK) | DMANT_LEAD;
  // unbiased <= 31 here, so this always shifts right (by >= 21).
  m >>= (52 - unbiased);
  return (u32)m;
}
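// Hypothetical host-side round-trip check for the conversions (the
// SOFTDOUBLE_HOST_TEST guard and vectors are illustrative additions,
// not part of the library). It assumes the typedefs above have their
// intended widths (32-bit `unsigned long`), as on the W65816 target;
// a 64-bit host would need adjusted typedefs. INT32_MIN is included
// because __floatsidf's magnitude computation must not negate it as
// a signed value: -2^31 is exactly representable and encodes as
// 0xC1E0000000000000.
#ifdef SOFTDOUBLE_HOST_TEST
static int test_conversions(void) {
  if (__floatsidf(-7) != 0xC01C000000000000ULL) return 0; // -7.0
  if (__floatsidf((s32)0x80000000UL) != 0xC1E0000000000000ULL) return 0;
  if (__fixdfsi(0xC01C000000000000ULL) != -7) return 0;
  if (__fixunsdfsi(0x41EFFFFFFFE00000ULL) != 0xFFFFFFFFUL) return 0; // 2^32-1
  return 1;
}
#endif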