// 65816-llvm-mos/runtime/src/softDouble.c

// Real double-precision IEEE 754 soft-float for the W65816. Treats
// a `double` as `unsigned long long` (64-bit) and operates on its
// bit pattern. Results are returned by value in the i64 ABI slots
// A:X:Y:DP[$F0].
//
// Earlier attempts crashed the Register Coalescer; the greedy
// regalloc landing fixed the underlying register pressure problem.
// Each routine is broken into small helpers to keep frames shallow.
//
// Local typedefs only (no stdint.h; clang's host stdint.h pulls in
// glibc headers).
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
typedef unsigned char u8;
#define DSIGN_BIT 0x8000000000000000ULL
#define DEXP_MASK 0x7FF0000000000000ULL
#define DMANT_MASK 0x000FFFFFFFFFFFFFULL
#define DMANT_LEAD 0x0010000000000000ULL
#define DEXP_SHIFT 52
#define DEXP_BIAS 1023
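// Field layout: bit 63 = sign, bits 62..52 = biased exponent,
// bits 51..0 = mantissa. DMANT_LEAD (bit 52) is the implicit leading
// 1 in the *unpacked* 53-bit mantissa, not a stored field.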
// noinline: keeps register pressure in the callers (esp. __muldf3)
// low enough for greedy regalloc to allocate at -O2. Without this,
// __muldf3 fails with "ran out of registers during register
// allocation" — too many concurrent u64 lifetimes (sa, sb, ma, mb,
// sr, mr) and the dpack inline blew it past the spill capacity.
__attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) {
if (mant == 0) return sign;
// Compute the biased exponent as a signed value so underflow is
// caught before the overflow test; casting a negative sum straight
// to u64 would satisfy `e >= 2047` and wrongly return infinity.
s16 e = (s16)(exp + DEXP_BIAS);
if (e <= 0) {
// Underflow → zero (flush-to-zero, no subnormals).
return sign;
}
if (e >= 2047) {
// Overflow → infinity.
return sign | DEXP_MASK;
}
return sign | ((u64)e << DEXP_SHIFT) | (mant & DMANT_MASK);
}
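// Worked example: dpack(0, 0, DMANT_LEAD) packs 1.0. e = 0 + 1023 =
// 0x3FF, the lead bit is masked off by DMANT_MASK, and the result is
// (0x3FFULL << 52) = 0x3FF0000000000000ULL, the bit pattern of 1.0.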
// Decompose `x` into sign / unbiased-exp / mantissa-with-leading-bit.
// Returns the class: 0=zero, 1=normal, 2=infinity, 3=NaN.
// noinline reduces register pressure in __muldf3/__divdf3/__adddf3
// — without it, greedy regalloc runs out of registers in __muldf3
// at -O2. Now safe because pointer-arg writes lower to STBptr/STAptr
// which use [$E0],Y indirect-long with the bank byte forced to 0
// (DBR-independent). See `feedback_dbr_ptr_deref_spill.md`.
__attribute__((noinline))
static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) {
*out_sign = x & DSIGN_BIT;
s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
u64 m = x & DMANT_MASK;
if (e == 0) {
*out_exp = 0;
*out_mant = 0;
return 0;
}
if (e == 0x7FF) {
*out_exp = 0x7FF;
*out_mant = m;
return (m == 0) ? 2 : 3;
}
*out_exp = e - DEXP_BIAS;
*out_mant = m | DMANT_LEAD;
return 1;
}
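// Examples: dclass(0x3FF0000000000000ULL /* 1.0 */, ...) gives class 1
// with sign 0, exp 0, mant DMANT_LEAD; 0 gives class 0; infinity
// (0x7FF0000000000000ULL) gives class 2 with mant 0, so non-finite
// mantissas never carry the lead bit.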
u64 __adddf3(u64 a, u64 b) {
u64 sa, sb, ma, mb;
s16 ea, eb;
u16 ca = dclass(a, &sa, &ea, &ma);
u16 cb = dclass(b, &sb, &eb, &mb);
if (ca == 0) return b;
if (cb == 0) return a;
// Mixed finite/non-finite operands are handled by the d > 56
// early-outs in the alignment step (non-finite exponents class as
// 2047). Two non-finite operands alias to the same exponent and
// would cancel to 0 below, so special-case them here.
if (ca >= 2 && cb >= 2) {
if (ca == 3 || cb == 3 || sa != sb)
return DEXP_MASK | (1ULL << 51); // default quiet NaN (payloads dropped; inf - inf lands here)
return a; // inf + inf, same sign
}
// Shift mantissas left by 3 to reserve guard / round / sticky bits
// below position 0. Lead bit is now at position 55 instead of 52.
// The sticky bit is preserved by ORing it into the LSB whenever a
// significant bit would otherwise be shifted off the right side
// (during alignment or post-add normalization). At the end, RNE
// rounds based on bits 2..0 (guard, round, sticky) and shifts back.
ma <<= 3;
mb <<= 3;
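// Layout after the shift: bit 55 = lead, bits 54..3 = fraction,
// bits 2..0 = guard/round/sticky (initially zero).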
// Align mantissas to common exponent. The smaller-exp operand is
// shifted right; bits shifted past position 0 become sticky.
if (ea > eb) {
s16 d = ea - eb;
if (d > 56) return a;
u64 sticky = 0;
if (d > 3) {
u64 mask = (d >= 64) ? ~0ULL : ((1ULL << d) - 1);
sticky = (mb & mask) ? 1 : 0;
}
mb = (d >= 64) ? 0 : (mb >> d);
mb |= sticky;
eb = ea;
} else if (eb > ea) {
s16 d = eb - ea;
if (d > 56) return b;
u64 sticky = 0;
if (d > 3) {
u64 mask = (d >= 64) ? ~0ULL : ((1ULL << d) - 1);
sticky = (ma & mask) ? 1 : 0;
}
ma = (d >= 64) ? 0 : (ma >> d);
ma |= sticky;
ea = eb;
}
u64 mr;
u64 sr;
if (sa == sb) {
mr = ma + mb;
sr = sa;
} else {
if (ma >= mb) {
mr = ma - mb;
sr = sa;
} else {
mr = mb - ma;
sr = sb;
}
}
if (mr == 0) return 0;
// Renormalize. Lead bit should land at position 55 (= 52 + 3 GRS).
// Right-shift first to bring an over-wide sum back in range; then
// left-shift if subtraction left the lead below 55. Reverse order
// would shift an over-wide value out of u64 range entirely.
// Use if + do-while because pure `while (cond) body` triggers a
// ptr32 backend bug: PHP/PLP wrap pass mis-identifies the loop's
// pre-test LDA reload as flag corruption and wraps the wrong
// range, so the BEQ tests stale flags and the loop body never
// fires. `do { } while (cond)` is unaffected (test-after-body).
if (mr & ~((1ULL << 56) - 1)) {
do {
u64 sticky_bit = mr & 1;
mr = (mr >> 1) | sticky_bit;
ea++;
} while (mr & ~((1ULL << 56) - 1));
}
if ((mr & (1ULL << 55)) == 0 && mr != 0) {
do {
mr <<= 1;
ea--;
} while ((mr & (1ULL << 55)) == 0 && mr != 0);
}
// Round to nearest, ties to even. Bits 0/1 are sticky+round, bit 2
// is guard, bit 3 is mantissa LSB.
int guard = (int)((mr >> 2) & 1);
int sticky = (int)(mr & 0x3);
int lsb = (int)((mr >> 3) & 1);
mr >>= 3; // drop GRS bits to get the 53-bit mantissa
if (guard && (sticky || lsb)) {
mr++;
if (mr & (1ULL << 53)) {
mr >>= 1;
ea++;
}
}
return dpack(sr, ea, mr);
}
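#ifdef SOFTDOUBLE_SELFTEST
// Minimal host-side sanity sketch; the guard macro and function name
// are illustrative only, not part of the runtime build. 1.0 + 2.0
// exercises the align-by-1 path with all-zero GRS bits:
// 0x3FF0...0 + 0x4000...0 must yield 3.0 = 0x4008000000000000.
static int selftest_adddf3(void) {
return __adddf3(0x3FF0000000000000ULL, 0x4000000000000000ULL)
== 0x4008000000000000ULL;
}
#endif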
u64 __subdf3(u64 a, u64 b) {
return __adddf3(a, b ^ DSIGN_BIT);
}
u64 __negdf2(u64 a) {
return a ^ DSIGN_BIT;
}
// Result pair from an earlier mulhi design: the aligned mantissa plus
// a carry flag indicating the product's leading bit was at 105 (so the
// caller must increment the exponent). Currently unused;
// mulhi64Aligned below returns through pointer out-params instead.
typedef struct {
u64 mantissa;
u16 carry;
} MantCarryT;
// 64x64 -> 128-bit product of the two mantissas. Returning the raw
// high/low halves as a packed u64 pair is not workable here, so the
// product is shifted in-line and the aligned 53-bit mantissa is
// returned directly. Splitting this out of __muldf3 keeps register
// pressure low enough for greedy regalloc on the single-A W65816.
//
// Inlinable on purpose: passing a pointer to a stack local across a
// noinline boundary lowers to `sta (d,s),y` which uses DBR-relative
// addressing — broken under DBR != 0 (e.g. after a bank switch).
// Keeping these inline keeps the stores within the caller's frame.
//
// out_round encodes the round bits as (guard << 1) | sticky. Caller
// uses these for round-to-nearest-even.
static inline u64 mulhi64Aligned(u64 ma, u64 mb,
u16 *out_carry, u16 *out_round) {
u32 alo = (u32)ma;
u32 ahi = (u32)(ma >> 32);
u32 blo = (u32)mb;
u32 bhi = (u32)(mb >> 32);
u64 ll = (u64)alo * (u64)blo;
u64 lh = (u64)alo * (u64)bhi;
u64 hl = (u64)ahi * (u64)blo;
u64 hh = (u64)ahi * (u64)bhi;
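// Schoolbook split: ma * mb = hh*2^64 + (lh + hl)*2^32 + ll. The
// `mid` sum below cannot overflow: dclass mantissas are at most 53
// bits, so ahi/bhi < 2^21 and lh/hl < 2^53 each; a general-purpose
// 64x64 mulhi would need an extra carry out of this addition.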
u64 mid = lh + hl + (ll >> 32);
u64 prod_hi = hh + (mid >> 32);
u64 prod_lo = (ll & 0xFFFFFFFFULL) | ((mid & 0xFFFFFFFFULL) << 32);
if (prod_hi & (1ULL << 41)) {
// Lead-at-105 case: shift right 53 within full product. The
// bit at prod_lo position 52 is the guard; bits 0..51 are sticky.
*out_carry = 1;
u16 guard = (u16)((prod_lo >> 52) & 1);
u16 sticky = (u16)((prod_lo & ((1ULL << 52) - 1)) != 0);
*out_round = (guard << 1) | sticky;
return (prod_hi << 11) | (prod_lo >> 53);
}
// Lead-at-104 case: shift right 52. Guard at prod_lo bit 51,
// sticky = OR of bits 0..50.
*out_carry = 0;
u16 guard = (u16)((prod_lo >> 51) & 1);
u16 sticky = (u16)((prod_lo & ((1ULL << 51) - 1)) != 0);
*out_round = (guard << 1) | sticky;
return (prod_hi << 12) | (prod_lo >> 52);
}
u64 __muldf3(u64 a, u64 b) {
u64 sa, sb, ma, mb;
s16 ea, eb;
u16 ca = dclass(a, &sa, &ea, &ma);
u16 cb = dclass(b, &sb, &eb, &mb);
u64 sr = sa ^ sb;
if (ca == 0 || cb == 0) return sr; // 0 * inf also folds to signed zero here
// Non-finite operands: NaN folds into infinity (no payloads carried
// through multiply). Without this, an inf operand's zero mantissa
// would multiply out to a signed zero.
if (ca >= 2 || cb >= 2) return sr | DEXP_MASK;
u16 carry;
u16 round_bits;
u64 mr = mulhi64Aligned(ma, mb, &carry, &round_bits);
s16 er = ea + eb + (s16)carry;
// Round to nearest, ties to even.
int guard = (round_bits >> 1) & 1;
int sticky = round_bits & 1;
if (guard && (sticky || (mr & 1))) {
mr++;
if (mr & (1ULL << 53)) {
mr >>= 1;
er++;
}
}
return dpack(sr, er, mr);
}
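#ifdef SOFTDOUBLE_SELFTEST
// Illustrative sketch only (hypothetical guard macro). 1.5 * 2.5 =
// 3.75 exercises the lead-at-104 path (carry = 0, exact product, no
// rounding): 0x3FF8...0 * 0x4004...0 -> 0x400E000000000000.
static int selftest_muldf3(void) {
return __muldf3(0x3FF8000000000000ULL, 0x4004000000000000ULL)
== 0x400E000000000000ULL;
}
#endif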
u64 __divdf3(u64 a, u64 b) {
u64 sa, sb, ma, mb;
s16 ea, eb;
u16 ca = dclass(a, &sa, &ea, &ma);
u16 cb = dclass(b, &sb, &eb, &mb);
u64 sr = sa ^ sb;
if (ca == 0) return sr; // 0/x (and 0/0) → signed zero
if (cb == 0) return sr | DEXP_MASK; // div-by-zero → inf
// Non-finite operands: NaN and inf dividends produce inf, an inf
// divisor produces zero. Without these, the long division below runs
// with a zero divisor mantissa and fills the quotient with ones.
if (ca >= 2) return sr | DEXP_MASK;
if (cb >= 2) return sr;
// Long division: handle the leading quotient bit explicitly (since
// we need to "consume" the dividend's leading 1 by subtracting),
// then generate 52 more fractional bits by shifting r left and
// testing. The previous shift-and-test-only loop over-counted
// when r == mb after subtraction (e.g. 2.0/1.0 returned ~4.0).
s16 er = ea - eb;
// Normalize so the dividend is in [mb, 2*mb). This ensures the
// leading quotient bit will land at position 52 below.
if (ma < mb) {
ma <<= 1;
er--;
}
// Handle the leading quotient bit explicitly.
u64 q = DMANT_LEAD;
u64 r = ma - mb;
// `volatile vmb`: forces mb to be re-read from memory inside the
// loop. Without this, the W65816 codegen miscompiles `r >= mb` and
// `r -= mb` when called as the 3rd+ chained `__divdf3` after prior
// softDouble libcalls (sqrt3 Newton iter — 3rd iter returned 0.0
// instead of 1.41421). Adding `volatile` to either `r` or `mb`
// alone fixes it, suggesting the compiler is keeping one of them
// in registers across loop iterations and a JSL inside the loop
// (__ashlsi3 for `r <<= 1`) clobbers the held value. The real
// fix lives in the W65816 backend's u64-shift lowering; volatile
// here is the conservative workaround.
volatile u64 vmb = mb;
// Compute 52 more fractional bits via standard shift-test-subtract.
for (int i = 51; i >= 0; i--) {
r <<= 1;
if (r >= vmb) {
r -= vmb;
q |= (1ULL << i);
}
}
mb = vmb; // resync: the rounding code below reads mb, not vmb
// Round to nearest, ties to even. Generate one extra bit (the
// "guard"), examine the remainder for any non-zero "sticky" tail,
// and round q up when guard=1 and (sticky || (q & 1)). Without
// this we'd be truncate-toward-zero, off by 1 ULP from gcc's RNE
// result on cases like 1.5/2.5.
r <<= 1;
int guard = (r >= mb) ? 1 : 0;
if (guard) r -= mb;
int sticky = (r != 0) ? 1 : 0;
if (guard && (sticky || (q & 1))) {
q++;
if (q & (1ULL << 53)) { // mantissa overflow into bit 53 -> renormalize
q >>= 1;
er++;
}
}
return dpack(sr, er, q);
}
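#ifdef SOFTDOUBLE_SELFTEST
// Illustrative sketch only (hypothetical guard macro). 1.0 / 2.0 has
// ma == mb, so the pre-shift is skipped, er = -1, the remainder stays
// zero, and no rounding fires: result 0.5 = 0x3FE0000000000000.
static int selftest_divdf3(void) {
return __divdf3(0x3FF0000000000000ULL, 0x4000000000000000ULL)
== 0x3FE0000000000000ULL;
}
#endif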
s16 __cmpdf2(u64 a, u64 b) {
u64 sa = a & DSIGN_BIT;
u64 sb = b & DSIGN_BIT;
if (sa != sb) {
// Negative < positive (unless both zero).
if ((a | b) << 1 == 0) return 0;
return sa ? -1 : 1;
}
if (a == b) return 0;
if (sa) return a < b ? 1 : -1;
return a < b ? -1 : 1;
}
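// Why this works: for same-sign operands, IEEE bit patterns order the
// same way as their unsigned integer values, so `a < b` compares
// magnitudes directly; the sense flips when both are negative. NaN
// operands get an arbitrary ordering; callers needing IEEE semantics
// pair this with __unorddf2.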
s16 __unorddf2(u64 a, u64 b) {
// Returns nonzero if either is NaN.
u64 ea = (a >> DEXP_SHIFT) & 0x7FF;
u64 eb = (b >> DEXP_SHIFT) & 0x7FF;
if (ea == 0x7FF && (a & DMANT_MASK) != 0) return 1;
if (eb == 0x7FF && (b & DMANT_MASK) != 0) return 1;
return 0;
}
s16 __eqdf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __nedf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __ltdf2(u64 a, u64 b) { return __cmpdf2(a, b) < 0; }
s16 __ledf2(u64 a, u64 b) { return __cmpdf2(a, b) <= 0; }
s16 __gtdf2(u64 a, u64 b) { return __cmpdf2(a, b) > 0; }
s16 __gedf2(u64 a, u64 b) { return __cmpdf2(a, b) >= 0; }
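// __eqdf2 and __nedf2 share a body on purpose: per the libgcc
// soft-float convention, both return nonzero iff the operands are
// unequal; only the caller's test against zero differs.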
// double <-> float conversions.
u64 __extendsfdf2(u32 x) {
u64 sign = ((u64)x & 0x80000000UL) << 32;
s16 e = (s16)((x >> 23) & 0xFF);
u32 m = x & 0x7FFFFFUL;
if (e == 0) return sign;
if (e == 0xFF) {
return sign | DEXP_MASK | ((u64)m << 29);
}
s16 unbiased = e - 127;
return dpack(sign, unbiased, ((u64)m << 29) | DMANT_LEAD);
}
u32 __truncdfsf2(u64 x) {
u64 sign = (x & DSIGN_BIT) >> 32;
s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
u64 m = x & DMANT_MASK;
if (e == 0) return (u32)sign;
if (e == 0x7FF) {
return (u32)sign | 0x7F800000UL | (u32)(m >> 29);
}
s16 unbiased = e - DEXP_BIAS;
s16 fexp = unbiased + 127;
if (fexp >= 255) return (u32)sign | 0x7F800000UL;
if (fexp <= 0) return (u32)sign;
return (u32)sign | ((u32)fexp << 23) | (u32)((m >> 29) & 0x7FFFFFUL);
}
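// Note: the mantissa narrowing above (m >> 29) truncates rather than
// rounding to nearest-even, so double→float conversion can land 1 ULP
// below the IEEE result; a simplification in the same spirit as the
// flush-to-zero handling of subnormals.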
// double <-> integer conversions.
u64 __floatsidf(s32 x) {
if (x == 0) return 0;
u64 sign = (x < 0) ? DSIGN_BIT : 0;
u64 m = (u64)((x < 0) ? -(s64)x : (s64)x); // widen before negating: -x overflows on INT_MIN
s16 e = 0;
while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
e += 31 + 21; // = 52: cancels the normalize shifts, leaving e at the MSB's bit index
return dpack(sign, e, m);
}
u64 __floatunsidf(u32 x) {
if (x == 0) return 0;
u64 m = (u64)x;
s16 e = 0;
while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
e += 31 + 21; // = 52, as in __floatsidf
return dpack(0, e, m);
}
s32 __fixdfsi(u64 x) {
u64 sign = x & DSIGN_BIT;
s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
if (e == 0) return 0;
if (e == 0x7FF) return sign ? (s32)0x80000000 : 0x7FFFFFFF;
s16 unbiased = e - DEXP_BIAS;
if (unbiased < 0) return 0;
if (unbiased > 30) return sign ? (s32)0x80000000 : 0x7FFFFFFF;
u64 m = (x & DMANT_MASK) | DMANT_LEAD;
s16 shift = 52 - unbiased;
if (shift >= 0) m >>= shift; else m <<= -shift;
return sign ? -(s32)m : (s32)m;
}
// __fixunsdfsi: double → uint32 with saturation. Negative inputs clamp
// to 0; inputs >= 2^32, and NaN (which shares the 0x7FF exponent test),
// clamp to 0xFFFFFFFF. Used by clang when casting double values to
// unsigned integer types.
u32 __fixunsdfsi(u64 x) {
if (x & DSIGN_BIT) return 0; // negative → 0
u16 e = (u16)((x >> DEXP_SHIFT) & 0x7FF);
if (e == 0) return 0;
if (e == 0x7FF) return 0xFFFFFFFF;
s16 unbiased = (s16)e - DEXP_BIAS;
if (unbiased < 0) return 0;
if (unbiased > 31) return 0xFFFFFFFF;
u64 m = (x & DMANT_MASK) | DMANT_LEAD;
if (unbiased >= 52) {
m <<= (unbiased - 52);
} else {
m >>= (52 - unbiased);
}
return (u32)m;
}
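// Worked example: UINT32_MAX as a double is 0x41EFFFFFFFE00000ULL
// (unbiased exp 31). Restoring the lead bit gives m = (2^32 - 1) << 21,
// and the >> (52 - 31) branch recovers exactly 0xFFFFFFFF. The
// `unbiased >= 52` branch is unreachable given the cap at 31, but it
// keeps the shift well-defined in both directions.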