// Real double-precision IEEE 754 soft-float for the W65816. Treats
// a `double` as `unsigned long long` (64-bit) and operates on its
// bit pattern. Returns by-value at the i64 ABI A:X:Y:DP[$F0].
//
// Earlier attempts crashed the Register Coalescer; the greedy
// regalloc landing fixed the underlying register pressure problem.
// Each routine is broken into small helpers to keep frames shallow.
|
|
// Local typedefs (no stdint.h — clang's host stdint pulls glibc).
// Widths assume the W65816 target data model: int is 16-bit, long
// is 32-bit, long long is 64-bit — TODO confirm against the target.
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
typedef unsigned char u8;
|
|
|
|
// IEEE 754 binary64 field layout.
#define DSIGN_BIT 0x8000000000000000ULL // bit 63: sign
#define DEXP_MASK 0x7FF0000000000000ULL // bits 62..52: biased exponent
#define DMANT_MASK 0x000FFFFFFFFFFFFFULL // bits 51..0: fraction
#define DMANT_LEAD 0x0010000000000000ULL // implicit leading 1 at bit 52
#define DEXP_SHIFT 52 // bit position of the exponent field
#define DEXP_BIAS 1023 // binary64 exponent bias
|
|
|
|
// noinline: keeps register pressure in the callers (esp. __muldf3)
|
|
// low enough for greedy regalloc to allocate at -O2. Without this,
|
|
// __muldf3 fails with "ran out of registers during register
|
|
// allocation" — too many concurrent u64 lifetimes (sa, sb, ma, mb,
|
|
// sr, mr) and the dpack inline blew it past the spill capacity.
|
|
__attribute__((noinline)) static u64 dpack(u64 sign, s16 exp, u64 mant) {
|
|
if (mant == 0) return sign;
|
|
u64 e = (u64)(exp + DEXP_BIAS);
|
|
if (e >= 2047) {
|
|
// Overflow → infinity.
|
|
return sign | DEXP_MASK;
|
|
}
|
|
if ((s16)e <= 0) {
|
|
// Underflow → zero (flush-to-zero, no subnormals).
|
|
return sign;
|
|
}
|
|
return sign | (e << DEXP_SHIFT) | (mant & DMANT_MASK);
|
|
}
|
|
|
|
// Decompose `x` into sign / unbiased-exp / mantissa-with-leading-bit.
|
|
// Returns the class: 0=zero, 1=normal, 2=infinity, 3=NaN.
|
|
// noinline reduces register pressure in __muldf3/__divdf3/__adddf3
|
|
// — without it, greedy regalloc runs out of registers in __muldf3
|
|
// at -O2. Now safe because pointer-arg writes lower to STBptr/STAptr
|
|
// which use [$E0],Y indirect-long with the bank byte forced to 0
|
|
// (DBR-independent). See `feedback_dbr_ptr_deref_spill.md`.
|
|
__attribute__((noinline))
|
|
static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) {
|
|
*out_sign = x & DSIGN_BIT;
|
|
s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
|
|
u64 m = x & DMANT_MASK;
|
|
if (e == 0) {
|
|
*out_exp = 0;
|
|
*out_mant = 0;
|
|
return 0;
|
|
}
|
|
if (e == 0x7FF) {
|
|
*out_exp = 0x7FF;
|
|
*out_mant = m;
|
|
return (m == 0) ? 2 : 3;
|
|
}
|
|
*out_exp = e - DEXP_BIAS;
|
|
*out_mant = m | DMANT_LEAD;
|
|
return 1;
|
|
}
|
|
|
|
// Double addition at the custom i64 ABI. Decomposes both operands,
// aligns mantissas carrying 3 guard/round/sticky bits, adds or
// subtracts by sign, renormalizes, and rounds to nearest-even.
// NOTE(review): inf/NaN operands (dclass 2/3) are not special-cased
// and flow through the normal path — confirm callers never pass them.
u64 __adddf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    // x + 0 == x (0 + 0 yields b's bit pattern).
    if (ca == 0) return b;
    if (cb == 0) return a;
    // Shift mantissas left by 3 to reserve guard / round / sticky bits
    // below position 0. Lead bit is now at position 55 instead of 52.
    // The sticky bit is preserved by ORing it into the LSB whenever a
    // significant bit would otherwise be shifted off the right side
    // (during alignment or post-add normalization). At the end, RNE
    // rounds based on bits 2..0 (guard, round, sticky) and shifts back.
    ma <<= 3;
    mb <<= 3;
    // Align mantissas to common exponent. The smaller-exp operand is
    // shifted right; bits shifted past position 0 become sticky.
    if (ea > eb) {
        s16 d = ea - eb;
        // b lies entirely below a's 53+3 significant bits; since a's
        // own GRS bits are zero here, rounding cannot fire — return a.
        if (d > 56) return a;
        u64 sticky = 0;
        if (d > 3) {
            u64 mask = (d >= 64) ? ~0ULL : ((1ULL << d) - 1);
            sticky = (mb & mask) ? 1 : 0;
        }
        mb = (d >= 64) ? 0 : (mb >> d);
        mb |= sticky;
        eb = ea;
    } else if (eb > ea) {
        s16 d = eb - ea;
        if (d > 56) return b;
        u64 sticky = 0;
        if (d > 3) {
            u64 mask = (d >= 64) ? ~0ULL : ((1ULL << d) - 1);
            sticky = (ma & mask) ? 1 : 0;
        }
        ma = (d >= 64) ? 0 : (ma >> d);
        ma |= sticky;
        ea = eb;
    }
    u64 mr;
    u64 sr;
    if (sa == sb) {
        // Same sign: magnitudes add, sign carries through.
        mr = ma + mb;
        sr = sa;
    } else {
        // Opposite signs: subtract smaller magnitude from larger;
        // result takes the sign of the larger-magnitude operand.
        if (ma >= mb) {
            mr = ma - mb;
            sr = sa;
        } else {
            mr = mb - ma;
            sr = sb;
        }
    }
    // Exact cancellation → +0.
    if (mr == 0) return 0;
    // Renormalize. Lead bit should land at position 55 (= 52 + 3 GRS).
    // Right-shift first to bring an over-wide sum back in range; then
    // left-shift if subtraction left the lead below 55. Reverse order
    // would shift an over-wide value out of u64 range entirely.
    // Use if + do-while because pure `while (cond) body` triggers a
    // ptr32 backend bug: PHP/PLP wrap pass mis-identifies the loop's
    // pre-test LDA reload as flag corruption and wraps the wrong
    // range, so the BEQ tests stale flags and the loop body never
    // fires. `do { } while (cond)` is unaffected (test-after-body).
    if (mr & ~((1ULL << 56) - 1)) {
        do {
            // Keep the dropped bit sticky in the LSB so RNE still sees it.
            u64 sticky_bit = mr & 1;
            mr = (mr >> 1) | sticky_bit;
            ea++;
        } while (mr & ~((1ULL << 56) - 1));
    }
    if ((mr & (1ULL << 55)) == 0 && mr != 0) {
        do {
            mr <<= 1;
            ea--;
        } while ((mr & (1ULL << 55)) == 0 && mr != 0);
    }
    // Round to nearest, ties to even. Bits 0/1 are sticky+round, bit 2
    // is guard, bit 3 is mantissa LSB.
    int guard = (int)((mr >> 2) & 1);
    int sticky = (int)(mr & 0x3);
    int lsb = (int)((mr >> 3) & 1);
    mr >>= 3; // drop GRS bits to get the 53-bit mantissa
    if (guard && (sticky || lsb)) {
        mr++;
        // Rounding carried into bit 53 (mantissa hit 2.0): halve and
        // bump the exponent.
        if (mr & (1ULL << 53)) {
            mr >>= 1;
            ea++;
        }
    }
    return dpack(sr, ea, mr);
}
|
|
|
|
// a - b == a + (-b): flip b's sign bit and reuse __adddf3.
u64 __subdf3(u64 a, u64 b) {
    u64 negated_b = b ^ DSIGN_BIT;
    return __adddf3(a, negated_b);
}
|
|
|
|
// Negation is a pure sign-bit flip on the raw bit pattern (works
// for any input, including zero and the max-exponent encodings).
u64 __negdf2(u64 a) {
    u64 flipped = a ^ DSIGN_BIT;
    return flipped;
}
|
|
|
|
// Carry the high 64 bits of a 128-bit product in `hi` and the low 64
// in `lo`. Carry bit indicates whether the leading bit was at 105
// (caller must increment exponent).
// NOTE(review): this struct appears unused in this file —
// mulhi64Aligned returns its results through out-pointers instead.
// Confirm no other translation unit references it before removing.
typedef struct {
    u64 mantissa; // aligned mantissa, leading bit at position 52
    u16 carry;    // 1 if the raw product's lead bit was at 105
} MantCarryT;
|
|
|
|
// 64x64 -> 128-bit mantissa product via four 32x32->64 partial
// products. Returning the raw 128-bit pair to the caller is not
// practical here — instead we shift in-line and return the mantissa
// already aligned with its leading bit at position 52. Splitting
// keeps register pressure low enough for greedy regalloc on the
// single-A W65816.
//
// Inlinable on purpose: passing a pointer to a stack local across a
// noinline boundary lowers to `sta (d,s),y` which uses DBR-relative
// addressing — broken under DBR != 0 (e.g. after a bank switch).
// Keeping these inline keeps the stores within the caller's frame.
//
// *out_carry is set to 1 when the product's lead bit was at 105
// (caller must increment the exponent). out_round encodes the round
// bits as (guard << 1) | sticky. Caller uses these for
// round-to-nearest-even.
static inline u64 mulhi64Aligned(u64 ma, u64 mb,
                                 u16 *out_carry, u16 *out_round) {
    // Schoolbook split: ma = ahi:alo, mb = bhi:blo (32-bit halves).
    u32 alo = (u32)ma;
    u32 ahi = (u32)(ma >> 32);
    u32 blo = (u32)mb;
    u32 bhi = (u32)(mb >> 32);
    u64 ll = (u64)alo * (u64)blo;
    u64 lh = (u64)alo * (u64)bhi;
    u64 hl = (u64)ahi * (u64)blo;
    u64 hh = (u64)ahi * (u64)bhi;
    // For 53-bit mantissa inputs (ahi, bhi < 2^21) this sum fits in
    // u64 — presumably callers only pass dclass mantissas; confirm.
    u64 mid = lh + hl + (ll >> 32);
    u64 prod_hi = hh + (mid >> 32);
    u64 prod_lo = (ll & 0xFFFFFFFFULL) | ((mid & 0xFFFFFFFFULL) << 32);
    // prod_hi bit 41 is full-product bit 105 (64 + 41).
    if (prod_hi & (1ULL << 41)) {
        // Lead-at-105 case: shift right 53 within full product. The
        // bit at prod_lo position 52 is the guard; bits 0..51 are sticky.
        *out_carry = 1;
        u16 guard = (u16)((prod_lo >> 52) & 1);
        u16 sticky = (u16)((prod_lo & ((1ULL << 52) - 1)) != 0);
        *out_round = (guard << 1) | sticky;
        return (prod_hi << 11) | (prod_lo >> 53);
    }
    // Lead-at-104 case: shift right 52. Guard at prod_lo bit 51,
    // sticky = OR of bits 0..50.
    *out_carry = 0;
    u16 guard = (u16)((prod_lo >> 51) & 1);
    u16 sticky = (u16)((prod_lo & ((1ULL << 51) - 1)) != 0);
    *out_round = (guard << 1) | sticky;
    return (prod_hi << 12) | (prod_lo >> 52);
}
|
|
|
|
// Double multiply. sign(result) = sign(a) XOR sign(b); a zero
// operand short-circuits to the correctly-signed zero.
// NOTE(review): inf/NaN operands (dclass 2/3) are not special-cased
// and flow through the normal path — confirm callers never pass them.
u64 __muldf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    // 0 * x == x * 0 == signed zero.
    if (ca == 0 || cb == 0) return sr;
    u16 carry;
    u16 round_bits;
    // Aligned 53-bit product (lead at bit 52); carry set when the
    // raw significand product reached 2.0 (lead bit at 105).
    u64 mr = mulhi64Aligned(ma, mb, &carry, &round_bits);
    s16 er = ea + eb + (s16)carry;
    // Round to nearest, ties to even.
    int guard = (round_bits >> 1) & 1;
    int sticky = round_bits & 1;
    if (guard && (sticky || (mr & 1))) {
        mr++;
        // Rounding carried into bit 53: renormalize.
        if (mr & (1ULL << 53)) {
            mr >>= 1;
            er++;
        }
    }
    return dpack(sr, er, mr);
}
|
|
|
|
// Double divide via restoring long division. x/0 returns signed
// infinity (including 0/0 — no NaN generation). NOTE(review):
// inf/NaN operands (dclass 2/3) are not special-cased and flow
// through the normal path — confirm callers never pass them.
u64 __divdf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    // 0 / x == signed zero.
    if (ca == 0) return sr;
    if (cb == 0) return sr | DEXP_MASK; // div-by-zero → inf
    // Long division: handle the leading quotient bit explicitly (since
    // we need to "consume" the dividend's leading 1 by subtracting),
    // then generate 52 more fractional bits by shifting r left and
    // testing. The previous shift-and-test-only loop over-counted
    // when r == mb after subtraction (e.g. 2.0/1.0 returned ~4.0).
    s16 er = ea - eb;
    // Normalize so the dividend is in [mb, 2*mb). This ensures the
    // leading quotient bit will land at position 52 below.
    if (ma < mb) {
        ma <<= 1;
        er--;
    }
    // Handle the leading quotient bit explicitly.
    u64 q = DMANT_LEAD;
    u64 r = ma - mb;
    // `volatile vmb`: forces mb to be re-read from memory inside the
    // loop. Without this, the W65816 codegen miscompiles `r >= mb` and
    // `r -= mb` when called as the 3rd+ chained `__divdf3` after prior
    // softDouble libcalls (sqrt3 Newton iter — 3rd iter returned 0.0
    // instead of 1.41421). Adding `volatile` to either `r` or `mb`
    // alone fixes it, suggesting the compiler is keeping one of them
    // in registers across loop iterations and a JSL inside the loop
    // (__ashlsi3 for `r <<= 1`) clobbers the held value. The real
    // fix lives in the W65816 backend's u64-shift lowering; volatile
    // here is the conservative workaround.
    volatile u64 vmb = mb;
    // Compute 52 more fractional bits via standard shift-test-subtract.
    for (int i = 51; i >= 0; i--) {
        r <<= 1;
        if (r >= vmb) {
            r -= vmb;
            q |= (1ULL << i);
        }
    }
    mb = vmb; // resync in case below reads mb
    // Round to nearest, ties to even. Generate one extra bit (the
    // "guard"), examine the remainder for any non-zero "sticky" tail,
    // and round q up when guard=1 and (sticky || (q & 1)). Without
    // this we'd be truncate-toward-zero, off by 1 ULP from gcc's RNE
    // result on cases like 1.5/2.5.
    r <<= 1;
    int guard = (r >= mb) ? 1 : 0;
    if (guard) r -= mb;
    int sticky = (r != 0) ? 1 : 0;
    if (guard && (sticky || (q & 1))) {
        q++;
        if (q & (1ULL << 53)) { // mantissa overflow into bit 53 -> renormalize
            q >>= 1;
            er++;
        }
    }
    return dpack(sr, er, q);
}
|
|
|
|
// Three-way compare on raw double bit patterns: -1, 0, or +1.
// NOTE(review): NaN inputs are ordered as ordinary bit patterns —
// there is no unordered result here; callers should screen with
// __unorddf2 first.
s16 __cmpdf2(u64 a, u64 b) {
    u64 sign_a = a & DSIGN_BIT;
    u64 sign_b = b & DSIGN_BIT;
    if (sign_a != sign_b) {
        // Opposite signs: +0 and -0 must still compare equal
        // (all bits below the sign are zero for both).
        if (((a | b) << 1) == 0) return 0;
        return sign_a ? -1 : 1;
    }
    if (a == b) return 0;
    // Same sign: bit-pattern order matches magnitude order, which
    // flips for negative values.
    if (sign_a) return (a < b) ? 1 : -1;
    return (a < b) ? -1 : 1;
}
|
|
|
|
// Returns nonzero iff either operand is a NaN (max exponent field
// with a nonzero fraction).
s16 __unorddf2(u64 a, u64 b) {
    if ((((a >> DEXP_SHIFT) & 0x7FF) == 0x7FF) && (a & DMANT_MASK) != 0)
        return 1;
    if ((((b >> DEXP_SHIFT) & 0x7FF) == 0x7FF) && (b & DMANT_MASK) != 0)
        return 1;
    return 0;
}
|
|
|
|
// Comparison wrappers over __cmpdf2, all returning boolean 0/1.
// NOTE(review): these do NOT follow the stock libgcc conventions
// (e.g. libgcc's __ltdf2 returns a value < 0 iff a < b; here a < b
// yields +1). Presumably this W65816 backend lowers float compares
// by testing the returned value's truth directly — confirm against
// the compiler's libcall lowering before reusing with a stock
// toolchain. (__eqdf2/__nedf2 happen to match libgcc's "zero iff
// equal" contract.)
s16 __eqdf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __nedf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __ltdf2(u64 a, u64 b) { return __cmpdf2(a, b) < 0; }
s16 __ledf2(u64 a, u64 b) { return __cmpdf2(a, b) <= 0; }
s16 __gtdf2(u64 a, u64 b) { return __cmpdf2(a, b) > 0; }
s16 __gedf2(u64 a, u64 b) { return __cmpdf2(a, b) >= 0; }
|
|
|
|
// double <-> float conversions.
|
|
u64 __extendsfdf2(u32 x) {
|
|
u64 sign = ((u64)x & 0x80000000UL) << 32;
|
|
s16 e = (s16)((x >> 23) & 0xFF);
|
|
u32 m = x & 0x7FFFFFUL;
|
|
if (e == 0) return sign;
|
|
if (e == 0xFF) {
|
|
return sign | DEXP_MASK | ((u64)m << 29);
|
|
}
|
|
s16 unbiased = e - 127;
|
|
return dpack(sign, unbiased, ((u64)m << 29) | DMANT_LEAD);
|
|
}
|
|
|
|
// Narrow a double to a single bit pattern. The mantissa is
// truncated (no rounding); exponent overflow saturates to infinity
// and underflow flushes to a signed zero.
u32 __truncdfsf2(u64 x) {
    u32 sign = (u32)((x & DSIGN_BIT) >> 32);
    s16 de = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    u64 frac = x & DMANT_MASK;
    if (de == 0) return sign; // zero / flushed subnormal
    if (de == 0x7FF) {
        // inf / NaN: keep the top 23 payload bits.
        return sign | 0x7F800000UL | (u32)(frac >> 29);
    }
    s16 unbiased = de - DEXP_BIAS;
    s16 fexp = unbiased + 127; // re-bias 1023 → 127
    if (fexp >= 255) return sign | 0x7F800000UL; // overflow → inf
    if (fexp <= 0) return sign;                  // underflow → zero
    return sign | ((u32)fexp << 23) | (u32)((frac >> 29) & 0x7FFFFFUL);
}
|
|
|
|
// double <-> integer conversions.
|
|
u64 __floatsidf(s32 x) {
|
|
if (x == 0) return 0;
|
|
u64 sign = (x < 0) ? DSIGN_BIT : 0;
|
|
u64 m = (u64)((x < 0) ? -x : x);
|
|
s16 e = 0;
|
|
while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
|
|
e += 31 + 21; // shift to put bit-31 at bit-52
|
|
return dpack(sign, e, m);
|
|
}
|
|
|
|
|
|
// Convert an unsigned 32-bit int to a double bit pattern. Exact for
// all inputs; result is always non-negative.
u64 __floatunsidf(u32 x) {
    if (x == 0) return 0;
    u64 m = (u64)x;
    s16 e = 31 + 21; // exponent assuming the lead bit sat at bit-31
    // Normalize: walk the leading 1 up to bit 52, adjusting e.
    while (!(m & DMANT_LEAD)) {
        m <<= 1;
        e--;
    }
    return dpack(0, e, m);
}
|
|
|
|
// Convert a double to a signed 32-bit int, truncating toward zero.
// Saturates to INT32_MIN/INT32_MAX for out-of-range, inf, and NaN
// inputs; values with |x| < 1 (and flushed subnormals) become 0.
s32 __fixdfsi(u64 x) {
    u64 neg = x & DSIGN_BIT;
    s16 de = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    if (de == 0) return 0; // zero / flushed subnormal
    if (de == 0x7FF) return neg ? (s32)0x80000000 : 0x7FFFFFFF; // inf / NaN
    s16 unbiased = de - DEXP_BIAS;
    if (unbiased < 0) return 0; // |x| < 1 truncates to zero
    if (unbiased > 30) return neg ? (s32)0x80000000 : 0x7FFFFFFF; // saturate
    u64 m = (x & DMANT_MASK) | DMANT_LEAD;
    // unbiased is in [0, 30], so shift is always in [22, 52].
    s16 shift = 52 - unbiased;
    if (shift >= 0) {
        m >>= shift;
    } else {
        m <<= -shift;
    }
    return neg ? -(s32)m : (s32)m;
}
|
|
|
|
|
|
// __fixunsdfsi: unsigned double → uint32. Saturates to 0 for negative
|
|
// inputs, to 0xFFFFFFFF for inputs >= 2^32. Used by clang when casting
|
|
// double values to unsigned integer types.
|
|
u32 __fixunsdfsi(u64 x) {
|
|
if (x & DSIGN_BIT) return 0; // negative → 0
|
|
u16 e = (u16)((x >> DEXP_SHIFT) & 0x7FF);
|
|
if (e == 0) return 0;
|
|
if (e == 0x7FF) return 0xFFFFFFFF;
|
|
s16 unbiased = (s16)e - DEXP_BIAS;
|
|
if (unbiased < 0) return 0;
|
|
if (unbiased > 31) return 0xFFFFFFFF;
|
|
u64 m = (x & DMANT_MASK) | DMANT_LEAD;
|
|
if (unbiased >= 52) {
|
|
m <<= (unbiased - 52);
|
|
} else {
|
|
m >>= (52 - unbiased);
|
|
}
|
|
return (u32)m;
|
|
}
|