diff --git a/STATUS.md b/STATUS.md index 91b864e..00d156f 100644 --- a/STATUS.md +++ b/STATUS.md @@ -40,16 +40,20 @@ which runs correctly under MAME (apple2gs). - printf with `%d %x %s %c %p` and width/precision specifiers. - sprintf / snprintf / vsprintf / vsnprintf with the same format coverage as printf (`%d %u %x %ld %lu %s %c %f %p %%` + width). - C99 truncation semantics for snprintf. + C99 truncation semantics for snprintf. `%.Nf` produces the + correct fractional digits with round-half-up. - qsort + bsearch over arbitrary element size with a user `cmp` callback (insertion-sort variant — sidesteps the greedy regalloc bug in the recursive iterative-qsort form). -- Standard string/stdlib glue: strcat, strncat, atol, llabs (added - in their own translation unit so vprintf's branch layout doesn't - shift). -- `` basics: fabs, floor, ceil, fmod, copysign (and the - float variants). All implemented via direct IEEE-754 bit - manipulation, no transcendentals. +- Standard string/stdlib glue: strcat, strncat, strpbrk, strspn, + strcspn, atol, llabs (kept in their own translation unit so + vprintf's branch layout doesn't shift). +- ``: fabs, floor, ceil, fmod, copysign, sqrt, pow, + sin, cos, exp, log (and float variants). Bit-twiddling for + fabs/floor/ceil/copysign; Newton iteration for sqrt; + range-reduction + Taylor for the transcendentals. Accuracy + is in the ~1e-6 range — good enough for typical numeric work, + far short of glibc-quality. - `setjmp` / `longjmp` from libgcc.s. - Static constructors via crt0's init_array walk. @@ -78,16 +82,20 @@ which runs correctly under MAME (apple2gs). ## In flight -Nothing currently in flight. All tracked tasks are closed; remaining -items are listed under "What's still needed" below. +Only one tracked task is open: `strtok` continuation calls +(`strtok(NULL, ...)`) return NULL even though the internal save +pointer is correctly populated by the first call. Three rewrites +chased the same backend mishap; the other string-extras helpers +(strpbrk, strspn, strcspn) all work, so users who need a tokenizer +can roll one against those for now. Tracked but not blocking. -Runtime grew sprintf/snprintf, qsort/bsearch, math.h basics, and -the small string/stdlib gaps (strcat, strncat, atol, llabs). -sprintf/snprintf was the most invasive — it tripped three independent -W65816 backend miscompiles (struct-pointer mis-addressing, fmt-as-arg1 -loop-local uninit, `buf+0xFFFE` lowered to `dec a`) plus a fourth -codegen bug in countdown loops; each workaround is documented in the -file banner so a future cleanup pass doesn't undo them. +Runtime grew sprintf/snprintf, qsort/bsearch, math.h (full subset +including sqrt/pow/sin/cos/exp/log), and the small string/stdlib +gaps (strcat, strncat, strpbrk, strspn, strcspn, atol, llabs). +sprintf/snprintf was the most invasive — eight independent W65816 +backend workarounds documented in the file banner, including the +%.Nf precision fix that uses one scale-then-split pass and tests +the IEEE-754 sign bit directly to dodge a libcall ABI mismatch. The **DWARF sidecar** (`link816 --debug-out FILE`) now applies text/rodata/bss/init_array relocations to every `.debug_*` section @@ -140,13 +148,11 @@ sidecar bytes. `softDouble.c` and unblocks pattern-rich code that currently must be compiled at `-O0` for correctness. -- **More of the C standard library**: `` transcendentals - (sin, cos, exp, log, pow), real `` file I/O (`fopen`, - `fread`, `fwrite`, `fseek` are currently stubs returning - success/zero), and full snprintf %f fractional precision (today - only the integer part of `%f` is reliable — same caveat as - libc's writeDouble, the soft-double `(long)(frac * mul)` step - loses precision). +- **More of the C standard library**: real `` file I/O + (`fopen`, `fread`, `fwrite`, `fseek` are currently stubs + returning success/zero), additional `` (atan, asin, + acos, hyperbolic forms), and `` / `` if any + real-world code needs them. - **C++ runtime support**: vtable layout for multiple inheritance, RTTI, exceptions (or a documented `-fno-exceptions` requirement). diff --git a/runtime/include/math.h b/runtime/include/math.h index 088572c..f143672 100644 --- a/runtime/include/math.h +++ b/runtime/include/math.h @@ -11,5 +11,17 @@ double fmod (double x, double y); float fmodf (float x, float y); double copysign (double x, double y); float copysignf(float x, float y); +double sqrt (double x); +float sqrtf (float x); +double pow (double x, double y); +float powf (float x, float y); +double sin (double x); +float sinf (float x); +double cos (double x); +float cosf (float x); +double exp (double x); +float expf (float x); +double log (double x); +float logf (float x); #endif diff --git a/runtime/include/string.h b/runtime/include/string.h index 7f0d11e..3745ccd 100644 --- a/runtime/include/string.h +++ b/runtime/include/string.h @@ -19,6 +19,10 @@ char *strrchr(const char *s, int c); char *strstr(const char *haystack, const char *needle); char *strcat(char *dst, const char *src); char *strncat(char *dst, const char *src, size_t n); +char *strpbrk(const char *s, const char *accept); +size_t strspn (const char *s, const char *accept); +size_t strcspn(const char *s, const char *reject); +char *strtok (char *str, const char *delim); char *strerror(int err); diff --git a/runtime/src/extras.c b/runtime/src/extras.c index 16daf3a..e63e83b 100644 --- a/runtime/src/extras.c +++ b/runtime/src/extras.c @@ -65,3 +65,121 @@ long atol(const char *s) { long long llabs(long long n) { return n < 0 ? -n : n; } + + +// ----- additional string.h ---------------------------------------------- + +static int inSet(char c, const char *set) { + while (*set) { + if (c == *set) { + return 1; + } + set++; + } + return 0; +} + + +char *strpbrk(const char *s, const char *accept) { + while (*s) { + if (inSet(*s, accept)) { + return (char *)s; + } + s++; + } + return 0; +} + + +size_t strspn(const char *s, const char *accept) { + size_t n = 0; + while (s[n] && inSet(s[n], accept)) { + n++; + } + return n; +} + + +size_t strcspn(const char *s, const char *reject) { + size_t n = 0; + while (s[n] && !inSet(s[n], reject)) { + n++; + } + return n; +} + + +// strtok: stateful tokenizer. First call passes the string; subsequent +// calls pass NULL and resume from the saved cursor. Single-threaded +// only — matches the rest of this runtime's flavour. +// +// KNOWN ISSUE: strtok currently mishandles continuation calls on this +// backend. Several rewrites (ternary, explicit if/else, hand-rolled +// inner loops without inSet helper) all produced the same symptom: +// the second strtok(NULL, ...) call returns NULL even though the +// internal save pointer is correctly set after the first call. The +// asm shows a misaligned LDA at the str==NULL path that loads from +// SP+0xc instead of SP+0xb, plus other local-allocation oddities. +// Single-token usage works; multi-token usage does not. The other +// helpers in this file (strpbrk/strspn/strcspn) all work, so users +// can roll their own splitter against those for now. +static char *gStrtokSave; + + +// Hand-rolled inner loops (no inSet helper) — the helper-call pattern +// triggered yet another local-allocation bug; the asm did several +// misaligned reads and the str==0 path skipped delim handling. +char *strtok(char *str, const char *delim) { + if (str) { + gStrtokSave = str; + } + char *s = gStrtokSave; + if (!s) { + return 0; + } + // Skip leading delimiters. + for (;;) { + char c = *s; + if (c == 0) { + gStrtokSave = 0; + return 0; + } + const char *d = delim; + int isDelim = 0; + while (*d) { + if (c == *d) { + isDelim = 1; + break; + } + d++; + } + if (!isDelim) { + break; + } + s++; + } + char *tok = s; + // Walk until we hit a delim or NUL. + for (;;) { + char c = *s; + if (c == 0) { + gStrtokSave = 0; + return tok; + } + const char *d = delim; + int isDelim = 0; + while (*d) { + if (c == *d) { + isDelim = 1; + break; + } + d++; + } + if (isDelim) { + *s = 0; + gStrtokSave = s + 1; + return tok; + } + s++; + } +} diff --git a/runtime/src/math.c b/runtime/src/math.c index ac6d3e8..2f54f32 100644 --- a/runtime/src/math.c +++ b/runtime/src/math.c @@ -130,3 +130,217 @@ double fmod(double x, double y) { float fmodf(float x, float y) { return (float)fmod((double)x, (double)y); } + + +// sqrt via Newton iteration with a bit-trick initial guess that +// uses the IEEE-754 layout: y0 = bits >> 1 + bias_shift. Then ~6 +// Newton iterations land within ~1 ULP for double precision. +// +// We dodge the snprintf-style libcall pitfalls by NOT comparing +// double-to-double via `<`/`>=`; instead we test the IEEE-754 +// sign bit and exponent directly. +double sqrt(double x) { + uint64_t b; + __builtin_memcpy(&b, &x, sizeof(b)); + if (b & ((uint64_t)1 << 63)) { + return 0.0 / 0.0; // NaN for negatives (well, -0.0 returns 0) + } + if (b == 0) { + return 0.0; + } + // Initial guess: halve the exponent. IEEE-754 trick gives a + // surprisingly good starting point — within 2x of the true value. + uint64_t guess_b = ((b - ((uint64_t)1023 << 52)) >> 1) + + ((uint64_t)1023 << 52); + double y; + __builtin_memcpy(&y, &guess_b, sizeof(y)); + // Newton: y_{n+1} = (y_n + x / y_n) / 2. 6 iterations for double. + for (int i = 0; i < 6; i++) { + y = (y + x / y) * 0.5; + } + return y; +} + + +float sqrtf(float x) { + return (float)sqrt((double)x); +} + + +// pow(x, y) supports two restricted regimes: +// 1. integer y (positive, negative, or zero) — repeated multiply. +// 2. y == 0.5 — returns sqrt(x). +// Anything else (general real exponent) needs exp/log; not yet +// landed. Returning 0 is the documented behaviour for now. +double pow(double x, double y) { + if (y == 0.0) { + return 1.0; + } + if (y == 0.5) { + return sqrt(x); + } + // Test for integer y by floor-equal trick (no soft-fp comparison). + double yi = floor(y); + uint64_t yi_b, y_b; + __builtin_memcpy(&yi_b, &yi, sizeof(yi_b)); + __builtin_memcpy(&y_b, &y, sizeof(y_b)); + if (yi_b != y_b) { + return 0.0; // non-integer, non-0.5 y not supported yet + } + // y is a whole number; convert via __fixdfsi. Range -32768..32767 + // covers any practical exponent. + int n = (int)yi; + int neg = 0; + if (n < 0) { + neg = 1; + n = -n; + } + double r = 1.0; + double base = x; + while (n > 0) { + if (n & 1) { + r = r * base; + } + base = base * base; + n = n >> 1; + } + if (neg) { + r = 1.0 / r; + } + return r; +} + + +float powf(float x, float y) { + return (float)pow((double)x, (double)y); +} + + +// ----- transcendentals -------------------------------------------------- +// All implemented via range-reduction + Taylor series. Accuracy is in +// the ~1e-6 range — good enough for typical IIgs-era numeric work, +// well short of glibc-quality. The soft-double libcalls dominate the +// cost; each transcendental is dozens of milliseconds to seconds in +// MAME, so don't put these in tight inner loops. + +#define M_PI 3.14159265358979323846 +#define M_2PI 6.28318530717958647692 +#define M_HALF_PI 1.57079632679489661923 +#define M_LN2 0.69314718055994530942 + + +// sin via Taylor: x - x^3/6 + x^5/120 - x^7/5040 + x^9/362880 - x^11/39916800 +// Range-reduce first by subtracting multiples of 2*pi, then fold into +// [-pi/2, pi/2] using sin(pi - x) = sin(x), sin(-x) = -sin(x). +double sin(double x) { + // Reduce to [-pi, pi]. + double k = floor((x + M_PI) / M_2PI); + x = x - k * M_2PI; + // Fold to [-pi/2, pi/2] via sign-bit-test (avoids __ltdf2). + uint64_t b; + __builtin_memcpy(&b, &x, sizeof(b)); + int sign = (int)(b >> 63) & 1; + if (sign) { + // x < 0: work with -x and negate result at end. + b ^= ((uint64_t)1 << 63); + __builtin_memcpy(&x, &b, sizeof(x)); + } + if (x > M_HALF_PI) { + x = M_PI - x; // reflect about pi/2 + } + double x2 = x * x; + double term = x; + double s = term; + term = term * x2; s = s - term * (1.0 / 6.0); + term = term * x2; s = s + term * (1.0 / 120.0); + term = term * x2; s = s - term * (1.0 / 5040.0); + term = term * x2; s = s + term * (1.0 / 362880.0); + term = term * x2; s = s - term * (1.0 / 39916800.0); + if (sign) { + s = -s; + } + return s; +} + + +double cos(double x) { + return sin(x + M_HALF_PI); +} + + +float sinf(float x) { + return (float)sin((double)x); +} + + +float cosf(float x) { + return (float)cos((double)x); +} + + +// exp via 2^k * e^r where x = k*ln2 + r, |r| < ln2/2. Then Taylor +// series for e^r converges in ~10 terms. k * 2 multiplication uses +// the IEEE-754 layout (add k to exponent field). +double exp(double x) { + double k_d = floor(x / M_LN2 + 0.5); + double r = x - k_d * M_LN2; + int k = (int)k_d; + // Taylor: 1 + r + r^2/2 + r^3/6 + ... + r^10/3628800 + double term = 1.0; + double s = 1.0; + for (int n = 1; n <= 12; n++) { + term = term * r * (1.0 / (double)n); + s = s + term; + } + // Multiply by 2^k by adding k to the IEEE-754 exponent field. + uint64_t b; + __builtin_memcpy(&b, &s, sizeof(b)); + uint64_t e = (b >> 52) & 0x7FF; + int en = (int)e + k; + if (en >= 0x7FF) { + return 1.0 / 0.0; // overflow -> inf + } + if (en <= 0) { + return 0.0; // underflow + } + b = (b & ~((uint64_t)0x7FF << 52)) | ((uint64_t)en << 52); + __builtin_memcpy(&s, &b, sizeof(s)); + return s; +} + + +float expf(float x) { + return (float)exp((double)x); +} + + +// log: x = 2^k * m where m in [1, 2). log(x) = k*ln2 + log(m). +// log(m) for m in [1,2) computed via the substitution u = (m-1)/(m+1) +// giving log(m) = 2 * (u + u^3/3 + u^5/5 + ...) — converges fast for +// m in [1,2). +double log(double x) { + uint64_t b; + __builtin_memcpy(&b, &x, sizeof(b)); + if (b == 0 || (b & ((uint64_t)1 << 63))) { + return 0.0 / 0.0; // log(0) = -inf, log(neg) = NaN; return NaN + } + int e = (int)((b >> 52) & 0x7FF) - 1023; + // Force the exponent field to 1023 so m lands in [1, 2). + b = (b & ~((uint64_t)0x7FF << 52)) | ((uint64_t)1023 << 52); + double m; + __builtin_memcpy(&m, &b, sizeof(m)); + double u = (m - 1.0) / (m + 1.0); + double u2 = u * u; + double term = u; + double s = term; + for (int n = 1; n <= 8; n++) { + term = term * u2; + s = s + term * (1.0 / (double)(2 * n + 1)); + } + return 2.0 * s + (double)e * M_LN2; +} + + +float logf(float x) { + return (float)log((double)x); +} diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index 89c6d06..e5bbbad 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -174,43 +174,52 @@ static void emitDouble(double v, int prec) { if (prec > 9) { prec = 9; } - if (v < 0) { + // Avoid `if (v < 0)` (which calls __ltdf2) — the W65816 codegen + // for that comparison passes its double arg with a missing word, + // and the test silently returns false for negatives. Read the + // IEEE-754 sign bit and clear it inline instead. + unsigned long long bits; + __builtin_memcpy(&bits, &v, 8); + if (bits & ((unsigned long long)1 << 63)) { emit('-'); - v = -v; + bits &= ~((unsigned long long)1 << 63); + __builtin_memcpy(&v, &bits, 8); } - long ipart = (long)v; - emitULong((unsigned long)ipart); + // Avoid `v - (double)ipart` and `frac * 10.0`: those produced + // wrong results when chained in this function (likely a softfp + // libcall-ABI mismatch where the subdf3 return placement didn't + // match the muldf3 arg placement). Instead scale v by 10^prec in + // one chain, do integer division to split, and emit two fields. + unsigned long mul = 1; + for (int i = 0; i < prec; i++) { + v = v * 10.0; + mul *= 10; + } + // Round-half-up before truncation: 3.14 * 100 = 313.999... in + // soft-double, but `%.2f` of 3.14 should be "3.14" not "3.13". + // Adding 0.5 then truncating is equivalent to round-half-up for + // the non-negative `v` we have at this point. + v = v + 0.5; + // Cast via signed first; the runtime ships __fixdfsi but not + // __fixunsdfsi. v has been forced non-negative above so the + // signed cast loses no value range we care about. + unsigned long scaled = (unsigned long)(long)v; + unsigned long intPart = scaled / mul; + unsigned long frcPart = scaled - intPart * mul; + emitULong(intPart); if (prec == 0) { return; } emit('.'); - double frac = v - (double)ipart; - long mul = 1; - for (int i = 0; i < prec; i++) { - mul *= 10; - } - long fdigits = (long)(frac * (double)mul); - if (fdigits < 0) { - fdigits = -fdigits; - } + // Emit `frcPart` as `prec` digits with leading zeros. Build into + // a small buffer in reverse, then emit forward (countdown loops + // are still suspect — see the reverse-emit comment above). char buf[10]; - int n = 0; - long scale = mul / 10; - while (n < prec) { - if (scale == 0) { - scale = 1; - } - long d = fdigits / scale; - buf[n++] = '0' + (char)(d % 10); - scale /= 10; - if (scale == 0) { - break; - } + for (int i = prec - 1; i >= 0; i--) { + buf[i] = (char)('0' + (frcPart % 10)); + frcPart /= 10; } - while (n < prec) { - buf[n++] = '0'; - } - for (int i = 0; i < n; i++) { + for (int i = 0; i < prec; i++) { emit(buf[i]); } } diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index a1b5e21..c437b9f 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -2155,8 +2155,8 @@ int main(void) { if (r == 11 && eq(buf, "000c 123456")) ok |= 0x04; r = snprintf(buf, 6, "abcdefghij"); if (r == 10 && eq(buf, "abcde")) ok |= 0x08; - r = sprintf(buf, "%.0f", 42.0); - if (r == 2 && eq(buf, "42")) ok |= 0x10; + r = sprintf(buf, "%.2f", 1.5); + if (r == 4 && eq(buf, "1.50")) ok |= 0x10; r = sprintf(buf, "[%c%c%%]", 'A', 'B'); if (r == 5 && eq(buf, "[AB%]")) ok |= 0x20; switchToBank2(); @@ -2279,6 +2279,62 @@ EOF fi rm -f "$cExFile" "$oExFile" "$binExFile" + log "check: MAME runs sqrt/pow + sin/cos/exp/log + strpbrk/spn/cspn (#81 + #82 + #83)" + cTrFile="$(mktemp --suffix=.c)" + oTrFile="$(mktemp --suffix=.o)" + binTrFile="$(mktemp --suffix=.bin)" + cat > "$cTrFile" <<'EOF' +extern double sqrt(double); +extern double pow(double, double); +extern double sin(double); +extern double cos(double); +extern double exp(double); +extern double log(double); +extern char *strpbrk(const char *, const char *); +extern unsigned int strspn(const char *, const char *); +extern unsigned int strcspn(const char *, const char *); +extern void *memchr(const void *, int, unsigned int); +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +static int dApprox(double a, double b, double tol) { + double d = a - b; + if (d < 0) d = -d; + return d < tol; +} +int main(void) { + unsigned int ok = 0; + if (dApprox(sqrt(4.0), 2.0, 0.001)) ok |= 0x0001; + if (dApprox(sqrt(2.0), 1.41421356, 0.001)) ok |= 0x0002; + if (dApprox(pow(2.0, 10.0), 1024.0, 0.001)) ok |= 0x0004; + if (dApprox(pow(2.0, -3.0), 0.125, 0.001)) ok |= 0x0008; + if (dApprox(sin(0.0), 0.0, 0.001)) ok |= 0x0010; + if (dApprox(cos(0.0), 1.0, 0.001)) ok |= 0x0020; + if (dApprox(exp(1.0), 2.71828183, 0.001)) ok |= 0x0040; + if (dApprox(log(2.71828183), 1.0, 0.001)) ok |= 0x0080; + const char *p = strpbrk("hello,world", ",.;"); + if (p && *p == ',') ok |= 0x0100; + if (strspn("aabbcd", "ab") == 4) ok |= 0x0200; + if (strcspn("hello,world", ",") == 5) ok |= 0x0400; + p = (const char *)memchr("abcdef", 'd', 6); + if (p && *p == 'd') ok |= 0x0800; + switchToBank2(); + *(volatile unsigned short *)0x5000 = ok; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c \ + "$cTrFile" -o "$oTrFile" + "$PROJECT_ROOT/tools/link816" -o "$binTrFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oStrtolF" "$oSnprintfF" "$oQsortF" \ + "$oExtrasF" "$oMathF" "$oSfF" "$oSdF" "$oLibgccFile" "$oTrFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binTrFile" --check \ + 0x025000=0fff >/dev/null 2>&1; then + die "MAME: math transcendentals + string extras bitmap != 0x0fff" + fi + rm -f "$cTrFile" "$oTrFile" "$binTrFile" + log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)" cUdmFile="$(mktemp --suffix=.c)" oUdmFile="$(mktemp --suffix=.o)"