// Buffer-formatting siblings of printf — kept in their own translation
// unit so the shared writeXxx helpers don't have to take a function-
// pointer sink (indirect call cost on this target) and so adding the
// formatter to libc.c can't shift vprintf's branch distances out of
// range (per the strtol.c precedent).
//
// Functions:
//   int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap);
//   int snprintf (char *buf, size_t n, const char *fmt, ...);
//   int vsprintf (char *buf, const char *fmt, va_list ap);
//   int sprintf  (char *buf, const char *fmt, ...);
//
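// Format support matches vprintf: %d %i %u %x %X %c %s %p %f %% and the
// `l` length modifier (%ld %lu).  %X prints lowercase digits, same as
// %x; %e/%E/%g/%G are accepted but formatted exactly like %f.  Width is
// honoured for %x/%X only (always zero-padded, capped at 4 digits) and
// is parsed but ignored for every other conversion.  %f precision
// defaults to 6 and is capped at 9 fractional digits.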
//
// Return value: number of characters that would have been written had
// the buffer been unbounded (C99 vsnprintf semantics), not just the
// number actually written.  This lets callers detect truncation.
//
// **Sink state lives in file-static globals** instead of an explicit
// struct passed by pointer.  This was originally a workaround for two
// W65816 backend bugs (since fixed):
//   (1) The address of a stack-resident struct used to be computed
//       wrong (&s came out as SP+5 = address of s.end instead of SP+3).
//   (2) Functions taking fmt as arg1 (stack) didn't initialize the
//       fmt local before the loop body — first char came from the
//       arg slot but the loop's fmt++ ran on uninitialized memory.
// The struct-sink form now compiles correctly, but switching back to it
// would shift every TU's branch distances; left as-is for stability.
// Single-threaded use only, but that matches the rest of this runtime.
//
// Reverse-emit pattern (used by emitUDec / emitULong / emitHex): the
// natural countdown forms (`while (i > 0) emit(buf[--i])`,
// `while (i > 0) { i--; emit(buf[i]); }`,
// `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
// do-while whose `dec a; bpl` exit condition runs the loop one
// extra time on this backend, leaking a `buf[-1]` read.  Use the
// forward count + index-arithmetic form instead.

// Local stand-ins for the <stddef.h> / <stdarg.h> names used below; no
// headers are included in the visible part of this file.  The va_*
// macros forward directly to the compiler builtins.
typedef unsigned long size_t;
typedef __builtin_va_list  va_list;
#define va_start(ap, last) __builtin_va_start(ap, last)
#define va_arg(ap, ty)     __builtin_va_arg(ap, ty)
#define va_end(ap)         __builtin_va_end(ap)


// Sink state shared by every emit* helper.  Each public entry point
// (snprintf, sprintf, vsnprintf, vsprintf) resets all three before it
// calls format().
//   gCur   - next byte to be written in the caller's buffer.
//   gEnd   - writes stop once gCur reaches this address: buf + n for the
//            bounded variants, NULL when n == 0, and the 0xFFFF sentinel
//            for the unbounded sprintf/vsprintf.
//   gTotal - number of characters emitted so far, counting the ones that
//            were dropped because the buffer was full.
static char   *gCur;
static char   *gEnd;
static size_t  gTotal;


// Append one character to the sink.  The byte is stored only while the
// write cursor is still below gEnd, but gTotal is bumped unconditionally
// so the final count reflects the untruncated output length.
__attribute__((noinline))
static void emit(char c) {
    char *slot = gCur;
    if (slot < gEnd) {
        *slot = c;
        gCur  = slot + 1;
    }
    gTotal += 1;
}


// Emit every character of the NUL-terminated string `p` (terminator
// excluded).  A NULL pointer is rendered as the literal text "(null)".
__attribute__((noinline))
static void emitStr(const char *p) {
    const char *cursor = p ? p : "(null)";
    for (; *cursor != '\0'; cursor++) {
        emit(*cursor);
    }
}


// Emit `n` as unsigned decimal with no padding ("0" for zero).
//
// Digits are generated least-significant first into a scratch buffer
// and then emitted in reverse.  The buffer is sized from the width of
// `unsigned int` instead of being hard-coded: bits * 3 / 10 + 2 is
// always >= the number of decimal digits, evaluates to the original 6
// bytes where unsigned int is 16-bit, and can no longer overflow if
// this file is built where unsigned int is wider.
__attribute__((noinline))
static void emitUDec(unsigned int n) {
    char buf[sizeof(unsigned int) * 8 * 3 / 10 + 2];
    int  i = 0;
    if (n == 0) {
        emit('0');
        return;
    }
    while (n > 0) {
        buf[i++] = '0' + (n % 10);
        n /= 10;
    }
    // Reverse-emit; see file header for the forward-index rationale.
    int top = i;
    for (int j = 0; j < top; j++) {
        emit(buf[top - 1 - j]);
    }
}


// Emit `n` as signed decimal: an optional leading '-' followed by the
// magnitude via emitUDec.  The magnitude is computed in unsigned
// arithmetic because negating INT_MIN as a signed value would be
// signed-overflow UB.
__attribute__((noinline))
static void emitDec(int n) {
    unsigned int magnitude = (unsigned int)n;
    if (n < 0) {
        emit('-');
        magnitude = 0u - magnitude;
    }
    emitUDec(magnitude);
}


// Emit `n` as unsigned decimal with no padding ("0" for zero); the
// `unsigned long` counterpart of emitUDec, used for %lu / %ld and for
// the integer part of %f.
//
// The scratch buffer is sized from the width of `unsigned long`
// (bits * 3 / 10 + 2 is always >= the decimal digit count).  That is
// the original 11 bytes for a 32-bit long, and it no longer overflows
// when unsigned long is 64-bit (up to 20 digits).
__attribute__((noinline))
__attribute__((optnone))
static void emitULong(unsigned long n) {
    char buf[sizeof(unsigned long) * 8 * 3 / 10 + 2];
    int  i = 0;
    if (n == 0) {
        emit('0');
        return;
    }
    while (n > 0) {
        buf[i++] = '0' + (n % 10);
        n /= 10;
    }
    // Reverse-emit; see file header for the forward-index rationale.
    int top = i;
    for (int j = 0; j < top; j++) {
        emit(buf[top - 1 - j]);
    }
}


// Emit `n` as signed decimal: an optional leading '-' followed by the
// magnitude via emitULong.  As in emitDec, the magnitude is formed in
// unsigned arithmetic so LONG_MIN does not hit signed-overflow UB.
__attribute__((noinline,optnone))
static void emitSignedLong(long n) {
    unsigned long magnitude = (unsigned long)n;
    if (n < 0) {
        emit('-');
        magnitude = 0ul - magnitude;
    }
    emitULong(magnitude);
}


// Emit `n` in lowercase hexadecimal, zero-padded on the left to at
// least `width` digits.
//
//   n     - value to print; zero prints as a single '0' before padding.
//   width - minimum digit count; values <= 0 mean "no padding".  It is
//           clamped to the number of hex digits an unsigned int can
//           hold, because the scratch buffer is exactly that large —
//           without the clamp `snprintf("%08x", ...)` blew past the
//           buf[] tail and corrupted the stack.
//
// The digit limit is derived from sizeof(unsigned int) rather than
// hard-coded: it is 4 where unsigned int is 16-bit (as on this target),
// and no digits are silently dropped if unsigned int is wider.
__attribute__((noinline))
static void emitHex(unsigned int n, int width) {
    static const char digits[] = "0123456789abcdef";
    // Two hex digits per byte of unsigned int.
    enum { maxDigits = sizeof(unsigned int) * 2 };
    char buf[maxDigits];
    if (width > maxDigits) {
        width = maxDigits;
    }
    int  i = 0;
    if (n == 0) {
        buf[i++] = '0';
    }
    // Least-significant nibble first; reversed on output below.
    while (n > 0 && i < maxDigits) {
        buf[i++] = digits[n & 0xF];
        n >>= 4;
    }
    while (i < width) {
        buf[i++] = '0';
    }
    // Reverse-emit; see file header for the forward-index rationale.
    int top = i;
    for (int j = 0; j < top; j++) {
        emit(buf[top - 1 - j]);
    }
}


// Emit `v` in fixed-point notation: optional '-', integer part, and —
// unless prec is 0 — a '.' followed by exactly `prec` fractional digits.
//
//   v    - value to print.
//   prec - number of fractional digits; negative selects the default of
//          6, and anything above 9 is clamped to 9.
//
// Limitations visible in the code: the value is scaled by 10^prec and
// converted to `long` in one step, so the scaled magnitude must fit in
// a long; there is no range check, and NaN / infinity are not
// special-cased.
__attribute__((noinline))
static void emitDouble(double v, int prec) {
    if (prec < 0) {
        prec = 6;
    }
    if (prec > 9) {
        prec = 9;
    }
    // Avoid `if (v < 0)` (which calls __ltdf2) — the W65816 codegen
    // for that comparison passes its double arg with a missing word,
    // and the test silently returns false for negatives.  Read the
    // IEEE-754 sign bit and clear it inline instead.
    unsigned long long bits;
    __builtin_memcpy(&bits, &v, 8);
    if (bits & ((unsigned long long)1 << 63)) {
        emit('-');
        bits &= ~((unsigned long long)1 << 63);
        __builtin_memcpy(&v, &bits, 8);
    }
    // Avoid `v - (double)ipart` and `frac * 10.0`: those produced
    // wrong results when chained in this function (likely a softfp
    // libcall-ABI mismatch where the subdf3 return placement didn't
    // match the muldf3 arg placement).  Instead scale v by 10^prec in
    // one chain, do integer division to split, and emit two fields.
    unsigned long mul = 1;
    for (int i = 0; i < prec; i++) {
        v = v * 10.0;
        mul *= 10;
    }
    // Round-half-up before truncation: 3.14 * 100 = 313.999... in
    // soft-double, but `%.2f` of 3.14 should be "3.14" not "3.13".
    // Adding 0.5 then truncating is equivalent to round-half-up for
    // the non-negative `v` we have at this point.
    v = v + 0.5;
    // Cast via signed first; the runtime ships __fixdfsi but not
    // __fixunsdfsi.  v has been forced non-negative above so the
    // signed cast loses no value range we care about.
    unsigned long scaled  = (unsigned long)(long)v;
    unsigned long intPart = scaled / mul;
    unsigned long frcPart = scaled - intPart * mul;
    emitULong(intPart);
    if (prec == 0) {
        return;
    }
    emit('.');
    // Emit `frcPart` as `prec` digits with leading zeros.  The digits
    // come out least-significant first, so they are stored from the
    // back of the buffer towards the front and then emitted forward.
    // Both loops count upwards and derive the slot by index arithmetic:
    // the `for (i = prec - 1; i >= 0; i--)` countdown form is one of
    // the shapes that runs one extra iteration on this backend, which
    // here would have stored to buf[-1].
    char buf[10];
    for (int i = 0; i < prec; i++) {
        buf[prec - 1 - i] = (char)('0' + (frcPart % 10));
        frcPart /= 10;
    }
    for (int i = 0; i < prec; i++) {
        emit(buf[i]);
    }
}


// Core formatter shared by snprintf / sprintf / vsnprintf / vsprintf.
//
// Walks `fmt`, copying ordinary characters to the sink and expanding
// each `%[width][.prec][l]<spec>` conversion from `ap`.  The caller
// must have initialised gCur / gEnd / gTotal.  On return the output is
// NUL-terminated when the sink has any capacity, and the result is the
// number of characters the full output needs (terminator excluded),
// whether or not they all fitted.
//
// Conversions: d i u (optionally with `l`), x X (lowercase digits,
// zero-padded to `width`), c, s, p, f F e E g G (all rendered as fixed
// point with `prec`), and %%.  Any other specifier is echoed as '%'
// followed by the specifier character.  A format string that ends in
// the middle of a conversion (e.g. a trailing '%') emits a single '%'
// and stops; previously the NUL was consumed as the specifier and the
// scan ran on past the end of the string.
//
// fmt is arg0 (A register); see banner comment for why the order matters.
// Previously optnone (slot-alias bug under p:16:16; see
// feedback_snprintf_va_arg_slot_alias.md).  Re-enabled greedy under
// ptr32 — testing whether the bug recurs.
static int format(const char *fmt, va_list ap) {
    while (*fmt) {
        char c = *fmt++;
        if (c != '%') {
            emit(c);
            continue;
        }
        // Width and precision saturate once they reach three digits so
        // an absurdly long digit run cannot overflow `int`.  The
        // consumers clamp to far smaller limits (hex digit count, 9
        // fractional digits), so saturating never changes the output.
        int width = 0;
        while (*fmt >= '0' && *fmt <= '9') {
            if (width < 100) {
                width = width * 10 + (*fmt - '0');
            }
            fmt++;
        }
        int prec = -1;
        if (*fmt == '.') {
            fmt++;
            prec = 0;
            while (*fmt >= '0' && *fmt <= '9') {
                if (prec < 100) {
                    prec = prec * 10 + (*fmt - '0');
                }
                fmt++;
            }
        }
        int isLong = 0;
        if (*fmt == 'l') {
            isLong = 1;
            fmt++;
        }
        char spec = *fmt;
        if (spec == '\0') {
            // Incomplete conversion at end of string: keep the '%' and
            // stop without stepping over the terminator.
            emit('%');
            break;
        }
        fmt++;
        if (spec == 'd' || spec == 'i') {
            if (isLong) {
                emitSignedLong(va_arg(ap, long));
            } else {
                emitDec(va_arg(ap, int));
            }
        } else if (spec == 'u') {
            if (isLong) {
                emitULong(va_arg(ap, unsigned long));
            } else {
                emitUDec(va_arg(ap, unsigned int));
            }
        } else if (spec == 'x' || spec == 'X') {
            emitHex(va_arg(ap, unsigned int), width);
        } else if (spec == 'c') {
            emit((char)va_arg(ap, int));
        } else if (spec == 's') {
            emitStr(va_arg(ap, const char *));
        } else if (spec == 'f' || spec == 'F' ||
                   spec == 'g' || spec == 'G' ||
                   spec == 'e' || spec == 'E') {
            emitDouble(va_arg(ap, double), prec);
        } else if (spec == 'p') {
            // NOTE(review): the argument is fetched as `unsigned int`;
            // confirm that matches the pointer width in every ABI this
            // file is built for.
            emit('0');
            emit('x');
            emitHex(va_arg(ap, unsigned int), 4);
        } else if (spec == '%') {
            emit('%');
        } else {
            emit('%');
            emit(spec);
        }
    }
    if (gCur < gEnd) {
        *gCur = '\0';
    } else if (gEnd > (char *)0) {
        // Truncated, but n > 0: overwrite the last byte with NUL so
        // the result is a valid C string.  snprintf with n=0 sets
        // gEnd = NULL up front so this branch correctly skips —
        // previously it wrote `gEnd[-1]` to `buf[-1]`, clobbering
        // memory before the buffer.
        gEnd[-1] = '\0';
    }
    return (int)gTotal;
}


// Format into `buf`, storing at most `n` bytes including the
// terminating NUL.  Returns the length the complete output would have
// had (terminator excluded), so a result >= n signals truncation.
__attribute__((optnone))
int snprintf(char *buf, size_t n, const char *fmt, ...) {
    va_list args;
    int     written;

    gTotal = 0;
    gCur   = buf;
    // n == 0 must NOT touch the buffer (C99 7.19.6.5).  A NULL gEnd
    // makes both `gCur < gEnd` and `gEnd > 0` false inside format(),
    // so not even the NUL terminator gets written.
    if (n != 0) {
        gEnd = buf + n;
    } else {
        gEnd = (char *)0;
    }

    va_start(args, fmt);
    written = format(fmt, args);
    va_end(args);
    return written;
}


// Format into `buf` with no length limit and NUL-terminate the result.
// Returns the number of characters emitted (terminator excluded).
int sprintf(char *buf, const char *fmt, ...) {
    va_list args;
    int     written;

    gTotal = 0;
    gCur   = buf;
    // Unbounded sink: use the absolute top-of-bank sentinel as the end
    // address.  The seemingly equivalent `buf + 0xFFFE` must not be
    // used — clang lowers the +0xFFFE to a `dec a; dec a` peephole
    // (0xFFFE is -2 in 16-bit), which yields gEnd = buf - 2, makes
    // emit()'s `cur < end` test always false, and so writes nothing.
    gEnd   = (char *)0xFFFF;

    va_start(args, fmt);
    written = format(fmt, args);
    va_end(args);
    return written;
}


// va_list form of snprintf: format into `buf`, storing at most `n`
// bytes including the terminating NUL, and return the length the
// complete output would have had.  The caller owns `ap` (va_start /
// va_end are not called here).
int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap) {
    // A NULL limit for n == 0 keeps format() from writing anything,
    // including the terminator.
    char *limit = (char *)0;
    if (n != 0) {
        limit = buf + n;
    }
    gTotal = 0;
    gEnd   = limit;
    gCur   = buf;
    return format(fmt, ap);
}


// va_list form of sprintf: format into `buf` with no length limit and
// return the number of characters emitted (terminator excluded).  The
// caller owns `ap` (va_start / va_end are not called here).
int vsprintf(char *buf, const char *fmt, va_list ap) {
    gTotal = 0;
    // Unbounded sink: 0xFFFF is used as an absolute end-address
    // sentinel rather than an offset from buf.
    gEnd   = (char *)0xFFFF;
    gCur   = buf;
    return format(fmt, ap);
}