// Buffer-formatting siblings of printf — kept in their own translation
// unit so the shared writeXxx helpers don't have to take a function-
// pointer sink (indirect call cost on this target) and so adding the
// formatter to libc.c can't shift vprintf's branch distances out of
// range (per the strtol.c precedent).
//
// Functions:
//   int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap);
//   int snprintf (char *buf, size_t n, const char *fmt, ...);
//   int vsprintf (char *buf, const char *fmt, va_list ap);
//   int sprintf  (char *buf, const char *fmt, ...);
//
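// Format support matches vprintf: %d %i %u %x %X %c %s %p %f %% and the
// `l` length modifier (%ld %lu).  Width is honoured for %x/%X only, as
// zero padding; %X prints lowercase digits.  %e/%E/%g/%G/%F are accepted
// but rendered exactly like %f.  %f precision is capped at 9 fractional
// digits.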
//
// Return value: number of characters that would have been written had
// the buffer been unbounded (C99 vsnprintf semantics), not just the
// number actually written.  This lets callers detect truncation.
//
// **Sink state lives in file-static globals** instead of an explicit
// struct passed by pointer.  Two W65816 backend bugs forced this:
//   (1) The address of a stack-resident struct is computed wrong
//       (&s came out as SP+5 = address of s.end instead of SP+3).
//       emit() then read garbage cur/end values, the cur >= end branch
//       skipped every write, and snprintf returned the right length
//       with an empty buffer.
//   (2) Functions taking fmt as arg1 (stack) didn't initialize the
//       fmt local before the loop body — first char came from the
//       arg slot but the loop's fmt++ ran on uninitialized memory.
//       Fixed by making fmt always arg0 (A reg).
// Single-threaded use only, but that matches the rest of this runtime.

// Local definitions of size_t and the varargs helpers.  No standard
// header is included in the visible part of this file, so the types are
// declared here and the va_* macros forward directly to the compiler
// builtins of the same name.
typedef unsigned int  size_t;
typedef __builtin_va_list  va_list;
#define va_start(ap, last) __builtin_va_start(ap, last)
#define va_arg(ap, ty)     __builtin_va_arg(ap, ty)
#define va_end(ap)         __builtin_va_end(ap)


// Output sink shared by every emit* helper.  Each public entry point
// resets all three before it calls format().
static char   *gCur;    // next byte to be written
static char   *gEnd;    // write limit: emit() stores only while gCur < gEnd
static size_t  gTotal;  // characters emitted so far, stored or dropped


// Push one character into the sink.
//
// The character always counts towards gTotal.  It is stored (and gCur
// advanced) only while gCur is still below gEnd; once the window is
// full, further characters are dropped but still counted.
__attribute__((noinline))
static void emit(char c) {
    gTotal += 1;
    if (gCur >= gEnd) {
        return;
    }
    *gCur = c;
    gCur += 1;
}


// Emit every character of the NUL-terminated string p (terminator
// excluded).  A null p is rendered as the literal text "(null)".
__attribute__((noinline))
static void emitStr(const char *p) {
    const char *s = p ? p : "(null)";
    for (; *s != '\0'; s++) {
        emit(*s);
    }
}


// Emit n as unsigned decimal, with no padding and no leading zeros.
//
// Digits are produced least-significant first into buf and then emitted
// in reverse order.  buf has room for 6 digits.
// NOTE(review): that is only sufficient if unsigned int is 16 bits (max
// 65535, 5 digits) — confirm before building this for a wider-int target.
__attribute__((noinline))
static void emitUDec(unsigned int n) {
    char buf[6];
    int  i = 0;
    if (n == 0) {
        // The digit loop below produces nothing for 0; print it directly.
        emit('0');
        return;
    }
    while (n > 0) {
        buf[i++] = '0' + (n % 10);
        n /= 10;
    }
    // Reverse-emit using forward index arithmetic.  The natural
    // countdown forms (`while (i > 0) emit(buf[--i])`,
    // `while (i > 0) { i--; emit(buf[i]); }`,
    // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
    // do-while whose `dec a; bpl` exit condition runs the loop one
    // extra time on this backend, leaking a buf[-1] read.  The forward
    // count + index-arithmetic form below avoids the bad lowering.
    int top = i;
    for (int j = 0; j < top; j++) {
        emit(buf[top - 1 - j]);
    }
}


// Emit n as signed decimal: a leading '-' for negative values followed
// by the magnitude, which is printed by emitUDec.
__attribute__((noinline))
static void emitDec(int n) {
    if (n < 0) {
        emit('-');
        // Negate in unsigned arithmetic.  `-n` is signed overflow
        // (undefined behaviour) when n is the most negative int, whereas
        // 0u - (unsigned int)n wraps to the correct magnitude for every n.
        emitUDec(0u - (unsigned int)n);
    } else {
        emitUDec((unsigned int)n);
    }
}


// Emit n as unsigned decimal, with no padding and no leading zeros.
// This is the `unsigned long` counterpart of emitUDec.
//
// Digits are produced least-significant first into buf and then emitted
// in reverse order.  buf has room for 11 digits.
// NOTE(review): that covers a 32-bit unsigned long (at most 10 digits);
// confirm the width of long before building this for another target.
__attribute__((noinline))
static void emitULong(unsigned long n) {
    char buf[11];
    int  i = 0;
    if (n == 0) {
        // The digit loop below produces nothing for 0; print it directly.
        emit('0');
        return;
    }
    while (n > 0) {
        buf[i++] = '0' + (n % 10);
        n /= 10;
    }
    // Reverse-emit using forward index arithmetic.  The natural
    // countdown forms (`while (i > 0) emit(buf[--i])`,
    // `while (i > 0) { i--; emit(buf[i]); }`,
    // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
    // do-while whose `dec a; bpl` exit condition runs the loop one
    // extra time on this backend, leaking a buf[-1] read.  The forward
    // count + index-arithmetic form below avoids the bad lowering.
    int top = i;
    for (int j = 0; j < top; j++) {
        emit(buf[top - 1 - j]);
    }
}


// Emit n as signed decimal: a leading '-' for negative values followed
// by the magnitude, which is printed by emitULong.
__attribute__((noinline))
static void emitSignedLong(long n) {
    if (n < 0) {
        emit('-');
        // Negate in unsigned arithmetic.  `-n` is signed overflow
        // (undefined behaviour) when n is the most negative long, whereas
        // 0UL - (unsigned long)n wraps to the correct magnitude for
        // every n.
        emitULong(0UL - (unsigned long)n);
    } else {
        emitULong((unsigned long)n);
    }
}


// Emit n as lowercase hexadecimal, left-padded with '0' so that at least
// `width` characters are produced.  A width that does not exceed the
// natural digit count (including 0 or a negative value) adds no padding.
//
// Padding is sent straight to the sink instead of being staged in buf:
// width comes unchecked from the format string, so staging it would let
// a conversion such as "%8x" write past the end of the digit buffer.
__attribute__((noinline))
static void emitHex(unsigned int n, int width) {
    static const char digits[] = "0123456789abcdef";
    // Two hex digits per byte is the most any value of n can need.
    char buf[2 * sizeof(unsigned int)];
    int  i = 0;
    if (n == 0) {
        buf[i++] = '0';
    }
    // Collect digits least-significant first.
    while (n > 0) {
        buf[i++] = digits[n & 0xF];
        n >>= 4;
    }
    // Leading zeros: one for every column between the digit count and
    // the requested width.  Counted upwards on purpose (see below).
    for (int pad = i; pad < width; pad++) {
        emit('0');
    }
    // Reverse-emit using forward index arithmetic.  The natural
    // countdown forms (`while (i > 0) emit(buf[--i])` and friends) lower
    // to a do-while whose `dec a; bpl` exit condition runs one extra
    // iteration on this backend, leaking a buf[-1] read.  Counting
    // forward and indexing from the top avoids the bad lowering.
    int top = i;
    for (int j = 0; j < top; j++) {
        emit(buf[top - 1 - j]);
    }
}


// Emit v in fixed notation: optional '-', integer part, and — unless
// prec is 0 — a '.' followed by exactly prec fractional digits.
//
// prec < 0 selects the default of 6; prec is capped at 9 so that
// 10^prec still fits in a 32-bit long.
//
// The value is rounded to prec fractional digits (half rounds away from
// zero) rather than truncated, and a fraction that rounds up to 1.0
// carries into the integer part, so 0.9999999 prints as "1.000000" and
// 2.675 no longer prints as "2.674999".
//
// Limitations: the integer part goes through a conversion to long, so
// magnitudes outside the range of long, NaN and infinities are not
// handled.
__attribute__((noinline))
static void emitDouble(double v, int prec) {
    if (prec < 0) {
        prec = 6;
    }
    if (prec > 9) {
        prec = 9;
    }
    if (v < 0) {
        emit('-');
        v = -v;
    }

    // mul = 10^prec: the scale that turns the fraction into an integer.
    long mul = 1;
    for (int i = 0; i < prec; i++) {
        mul *= 10;
    }

    long   ipart   = (long)v;
    double frac    = v - (double)ipart;
    // frac is in [0, 1), so the scaled, rounded value is in [0, mul].
    long   fdigits = (long)(frac * (double)mul + 0.5);
    if (fdigits >= mul) {
        // Rounding pushed the fraction up to 1.0: carry into the
        // integer part.
        fdigits -= mul;
        ipart++;
    }

    emitULong((unsigned long)ipart);
    if (prec == 0) {
        return;
    }
    emit('.');

    // Most-significant fractional digit first.  The loop counts upwards
    // and runs exactly prec times; scale is 10^(prec-1-i) >= 1 whenever
    // it is used as a divisor.
    long scale = mul / 10;
    for (int i = 0; i < prec; i++) {
        emit((char)('0' + (fdigits / scale) % 10));
        scale /= 10;
    }
}


// Core formatter behind snprintf / sprintf / vsnprintf / vsprintf.
//
// fmt is arg0 (A register); see banner comment for why the order matters.
//
// Copies ordinary characters of fmt to the sink and expands conversions
// of the form %[width][.prec][l]<spec>:
//   d i           signed decimal (int, or long with `l`)
//   u             unsigned decimal (unsigned int, or unsigned long with `l`)
//   x X           hex from an unsigned int, zero-padded to width; both
//                 spellings print lowercase digits
//   c             single character
//   s             string
//   f F e E g G   all rendered in fixed notation by emitDouble using prec
//   p             "0x" followed by 4 hex digits, read as an unsigned int
//   %             literal '%'
// Any other spec character is echoed as '%' followed by that character.
// width is ignored except for x/X; prec is ignored except for the
// floating-point conversions.  A format string that ends in the middle
// of a conversion produces a single '%' and formatting stops there.
//
// The caller must have initialised gCur, gEnd and gTotal.  On return the
// output is NUL-terminated whenever the window [gCur at entry, gEnd)
// holds at least one byte: after the last stored character if room
// remains, otherwise by overwriting the last byte of the window.  An
// empty window is left untouched.
//
// Returns gTotal converted to int, i.e. the number of characters the
// format expanded to, whether or not they were stored.
static int format(const char *fmt, va_list ap) {
    // Start of the output window, needed by the terminator logic below to
    // recognise a window with no room at all.
    char *start = gCur;

    while (*fmt) {
        char c = *fmt++;
        if (c != '%') {
            emit(c);
            continue;
        }
        int width = 0;
        while (*fmt >= '0' && *fmt <= '9') {
            width = width * 10 + (*fmt - '0');
            fmt++;
        }
        int prec = -1;
        if (*fmt == '.') {
            fmt++;
            prec = 0;
            while (*fmt >= '0' && *fmt <= '9') {
                prec = prec * 10 + (*fmt - '0');
                fmt++;
            }
        }
        int isLong = 0;
        if (*fmt == 'l') {
            isLong = 1;
            fmt++;
        }
        char spec = *fmt;
        if (spec == '\0') {
            // fmt ended inside a conversion ("%", "%5", "%l", ...).  Do
            // not step past the terminator: the outer loop would then
            // keep reading whatever follows the string in memory.
            emit('%');
            break;
        }
        fmt++;
        if (spec == 'd' || spec == 'i') {
            if (isLong) {
                emitSignedLong(va_arg(ap, long));
            } else {
                emitDec(va_arg(ap, int));
            }
        } else if (spec == 'u') {
            if (isLong) {
                emitULong(va_arg(ap, unsigned long));
            } else {
                emitUDec(va_arg(ap, unsigned int));
            }
        } else if (spec == 'x' || spec == 'X') {
            emitHex(va_arg(ap, unsigned int), width);
        } else if (spec == 'c') {
            // char arguments arrive promoted to int.
            emit((char)va_arg(ap, int));
        } else if (spec == 's') {
            emitStr(va_arg(ap, const char *));
        } else if (spec == 'f' || spec == 'F' ||
                   spec == 'g' || spec == 'G' ||
                   spec == 'e' || spec == 'E') {
            emitDouble(va_arg(ap, double), prec);
        } else if (spec == 'p') {
            emit('0');
            emit('x');
            emitHex(va_arg(ap, unsigned int), 4);
        } else if (spec == '%') {
            emit('%');
        } else {
            // Unknown conversion: echo it verbatim.
            emit('%');
            emit(spec);
        }
    }

    if (gCur < gEnd) {
        *gCur = '\0';
    } else if (gEnd > start) {
        // Window is full: sacrifice the last stored character for the
        // terminator.  Comparing against start (rather than against
        // address 0) keeps this from writing to start[-1] when the window
        // is empty, or to an unrelated address when gEnd lies below start.
        gEnd[-1] = '\0';
    }
    return (int)gTotal;
}


// Format into buf, using the n bytes starting at buf as the output
// window.  Characters that do not fit are dropped but still counted.
//
// Returns the value produced by format(): the full length of the
// expanded text, which callers can compare against n to detect
// truncation.
//
// NOTE(review): with n == 0 the window is empty; whether a terminator is
// still stored in that case is decided inside format() — check that path
// before passing a zero size together with a non-null buf.
int snprintf(char *buf, size_t n, const char *fmt, ...) {
    va_list ap;
    int     written;

    // Reset the shared sink to cover [buf, buf + n).
    gTotal = 0;
    gCur   = buf;
    gEnd   = buf + n;

    va_start(ap, fmt);
    written = format(fmt, ap);
    va_end(ap);
    return written;
}


// Format into buf with no caller-supplied size limit.  Returns the value
// produced by format(), i.e. the number of characters emitted.
int sprintf(char *buf, const char *fmt, ...) {
    va_list ap;
    int     written;

    gTotal = 0;
    gCur   = buf;
    // "Unbounded" is expressed as an absolute sentinel at the top of the
    // bank, not as buf + 0xFFFE.  The relative form looks harmless, but
    // clang turns the +0xFFFE into a `dec a; dec a` peephole (0xFFFE is
    // -2 in 16-bit), which yields gEnd = buf - 2; emit()'s `cur < end`
    // test is then never true and nothing is ever written.
    gEnd   = (char *)0xFFFF;

    va_start(ap, fmt);
    written = format(fmt, ap);
    va_end(ap);
    return written;
}


// va_list form of snprintf: format into the n-byte window starting at
// buf, dropping (but still counting) whatever does not fit.  Returns the
// value produced by format().
//
// NOTE(review): with n == 0 the window is empty; whether a terminator is
// still stored in that case is decided inside format() — check that path
// before passing a zero size together with a non-null buf.
int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap) {
    // Reset the shared sink to cover [buf, buf + n).
    gTotal = 0;
    gCur   = buf;
    gEnd   = buf + n;
    return format(fmt, ap);
}


// va_list form of sprintf: format into buf with no caller-supplied size
// limit.  Returns the value produced by format().
int vsprintf(char *buf, const char *fmt, va_list ap) {
    gTotal = 0;
    gCur   = buf;
    // Absolute top-of-bank sentinel stands in for "no limit".
    gEnd   = (char *)0xFFFF;
    return format(fmt, ap);
}