65816-llvm-mos/scripts/profile.sh

#!/usr/bin/env bash
# profile.sh - function-attribution profiler under MAME.
#
# Builds a benchmark binary with link816 --map-locals, runs it under
# scripts/runInMameCycles.sh --sample, then attributes the PC samples
# to function symbols using the link816 map (globals + locals) and
# prints a sorted (function, hits, hits%) table.
#
# Usage:
#   profile.sh <benchmark-c-file>           Profile a single .c file
#                                           (e.g. benchmarks/strLen.c).
#                                           The bench wrapper pattern
#                                           mirrors benchCyclesPrecise.sh
#                                           — START/DONE markers around
#                                           ITERS calls.
#
#   profile.sh --bench <name>               Use the benchInputs /
#                                           benchExtern config from
#                                           benchCyclesPrecise.sh (so
#                                           call signatures are known).
#
# Optional flags:
#   --iters N           Override the iteration count (default 200).
#   --fast-mode         Pass through to runInMameCycles --fast-mode.
#   --clock-hz N        Pass through to runInMameCycles --clock-hz.
#   --keep              Don't delete the temp build artefacts (debug).
#   --top N             Show only the top-N functions (default 20).
#   --threshold PCT     Fail if more than PCT% of samples land in '?'
#                       (unattributed; default 10) or the dominant
#                       bucket is below 30%.  Disable with --threshold 0.
#
# Output: markdown-style table with columns FUNCTION / HITS / HITS%.
# Exit 0 on attribution thresholds met, 1 on threshold breach (when
# the dominant function or unattributed percentage doesn't match
# expectations) or harness failure.
#
# Single-sourcing: this script delegates the actual PC sampling to
# runInMameCycles.sh --sample (per reviewer revision — no separate
# runner).  All MAME setup, marker handling, and PC capture live in
# the one runner harness.

# Strict mode is enabled before anything else so that a failure while
# loading common.sh is already fatal.
set -euo pipefail
source "$(dirname "$0")/common.sh"

# PROJECT_ROOT is not assigned in this file — presumably common.sh
# provides it (TODO confirm).  Every location below hangs off it.
llvmBin="$PROJECT_ROOT/tools/llvm-mos-build/bin"
CLANG="$llvmBin/clang"
LLVM_MC="$llvmBin/llvm-mc"
LINK="$PROJECT_ROOT/tools/link816"
BENCH_DIR="$PROJECT_ROOT/benchmarks"
RUNNER="$PROJECT_ROOT/scripts/runInMameCycles.sh"
PC2LINE="$PROJECT_ROOT/scripts/pc2line.py"

# Command-line state.  These are the defaults; the argument loop below
# overrides them.
BENCH_NAME=""
BENCH_FILE=""
ITERS=200
TOP_N=20
KEEP=0
# FAST_MODE / CLOCK_HZ hold ready-made runner flags ("" = not passed).
FAST_MODE=""
CLOCK_HZ=""

# Smoke-check thresholds.  See the --threshold flag docs.
THRESHOLD_PCT=10  # upper bound (%) for the '?' (unattributed) bucket
DOMINANT_MIN=30   # lower bound (%) for the largest bucket

# Per-benchmark inputs — duplicated from benchCyclesPrecise.sh so we
# can profile any bench.  Single source of truth would be nicer; keep
# in sync manually for now.
# benchInputs NAME
#   Print the C call expression used to exercise benchmark NAME.
#   Names without an entry print the sentinel "/* unknown */".
benchInputs() {
    local callExpr
    case "$1" in
        sumOfSquares) callExpr='sumOfSquares(50)' ;;
        fib)          callExpr='fib(10)' ;;
        strcpy)       callExpr='mystrcpy(dst, "hello world!")' ;;
        memcmp)       callExpr='mymemcmp("hello", "hello", 5)' ;;
        bsearch)      callExpr='bsearch(arr, 8, 5)' ;;
        dotProduct)   callExpr='dotProduct(va, vb, 4)' ;;
        popcount)     callExpr='popcount(0x12345678UL)' ;;
        crc32)        callExpr='crc32((const unsigned char *)"hello", 5)' ;;
        strLen)       callExpr='strLen("The quick brown fox jumps over the lazy dog!")' ;;
        djb2Hash)     callExpr='djb2Hash("hello world")' ;;
        *)            callExpr='/* unknown */' ;;
    esac
    printf '%s\n' "$callExpr"
}

# benchExtern NAME
#   Print the C declarations (the extern prototype plus any static
#   fixture data) that the wrapper needs in order to call benchmark
#   NAME.  Names without an entry print an empty line.
benchExtern() {
    local decls
    case "$1" in
        sumOfSquares) decls='extern unsigned long sumOfSquares(unsigned short n);' ;;
        fib)          decls='extern unsigned short fib(unsigned short n);' ;;
        strcpy)       decls='extern char *mystrcpy(char *d, const char *s); static char dst[16];' ;;
        memcmp)       decls='extern int mymemcmp(const void *a, const void *b, unsigned int n);' ;;
        bsearch)      decls='extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};' ;;
        dotProduct)   decls='extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};' ;;
        popcount)     decls='extern int popcount(unsigned long x);' ;;
        crc32)        decls='extern unsigned long crc32(const unsigned char *p, unsigned int n);' ;;
        strLen)       decls='extern unsigned short strLen(const char *s);' ;;
        djb2Hash)     decls='extern unsigned long djb2Hash(const char *s);' ;;
        *)            decls='' ;;
    esac
    printf '%s\n' "$decls"
}

# Parse args.
#
# parseArgs ARGS...
#   Walk the command line and record the result in the globals
#   BENCH_NAME, BENCH_FILE, ITERS, FAST_MODE, CLOCK_HZ, KEEP, TOP_N and
#   THRESHOLD_PCT.  Numeric options are validated here, because their
#   values are later pasted into generated C source (--iters), a row
#   limit (--top) and awk comparisons (--threshold); rejecting garbage
#   up front gives a clear message instead of a confusing failure
#   downstream.  Calls die on any malformed or unknown argument.
parseArgs() {
    while [ $# -gt 0 ]; do
        case "$1" in
            --bench)
                [ $# -ge 2 ] || die "--bench needs a name"
                BENCH_NAME="$2"
                BENCH_FILE="$BENCH_DIR/$BENCH_NAME.c"
                shift 2
                ;;
            --iters)
                [ $# -ge 2 ] || die "--iters needs a value"
                [[ "$2" =~ ^[0-9]+$ ]] \
                    || die "--iters needs a non-negative integer, got: $2"
                # Force base 10: a leading zero would otherwise be read
                # as octal by the C preprocessor (#define ITERS 010) but
                # as decimal by everything else.
                ITERS=$((10#$2))
                shift 2
                ;;
            --fast-mode)
                FAST_MODE="--fast-mode"
                shift
                ;;
            --clock-hz)
                [ $# -ge 2 ] || die "--clock-hz needs a value"
                # Stored as one "flag value" string; the runner
                # invocation splits it back into two words.
                CLOCK_HZ="--clock-hz $2"
                shift 2
                ;;
            --keep)
                KEEP=1
                shift
                ;;
            --top)
                [ $# -ge 2 ] || die "--top needs a value"
                [[ "$2" =~ ^[0-9]+$ ]] \
                    || die "--top needs a non-negative integer, got: $2"
                TOP_N=$((10#$2))
                shift 2
                ;;
            --threshold)
                [ $# -ge 2 ] || die "--threshold needs a value"
                [[ "$2" =~ ^[0-9]+([.][0-9]+)?$ ]] \
                    || die "--threshold needs a non-negative number, got: $2"
                THRESHOLD_PCT="$2"
                shift 2
                ;;
            -h|--help)
                sed -n '1,40p' "$0" | grep '^#'
                exit 0
                ;;
            *)
                # The first bare argument that names an existing file is
                # the benchmark source; anything else is an error.
                if [ -z "$BENCH_FILE" ] && [ -f "$1" ]; then
                    BENCH_FILE="$1"
                    BENCH_NAME=$(basename "$1" .c)
                else
                    die "unknown arg or file not found: $1"
                fi
                shift
                ;;
        esac
    done
}

parseArgs "$@"

# A benchmark source must have been selected and must exist on disk.
if [ -z "$BENCH_FILE" ]; then
    die "usage: $0 <bench.c> | --bench NAME [...]"
fi
if [ ! -f "$BENCH_FILE" ]; then
    die "benchmark file not found: $BENCH_FILE"
fi

# Look up how to declare and call the benchmark.  An empty declaration
# or the "/* unknown */" sentinel means the lookup tables have no entry
# for this name.
call_expr=$(benchInputs "$BENCH_NAME")
extern_decl=$(benchExtern "$BENCH_NAME")
if [ -z "$extern_decl" ]; then
    die "no input config for bench '$BENCH_NAME' — extend benchExtern/benchInputs"
fi
if [ "$call_expr" = "/* unknown */" ]; then
    die "no call config for bench '$BENCH_NAME'"
fi

log "profiling: $BENCH_NAME (iters=$ITERS)"

# Workspace: a fresh temp directory holding every build artefact and
# intermediate file.  It is removed on exit unless --keep was given.
WORK=$(mktemp -d)
if [ "$KEEP" != "1" ]; then
    trap 'rm -rf "$WORK"' EXIT
else
    log "keeping workspace: $WORK"
fi

# Generated wrapper source and the object files.
cwrap="$WORK/wrap.c"
owrap="$WORK/wrap.o"
obench="$WORK/bench.o"
oCrt0="$WORK/crt0.o"
oLibgcc="$WORK/libgcc.o"

# Linked image, its symbol map, and the captured runner output.
bin="$WORK/bench.bin"
map="$WORK/bench.map"
samples="$WORK/samples.txt"

# Generate the C driver that wraps the benchmark call and write it to
# $cwrap.  The here-doc delimiter is unquoted, so the shell substitutes
# $extern_decl, $ITERS and $call_expr into the C text first.
#
# The generated main():
#   1. calls the benchmark 5 times as warm-up,
#   2. stores 0xa1a1 at 0x025000 (START marker),
#   3. calls the benchmark ITERS times, storing each result in the
#      volatile `sink` so the calls cannot be optimised away,
#   4. stores 0xa2a2 at 0x025002 (DONE marker) and then spins forever.
# Presumably the runner watches those two addresses to delimit the
# measured region — confirm against runInMameCycles.sh.
#
# NOTE(review): the loop counters are plain `int`.  If `int` is 16 bits
# on this target, an --iters value above 32767 could never satisfy the
# loop exit condition — confirm the target's int width.
cat > "$cwrap" <<EOF
$extern_decl
volatile unsigned long sink;
#define ITERS $ITERS
int main(void) {
    /* warm-up */
    for (int w = 0; w < 5; w++) sink = (unsigned long)($call_expr);
    /* START / DONE markers in bank 2. */
    *(volatile unsigned short *)0x025000 = 0xa1a1;
    for (int i = 0; i < ITERS; i++) sink = (unsigned long)($call_expr);
    *(volatile unsigned short *)0x025002 = 0xa2a2;
    while (1) {}
}
EOF

# Flags shared by the two assembler runs and the two compiler runs.
mcFlags=(-arch=w65816 -filetype=obj)
ccFlags=(--target=w65816 -O2 -ffunction-sections -c)

# Assemble the runtime (crt0, libgcc), then compile the generated
# wrapper and the benchmark itself.
"$LLVM_MC" "${mcFlags[@]}" "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oCrt0"
"$LLVM_MC" "${mcFlags[@]}" "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLibgcc"
"$CLANG" "${ccFlags[@]}" "$cwrap" -o "$owrap"
"$CLANG" "${ccFlags[@]}" "$BENCH_FILE" -o "$obench"

# Link at text base 0x1000 and emit a symbol map.  --map-locals adds
# libgcc helpers (__udivmod_core etc) and file-static functions to that
# map, so PC samples landing inside them are attributed by name rather
# than falling into the '?' bucket.
linkInputs=("$oCrt0" "$oLibgcc" "$owrap" "$obench")
"$LINK" -o "$bin" --text-base 0x1000 --map "$map" --map-locals \
    "${linkInputs[@]}"

# Run under MAME with --sample.  Capture both MAME-CYCLES and SAMPLE
# lines.
#
# buildRunnerArgs
#   Fill the global array runnerArgs with the runner's argument vector:
#   "$bin" "$ITERS" --sample, then $FAST_MODE if set, then the words of
#   $CLOCK_HZ if set.  CLOCK_HZ is stored as a single "flag value"
#   string, so it is split on whitespace with `read -a`; unlike an
#   unquoted expansion this performs no pathname expansion, so a stray
#   glob character in the value cannot turn into a list of file names.
buildRunnerArgs() {
    local clockArgs=()
    runnerArgs=("$bin" "$ITERS" --sample)
    if [ -n "$FAST_MODE" ]; then
        runnerArgs+=("$FAST_MODE")
    fi
    if [ -n "$CLOCK_HZ" ]; then
        read -r -a clockArgs <<< "$CLOCK_HZ"
        # The ${x[@]+...} form keeps older bash from tripping `set -u`
        # on an empty array.
        runnerArgs+=(${clockArgs[@]+"${clockArgs[@]}"})
    fi
}

buildRunnerArgs
# stdout and stderr both go to $samples; on failure the whole capture is
# replayed on stderr before giving up.
bash "$RUNNER" "${runnerArgs[@]}" > "$samples" 2>&1 || {
    cat "$samples" >&2
    die "runInMameCycles --sample failed"
}

# Pull cycle summary and sample lines.
#
# sampleTotal LINE
#   Print the integer N from the first "total=N" in LINE.  Returns 1
#   and prints nothing when LINE carries no such field, so the caller
#   can report a malformed summary instead of aborting without a
#   message under errexit/pipefail.
sampleTotal() {
    [[ "$1" =~ total=([0-9]+) ]] || return 1
    printf '%s\n' "${BASH_REMATCH[1]}"
}

# `|| true`: a missing line is diagnosed by the -n checks just below,
# not by grep's exit status.
cycles_line=$(grep "^MAME-CYCLES" "$samples" | head -n 1 || true)
total_line=$(grep "^SAMPLES total=" "$samples" | head -n 1 || true)
[ -n "$cycles_line" ] || die "no MAME-CYCLES in output"
[ -n "$total_line" ] || die "no SAMPLES total in output (sampling broken?)"

total=$(sampleTotal "$total_line") || die "malformed SAMPLES line: $total_line"
[ "$total" -gt 0 ] || die "zero samples captured"

log "captured $total samples"
log "$cycles_line"

# Build the (PC, hits) list as a temp file and feed through pc2line.py
# for function attribution.  A single awk does the filtering, so having
# no SAMPLE lines yields an empty file that is diagnosed below, rather
# than a silent abort from a failed grep under pipefail.
pcsfile="$WORK/pcs.txt"
awk '/^SAMPLE 0x/ {print $2, $3}' "$samples" > "$pcsfile"
[ -s "$pcsfile" ] || die "no SAMPLE lines in runner output"

# Use pc2line.py loadMapSymbols/funcAt indirectly via a small Python
# inline.  Single-sourced — no separate symbol resolver lives outside
# pc2line.py.  The resolver's location is handed over explicitly as
# argv[4] ($PC2LINE), so the lookup depends neither on the current
# directory nor on PROJECT_ROOT being exported to child processes.
#
# Output written to $attrib:
#   TOTAL <total>
#   BUCKET <hits> <pct> <function>     (one per function, hits desc)
attrib="$WORK/attrib.txt"
python3 - "$map" "$pcsfile" "$total" "$PC2LINE" > "$attrib" <<'PYEOF'
import importlib.util
import os
import sys

map_path = sys.argv[1]
pcs_path = sys.argv[2]
total    = int(sys.argv[3])
p2l_path = os.path.abspath(sys.argv[4])

# Put pc2line.py's own directory first on sys.path so a plain import
# finds it.
sys.path.insert(0, os.path.dirname(p2l_path))
try:
    from pc2line import loadMapSymbols, funcAt
except ImportError:
    # Fall back to loading the module straight from its file path.
    spec = importlib.util.spec_from_file_location("pc2line", p2l_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    loadMapSymbols = mod.loadMapSymbols
    funcAt = mod.funcAt

syms = loadMapSymbols(map_path)

# Sum the hit counts per resolved function name.
buckets = {}
with open(pcs_path) as f:
    for ln in f:
        parts = ln.split()
        if len(parts) != 2:
            continue
        pc = int(parts[0], 16)
        hits = int(parts[1])
        fn = funcAt(syms, pc)
        buckets[fn] = buckets.get(fn, 0) + hits

# Sort by hits desc.
rows = sorted(buckets.items(), key=lambda kv: -kv[1])
print(f"TOTAL {total}")
for name, h in rows:
    pct = 100.0 * h / total if total else 0.0
    print(f"BUCKET {h} {pct:.2f} {name}")
PYEOF

# Pretty-print the attribution table.
#
# printAttribTable FILE LIMIT
#   Write a markdown table (Function / Hits / Hits%) to stdout with at
#   most LIMIT rows, taken in file order from the "BUCKET <hits> <pct>
#   <name...>" lines of FILE.  Names containing spaces are re-joined
#   from field 4 onwards.  One awk reads the file directly: there is no
#   `grep | head` pipeline to die of SIGPIPE under pipefail, and a file
#   without BUCKET lines gives a header-only table instead of a blank
#   row.
printAttribTable() {
    local attribFile=$1 limit=$2
    printf '\n'
    printf '| Function | Hits | Hits%% |\n'
    printf '|----------|-----:|------:|\n'
    awk -v limit="$limit" '
        /^BUCKET/ {
            if (++shown > limit + 0) exit
            name = $4
            for (i = 5; i <= NF; i++) name = name " " $i
            printf("| %-32s | %5d | %5.2f |\n", name, $2, $3)
        }' "$attribFile"
}

printAttribTable "$attrib" "$TOP_N"

# Smoke checks: dominant bucket and '?' percentage.
#
# smokeCheck FILE MAX_UNATTRIB MIN_DOMINANT
#   Read the "BUCKET <hits> <pct> <name...>" lines of FILE (largest
#   bucket first) and return 0 only if
#     - the '?' bucket holds at most MAX_UNATTRIB percent, and
#     - the first (dominant) bucket holds at least MIN_DOMINANT percent.
#   Returns 1, after a warn, on either breach or when FILE has no
#   BUCKET line at all.  All numbers reach awk through -v rather than
#   being spliced into the program text, so an odd threshold value can
#   neither inject awk code nor cause a syntax error.
smokeCheck() {
    local attribFile=$1 maxUnattrib=$2 minDominant=$3
    local qPct domLine domPct domName _tag _hits

    domLine=$(awk '/^BUCKET / {print; exit}' "$attribFile") || domLine=""
    if [ -z "$domLine" ]; then
        warn "no attribution buckets found; cannot run smoke checks"
        return 1
    fi
    # Fields: BUCKET <hits> <pct> <name...>; the trailing variable takes
    # the rest of the line, so a name containing spaces stays whole.
    read -r _tag _hits domPct domName <<< "$domLine"

    qPct=$(awk '/^BUCKET / && $4 == "?" {print $3; exit}' "$attribFile") || qPct=""
    qPct=${qPct:-0}

    # Compare via awk (bash arithmetic doesn't do floats).
    if awk -v a="$qPct" -v b="$maxUnattrib" 'BEGIN { exit !(a + 0 > b + 0) }'; then
        warn "unattributed samples = ${qPct}% (threshold ${maxUnattrib}%)"
        return 1
    fi
    if awk -v a="$domPct" -v b="$minDominant" 'BEGIN { exit !(a + 0 < b + 0) }'; then
        warn "dominant bucket ($domName) = ${domPct}% (expected >= ${minDominant}%)"
        return 1
    fi
    log "smoke pass: unattributed=${qPct}% (<= ${maxUnattrib}%); dominant=$domName ${domPct}%"
    return 0
}

# --threshold 0 disables the checks; an unset or empty threshold is
# treated the same way.
if [ "${THRESHOLD_PCT:-0}" != "0" ]; then
    smokeCheck "$attrib" "$THRESHOLD_PCT" "$DOMINANT_MIN" || exit 1
fi