#!/usr/bin/env bash
# profile.sh - function-attribution profiler under MAME.
#
# Builds a benchmark binary with link816 --map-locals, runs it under
# scripts/runInMameCycles.sh --sample, then attributes the PC samples
# to function symbols using the link816 map (globals + locals) and
# prints a sorted (function, hits, hits%) table.
#
# Usage:
#   profile.sh FILE.c         Profile a single .c file
#                             (e.g. benchmarks/strLen.c).
#                             The bench wrapper pattern
#                             mirrors benchCyclesPrecise.sh
#                             — START/DONE markers around
#                             ITERS calls.
#
#   profile.sh --bench NAME   Use the benchInputs /
#                             benchExtern config from
#                             benchCyclesPrecise.sh (so
#                             call signatures are known).
#
# Optional flags:
#   --iters N        Override the iteration count (default 200).
#   --fast-mode      Pass through to runInMameCycles --fast-mode.
#   --clock-hz N     Pass through to runInMameCycles --clock-hz.
#   --keep           Don't delete the temp build artefacts (debug).
#   --top N          Show only the top-N functions (default 20).
#   --threshold PCT  Require <=PCT samples in '?' (unattributed,
#                    default 10) and dominant bucket >= 30%.  Disable
#                    with --threshold 0.
#
# Output: markdown-style table with columns FUNCTION / HITS / HITS%.
# Exit 0 on attribution thresholds met, 1 on threshold breach (when
# the dominant function or unattributed percentage doesn't match
# expectations) or harness failure.
#
# Single-sourcing: this script delegates the actual PC sampling to
# runInMameCycles.sh --sample (per reviewer revision — no separate
# runner).  All MAME setup, marker handling, and PC capture live in
# the one runner harness.
#
# NOTE: --help prints the '#' lines among the first 40 lines of this
# file, so the usage text above must stay within that window.

set -euo pipefail

# common.sh is expected next to this script.  This file relies on it for
# PROJECT_ROOT and for the die / log / warn helpers used further down.
source "$(dirname -- "$0")/common.sh"

# Toolchain and helper locations, all rooted at PROJECT_ROOT.
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
LINK="$PROJECT_ROOT/tools/link816"
RUNNER="$PROJECT_ROOT/scripts/runInMameCycles.sh"
PC2LINE="$PROJECT_ROOT/scripts/pc2line.py"
BENCH_DIR="$PROJECT_ROOT/benchmarks"

# Option defaults; the argument parser below overrides them.
BENCH_NAME=""   # benchmark key for benchInputs / benchExtern
BENCH_FILE=""   # path of the .c file being profiled
ITERS=200       # --iters
FAST_MODE=""    # "--fast-mode" when requested, else empty
CLOCK_HZ=""     # "--clock-hz N" when requested, else empty
KEEP=0          # 1 = keep the temp workspace (--keep)
TOP_N=20        # --top

# Smoke-check thresholds.
# See the --threshold flag docs.
THRESHOLD_PCT=10   # max % allowed for '?' (unattributed)
DOMINANT_MIN=30    # min % expected in the dominant bucket

# Per-benchmark inputs — duplicated from benchCyclesPrecise.sh so we
# can profile any bench.  Single source of truth would be nicer; keep
# in sync manually for now.

#######################################
# Print the C call expression used to exercise one benchmark.
# Arguments: $1 - benchmark name (e.g. fib, strLen); may be omitted
# Outputs:   the call expression on stdout, or the literal
#            "/* unknown */" when the name has no entry
#######################################
benchInputs() {
    # ${1-} keeps a no-argument call from aborting under `set -u`.
    case "${1-}" in
        sumOfSquares) printf '%s\n' 'sumOfSquares(50)' ;;
        fib)          printf '%s\n' 'fib(10)' ;;
        strcpy)       printf '%s\n' 'mystrcpy(dst, "hello world!")' ;;
        memcmp)       printf '%s\n' 'mymemcmp("hello", "hello", 5)' ;;
        bsearch)      printf '%s\n' 'bsearch(arr, 8, 5)' ;;
        dotProduct)   printf '%s\n' 'dotProduct(va, vb, 4)' ;;
        popcount)     printf '%s\n' 'popcount(0x12345678UL)' ;;
        crc32)        printf '%s\n' 'crc32((const unsigned char *)"hello", 5)' ;;
        strLen)       printf '%s\n' 'strLen("The quick brown fox jumps over the lazy dog!")' ;;
        djb2Hash)     printf '%s\n' 'djb2Hash("hello world")' ;;
        *)            printf '%s\n' '/* unknown */' ;;
    esac
}

#######################################
# Print the C declarations (extern prototype plus any static fixture
# data) that the call expression from benchInputs needs.
# Arguments: $1 - benchmark name; may be omitted
# Outputs:   the declarations on stdout, or an empty line when the
#            name has no entry
#######################################
benchExtern() {
    case "${1-}" in
        sumOfSquares) printf '%s\n' 'extern unsigned long sumOfSquares(unsigned short n);' ;;
        fib)          printf '%s\n' 'extern unsigned short fib(unsigned short n);' ;;
        strcpy)       printf '%s\n' 'extern char *mystrcpy(char *d, const char *s); static char dst[16];' ;;
        memcmp)       printf '%s\n' 'extern int mymemcmp(const void *a, const void *b, unsigned int n);' ;;
        bsearch)      printf '%s\n' 'extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};' ;;
        dotProduct)   printf '%s\n' 'extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};' ;;
        popcount)     printf '%s\n' 'extern int popcount(unsigned long x);' ;;
        crc32)        printf '%s\n' 'extern unsigned long crc32(const unsigned char *p, unsigned int n);' ;;
        strLen)       printf '%s\n' 'extern unsigned short strLen(const char *s);' ;;
        djb2Hash)     printf '%s\n' 'extern unsigned long djb2Hash(const char *s);' ;;
        *)            printf '%s\n' '' ;;
    esac
}

# Parse args.
# Each option consumes its own words; anything that is not a known flag
# is taken as the benchmark .c file, but only once and only if it exists.
while [ $# -gt 0 ]; do
    case "$1" in
        --bench)
            shift
            [ $# -ge 1 ] || die "--bench needs a name"
            BENCH_NAME="$1"
            BENCH_FILE="$BENCH_DIR/$BENCH_NAME.c"
            shift
            ;;
        --iters)
            shift
            [ $# -ge 1 ] || die "--iters needs a value"
            [[ "$1" =~ ^[0-9]+$ ]] \
                || die "--iters needs a non-negative integer, got: $1"
            ITERS="$1"
            shift
            ;;
        --fast-mode)
            FAST_MODE="--fast-mode"
            shift
            ;;
        --clock-hz)
            shift
            [ $# -ge 1 ] || die "--clock-hz needs a value"
            # Stored as one "flag value" string.  NOTE(review): presumably
            # expanded unquoted where the runner is invoked — that call is
            # not visible in this copy, so the representation is kept.
            CLOCK_HZ="--clock-hz $1"
            shift
            ;;
        --keep)
            KEEP=1
            shift
            ;;
        --top)
            shift
            [ $# -ge 1 ] || die "--top needs a value"
            # TOP_N is used later as a row count; reject non-numbers
            # here instead of failing obscurely there.
            [[ "$1" =~ ^[0-9]+$ ]] \
                || die "--top needs a non-negative integer, got: $1"
            TOP_N="$1"
            shift
            ;;
        --threshold)
            shift
            [ $# -ge 1 ] || die "--threshold needs a value"
            # THRESHOLD_PCT ends up in a numeric awk comparison; only
            # plain decimal numbers are meaningful (and safe) there.
            [[ "$1" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]] \
                || die "--threshold needs a number, got: $1"
            THRESHOLD_PCT="$1"
            shift
            ;;
        -h|--help)
            # The usage text is the header comment of this file.
            sed -n '1,40p' "$0" | grep '^#'
            exit 0
            ;;
        *)
            if [ -z "$BENCH_FILE" ] && [ -f "$1" ]; then
                BENCH_FILE="$1"
                BENCH_NAME=$(basename -- "$1" .c)
            else
                die "unknown arg or file not found: $1"
            fi
            shift
            ;;
    esac
done

[ -n "$BENCH_FILE" ] || die "usage: $0 FILE.c | --bench NAME [...]"
[ -f "$BENCH_FILE" ] || die "benchmark file not found: $BENCH_FILE"

# Both tables must know the benchmark: benchExtern prints an empty line
# and benchInputs prints "/* unknown */" for names they have no entry for.
extern_decl=$(benchExtern "$BENCH_NAME")
call_expr=$(benchInputs "$BENCH_NAME")
[ -n "$extern_decl" ] \
    || die "no input config for bench '$BENCH_NAME' — extend benchExtern/benchInputs"
[ "$call_expr" != "/* unknown */" ] \
    || die "no call config for bench '$BENCH_NAME'"

log "profiling: $BENCH_NAME (iters=$ITERS)"

# Workspace.
WORK=$(mktemp -d) || die "mktemp -d failed"
if [ "$KEEP" = "1" ]; then
    log "keeping workspace: $WORK"
else
    # ${WORK:?} refuses to run rm -rf with an empty path.
    trap 'rm -rf -- "${WORK:?}"' EXIT
fi

# Every build artefact lives inside the workspace.
cwrap="$WORK/wrap.c"
owrap="$WORK/wrap.o"
oCrt0="$WORK/crt0.o"
oLibgcc="$WORK/libgcc.o"
obench="$WORK/bench.o"
bin="$WORK/bench.bin"
map="$WORK/bench.map"
samples="$WORK/samples.txt"

# NOTE(review): the text between the here-doc redirect that writes
# $cwrap and the redirect that captures the runner output into $samples
# is missing from the copy under review (wrapper C source, compile/link
# steps and the runner invocation appear to have been lost).  The
# statement below is reproduced exactly as found, not reconstructed;
# restore the lost span from version control before relying on it.
cat > "$cwrap" < "$samples" 2>&1 || {
    cat "$samples" >&2
    die "runInMameCycles --sample failed"
}

# Pull cycle summary and sample lines.
# First MAME-CYCLES and "SAMPLES total=" lines of the runner output.
# awk stops at the first hit by itself, so no `| head -1` stage (and no
# SIGPIPE / pipefail interaction) is involved.
cycles_line=$(awk '/^MAME-CYCLES/ { print; exit }' "$samples")
total_line=$(awk '/^SAMPLES total=/ { print; exit }' "$samples")
[ -n "$cycles_line" ] || die "no MAME-CYCLES in output"
[ -n "$total_line" ] || die "no SAMPLES total in output (sampling broken?)"

# Extract the sample count; fail with a message rather than letting a
# non-matching pipeline end the script silently under `set -e`.
[[ "$total_line" =~ total=([0-9]+) ]] \
    || die "malformed SAMPLES total line: $total_line"
total=${BASH_REMATCH[1]}
[ "$total" -gt 0 ] || die "zero samples captured"
log "captured $total samples"
log "$cycles_line"

# Build the (PC, hits) list as a temp file and feed through pc2line.py
# for function attribution.  Fields 2 and 3 of each "SAMPLE 0x..." line
# are written out as one "PC HITS" pair per line.
pcsfile="$WORK/pcs.txt"
awk '/^SAMPLE 0x/ { print $2, $3 }' "$samples" > "$pcsfile"
[ -s "$pcsfile" ] || die "no SAMPLE lines in output (sampling broken?)"

# Use pc2line.py loadMapSymbols/funcAt indirectly via a small Python
# inline.  Single-sourced — no separate symbol resolver lives outside
# pc2line.py.  The path to pc2line.py is passed as an argument so the
# import does not depend on PROJECT_ROOT being exported or on the
# current directory.
attrib="$WORK/attrib.txt"
python3 - "$map" "$pcsfile" "$total" "$PC2LINE" > "$attrib" <<'PYEOF'
import os
import sys

map_path, pcs_path, total_arg, pc2line_path = sys.argv[1:5]
total = int(total_arg)

# Put pc2line.py's own directory first on the import path, then import
# the resolver from it.
sys.path.insert(0, os.path.dirname(os.path.abspath(pc2line_path)))
from pc2line import loadMapSymbols, funcAt

syms = loadMapSymbols(map_path)

# Sum the hit counts per resolved function name.
buckets = {}
with open(pcs_path) as f:
    for ln in f:
        parts = ln.split()
        if len(parts) != 2:
            continue
        pc = int(parts[0], 16)
        hits = int(parts[1])
        fn = funcAt(syms, pc)
        buckets[fn] = buckets.get(fn, 0) + hits

# Sort by hits desc.
rows = sorted(buckets.items(), key=lambda kv: -kv[1])
print(f"TOTAL {total}")
for name, h in rows:
    pct = 100.0 * h / total if total else 0.0
    print(f"BUCKET {h} {pct:.2f} {name}")
PYEOF

grep -q '^BUCKET ' "$attrib" || die "no samples could be attributed"

# Pretty-print the attribution table: at most TOP_N rows, in the order
# emitted above (hits descending).  A name containing spaces is
# re-joined from field 4 onwards.
printf '\n'
printf '| Function | Hits | Hits%% |\n'
printf '|----------|-----:|------:|\n'
awk -v top="$TOP_N" '
    $1 == "BUCKET" {
        if (++shown > top + 0) exit
        name = $4
        for (i = 5; i <= NF; i++) name = name " " $i
        printf("| %-32s | %5d | %5.2f |\n", name, $2, $3)
    }' "$attrib"

# Smoke checks: dominant bucket and '?' percentage.
if [ "$THRESHOLD_PCT" != "0" ]; then
    # Percentage of the '?' bucket, 0 when there is none.
    qPct=$(awk '$1 == "BUCKET" && $4 == "?" { print $3; exit }' "$attrib")
    qPct=${qPct:-0}

    # The first BUCKET line is the dominant one (output is sorted).
    domLine=$(awk '$1 == "BUCKET" { print; exit }' "$attrib")
    read -r _ _ domPct domName _ <<< "$domLine"

    # Compare via awk (bash arithmetic doesn't do floats).  Values are
    # handed over with -v rather than spliced into the program text.
    if awk -v q="$qPct" -v t="$THRESHOLD_PCT" 'BEGIN { exit !(q + 0 > t + 0) }'; then
        warn "unattributed samples = ${qPct}% (threshold ${THRESHOLD_PCT}%)"
        exit 1
    fi
    if awk -v d="$domPct" -v m="$DOMINANT_MIN" 'BEGIN { exit !(d + 0 < m + 0) }'; then
        warn "dominant bucket ($domName) = ${domPct}% (expected >= ${DOMINANT_MIN}%)"
        exit 1
    fi
    log "smoke pass: unattributed=${qPct}% (<= ${THRESHOLD_PCT}%); dominant=$domName ${domPct}%"
fi