65816-llvm-mos/scripts/profile.sh
Scott Duensing da095402ec Updated
2026-06-02 23:17:57 -05:00

313 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
# profile.sh - function-attribution profiler under MAME.
#
# Builds a benchmark binary with link816 --map-locals, runs it under
# scripts/runInMameCycles.sh --sample, then attributes the PC samples
# to function symbols using the link816 map (globals + locals) and
# prints a sorted (function, hits, hits%) table.
#
# Usage:
# profile.sh <benchmark-c-file> Profile a single .c file
# (e.g. benchmarks/strLen.c).
# The bench wrapper pattern
# mirrors benchCyclesPrecise.sh
# — START/DONE markers around
# ITERS calls.
#
# profile.sh --bench <name> Use the benchInputs /
# benchExtern config from
# benchCyclesPrecise.sh (so
# call signatures are known).
#
# Optional flags:
# --iters N Override the iteration count (default 200).
# --fast-mode Pass through to runInMameCycles --fast-mode.
# --clock-hz N Pass through to runInMameCycles --clock-hz.
# --keep Don't delete the temp build artefacts (debug).
# --top N Show only the top-N functions (default 20).
# --threshold PCT Fail when more than PCT% of samples land in '?'
# (unattributed; default 10) or when the dominant
# bucket holds < 30%. Disable both with --threshold 0.
#
# Output: markdown-style table with columns FUNCTION / HITS / HITS%.
# Exit 0 on attribution thresholds met, 1 on threshold breach (when
# the dominant function or unattributed percentage doesn't match
# expectations) or harness failure.
#
# Single-sourcing: this script delegates the actual PC sampling to
# runInMameCycles.sh --sample (per reviewer revision — no separate
# runner). All MAME setup, marker handling, and PC capture live in
# the one runner harness.
# Strict mode: stop on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -e -u -o pipefail

# common.sh is expected to supply PROJECT_ROOT and the die/log/warn
# helpers used throughout; none of them are defined in this file.
source "$(dirname "$0")/common.sh"

# Toolchain binaries and helper scripts, all rooted at PROJECT_ROOT.
CLANG="${PROJECT_ROOT}/tools/llvm-mos-build/bin/clang"
LLVM_MC="${PROJECT_ROOT}/tools/llvm-mos-build/bin/llvm-mc"
LINK="${PROJECT_ROOT}/tools/link816"
RUNNER="${PROJECT_ROOT}/scripts/runInMameCycles.sh"
PC2LINE="${PROJECT_ROOT}/scripts/pc2line.py"
BENCH_DIR="${PROJECT_ROOT}/benchmarks"

# Option defaults; the argument parser overwrites these.
BENCH_NAME=''
BENCH_FILE=''
ITERS=200
FAST_MODE=''  # becomes "--fast-mode" when requested
CLOCK_HZ=''   # becomes "--clock-hz N" when requested
KEEP=0
TOP_N=20

# Smoke-check thresholds. See the --threshold flag docs.
THRESHOLD_PCT=10  # max % allowed for '?' (unattributed)
DOMINANT_MIN=30   # min % expected in the dominant bucket
# Per-benchmark inputs — duplicated from benchCyclesPrecise.sh so we
# can profile any bench. Single source of truth would be nicer; keep
# in sync manually for now.
# benchInputs <bench-name>
#
# Print the C call expression used to exercise the named benchmark.
# Names without an entry print the sentinel "/* unknown */", which the
# caller compares against to detect a missing configuration.
benchInputs() {
  local callExpr
  case "$1" in
    sumOfSquares) callExpr='sumOfSquares(50)' ;;
    fib)          callExpr='fib(10)' ;;
    strcpy)       callExpr='mystrcpy(dst, "hello world!")' ;;
    memcmp)       callExpr='mymemcmp("hello", "hello", 5)' ;;
    bsearch)      callExpr='bsearch(arr, 8, 5)' ;;
    dotProduct)   callExpr='dotProduct(va, vb, 4)' ;;
    popcount)     callExpr='popcount(0x12345678UL)' ;;
    crc32)        callExpr='crc32((const unsigned char *)"hello", 5)' ;;
    strLen)       callExpr='strLen("The quick brown fox jumps over the lazy dog!")' ;;
    djb2Hash)     callExpr='djb2Hash("hello world")' ;;
    *)            callExpr='/* unknown */' ;;
  esac
  printf '%s\n' "$callExpr"
}
# benchExtern <bench-name>
#
# Print the C declarations (the extern prototype plus any static data
# the call expression refers to) for the named benchmark. Names without
# an entry print an empty line, which the caller treats as "no config".
benchExtern() {
  case "$1" in
    sumOfSquares)
      printf '%s\n' 'extern unsigned long sumOfSquares(unsigned short n);'
      ;;
    fib)
      printf '%s\n' 'extern unsigned short fib(unsigned short n);'
      ;;
    strcpy)
      printf '%s\n' 'extern char *mystrcpy(char *d, const char *s); static char dst[16];'
      ;;
    memcmp)
      printf '%s\n' 'extern int mymemcmp(const void *a, const void *b, unsigned int n);'
      ;;
    bsearch)
      printf '%s\n' 'extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};'
      ;;
    dotProduct)
      printf '%s\n' 'extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};'
      ;;
    popcount)
      printf '%s\n' 'extern int popcount(unsigned long x);'
      ;;
    crc32)
      printf '%s\n' 'extern unsigned long crc32(const unsigned char *p, unsigned int n);'
      ;;
    strLen)
      printf '%s\n' 'extern unsigned short strLen(const char *s);'
      ;;
    djb2Hash)
      printf '%s\n' 'extern unsigned long djb2Hash(const char *s);'
      ;;
    *)
      printf '\n'
      ;;
  esac
}
# Parse args.
#
# Every option that takes a value fails via die when the value is
# missing. --iters, --top and --threshold are additionally required to
# be plain numbers: they are consumed as numbers further down (generated
# C source, row limit, percentage comparison), where a typo would
# otherwise show up as an unrelated failure.
#
# The first non-option argument that names an existing file is taken as
# the benchmark source; its basename minus ".c" becomes the bench name.
uintRe='^[0-9]+$'
numRe='^[0-9]+([.][0-9]+)?$'
while [ $# -gt 0 ]; do
  case "$1" in
    --bench)
      [ $# -ge 2 ] || die "--bench needs a name"
      BENCH_NAME="$2"
      BENCH_FILE="$BENCH_DIR/$BENCH_NAME.c"
      shift 2
      ;;
    --iters)
      [ $# -ge 2 ] || die "--iters needs a value"
      [[ "$2" =~ $uintRe ]] || die "--iters needs a non-negative integer, got: $2"
      ITERS="$2"
      shift 2
      ;;
    --fast-mode)
      FAST_MODE="--fast-mode"
      shift
      ;;
    --clock-hz)
      [ $# -ge 2 ] || die "--clock-hz needs a value"
      # Stored as one "flag value" string; the runner invocation relies
      # on word splitting to turn it back into two arguments.
      CLOCK_HZ="--clock-hz $2"
      shift 2
      ;;
    --keep)
      KEEP=1
      shift
      ;;
    --top)
      [ $# -ge 2 ] || die "--top needs a value"
      [[ "$2" =~ $uintRe ]] || die "--top needs a non-negative integer, got: $2"
      TOP_N="$2"
      shift 2
      ;;
    --threshold)
      [ $# -ge 2 ] || die "--threshold needs a value"
      [[ "$2" =~ $numRe ]] || die "--threshold needs a non-negative number, got: $2"
      THRESHOLD_PCT="$2"
      shift 2
      ;;
    -h|--help)
      # The usage text is the comment header at the top of this file.
      sed -n '1,40p' "$0" | grep '^#'
      exit 0
      ;;
    *)
      if [ -z "$BENCH_FILE" ] && [ -f "$1" ]; then
        BENCH_FILE="$1"
        BENCH_NAME=$(basename "$1" .c)
      else
        die "unknown arg or file not found: $1"
      fi
      shift
      ;;
  esac
done
# A benchmark source must have been selected and must exist on disk.
if [ -z "$BENCH_FILE" ]; then
  die "usage: $0 <bench.c> | --bench NAME [...]"
fi
if [ ! -f "$BENCH_FILE" ]; then
  die "benchmark file not found: $BENCH_FILE"
fi

# Look up the declarations and call expression for this benchmark; both
# lookups signal "not configured" through their output, not their status.
extern_decl=$(benchExtern "$BENCH_NAME")
call_expr=$(benchInputs "$BENCH_NAME")
if [ -z "$extern_decl" ]; then
  die "no input config for bench '$BENCH_NAME' — extend benchExtern/benchInputs"
fi
if [ "$call_expr" = "/* unknown */" ]; then
  die "no call config for bench '$BENCH_NAME'"
fi

log "profiling: $BENCH_NAME (iters=$ITERS)"
# Workspace: a fresh temp directory that holds every intermediate file.
WORK=$(mktemp -d)
if [ "$KEEP" != "1" ]; then
  # Default: remove the workspace on every exit path.
  trap 'rm -rf "$WORK"' EXIT
else
  log "keeping workspace: $WORK"
fi

# Generated wrapper source and its object file.
cwrap="${WORK}/wrap.c"
owrap="${WORK}/wrap.o"
# Runtime and benchmark objects.
oCrt0="${WORK}/crt0.o"
oLibgcc="${WORK}/libgcc.o"
obench="${WORK}/bench.o"
# Linked image, linker map, and captured runner output.
bin="${WORK}/bench.bin"
map="${WORK}/bench.map"
samples="${WORK}/samples.txt"
# Generate the C wrapper. Its main() makes 5 warm-up calls, stores
# 0xa1a1 at 0x025000 (START), makes ITERS measured calls, stores 0xa2a2
# at 0x025002 (DONE), then spins forever. The heredoc delimiter is
# unquoted on purpose so $extern_decl, $ITERS and $call_expr expand.
# NOTE(review): the loop counters are plain `int`; confirm that is wide
# enough on this target for large --iters values.
cat > "$cwrap" <<WRAP_C
$extern_decl
volatile unsigned long sink;
#define ITERS $ITERS
int main(void) {
/* warm-up */
for (int w = 0; w < 5; w++) sink = (unsigned long)($call_expr);
/* START / DONE markers in bank 2. */
*(volatile unsigned short *)0x025000 = 0xa1a1;
for (int i = 0; i < ITERS; i++) sink = (unsigned long)($call_expr);
*(volatile unsigned short *)0x025002 = 0xa2a2;
while (1) {}
}
WRAP_C

# Assemble the runtime pieces, then compile wrapper and benchmark with
# identical flags.
asmFlags=(-arch=w65816 -filetype=obj)
ccFlags=(--target=w65816 -O2 -ffunction-sections -c)
runtimeSrc="$PROJECT_ROOT/runtime/src"

"$LLVM_MC" "${asmFlags[@]}" "$runtimeSrc/crt0.s" -o "$oCrt0"
"$LLVM_MC" "${asmFlags[@]}" "$runtimeSrc/libgcc.s" -o "$oLibgcc"
"$CLANG" "${ccFlags[@]}" "$cwrap" -o "$owrap"
"$CLANG" "${ccFlags[@]}" "$BENCH_FILE" -o "$obench"

# --map-locals: pull libgcc helpers (__udivmod_core etc) + file-static
# functions into the symbol table so PC samples that fall inside them
# attribute correctly instead of bucketing as '?'.
linkInputs=("$oCrt0" "$oLibgcc" "$owrap" "$obench")
"$LINK" -o "$bin" --text-base 0x1000 --map "$map" --map-locals \
  "${linkInputs[@]}"
# Run under MAME with --sample. stdout and stderr are merged into
# $samples so both the MAME-CYCLES and SAMPLE lines are captured; on
# failure the whole capture is replayed to stderr before bailing out.
# FAST_MODE / CLOCK_HZ are left unquoted deliberately: each is either
# empty or a "flag [value]" string that must word-split into arguments.
# shellcheck disable=SC2086
if ! bash "$RUNNER" "$bin" "$ITERS" --sample $FAST_MODE $CLOCK_HZ > "$samples" 2>&1; then
  cat "$samples" >&2
  die "runInMameCycles --sample failed"
fi

# Pull cycle summary and sample lines: first match of each, or empty
# when the runner printed none.
cycles_line=$({ grep "^MAME-CYCLES" "$samples" || true; } | head -n 1)
total_line=$({ grep "^SAMPLES total=" "$samples" || true; } | head -n 1)
if [ -z "$cycles_line" ]; then
  die "no MAME-CYCLES in output"
fi
if [ -z "$total_line" ]; then
  die "no SAMPLES total in output (sampling broken?)"
fi

# Extract the numeric sample count from "total=<digits>".
total=$(grep -oE 'total=[0-9]+' <<<"$total_line" | cut -d= -f2)
if [ ! "$total" -gt 0 ]; then
  die "zero samples captured"
fi

log "captured $total samples"
log "$cycles_line"
# Build the (PC, hits) list as a temp file and feed it through
# pc2line.py for function attribution.
#
# A single awk pass does the filtering and field selection. Unlike a
# grep|awk pipeline it cannot trip `set -o pipefail` when nothing
# matches, so the empty case is reported explicitly instead of ending
# the script without a message.
pcsfile="$WORK/pcs.txt"
awk '/^SAMPLE 0x/ { print $2, $3 }' "$samples" > "$pcsfile"
[ -s "$pcsfile" ] || die "no SAMPLE lines in output (sampling broken?)"

# Symbol resolution is single-sourced in pc2line.py (loadMapSymbols /
# funcAt). Its path is passed explicitly as the 4th argument so the
# inline Python does not depend on PROJECT_ROOT being exported or on the
# current working directory.
#
# Output written to $attrib:
#   TOTAL <total>
#   BUCKET <hits> <pct> <name>     (one per function, hits descending)
attrib="$WORK/attrib.txt"
python3 - "$map" "$pcsfile" "$total" "$PC2LINE" > "$attrib" <<'PYEOF'
import os
import sys

map_path, pcs_path, total_arg, pc2line_path = sys.argv[1:5]
total = int(total_arg)

# Import the resolver from the directory that holds pc2line.py.
sys.path.insert(0, os.path.dirname(os.path.abspath(pc2line_path)))
from pc2line import loadMapSymbols, funcAt

syms = loadMapSymbols(map_path)

# Accumulate hits per function name as reported by funcAt.
buckets = {}
with open(pcs_path) as f:
    for ln in f:
        parts = ln.split()
        if len(parts) != 2:
            continue
        pc = int(parts[0], 16)
        hits = int(parts[1])
        fn = funcAt(syms, pc)
        buckets[fn] = buckets.get(fn, 0) + hits

# Sort by hits desc.
rows = sorted(buckets.items(), key=lambda kv: -kv[1])
print(f"TOTAL {total}")
for name, h in rows:
    # Percentages are relative to the runner-reported total.
    pct = 100.0 * h / total if total else 0.0
    print(f"BUCKET {h} {pct:.2f} {name}")
PYEOF
# Pretty-print the attribution table.
#
# printAttribTable <attrib-file> <top-n>
#   attrib-file  file containing "BUCKET <hits> <pct> <name...>" lines
#   top-n        maximum number of rows to print
# Outputs: a blank line, the markdown header, then up to top-n rows on
#          stdout. Names containing spaces are kept intact.
# Returns: non-zero (with a message on stderr) if attrib-file is missing.
#
# One awk pass does selection and formatting. That avoids a grep|head
# pipeline, which aborts the script under `set -o pipefail` when there
# are no BUCKET lines, and it prints no row at all (rather than a blank
# one) when nothing is selected.
printAttribTable() {
  local attribFile=$1
  local topN=$2
  if [ ! -f "$attribFile" ]; then
    printf 'attribution file not found: %s\n' "$attribFile" >&2
    return 1
  fi
  printf '\n'
  printf '| Function | Hits | Hits%% |\n'
  printf '|----------|-----:|------:|\n'
  awk -v limit="$topN" '
    /^BUCKET/ {
      if (++shown > limit + 0) exit
      name = $4
      for (i = 5; i <= NF; i++) name = name " " $i
      printf("| %-32s | %5d | %5.2f |\n", name, $2, $3)
    }
  ' "$attribFile"
}
printAttribTable "$attrib" "$TOP_N"
# Smoke checks: dominant bucket and '?' percentage.
#
# Skipped entirely when THRESHOLD_PCT is "0". Otherwise the script exits
# 1 when the '?' bucket exceeds THRESHOLD_PCT percent, or when the first
# (largest) bucket holds less than DOMINANT_MIN percent.
#
# Each value is read with a single awk pass over $attrib rather than a
# grep|awk pipeline, so a missing match cannot abort the script through
# `set -o pipefail`. Numbers reach awk through -v instead of being
# pasted into the program text, so an odd value cannot alter the awk
# program itself.
if [ "$THRESHOLD_PCT" != "0" ]; then
  # Percentage of the first bucket whose name starts with the field '?';
  # 0 when there is no such bucket.
  qPct=$(awk '/^BUCKET / && $4 == "?" { print $3; exit }' "$attrib")
  qPct=${qPct:-0}

  # BUCKET lines are written in descending hit order, so the first one
  # is the dominant bucket. The name is fields 4..NF so names that
  # contain spaces are reported in full.
  domPct=$(awk '/^BUCKET / { print $3; exit }' "$attrib")
  domName=$(awk '/^BUCKET / {
    name = $4
    for (i = 5; i <= NF; i++) name = name " " $i
    print name
    exit
  }' "$attrib")
  if [ -z "$domPct" ]; then
    warn "no BUCKET lines in attribution output"
    exit 1
  fi

  # Compare via awk (bash arithmetic doesn't do floats).
  if awk -v q="$qPct" -v t="$THRESHOLD_PCT" 'BEGIN { exit !(q + 0 > t + 0) }'; then
    warn "unattributed samples = ${qPct}% (threshold ${THRESHOLD_PCT}%)"
    exit 1
  fi
  if awk -v d="$domPct" -v m="$DOMINANT_MIN" 'BEGIN { exit !(d + 0 < m + 0) }'; then
    warn "dominant bucket ($domName) = ${domPct}% (expected >= ${DOMINANT_MIN}%)"
    exit 1
  fi
  log "smoke pass: unattributed=${qPct}% (<= ${THRESHOLD_PCT}%); dominant=$domName ${domPct}%"
fi