#!/usr/bin/env bash
# profile.sh - function-attribution profiler under MAME.
#
# Builds a benchmark binary with link816 --map-locals, runs it under
# scripts/runInMameCycles.sh --sample, then attributes the PC samples
# to function symbols using the link816 map (globals + locals) and
# prints a sorted (function, hits, hits%) table.
#
# Usage:
#   profile.sh <benchmark-c-file>   Profile a single .c file
#                                   (e.g. benchmarks/strLen.c).
#                                   The bench wrapper pattern
#                                   mirrors benchCyclesPrecise.sh
#                                   — START/DONE markers around
#                                   ITERS calls.
#
#   profile.sh --bench <name>       Use the benchInputs /
#                                   benchExtern config from
#                                   benchCyclesPrecise.sh (so
#                                   call signatures are known).
#
# Optional flags:
#   --iters N         Override the iteration count (default 200).
#   --fast-mode       Pass through to runInMameCycles --fast-mode.
#   --clock-hz N      Pass through to runInMameCycles --clock-hz.
#   --keep            Don't delete the temp build artefacts (debug).
#   --top N           Show only the top-N functions (default 20).
#   --threshold PCT   Require <=PCT% of samples in '?' (unattributed;
#                     default 10) and a dominant bucket of >= 30%.
#                     Disable both checks with --threshold 0.
#
# Output: markdown-style table with columns FUNCTION / HITS / HITS%.
# Exit 0 on attribution thresholds met, 1 on threshold breach (when
# the dominant function or unattributed percentage doesn't match
# expectations) or harness failure.
#
# Single-sourcing: this script delegates the actual PC sampling to
# runInMameCycles.sh --sample (per reviewer revision — no separate
# runner). All MAME setup, marker handling, and PC capture live in
# the one runner harness.

# Strict mode: abort on failing commands, on unset variables, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# PROJECT_ROOT and the die/log/warn helpers used below are not defined
# in this file — presumably common.sh provides them (verify there).
# shellcheck source=/dev/null
source "$(dirname "$0")/common.sh"

# Toolchain and helper locations, all relative to the repo root.
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
LINK="$PROJECT_ROOT/tools/link816"
RUNNER="$PROJECT_ROOT/scripts/runInMameCycles.sh"
PC2LINE="$PROJECT_ROOT/scripts/pc2line.py"
BENCH_DIR="$PROJECT_ROOT/benchmarks"

# Command-line state; the defaults here are overridden by the flag parser.
BENCH_NAME=""
BENCH_FILE=""
ITERS=200
FAST_MODE=""   # empty, or the literal flag "--fast-mode"
CLOCK_HZ=""    # empty, or the flat string "--clock-hz N"
KEEP=0
TOP_N=20
# Smoke-check thresholds. See the --threshold flag docs.
THRESHOLD_PCT=10   # max % allowed for '?' (unattributed)
DOMINANT_MIN=30    # min % expected in the dominant bucket

# Per-benchmark inputs — duplicated from benchCyclesPrecise.sh so we
# can profile any bench. Single source of truth would be nicer; keep
# in sync manually for now.

#######################################
# Print the C call expression used to exercise a benchmark.
# Arguments: $1 - benchmark name (e.g. strLen)
# Outputs:   the call expression on stdout, or the sentinel
#            "/* unknown */" when the name has no entry
# Returns:   0 always (callers compare against the sentinel)
#######################################
benchInputs() {
  case "$1" in
    sumOfSquares) printf '%s\n' 'sumOfSquares(50)' ;;
    fib)          printf '%s\n' 'fib(10)' ;;
    strcpy)       printf '%s\n' 'mystrcpy(dst, "hello world!")' ;;
    memcmp)       printf '%s\n' 'mymemcmp("hello", "hello", 5)' ;;
    bsearch)      printf '%s\n' 'bsearch(arr, 8, 5)' ;;
    dotProduct)   printf '%s\n' 'dotProduct(va, vb, 4)' ;;
    popcount)     printf '%s\n' 'popcount(0x12345678UL)' ;;
    crc32)        printf '%s\n' 'crc32((const unsigned char *)"hello", 5)' ;;
    strLen)       printf '%s\n' 'strLen("The quick brown fox jumps over the lazy dog!")' ;;
    djb2Hash)     printf '%s\n' 'djb2Hash("hello world")' ;;
    *)            printf '%s\n' '/* unknown */' ;;
  esac
}

#######################################
# Print the C declarations (extern prototype plus any static fixture
# data) that the generated wrapper needs for a benchmark.
# Arguments: $1 - benchmark name (e.g. strLen)
# Outputs:   the declaration text on stdout; an empty line when the
#            name has no entry
# Returns:   0 always (callers test for empty output)
#######################################
benchExtern() {
  case "$1" in
    sumOfSquares)
      printf '%s\n' 'extern unsigned long sumOfSquares(unsigned short n);'
      ;;
    fib)
      printf '%s\n' 'extern unsigned short fib(unsigned short n);'
      ;;
    strcpy)
      printf '%s\n' 'extern char *mystrcpy(char *d, const char *s); static char dst[16];'
      ;;
    memcmp)
      printf '%s\n' 'extern int mymemcmp(const void *a, const void *b, unsigned int n);'
      ;;
    bsearch)
      printf '%s\n' 'extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};'
      ;;
    dotProduct)
      printf '%s\n' 'extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};'
      ;;
    popcount)
      printf '%s\n' 'extern int popcount(unsigned long x);'
      ;;
    crc32)
      printf '%s\n' 'extern unsigned long crc32(const unsigned char *p, unsigned int n);'
      ;;
    strLen)
      printf '%s\n' 'extern unsigned short strLen(const char *s);'
      ;;
    djb2Hash)
      printf '%s\n' 'extern unsigned long djb2Hash(const char *s);'
      ;;
    *)
      printf '%s\n' ''
      ;;
  esac
}

# Parse args. Flags may appear in any order; the first positional
# argument that names an existing file is taken as the benchmark source.
while [ $# -gt 0 ]; do
  case "$1" in
    --bench)
      shift
      [ $# -ge 1 ] || die "--bench needs a name"
      BENCH_NAME="$1"
      BENCH_FILE="$BENCH_DIR/$BENCH_NAME.c"
      shift
      ;;
    --iters)
      shift
      [ $# -ge 1 ] || die "--iters needs a value"
      ITERS="$1"
      shift
      ;;
    --fast-mode)
      FAST_MODE="--fast-mode"
      shift
      ;;
    --clock-hz)
      shift
      [ $# -ge 1 ] || die "--clock-hz needs a value"
      # The flag and its value are stored as one flat string that is
      # word-split again when the runner is invoked, so the value must
      # be a single whitespace-free word.
      [[ "$1" =~ ^[^[:space:]]+$ ]] || die "--clock-hz value must be a single word: '$1'"
      CLOCK_HZ="--clock-hz $1"
      shift
      ;;
    --keep)
      KEEP=1
      shift
      ;;
    --top)
      shift
      [ $# -ge 1 ] || die "--top needs a value"
      TOP_N="$1"
      shift
      ;;
    --threshold)
      shift
      [ $# -ge 1 ] || die "--threshold needs a value"
      THRESHOLD_PCT="$1"
      shift
      ;;
    -h|--help)
      # Print the leading comment header (shebang included), stopping at
      # the first non-comment line so the output tracks the header even
      # if its length changes.
      awk '!/^#/ { exit } { print }' "$0"
      exit 0
      ;;
    *)
      if [ -z "$BENCH_FILE" ] && [ -f "$1" ]; then
        BENCH_FILE="$1"
        BENCH_NAME=$(basename -- "$1" .c)
      else
        die "unknown arg or file not found: $1"
      fi
      shift
      ;;
  esac
done

# Validate numeric options before they reach generated C source, the
# runner command line, or awk. Leading zeros are rejected for --iters
# because "010" would be octal 8 in the C #define but 10 to the runner.
[[ "$ITERS" =~ ^[1-9][0-9]*$ ]] \
  || die "--iters must be a positive decimal integer (got '$ITERS')"
[[ "$TOP_N" =~ ^[1-9][0-9]*$ ]] \
  || die "--top must be a positive decimal integer (got '$TOP_N')"
[[ "$THRESHOLD_PCT" =~ ^[0-9]+([.][0-9]+)?$ ]] \
  || die "--threshold must be a non-negative number (got '$THRESHOLD_PCT')"

[ -n "$BENCH_FILE" ] || die "usage: $0 <bench.c> | --bench NAME [...]"
[ -f "$BENCH_FILE" ] || die "benchmark file not found: $BENCH_FILE"

# Both lookups are keyed on the bench name, so a positional .c file only
# works when its basename has an entry in benchExtern/benchInputs.
extern_decl=$(benchExtern "$BENCH_NAME")
call_expr=$(benchInputs "$BENCH_NAME")
[ -n "$extern_decl" ] || die "no input config for bench '$BENCH_NAME' — extend benchExtern/benchInputs"
[ "$call_expr" != "/* unknown */" ] || die "no call config for bench '$BENCH_NAME'"

log "profiling: $BENCH_NAME (iters=$ITERS)"

# Workspace: a private temp directory holding every build artefact and
# intermediate file. Removed on exit unless --keep was given.
WORK=$(mktemp -d) || die "mktemp -d failed"
if [ "$KEEP" = "1" ]; then
  log "keeping workspace: $WORK"
else
  # ${WORK:?} makes the cleanup abort rather than run 'rm -rf' on an
  # empty path should WORK ever be unset.
  trap 'rm -rf -- "${WORK:?}"' EXIT
fi

cwrap="$WORK/wrap.c"
owrap="$WORK/wrap.o"
oCrt0="$WORK/crt0.o"
oLibgcc="$WORK/libgcc.o"
obench="$WORK/bench.o"
bin="$WORK/bench.bin"
map="$WORK/bench.map"
samples="$WORK/samples.txt"

# Generate the C wrapper. The heredoc is deliberately unquoted so that
# $extern_decl, $ITERS and $call_expr are substituted into the source.
# Each result is stored to a volatile sink so the calls have an
# observable effect.
cat > "$cwrap" <<EOF
$extern_decl
volatile unsigned long sink;
#define ITERS $ITERS
int main(void) {
    /* warm-up */
    for (int w = 0; w < 5; w++) sink = (unsigned long)($call_expr);
    /* START / DONE markers in bank 2. */
    *(volatile unsigned short *)0x025000 = 0xa1a1;
    for (int i = 0; i < ITERS; i++) sink = (unsigned long)($call_expr);
    *(volatile unsigned short *)0x025002 = 0xa2a2;
    while (1) {}
}
EOF

# Assemble the runtime and compile the wrapper and the benchmark.
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oCrt0" \
  || die "failed to assemble crt0.s"
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLibgcc" \
  || die "failed to assemble libgcc.s"
"$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cwrap" -o "$owrap" \
  || die "failed to compile bench wrapper"
"$CLANG" --target=w65816 -O2 -ffunction-sections -c "$BENCH_FILE" -o "$obench" \
  || die "failed to compile $BENCH_FILE"

# --map-locals: pull libgcc helpers (__udivmod_core etc) + file-static
# functions into the symbol table so PC samples that fall inside them
# attribute correctly instead of bucketing as '?'.
"$LINK" -o "$bin" --text-base 0x1000 --map "$map" --map-locals \
  "$oCrt0" "$oLibgcc" "$owrap" "$obench" \
  || die "link failed"

# Run under MAME with --sample. Capture both MAME-CYCLES and SAMPLE
# lines (stdout and stderr are merged into $samples).
runner_args=("$bin" "$ITERS" --sample)
if [[ -n "$FAST_MODE" ]]; then
  runner_args+=("$FAST_MODE")
fi
if [[ -n "$CLOCK_HZ" ]]; then
  # CLOCK_HZ is a flat "--clock-hz N" string; split it into words
  # explicitly instead of relying on an unquoted expansion (which would
  # also be subject to globbing).
  read -r -a clock_args <<< "$CLOCK_HZ"
  runner_args+=("${clock_args[@]}")
fi

if ! bash "$RUNNER" "${runner_args[@]}" > "$samples" 2>&1; then
  # Replay the runner's output so the failure is diagnosable.
  cat "$samples" >&2
  die "runInMameCycles --sample failed"
fi

# Pull cycle summary and sample-count lines (first match of each).
# '|| true' keeps a no-match grep from tripping 'set -e' so the explicit
# checks below can report the problem.
cycles_line=$(grep -m 1 '^MAME-CYCLES' "$samples" || true)
total_line=$(grep -m 1 '^SAMPLES total=' "$samples" || true)
[ -n "$cycles_line" ] || die "no MAME-CYCLES in output"
[ -n "$total_line" ] || die "no SAMPLES total in output (sampling broken?)"

# Extract the count with a regex so a malformed line yields a clear
# error instead of a silent pipefail abort.
[[ "$total_line" =~ total=([0-9]+) ]] || die "malformed SAMPLES line: $total_line"
# 10# forces base 10 so a zero-padded count is not parsed as octal.
total=$((10#${BASH_REMATCH[1]}))
(( total > 0 )) || die "zero samples captured"

log "captured $total samples"
log "$cycles_line"

# Build the (PC, hits) list as a temp file and feed it through
# pc2line.py for function attribution. A single awk replaces grep|awk
# so that "no SAMPLE lines" is reported explicitly rather than aborting
# silently via pipefail.
pcsfile="$WORK/pcs.txt"
awk '/^SAMPLE 0x/ { print $2, $3 }' "$samples" > "$pcsfile"
[ -s "$pcsfile" ] || die "no SAMPLE lines in runner output"

# Use pc2line.py loadMapSymbols/funcAt indirectly via a small Python
# inline. Single-sourced — no separate symbol resolver lives outside
# pc2line.py. The path to pc2line.py is passed as an argument, so the
# helper does not depend on PROJECT_ROOT being exported or on the
# current working directory.
#
# Output format (consumed by the table printer and smoke checks):
#   TOTAL <n>
#   BUCKET <hits> <pct> <function-name>     (sorted by hits, descending)
attrib="$WORK/attrib.txt"
python3 - "$map" "$pcsfile" "$total" "$PC2LINE" > "$attrib" <<'PYEOF'
import os
import sys

map_path, pcs_path, total_arg, pc2line_path = sys.argv[1:5]
total = int(total_arg)

# Import the funcAt resolver from pc2line.py. This script is read from
# stdin, so it has no usable location of its own; use the explicit path.
sys.path.insert(0, os.path.dirname(os.path.abspath(pc2line_path)))
try:
    from pc2line import loadMapSymbols, funcAt
except ImportError:
    # Fall back to loading the file directly by path.
    import importlib.util
    spec = importlib.util.spec_from_file_location("pc2line", pc2line_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    loadMapSymbols = mod.loadMapSymbols
    funcAt = mod.funcAt

syms = loadMapSymbols(map_path)

# Accumulate hits per resolved function name.
buckets = {}
with open(pcs_path) as f:
    for ln in f:
        parts = ln.split()
        if len(parts) != 2:
            continue
        pc = int(parts[0], 16)
        hits = int(parts[1])
        fn = funcAt(syms, pc)
        buckets[fn] = buckets.get(fn, 0) + hits

# Sort by hits desc.
rows = sorted(buckets.items(), key=lambda kv: -kv[1])
print(f"TOTAL {total}")
for name, h in rows:
    pct = 100.0 * h / total if total else 0.0
    print(f"BUCKET {h} {pct:.2f} {name}")
PYEOF

#######################################
# Pretty-print the attribution table as markdown.
# Arguments: $1 - attribution file containing lines of the form
#                 "BUCKET <hits> <pct> <name...>", already sorted
#            $2 - maximum number of rows to print
# Outputs:   a blank line, the table header, then up to $2 rows
# Returns:   non-zero if the attribution file cannot be read
#######################################
print_attribution_table() {
  local attrib_file=$1
  local limit=$2

  printf '\n'
  printf '| Function | Hits | Hits%% |\n'
  printf '|----------|-----:|------:|\n'
  # One awk does the filtering, the row limit and the formatting. There
  # is no grep|head pipeline, so neither a no-match nor an early exit
  # can trip pipefail, and an empty bucket list prints no bogus row.
  awk -v limit="$limit" '
    /^BUCKET / {
      name = $4
      # Function names may contain spaces; rejoin the remaining fields.
      for (i = 5; i <= NF; i++) name = name " " $i
      printf("| %-32s | %5d | %5.2f |\n", name, $2, $3)
      if (++shown >= limit + 0) exit
    }
  ' < "$attrib_file"
}

print_attribution_table "$attrib" "$TOP_N"

# Smoke checks: dominant bucket and '?' percentage.
# Exits 1 when the unattributed share exceeds THRESHOLD_PCT or when the
# largest bucket is below DOMINANT_MIN. A threshold of zero (in any
# spelling: 0, 0.0, 00) disables both checks.
if [[ ! "$THRESHOLD_PCT" =~ ^0+([.]0*)?$ ]]; then
  # Each value is extracted by a single awk reading the file directly;
  # no pipeline is involved, so an early 'exit' cannot raise SIGPIPE
  # under pipefail.
  qPct=$(awk '/^BUCKET / && $4 == "?" { print $3; exit }' "$attrib")
  qPct=${qPct:-0}
  # Buckets are written sorted by hits descending, so the first one is
  # the dominant bucket.
  domName=$(awk '/^BUCKET / { print $4; exit }' "$attrib")
  domPct=$(awk '/^BUCKET / { print $3; exit }' "$attrib")

  # Without any bucket there is nothing to check; fail loudly rather
  # than letting an empty value slip through the comparisons below.
  if [ -z "$domPct" ]; then
    warn "no attribution buckets produced"
    exit 1
  fi

  # Compare via awk (bash arithmetic doesn't do floats). Values are
  # passed with -v and coerced with '+ 0' so they are treated as
  # numbers, never as awk program text.
  if awk -v q="$qPct" -v t="$THRESHOLD_PCT" 'BEGIN { exit !(q + 0 > t + 0) }'; then
    warn "unattributed samples = ${qPct}% (threshold ${THRESHOLD_PCT}%)"
    exit 1
  fi
  if awk -v d="$domPct" -v m="$DOMINANT_MIN" 'BEGIN { exit !(d + 0 < m + 0) }'; then
    warn "dominant bucket ($domName) = ${domPct}% (expected >= ${DOMINANT_MIN}%)"
    exit 1
  fi
  log "smoke pass: unattributed=${qPct}% (<= ${THRESHOLD_PCT}%); dominant=$domName ${domPct}%"
fi