65816-llvm-mos/scripts/benchCyclesPrecise.sh
Scott Duensing 3388f3c5a5 More updates
2026-06-03 20:46:31 -05:00

177 lines
7.6 KiB
Bash
Executable file

#!/usr/bin/env bash
# benchCyclesPrecise.sh — measure per-call cycle counts via the
# emu.time()-based runner (scripts/runInMameCycles.sh).
#
# For each benchmark in benchmarks/, build a wrapper that calls the
# bench function ITERS times between START / DONE markers; the runner
# captures emulated time and converts to cycles assuming the IIgs
# slow-mode clock (1023000 Hz — IIe-compatible default; our binary
# doesn't enable fast mode unless its wrapper does).
#
# Output: markdown table with cycles-per-call for the clang build.
# NOTE: this script only measures and prints the clang column; it does
# not invoke Calypsi (`tools/calypsi/cc65816`).
#
# Flags:
# --no-layer2 Build the benches in plain ptr32 mode (Layer 1 only).
# By default we pass `-mllvm -w65816-dbr-safe-ptrs`
# (Layer 2 — stack-rel-indirect-Y ptr32 derefs) because
# every published baseline in docs/USAGE.md and every
# entry in memory/feedback_*.md was measured with Layer
# 2 on. Without it, strLen / strcpy / djb2 / memcmp
# lose the X-iter + Y-as-counter peephole chain in
# W65816StackRelToImg and regress 2-4x.
#
# Env override:
# W65816_CC_EXTRA Additional flags passed to every clang invocation
# in this script. Appended AFTER the layer flag
# so callers can disable Layer 2 themselves
# (`W65816_CC_EXTRA="" --no-layer2 ...`) or stack
# extra `-mllvm` knobs on top of Layer 2.
set -euo pipefail

# Resolve the repository layout relative to this script's own location.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BENCH_DIR="$PROJECT_ROOT/benchmarks"

# Layer 2 is the published baseline. Use --no-layer2 to opt out.
LAYER2_FLAGS=(-mllvm -w65816-dbr-safe-ptrs)
for arg in "$@"; do
  case "$arg" in
    --no-layer2) LAYER2_FLAGS=() ;;
    *) echo "unknown flag: $arg" >&2; exit 1 ;;
  esac
done

# Toolchain entry points used by runOneBench.
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
LINK="$PROJECT_ROOT/tools/link816"
RUNNER="$PROJECT_ROOT/scripts/runInMameCycles.sh"

# Runtime objects (crt0 + libgcc) are assembled once and linked into every
# benchmark image. The EXIT trap is installed before the files are created
# so they are removed on every exit path, including a failing llvm-mc under
# `set -e`. Bash resets traps in command-substitution subshells, so the
# $(runOneBench ...) calls do not trigger this cleanup early.
oCrt0=""
oLibgcc=""
cleanupRuntimeObjs() {
  local f
  for f in "${oCrt0:-}" "${oLibgcc:-}"; do
    [[ -z "$f" ]] || rm -f -- "$f"
  done
}
trap cleanupRuntimeObjs EXIT

oCrt0=$(mktemp --suffix=.o)
oLibgcc=$(mktemp --suffix=.o)
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oCrt0"
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLibgcc"
# Per-benchmark inputs / extern decls (mirrors benchCycles.sh).
# benchInputs NAME
#
# Prints the C call expression the generated wrapper evaluates for
# benchmark NAME. Prints the sentinel "/* unknown */" when NAME has no
# configured input.
benchInputs() {
  local expr
  case "$1" in
    sumOfSquares)
      expr='sumOfSquares(50)' ;;
    fib)
      expr='fib(10)' ;;
    strcpy)
      expr='mystrcpy(dst, "hello world!")' ;;
    memcmp)
      expr='mymemcmp("hello", "hello", 5)' ;;
    bsearch)
      expr='bsearch(arr, 8, 5)' ;;
    dotProduct)
      expr='dotProduct(va, vb, 4)' ;;
    popcount)
      expr='popcount(0x12345678UL)' ;;
    crc32)
      expr='crc32((const unsigned char *)"hello", 5)' ;;
    globalArrSum)
      expr='globalArrSum(50)' ;;
    globalArrFill)
      # void function: the comma expression gives the wrapper a value to store.
      expr='(globalArrFill(50), 0)' ;;
    globalArr8Sum)
      expr='globalArr8Sum(50)' ;;
    bubbleSort)
      # void function: same comma-expression trick as globalArrFill.
      expr='(bubbleSort(bsBuf, 16), 0)' ;;
    strLen)
      expr='strLen("The quick brown fox jumps over the lazy dog!")' ;;
    djb2Hash)
      expr='djb2Hash("hello world")' ;;
    *)
      expr='/* unknown */' ;;
  esac
  printf '%s\n' "$expr"
}
# benchExtern NAME
#
# Prints the C declarations the generated wrapper needs for benchmark
# NAME: the extern prototype plus any static data the call expression
# refers to. Prints an empty line when NAME has no configuration.
benchExtern() {
  local decl
  case "$1" in
    sumOfSquares)
      decl='extern unsigned long sumOfSquares(unsigned short n);' ;;
    fib)
      decl='extern unsigned short fib(unsigned short n);' ;;
    strcpy)
      decl='extern char *mystrcpy(char *d, const char *s); static char dst[16];' ;;
    memcmp)
      decl='extern int mymemcmp(const void *a, const void *b, unsigned int n);' ;;
    bsearch)
      decl='extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};' ;;
    dotProduct)
      decl='extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};' ;;
    popcount)
      decl='extern int popcount(unsigned long x);' ;;
    crc32)
      decl='extern unsigned long crc32(const unsigned char *p, unsigned int n);' ;;
    globalArrSum)
      decl='extern unsigned short globalArrSum(unsigned short n); extern unsigned short globalArr[100];' ;;
    globalArrFill)
      decl='extern void globalArrFill(unsigned short n); extern unsigned short globalArrF[100];' ;;
    globalArr8Sum)
      decl='extern unsigned short globalArr8Sum(unsigned short n); extern unsigned char globalArr8[100];' ;;
    bubbleSort)
      decl='extern void bubbleSort(short *a, unsigned short n); static short bsBuf[16] = {7,3,1,9,4,5,8,2,6,0,15,11,13,10,14,12};' ;;
    strLen)
      decl='extern unsigned short strLen(const char *s);' ;;
    djb2Hash)
      decl='extern unsigned long djb2Hash(const char *s);' ;;
    *)
      decl='' ;;
  esac
  printf '%s\n' "$decl"
}
# benchIters NAME
#
# Prints the number of timed iterations for benchmark NAME. More
# iterations shrink the relative measurement noise but lengthen the run,
# so heavy benchmarks get fewer and cheap ones get more. Names without
# an explicit entry fall back to 100.
benchIters() {
  local count
  case "$1" in
    sumOfSquares)
      count=50 ;;   # ~1600 cyc/call → ~80k cyc total
    fib)
      count=100 ;;
    strcpy)
      count=200 ;;
    memcmp)
      count=500 ;;
    bsearch)
      count=200 ;;
    dotProduct)
      count=200 ;;
    popcount)
      count=500 ;;
    crc32)
      count=200 ;;
    globalArrSum)
      count=100 ;;
    globalArrFill)
      count=100 ;;
    globalArr8Sum)
      count=100 ;;
    bubbleSort)
      count=50 ;;
    strLen)
      count=200 ;;
    djb2Hash)
      count=50 ;;
    *)
      count=100 ;;
  esac
  printf '%s\n' "$count"
}
# runOneBench NAME
#
# Builds a timing wrapper around benchmark NAME, links it with the
# runtime objects, runs it through the cycle runner, and prints exactly
# one result cell on stdout:
#   "<N> cyc/call"       cycles per call, rounded to an integer
#   "(no input config)"  NAME has no entry in benchExtern / benchInputs
#   "compile-fail"       clang failed on the wrapper or on $BENCH_DIR/NAME.c
#   "link-fail"          the linker failed
#   "(no read)"          the runner output held no cyc_per_call=<number>
#
# Globals read: CLANG, LINK, RUNNER, BENCH_DIR, LAYER2_FLAGS, oCrt0,
#               oLibgcc, W65816_CC_EXTRA (optional).
# All temporary files are removed on every path out of the function.
runOneBench() {
  local name="$1"
  local extern_decl call_expr iters
  extern_decl=$(benchExtern "$name")
  call_expr=$(benchInputs "$name")
  iters=$(benchIters "$name")
  if [[ -z "$extern_decl" || "$call_expr" == "/* unknown */" ]]; then
    echo "(no input config)"
    return
  fi

  local cwrap owrap obench bin
  cwrap=$(mktemp --suffix=.c)
  owrap=$(mktemp --suffix=.o)
  obench=$(mktemp --suffix=.o)
  bin=$(mktemp --suffix=.bin)

  # Unquoted heredoc delimiter: $extern_decl, $iters and $call_expr are
  # substituted into the generated C source.
  cat > "$cwrap" <<EOF
$extern_decl
volatile unsigned long sink;
#define ITERS $iters
int main(void) {
    /* warm-up */
    for (int w = 0; w < 5; w++) sink = (unsigned long)($call_expr);
    /* START / DONE markers go to bank 2 via direct 24-bit writes. */
    *(volatile unsigned short *)0x025000 = 0xa1a1;
    for (int i = 0; i < ITERS; i++) sink = (unsigned long)($call_expr);
    *(volatile unsigned short *)0x025002 = 0xa2a2;
    while (1) {}
}
EOF

  # One flag list shared by both compiles, in the original order: layer
  # flags first, then the caller's extras. The ${arr[@]+...} form keeps an
  # empty LAYER2_FLAGS (--no-layer2) from tripping `set -u` on bash < 4.4.
  # W65816_CC_EXTRA is deliberately left unquoted so it splits into words.
  # shellcheck disable=SC2206
  local -a ccFlags=(
    --target=w65816 -O2
    ${LAYER2_FLAGS[@]+"${LAYER2_FLAGS[@]}"}
    ${W65816_CC_EXTRA:-}
    -ffunction-sections
  )

  # Tool diagnostics are discarded on purpose: the table cell only reports
  # which stage failed.
  local failure="" val=""
  if ! "$CLANG" "${ccFlags[@]}" -c "$cwrap" -o "$owrap" 2>/dev/null; then
    failure="compile-fail"
  elif ! "$CLANG" "${ccFlags[@]}" -c "$BENCH_DIR/$name.c" -o "$obench" 2>/dev/null; then
    failure="compile-fail"
  elif ! "$LINK" -o "$bin" --text-base 0x1000 "$oCrt0" "$oLibgcc" "$owrap" "$obench" 2>/dev/null; then
    failure="link-fail"
  else
    # Take the first cyc_per_call=<number> token from the runner output. A
    # missing token makes grep (and, with pipefail, the pipeline) fail;
    # that case is reported as "(no read)" below rather than aborting.
    val=$(bash "$RUNNER" "$bin" "$iters" 2>&1 \
      | grep -oE 'cyc_per_call=[0-9.]+' \
      | head -1 \
      | sed 's/cyc_per_call=//') || true
  fi

  # Single cleanup point covering every outcome above.
  rm -f -- "$cwrap" "$owrap" "$obench" "$bin"

  if [[ -n "$failure" ]]; then
    echo "$failure"
  elif [[ -z "$val" ]]; then
    echo "(no read)"
  else
    # Force the C locale so a '.' decimal point parses in every environment.
    LC_ALL=C printf '%.0f cyc/call' "$val"
  fi
}
# Emit the markdown table: a header, then one row per benchmarks/*.c file.
printf '| Benchmark | Per-call cycles (clang) |\n'
printf '|-----------|------------------------:|\n'
for src in "$BENCH_DIR"/*.c; do
  # An unmatched glob is left as the literal pattern; skip it instead of
  # printing a bogus "*" row when the benchmarks directory has no .c files.
  [[ -e "$src" ]] || continue
  name=$(basename -- "$src" .c)
  result=$(runOneBench "$name")
  printf '| %s | %s |\n' "$name" "$result"
done

# Remove the shared runtime objects assembled during setup.
rm -f -- "$oCrt0" "$oLibgcc"