#!/usr/bin/env bash
# benchCycles.sh — measure benchmark cycle counts in MAME.
#
# For each benchmark in benchmarks/, build a wrapper that calls the
# benchmark function in a loop with fixed input, records the IIgs CPU
# cycle counter before/after via MAME's Lua interface, and writes the
# delta to a known memory address.  Output is a markdown table: per
# benchmark, the cycles per call.
#
# This is a separate harness from bench.sh (which measures only code
# size).  Cycle measurement requires a full MAME run per benchmark
# (~5 seconds each) so don't run on every smoke pass.

set -euo pipefail

# Resolve every path relative to this script so the harness can be run
# from any working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BENCH_DIR="$PROJECT_ROOT/benchmarks"
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
LINK="$PROJECT_ROOT/tools/link816"

# Runtime objects shared by every benchmark link.  The names are
# pre-declared empty so the EXIT trap is safe under `set -u` even when
# mktemp itself fails.
oCrt0=""
oLibgcc=""

# Remove the runtime objects on every exit path, including aborts caused
# by `set -e` (e.g. an assembler failure below).  Bash resets traps in
# command-substitution subshells, so this fires once, when the top-level
# script exits, and not after each $(...) call.
trap 'rm -f -- "$oCrt0" "$oLibgcc"' EXIT

oCrt0=$(mktemp --suffix=.o)
oLibgcc=$(mktemp --suffix=.o)

# Assemble the C runtime start-up code and the libgcc helpers once, up
# front; each benchmark is linked against these two objects.
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oCrt0"
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLibgcc"

# Per-benchmark wrapper template.  The C wrapper calls each benchmark
# with appropriate inputs, then writes the iteration count and cycle
# delta to bank 2.  We use clock() (VBL counter, 60 Hz) as a coarse
# timer — enough to compare relative speeds.
#
# NOTE(review): the comments in this file name three different timers —
# the header above says "CPU cycle counter via MAME's Lua interface",
# this paragraph says clock()/VBL at 60 Hz, and the result formatting in
# runOneBench talks about $C02E ticking at HBL rate.  Confirm which one
# the wrapper really uses and align the comments.
# Print the C call expression that exercises benchmark $1 with its fixed
# input.  Names without a configured input print the sentinel
# "/* unknown */".
benchInputs() {
    local callExpr
    case "$1" in
        sumOfSquares) callExpr='sumOfSquares(50)' ;;
        fib)          callExpr='fib(10)' ;;
        strcpy)       callExpr='mystrcpy(dst, "hello world!")' ;;
        memcmp)       callExpr='mymemcmp("hello", "hello", 5)' ;;
        bsearch)      callExpr='bsearch(arr, 8, 5)' ;;
        dotProduct)   callExpr='dotProduct(va, vb, 4)' ;;
        popcount)     callExpr='popcount(0x12345678UL)' ;;
        crc32)        callExpr='crc32((const unsigned char *)"hello", 5)' ;;
        *)            callExpr='/* unknown */' ;;
    esac
    printf '%s\n' "$callExpr"
}

# Print the C declarations the wrapper needs for benchmark $1: the
# extern prototype, followed (on the same line, separated by one space)
# by any static fixture data referenced by the call expression.
# Names without a configured benchmark print an empty line.
benchExtern() {
    local proto=''
    local fixtures=''
    case "$1" in
        sumOfSquares)
            proto='extern unsigned long sumOfSquares(unsigned short n);'
            ;;
        fib)
            proto='extern unsigned short fib(unsigned short n);'
            ;;
        strcpy)
            proto='extern char *mystrcpy(char *d, const char *s);'
            fixtures='static char dst[16];'
            ;;
        memcmp)
            proto='extern int mymemcmp(const void *a, const void *b, unsigned int n);'
            ;;
        bsearch)
            proto='extern int bsearch(const int *arr, int n, int key);'
            fixtures='static const int arr[] = {1,2,3,4,5,6,7,8};'
            ;;
        dotProduct)
            proto='extern long dotProduct(const short *a, const short *b, unsigned int n);'
            fixtures='static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};'
            ;;
        popcount)
            proto='extern int popcount(unsigned long x);'
            ;;
        crc32)
            proto='extern unsigned long crc32(const unsigned char *p, unsigned int n);'
            ;;
    esac
    # Join prototype and fixtures with a single space only when fixtures exist.
    printf '%s\n' "${proto}${fixtures:+ ${fixtures}}"
}

# Run one benchmark in MAME with cycle measurement.
# Build, link and run a single benchmark, printing a one-cell result
# string on stdout (no trailing newline on the success path).
#
# Arguments: $1 - benchmark name; its source is "$BENCH_DIR/$1.c".
# Globals:   BENCH_DIR, CLANG, LINK, PROJECT_ROOT, oCrt0, oLibgcc (read).
# Outputs:   exactly one of
#              (no input config)   - benchInputs/benchExtern have no entry
#              compile-fail        - a compile step returned non-zero
#              link-fail           - the link step returned non-zero
#              (no read)           - no "val=0x..." found in runInMame.sh output
#              <65 cyc/iter (under timer resolution)
#              N hbl-ticks (~M cyc/iter)
runOneBench() {
    local name="$1"
    local extern_decl
    local call_expr
    extern_decl=$(benchExtern "$name")
    call_expr=$(benchInputs "$name")
    # Skip benchmarks for which either lookup returned its "unknown" value.
    if [ -z "$extern_decl" ] || [ "$call_expr" = "/* unknown */" ]; then
        echo "(no input config)"
        return
    fi
    # NOTE(review): `local x=$(cmd)` discards cmd's exit status, so a
    # failing mktemp goes unnoticed here — consider declaring first and
    # assigning on a separate line.
    local cwrap=$(mktemp --suffix=.c)
    local owrap=$(mktemp --suffix=.o)
    local obench=$(mktemp --suffix=.o)
    local bin=$(mktemp --suffix=.bin)
    # NOTE(review): in the text under review, the here-document that
    # generates the C wrapper and the clang invocation that compiles
    # "$cwrap" into "$owrap" are not visible — only the leading
    # `cat > "$cwrap" <` and the trailing `/dev/null \ || { ... }`
    # survive.  The statement below is kept exactly as seen; restore the
    # wrapper from version control before relying on this function.
    cat > "$cwrap" </dev/null \
        || { echo "compile-fail"; rm -f "$cwrap" "$owrap"; return; }
    # NOTE(review): all four temp files already exist at this point
    # (mktemp creates them), but the two compile-failure paths remove only
    # some of them — "$obench"/"$bin" above and "$bin" below are left
    # behind.  Confirm and widen the rm lists (or use a trap).
    # Compile the benchmark itself; compiler diagnostics are discarded.
    "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$BENCH_DIR/$name.c" -o "$obench" 2>/dev/null \
        || { echo "compile-fail"; rm -f "$cwrap" "$owrap" "$obench"; return; }
    # Link runtime + wrapper + benchmark into a flat image based at 0x1000.
    "$LINK" -o "$bin" --text-base 0x1000 "$oCrt0" "$oLibgcc" "$owrap" "$obench" 2>/dev/null \
        || { echo "link-fail"; rm -f "$cwrap" "$owrap" "$obench" "$bin"; return; }
    # Read VBL delta at $025000.
    # NOTE(review): this says "VBL" while the comment further down says
    # the counter ticks at HBL rate — confirm which is correct.
    # The pipeline keeps the hex digits of the first "val=0x..." match in
    # the combined stdout/stderr of runInMame.sh.
    local val
    val=$(bash "$PROJECT_ROOT/scripts/runInMame.sh" "$bin" 0x025000 0000 2>&1 \
        | grep -oE 'val=0x[0-9a-f]+' | head -1 | sed 's/val=0x//')
    rm -f "$cwrap" "$owrap" "$obench" "$bin"
    if [ -z "$val" ]; then
        echo "(no read)"
    else
        # $C02E ticks at HBL rate.  IIgs has ~65 cycles per HBL at
        # native 2.6 MHz, so each tick ≈ 65 cycles.  We ran 100
        # iterations, so per-iter cycles ≈ ticks * 65 / 100.  For
        # very fast benches, 100 iters may not cross a tick — bump
        # the constant in the C wrapper if you need finer resolution.
        # 16#... makes the shell parse the captured digits as hexadecimal.
        local ticks=$((16#$val))
        if [ "$ticks" -eq 0 ]; then
            # NOTE(review): by the formula below, zero ticks means fewer
            # than 65 cycles for the whole run, not per iteration —
            # confirm the intended wording of this message.
            echo "<65 cyc/iter (under timer resolution)"
        else
            # Integer arithmetic: the result is truncated toward zero.
            local cycles=$((ticks * 65 / 100))
            printf "%d hbl-ticks (~%d cyc/iter)" "$ticks" "$cycles"
        fi
    fi
}

# Emit the markdown table: a header, then one row per benchmarks/*.c.
# Each benchmark runs inside a command substitution, so whatever
# runOneBench prints becomes the right-hand cell of its row.
printf '| Benchmark | Per-iteration cycles |\n'
printf '|-----------|---------------------:|\n'
for src in "$BENCH_DIR"/*.c; do
    name=$(basename "$src" .c)
    result=$(runOneBench "$name")
    printf '| %s | %s |\n' "$name" "$result"
done

# Remove the shared runtime objects assembled at start-up.
rm -f "$oCrt0" "$oLibgcc"