#!/usr/bin/env bash
|
|
# benchCycles.sh — estimate benchmark cycle counts in MAME.
#
# For each benchmark in benchmarks/, build a wrapper that calls the
# benchmark function in a loop with fixed input, samples the IIgs
# $C02E scan-line counter before and after the loop, and writes the
# delta to a known memory address (bank 2, $5000), which is read back
# from MAME by scripts/runInMame.sh. Output is a markdown table: per
# benchmark, the approximate cycles per call.
#
# This is a separate harness from bench.sh (which measures only code
# size). Cycle measurement requires a full MAME run per benchmark
# (~5 seconds each), so don't run it on every smoke pass.
|
|
|
|
# Strict mode: abort on unhandled errors, unset variables, and failures
# anywhere in a pipeline.
set -euo pipefail

# Resolve paths relative to this script so it can be run from any cwd.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BENCH_DIR="$PROJECT_ROOT/benchmarks"

# Toolchain built inside the project tree.
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
LINK="$PROJECT_ROOT/tools/link816"

# Runtime objects (crt0 + libgcc) are assembled once and linked into every
# benchmark image. Register the cleanup trap BEFORE creating the files so
# they are removed on every exit path, including an llvm-mc failure that
# aborts the script under `set -e`. The variables start empty so the trap
# is safe even if mktemp itself fails (`rm -f` ignores empty operands).
oCrt0=""
oLibgcc=""
trap 'rm -f -- "$oCrt0" "$oLibgcc"' EXIT

oCrt0=$(mktemp --suffix=.o)
oLibgcc=$(mktemp --suffix=.o)
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oCrt0"
"$LLVM_MC" -arch=w65816 -filetype=obj "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oLibgcc"
|
|
|
|
# Per-benchmark wrapper inputs. The C wrapper calls each benchmark
# with the inputs below, then writes the tick delta and the low 16 bits
# of the last result to bank 2. Timing uses the $C02E scan-line counter
# (advances at HBL rate, ~15.7 kHz) as a coarse timer — enough to
# compare relative speeds.
|
|
#######################################
# Print the C call expression used to exercise one benchmark.
# The expressions reference helper objects (dst, arr, va, vb) that are
# declared by the matching benchExtern entry.
# Arguments: $1 - benchmark name (basename of benchmarks/<name>.c)
# Outputs:   the call expression on stdout, or the sentinel
#            "/* unknown */" when no inputs are configured for $1
#######################################
benchInputs() {
  # ${1:-} keeps a missing argument from tripping `set -u`; it simply
  # falls through to the "unknown" sentinel.
  case "${1:-}" in
    sumOfSquares) printf '%s\n' 'sumOfSquares(50)' ;;
    fib)          printf '%s\n' 'fib(10)' ;;
    strcpy)       printf '%s\n' 'mystrcpy(dst, "hello world!")' ;;
    memcmp)       printf '%s\n' 'mymemcmp("hello", "hello", 5)' ;;
    bsearch)      printf '%s\n' 'bsearch(arr, 8, 5)' ;;
    dotProduct)   printf '%s\n' 'dotProduct(va, vb, 4)' ;;
    popcount)     printf '%s\n' 'popcount(0x12345678UL)' ;;
    crc32)        printf '%s\n' 'crc32((const unsigned char *)"hello", 5)' ;;
    *)            printf '%s\n' '/* unknown */' ;;
  esac
}
|
|
|
|
#######################################
# Print the C declarations the wrapper needs for one benchmark: the
# extern prototype of the benchmark function plus any static buffers or
# arrays that the benchInputs call expression refers to.
# Arguments: $1 - benchmark name (basename of benchmarks/<name>.c)
# Outputs:   the declarations on stdout (single line); an empty line
#            when the benchmark has no configuration
#######################################
benchExtern() {
  # ${1:-} keeps a missing argument from tripping `set -u`; it simply
  # falls through to the empty default.
  case "${1:-}" in
    sumOfSquares)
      printf '%s\n' 'extern unsigned long sumOfSquares(unsigned short n);'
      ;;
    fib)
      printf '%s\n' 'extern unsigned short fib(unsigned short n);'
      ;;
    strcpy)
      printf '%s\n' 'extern char *mystrcpy(char *d, const char *s); static char dst[16];'
      ;;
    memcmp)
      printf '%s\n' 'extern int mymemcmp(const void *a, const void *b, unsigned int n);'
      ;;
    bsearch)
      printf '%s\n' 'extern int bsearch(const int *arr, int n, int key); static const int arr[] = {1,2,3,4,5,6,7,8};'
      ;;
    dotProduct)
      printf '%s\n' 'extern long dotProduct(const short *a, const short *b, unsigned int n); static const short va[] = {1,2,3,4}; static const short vb[] = {5,6,7,8};'
      ;;
    popcount)
      printf '%s\n' 'extern int popcount(unsigned long x);'
      ;;
    crc32)
      printf '%s\n' 'extern unsigned long crc32(const unsigned char *p, unsigned int n);'
      ;;
    *)
      printf '\n'
      ;;
  esac
}
|
|
|
|
#######################################
# Build, run and time a single benchmark in MAME.
#
# Generates a C wrapper that calls the benchmark ITERS times between two
# reads of the $C02E counter, compiles the wrapper and the benchmark
# source, links them with the pre-assembled runtime objects, runs the
# image through scripts/runInMame.sh and converts the tick delta read
# back from $02:5000 into an approximate cycles-per-call figure.
#
# Globals:   CLANG, LINK, BENCH_DIR, PROJECT_ROOT, oCrt0, oLibgcc (read)
#            BENCH_ITERS (read, optional) - loop count, default 100
# Arguments: $1 - benchmark name (basename of benchmarks/<name>.c)
# Outputs:   one result cell on stdout: a measurement, or one of
#            "(no input config)", "(no tempdir)", "compile-fail",
#            "link-fail", "(no read)"
# Returns:   0 always; failures are reported in the output text
#######################################
runOneBench() {
  local name="$1"
  local extern_decl call_expr
  extern_decl=$(benchExtern "$name")
  call_expr=$(benchInputs "$name")
  if [[ -z "$extern_decl" || "$call_expr" == "/* unknown */" ]]; then
    echo "(no input config)"
    return 0
  fi

  # Single source of truth for the loop count: it is substituted into the
  # C wrapper's ITERS and used again for the per-call division below, so
  # the two can never drift apart. Invalid overrides fall back to 100.
  # (The C loop counter is an `int`; keep the value within its range.)
  local iters="${BENCH_ITERS:-100}"
  [[ "$iters" =~ ^[1-9][0-9]*$ ]] || iters=100

  # CPU cycles credited to one counter tick (see the conversion below).
  local cycles_per_tick=65

  # All intermediates live in one private directory so every exit path
  # cleans up with a single rm, and a mktemp failure is not masked.
  local workdir
  workdir=$(mktemp -d) || { echo "(no tempdir)"; return 0; }
  local cwrap="$workdir/wrap.c"
  local owrap="$workdir/wrap.o"
  local obench="$workdir/bench.o"
  local bin="$workdir/bench.bin"

  # Unquoted heredoc: $extern_decl, $call_expr and $iters are expanded
  # by the shell; \$ keeps a literal dollar sign in the C comments.
  cat > "$cwrap" <<EOF
$extern_decl
__attribute__((noinline)) static void switchToBank2(void) {
    __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
}
// Read VBL bit + scan-line position from the IIgs Mega II registers.
// \$C02E (VertCnt low) increments at HBL rate (~15.7 kHz), wrapping at
// 256. Higher resolution than the soft-VBL counter at \$E1006B; works
// without ROM IRQ handling.
__attribute__((noinline)) static unsigned char readVbl(void) {
    unsigned char r;
    __asm__ volatile ("sep #0x20\nlda 0xc02e\nrep #0x20\nand #0x00ff\n"
                      : "=a"(r) : : "memory");
    return r;
}
volatile unsigned long sink;
#define ITERS $iters
int main(void) {
    // Re-enable IRQs so the IIgs ROM's VBL handler runs and the
    // VBL counter at \$E1006B actually ticks. crt0 disables IRQs
    // for safety; the cycle bench needs them on for the timer.
    __asm__ volatile ("cli\n" ::: "memory");
    unsigned char t0 = readVbl();
    for (int i = 0; i < ITERS; i++) {
        sink = (unsigned long)($call_expr);
    }
    unsigned char t1 = readVbl();
    __asm__ volatile ("sei\n" ::: "memory");
    unsigned char dt = t1 - t0; // VBL ticks; wraps at 256
    switchToBank2();
    *(volatile unsigned short *)0x5000 = (unsigned short)dt;
    *(volatile unsigned short *)0x5002 = (unsigned short)(sink & 0xFFFF);
    while (1) {}
}
EOF

  # Build steps. Tool diagnostics are deliberately discarded so that the
  # table cell only ever contains a short status word.
  local failure=""
  if ! "$CLANG" --target=w65816 -O2 -ffunction-sections \
      -c "$cwrap" -o "$owrap" 2>/dev/null; then
    failure="compile-fail"
  elif ! "$CLANG" --target=w65816 -O2 -ffunction-sections \
      -c "$BENCH_DIR/$name.c" -o "$obench" 2>/dev/null; then
    failure="compile-fail"
  elif ! "$LINK" -o "$bin" --text-base 0x1000 \
      "$oCrt0" "$oLibgcc" "$owrap" "$obench" 2>/dev/null; then
    failure="link-fail"
  fi
  if [[ -n "$failure" ]]; then
    rm -rf -- "$workdir"
    echo "$failure"
    return 0
  fi

  # Read the tick delta stored at $025000 (bank 2, offset $5000).
  # NOTE(review): the trailing "0000" argument is passed through to
  # runInMame.sh unchanged; its meaning is defined there — confirm.
  # `|| true`: under pipefail a grep with no match (or a non-zero exit
  # from the runner) fails the pipeline; the captured text is still
  # assigned, and an empty value is reported as "(no read)" below
  # instead of aborting a caller that runs with `set -e`.
  local val
  val=$(bash "$PROJECT_ROOT/scripts/runInMame.sh" "$bin" 0x025000 0000 2>&1 \
    | grep -oE 'val=0x[0-9a-f]+' | head -1 | sed 's/val=0x//') || true
  rm -rf -- "$workdir"

  if [[ -z "$val" ]]; then
    echo "(no read)"
    return 0
  fi

  # $C02E ticks at HBL rate. Each tick is credited with ~65 CPU cycles,
  # so per-call cycles ≈ ticks * 65 / iterations. For very fast
  # benchmarks the whole loop may not cross a tick — raise BENCH_ITERS
  # for finer resolution. The wrapper's delta is 8 bits wide, so a loop
  # that spans 256 or more ticks aliases to a smaller value.
  # NOTE(review): 65 cycles per scan line corresponds to the ~1 MHz
  # clock; confirm the CPU speed the image runs at and how often $C02E
  # advances before trusting absolute numbers.
  local ticks=$((16#$val))
  if ((ticks == 0)); then
    echo "<65 cyc/iter (under timer resolution)"
  else
    local cycles=$((ticks * cycles_per_tick / iters))
    printf "%d hbl-ticks (~%d cyc/iter)" "$ticks" "$cycles"
  fi
}
|
|
|
|
#######################################
# Print the results as a markdown table: a header followed by one row
# per benchmarks/*.c file, whose second column is the text produced by
# runOneBench for that benchmark.
# Globals:   BENCH_DIR (read)
# Outputs:   the table on stdout
#######################################
printBenchTable() {
  local src name result
  printf '| Benchmark | Per-iteration cycles |\n'
  printf '|-----------|---------------------:|\n'
  for src in "$BENCH_DIR"/*.c; do
    # Without nullglob an unmatched pattern is left as the literal
    # ".../*.c"; skip it instead of emitting a bogus "*" row.
    [[ -e "$src" ]] || continue
    name=$(basename "$src" .c)
    result=$(runOneBench "$name")
    printf '| %s | %s |\n' "$name" "$result"
  done
}

printBenchTable

# Remove the shared runtime objects assembled at start-up.
rm -f "$oCrt0" "$oLibgcc"
|